├── .github └── workflows │ ├── codecov.yml │ ├── lint.yml │ ├── pages.yml │ ├── publish.yml │ └── tests.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── data ├── basque.csv ├── germany.csv └── texas.csv ├── doc ├── Makefile ├── make.bat └── source │ ├── augsynth.rst │ ├── biblio.bib │ ├── bibliography.rst │ ├── conf.py │ ├── dataprep.rst │ ├── generator.rst │ ├── index.rst │ ├── penalized.rst │ ├── placebo.rst │ ├── robust.rst │ └── synth.rst ├── examples ├── augsynth │ └── basque_augsynth.ipynb ├── basque.ipynb ├── factor-model.ipynb ├── germany.ipynb ├── penalized │ └── basque_penalized.ipynb ├── robust │ └── basque_robust.ipynb └── texas.ipynb ├── pyproject.toml ├── pysyncon ├── __init__.py ├── augsynth.py ├── base.py ├── dataprep.py ├── generator.py ├── inference.py ├── penalized.py ├── robust.py ├── synth.py └── utils.py ├── requirements-dev.txt ├── setup.cfg └── tests ├── test_augsynth.py ├── test_augsynth_basque.py ├── test_conformal_interence.py ├── test_dataprep.py ├── test_linear_factor_model.py ├── test_penalized.py ├── test_penalized_basque.py ├── test_robust.py ├── test_robust_basque.py ├── test_synth.py ├── test_synth_basque.py ├── test_synth_germany.py ├── test_synth_texas.py └── test_utils.py /.github/workflows/codecov.yml: -------------------------------------------------------------------------------- 1 | name: Generate Code Coverage report and upload to codecov.io 2 | 3 | on: 4 | pull_request: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "tests/**" 9 | push: 10 | branches: main 11 | paths: 12 | - "pysyncon/**" 13 | - "tests/**" 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | codecov: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: "3.9" 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install -r requirements-dev.txt 32 | python -m pip install codecov 33 | - name: Run tests and collect coverage 34 | run: coverage run -m unittest discover -s tests 35 | - name: Upload coverage reports to Codecov with GitHub Action 36 | uses: codecov/codecov-action@v4.2.0 37 | env: 38 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 39 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint codebase using Black 2 | 3 | on: 4 | pull_request: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "tests/**" 9 | - "examples/**" 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | lint: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout 19 | uses: actions/checkout@v4 20 | - name: Set up Python 21 | uses: actions/setup-python@v4 22 | with: 23 | python-version: "3.9" 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -r requirements-dev.txt 28 | - name: Lint 29 | run: black --check . 
30 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: Build html using Sphinx and upload to Github-pages 2 | 3 | on: 4 | push: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "doc/**" 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | pages: 15 | runs-on: ubuntu-latest 16 | environment: 17 | name: github-pages 18 | url: ${{ steps.deployment.outputs.page_url }} 19 | permissions: 20 | pages: write 21 | id-token: write 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v4 27 | with: 28 | python-version: "3.9" 29 | - name: Install Sphinx 30 | run: | 31 | python -m pip install --upgrade pip 32 | python -m pip install -r requirements-dev.txt 33 | python -m pip install sphinx 34 | python -m pip install sphinxcontrib-bibtex 35 | - name: Build html 36 | run: sphinx-build -b html ./doc/source/ ./doc/build/ 37 | - name: Setup Pages 38 | uses: actions/configure-pages@v2 39 | - name: Upload artifact 40 | uses: actions/upload-pages-artifact@v3 41 | with: 42 | path: ./doc/build/ 43 | - name: Deploy to GitHub Pages 44 | id: deployment 45 | uses: actions/deploy-pages@v1 46 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build package and upload to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | upload: 12 | runs-on: ubuntu-latest 13 | permissions: 14 | id-token: write 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | uses: pypa/gh-action-pypi-publish@79739dc2f2bf6bcfd21ecf9af9f06bd643dbeeae 29 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Run unittests 2 | 3 | on: 4 | pull_request: 5 | branches: main 6 | paths: 7 | - "pysyncon/**" 8 | - "tests/**" 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | tests: 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | buildplat: [ubuntu-20.04, windows-2019] 19 | python: ["3.8", "3.9", "3.10", "3.11", "3.12"] 20 | runs-on: ${{ matrix.buildplat }} 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | - name: Set up Python 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python }} 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | python -m pip install -r requirements-dev.txt 32 | - name: Run tests 33 | run: python -m unittest discover -s tests 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/python,r,virtualenv,visualstudiocode 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,r,virtualenv,visualstudiocode 3 | 4 | ### Python ### 5 | # Byte-compiled / 
optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | 171 | ### R ### 172 | # History files 173 | .Rhistory 174 | .Rapp.history 175 | 176 | # Session Data files 177 | .RData 178 | .RDataTmp 179 | 180 | # User-specific files 181 | .Ruserdata 182 | 183 | # Example code in package build process 184 | *-Ex.R 185 | 186 | # Output files from R CMD build 187 | /*.tar.gz 188 | 189 | # Output files from R CMD check 190 | /*.Rcheck/ 191 | 192 | # RStudio files 193 | .Rproj.user/ 194 | 195 | # produced vignettes 196 | vignettes/*.html 197 | vignettes/*.pdf 198 | 199 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 200 | .httr-oauth 201 | 202 | # knitr and R markdown default cache directories 203 | *_cache/ 204 | /cache/ 205 | 206 | # Temporary files created by R markdown 207 | *.utf8.md 208 | *.knit.md 209 | 210 | # R Environment Variables 211 | .Renviron 212 | 213 | # pkgdown site 214 | docs/ 215 | 216 | # translation temp files 217 | po/*~ 218 | 219 | # RStudio Connect folder 220 | rsconnect/ 221 | 222 | ### R.Bookdown Stack ### 223 | # R package: bookdown caching files 224 | /*_files/ 225 | 226 | ### VirtualEnv ### 227 | # Virtualenv 228 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 229 | [Bb]in 230 | [Ii]nclude 231 | [Ll]ib 232 | [Ll]ib64 233 | [Ll]ocal 234 | [Ss]cripts 235 | pyvenv.cfg 236 | pip-selfcheck.json 237 | 238 | ### VisualStudioCode ### 239 | .vscode/* 240 | !.vscode/settings.json 241 | !.vscode/tasks.json 242 | !.vscode/launch.json 243 | !.vscode/extensions.json 244 | !.vscode/*.code-snippets 245 | 246 | # Local History for Visual Studio Code 247 | .history/ 248 | 249 | # Built Visual Studio Code Extensions 250 | *.vsix 251 | 252 | ### VisualStudioCode Patch ### 253 | # Ignore all local history of files 254 | .history 255 | .ionide 256 | 257 | # Settings 258 | settings.json 259 | 260 | # End of https://www.toptal.com/developers/gitignore/api/python,r,virtualenv,visualstudiocode 261 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.0.0 2 | message: "If you use this software in your research, please cite it as below." 
3 | authors: 4 | - family-names: "Fordham" 5 | given-names: "Stiofán" 6 | orcid: "https://orcid.org/0009-0003-1345-3252" 7 | title: "pysyncon: a Python package for the Synthetic Control Method" 8 | date-released: 2022-12-20 9 | url: "https://github.com/sdfordham/pysyncon" 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Stiofán Fordham 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # pysyncon ![](https://img.shields.io/badge/python-3.8+-blue.svg) [![codecov](https://codecov.io/gh/sdfordham/pysyncon/graph/badge.svg?token=hmi7xHQ4OT)](https://codecov.io/gh/sdfordham/pysyncon) 4 | 5 | A Python package for the synthetic control method that provides implementations of: 6 | 7 | - Synthetic Control Method (Abadie & Gardeazabal 2003) 8 | - Robust Synthetic Control Method (Amjad, Shah & Shen 2018) 9 | - Augmented Synthetic Control Method (Ben-Michael, Feller & Rothstein 2021) 10 | - Penalized Synthetic Control Method (Abadie & L'Hour 2021) 11 | 12 | The package also provides methods for performing placebo tests and generating confidence intervals. 13 | 14 | The implementation of the synthetic control method aims to be reconcilable with the R package [Synth](https://CRAN.R-project.org/package=Synth); similarly, the implementation of the augmented synthetic control method aims to be reconcilable with the R package [augsynth](https://github.com/ebenmichael/augsynth). 15 | 16 | ## Installation 17 | Install it from PyPI using pip: 18 | 19 | ````bash 20 | python -m pip install pysyncon 21 | ```` 22 | 23 | ## Usage 24 | 25 | Documentation is available on [github-pages](https://sdfordham.github.io/pysyncon/). In the examples folder are notebooks reproducing the weights from: 26 | 27 | - The Economic Costs of Conflict: A Case Study of the Basque Country, Alberto Abadie and Javier Gardeazabal; The American Economic Review Vol. 93, No. 1 (Mar., 2003), pp. 113-132. ([notebook here](examples/basque.ipynb)) 28 | - The worked example 'Prison construction and Black male incarceration' from the last chapter of 'Causal Inference: The Mixtape' by Scott Cunningham.
([notebook here](examples/texas.ipynb)) 29 | - Comparative Politics and the Synthetic Control Method, Alberto Abadie, Alexis Diamond and Jens Hainmueller; American Journal of Political Science Vol. 59, No. 2 (April 2015), pp. 495-510. ([notebook here](examples/germany.ipynb)) 30 | 31 | ## Citation 32 | 33 | If you use this package in your research, you can cite it as below. 34 | 35 | ``` 36 | @software{pysyncon, 37 | author = {Fordham, Stiofán}, 38 | month = dec, 39 | title = {{pysyncon: a Python package for the Synthetic Control Method}}, 40 | url = {https://github.com/sdfordham/pysyncon}, 41 | year = {2022} 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /doc/source/augsynth.rst: -------------------------------------------------------------------------------- 1 | 2 | Augmented Synthetic Control Method 3 | ================================== 4 | 5 | The *Augmented Synthetic Control Method* is due to Ben-Michael, Feller & Rothstein 6 | :cite:`augsynth2021` and adapts the :doc:`Synthetic Control Method ` in an 7 | effort to adjust for poor pre-treatment fit. 8 | 9 | The authors do this by adjusting the Synthetic Control Method estimate with an 10 | additional term that captures the imbalance in a particular function of the pre-treatment outcomes. 11 | In the *Ridge Augmented Synthetic Control Method* this function is 12 | linear in the pre-treatment outcomes and fit by ridge regression of the control 13 | post-treatment outcomes against pre-treatment outcomes.
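As a rough illustration, the ridge adjustment added to the vanilla SCM weights has a closed form; the following is a minimal ``numpy`` sketch mirroring the ``solve_ridge`` helper in ``pysyncon/augsynth.py`` (the array names ``X0``, ``X1``, ``w_scm`` and ``lam`` are hypothetical):

.. code-block:: python

    import numpy as np

    def ridge_adjustment(X1, X0, w_scm, lam):
        # residual imbalance left over after the vanilla SCM fit
        resid = X1 - X0 @ w_scm
        # spread the residual across the control units via ridge regression
        return resid @ np.linalg.inv(X0 @ X0.T + lam * np.identity(X0.shape[0])) @ X0

    # the augmented weights are then: w = w_scm + ridge_adjustment(X1, X0, w_scm, lam)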
14 | 15 | In particular, the method constructs a vector of weights :math:`w = (w_1, w_2, \dots, w_k)` 16 | such that 17 | 18 | .. math:: 19 | w = w_\mathrm{scm} + w_\mathrm{aug}, 20 | 21 | where :math:`w_\mathrm{scm}` are the weights obtained from the standard 22 | :doc:`Synthetic Control Method ` and :math:`w_\mathrm{aug}` are 23 | augmentations that are included when the treated unit lies outside the 24 | convex hull defined by the control units. The weights may be negative or 25 | larger than 1; the degree of extrapolation is controlled by a ridge 26 | parameter :math:`\lambda`. 27 | 28 | In general, this method will obtain weights at least as good as the synthetic 29 | control method in terms of pre-treatment fit. 30 | 31 | The :class:`AugSynth` class 32 | *************************** 33 | 34 | The :class:`AugSynth ` class implements the Ridge Augmented 35 | Synthetic Control Method. The expected way to use the class is to first create a 36 | :class:`Dataprep ` object that defines the study data and 37 | then use it as input to an :class:`AugSynth ` object. See the 38 | `examples folder `_ 39 | of the repository for examples illustrating usage. 40 | 41 | The implementation is based on the same method in the R 42 | `augsynth package `_ 43 | and aims to produce results that can be reconciled with that package. 44 | 45 | .. autoclass:: pysyncon.AugSynth 46 | :members: 47 | :inherited-members: 48 | -------------------------------------------------------------------------------- /doc/source/biblio.bib: -------------------------------------------------------------------------------- 1 | @article{basque2003, 2 | Author = {Abadie, Alberto and Gardeazabal, Javier}, 3 | Title = {The Economic Costs of Conflict: A Case Study of the Basque Country}, 4 | Journal = {American Economic Review}, 5 | Volume = {93}, 6 | Number = {1}, 7 | Year = {2003}, 8 | Month = {March}, 9 | Pages = {113-132}, 10 | DOI = {10.1257/000282803321455188}, 11 | } 12 | @book{california2007, 13 | title={Synthetic Control Methods for Comparative Case Studies: Estimating the Effect of California's Tobacco Control Program}, 14 | DOI={10.3386/w12831}, 15 | publisher={National Bureau of Economic Research}, 16 | author={Abadie, Alberto and Diamond, Alexis and Hainmueller, Jens}, 17 | year={2007}, 18 | month={January} 19 | } 20 | @article{germany2015, 21 | title = {Comparative Politics and the Synthetic Control Method}, 22 | author = {Abadie, Alberto and Diamond, Alexis and Hainmueller, Jens}, 23 | year = {2015}, 24 | journal = {American Journal of Political Science}, 25 | volume = {59}, 26 | number = {2}, 27 | pages = {495--510}, 28 | doi = {10.1111/ajps.12116} 29 | } 30 | @article{penalized2021, 31 | author = {Alberto Abadie and Jérémy L'Hour}, 32 | title = {A Penalized Synthetic Control Estimator for Disaggregated Data}, 33 | journal = {Journal of the American Statistical Association}, 34 | volume = {116}, 35 | number = {536}, 36 | pages = {1817-1834}, 37 | year = {2021}, 38 | publisher = {Taylor \& Francis}, 39 | doi = {10.1080/01621459.2021.1971535}, 40 | } 41 | @article{robust2018, 42 | author = {Muhammad Amjad and Devavrat Shah and Dennis Shen}, 43 | title = {Robust Synthetic Control}, 44 | journal = {Journal of Machine Learning Research}, 45 | year = {2018}, 46 | volume = {19}, 47 | number = {22}, 48 | pages = {1-51}, 49 | url = {http://jmlr.org/papers/v19/17-777.html} 50 | } 51 | @article{augsynth2021, 52 | author = {Eli Ben-Michael and Avi Feller and Jesse Rothstein}, 53 | title = {The Augmented Synthetic Control Method}, 54 |
journal = {Journal of the American Statistical Association}, 55 | volume = {116}, 56 | number = {536}, 57 | pages = {1789-1803}, 58 | year = {2021}, 59 | publisher = {Taylor \& Francis}, 60 | doi = {10.1080/01621459.2021.1929245}, 61 | } 62 | @article{fp2018, 63 | author = {Firpo, Sergio and Possebom, Vitor}, 64 | title = {Synthetic Control Method: Inference, Sensitivity Analysis and Confidence Sets}, 65 | journal = {Journal of Causal Inference}, 66 | volume = {6}, 67 | number = {2}, 68 | year = {2018}, 69 | pages = {20160026}, 70 | publisher = {De Gruyter}, 71 | doi = {10.1515/jci-2016-0026}, 72 | } 73 | @article{inference2021, 74 | author = {Victor Chernozhukov and Kaspar Wüthrich and Yinchu Zhu}, 75 | title = {An Exact and Robust Conformal Inference Method for Counterfactual and Synthetic Controls}, 76 | journal = {Journal of the American Statistical Association}, 77 | volume = {116}, 78 | number = {536}, 79 | year = {2021}, 80 | pages = {1849--1864}, 81 | publisher = {Taylor \& Francis}, 82 | doi = {10.1080/01621459.2021.1920957}, 83 | } 84 | -------------------------------------------------------------------------------- /doc/source/bibliography.rst: -------------------------------------------------------------------------------- 1 | Bibliography 2 | ============ 3 | 4 | .. bibliography:: 5 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | 5 | sys.path.insert(0, os.path.abspath("../../")) 6 | 7 | project = "pysyncon" 8 | copyright = "2025, Stiofán Fordham" 9 | author = "Stiofán Fordham" 10 | release = "1.5.2" 11 | 12 | extensions = [ 13 | "sphinx.ext.autodoc", 14 | "sphinx.ext.napoleon", 15 | "sphinx.ext.mathjax", 16 | "sphinx.ext.githubpages", 17 | "sphinxcontrib.bibtex", 18 | ] 19 | html_theme = "alabaster" 20 | bibtex_bibfiles = ["biblio.bib"] 21 | -------------------------------------------------------------------------------- /doc/source/dataprep.rst: -------------------------------------------------------------------------------- 1 | :class:`Dataprep` class 2 | ======================== 3 | 4 | This class and its API are based on the similarly named function in the R 5 | `Synth package `_. 6 | 7 | The ``Dataprep`` class defines all the information necessary for the synthetic 8 | control study. It takes as arguments a ``pandas.DataFrame`` `foo` containing 9 | the panel data, a list of predictors, special predictors, the statistical operation to 10 | apply to the predictors over the selected time frame, the dependent variable, 11 | the column denoting the unit labels, the labels denoting the control units, 12 | the label denoting the treated unit, the time period over which to carry out the optimisation 13 | procedure and the time period over which to apply the statistical operation to the 14 | predictors. See below for further details about each individual argument, and also see 15 | the `examples folder `_ 16 | of the repository to see how this class is set up in three real research contexts. 17 | 18 | The principal difference between the function signature here and the one in 19 | the ``R`` ``synth`` package is that whereas there are two arguments `unit.variable` 20 | and `unit.names.variable` in that package, in this package these are 21 | consolidated into one argument `unit_variable`, since it is unnecessary to have 22 | both. 23 | 24 | ..
autoclass:: pysyncon.Dataprep 25 | :members: 26 | -------------------------------------------------------------------------------- /doc/source/generator.rst: -------------------------------------------------------------------------------- 1 | Sample data generation 2 | ====================== 3 | 4 | The package provides a method for generating simulated data for testing purposes. 5 | 6 | Linear Factor model 7 | ******************* 8 | 9 | Let :math:`Y_{jt}^N` (resp. :math:`Y_{jt}^I`) denote the outcome for unit :math:`j` at time :math:`t` 10 | in the absence of treatment (resp. in the presence of treatment). The :class:`LinearFactorModel` 11 | generates sample potential outcomes data according to a Linear 12 | Factor model: 13 | 14 | .. math:: 15 | 16 | Y_{jt}^N &= \theta_t^T Z_j + \lambda_t^T \mu_j + \epsilon_{tj},\\ 17 | Y_{jt}^I &= Y_{jt}^N + \delta_t, 18 | 19 | where :math:`Z_j` denotes a vector of observable covariates, :math:`\mu_j` is a vector of unobservable 20 | covariates and :math:`\epsilon_{tj}` are mean-zero normal shocks. The :math:`\delta_t` are 21 | the treatment effects and the remaining variables are model parameters. 22 | 23 | .. autoclass:: pysyncon.generator.LinearFactorModel 24 | :members: 25 | :inherited-members: -------------------------------------------------------------------------------- /doc/source/index.rst: -------------------------------------------------------------------------------- 1 | pysyncon 2 | ======== 3 | 4 | pysyncon is a Python package that implements the synthetic control 5 | method and several derivative methods. 6 | 7 | The types of synthetic control studies available in the package are: 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | 12 | Synthetic Control Method 13 | Augmented Synthetic Control Method 14 | Robust Synthetic Control Method 15 | Penalized Synthetic Control Method 16 | 17 | The package also provides a method for performing permutation tests/placebo 18 | tests with the above methods: 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | 23 | Placebo Tests 24 | 25 | The main helper class that is used to describe the study data and used as 26 | input to a synthetic control method is the ``Dataprep`` class: 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | 31 | Dataprep 32 | 33 | How to use the package 34 | ********************** 35 | 36 | There are notebooks in the examples folder illustrating how 37 | to use the package `here `_. 38 | -------------------------------------------------------------------------------- /doc/source/penalized.rst: -------------------------------------------------------------------------------- 1 | Penalized Synthetic Control Method 2 | ================================== 3 | 4 | The penalized synthetic control method is due to Abadie & L'Hour :cite:`penalized2021`. 5 | 6 | This version of the synthetic control method adds a penalization term to the loss 7 | function that serves to reduce the interpolation bias. It does this 8 | by penalizing the pairwise discrepancies between the treated unit and each unit 9 | contributing to the synthetic control. 10 | 11 | The :class:`PenalizedSynth` class 12 | ********************************* 13 | 14 | The :class:`PenalizedSynth ` class implements the penalized 15 | synthetic control method. The expected way to use the class is to first create a 16 | :class:`Dataprep ` object that defines the study data and 17 | then use it as input to a :class:`PenalizedSynth ` object.
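As a rough sketch of the objective being minimised, following the description above (the ``numpy`` names are hypothetical; see :cite:`penalized2021` for the precise formulation):

.. code-block:: python

    import numpy as np

    def penalized_loss(w, X0, X1, lambda_):
        # fit term: discrepancy between the treated unit and the synthetic unit
        fit = np.sum((X1 - X0 @ w) ** 2)
        # penalty term: weighted pairwise discrepancies between the treated
        # unit and each control unit contributing to the synthetic control
        pairwise = w @ np.sum((X0 - X1[:, None]) ** 2, axis=0)
        return fit + lambda_ * pairwise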
See the 18 | `examples folder `_ 19 | of the repository for examples illustrating usage. 20 | 21 | .. autoclass:: pysyncon.PenalizedSynth 22 | :members: 23 | :inherited-members: 24 | -------------------------------------------------------------------------------- /doc/source/placebo.rst: -------------------------------------------------------------------------------- 1 | Placebo Tests 2 | ============= 3 | 4 | A placebo test is used to assess the significance of a synthetic control study 5 | by running the study once with each control unit in turn set as the treated unit and the 6 | remaining control units set as controls. See :cite:`germany2015` (section I.B) 7 | for a motivation. An example of usage is in the Python notebook reproducing 8 | the weights from that paper in the package repository 9 | `here `_. 10 | 11 | The :class:`PlaceboTest` class 12 | ****************************** 13 | 14 | .. autoclass:: pysyncon.utils.PlaceboTest 15 | :members: 16 | -------------------------------------------------------------------------------- /doc/source/robust.rst: -------------------------------------------------------------------------------- 1 | Robust Synthetic Control Method 2 | =============================== 3 | 4 | The Robust Synthetic Control Method is due to Amjad, Shah & Shen :cite:`robust2018`. 5 | 6 | This method de-noises the data matrix of the control units by 7 | applying a threshold to the singular values of the observation matrix 8 | and then fits a linear model using ridge regression of the de-noised control 9 | post-treatment outcomes against pre-treatment outcomes. Similarly to the 10 | :doc:`Ridge Augmented Synthetic Control Method ` the weights here 11 | may be negative or larger than 1. 12 | 13 | The :class:`RobustSynth` class 14 | ****************************** 15 | 16 | The :class:`RobustSynth ` class implements the robust synthetic 17 | control method. The expected way to use the class is to first create a 18 | :class:`Dataprep ` object that defines the study data and 19 | then use it as input to a :class:`RobustSynth ` object. See the 20 | `examples folder `_ 21 | of the repository for examples illustrating usage. 22 | 23 | .. autoclass:: pysyncon.RobustSynth 24 | :members: 25 | :inherited-members: 26 | -------------------------------------------------------------------------------- /doc/source/synth.rst: -------------------------------------------------------------------------------- 1 | Synthetic Control Method 2 | ======================== 3 | 4 | Overview 5 | ******** 6 | 7 | The synthetic control method is due to Abadie and Gardeazabal :cite:`basque2003` 8 | (also see Abadie, Diamond and Hainmueller :cite:`california2007` :cite:`germany2015`). 9 | This method constructs a weighted combination of the control units that 10 | most resembles the selected characteristics of the treated unit in a time period 11 | prior to the treatment time. The "synthetic control unit" constructed in this way can then be 12 | compared with the treated unit to investigate the causal effect of the treatment. 13 | 14 | Details 15 | ******* 16 | 17 | In particular, this method constructs a vector of non-negative weights 18 | :math:`w = (w_1, w_2, \dots, w_k)`, where :math:`k` is the number 19 | of control units, whose sum is 1 and which minimizes 20 | 21 | ..
math:: 22 | \|x_1-X_0w^T\|_V, 23 | 24 | where 25 | 26 | - :math:`\|A\|_V=\sqrt{A^TVA}`, where :math:`V` is a diagonal matrix 27 | with non-negative entries that captures the relationship between the 28 | outcome variable and the predictors, 29 | - :math:`X_0` is a matrix of the values for the control units of the chosen 30 | statistic for the chosen predictors over the selected (pre-intervention) 31 | time-period (each column corresponds to a control), 32 | - :math:`x_1` is a (column) vector of the corresponding values for the 33 | treated unit. 34 | 35 | The matrix :math:`V` can be supplied; otherwise it is part of the 36 | optimization problem: it is obtained by minimizing the quantity 37 | 38 | .. math:: 39 | \|z_1-Z_0w^T\|, 40 | 41 | where 42 | 43 | - :math:`Z_0` is a matrix of the values of the outcome variable for the 44 | control units over the (pre-intervention) time-period (each column 45 | corresponds to a control), 46 | - :math:`z_1` is a (column) vector of the corresponding values for the 47 | treated unit. 48 | 49 | The :class:`Synth` class 50 | ************************ 51 | 52 | The :class:`Synth ` class implements the synthetic control 53 | method. The expected way to use the class is to first create a 54 | :class:`Dataprep ` object that defines the study data and 55 | then use it as input to a :class:`Synth ` object. See the 56 | `examples folder `_ 57 | of the repository for examples illustrating usage. 58 | 59 | The implementation is based on the same method in the R 60 | `Synth package `_ 61 | and aims to produce results that can be reconciled with that package. 62 | 63 | .. autoclass:: pysyncon.Synth 64 | :members: 65 | :inherited-members: 66 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /pysyncon/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.5.1" 2 | 3 | from .dataprep import Dataprep 4 | from .synth import Synth 5 | from .augsynth import AugSynth 6 | from .robust import RobustSynth 7 | from .penalized import PenalizedSynth 8 | -------------------------------------------------------------------------------- /pysyncon/augsynth.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from .dataprep import Dataprep 8 | from .base import BaseSynth, VanillaOptimMixin 9 | from .utils import HoldoutSplitter, CrossValidationResult 10 | 11 | 12 | class AugSynth(BaseSynth, VanillaOptimMixin): 13 | """Implementation of the augmented synthetic control method due to Ben- 14 | Michael, Feller & Rothstein :cite:`augsynth2021`. 15 | 16 | The implementation follows the augsynth R package with the option 17 | `progfunc="Ridge"`. 18 | """ 19 | 20 | def __init__(self) -> None: 21 | super().__init__() 22 | self.lambda_: Optional[float] = None 23 | self.cv_result: Optional[CrossValidationResult] = None 24 | 25 | def fit(self, dataprep: Dataprep, lambda_: Optional[float] = None) -> None: 26 | """Fit the model/calculate the weights. 27 | 28 | Parameters 29 | ---------- 30 | dataprep : Dataprep 31 | :class:`Dataprep` object containing data to model.
32 | lambda_ : float, optional 33 | Ridge parameter to use. If not supplied, then it is obtained by 34 | cross-validation, by default None 35 | """ 36 | if ( 37 | isinstance(dataprep.treatment_identifier, (list, tuple)) 38 | and len(dataprep.treatment_identifier) > 1 39 | ): 40 | raise ValueError("AugSynth requires exactly one treated unit.") 41 | self.dataprep = dataprep 42 | Z0, Z1 = self.dataprep.make_covariate_mats() 43 | X0, X1 = self.dataprep.make_outcome_mats() 44 | 45 | X0_demean, X1_demean, Z0_normal, Z1_normal = self._normalize(X0, X1, Z0, Z1) 46 | X0_stacked = pd.concat([X0_demean, Z0_normal], axis=0) 47 | X1_stacked = pd.concat([X1_demean, Z1_normal], axis=0) 48 | 49 | if lambda_ is None: 50 | lambdas = self.generate_lambdas(X0) 51 | self.cv_result = self.cross_validate(X0, X1, lambdas) 52 | self.lambda_ = self.cv_result.best_lambda() 53 | else: 54 | self.lambda_ = lambda_ 55 | 56 | n_r, _ = X0.shape 57 | V_mat = np.diag(np.full(n_r, 1 / n_r)) 58 | W, _ = self.w_optimize(V_mat=V_mat, X0=X0.to_numpy(), X1=X1.to_numpy()) 59 | 60 | W_ridge = self.solve_ridge( 61 | X1_stacked.to_numpy(), X0_stacked.to_numpy(), W, self.lambda_ 62 | ) 63 | self.W = W + W_ridge 64 | 65 | @staticmethod 66 | def solve_ridge( 67 | A: np.ndarray, B: np.ndarray, W: np.ndarray, lambda_: float 68 | ) -> np.ndarray: 69 | """Calculate the ridge adjustment to the weights. 70 | 71 | :meta private: 72 | """ 73 | M = A - B @ W 74 | N = np.linalg.inv(B @ B.T + lambda_ * np.identity(B.shape[0])) 75 | return M @ N @ B 76 | 77 | def _normalize( 78 | self, X0: pd.DataFrame, X1: pd.Series, Z0: pd.DataFrame, Z1: pd.Series 79 | ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]: 80 | """Normalise the data before the weight calculation. 81 | 82 | :meta private: 83 | """ 84 | X0_demean = X0.subtract(X0.mean(axis=1), axis=0) 85 | X1_demean = X1.subtract(X0.mean(axis=1), axis=0) 86 | 87 | Z0_demean = Z0.subtract(Z0.mean(axis=1), axis=0) 88 | Z1_demean = Z1.subtract(Z0.mean(axis=1), axis=0) 89 | 90 | Z0_std = Z0_demean.std(axis=1) 91 | X0_std = X0_demean.to_numpy().std(ddof=1).item() 92 | 93 | Z0_normal = Z0_demean.divide(Z0_std, axis=0) * X0_std 94 | Z1_normal = Z1_demean.divide(Z0_std, axis=0) * X0_std 95 | return X0_demean, X1_demean, Z0_normal, Z1_normal 96 | 97 | def cross_validate( 98 | self, X0: pd.DataFrame, X1: pd.Series, lambdas: np.ndarray, holdout_len: int = 1 99 | ) -> CrossValidationResult: 100 | """Method that calculates the mean error and the standard error of the mean 101 | error using a cross-validation procedure for the given ridge parameter 102 | values. 103 | 104 | :meta private: 105 | """ 106 | V = np.identity(X0.shape[0] - holdout_len) 107 | res = list() 108 | for X0_t, X0_v, X1_t, X1_v in HoldoutSplitter(X0, X1, holdout_len=holdout_len): 109 | W, _ = self.w_optimize(V_mat=V, X0=X0_t.to_numpy(), X1=X1_t.to_numpy()) 110 | this_res = list() 111 | for lam in lambdas: 112 | ridge_weights = self.solve_ridge(A=X1_t, B=X0_t, W=W, lambda_=lam) 113 | W_aug = W + ridge_weights 114 | err = (X1_v - X0_v @ W_aug).pow(2).sum() 115 | this_res.append(err.item()) 116 | res.append(this_res) 117 | means = np.array(res).mean(axis=0) 118 | ses = np.array(res).std(axis=0) / np.sqrt(len(res))  # standard error of the mean over the holdout splits 119 | return CrossValidationResult(lambdas, means, ses) 120 | 121 | def generate_lambdas( 122 | self, X: pd.DataFrame, lambda_min_ratio: float = 1e-8, n_lambda: int = 20 123 | ) -> np.ndarray: 124 | """Generate a suitable set of lambdas to run the cross-validation 125 | procedure on.
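The candidates form a geometric sequence: the largest value is the square of the leading singular value of ``X.T`` and successive values decay towards ``lambda_min_ratio`` times that maximum.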
126 | 127 | :meta private: 128 | """ 129 | _, sing, _ = np.linalg.svd(X.T) 130 | lambda_max = sing[0].item() ** 2.0 131 | scaler = lambda_min_ratio ** (1 / n_lambda) 132 | return lambda_max * (scaler ** np.array(range(n_lambda))) 133 | -------------------------------------------------------------------------------- /pysyncon/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Literal, Sequence 3 | from abc import ABCMeta, abstractmethod 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import matplotlib.pyplot as plt 8 | from scipy.optimize import minimize, Bounds, LinearConstraint 9 | 10 | from .dataprep import Dataprep, IsinArg_t 11 | 12 | 13 | class BaseSynth(metaclass=ABCMeta): 14 | """Abstract base class that defines methods common to synthetic control methods.""" 15 | 16 | def __init__(self) -> None: 17 | self.dataprep: Optional[Dataprep] = None 18 | self.W: Optional[np.ndarray] = None 19 | self.W_names: Optional[Sequence] = None 20 | 21 | @abstractmethod 22 | def fit(*args, **kwargs) -> None: 23 | raise NotImplementedError 24 | 25 | def _synthetic(self, Z0: pd.DataFrame) -> pd.Series: 26 | """Assemble the synthetic unit using the calculated weight matrix. 27 | 28 | Parameters 29 | ---------- 30 | Z0 : pandas.DataFrame, shape (n, c) 31 | A matrix of the time series of the outcome variable with each 32 | column corresponding to a control unit and the rows are the time 33 | steps 34 | 35 | Returns 36 | ------- 37 | pd.Series 38 | Time series of the synthetic unit. 39 | """ 40 | ts_synthetic = (Z0 * self.W).sum(axis=1) 41 | return ts_synthetic 42 | 43 | def path_plot( 44 | self, 45 | time_period: Optional[IsinArg_t] = None, 46 | treatment_time: Optional[int] = None, 47 | grid: bool = True, 48 | Z0: Optional[pd.DataFrame] = None, 49 | Z1: Optional[pd.Series] = None, 50 | ) -> None: 51 | """Plot the outcome variable over time for the treated unit and the 52 | synthetic control. 53 | 54 | Parameters 55 | ---------- 56 | time_period : Iterable | pandas.Series | dict, optional 57 | Time range to plot, if none is supplied then the time range used 58 | is the time period over which the optimisation happens, by default 59 | None 60 | treatment_time : int, optional 61 | If supplied, plot a vertical line at the time period that the 62 | treatment time occurred, by default None 63 | grid : bool, optional 64 | Whether or not to plot a grid, by default True 65 | Z0 : pandas.DataFrame, shape (n, c), optional 66 | The matrix of the time series of the outcome variable for the control units. 67 | If no dataprep is set, then this must be supplied along with Z1, by default None. 68 | Z1 : pandas.Series, shape (n, 1), optional 69 | The matrix of the time series of the outcome variable for the treated unit. 70 | If no dataprep is set, then this must be supplied along with Z0, by default None.
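Example (hypothetical years; assumes a fitted synthetic control object ``synth``)::

    synth.path_plot(time_period=range(1990, 2011), treatment_time=2000)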
71 | 72 | Raises 73 | ------ 74 | ValueError 75 | If there is no weight matrix available 76 | ValueError 77 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 78 | """ 79 | if self.dataprep is not None: 80 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=time_period) 81 | elif Z0 is None or Z1 is None: 82 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for plots.") 83 | if self.W is None: 84 | raise ValueError("No weight matrix available; fit data first.") 85 | 86 | ts_synthetic = self._synthetic(Z0=Z0) 87 | plt.plot(Z1, color="black", linewidth=1, label=Z1.name) 88 | plt.plot( 89 | ts_synthetic, 90 | color="black", 91 | linewidth=1, 92 | linestyle="dashed", 93 | label="Synthetic", 94 | ) 95 | if self.dataprep is not None: 96 | plt.ylabel(self.dataprep.dependent) 97 | if treatment_time: 98 | plt.axvline(x=treatment_time, ymin=0.05, ymax=0.95, linestyle="dashed") 99 | plt.legend() 100 | plt.grid(grid) 101 | plt.show() 102 | 103 | def _gaps(self, Z0: pd.DataFrame, Z1: pd.Series) -> pd.Series: 104 | """Calculate the gaps (difference between factual 105 | and estimated counterfactual) 106 | 107 | Parameters 108 | ---------- 109 | Z0 : pandas.DataFrame, shape (n, c) 110 | A matrix of the time series of the outcome variable with each 111 | column corresponding to a control unit and the rows are the time 112 | steps 113 | Z1 : pandas.DataFrame, shape (n, 1) 114 | A matrix of the time series of the outcome variable for the treated 115 | unit and the rows are the time steps 116 | 117 | Returns 118 | ------- 119 | pd.Series 120 | Series containing the gaps 121 | 122 | :meta private: 123 | """ 124 | ts_synthetic = self._synthetic(Z0=Z0) 125 | ts_gap = Z1 - ts_synthetic 126 | return ts_gap 127 | 128 | def gaps_plot( 129 | self, 130 | time_period: Optional[IsinArg_t] = None, 131 | treatment_time: Optional[int] = None, 132 | grid: bool = True, 133 | Z0: Optional[pd.DataFrame] = None, 134 | Z1: Optional[pd.Series] = None, 135 | ) -> None: 136 | """Plots the gap between the treated unit and the synthetic unit over 137 | time. 138 | 139 | Parameters 140 | ---------- 141 | time_period : Iterable | pandas.Series | dict, optional 142 | Time range to plot, if none is supplied then the time range used 143 | is the time period over which the optimisation happens, by default 144 | None 145 | treatment_time : int, optional 146 | If supplied, plot a vertical line at the time period that the 147 | treatment time occurred, by default None 148 | grid : bool, optional 149 | Whether or not to plot a grid, by default True 150 | Z0 : pandas.DataFrame, shape (n, c), optional 151 | The matrix of the time series of the outcome variable for the control units. 152 | If no dataprep is set, then this must be supplied along with Z1, by default None. 153 | Z1 : pandas.Series, shape (n, 1), optional 154 | The matrix of the time series of the outcome variable for the treated unit. 155 | If no dataprep is set, then this must be supplied along with Z0, by default None. 
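The plotted gap is the treated unit's outcome minus the synthetic unit's outcome, so positive values indicate periods where the treated unit lies above its synthetic control.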
156 | 157 | Raises 158 | ------ 159 | ValueError 160 | If there is no weight matrix available 161 | ValueError 162 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 163 | """ 164 | if self.dataprep is not None: 165 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=time_period) 166 | elif Z0 is None or Z1 is None: 167 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for plots.") 168 | if self.W is None: 169 | raise ValueError("No weight matrix available; fit data first.") 170 | 171 | ts_gap = self._gaps(Z0=Z0, Z1=Z1) 172 | plt.plot(ts_gap, color="black", linewidth=1) 173 | if self.dataprep is not None: 174 | plt.ylabel(self.dataprep.dependent) 175 | plt.hlines( 176 | y=0, 177 | xmin=min(ts_gap.index), 178 | xmax=max(ts_gap.index), 179 | color="black", 180 | linestyle="dashed", 181 | ) 182 | if treatment_time: 183 | plt.axvline(x=treatment_time, ymin=0.05, ymax=0.95, linestyle="dashed") 184 | plt.grid(grid) 185 | plt.show() 186 | 187 | def weights(self, round: int = 3, threshold: Optional[float] = None) -> pd.Series: 188 | """Return a ``pandas.Series`` of the weights for each control unit. 189 | 190 | Parameters 191 | ---------- 192 | round : int, optional 193 | Round the weights to the given number of places, by default 3 194 | threshold : float, optional 195 | If supplied, will only show weights at or above this value, by default 196 | None 197 | 198 | Returns 199 | ------- 200 | pandas.Series 201 | The weights computed 202 | 203 | Raises 204 | ------ 205 | ValueError 206 | If there is no weight matrix available 207 | """ 208 | if self.W is None: 209 | raise ValueError("No weight matrix available; fit data first.") 210 | if self.dataprep is None: 211 | weights_ser = pd.Series(self.W, index=self.W_names, name="weights") 212 | else: 213 | weights_ser = pd.Series( 214 | self.W, index=list(self.dataprep.controls_identifier), name="weights" 215 | ) 216 | weights_ser = ( 217 | weights_ser[weights_ser >= threshold] if threshold else weights_ser 218 | ) 219 | return weights_ser.round(round) 220 | 221 | def summary( 222 | self, 223 | round: int = 3, 224 | X0: Optional[pd.DataFrame] = None, 225 | X1: Optional[pd.Series] = None, 226 | ) -> pd.DataFrame: 227 | """Generates a ``pandas.DataFrame`` with summary data. The 228 | first column shows the mean value of each predictor over the time 229 | period ``time_predictors_prior`` for the treated unit, the second 230 | column shows the same for the synthetic unit, and finally there is a 231 | column 'sample mean' that shows the mean value of each predictor 232 | over the time period ``time_predictors_prior`` across all the control 233 | units, i.e. this will be the same as a synthetic control where all 234 | the weights are equal. 235 | 236 | Parameters 237 | ---------- 238 | round : int, optional 239 | Round the table values to the given number of places, by 240 | default 3 241 | X0 : pd.DataFrame, shape (n_cov, n_controls), optional 242 | Matrix with each column corresponding to a control unit and each 243 | row is a covariate. If no dataprep is set, then this must be 244 | supplied along with X1, by default None. 245 | X1 : pandas.Series, shape (n_cov, 1), optional 246 | Column vector giving the covariate values for the treated unit. 247 | If no dataprep is set, then this must be supplied along with X0, 248 | by default None. 249 | 250 | Returns 251 | ------- 252 | pandas.DataFrame 253 | Summary data.
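The 'synthetic' column is the weighted average of the control units' predictor values using the fitted weights, i.e. ``(X0 * W).sum(axis=1)``.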
254 | 255 | Raises 256 | ------ 257 | ValueError 258 | If there is no weight matrix available 259 | ValueError 260 | If there is no :class:`Dataprep` object set or (X0, X1) is not supplied 261 | """ 262 | if self.W is None: 263 | raise ValueError("No weight matrix available; fit data first.") 264 | if self.dataprep is not None: 265 | X0, X1 = self.dataprep.make_covariate_mats() 266 | elif X0 is None or X1 is None: 267 | raise ValueError( 268 | "dataprep must be set or (X0, X1) must be set for summary." 269 | ) 270 | 271 | treated = X1.rename("treated") 272 | synthetic = (X0 * self.W).sum(axis=1).rename("synthetic") 273 | sample_mean = X0.mean(axis=1).rename("sample mean") 274 | 275 | return pd.concat([treated, synthetic, sample_mean], axis=1).round(round) 276 | 277 | def att( 278 | self, 279 | time_period: IsinArg_t, 280 | Z0: Optional[pd.DataFrame] = None, 281 | Z1: Optional[pd.Series] = None, 282 | ) -> dict[str, float]: 283 | """Computes the average treatment effect on the treated unit (ATT) and 284 | the standard error of the value over the chosen time-period. 285 | 286 | Parameters 287 | ---------- 288 | time_period : Iterable | pandas.Series | dict 289 | Time period to compute the ATT over. 290 | Z0 : pandas.DataFrame, shape (n, c), optional 291 | The matrix of the time series of the outcome variable for the control units. 292 | If no dataprep is set, then this must be supplied along with Z1, by default None. 293 | Z1 : pandas.Series, shape (n, 1), optional 294 | The matrix of the time series of the outcome variable for the treated unit. 295 | If no dataprep is set, then this must be supplied along with Z0, by default None. 296 | 297 | Returns 298 | ------- 299 | dict 300 | A dictionary with the ATT value and the standard error of the ATT. 301 | 302 | Raises 303 | ------ 304 | ValueError 305 | If there is no weight matrix available 306 | ValueError 307 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 308 | """ 309 | if self.W is None: 310 | raise ValueError("No weight matrix available; fit data first.") 311 | if self.dataprep is not None: 312 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=time_period) 313 | gaps = self._gaps(Z0=Z0, Z1=Z1) 314 | elif Z0 is not None and Z1 is not None: 315 | gaps = self._gaps(Z0=Z0.loc[time_period, :], Z1=Z1.loc[time_period]) 316 | else: 317 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for att.") 318 | att = np.mean(gaps) 319 | se = np.std(gaps, ddof=1) / np.sqrt(len(time_period)) 320 | 321 | return {"att": att.item(), "se": se.item()} 322 | 323 | def mspe( 324 | self, Z0: Optional[pd.DataFrame] = None, Z1: Optional[pd.Series] = None 325 | ) -> float: 326 | """Returns the mean square prediction error in the fit of 327 | the synthetic control versus the treated unit over the 328 | optimization time-period. 329 | 330 | Parameters 331 | ---------- 332 | Z0 : pandas.DataFrame, shape (n, c), optional 333 | The matrix of the time series of the outcome variable for the control units. 334 | If no dataprep is set, then this must be supplied along with Z1, by default None. 335 | Z1 : pandas.Series, shape (n, 1), optional 336 | The matrix of the time series of the outcome variable for the treated unit. 337 | If no dataprep is set, then this must be supplied along with Z0, by default None. 338 | 339 | Returns 340 | ------- 341 | float 342 | Mean square prediction error 343 | 344 | Raises 345 | ------ 346 | ValueError 347 | If the fit method has not been run (no weights available).
348 | ValueError 349 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 350 | """ 351 | if self.W is None: 352 | raise ValueError("No weight matrix available; fit data first.") 353 | if self.dataprep is not None: 354 | Z0, Z1 = self.dataprep.make_outcome_mats( 355 | time_period=self.dataprep.time_optimize_ssr 356 | ) 357 | if Z0 is None or Z1 is None: 358 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for mspe.") 359 | ts_synthetic = self._synthetic(Z0=Z0) 360 | 361 | n = len(ts_synthetic) 362 | return (1 / n) * (Z1 - ts_synthetic).pow(2).sum().item() 363 | 364 | def mape( 365 | self, Z0: Optional[pd.DataFrame] = None, Z1: Optional[pd.Series] = None 366 | ) -> float: 367 | """Returns the mean absolute percentage error in the fit of 368 | the synthetic control versus the treated unit over the 369 | optimization time-period. 370 | 371 | Parameters 372 | ---------- 373 | Z0 : pandas.DataFrame, shape (n, c), optional 374 | The matrix of the time series of the outcome variable for the control units. 375 | If no dataprep is set, then this must be supplied along with Z1, by default None. 376 | Z1 : pandas.Series, shape (n, 1), optional 377 | The matrix of the time series of the outcome variable for the treated unit. 378 | If no dataprep is set, then this must be supplied along with Z0, by default None. 379 | 380 | Returns 381 | ------- 382 | float 383 | Mean absolute percentage error 384 | 385 | Raises 386 | ------ 387 | ValueError 388 | If the fit method has not been run (no weights available). 389 | ValueError 390 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 391 | """ 392 | if self.W is None: 393 | raise ValueError("No weight matrix available; fit data first.") 394 | if self.dataprep is not None: 395 | Z0, Z1 = self.dataprep.make_outcome_mats( 396 | time_period=self.dataprep.time_optimize_ssr 397 | ) 398 | if Z0 is None or Z1 is None: 399 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for mape.") 400 | ts_synthetic = self._synthetic(Z0=Z0) 401 | 402 | n = len(ts_synthetic) 403 | return (1 / n) * ((Z1 - ts_synthetic) / Z1).abs().sum().item() 404 | 405 | def mae( 406 | self, Z0: Optional[pd.DataFrame] = None, Z1: Optional[pd.Series] = None 407 | ) -> float: 408 | """Returns the mean absolute error in the fit of 409 | the synthetic control versus the treated unit over the 410 | optimization time-period. 411 | 412 | Parameters 413 | ---------- 414 | Z0 : pandas.DataFrame, shape (n, c), optional 415 | The matrix of the time series of the outcome variable for the control units. 416 | If no dataprep is set, then this must be supplied along with Z1, by default None. 417 | Z1 : pandas.Series, shape (n, 1), optional 418 | The matrix of the time series of the outcome variable for the treated unit. 419 | If no dataprep is set, then this must be supplied along with Z0, by default None. 420 | 421 | Returns 422 | ------- 423 | float 424 | Mean absolute error 425 | 426 | Raises 427 | ------ 428 | ValueError 429 | If the fit method has not been run (no weights available).
430 | ValueError 431 | If there is no :class:`Dataprep` object set or (Z0, Z1) is not supplied 432 | """ 433 | if self.W is None: 434 | raise ValueError("No weight matrix available; fit data first.") 435 | if self.dataprep is not None: 436 | Z0, Z1 = self.dataprep.make_outcome_mats( 437 | time_period=self.dataprep.time_optimize_ssr 438 | ) 439 | if Z0 is None or Z1 is None: 440 | raise ValueError("dataprep must be set or (Z0, Z1) must be set for mae.") 441 | ts_synthetic = self._synthetic(Z0=Z0) 442 | 443 | n = len(ts_synthetic) 444 | return (1 / n) * (Z1 - ts_synthetic).abs().sum().item() 445 | 446 | 447 | class VanillaOptimMixin: 448 | @staticmethod 449 | def w_optimize( 450 | V_mat: np.ndarray, 451 | X0: np.ndarray, 452 | X1: np.ndarray, 453 | qp_method: Literal["SLSQP"] = "SLSQP", 454 | qp_options: dict = {"maxiter": 1000}, 455 | ) -> tuple[np.ndarray, float]: 456 | """Solves the inner part of the quadratic minimization problem for a 457 | given V matrix (see Abadie and Gardeazabal :cite:`basque2003`). 458 | 459 | Parameters 460 | ---------- 461 | V_mat : numpy.ndarray, shape (c, c) 462 | V matrix, using the notation of the Abadie, Diamond & Hainmueller 463 | paper. 464 | X0 : numpy.ndarray, shape (m, c) 465 | Matrix with each column corresponding to a control unit and each 466 | row corresponding to a covariate. 467 | X1 : numpy.ndarray, shape (m,) 468 | Column vector giving the covariate values for the treated unit. 469 | qp_method : str, optional 470 | Minimization routine to use in scipy minimize to solve the 471 | problem, by default "SLSQP" 472 | qp_options : dict, optional 473 | Options for scipy minimize, by default {"maxiter": 1000} 474 | 475 | Returns 476 | ------- 477 | tuple[np.ndarray, float] 478 | tuple of the optimal weights and the loss 479 | 480 | :meta private: 481 | """ 482 | _, n_c = X0.shape 483 | 484 | P = X0.T @ V_mat @ X0 485 | q = X1.T @ V_mat @ X0 486 | 487 | def fun(x): 488 | return 0.5 * x.T @ P @ x - q.T @ x 489 | 490 | bounds = Bounds(lb=np.full(n_c, 0.0), ub=np.full(n_c, 1.0)) 491 | constraints = LinearConstraint(A=np.full(n_c, 1.0), lb=1.0, ub=1.0) 492 | 493 | x0 = np.full(n_c, 1 / n_c) 494 | res = minimize( 495 | fun=fun, 496 | x0=x0, 497 | bounds=bounds, 498 | constraints=constraints, 499 | method=qp_method, 500 | options=qp_options, 501 | ) 502 | W, loss_W = res["x"], res["fun"] 503 | return W, loss_W.item() 504 | -------------------------------------------------------------------------------- /pysyncon/dataprep.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Any, Iterable, Union, Optional, Literal, Sequence, Mapping, Tuple 3 | 4 | import pandas as pd 5 | from pandas._typing import Axes 6 | 7 | 8 | AGG_OP = ("mean", "std", "median", "sum", "count", "max", "min", "var") 9 | PredictorsOp_t = Literal["mean", "std", "median", "sum", "count", "max", "min", "var"] 10 | IsinArg_t = Union[Iterable, pd.Series, dict] 11 | SpecialPredictor_t = Tuple[ 12 | Any, Union[pd.Series, pd.DataFrame, Sequence, Mapping], PredictorsOp_t 13 | ] 14 | 15 | 16 | class Dataprep: 17 | """Helper class that takes in the panel data and all the information 18 | needed to describe the study setup. It is used to automatically generate 19 | the matrices needed for the optimisation methods, plots of the results etc.
20 | 
21 |     Parameters
22 |     ----------
23 |     foo : pandas.DataFrame
24 |         A pandas DataFrame containing the panel data where the columns are
25 |         predictor/outcome variables and each row is a time-step for some unit
26 |     predictors : Axes
27 |         The columns of ``foo`` to use as predictors
28 |     predictors_op : "mean" | "std" | "median" | "sum" | "count" | "max" | "min" | "var"
29 |         The statistical operation to use on the predictors - the time range that
30 |         the operation is applied to is ``time_predictors_prior``
31 |     dependent : Any
32 |         The column of ``foo`` to use as the dependent variable
33 |     unit_variable : Any
34 |         The column of ``foo`` that contains the unit labels
35 |     time_variable : Any
36 |         The column of ``foo`` that contains the time period
37 |     treatment_identifier : Any
38 |         The unit label that denotes the treated unit
39 |     controls_identifier : Iterable
40 |         The unit labels denoting the control units
41 |     time_predictors_prior : Iterable
42 |         The time range over which to apply the statistical operation to the
43 |         predictors (see ``predictors_op`` argument)
44 |     time_optimize_ssr : Iterable
45 |         The time range over which the loss function should be minimised
46 |     special_predictors : Iterable[SpecialPredictor_t], optional
47 |         An iterable of special predictors, which are additional predictors
48 |         that should be aggregated over a custom time period using an indicated
49 |         statistical operator. In particular, a special predictor
50 |         consists of a triple of:
51 | 
52 |         - ``column``: the column of ``foo`` containing the predictor to use,
53 |         - ``time-range``: the time range to apply ``operator`` over - it should
54 |           have the same type as ``time_predictors_prior`` or ``time_optimize_ssr``
55 |         - ``operator``: the statistical operator to apply to ``column`` - it should
56 |           have the same type as ``predictors_op``
57 | 
58 |         by default None
59 | 
60 |     Raises
61 |     ------
62 |     TypeError
63 |         if ``foo`` is not of type ``pandas.DataFrame``
64 |     ValueError
65 |         if any of ``predictors`` is not a column of ``foo``
66 |     ValueError
67 |         if ``predictors_op`` is not one of "mean", "std", "median",
68 |         "sum", "count", "max", "min" or "var".
69 |     ValueError
70 |         if ``dependent`` is not a column of ``foo``
71 |     ValueError
72 |         if ``unit_variable`` is not a column of ``foo``
73 |     ValueError
74 |         if ``time_variable`` is not a column of ``foo``
75 |     ValueError
76 |         if ``treatment_identifier`` is not present in ``foo['unit_variable']``
77 |     TypeError
78 |         if ``controls_identifier`` is not a list or a tuple
79 |     ValueError
80 |         if ``treatment_identifier`` is in the list of controls
81 |     ValueError
82 |         if any of the controls is not in ``foo['unit_variable']``
83 |     ValueError
84 |         if any element of ``special_predictors`` is not a tuple of length
85 |         3
86 |     ValueError
87 |         if a predictor in an element of ``special_predictors`` is not a column
88 |         of ``foo``
89 |     ValueError
90 |         if one of the operators in an element of ``special_predictors`` is not
91 |         one of "mean", "std", "median", "sum", "count", "max", "min" or "var".
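
    Example
    -------
    A minimal illustrative sketch; the panel ``df`` and the column and unit
    names below are hypothetical, and the top-level import assumes the
    package exports :class:`Dataprep` from ``pysyncon``:

    >>> import pandas as pd
    >>> from pysyncon import Dataprep
    >>> df = pd.DataFrame(
    ...     {
    ...         "region": ["A"] * 3 + ["B"] * 3 + ["C"] * 3,
    ...         "year": [2000, 2001, 2002] * 3,
    ...         "gdp": [1.0, 1.1, 1.3, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],
    ...     }
    ... )
    >>> dataprep = Dataprep(
    ...     foo=df,
    ...     predictors=["gdp"],
    ...     predictors_op="mean",
    ...     dependent="gdp",
    ...     unit_variable="region",
    ...     time_variable="year",
    ...     treatment_identifier="A",
    ...     controls_identifier=["B", "C"],
    ...     time_predictors_prior=range(2000, 2002),
    ...     time_optimize_ssr=range(2000, 2003),
    ... )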
92 | """ 93 | 94 | def __init__( 95 | self, 96 | foo: pd.DataFrame, 97 | predictors: Axes, 98 | predictors_op: PredictorsOp_t, 99 | dependent: Any, 100 | unit_variable: Any, 101 | time_variable: Any, 102 | treatment_identifier: Union[Any, list, tuple], 103 | controls_identifier: Union[list, tuple], 104 | time_predictors_prior: IsinArg_t, 105 | time_optimize_ssr: IsinArg_t, 106 | special_predictors: Optional[Iterable[SpecialPredictor_t]] = None, 107 | ) -> None: 108 | if not isinstance(foo, pd.DataFrame): 109 | raise TypeError("foo must be pandas.DataFrame.") 110 | self.foo = foo 111 | 112 | for predictor in predictors: 113 | if predictor not in foo.columns: 114 | raise ValueError(f"predictor {predictor} not in foo columns.") 115 | self.predictors = predictors 116 | 117 | if predictors_op not in AGG_OP: 118 | agg_op_str = ", ".join([f'"{o}"' for o in AGG_OP]) 119 | raise ValueError(f"predictors_op must be one of {agg_op_str}.") 120 | self.predictors_op = predictors_op 121 | 122 | if dependent not in foo.columns: 123 | raise ValueError(f"dependent {dependent} not in foo columns.") 124 | self.dependent = dependent 125 | 126 | if unit_variable not in foo.columns: 127 | raise ValueError(f"unit_variable {unit_variable} not in foo columns.") 128 | self.unit_variable = unit_variable 129 | 130 | if time_variable not in foo.columns: 131 | raise ValueError(f"time_variable {time_variable} not in foo columns.") 132 | self.time_variable = time_variable 133 | 134 | if foo[[unit_variable, time_variable]].duplicated().any(): 135 | raise ValueError( 136 | "Multiple rows found in `foo` for same [unit, time] pairs." 137 | ) 138 | 139 | if isinstance(treatment_identifier, (list, tuple)): 140 | for treated in treatment_identifier: 141 | # This throws FutureWarning (see https://stackoverflow.com/a/46721064/11594901) 142 | if treated not in foo[unit_variable].values: 143 | raise ValueError( 144 | f'treatment_identifier {treated} not found in foo["{unit_variable}"].' 145 | ) 146 | else: 147 | # This throws FutureWarning (see https://stackoverflow.com/a/46721064/11594901) 148 | if treatment_identifier not in foo[unit_variable].values: 149 | raise ValueError( 150 | f'treatment_identifier {treatment_identifier} not found in foo["{unit_variable}"].' 151 | ) 152 | if ( 153 | isinstance(treatment_identifier, (list, tuple)) 154 | and len(treatment_identifier) == 1 155 | ): 156 | self.treatment_identifier = treatment_identifier[0] 157 | else: 158 | self.treatment_identifier = treatment_identifier 159 | 160 | if not isinstance(controls_identifier, (list, tuple)): 161 | raise TypeError("controls_identifier should be an list or tuple") 162 | for control in controls_identifier: 163 | if isinstance(self.treatment_identifier, (list, tuple)): 164 | if control in treatment_identifier: 165 | raise ValueError( 166 | f"{control} in both treatment_identifier and controls_identifier." 167 | ) 168 | else: 169 | if control == treatment_identifier: 170 | raise ValueError("treatment_identifier in controls_identifier.") 171 | if control not in foo[unit_variable].values: 172 | raise ValueError( 173 | f'controls_identifier {control} not found in foo["{unit_variable}"].' 174 | ) 175 | self.controls_identifier = controls_identifier 176 | 177 | if self.foo[self.foo[self.time_variable].isin(time_predictors_prior)].empty: 178 | raise ValueError( 179 | f"foo has no rows in the time range `time_predictors_prior`." 
180 | ) 181 | self.time_predictors_prior = time_predictors_prior 182 | 183 | if self.foo[self.foo[self.time_variable].isin(time_optimize_ssr)].empty: 184 | raise ValueError(f"foo has no rows in the time range `time_optimize_ssr`.") 185 | self.time_optimize_ssr = time_optimize_ssr 186 | 187 | if special_predictors: 188 | for el in special_predictors: 189 | if not isinstance(el, tuple) or len(el) != 3: 190 | raise ValueError( 191 | "Elements of special_predictors should be tuples of length 3." 192 | ) 193 | predictor, time_range, op = el 194 | if predictor not in foo.columns: 195 | raise ValueError( 196 | f"{predictor} in special_predictors not in foo columns." 197 | ) 198 | if self.foo[self.foo[self.time_variable].isin(time_range)].empty: 199 | raise ValueError( 200 | f"foo has no rows in the time range {time_range} for `special_predictor` {el}." 201 | ) 202 | if op not in AGG_OP: 203 | agg_op_str = ", ".join([f'"{o}"' for o in AGG_OP]) 204 | raise ValueError( 205 | f"{op} in special_predictors must be one of {agg_op_str}." 206 | ) 207 | self.special_predictors = special_predictors 208 | 209 | def make_covariate_mats( 210 | self, 211 | ) -> tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]]: 212 | """Generate the covariate matrices to use as input to the fit method 213 | of the synthetic control computation. 214 | 215 | Returns 216 | ------- 217 | tuple[pandas.DataFrame, pandas.Series] 218 | Returns the matrices :math:`X_0`, :math:`X_1` (using the notation of Abadie 219 | & Gardeazabal :cite:`basque2003`). 220 | 221 | :meta private: 222 | """ 223 | X_nonspecial = ( 224 | self.foo[self.foo[self.time_variable].isin(self.time_predictors_prior)] 225 | .groupby(self.unit_variable)[self.predictors] 226 | .agg(self.predictors_op) 227 | .T 228 | ) 229 | X1_nonspecial = X_nonspecial[self.treatment_identifier] 230 | X0_nonspecial = X_nonspecial[list(self.controls_identifier)] 231 | 232 | if self.special_predictors is None: 233 | return X0_nonspecial, X1_nonspecial 234 | 235 | X0_special = list() 236 | for control in self.controls_identifier: 237 | this_control = list() 238 | for predictor, time_period, op in self.special_predictors: 239 | mask = (self.foo[self.unit_variable] == control) & ( 240 | self.foo[self.time_variable].isin(time_period) 241 | ) 242 | this_control.append(self.foo[mask][predictor].agg(op)) 243 | X0_special.append(this_control) 244 | 245 | X0_special_columns = list() 246 | for idx, (predictor, _, _) in enumerate(self.special_predictors, 1): 247 | X0_special_columns.append(f"special.{idx}.{predictor}") 248 | 249 | X0_special = pd.DataFrame( 250 | X0_special, columns=X0_special_columns, index=self.controls_identifier 251 | ).T 252 | X0 = pd.concat([X0_nonspecial, X0_special], axis=0) 253 | 254 | X1_special = list() 255 | if isinstance(self.treatment_identifier, (list, tuple)): 256 | for treated in self.treatment_identifier: 257 | this_treated = list() 258 | for predictor, time_period, op in self.special_predictors: 259 | mask = (self.foo[self.unit_variable] == treated) & ( 260 | self.foo[self.time_variable].isin(time_period) 261 | ) 262 | this_treated.append(self.foo[mask][predictor].agg(op)) 263 | X1_special.append(this_treated) 264 | X1_special = pd.DataFrame( 265 | X1_special, columns=X0_special_columns, index=self.treatment_identifier 266 | ).T 267 | else: 268 | for predictor, time_period, op in self.special_predictors: 269 | mask = (self.foo[self.unit_variable] == self.treatment_identifier) & ( 270 | self.foo[self.time_variable].isin(time_period) 271 | ) 272 | 
X1_special.append(self.foo[mask][predictor].agg(op)) 273 | 274 | X1_special = pd.Series(X1_special, index=X0_special_columns).rename( 275 | self.treatment_identifier 276 | ) 277 | X1 = pd.concat([X1_nonspecial, X1_special], axis=0) 278 | return X0, X1 279 | 280 | def make_outcome_mats( 281 | self, time_period: Optional[IsinArg_t] = None 282 | ) -> tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]]: 283 | """Generates the time-series (outcome) matrices to use as input to the fit 284 | method of the synthetic control computation. 285 | 286 | Parameters 287 | ---------- 288 | time_period : Iterable | pandas.Series | dict, optional 289 | Time period to use when generating the matrices, defaults to 290 | time_optimize_ssr set when initialising the class, by default None 291 | 292 | Returns 293 | ------- 294 | tuple[pd.DataFrame, Union[pd.Series, pd.DataFrame]] 295 | Returns the matrices :math:`Z_0`, :math:`Z_1` (using the notation 296 | of Abadie & Gardeazabal :cite:`basque2003`). 297 | 298 | :meta private: 299 | """ 300 | time_period = time_period if time_period is not None else self.time_optimize_ssr 301 | 302 | Z = self.foo[self.foo[self.time_variable].isin(time_period)].pivot( 303 | index=self.time_variable, columns=self.unit_variable, values=self.dependent 304 | ) 305 | Z0, Z1 = Z[list(self.controls_identifier)], Z[self.treatment_identifier] 306 | return Z0, Z1 307 | 308 | def __str__(self) -> str: 309 | str_rep = ( 310 | "Dataprep\n" 311 | f"Treated unit: {self.treatment_identifier}\n" 312 | f"Dependent variable: {self.dependent}\n" 313 | f"Control units: {', '.join([str(c) for c in self.controls_identifier])}\n" 314 | f"Time range in data: {min(self.foo[self.time_variable])}" 315 | f" - {max(self.foo[self.time_variable])}\n" 316 | f"Time range for loss minimization: {self.time_optimize_ssr}\n" 317 | f"Time range for predictors: {self.time_predictors_prior}\n" 318 | f"Predictors: {', '.join([str(p) for p in self.predictors])}\n" 319 | ) 320 | 321 | if self.special_predictors: 322 | str_special_pred = "" 323 | for predictor, time_range, op in self.special_predictors: 324 | rep = f" `{predictor}` over `{time_range}` using `{op}`\n" 325 | str_special_pred = str_special_pred + rep 326 | str_rep = str_rep + f"Special predictors:\n" + str_special_pred 327 | return str_rep 328 | -------------------------------------------------------------------------------- /pysyncon/generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | class LinearFactorModel: 9 | """Generates potential outcomes following a linear factor model""" 10 | 11 | def __init__( 12 | self, 13 | observed_dist: tuple[int] = (0, 1), 14 | observed_params_dist: tuple[int] = (0, 10), 15 | unobserved_dist: tuple[int] = (0, 1), 16 | unobserved_params_dist: tuple[int] = (0, 10), 17 | effect_dist: tuple[int] = (0, 20), 18 | shocks_dist: tuple[int] = (0, 1), 19 | seed: Optional[int] = None, 20 | rng: Optional[np.random.Generator] = None, 21 | ) -> None: 22 | """Generates potential outcomes following a linear factor model 23 | 24 | Parameters 25 | ---------- 26 | observed_dist : tuple, optional 27 | Parameters for the uniform distribution that the observed 28 | covariates follow, by default (0, 1) 29 | observed_params_dist : tuple, optional 30 | Parameters for the uniform distribution that the observed 31 | covariates model parameters follow, by default (0, 10) 32 | 
unobserved_dist : tuple, optional
33 |             Parameters for the uniform distribution that the unobserved
34 |             covariates follow, by default (0, 1)
35 |         unobserved_params_dist : tuple, optional
36 |             Parameters for the uniform distribution that the unobserved
37 |             covariates model parameters follow, by default (0, 10)
38 |         effect_dist : tuple, optional
39 |             Parameters for the uniform distribution that the treatment
40 |             effect follows, by default (0, 20)
41 |         shocks_dist : tuple, optional
42 |             Parameters for the normal distribution that the shocks follow, by default (0, 1)
43 |         seed : int, optional
44 |             Random number generator seed, by default None
45 |         rng : numpy.random.Generator, optional
46 |             Provide a numpy random number generator, by default None
47 |         """
48 |         self.observed_dist = observed_dist
49 |         self.observed_params_dist = observed_params_dist
50 |         self.unobserved_dist = unobserved_dist
51 |         self.unobserved_params_dist = unobserved_params_dist
52 |         self.effect_dist = effect_dist
53 |         self.shocks_dist = shocks_dist
54 |         self.seed = seed
55 |         self.rng = rng
56 | 
57 |     def generate(
58 |         self,
59 |         n_units: int,
60 |         n_observable: int,
61 |         n_unobservable: int,
62 |         n_periods_pre: int,
63 |         n_periods_post: int,
64 |     ) -> tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
65 |         """Generate the matrices (:math:`X_0`, :math:`X_1`, :math:`Z_0`,
66 |         :math:`Z_1`) that can be used as input to a synthetic control
67 |         method (using the notation of Abadie & Gardeazabal :cite:`basque2003`).
68 | 
69 |         Parameters
70 |         ----------
71 |         n_units : int
72 |             Number of units in the model
73 |         n_observable : int
74 |             Number of observable covariates in the model
75 |         n_unobservable : int
76 |             Number of unobservable covariates in the model
77 |         n_periods_pre : int
78 |             Number of time periods prior to the intervention
79 |         n_periods_post : int
80 |             Number of time periods post the intervention
81 | 
82 |         Returns
83 |         -------
84 |         tuple[pandas.DataFrame, pandas.Series, pandas.DataFrame, pandas.Series]
85 |             Returns a tuple of 4 pandas objects: :math:`X_0` a pandas DataFrame
86 |             of shape (n_observable, n_units - 1), :math:`X_1` a
87 |             pandas Series of shape (n_observable, 1), :math:`Z_0` a pandas
88 |             DataFrame of shape (n_periods_pre + n_periods_post, n_units - 1),
89 |             :math:`Z_1` a pandas Series of shape (n_periods_pre + n_periods_post, 1).
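
        Example
        -------
        A minimal sketch of drawing one simulated study; the sizes used
        below are arbitrary illustrative choices:

        >>> from pysyncon.generator import LinearFactorModel
        >>> lfm = LinearFactorModel(seed=1234)
        >>> X0, X1, Z0, Z1 = lfm.generate(
        ...     n_units=11, n_observable=4, n_unobservable=4,
        ...     n_periods_pre=60, n_periods_post=20,
        ... )
        >>> Z0.shape, Z1.shape
        ((80, 10), (80,))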
90 |         """
91 |         rng = self.rng if self.rng is not None else np.random.default_rng(seed=self.seed)
92 | 
93 |         n_periods = n_periods_pre + n_periods_post
94 | 
95 |         delta = rng.uniform(*self.effect_dist, size=n_periods).reshape(-1, 1)
96 |         delta = np.column_stack([delta] * n_units)
97 | 
98 |         Z = rng.uniform(*self.observed_dist, size=(n_observable, n_units))
99 |         mu = rng.uniform(*self.unobserved_dist, size=(n_unobservable, n_units))
100 |         theta = rng.uniform(*self.observed_params_dist, size=(n_observable, n_periods))
101 |         lambda_ = rng.uniform(
102 |             *self.unobserved_params_dist, size=(n_unobservable, n_periods)
103 |         )
104 |         epsilon = rng.normal(*self.shocks_dist, size=(n_periods, n_units))
105 | 
106 |         Y_N = theta.T @ Z + lambda_.T @ mu + epsilon
107 |         Y_I = Y_N + delta
108 | 
109 |         X0 = pd.DataFrame(
110 |             data=Z[:, 1:],
111 |             columns=range(2, n_units + 1),
112 |             index=[f"observable{i}" for i in range(1, n_observable + 1)],
113 |         )
114 |         X1 = pd.Series(
115 |             data=Z[:, 0],
116 |             name=1,
117 |             index=[f"observable{i}" for i in range(1, n_observable + 1)],
118 |         )
119 |         Z0 = pd.DataFrame(
120 |             data=Y_N[:, 1:],
121 |             columns=range(2, n_units + 1),
122 |             index=range(1, n_periods + 1),
123 |         )
124 |         Z1 = pd.Series(
125 |             data=np.concatenate(
126 |                 [Y_N[:n_periods_pre, 0], Y_I[n_periods_pre:, 0]], axis=0
127 |             ),
128 |             name=1,
129 |             index=range(1, n_periods + 1),
130 |         )
131 |         return X0, X1, Z0, Z1
132 | 
--------------------------------------------------------------------------------
/pysyncon/inference.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Optional, Callable, Literal
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from pysyncon.base import BaseSynth
8 | 
9 | 
10 | class ConformalInference:
11 |     """Implementation of the conformal inference based confidence intervals
12 |     following Chernozhukov et al. :cite:`inference2021`
13 |     """
14 | 
15 |     def __init__(self) -> None:
16 |         pass
17 | 
18 |     def confidence_intervals(
19 |         self,
20 |         alpha: float,
21 |         scm: BaseSynth,
22 |         Z0: pd.DataFrame,
23 |         Z1: pd.Series,
24 |         pre_periods: list,
25 |         post_periods: list,
26 |         tol: float = 0.1,
27 |         max_iter: int = 50,
28 |         step_sz: Optional[float] = None,
29 |         step_sz_div: float = 20.0,
30 |         verbose: bool = True,
31 |         scm_fit_args: dict = {},
32 |     ) -> pd.DataFrame:
33 |         """Confidence intervals obtained from test-inversion, where
34 |         the p-values are obtained by adjusted re-fits of the data
35 |         following Chernozhukov et al. :cite:`inference2021`.
36 | 
37 |         Parameters
38 |         ----------
39 |         alpha : float
40 |             The required significance level, e.g. alpha = 0.05 will
41 |             yield a confidence level of 100 * (1 - alpha) = 95%.
42 |         scm : BaseSynth
43 |             The synth object to calculate the confidence interval for.
44 |         Z0 : pandas.DataFrame, shape (n, c)
45 |             A matrix of the time series of the outcome variable with each
46 |             column corresponding to a control unit and the rows are the time
47 |             steps.
48 |         Z1 : pd.Series
49 |             Column vector giving the outcome variable values over time for the
50 |             treated unit.
51 |         tol : float, optional
52 |             The tolerance (accuracy) required when calculating the
53 |             lower/upper cut-off point of the confidence interval. The search
54 |             will try to obtain this tolerance level but will not exceed `max_iter`
55 |             iterations trying to achieve that, by default 0.1.
56 |         pre_periods : list
57 |             The time-periods to use for the optimization when refitting the
58 |             data with the adjusted outcomes.
59 | post_periods : list 60 | The time-periods to calculate confidence intervals for. 61 | max_iter : int, optional 62 | Maximum number of times to re-fit the data when trying to locate 63 | the lower/upper cut-off point, by default 50 64 | step_sz : Optional[float], optional 65 | Step size to use when searching for an interval that contains the 66 | lower or upper cut-off point of the confidence interval, by default None 67 | step_sz_div : float, optional 68 | Alternative way to define step size: it is the fraction that defines 69 | step-size in terms of the standard deviation of the att, i.e. if 70 | `step_sz_div=20.0` then the step size used will be (att +/- 2.5 * std(att)) / 20.0, 71 | by default 20.0 72 | verbose : bool, optional 73 | Print output, by default True 74 | scm_fit_args : dict, optional 75 | A dictionary defining anything extra that should be provided to the 76 | synthetic control object `fit` method when doing the refits, by default {} 77 | 78 | Returns 79 | ------- 80 | pd.DataFrame 81 | A pandas.DataFrame indexed by `post_periods`, with 3 columns: `value` that 82 | gives the calculated treatment effect, `lower_ci` that gives the value 83 | defining the lower-end of the confidence interval, `upper_ci` that gives 84 | the value defining the upper-end of the confidence interval. 85 | 86 | Raises 87 | ------ 88 | TypeError 89 | if `alpha` is not a float 90 | ValueError 91 | if `alpha` is not in the open interval (0, 1). 92 | TypeError 93 | if `max_iter` is not an integer 94 | ValueError 95 | if `max_iter` is not at least 1 96 | TypeError 97 | if `tol` is not a float 98 | ValueError 99 | if `tol` is less than 0.0 100 | TypeError 101 | if `step_sz` is not a float 102 | ValueError 103 | if `step_sz` is not greater than 0.0 104 | TypeError 105 | if `step_sz_div` is not a float 106 | ValueError 107 | if `step_sz_div` is not greater than 0.0 108 | """ 109 | if not isinstance(alpha, float): 110 | raise TypeError("`alpha` must be a float") 111 | elif not 0.0 < alpha < 1.0: 112 | raise ValueError("`alpha` must be greater than 0.0 and less than 1.0") 113 | if not isinstance(max_iter, int): 114 | raise TypeError("`max_iter` must be an integer") 115 | elif max_iter < 1: 116 | raise ValueError("`max_iter` must be at least 1") 117 | if not isinstance(tol, float): 118 | raise TypeError("`tol` must be a float") 119 | elif tol <= 0.0: 120 | raise ValueError("`tol` must be greater than 0.0") 121 | if step_sz != None: 122 | if not isinstance(step_sz, float): 123 | raise TypeError("`step_sz` should be a float") 124 | elif step_sz <= 0.0: 125 | raise ValueError("`step_sz` should be greater than 0.0") 126 | elif step_sz <= tol: 127 | raise ValueError("`step_sz` must be greater than `tol`.") 128 | if not isinstance(step_sz_div, float): 129 | raise TypeError("`step_sz_div` must be a float") 130 | elif step_sz_div <= 0.0: 131 | raise ValueError("`step_sz_div` must be greater than 0.0") 132 | if scm.W is None: 133 | raise ValueError("No weight matrix available; fit data first.") 134 | 135 | gaps = scm._gaps(Z0=Z0, Z1=Z1) 136 | if step_sz is None: 137 | # Try to guess a step-size 138 | if len(post_periods) > 1: 139 | factor = np.std(gaps.loc[post_periods]) 140 | else: 141 | factor = gaps.loc[post_periods].item() / 2.0 142 | step_sz = 2.5 * factor / step_sz_div 143 | if step_sz <= tol: 144 | # Failed to guess a sensible step-size :( 145 | step_sz = 1.1 * tol 146 | 147 | conf_interval = dict() 148 | n_periods = len(post_periods) 149 | for idx, post_period in enumerate(post_periods, 1): 150 | if 
verbose: 151 | print( 152 | f"({idx}/{n_periods}) Calculating confidence interval " 153 | f"for time-period t={post_period}..." 154 | ) 155 | new_time_range = pre_periods + [post_period] 156 | Z0_new, Z1_new = Z0.loc[new_time_range], Z1.loc[new_time_range] 157 | Z1_post_orig = Z1_new.loc[post_period].item() 158 | 159 | def _compute_p_value(g): 160 | Z1_new.loc[post_period] = Z1_post_orig - g 161 | scm.fit(Z0=Z0_new, Z1=Z1_new, **scm_fit_args) 162 | _gaps = scm._gaps(Z0=Z0_new, Z1=Z1_new) 163 | 164 | u_hat = _gaps.loc[new_time_range] 165 | u_hat_post = u_hat.loc[post_period] 166 | return np.mean(abs(u_hat) >= abs(u_hat_post)) 167 | 168 | lower_ci = self._root_search( 169 | fn=lambda x: _compute_p_value(x) - alpha, 170 | x0=gaps.loc[post_period], 171 | direction=-1.0, 172 | tol=tol, 173 | step_sz=step_sz, 174 | max_iter=max_iter, 175 | ) 176 | 177 | upper_ci = self._root_search( 178 | fn=lambda x: _compute_p_value(x) - alpha, 179 | x0=gaps.loc[post_period], 180 | direction=1.0, 181 | tol=tol, 182 | step_sz=step_sz, 183 | max_iter=max_iter, 184 | ) 185 | 186 | conf_interval[post_period] = (lower_ci, upper_ci) 187 | if verbose: 188 | print( 189 | f"\t{100 * (1 - alpha)}% CI: [{round(lower_ci, 3)}, {round(upper_ci, 3)}]" 190 | ) 191 | 192 | df_ci = pd.DataFrame.from_dict( 193 | conf_interval, orient="index", columns=["lower_ci", "upper_ci"] 194 | ) 195 | df_ci = pd.concat([gaps.loc[post_periods].rename("value"), df_ci], axis=1) 196 | df_ci.index.name = "time" 197 | return df_ci 198 | 199 | def _root_search( 200 | self, 201 | fn: Callable, 202 | x0: float, 203 | direction: Literal[+1, -1], 204 | tol: float, 205 | step_sz: float, 206 | max_iter: int, 207 | theta: float = 0.75, 208 | phi: float = 1.3, 209 | ) -> float: 210 | """Search for a root 211 | 212 | Parameters 213 | ---------- 214 | fn : callable 215 | Function to find a root of 216 | x0 : float 217 | Starting point 218 | direction : int 219 | Direction, either -1.0 or +1.0. 220 | tol : float 221 | Tolerance 222 | step_sz : float 223 | Step size in the search 224 | max_iter : int 225 | Maximum number of iterations 226 | theta : float, optional 227 | Step size reduction factor, should be positive and < 1.0, by default 0.75 228 | phi : float, optional 229 | Step size increase factor, should be positive and > 1.0, by default 1.3 230 | 231 | Returns 232 | ------- 233 | float 234 | Root of the function 235 | 236 | Raises 237 | ------ 238 | Exception 239 | if `max_iter` iterations exceeded before satisfying tolerance condition. 240 | 241 | :meta private: 242 | """ 243 | x, gamma = x0, step_sz 244 | for _ in range(max_iter): 245 | if gamma <= tol: 246 | return x 247 | y = fn(x + gamma * direction) 248 | if y > 0.0: 249 | x = x + gamma * direction 250 | gamma = phi * gamma 251 | else: 252 | gamma = theta * gamma 253 | raise Exception( 254 | "Exceeded `max_iter` iterations without satisfying tolerance requirement." 
255 | ) 256 | -------------------------------------------------------------------------------- /pysyncon/penalized.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Literal, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.optimize import minimize, Bounds, LinearConstraint 7 | 8 | from .dataprep import Dataprep 9 | from .base import BaseSynth 10 | 11 | 12 | class PenalizedOptimMixin: 13 | @staticmethod 14 | def w_optimize( 15 | V_mat: np.ndarray, 16 | X0: np.ndarray, 17 | X1: np.ndarray, 18 | lambda_: float, 19 | qp_method: Literal["SLSQP"] = "SLSQP", 20 | qp_options: dict = {"maxiter": 1000}, 21 | ) -> tuple[np.ndarray, float]: 22 | """Solves the weight optimisation problem in the penalized setting, 23 | see Abadie & L'Hour :cite:`penalized2021`. 24 | 25 | Parameters 26 | ---------- 27 | V_mat : numpy.ndarray, shape (c, c) 28 | The V matrix (using the notation of the Abadie, Diamond & 29 | Hainmueller paper, this matrix is denoted by Γ in the Abadie and 30 | L'Hour paper). 31 | X0 : numpy.ndarray, shape (c, m) 32 | Matrix with each column corresponding to a control unit and each 33 | row is covariates. 34 | X1 : numpy.ndarray, shape (c,) 35 | Column vector giving the covariate values for the treated unit. 36 | lambda_ : float, 37 | Regularization parameter. 38 | qp_method : str, optional 39 | Minimization routine to use in scipy minimize to solve the problem 40 | , by default "SLSQP" 41 | qp_options : dict, optional 42 | Options for scipy minimize, by default {"maxiter": 1000} 43 | 44 | Returns 45 | ------- 46 | tuple[np.ndarray, float] 47 | tuple of the optimal weights and the loss 48 | 49 | :meta private: 50 | """ 51 | _, n_c = X0.shape 52 | 53 | diff = np.subtract(X0, X1.reshape(-1, 1)) 54 | r = np.diag(diff.T @ V_mat @ diff) 55 | 56 | P = X0.T @ V_mat @ X0 57 | q = -1.0 * X1.T @ V_mat @ X0 + (lambda_ / 2.0) * r.T 58 | 59 | def fun(x): 60 | return q.T @ x + 0.5 * x.T @ P @ x 61 | 62 | bounds = Bounds(lb=np.full(n_c, 0.0), ub=np.full(n_c, 1.0)) 63 | constraints = LinearConstraint(A=np.full(n_c, 1.0), lb=1.0, ub=1.0) 64 | 65 | x0 = np.full(n_c, 1 / n_c) 66 | res = minimize( 67 | fun=fun, 68 | x0=x0, 69 | bounds=bounds, 70 | constraints=constraints, 71 | method=qp_method, 72 | options=qp_options, 73 | ) 74 | W, loss_W = res["x"], res["fun"] 75 | return W, loss_W.item() 76 | 77 | 78 | class PenalizedSynth(BaseSynth, PenalizedOptimMixin): 79 | """Implementation of the penalized synthetic control method due to 80 | Abadie & L'Hour :cite:`penalized2021`. 81 | """ 82 | 83 | def __init__(self) -> None: 84 | super().__init__() 85 | self.loss_W: Optional[float] = None 86 | self.lambda_: Optional[float] = None 87 | 88 | def fit( 89 | self, 90 | dataprep: Optional[Dataprep] = None, 91 | X0: Optional[pd.DataFrame] = None, 92 | X1: Optional[Union[pd.Series, pd.DataFrame]] = None, 93 | lambda_: Optional[float] = 0.01, 94 | custom_V: Optional[np.ndarray] = None, 95 | ) -> None: 96 | """Fit the model/calculate the weights. 97 | 98 | Parameters 99 | ---------- 100 | dataprep : Dataprep, optional 101 | :class:`Dataprep` object containing data to model, by default None. 102 | X0 : pd.DataFrame, shape (c, m), optional 103 | Matrix with each column corresponding to a control unit and each 104 | row is a covariate value, by default None. 105 | X1 : pandas.Series, shape (c, 1), optional 106 | Column vector giving the covariate values for the treated unit, by 107 | default None. 
108 | lambda_ : float, optional 109 | Ridge parameter to use, default 0.01 110 | custom_V : numpy.ndarray, shape (c, c), optional 111 | Provide a V matrix (using the notation of the Abadie, Diamond & 112 | Hainmueller paper, this matrix is denoted by Γ in the Abadie and 113 | L'Hour paper), if not provided then the identity matrix is used 114 | (equal importance to all covariates). 115 | 116 | Returns 117 | ------- 118 | NoneType 119 | None 120 | 121 | Raises 122 | ------ 123 | ValueError 124 | if neither a Dataprep object nor all of (X0, X1) are 125 | supplied. 126 | """ 127 | if dataprep: 128 | if ( 129 | isinstance(dataprep.treatment_identifier, (list, tuple)) 130 | and len(dataprep.treatment_identifier) > 1 131 | ): 132 | raise ValueError("PenalizedSynth requires exactly one treated unit.") 133 | self.dataprep = dataprep 134 | X0, X1 = dataprep.make_covariate_mats() 135 | else: 136 | if X0 is None or X1 is None: 137 | raise ValueError("dataprep must be set or (X0, X1) must all be set.") 138 | if not isinstance(X1, pd.Series): 139 | raise TypeError("X1 must be of type `pandas.Series`.") 140 | self.lambda_ = lambda_ 141 | 142 | X = pd.concat([X0, X1], axis=1) 143 | X_scaled = X.divide(X.std(axis=1), axis=0) 144 | X0_scaled, X1_scaled = X_scaled.drop(columns=X1.name), X_scaled[X1.name] 145 | 146 | X0_arr = X0_scaled.to_numpy() 147 | X1_arr = X1_scaled.to_numpy() 148 | 149 | if custom_V is None: 150 | V_mat = np.identity(X0_arr.shape[0]) 151 | else: 152 | V_mat = np.diag(custom_V) 153 | 154 | W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr, lambda_=lambda_) 155 | self.W, self.loss_W = W, loss_W 156 | -------------------------------------------------------------------------------- /pysyncon/robust.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional 3 | 4 | import numpy as np 5 | 6 | from .dataprep import Dataprep 7 | from .base import BaseSynth 8 | 9 | 10 | class RobustSynth(BaseSynth): 11 | """Implementation of the robust synthetic control method due to 12 | Amjad, Shah & Shen :cite:`robust2018`. 13 | """ 14 | 15 | def __init__(self) -> None: 16 | super().__init__() 17 | self.W: Optional[np.ndarray] = None 18 | self.lambda_: Optional[float] = None 19 | 20 | def fit( 21 | self, 22 | dataprep: Dataprep, 23 | lambda_: float, 24 | threshold: Optional[float] = None, 25 | sv_count: Optional[int] = None, 26 | ) -> None: 27 | """Fit the model/calculate the weights. 28 | 29 | Parameters 30 | ---------- 31 | dataprep : Dataprep 32 | :class:`Dataprep` object containing data to model. 33 | lambda_ : float 34 | Ridge parameter to use. 35 | threshold: float, optional 36 | Remove singular values that are less than this threshold. 
37 | sv_count: int, optional 38 | Keep this many of the largest singular values when 39 | reducing the outcome matrix 40 | """ 41 | if ( 42 | isinstance(dataprep.treatment_identifier, (list, tuple)) 43 | and len(dataprep.treatment_identifier) > 1 44 | ): 45 | raise ValueError("RobustSynth requires exactly one treated unit.") 46 | self.dataprep = dataprep 47 | time_period_min = dataprep.foo[dataprep.time_variable].astype("int").min() 48 | time_period_max = dataprep.foo[dataprep.time_variable].astype("int").max() 49 | 50 | X0, X1 = dataprep.make_outcome_mats( 51 | time_period=range(time_period_min, 1 + time_period_max) 52 | ) 53 | Y = X0.T.values 54 | 55 | M_hat = self._sv_decomposition(Y, threshold, sv_count).T 56 | 57 | time_optim_end = 1 + dataprep.time_optimize_ssr[-1] 58 | end_idx = X0.index.to_list().index(time_optim_end) 59 | M_hat_neg = M_hat[:end_idx, :] 60 | Y1_neg = X1.to_numpy()[:end_idx] 61 | 62 | self.W = np.matmul( 63 | np.linalg.inv( 64 | M_hat_neg.T @ M_hat_neg + lambda_ * np.identity(M_hat_neg.shape[1]) 65 | ), 66 | M_hat_neg.T @ Y1_neg, 67 | ) 68 | 69 | def _sv_decomposition( 70 | self, 71 | Y: np.ndarray, 72 | threshold: Optional[float] = None, 73 | sv_count: Optional[int] = None, 74 | ) -> np.ndarray: 75 | """Calculate the :math:`\hat{M}` matrix from the paper (see :cite:`robust2018`) by 76 | carrying out an SVD of the outcome matrix and remove the specified number 77 | of singular values. 78 | 79 | Parameters 80 | ---------- 81 | Y : np.ndarray 82 | The outcome matrix (:math:`Y` matrix in the notation of the paper). 83 | threshold : Optional[float], optional 84 | Remove singular values that are less that `threshold`, 85 | either this must be specified or `sv_count`, by default None 86 | sv_count : Optional[int], optional 87 | Keep this many of the largest singular values, 88 | either this must be specified or `threshold`, by default None 89 | 90 | Returns 91 | ------- 92 | np.ndarray 93 | The :math:`\hat{M}` matrix from the paper (see :cite:`robust2018`). 94 | 95 | Raises 96 | ------ 97 | ValueError 98 | If neither `threshold` nor `sv_count` are supplied. 99 | 100 | :meta private: 101 | """ 102 | if not threshold and not sv_count: 103 | raise ValueError("One of `threshold` or `sv_count` must be supplied.") 104 | u, s, v = np.linalg.svd(Y) 105 | s_shape = s.shape[0] - 1 106 | if threshold: 107 | idx = 0 108 | while s[idx] > threshold and idx < s_shape: 109 | idx += 1 110 | else: 111 | idx = sv_count 112 | 113 | s_res = np.zeros_like(Y) 114 | s_res[:idx, :idx] = np.diag(s[:idx]) 115 | 116 | r, c = Y.shape 117 | p_hat = max(np.count_nonzero(Y) / (r * c), 1 / (r * c)) 118 | M_hat = (1 / p_hat) * (u @ s_res @ v) 119 | return M_hat 120 | -------------------------------------------------------------------------------- /pysyncon/synth.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Union, Optional, Literal 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from scipy.optimize import minimize 7 | 8 | from .dataprep import Dataprep 9 | from .base import BaseSynth, VanillaOptimMixin 10 | from .inference import ConformalInference 11 | 12 | 13 | OptimizerMethod_t = Literal[ 14 | "Nelder-Mead", "Powell", "CG", "BFGS", "L-BFGS-B", "TNC", "COBYLA", "trust-constr" 15 | ] 16 | 17 | 18 | class Synth(BaseSynth, VanillaOptimMixin): 19 | """Implementation of the synthetic control method due to 20 | Abadie & Gardeazabal :cite:`basque2003`. 
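
    Example
    -------
    A sketch of the typical workflow; ``dataprep`` stands for a
    :class:`Dataprep` instance describing the study data (see the
    :class:`Dataprep` documentation for how to build one):

    >>> from pysyncon import Synth
    >>> synth = Synth()
    >>> synth.fit(dataprep=dataprep)
    >>> synth.summary(round=3)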
21 | """ 22 | 23 | def __init__(self) -> None: 24 | super().__init__() 25 | self.loss_W: Optional[float] = None 26 | self.V: Optional[np.ndarray] = None 27 | self.loss_V: Optional[float] = None 28 | 29 | def fit( 30 | self, 31 | dataprep: Optional[Dataprep] = None, 32 | X0: Optional[pd.DataFrame] = None, 33 | X1: Optional[pd.Series] = None, 34 | Z0: Optional[pd.DataFrame] = None, 35 | Z1: Optional[pd.Series] = None, 36 | custom_V: Optional[np.ndarray] = None, 37 | optim_method: OptimizerMethod_t = "Nelder-Mead", 38 | optim_initial: Literal["equal", "ols"] = "equal", 39 | optim_options: dict = {"maxiter": 1000}, 40 | ) -> None: 41 | """Fit the model/calculate the weights. Either a :class:`Dataprep` object 42 | should be provided or otherwise matrices (:math:`X_0`, :math:`X_1`, :math:`Z_0`, 43 | :math:`Z_1`) should be provided (using the notation of Abadie & 44 | Gardeazabal :cite:`basque2003`). 45 | 46 | Parameters 47 | ---------- 48 | dataprep : Dataprep, optional 49 | :class:`Dataprep` object containing data to model, by default None. 50 | X0 : pd.DataFrame, shape (m, c), optional 51 | Matrix with each column corresponding to a control unit and each 52 | row is covariates, by default None. 53 | X1 : pandas.Series, shape (m, 1), optional 54 | Column vector giving the covariate values for the treated unit, by 55 | default None. 56 | Z0 : pandas.DataFrame, shape (n, c), optional 57 | A matrix of the time series of the outcome variable with each 58 | column corresponding to a control unit and the rows are the time 59 | steps; the columns correspond with the columns of X0, by default 60 | None. 61 | Z1 : pandas.Series, shape (n, 1), optional 62 | Column vector giving the outcome variable values over time for the 63 | treated unit, by default None. 64 | custom_V : numpy.ndarray, shape (c, c), optional 65 | Provide a V matrix (using the notation of the Abadie, Diamond & 66 | Hainmueller paper), the optimisation problem will only then be 67 | solved for the weight matrix W, by default None. 68 | optim_method : str, optional 69 | Optimisation method to use for the outer optimisation, can be 70 | any of the valid options for scipy minimize that do not require a 71 | jacobian matrix, namely 72 | 73 | - 'Nelder-Mead' 74 | - 'Powell' 75 | - 'CG' 76 | - 'BFGS' 77 | - 'L-BFGS-B' 78 | - 'TNC' 79 | - 'COBYLA' 80 | - 'trust-constr' 81 | 82 | By default 'Nelder-Mead'. 83 | optim_initial : str, optional 84 | Starting value for the outer optimisation, possible starting 85 | values are 86 | 87 | - 'equal', where the weights are all equal, 88 | - 'ols', which uses a starting value obtained for fitting a 89 | regression. 90 | 91 | By default 'equal'. 92 | optim_options : dict, optional 93 | options to provide to the outer part of the optimisation, value 94 | options are any option that can be provided to scipy minimize for 95 | the given optimisation method, by default `{'maxiter': 1000}`. 96 | 97 | Returns 98 | ------- 99 | NoneType 100 | None 101 | 102 | Raises 103 | ------ 104 | ValueError 105 | if neither a `Dataprep` object nor all of (:math:`X_0`, :math:`X_1`, 106 | :math:`Z_0`, :math:`Z_1`) are supplied. 107 | TypeError 108 | if (:math:`X1`, :math:`Z1`) are not of type `pandas.Series`. 109 | ValueError 110 | if `optim_initial=ols` and there is collinearity in the data. 111 | ValueError 112 | if `optim_initial` is not one of `'equal'` or `'ols'`. 
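
        Example
        -------
        A sketch of fitting from raw matrices rather than a :class:`Dataprep`
        object, using data simulated with the package's
        :class:`pysyncon.generator.LinearFactorModel` (the sizes are arbitrary
        illustrative choices; only the 60 pre-intervention outcome periods are
        passed as Z0, Z1):

        >>> from pysyncon import Synth
        >>> from pysyncon.generator import LinearFactorModel
        >>> X0, X1, Z0, Z1 = LinearFactorModel(seed=1234).generate(
        ...     n_units=11, n_observable=4, n_unobservable=4,
        ...     n_periods_pre=60, n_periods_post=20,
        ... )
        >>> synth = Synth()
        >>> synth.fit(X0=X0, X1=X1, Z0=Z0.loc[:60], Z1=Z1.loc[:60])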
113 | """ 114 | if dataprep: 115 | if ( 116 | isinstance(dataprep.treatment_identifier, (list, tuple)) 117 | and len(dataprep.treatment_identifier) > 1 118 | ): 119 | raise ValueError("Synth requires exactly one treated unit.") 120 | self.dataprep = dataprep 121 | X0, X1 = dataprep.make_covariate_mats() 122 | Z0, Z1 = dataprep.make_outcome_mats() 123 | else: 124 | if X0 is None or X1 is None or Z0 is None or Z1 is None: 125 | raise ValueError( 126 | "dataprep must be set or (X0, X1, Z0, Z1) must all be set." 127 | ) 128 | if not isinstance(X1, pd.Series) or not isinstance(Z1, pd.Series): 129 | raise TypeError("X1 and Z1 must be of type `pandas.Series`.") 130 | 131 | X = pd.concat([X0, X1], axis=1) 132 | X_scaled = X.divide(X.std(axis=1), axis=0) 133 | X0_scaled, X1_scaled = X_scaled.drop(columns=X1.name), X_scaled[X1.name] 134 | 135 | X0_arr = X0_scaled.to_numpy() 136 | X1_arr = X1_scaled.to_numpy() 137 | Z0_arr = Z0.to_numpy() 138 | Z1_arr = Z1.to_numpy() 139 | 140 | if custom_V is not None: 141 | V_mat = np.diag(custom_V) 142 | W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr) 143 | loss_V = self.calc_loss_V(W=W, Z0=Z0_arr, Z1=Z1_arr) 144 | self.W, self.loss_W, self.V, self.loss_V = W, loss_W, custom_V, loss_V 145 | return 146 | 147 | n_r, _ = X0_arr.shape 148 | 149 | if optim_initial == "equal": 150 | x0 = [1 / n_r] * n_r 151 | elif optim_initial == "ols": 152 | X_arr = np.hstack([X0_arr, X1_arr.reshape(-1, 1)]) 153 | X_arr = np.hstack([np.full((X_arr.shape[1], 1), 1), X_arr.T]) 154 | Z_arr = np.hstack([Z0_arr, Z1_arr.reshape(-1, 1)]) 155 | 156 | try: 157 | beta = np.linalg.inv(X_arr.T @ X_arr) @ X_arr.T @ Z_arr.T 158 | except np.linalg.LinAlgError: 159 | raise ValueError( 160 | 'Could not invert X^T.X required for `optim_initial="ols"`, ' 161 | "probably there is collinearity in your data." 162 | ) 163 | 164 | beta = beta[1:,] # fmt: skip 165 | x0 = np.diag(beta @ beta.T) 166 | x0 = x0 / sum(x0) 167 | else: 168 | raise ValueError("Unknown option for `optim_initial`.") 169 | 170 | def fun(x): 171 | V_mat = np.diag(np.abs(x)) / np.sum(np.abs(x)) 172 | W, _ = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr) 173 | loss_V = self.calc_loss_V(W=W, Z0=Z0_arr, Z1=Z1_arr) 174 | return loss_V 175 | 176 | res = minimize(fun=fun, x0=x0, method=optim_method, options=optim_options) 177 | V_mat = np.diag(np.abs(res["x"])) / np.sum(np.abs(res["x"])) 178 | W, loss_W = self.w_optimize(V_mat=V_mat, X0=X0_arr, X1=X1_arr) 179 | loss_V = self.calc_loss_V(W=W, Z0=Z0_arr, Z1=Z1_arr) 180 | 181 | self.W, self.loss_W, self.V, self.loss_V = W, loss_W, V_mat.diagonal(), loss_V 182 | self.W_names = Z0.columns 183 | 184 | @staticmethod 185 | def calc_loss_V(W: np.ndarray, Z0: np.ndarray, Z1: np.ndarray) -> float: 186 | """Calculates the V loss. 187 | 188 | Parameters 189 | ---------- 190 | W : numpy.ndarray, shape (n,) 191 | Vector of the control weights 192 | Z0 : numpy.ndarray, shape (m, n) 193 | Matrix of the time series of the outcome variable with each 194 | column corresponding to a control unit and the rows are the time 195 | steps. 196 | Z1 : numpy.ndarray, shape (m,) 197 | Column vector giving the outcome variable values over time for the 198 | treated unit 199 | 200 | Returns 201 | ------- 202 | float 203 | V loss. 
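
            In symbols, this is

            .. math::
                L(V) = \frac{(Z_1 - Z_0 W)^{\top} (Z_1 - Z_0 W)}{m}

            where :math:`m` is the number of time periods (rows of
            :math:`Z_0`), matching the computation
            ``(Z1 - Z0 @ W).T @ (Z1 - Z0 @ W) / len(Z0)`` below.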
204 | 
205 |         :meta private:
206 |         """
207 |         loss_V = (Z1 - Z0 @ W).T @ (Z1 - Z0 @ W) / len(Z0)
208 |         return loss_V.item()
209 | 
210 |     def summary(
211 |         self,
212 |         round: int = 3,
213 |         X0: Optional[pd.DataFrame] = None,
214 |         X1: Optional[pd.Series] = None,
215 |     ) -> pd.DataFrame:
216 |         """Generates a ``pandas.DataFrame`` with summary data. In particular,
217 |         it shows the values of the V matrix for each predictor, then the mean
218 |         value of each predictor over the time period ``time_predictors_prior``
219 |         for the treated unit and for the synthetic unit, and finally a column
220 |         'sample mean' that shows the mean value of each predictor over the
221 |         time period ``time_predictors_prior`` across all the control units,
222 |         i.e. this will be the same as a synthetic control where all the
223 |         weights are equal.
224 | 
225 |         Parameters
226 |         ----------
227 |         round : int, optional
228 |             Round the numbers to the given number of places, by default 3
229 |         X0 : pd.DataFrame, shape (n_cov, n_controls), optional
230 |             Matrix with each column corresponding to a control unit and each
231 |             row is a covariate. If no dataprep is set, then this must be
232 |             supplied along with X1, by default None.
233 |         X1 : pandas.Series, shape (n_cov, 1), optional
234 |             Column vector giving the covariate values for the treated unit.
235 |             If no dataprep is set, then this must be supplied along with X0,
236 |             by default None.
237 | 
238 |         Returns
239 |         -------
240 |         pandas.DataFrame
241 |             Summary data.
242 | 
243 |         Raises
244 |         ------
245 |         ValueError
246 |             If there is no V matrix available
247 |         ValueError
248 |             If there is no :class:`Dataprep` object set or (X0, X1) is not supplied
249 |         ValueError
250 |             If there is no weight matrix available
251 |         """
252 |         if self.V is None:
253 |             raise ValueError("No V matrix available; fit data first.")
254 |         summary_ser = super().summary(round=round, X0=X0, X1=X1)
255 | 
256 |         V = pd.Series(self.V, index=summary_ser.index, name="V")
257 |         return pd.concat([V, summary_ser], axis=1).round(round)
258 | 
259 |     def confidence_interval(
260 |         self,
261 |         alpha: float,
262 |         time_periods: list,
263 |         tol: float,
264 |         pre_periods: Optional[list] = None,
265 |         dataprep: Optional[Dataprep] = None,
266 |         X0: Optional[pd.DataFrame] = None,
267 |         X1: Optional[pd.Series] = None,
268 |         Z0: Optional[pd.DataFrame] = None,
269 |         Z1: Optional[pd.Series] = None,
270 |         custom_V: Optional[np.ndarray] = None,
271 |         optim_method: OptimizerMethod_t = None,
272 |         optim_initial: Literal["equal", "ols"] = None,
273 |         optim_options: dict = None,
274 |         method: Literal["conformal"] = "conformal",
275 |         max_iter: int = 50,
276 |         step_sz: Optional[float] = None,
277 |         step_sz_div: float = 20.0,
278 |         verbose: bool = True,
279 |     ) -> pd.DataFrame:
280 |         """Confidence intervals obtained from test-inversion, where
281 |         the p-values are obtained by adjusted refits of the data
282 |         following Chernozhukov et al. :cite:`inference2021`.
283 | 
284 |         Parameters
285 |         ----------
286 |         alpha : float
287 |             The required significance level, e.g. alpha = 0.05 will
288 |             yield a confidence level of 100 * (1 - alpha) = 95%.
289 |         time_periods : list
290 |             The time-periods to calculate confidence intervals for.
291 |         tol : float
292 |             The tolerance (accuracy) required when calculating the
293 |             lower/upper cut-off point of the confidence interval. The search
294 |             will try to obtain this tolerance level but will not exceed `max_iter`
295 |             iterations trying to achieve that.
296 |         pre_periods : Optional[list], optional
297 |             The time-periods to use for the optimization when refitting the
298 |             data with the adjusted outcomes, by default None.
299 |         dataprep : Optional[Dataprep], optional
300 |             Dataprep object defining the study data, if this is not supplied
301 |             then either self.dataprep must be set or else (X0, X1, Z0, Z1) must
302 |             all be supplied, by default None.
303 |         X0 : pd.DataFrame, shape (m, c), optional
304 |             Matrix with each column corresponding to a control unit and each
305 |             row is a covariate, if this is not supplied then either `dataprep` must
306 |             be supplied or `self.dataprep` must be set, by default None.
307 |         X1 : pandas.Series, shape (m, 1), optional
308 |             Column vector giving the covariate values for the treated unit, if
309 |             this is not supplied then either `dataprep` must
310 |             be supplied or `self.dataprep` must be set, by default None.
311 |         Z0 : pandas.DataFrame, shape (n, c), optional
312 |             A matrix of the time series of the outcome variable with each
313 |             column corresponding to a control unit and the rows are the time
314 |             steps; the columns correspond with the columns of X0, if this
315 |             is not supplied then either `dataprep` must be supplied or
316 |             `self.dataprep` must be set, by default None.
317 |         Z1 : pandas.Series, shape (n, 1), optional
318 |             Column vector giving the outcome variable values over time for the
319 |             treated unit, if this is not supplied then either `dataprep` must
320 |             be supplied or `self.dataprep` must be set, by default None.
321 |         custom_V : numpy.ndarray, shape (c, c), optional
322 |             Provide a V matrix (using the notation of the Abadie, Diamond &
323 |             Hainmueller paper), the optimisation problem will only then be
324 |             solved for the weight matrix W. This is the same argument
325 |             as in the `fit` method, by default None.
326 |         optim_method : str, optional
327 |             Optimisation method to use for the outer optimisation, can be
328 |             any of the valid options for scipy minimize that do not require a
329 |             jacobian matrix, namely
330 | 
331 |             - 'Nelder-Mead'
332 |             - 'Powell'
333 |             - 'CG'
334 |             - 'BFGS'
335 |             - 'L-BFGS-B'
336 |             - 'TNC'
337 |             - 'COBYLA'
338 |             - 'trust-constr'
339 | 
340 |             This is the same argument as in the `fit` method, by default
341 |             'Nelder-Mead'.
342 |         optim_initial : str, optional
343 |             Starting value for the outer optimisation, possible starting
344 |             values are
345 | 
346 |             - 'equal', where the weights are all equal,
347 |             - 'ols', which uses a starting value obtained by fitting a
348 |               regression.
349 | 
350 |             This is the same argument as in the `fit` method, by default
351 |             'equal'.
352 |         optim_options : dict, optional
353 |             Options to provide to the outer part of the optimisation, valid
354 |             options are any option that can be provided to scipy minimize for
355 |             the given optimisation method. This is the same argument as in
356 |             the `fit` method, by default `{'maxiter': 1000}`.
357 |         method : str, optional
358 |             The type of method to use when computing the confidence intervals,
359 |             currently only conformal inference (`conformal`) is implemented,
360 |             by default "conformal".
361 |         max_iter : int, optional
362 |             Maximum number of times to re-fit the data when trying to locate
363 |             the lower/upper cut-off point and when binary searching for the
364 |             cut-off point, by default 50.
365 |         step_sz : Optional[float], optional
366 |             Step size to use when searching for an interval that contains the
367 |             lower or upper cut-off point of the confidence interval, by default None.
368 | step_sz_div : float, optional 369 | Alternative way to define step size: it is the fraction that defines 370 | step-size in terms of the standard deviation of the att, i.e. if 371 | `step_sz_div=20.0` then the step size used will be (att +/- 2.5 * std(att)) / 20.0, 372 | by default 20.0. 373 | verbose : bool, optional 374 | Print output, by default True. 375 | 376 | Returns 377 | ------- 378 | pd.DataFrame 379 | A pandas.DataFrame indexed by `post_periods`, with 3 columns: `value` that 380 | gives the calculated treatment effect, `lower_ci` that gives the value 381 | defining the lower-end of the confidence interval, `upper_ci` that gives 382 | the value defining the upper-end of the confidence interval. 383 | 384 | Raises 385 | ------ 386 | ValueError 387 | If there is no :class:`Dataprep` object set or (X0, X1, Z0, Z1) is not supplied or 388 | `self.dataprep` is not set. 389 | TypeError 390 | if (:math:`X1`, :math:`Z1`) are not of type `pandas.Series`. 391 | ValueError 392 | if `dataprep` is not set and `pre-periods` is not set. 393 | ValueError 394 | if an invalid option for `method` is given, currently only `conformal` is supported. 395 | """ 396 | if method == "conformal": 397 | if dataprep is not None: 398 | X0, X1 = dataprep.make_covariate_mats() 399 | if pre_periods is None: 400 | pre_periods = list(dataprep.time_optimize_ssr) 401 | if 1.0 / len(pre_periods) > alpha: 402 | raise ValueError( 403 | "Too few pre-intervention time-periods available for " 404 | f"significance level `alpha`={alpha}, either increase `alpha` " 405 | "or use more pre-intervention time-periods." 406 | ) 407 | all_time_periods = time_periods + list(pre_periods) 408 | Z0, Z1 = dataprep.make_outcome_mats(time_period=all_time_periods) 409 | elif self.dataprep is not None: 410 | X0, X1 = self.dataprep.make_covariate_mats() 411 | if pre_periods is None: 412 | pre_periods = list(self.dataprep.time_optimize_ssr) 413 | if 1.0 / len(pre_periods) > alpha: 414 | raise ValueError( 415 | "Too few pre-intervention time-periods available for " 416 | f"significance level `alpha`={alpha}, either increase `alpha` " 417 | "or use more pre-intervention time-periods." 418 | ) 419 | all_time_periods = time_periods + list(pre_periods) 420 | Z0, Z1 = self.dataprep.make_outcome_mats(time_period=all_time_periods) 421 | else: 422 | if X0 is None or X1 is None or Z0 is None or Z1 is None: 423 | raise ValueError( 424 | "dataprep must be set or (X0, X1, Z0, Z1) must all be set." 425 | ) 426 | if not isinstance(X1, pd.Series) or not isinstance(Z1, pd.Series): 427 | raise TypeError("X1 and Z1 must be of type `pandas.Series`.") 428 | if pre_periods is None: 429 | raise ValueError("`pre_periods` must be set if not using dataprep.") 430 | if 1.0 / len(pre_periods) > alpha: 431 | raise ValueError( 432 | "Too few pre-intervention time-periods available for " 433 | f"significance level `alpha`={alpha}, either increase `alpha` " 434 | "or use more pre-intervention time-periods." 
435 | ) 436 | 437 | scm_fit_args = {"X0": X0, "X1": X1} 438 | if custom_V is not None: 439 | scm_fit_args["custom_V"] = custom_V 440 | if optim_method: 441 | scm_fit_args["optim_method"] = optim_method 442 | if optim_initial: 443 | scm_fit_args["optim_initial"] = optim_initial 444 | if optim_options: 445 | scm_fit_args["optim_options"] = optim_options 446 | 447 | conformal_inf = ConformalInference() 448 | df_cis = conformal_inf.confidence_intervals( 449 | alpha=alpha, 450 | scm=self, 451 | Z0=Z0, 452 | Z1=Z1, 453 | pre_periods=pre_periods, 454 | post_periods=time_periods, 455 | scm_fit_args=scm_fit_args, 456 | max_iter=max_iter, 457 | tol=tol, 458 | step_sz=step_sz, 459 | step_sz_div=step_sz_div, 460 | verbose=verbose, 461 | ) 462 | return df_cis 463 | else: 464 | raise ValueError("Invalid option for `method`.") 465 | -------------------------------------------------------------------------------- /pysyncon/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Optional, Union 3 | from concurrent import futures 4 | import copy 5 | from dataclasses import dataclass 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | 11 | from .dataprep import Dataprep, IsinArg_t 12 | from .base import BaseSynth 13 | 14 | 15 | class HoldoutSplitter: 16 | """Iterator that prepares the time series for cross-validation by 17 | progressively removing blocks of length `holdout_len`. 18 | """ 19 | 20 | def __init__(self, df: pd.DataFrame, ser: pd.Series, holdout_len: int = 1): 21 | """Iterator that prepares the time series for cross-validation by 22 | progressively removing blocks of length `holdout_len`. 23 | 24 | Parameters 25 | ---------- 26 | df : pandas.DataFrame, shape (r, c) 27 | Dataframe that will be split for the cross-validation. 28 | ser : pandas.Series, shape (r, 1) 29 | Series that will split for the cross-validation. 30 | holdout_len : int, optional 31 | Number of days to remove in each iteration, by default 1. 32 | 33 | Raises 34 | ------ 35 | ValueError 36 | if df and ser do not have the same number of rows. 37 | ValueError 38 | if `holdout_len` is not >= 1. 39 | ValueError 40 | if `holdout_len` is larger than the number of rows of df. 41 | """ 42 | if df.shape[0] != ser.shape[0]: 43 | raise ValueError("`df` and `ser` must have the same number of rows.") 44 | if holdout_len < 1: 45 | raise ValueError("`holdout_len` must be at least 1.") 46 | if holdout_len >= df.shape[0]: 47 | raise ValueError("`holdout_len` must be less than df.shape[0]") 48 | self.df = df 49 | self.ser = ser 50 | self.holdout_len = holdout_len 51 | self.idx = 0 52 | 53 | def __iter__(self): 54 | self.idx = 0 55 | return self 56 | 57 | def __next__(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]: 58 | if (self.idx + self.holdout_len) > self.df.shape[0]: 59 | raise StopIteration 60 | holdout = slice(self.idx, self.idx + self.holdout_len) 61 | 62 | df_holdout = self.df.iloc[holdout,] # fmt: skip 63 | ser_holdout = self.ser.iloc[holdout] 64 | 65 | df = self.df.drop(index=self.df.index[holdout]) 66 | ser = self.ser.drop(index=self.ser.index[holdout]) 67 | 68 | self.idx += 1 69 | return df, df_holdout, ser, ser_holdout 70 | 71 | 72 | @dataclass 73 | class CrossValidationResult: 74 | """Convenience class for holding the results of the cross-validation 75 | procedure from the AugSynth. 
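
    Example
    -------
    An illustrative sketch with made-up numbers (not output from a real
    cross-validation run):

    >>> import numpy as np
    >>> res = CrossValidationResult(
    ...     lambdas=np.array([0.1, 1.0, 10.0]),
    ...     errors_mean=np.array([2.0, 1.0, 1.5]),
    ...     errors_se=np.array([0.2, 0.2, 0.2]),
    ... )
    >>> res.best_lambda()
    1.0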
76 | """ 77 | 78 | lambdas: np.ndarray 79 | errors_mean: np.ndarray 80 | errors_se: np.ndarray 81 | 82 | def best_lambda(self, min_1se: bool = True) -> float: 83 | """Return the best lambda. 84 | 85 | Parameters 86 | ---------- 87 | min_1se : bool, optional 88 | return the largest lambda within 1 standard error of the minimum 89 | , by default True 90 | 91 | Returns 92 | ------- 93 | float 94 | """ 95 | if min_1se: 96 | return ( 97 | self.lambdas[ 98 | self.errors_mean 99 | <= self.errors_mean.min() 100 | + self.errors_se[self.errors_mean.argmin()] 101 | ] 102 | .max() 103 | .item() 104 | ) 105 | return self.lambdas[self.errors_mean.argmin()].item() 106 | 107 | def plot(self) -> None: 108 | """Plots the mean errors against the lambda values with the standard 109 | errors as error bars. 110 | """ 111 | plt.errorbar( 112 | x=self.lambdas, 113 | y=self.errors_mean, 114 | yerr=self.errors_se, 115 | ecolor="black", 116 | capsize=2, 117 | ) 118 | plt.xlabel("Lambda") 119 | plt.ylabel("Mean error") 120 | plt.xscale("log") 121 | plt.yscale("log") 122 | plt.title("Cross validation result") 123 | plt.grid() 124 | plt.show() 125 | 126 | 127 | class PlaceboTest: 128 | """Class that carries out placebo tests by running a synthetic control 129 | study using each possible control unit as the treated unit and the 130 | remaining control units as controls. See :cite:`germany2015` for more details. 131 | """ 132 | 133 | def __init__(self) -> None: 134 | self.paths: Optional[pd.DataFrame] = None 135 | self.treated_path: Optional[pd.DataFrame] = None 136 | self.gaps: Optional[pd.DataFrame] = None 137 | self.treated_gap: Optional[pd.DataFrame] = None 138 | self.time_optimize_ssr: Optional[IsinArg_t] = None 139 | 140 | def fit( 141 | self, 142 | dataprep: Dataprep, 143 | scm: BaseSynth, 144 | scm_options: dict = {}, 145 | max_workers: Optional[int] = None, 146 | verbose: bool = True, 147 | ): 148 | """Run the placebo tests. This method is multi-process and by default 149 | will use all available processors. Use the `max_workers` option to change 150 | this behaviour. 151 | 152 | Parameters 153 | ---------- 154 | dataprep : Dataprep 155 | :class:`Dataprep` object containing data to model, by default None. 
127 | class PlaceboTest: 128 | """Class that carries out placebo tests by running a synthetic control 129 | study using each possible control unit as the treated unit and the 130 | remaining control units as controls. See :cite:`germany2015` for more details. 131 | """ 132 | 133 | def __init__(self) -> None: 134 | self.paths: Optional[pd.DataFrame] = None 135 | self.treated_path: Optional[pd.DataFrame] = None 136 | self.gaps: Optional[pd.DataFrame] = None 137 | self.treated_gap: Optional[pd.DataFrame] = None 138 | self.time_optimize_ssr: Optional[IsinArg_t] = None 139 | 140 | def fit( 141 | self, 142 | dataprep: Dataprep, 143 | scm: BaseSynth, 144 | scm_options: dict = {}, 145 | max_workers: Optional[int] = None, 146 | verbose: bool = True, 147 | ): 148 | """Run the placebo tests. This method is multi-process and by default 149 | will use all available processors. Use the `max_workers` option to change 150 | this behaviour. 151 | 152 | Parameters 153 | ---------- 154 | dataprep : Dataprep 155 | :class:`Dataprep` object containing the data to model. 156 | scm : Synth | AugSynth 157 | Synthetic control study to use 158 | scm_options : dict, optional 159 | Options to provide to the fit method of the synthetic control 160 | study, valid options are any valid option that `scm` 161 | takes, by default {} 162 | max_workers : Optional[int], optional 163 | Maximum number of processes to use, if not provided then will use 164 | all available, by default None 165 | verbose : bool, optional 166 | Whether or not to output progress, by default True 167 | """ 168 | paths, gaps = list(), list() 169 | n_tests = len(dataprep.controls_identifier) 170 | with futures.ProcessPoolExecutor(max_workers=max_workers) as executor: 171 | to_do = list() 172 | for treated, controls in self.placebo_iter(dataprep.controls_identifier): 173 | _dataprep = copy.copy(dataprep) 174 | _dataprep.treatment_identifier = treated 175 | _dataprep.controls_identifier = controls 176 | to_do.append( 177 | executor.submit( 178 | self._single_placebo, 179 | dataprep=_dataprep, 180 | scm=scm, 181 | scm_options=scm_options, 182 | ) 183 | ) 184 | for idx, future in enumerate(futures.as_completed(to_do), 1): 185 | path, gap = future.result() 186 | if verbose: 187 | print(f"({idx}/{n_tests}) Completed placebo test for {path.name}.") 188 | paths.append(path) 189 | gaps.append(gap) 190 | 191 | self.paths = pd.concat(paths, axis=1) 192 | self.gaps = pd.concat(gaps, axis=1) 193 | self.time_optimize_ssr = dataprep.time_optimize_ssr 194 | 195 | print("Calculating treated unit gaps.") 196 | self.treated_path, self.treated_gap = self._single_placebo( 197 | dataprep=dataprep, scm=scm, scm_options=scm_options 198 | ) 199 | print("Done.") 200 | 201 | @staticmethod 202 | def placebo_iter(controls: list[str]) -> Iterator[tuple[str, list[str]]]: 203 | """Generates combinations of (treated unit, control units) for the 204 | placebo tests. 205 | 206 | Parameters 207 | ---------- 208 | controls : list[str] 209 | List of unit labels to use 210 | 211 | Yields 212 | ------ 213 | tuple[str, list[str]] 214 | Tuple of (treated unit label, control unit labels) 215 | 216 | :meta private: 217 | """ 218 | for control in controls: 219 | yield (control, [c for c in controls if c != control]) 220 |
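# For example, placebo_iter(["A", "B", "C"]) yields, in turn,
# ("A", ["B", "C"]), ("B", ["A", "C"]) and ("C", ["A", "B"]).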
221 | @staticmethod 222 | def _single_placebo( 223 | dataprep: Dataprep, scm: BaseSynth, scm_options: dict = {} 224 | ) -> tuple[pd.Series, pd.Series]: 225 | """Run a single placebo test. 226 | 227 | Parameters 228 | ---------- 229 | dataprep : Dataprep 230 | :class:`Dataprep` object containing data to model 231 | scm : Synth | AugSynth 232 | Type of synthetic control study to use 233 | scm_options : dict, optional 234 | Options to provide to the fit method of the synthetic control 235 | study, valid options are any valid option that `scm` takes, by 236 | default {} 237 | 238 | Returns 239 | ------- 240 | tuple[pandas.Series, pandas.Series] 241 | A time-series of the path of the synthetic control and a 242 | time-series of the gap between the treated unit and the synthetic 243 | control. 244 | 245 | :meta private: 246 | """ 247 | scm.fit(dataprep=dataprep, **scm_options) 248 | 249 | Z0, Z1 = dataprep.make_outcome_mats( 250 | time_period=dataprep.foo[dataprep.time_variable] 251 | ) 252 | synthetic = scm._synthetic(Z0=Z0) 253 | gaps = scm._gaps(Z0=Z0, Z1=Z1) 254 | return synthetic.rename(dataprep.treatment_identifier), gaps.rename( 255 | dataprep.treatment_identifier 256 | ) 257 | 258 | def gaps_plot( 259 | self, 260 | time_period: Optional[IsinArg_t] = None, 261 | grid: bool = True, 262 | treatment_time: Optional[int] = None, 263 | mspe_threshold: Optional[float] = None, 264 | exclude_units: Optional[list] = None, 265 | ): 266 | """Plot the gaps between the treated unit and the synthetic control 267 | for each placebo test. 268 | 269 | Parameters 270 | ---------- 271 | time_period : Iterable | pandas.Series | dict, optional 272 | Time range to plot, if none is supplied then the time range used 273 | is the time period over which the optimisation happens, by default 274 | None 275 | grid : bool, optional 276 | Whether or not to plot a grid, by default True 277 | treatment_time : int, optional 278 | If supplied, plot a vertical line at the time period that the 279 | treatment time occurred, by default None 280 | mspe_threshold : float, optional 281 | Remove any non-treated units whose MSPE pre-treatment is :math:`>` 282 | mspe_threshold :math:`\\times` the MSPE of the treated unit pre-treatment; 283 | this excludes units whose synthetic control matched poorly pre-treatment relative to the treated unit's. 284 | exclude_units : list, optional 285 | List of units to exclude from the plot, by default None 286 | 287 | Raises 288 | ------ 289 | ValueError 290 | if no placebo test has been run yet 291 | ValueError 292 | if `mspe_threshold` is supplied but `treatment_time` is not. 293 | """ 294 | if self.gaps is None: 295 | raise ValueError("No gaps available; run a placebo test first.") 296 | time_period = time_period if time_period is not None else self.time_optimize_ssr 297 | 298 | gaps = self.gaps.drop(columns=exclude_units) if exclude_units else self.gaps 299 | 300 | if mspe_threshold: 301 | if not treatment_time: 302 | raise ValueError("Need `treatment_time` to use `mspe_threshold`.") 303 | pre_mspe = gaps.loc[:treatment_time].pow(2).sum(axis=0) 304 | pre_mspe_treated = self.treated_gap.loc[:treatment_time].pow(2).sum(axis=0) 305 | keep = pre_mspe[pre_mspe < mspe_threshold * pre_mspe_treated].index 306 | placebo_gaps = gaps[gaps.index.isin(time_period)][keep] 307 | else: 308 | placebo_gaps = gaps[gaps.index.isin(time_period)] 309 | 310 | plt.plot(placebo_gaps, color="black", alpha=0.1) 311 | plt.plot(self.treated_gap, color="black", alpha=1.0) 312 | if treatment_time: 313 | plt.axvline(x=treatment_time, ymin=0.05, ymax=0.95, linestyle="dashed") 314 | plt.grid(grid) 315 | plt.show() 316 | 317 | def pvalue(self, treatment_time: int) -> float: 318 | """Calculate the p-value of Abadie et al.'s version of Fisher's 319 | exact hypothesis test for the null hypothesis of no treatment 320 | effect, see section 2.2 of :cite:`fp2018`. 321 | 322 | Parameters 323 | ---------- 324 | treatment_time : int 325 | The time period in which the treatment occurred 326 | 327 | Returns 328 | ------- 329 | float 330 | p-value for the null hypothesis of no treatment effect 331 | 332 | Raises 333 | ------ 334 | ValueError 335 | if no placebo test has been run yet 336 | """ 337 | if self.gaps is None or self.treated_gap is None: 338 | raise ValueError("Run a placebo test first.") 339 | 340 | all_ = pd.concat([self.gaps, self.treated_gap], axis=1) 341 | 342 | denom = all_.loc[:treatment_time].pow(2).sum(axis=0) 343 | num = all_.loc[treatment_time:].pow(2).sum(axis=0) 344 | 345 | t, _ = self.gaps.shape 346 | t0, _ = self.gaps.loc[:treatment_time].shape 347 | 348 | rmspe = (num / (t - t0)) / (denom / t0) 349 | return sum( 350 | rmspe.drop(index=self.treated_gap.name) >= rmspe.loc[self.treated_gap.name] 351 | ) / len(rmspe) 352 |
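Taken together, a hedged end-to-end sketch of the placebo workflow implemented above (an editorial example: `dataprep` is assumed to be a configured `Dataprep`, e.g. the Basque study set-up used in the tests, and the option values shown are illustrative):

    from pysyncon import Dataprep, Synth
    from pysyncon.utils import PlaceboTest

    # dataprep = Dataprep(...)  # see tests/test_synth_basque.py for a full set-up
    placebo = PlaceboTest()
    placebo.fit(
        dataprep=dataprep,
        scm=Synth(),
        scm_options={"optim_method": "Nelder-Mead", "optim_initial": "equal"},
    )
    placebo.gaps_plot(treatment_time=1975, mspe_threshold=20.0)
    print(placebo.pvalue(treatment_time=1975))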
-------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdfordham/pysyncon/9aa6b546d7f96c1e699e9ed145214d6ff17fee12/requirements-dev.txt -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = pysyncon 3 | version = 1.5.2 4 | author = Stiofán Fordham 5 | url = https://github.com/sdfordham/pysyncon/ 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | keywords = Synth,augsynth,synthetic-control-method,causal-inference 9 | license = MIT License 10 | 11 | [options] 12 | packages = pysyncon, 13 | python_requires = >=3.8 14 | install_requires = 15 | numpy >= 1.24.0 16 | matplotlib >= 3.6.2 17 | pandas >= 1.5.2 18 | scipy >= 1.9.3 19 | 20 | [options.extras_require] 21 | dev = 22 | black == 23.10.1 23 | -------------------------------------------------------------------------------- /tests/test_augsynth.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pysyncon 6 | 7 | 8 | class TestAugSynth(unittest.TestCase): 9 | def setUp(self): 10 | self.foo = pd.DataFrame( 11 | { 12 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 13 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 14 | "dependent": np.random.random(12), 15 | "predictor1": np.random.random(12), 16 | "predictor2": np.random.random(12), 17 | } 18 | ) 19 | self.predictors = ["predictor1"] 20 | self.predictors_op = "mean" 21 | self.dependent = "dependent" 22 | self.unit_variable = "name" 23 | self.time_variable = "time" 24 | self.treatment_identifier = 1 25 | self.treatment_identifier_list = [1, 2] 26 | self.controls_identifier = [2, 3] 27 | self.controls_identifier_alt = [3] 28 | self.time_predictors_prior = [2, 3] 29 | self.time_optimize_ssr = [1, 2, 3] 30 | self.special_predictors = [ 31 | ("predictor1", [2], "mean"), 32 | ("predictor2", [1, 2], "median"), 33 | ("predictor2", [1, 2], "std"), 34 | ] 35 | 36 | def test_fit_treated(self): 37 | kwargs = { 38 | "foo": self.foo, 39 | "predictors": self.predictors, 40 | "predictors_op": self.predictors_op, 41 | "dependent": self.dependent, 42 | "unit_variable": self.unit_variable, 43 | "time_variable": self.time_variable, 44 | "time_predictors_prior": self.time_predictors_prior, 45 | "time_optimize_ssr": self.time_optimize_ssr, 46 | "special_predictors":
self.special_predictors, 47 | } 48 | 49 | dataprep = pysyncon.Dataprep( 50 | treatment_identifier=self.treatment_identifier_list, 51 | controls_identifier=self.controls_identifier_alt, 52 | **kwargs, 53 | ) 54 | augsynth = pysyncon.AugSynth() 55 | self.assertRaises(ValueError, augsynth.fit, dataprep) 56 | 57 | dataprep = pysyncon.Dataprep( 58 | treatment_identifier=self.treatment_identifier, 59 | controls_identifier=self.controls_identifier, 60 | **kwargs, 61 | ) 62 | augsynth = pysyncon.AugSynth() 63 | try: 64 | augsynth.fit(dataprep) 65 | except Exception as e: 66 | self.fail(f"Augsynth fit with single treated failed: {e}.") 67 | 68 | dataprep = pysyncon.Dataprep( 69 | treatment_identifier=[self.treatment_identifier], 70 | controls_identifier=self.controls_identifier, 71 | **kwargs, 72 | ) 73 | augsynth = pysyncon.AugSynth() 74 | try: 75 | augsynth.fit(dataprep) 76 | except Exception as e: 77 | self.fail(f"Augsynth fit with single treated in list failed: {e}.") 78 | -------------------------------------------------------------------------------- /tests/test_augsynth_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, AugSynth 5 | 6 | 7 | class TestAugsynthBasque(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/basque.csv") 10 | self.dataprep = Dataprep( 11 | foo=df, 12 | predictors=[ 13 | "school.illit", 14 | "school.prim", 15 | "school.med", 16 | "school.high", 17 | "school.post.high", 18 | "invest", 19 | ], 20 | predictors_op="mean", 21 | time_predictors_prior=range(1964, 1970), 22 | special_predictors=[ 23 | ("gdpcap", range(1960, 1970), "mean"), 24 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 25 | ("sec.energy", range(1961, 1970, 2), "mean"), 26 | ("sec.industry", range(1961, 1970, 2), "mean"), 27 | ("sec.construction", range(1961, 1970, 2), "mean"), 28 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 29 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 30 | ("popdens", [1969], "mean"), 31 | ], 32 | dependent="gdpcap", 33 | unit_variable="regionname", 34 | time_variable="year", 35 | treatment_identifier="Basque Country (Pais Vasco)", 36 | controls_identifier=[ 37 | "Andalucia", 38 | "Aragon", 39 | "Baleares (Islas)", 40 | "Canarias", 41 | "Cantabria", 42 | "Castilla-La Mancha", 43 | "Castilla Y Leon", 44 | "Cataluna", 45 | "Comunidad Valenciana", 46 | "Extremadura", 47 | "Galicia", 48 | "Madrid (Comunidad De)", 49 | "Murcia (Region de)", 50 | "Navarra (Comunidad Foral De)", 51 | "Principado De Asturias", 52 | "Rioja (La)", 53 | "Spain (Espana)", 54 | ], 55 | time_optimize_ssr=range(1960, 1970), 56 | ) 57 | self.optim_method = "Nelder-Mead" 58 | self.optim_initial = "equal" 59 | self.weights = { 60 | "Andalucia": 0.113627911, 61 | "Aragon": 1.774922286, 62 | "Baleares (Islas)": -0.713432799, 63 | "Canarias": 1.19397534, 64 | "Cantabria": 0.497825351, 65 | "Castilla-La Mancha": 0.131573892, 66 | "Castilla Y Leon": -1.405974956, 67 | "Cataluna": 1.31890027, 68 | "Comunidad Valenciana": -1.731140541, 69 | "Extremadura": -1.134362989, 70 | "Galicia": 1.982136937, 71 | "Madrid (Comunidad De)": 0.110801212, 72 | "Murcia (Region de)": -1.31476635, 73 | "Navarra (Comunidad Foral De)": -1.303045915, 74 | "Principado De Asturias": -0.02423815, 75 | "Rioja (La)": 1.58950474, 76 | "Spain (Espana)": -0.086306241, 77 | } 78 | 79 | def test_weights(self): 80 | augsynth = AugSynth() 81 | augsynth.fit(dataprep=self.dataprep) 82 | 83 | 
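# Note: AugSynth's ridge-augmented weights are not constrained to the
# simplex, unlike vanilla Synth weights, which is why the reference
# values in setUp above include negative entries.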
weights = pd.Series(self.weights, name="weights") 84 | # Allow a tolerance of 2.5% 85 | pd.testing.assert_series_equal( 86 | weights, augsynth.weights(round=9), check_exact=False, atol=0.025 87 | ) 88 | -------------------------------------------------------------------------------- /tests/test_conformal_interence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from pysyncon import Synth 6 | from pysyncon.inference import ConformalInference 7 | 8 | 9 | class TestConformalInference(unittest.TestCase): 10 | def setUp(self): 11 | self.rng = np.random.default_rng() 12 | self.alpha = 0.05 13 | self.scm = Synth() 14 | self.Z0 = pd.DataFrame( 15 | data=self.rng.random(size=(30, 10)), 16 | index=range(1, 31), 17 | columns=range(1, 11), 18 | ) 19 | self.Z1 = pd.Series( 20 | data=self.rng.random(size=(30,)), index=range(1, 31), name=0 21 | ) 22 | self.X0 = pd.DataFrame( 23 | data=self.rng.random(size=(4, 10)), 24 | index=range(1, 5), 25 | columns=range(1, 11), 26 | ) 27 | self.X1 = pd.Series(data=self.rng.random(size=(4,)), index=range(1, 5), name=0) 28 | self.pre_periods = list(range(1, 21)) 29 | self.post_periods = list(range(21, 31)) 30 | self.max_iter = 20 31 | self.tol = 0.1 32 | self.step_sz = None 33 | self.step_sz_div = 20.0 34 | self.verbose = False 35 | 36 | def test_alpha(self): 37 | kwargs = { 38 | "scm": self.scm, 39 | "Z0": self.Z0, 40 | "Z1": self.Z1, 41 | "pre_periods": self.pre_periods, 42 | "post_periods": self.post_periods, 43 | "max_iter": self.max_iter, 44 | "tol": self.tol, 45 | "step_sz": self.step_sz, 46 | "step_sz_div": self.step_sz_div, 47 | "verbose": self.verbose, 48 | } 49 | 50 | conformal_inf = ConformalInference() 51 | 52 | cases = [-1.0, 0.0, 1.0, 17.0] 53 | for case in cases: 54 | with self.subTest(case=case): 55 | self.assertRaises( 56 | ValueError, conformal_inf.confidence_intervals, alpha=case, **kwargs 57 | ) 58 | 59 | cases = [True, ["foo"], {"foo": "bar"}] 60 | for case in cases: 61 | with self.subTest(case=case): 62 | self.assertRaises( 63 | TypeError, conformal_inf.confidence_intervals, alpha=case, **kwargs 64 | ) 65 | 66 | def test_max_iter(self): 67 | kwargs = { 68 | "alpha": self.alpha, 69 | "scm": self.scm, 70 | "Z0": self.Z0, 71 | "Z1": self.Z1, 72 | "pre_periods": self.pre_periods, 73 | "post_periods": self.post_periods, 74 | "tol": self.tol, 75 | "step_sz": self.step_sz, 76 | "step_sz_div": self.step_sz_div, 77 | "verbose": self.verbose, 78 | } 79 | 80 | conformal_inf = ConformalInference() 81 | 82 | cases = [-17, 0] 83 | for case in cases: 84 | with self.subTest(case=case): 85 | self.assertRaises( 86 | ValueError, 87 | conformal_inf.confidence_intervals, 88 | max_iter=case, 89 | **kwargs 90 | ) 91 | 92 | cases = [5.2, 10.0] 93 | for case in cases: 94 | with self.subTest(case=case): 95 | self.assertRaises( 96 | TypeError, 97 | conformal_inf.confidence_intervals, 98 | max_iter=case, 99 | **kwargs 100 | ) 101 | 102 | def test_tol(self): 103 | kwargs = { 104 | "alpha": self.alpha, 105 | "scm": self.scm, 106 | "Z0": self.Z0, 107 | "Z1": self.Z1, 108 | "pre_periods": self.pre_periods, 109 | "post_periods": self.post_periods, 110 | "max_iter": self.max_iter, 111 | "step_sz": self.step_sz, 112 | "step_sz_div": self.step_sz_div, 113 | "verbose": self.verbose, 114 | } 115 | 116 | conformal_inf = ConformalInference() 117 | 118 | cases = [-4.2, 0.0] 119 | for case in cases: 120 | with self.subTest(case=case): 121 | self.assertRaises( 122 | ValueError, 
conformal_inf.confidence_intervals, tol=case, **kwargs 123 | ) 124 | 125 | cases = [-4, 0] 126 | for case in cases: 127 | with self.subTest(case=case): 128 | self.assertRaises( 129 | TypeError, conformal_inf.confidence_intervals, tol=case, **kwargs 130 | ) 131 | 132 | def test_step_sz(self): 133 | kwargs = { 134 | "alpha": self.alpha, 135 | "scm": self.scm, 136 | "Z0": self.Z0, 137 | "Z1": self.Z1, 138 | "pre_periods": self.pre_periods, 139 | "post_periods": self.post_periods, 140 | "tol": self.tol, 141 | "max_iter": self.max_iter, 142 | "step_sz_div": self.step_sz_div, 143 | "verbose": self.verbose, 144 | } 145 | 146 | conformal_inf = ConformalInference() 147 | 148 | cases = [-4.2, 0.0] 149 | for case in cases: 150 | with self.subTest(case=case): 151 | self.assertRaises( 152 | ValueError, 153 | conformal_inf.confidence_intervals, 154 | step_sz=case, 155 | **kwargs 156 | ) 157 | 158 | cases = [-4, 0] 159 | for case in cases: 160 | with self.subTest(case=case): 161 | self.assertRaises( 162 | TypeError, 163 | conformal_inf.confidence_intervals, 164 | step_sz=case, 165 | **kwargs 166 | ) 167 | 168 | def test_step_sz_tol(self): 169 | kwargs = { 170 | "alpha": self.alpha, 171 | "scm": self.scm, 172 | "Z0": self.Z0, 173 | "Z1": self.Z1, 174 | "pre_periods": self.pre_periods, 175 | "post_periods": self.post_periods, 176 | "max_iter": self.max_iter, 177 | "step_sz_div": self.step_sz_div, 178 | "verbose": self.verbose, 179 | } 180 | 181 | conformal_inf = ConformalInference() 182 | 183 | # Step-size is less than tolerance 184 | self.assertRaises( 185 | ValueError, 186 | conformal_inf.confidence_intervals, 187 | tol=1.0, 188 | step_sz=0.1, 189 | **kwargs 190 | ) 191 | 192 | # Step-size = tolerance 193 | self.assertRaises( 194 | ValueError, 195 | conformal_inf.confidence_intervals, 196 | tol=1.0, 197 | step_sz=1.0, 198 | **kwargs 199 | ) 200 | 201 | def test_step_sz_guessing(self): 202 | kwargs = { 203 | "alpha": self.alpha, 204 | "scm": self.scm, 205 | "Z0": self.Z0, 206 | "Z1": self.Z1, 207 | "pre_periods": self.pre_periods, 208 | "post_periods": self.post_periods, 209 | "max_iter": self.max_iter, 210 | "step_sz_div": self.step_sz_div, 211 | "verbose": self.verbose, 212 | "scm_fit_args": {"X0": self.X0, "X1": self.X1}, 213 | } 214 | 215 | conformal_inf = ConformalInference() 216 | 217 | # No step-size and a big tolerance 218 | # (step-size guessing) 219 | _, n_c = self.Z0.shape 220 | self.scm.W = np.full(n_c, 1.0 / n_c) 221 | conformal_inf.confidence_intervals(tol=100.0, **kwargs) 222 | self.scm.W = None 223 | 224 | def test_step_sz_div(self): 225 | kwargs = { 226 | "alpha": self.alpha, 227 | "scm": self.scm, 228 | "Z0": self.Z0, 229 | "Z1": self.Z1, 230 | "pre_periods": self.pre_periods, 231 | "post_periods": self.post_periods, 232 | "tol": self.tol, 233 | "max_iter": self.max_iter, 234 | "step_sz": self.step_sz, 235 | "verbose": self.verbose, 236 | } 237 | 238 | conformal_inf = ConformalInference() 239 | 240 | cases = [-4.2, 0.0] 241 | for case in cases: 242 | with self.subTest(case=case): 243 | self.assertRaises( 244 | ValueError, 245 | conformal_inf.confidence_intervals, 246 | step_sz_div=case, 247 | **kwargs 248 | ) 249 | 250 | cases = [-4, 0] 251 | for case in cases: 252 | with self.subTest(case=case): 253 | self.assertRaises( 254 | TypeError, 255 | conformal_inf.confidence_intervals, 256 | step_sz_div=case, 257 | **kwargs 258 | ) 259 | 260 | def test_no_weights(self): 261 | kwargs = { 262 | "alpha": self.alpha, 263 | "scm": self.scm, 264 | "Z0": self.Z0, 265 | "Z1": self.Z1, 266 | 
"pre_periods": self.pre_periods, 267 | "post_periods": self.post_periods, 268 | "tol": self.tol, 269 | "max_iter": self.max_iter, 270 | "step_sz": self.step_sz, 271 | "verbose": self.verbose, 272 | } 273 | 274 | conformal_inf = ConformalInference() 275 | self.assertRaises(ValueError, conformal_inf.confidence_intervals, **kwargs) 276 | 277 | def test_root_search(self): 278 | cases_roots_x0 = [ 279 | ((-1, 3), 0.5), 280 | ((-1, 3), 1.0), 281 | ((-1, 3), 2.5), 282 | ((-1, 400), 0.5), 283 | ((-1, 400), 100), 284 | ((-1, 400), 399), 285 | ] 286 | cases_step_sz = [0.1, 1.0] 287 | 288 | ci = ConformalInference() 289 | tol = 0.01 290 | for case_root_x0 in cases_roots_x0: 291 | for case_step_sz in cases_step_sz: 292 | case = (case_root_x0, case_step_sz) 293 | with self.subTest(case=case): 294 | ((lower, upper), x0) = case_root_x0 295 | 296 | res = ci._root_search( 297 | fn=lambda x: (lower - x) * (x - upper), 298 | x0=x0, 299 | direction=-1, 300 | tol=tol, 301 | step_sz=case_step_sz, 302 | max_iter=100, 303 | ) 304 | self.assertAlmostEqual(lower, res, delta=tol) 305 | 306 | res = ci._root_search( 307 | fn=lambda x: (lower - x) * (x - upper), 308 | x0=x0, 309 | direction=1, 310 | tol=tol, 311 | step_sz=case_step_sz, 312 | max_iter=100, 313 | ) 314 | self.assertAlmostEqual(upper, res, delta=tol) 315 | 316 | self.assertRaises( 317 | Exception, 318 | ci._root_search, 319 | fn=lambda x: (-1 - x) * (x - 400), 320 | x0=200, 321 | direction=-1, 322 | tol=0.01, 323 | step_sz=1.0, 324 | max_iter=1, 325 | ) 326 | -------------------------------------------------------------------------------- /tests/test_linear_factor_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from pysyncon.generator import LinearFactorModel 6 | 7 | 8 | class TestLinearFactorModel(unittest.TestCase): 9 | def setUp(self): 10 | self.n_units = np.random.randint(low=10, high=20) 11 | self.n_observable = np.random.randint(low=10, high=15) 12 | self.n_unobservable = np.random.randint(low=10, high=15) 13 | self.n_periods_pre = np.random.randint(low=50, high=80) 14 | self.n_periods_post = np.random.randint(low=10, high=20) 15 | 16 | def test_matrix_dims(self): 17 | lfm = LinearFactorModel() 18 | X0, X1, Z0, Z1 = lfm.generate( 19 | n_units=self.n_units, 20 | n_observable=self.n_observable, 21 | n_unobservable=self.n_unobservable, 22 | n_periods_pre=self.n_periods_pre, 23 | n_periods_post=self.n_periods_post, 24 | ) 25 | 26 | self.assertEqual(X0.shape, (self.n_observable, self.n_units - 1)) 27 | self.assertEqual(X1.shape, (self.n_observable,)) 28 | self.assertEqual( 29 | Z0.shape, (self.n_periods_pre + self.n_periods_post, self.n_units - 1) 30 | ) 31 | self.assertEqual(Z1.shape, (self.n_periods_pre + self.n_periods_post,)) 32 | -------------------------------------------------------------------------------- /tests/test_penalized.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pysyncon 6 | 7 | 8 | class TestPenalizedSynth(unittest.TestCase): 9 | def setUp(self): 10 | self.foo = pd.DataFrame( 11 | { 12 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 13 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 14 | "dependent": np.random.random(12), 15 | "predictor1": np.random.random(12), 16 | "predictor2": np.random.random(12), 17 | } 18 | ) 19 | self.predictors = ["predictor1"] 20 | self.predictors_op = "mean" 21 | 
self.dependent = "dependent" 22 | self.unit_variable = "name" 23 | self.time_variable = "time" 24 | self.treatment_identifier = 1 25 | self.treatment_identifier_list = [1, 2] 26 | self.controls_identifier = [2, 3] 27 | self.controls_identifier_alt = [3] 28 | self.time_predictors_prior = [2, 3] 29 | self.time_optimize_ssr = [1, 2, 3] 30 | self.special_predictors = [ 31 | ("predictor1", [2], "mean"), 32 | ("predictor2", [1, 2], "median"), 33 | ("predictor2", [1, 2], "std"), 34 | ] 35 | self.custom_V = np.full(4, 1.0) 36 | 37 | def test_fit_treated(self): 38 | kwargs = { 39 | "foo": self.foo, 40 | "predictors": self.predictors, 41 | "predictors_op": self.predictors_op, 42 | "dependent": self.dependent, 43 | "unit_variable": self.unit_variable, 44 | "time_variable": self.time_variable, 45 | "time_predictors_prior": self.time_predictors_prior, 46 | "time_optimize_ssr": self.time_optimize_ssr, 47 | "special_predictors": self.special_predictors, 48 | } 49 | 50 | dataprep = pysyncon.Dataprep( 51 | treatment_identifier=self.treatment_identifier_list, 52 | controls_identifier=self.controls_identifier_alt, 53 | **kwargs, 54 | ) 55 | pen = pysyncon.PenalizedSynth() 56 | self.assertRaises(ValueError, pen.fit, dataprep) 57 | 58 | dataprep = pysyncon.Dataprep( 59 | treatment_identifier=self.treatment_identifier, 60 | controls_identifier=self.controls_identifier, 61 | **kwargs, 62 | ) 63 | pen = pysyncon.PenalizedSynth() 64 | try: 65 | pen.fit(dataprep) 66 | except Exception as e: 67 | self.fail(f"PenalizedSynth fit with single treated failed: {e}.") 68 | 69 | dataprep = pysyncon.Dataprep( 70 | treatment_identifier=[self.treatment_identifier], 71 | controls_identifier=self.controls_identifier, 72 | **kwargs, 73 | ) 74 | pen = pysyncon.PenalizedSynth() 75 | try: 76 | pen.fit(dataprep) 77 | except Exception as e: 78 | self.fail(f"PenalizedSynth fit with single treated in list failed: {e}.") 79 | 80 | def test_X0_X1_fit(self): 81 | pen = pysyncon.PenalizedSynth() 82 | 83 | # X1 needs to be pd.Series 84 | X0 = pd.DataFrame(np.random.rand(5, 5)) 85 | X1 = pd.DataFrame(np.random.rand(5, 2)) 86 | self.assertRaises(TypeError, pen.fit, X0=X0, X1=X1) 87 | 88 | # X1 needs to be pd.Series 89 | X0 = pd.DataFrame(np.random.rand(5, 5)) 90 | X1 = pd.DataFrame(np.random.rand(5, 1)) 91 | self.assertRaises(TypeError, pen.fit, X0=X0, X1=X1) 92 | 93 | def test_fit_no_data(self): 94 | pen = pysyncon.PenalizedSynth() 95 | self.assertRaises(ValueError, pen.fit) 96 | 97 | def test_fit_custom_V(self): 98 | kwargs = { 99 | "foo": self.foo, 100 | "predictors": self.predictors, 101 | "predictors_op": self.predictors_op, 102 | "dependent": self.dependent, 103 | "unit_variable": self.unit_variable, 104 | "time_variable": self.time_variable, 105 | "treatment_identifier": self.treatment_identifier, 106 | "controls_identifier": self.controls_identifier, 107 | "time_predictors_prior": self.time_predictors_prior, 108 | "time_optimize_ssr": self.time_optimize_ssr, 109 | "special_predictors": self.special_predictors, 110 | } 111 | 112 | dataprep = pysyncon.Dataprep(**kwargs) 113 | pen = pysyncon.PenalizedSynth() 114 | try: 115 | pen.fit(dataprep=dataprep, custom_V=self.custom_V) 116 | except Exception as e: 117 | self.fail(f"PenalizedSynth fit failed with custom_V: {e}") 118 | -------------------------------------------------------------------------------- /tests/test_penalized_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import 
Dataprep, PenalizedSynth 5 | 6 | 7 | class TestPenalizedBasque(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/basque.csv") 10 | self.dataprep = Dataprep( 11 | foo=df, 12 | predictors=[ 13 | "school.illit", 14 | "school.prim", 15 | "school.med", 16 | "school.high", 17 | "school.post.high", 18 | "invest", 19 | ], 20 | predictors_op="mean", 21 | time_predictors_prior=range(1964, 1970), 22 | special_predictors=[ 23 | ("gdpcap", range(1960, 1970), "mean"), 24 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 25 | ("sec.energy", range(1961, 1970, 2), "mean"), 26 | ("sec.industry", range(1961, 1970, 2), "mean"), 27 | ("sec.construction", range(1961, 1970, 2), "mean"), 28 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 29 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 30 | ("popdens", [1969], "mean"), 31 | ], 32 | dependent="gdpcap", 33 | unit_variable="regionname", 34 | time_variable="year", 35 | treatment_identifier="Basque Country (Pais Vasco)", 36 | controls_identifier=[ 37 | "Aragon", 38 | "Baleares (Islas)", 39 | "Andalucia", 40 | "Canarias", 41 | "Cantabria", 42 | "Castilla Y Leon", 43 | "Castilla-La Mancha", 44 | "Cataluna", 45 | "Comunidad Valenciana", 46 | "Extremadura", 47 | "Galicia", 48 | "Madrid (Comunidad De)", 49 | "Murcia (Region de)", 50 | "Navarra (Comunidad Foral De)", 51 | "Principado De Asturias", 52 | "Rioja (La)", 53 | "Spain (Espana)", 54 | ], 55 | time_optimize_ssr=range(1960, 1970), 56 | ) 57 | self.lambda_ = 0.01 58 | self.weights = { 59 | "Aragon": 0.0, 60 | "Baleares (Islas)": 0.0, 61 | "Andalucia": 0.0, 62 | "Canarias": 0.0, 63 | "Cantabria": 0.241, 64 | "Castilla Y Leon": 0.0, 65 | "Castilla-La Mancha": 0.0, 66 | "Cataluna": 0.759, 67 | "Comunidad Valenciana": 0.0, 68 | "Extremadura": 0.0, 69 | "Galicia": 0.0, 70 | "Madrid (Comunidad De)": 0.0, 71 | "Murcia (Region de)": 0.0, 72 | "Navarra (Comunidad Foral De)": 0.0, 73 | "Principado De Asturias": 0.0, 74 | "Rioja (La)": 0.0, 75 | "Spain (Espana)": 0.0, 76 | } 77 | 78 | def test_weights(self): 79 | robust = PenalizedSynth() 80 | robust.fit(dataprep=self.dataprep, lambda_=self.lambda_) 81 | 82 | weights = pd.Series(self.weights, name="weights") 83 | # Allow a tolerance of 2.5% 84 | pd.testing.assert_series_equal( 85 | weights, robust.weights(round=9), check_exact=False, atol=0.025 86 | ) 87 | -------------------------------------------------------------------------------- /tests/test_robust.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import pysyncon 6 | 7 | 8 | class TestRobustSynth(unittest.TestCase): 9 | def setUp(self): 10 | self.foo = pd.DataFrame( 11 | { 12 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 13 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 14 | "dependent": np.random.random(12), 15 | "predictor1": np.random.random(12), 16 | "predictor2": np.random.random(12), 17 | } 18 | ) 19 | self.predictors = ["predictor1"] 20 | self.predictors_op = "mean" 21 | self.dependent = "dependent" 22 | self.unit_variable = "name" 23 | self.time_variable = "time" 24 | self.treatment_identifier = 1 25 | self.treatment_identifier_list = [1, 2] 26 | self.controls_identifier = [2, 3] 27 | self.controls_identifier_alt = [3] 28 | self.time_predictors_prior = [2, 3] 29 | self.time_optimize_ssr = [1, 2, 3] 30 | self.special_predictors = [ 31 | ("predictor1", [2], "mean"), 32 | ("predictor2", [1, 2], "median"), 33 | ("predictor2", [1, 2], "std"), 34 | ] 35 | 
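# `lambda_` and `sv_count` below parametrise RobustSynth: presumably the
# ridge-regression penalty and the number of singular values retained when
# de-noising the donor matrix (cf. the robust synthetic control method of
# Amjad, Shah & Shen), respectively.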
self.lambda_ = 0.01 36 | self.sv_count = 1 37 | 38 | def test_fit_treated(self): 39 | kwargs = { 40 | "foo": self.foo, 41 | "predictors": self.predictors, 42 | "predictors_op": self.predictors_op, 43 | "dependent": self.dependent, 44 | "unit_variable": self.unit_variable, 45 | "time_variable": self.time_variable, 46 | "time_predictors_prior": self.time_predictors_prior, 47 | "time_optimize_ssr": self.time_optimize_ssr, 48 | "special_predictors": self.special_predictors, 49 | } 50 | 51 | dataprep = pysyncon.Dataprep( 52 | treatment_identifier=self.treatment_identifier_list, 53 | controls_identifier=self.controls_identifier_alt, 54 | **kwargs, 55 | ) 56 | robust = pysyncon.RobustSynth() 57 | self.assertRaises( 58 | ValueError, 59 | robust.fit, 60 | dataprep, 61 | lambda_=self.lambda_, 62 | sv_count=self.sv_count, 63 | ) 64 | 65 | dataprep = pysyncon.Dataprep( 66 | treatment_identifier=self.treatment_identifier, 67 | controls_identifier=self.controls_identifier, 68 | **kwargs, 69 | ) 70 | robust = pysyncon.RobustSynth() 71 | try: 72 | robust.fit(dataprep, lambda_=self.lambda_, sv_count=self.sv_count) 73 | except Exception as e: 74 | self.fail(f"RobustSynth fit with single treated failed: {e}.") 75 | 76 | dataprep = pysyncon.Dataprep( 77 | treatment_identifier=[self.treatment_identifier], 78 | controls_identifier=self.controls_identifier, 79 | **kwargs, 80 | ) 81 | robust = pysyncon.RobustSynth() 82 | try: 83 | robust.fit(dataprep, lambda_=self.lambda_, sv_count=self.sv_count) 84 | except Exception as e: 85 | self.fail(f"RobustSynth fit with single treated in list failed: {e}.") 86 | -------------------------------------------------------------------------------- /tests/test_robust_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, RobustSynth 5 | 6 | 7 | class TestRobustBasque(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/basque.csv") 10 | self.dataprep = Dataprep( 11 | foo=df, 12 | predictors=[ 13 | "school.illit", 14 | "school.prim", 15 | "school.med", 16 | "school.high", 17 | "school.post.high", 18 | "invest", 19 | ], 20 | predictors_op="mean", 21 | time_predictors_prior=range(1964, 1970), 22 | special_predictors=[ 23 | ("gdpcap", range(1960, 1970), "mean"), 24 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 25 | ("sec.energy", range(1961, 1970, 2), "mean"), 26 | ("sec.industry", range(1961, 1970, 2), "mean"), 27 | ("sec.construction", range(1961, 1970, 2), "mean"), 28 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 29 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 30 | ("popdens", [1969], "mean"), 31 | ], 32 | dependent="gdpcap", 33 | unit_variable="regionname", 34 | time_variable="year", 35 | treatment_identifier="Basque Country (Pais Vasco)", 36 | controls_identifier=[ 37 | "Aragon", 38 | "Baleares (Islas)", 39 | "Andalucia", 40 | "Canarias", 41 | "Cantabria", 42 | "Castilla Y Leon", 43 | "Castilla-La Mancha", 44 | "Cataluna", 45 | "Comunidad Valenciana", 46 | "Extremadura", 47 | "Galicia", 48 | "Madrid (Comunidad De)", 49 | "Murcia (Region de)", 50 | "Navarra (Comunidad Foral De)", 51 | "Principado De Asturias", 52 | "Rioja (La)", 53 | ], 54 | time_optimize_ssr=range(1960, 1970), 55 | ) 56 | self.lambda_ = 0.1 57 | self.sv_count = 2 58 | self.weights = { 59 | "Aragon": 0.042750725, 60 | "Baleares (Islas)": 0.095687916, 61 | "Andalucia": 0.05471977, 62 | "Canarias": 0.029348893, 63 | "Cantabria": 0.131449835, 64 | 
"Castilla Y Leon": 0.00534905, 65 | "Castilla-La Mancha": -0.023989253, 66 | "Cataluna": 0.172766943, 67 | "Comunidad Valenciana": 0.098502043, 68 | "Extremadura": -0.024916194, 69 | "Galicia": 0.000285705, 70 | "Madrid (Comunidad De)": 0.306908016, 71 | "Murcia (Region de)": 0.037554988, 72 | "Navarra (Comunidad Foral De)": 0.042127484, 73 | "Principado De Asturias": 0.144568216, 74 | "Rioja (La)": 0.018474723, 75 | } 76 | 77 | def test_weights(self): 78 | robust = RobustSynth() 79 | robust.fit(dataprep=self.dataprep, lambda_=self.lambda_, sv_count=self.sv_count) 80 | 81 | weights = pd.Series(self.weights, name="weights") 82 | # Allow a tolerance of 2.5% 83 | pd.testing.assert_series_equal( 84 | weights, robust.weights(round=9), check_exact=False, atol=0.025 85 | ) 86 | -------------------------------------------------------------------------------- /tests/test_synth.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | import numpy as np 4 | import pandas as pd 5 | 6 | import pysyncon 7 | 8 | 9 | class TestSynth(unittest.TestCase): 10 | def setUp(self): 11 | self.foo = pd.DataFrame( 12 | { 13 | "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4], 14 | "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], 15 | "dependent": np.random.random(12), 16 | "predictor1": np.random.random(12), 17 | "predictor2": np.random.random(12), 18 | } 19 | ) 20 | self.predictors = ["predictor1"] 21 | self.predictors_op = "mean" 22 | self.dependent = "dependent" 23 | self.unit_variable = "name" 24 | self.time_variable = "time" 25 | self.treatment_identifier = 1 26 | self.treatment_identifier_list = [1, 2] 27 | self.controls_identifier = [2, 3] 28 | self.controls_identifier_alt = [3] 29 | self.time_predictors_prior = [2, 3] 30 | self.time_optimize_ssr = [1, 2, 3] 31 | self.special_predictors = [ 32 | ("predictor1", [2], "mean"), 33 | ("predictor2", [1, 2], "median"), 34 | ("predictor2", [1, 2], "std"), 35 | ] 36 | 37 | def test_fit_treated(self): 38 | kwargs = { 39 | "foo": self.foo, 40 | "predictors": self.predictors, 41 | "predictors_op": self.predictors_op, 42 | "dependent": self.dependent, 43 | "unit_variable": self.unit_variable, 44 | "time_variable": self.time_variable, 45 | "time_predictors_prior": self.time_predictors_prior, 46 | "time_optimize_ssr": self.time_optimize_ssr, 47 | "special_predictors": self.special_predictors, 48 | } 49 | dataprep = pysyncon.Dataprep( 50 | treatment_identifier=self.treatment_identifier_list, 51 | controls_identifier=self.controls_identifier_alt, 52 | **kwargs, 53 | ) 54 | synth = pysyncon.Synth() 55 | self.assertRaises(ValueError, synth.fit, dataprep) 56 | 57 | dataprep = pysyncon.Dataprep( 58 | treatment_identifier=self.treatment_identifier, 59 | controls_identifier=self.controls_identifier, 60 | **kwargs, 61 | ) 62 | synth = pysyncon.Synth() 63 | 64 | # Run with normal controls list 65 | synth.fit(dataprep) 66 | 67 | dataprep = pysyncon.Dataprep( 68 | treatment_identifier=[self.treatment_identifier], 69 | controls_identifier=self.controls_identifier, 70 | **kwargs, 71 | ) 72 | synth = pysyncon.Synth() 73 | 74 | # Run with a list of treatment identifiers 75 | synth.fit(dataprep) 76 | 77 | def test_X0_X1_fit(self): 78 | synth = pysyncon.Synth() 79 | 80 | # Neither dataprep nor matrices set 81 | self.assertRaises(ValueError, synth.fit) 82 | 83 | # X1 needs to be pd.Series 84 | X0 = pd.DataFrame(np.random.rand(5, 5)) 85 | X1 = pd.DataFrame(np.random.rand(5, 2)) 86 | Z0 = 
pd.DataFrame(np.random.rand(5, 5)) 87 | Z1 = pd.DataFrame(np.random.rand(5, 2)) 88 | self.assertRaises(TypeError, synth.fit, X0=X0, X1=X1, Z0=Z0, Z1=Z1) 89 | 90 | # X1 needs to be pd.Series 91 | X0 = pd.DataFrame(np.random.rand(5, 5)) 92 | X1 = pd.DataFrame(np.random.rand(5, 1)) 93 | Z0 = pd.DataFrame(np.random.rand(5, 5)) 94 | Z1 = pd.DataFrame(np.random.rand(5, 1)) 95 | self.assertRaises(TypeError, synth.fit, X0=X0, X1=X1, Z0=Z0, Z1=Z1) 96 | 97 | @patch("pysyncon.base.plt") 98 | def test_path_plot(self, mock_plt: Mock): 99 | kwargs = { 100 | "foo": self.foo, 101 | "predictors": self.predictors, 102 | "predictors_op": self.predictors_op, 103 | "dependent": self.dependent, 104 | "unit_variable": self.unit_variable, 105 | "time_variable": self.time_variable, 106 | "treatment_identifier": self.treatment_identifier, 107 | "controls_identifier": self.controls_identifier, 108 | "time_predictors_prior": self.time_predictors_prior, 109 | "time_optimize_ssr": self.time_optimize_ssr, 110 | "special_predictors": self.special_predictors, 111 | } 112 | 113 | dataprep = pysyncon.Dataprep(**kwargs) 114 | synth = pysyncon.Synth() 115 | # No weight matrix set 116 | self.assertRaises(ValueError, synth.path_plot) 117 | 118 | X0, X1 = dataprep.make_covariate_mats() 119 | Z0, Z1 = dataprep.make_outcome_mats() 120 | synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 121 | # No Dataprep object available 122 | self.assertRaises(ValueError, synth.path_plot) 123 | 124 | synth.fit(dataprep=dataprep) 125 | synth.path_plot() 126 | 127 | self.assertEqual(mock_plt.plot.call_count, 2) 128 | first_call, second_call = mock_plt.plot.call_args_list 129 | 130 | _, first_call_kwargs = first_call 131 | self.assertEqual(first_call_kwargs["color"], "black") 132 | self.assertEqual(first_call_kwargs["linewidth"], 1) 133 | self.assertEqual(first_call_kwargs["label"], dataprep.treatment_identifier) 134 | 135 | _, second_call_kwargs = second_call 136 | self.assertEqual(second_call_kwargs["color"], "black") 137 | self.assertEqual(second_call_kwargs["linewidth"], 1) 138 | self.assertEqual(second_call_kwargs["linestyle"], "dashed") 139 | self.assertEqual(second_call_kwargs["label"], "Synthetic") 140 | 141 | mock_plt.axvline.assert_not_called() 142 | mock_plt.legend.assert_called() 143 | mock_plt.grid.assert_called_with(True) 144 | mock_plt.show.assert_called() 145 | 146 | synth.path_plot(treatment_time=3) 147 | mock_plt.axvline.assert_called_once() 148 | 149 | _, kwargs = mock_plt.axvline.call_args 150 | self.assertEqual(kwargs["x"], 3) 151 | self.assertEqual(kwargs["ymin"], 0.05) 152 | self.assertEqual(kwargs["ymax"], 0.95) 153 | self.assertEqual(kwargs["linestyle"], "dashed") 154 | 155 | @patch("pysyncon.base.plt") 156 | def test_gaps_plot(self, mock_plt: Mock): 157 | kwargs = { 158 | "foo": self.foo, 159 | "predictors": self.predictors, 160 | "predictors_op": self.predictors_op, 161 | "dependent": self.dependent, 162 | "unit_variable": self.unit_variable, 163 | "time_variable": self.time_variable, 164 | "treatment_identifier": self.treatment_identifier, 165 | "controls_identifier": self.controls_identifier, 166 | "time_predictors_prior": self.time_predictors_prior, 167 | "time_optimize_ssr": self.time_optimize_ssr, 168 | "special_predictors": self.special_predictors, 169 | } 170 | 171 | dataprep = pysyncon.Dataprep(**kwargs) 172 | synth = pysyncon.Synth() 173 | # No weight matrix set 174 | self.assertRaises(ValueError, synth.gaps_plot) 175 | 176 | X0, X1 = dataprep.make_covariate_mats() 177 | Z0, Z1 = dataprep.make_outcome_mats() 178 | 
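# Fitting from the raw matrices (X0/X1 covariates, Z0/Z1 outcomes) leaves
# synth.dataprep unset, so the plotting helper below should raise.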
synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 179 | # No Dataprep object available 180 | self.assertRaises(ValueError, synth.gaps_plot) 181 | 182 | synth.fit(dataprep=dataprep) 183 | synth.gaps_plot() 184 | 185 | self.assertEqual(mock_plt.plot.call_count, 1) 186 | _, kwargs = mock_plt.plot.call_args 187 | 188 | self.assertEqual(kwargs["color"], "black") 189 | self.assertEqual(kwargs["linewidth"], 1) 190 | 191 | mock_plt.axvline.assert_not_called() 192 | mock_plt.grid.assert_called_with(True) 193 | mock_plt.show.assert_called() 194 | 195 | synth.gaps_plot(treatment_time=3) 196 | mock_plt.axvline.assert_called_once() 197 | 198 | _, kwargs = mock_plt.axvline.call_args 199 | self.assertEqual(kwargs["x"], 3) 200 | self.assertEqual(kwargs["ymin"], 0.05) 201 | self.assertEqual(kwargs["ymax"], 0.95) 202 | self.assertEqual(kwargs["linestyle"], "dashed") 203 | 204 | def test_weight(self): 205 | synth = pysyncon.Synth() 206 | # No weight matrix set 207 | self.assertRaises(ValueError, synth.weights) 208 | 209 | def test_summary(self): 210 | kwargs = { 211 | "foo": self.foo, 212 | "predictors": self.predictors, 213 | "predictors_op": self.predictors_op, 214 | "dependent": self.dependent, 215 | "unit_variable": self.unit_variable, 216 | "time_variable": self.time_variable, 217 | "treatment_identifier": self.treatment_identifier, 218 | "controls_identifier": self.controls_identifier, 219 | "time_predictors_prior": self.time_predictors_prior, 220 | "time_optimize_ssr": self.time_optimize_ssr, 221 | "special_predictors": self.special_predictors, 222 | } 223 | 224 | dataprep = pysyncon.Dataprep(**kwargs) 225 | synth = pysyncon.Synth() 226 | # No weight matrix set 227 | self.assertRaises(ValueError, synth.summary) 228 | X0, X1 = dataprep.make_covariate_mats() 229 | Z0, Z1 = dataprep.make_outcome_mats() 230 | synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 231 | # No Dataprep object available 232 | self.assertRaises(ValueError, synth.summary) 233 | 234 | synth.V = None 235 | # No V matrix available 236 | self.assertRaises(ValueError, synth.summary) 237 | 238 | def test_att(self): 239 | synth = pysyncon.Synth() 240 | # No weight matrix set 241 | self.assertRaises(ValueError, synth.att, range(1)) 242 | 243 | def test_metrics(self): 244 | kwargs = { 245 | "foo": self.foo, 246 | "predictors": self.predictors, 247 | "predictors_op": self.predictors_op, 248 | "dependent": self.dependent, 249 | "unit_variable": self.unit_variable, 250 | "time_variable": self.time_variable, 251 | "treatment_identifier": self.treatment_identifier, 252 | "controls_identifier": self.controls_identifier, 253 | "time_predictors_prior": self.time_predictors_prior, 254 | "time_optimize_ssr": self.time_optimize_ssr, 255 | "special_predictors": self.special_predictors, 256 | } 257 | 258 | dataprep = pysyncon.Dataprep(**kwargs) 259 | synth = pysyncon.Synth() 260 | 261 | X0, X1 = dataprep.make_covariate_mats() 262 | Z0, Z1 = dataprep.make_outcome_mats() 263 | synth.fit(X0=X0, X1=X1, Z0=Z0, Z1=Z1) 264 | # No Dataprep object available 265 | self.assertRaises(ValueError, synth.mspe) 266 | self.assertRaises(ValueError, synth.mape) 267 | self.assertRaises(ValueError, synth.mae) 268 | 269 | del synth 270 | 271 | synth = pysyncon.Synth() 272 | synth.dataprep = dataprep 273 | # No weights available/fit has not been run yet 274 | self.assertRaises(ValueError, synth.mspe) 275 | self.assertRaises(ValueError, synth.mape) 276 | self.assertRaises(ValueError, synth.mae) 277 | 278 | def test_confidence_intervals(self): 279 | kwargs = { 280 | "foo": self.foo, 281 | "predictors":
self.predictors, 282 | "predictors_op": self.predictors_op, 283 | "dependent": self.dependent, 284 | "unit_variable": self.unit_variable, 285 | "time_variable": self.time_variable, 286 | "treatment_identifier": self.treatment_identifier, 287 | "controls_identifier": self.controls_identifier, 288 | "time_predictors_prior": self.time_predictors_prior, 289 | "time_optimize_ssr": self.time_optimize_ssr, 290 | "special_predictors": self.special_predictors, 291 | } 292 | 293 | dataprep = pysyncon.Dataprep(**kwargs) 294 | synth = pysyncon.Synth() 295 | synth.fit(dataprep=dataprep) 296 | 297 | # Bad option 298 | self.assertRaises( 299 | ValueError, 300 | synth.confidence_interval, 301 | alpha=0.5, 302 | time_periods=[4], 303 | tol=0.01, 304 | method="foo", 305 | ) 306 | 307 | # Run with dataprep supplied 308 | synth.confidence_interval( 309 | alpha=0.5, time_periods=[4], dataprep=dataprep, tol=0.01 310 | ) 311 | 312 | # Too few time periods for alpha value 313 | self.assertRaises( 314 | ValueError, 315 | synth.confidence_interval, 316 | alpha=0.05, 317 | time_periods=[4], 318 | tol=0.01, 319 | dataprep=dataprep, 320 | ) 321 | 322 | # Run without dataprep supplied 323 | synth.confidence_interval(alpha=0.5, time_periods=[4], tol=0.01) 324 | 325 | # Too few time periods for alpha value 326 | self.assertRaises( 327 | ValueError, 328 | synth.confidence_interval, 329 | alpha=0.05, 330 | time_periods=[4], 331 | tol=0.01, 332 | ) 333 | 334 | # Without dataprep supplied or matrices 335 | synth.dataprep = None 336 | self.assertRaises( 337 | ValueError, synth.confidence_interval, alpha=0.5, time_periods=[4], tol=0.01 338 | ) 339 | 340 | # No pre-periods supplied 341 | synth.dataprep = None 342 | X0, X1 = dataprep.make_covariate_mats() 343 | Z0, Z1 = dataprep.make_outcome_mats(time_period=[1, 2, 3, 4]) 344 | self.assertRaises( 345 | ValueError, 346 | synth.confidence_interval, 347 | alpha=0.5, 348 | time_periods=[4], 349 | tol=0.01, 350 | X0=X0, 351 | X1=X1, 352 | Z0=Z0, 353 | Z1=Z1, 354 | ) 355 | 356 | # Bad alpha value 357 | self.assertRaises( 358 | ValueError, 359 | synth.confidence_interval, 360 | alpha=0.05, 361 | time_periods=[4], 362 | pre_periods=[1, 2, 3], 363 | tol=0.01, 364 | X0=X0, 365 | X1=X1, 366 | Z0=Z0, 367 | Z1=Z1, 368 | ) 369 | 370 | # Dataframes supplied instead of series 371 | X1 = X1.to_frame() 372 | Z1 = Z1.to_frame() 373 | self.assertRaises( 374 | TypeError, 375 | synth.confidence_interval, 376 | alpha=0.5, 377 | time_periods=[4], 378 | pre_periods=[1, 2, 3], 379 | tol=0.01, 380 | X0=X0, 381 | X1=X1, 382 | Z0=Z0, 383 | Z1=Z1, 384 | ) 385 | -------------------------------------------------------------------------------- /tests/test_synth_basque.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, Synth 5 | from pysyncon.utils import PlaceboTest 6 | 7 | 8 | class TestSynthBasque(unittest.TestCase): 9 | def setUp(self): 10 | df = pd.read_csv("./data/basque.csv") 11 | self.dataprep = Dataprep( 12 | foo=df, 13 | predictors=[ 14 | "school.illit", 15 | "school.prim", 16 | "school.med", 17 | "school.high", 18 | "school.post.high", 19 | "invest", 20 | ], 21 | predictors_op="mean", 22 | time_predictors_prior=range(1964, 1970), 23 | special_predictors=[ 24 | ("gdpcap", range(1960, 1970), "mean"), 25 | ("sec.agriculture", range(1961, 1970, 2), "mean"), 26 | ("sec.energy", range(1961, 1970, 2), "mean"), 27 | ("sec.industry", range(1961, 1970, 2), "mean"), 28 | ("sec.construction", 
range(1961, 1970, 2), "mean"), 29 | ("sec.services.venta", range(1961, 1970, 2), "mean"), 30 | ("sec.services.nonventa", range(1961, 1970, 2), "mean"), 31 | ("popdens", [1969], "mean"), 32 | ], 33 | dependent="gdpcap", 34 | unit_variable="regionname", 35 | time_variable="year", 36 | treatment_identifier="Basque Country (Pais Vasco)", 37 | controls_identifier=[ 38 | "Spain (Espana)", 39 | "Andalucia", 40 | "Aragon", 41 | "Principado De Asturias", 42 | "Baleares (Islas)", 43 | "Canarias", 44 | "Cantabria", 45 | "Castilla Y Leon", 46 | "Castilla-La Mancha", 47 | "Cataluna", 48 | "Comunidad Valenciana", 49 | "Extremadura", 50 | "Galicia", 51 | "Madrid (Comunidad De)", 52 | "Murcia (Region de)", 53 | "Navarra (Comunidad Foral De)", 54 | "Rioja (La)", 55 | ], 56 | time_optimize_ssr=range(1960, 1970), 57 | ) 58 | self.optim_method = "Nelder-Mead" 59 | self.optim_initial = "equal" 60 | self.weights = { 61 | "Spain (Espana)": 0.0, 62 | "Andalucia": 0.0, 63 | "Aragon": 0.0, 64 | "Principado De Asturias": 0.0, 65 | "Baleares (Islas)": 0.0, 66 | "Canarias": 0.0, 67 | "Cantabria": 0.0, 68 | "Castilla Y Leon": 0.0, 69 | "Castilla-La Mancha": 0.0, 70 | "Cataluna": 0.850816306, 71 | "Comunidad Valenciana": 0.0, 72 | "Extremadura": 0.0, 73 | "Galicia": 0.0, 74 | "Madrid (Comunidad De)": 0.149183694, 75 | "Murcia (Region de)": 0.0, 76 | "Navarra (Comunidad Foral De)": 0.0, 77 | "Rioja (La)": 0.0, 78 | } 79 | self.placebo_gaps = { 80 | "Cataluna": { 81 | 1960.0: 0.203808058, 82 | 1961.0: 0.22013128, 83 | 1962.0: 0.263867425, 84 | 1963.0: 0.305086227, 85 | 1964.0: 0.307812892, 86 | 1965.0: 0.310500949, 87 | 1966.0: 0.369694004, 88 | 1967.0: 0.423575362, 89 | 1968.0: 0.458736716, 90 | 1969.0: 0.488697369, 91 | 1970.0: 0.492355223, 92 | }, 93 | "Madrid (Comunidad De)": { 94 | 1960.0: 0.927170193, 95 | 1961.0: 1.066511653, 96 | 1962.0: 1.011029922, 97 | 1963.0: 0.950455684, 98 | 1964.0: 0.945846094, 99 | 1965.0: 0.930053083, 100 | 1966.0: 0.772220243, 101 | 1967.0: 0.614648344, 102 | 1968.0: 0.557832902, 103 | 1969.0: 0.491439776, 104 | 1970.0: 0.441262212, 105 | }, 106 | "Andalucia": { 107 | 1960.0: -0.005071144, 108 | 1961.0: 0.002029757, 109 | 1962.0: -0.002976465, 110 | 1963.0: -0.008368432, 111 | 1964.0: -0.012947738, 112 | 1965.0: -0.018273511, 113 | 1966.0: -0.002324632, 114 | 1967.0: 0.012943551, 115 | 1968.0: 0.009046557, 116 | 1969.0: 0.004579814, 117 | 1970.0: 0.013673678, 118 | }, 119 | } 120 | self.summary = pd.DataFrame( 121 | data=[ 122 | [7.26559110e-02, 3.98884646e01, 2.56336977e02, 3.23825543e02], 123 | [1.19777358e-01, 1.03174230e03, 2.73010720e03, 2.18245335e03], 124 | [3.48611100e-03, 9.03586680e01, 2.23340172e02, 1.48864075e02], 125 | [1.02189247e-01, 2.57275251e01, 6.34368045e01, 4.71326627e01], 126 | [1.08267860e-02, 1.34797198e01, 3.61534897e01, 2.61630325e01], 127 | [5.32110000e-05, 2.46473831e01, 2.15826359e01, 2.14454579e01], 128 | [1.17260969e-01, 5.28546845e00, 5.27078346e00, 3.58401509e00], 129 | [6.33926060e-02, 6.84399996e00, 6.17934020e00, 2.10581177e01], 130 | [1.55350772e-01, 4.10600004e00, 2.75975796e00, 5.25223529e00], 131 | [9.58688000e-02, 4.50820000e01, 3.76359420e01, 2.26702353e01], 132 | [5.30811070e-02, 6.15000000e00, 6.95245150e00, 7.27400001e00], 133 | [1.63475200e-03, 3.37540001e01, 4.11037607e01, 3.66458824e01], 134 | [2.37097130e-02, 4.07200012e00, 5.37134427e00, 7.10294116e00], 135 | [1.80712657e-01, 2.46889999e02, 1.96283316e02, 9.74682350e01], 136 | ], 137 | columns=["V", "treated", "synthetic", "sample mean"], 138 | index=[ 139 | "school.illit", 140 | 
"school.prim", 141 | "school.med", 142 | "school.high", 143 | "school.post.high", 144 | "invest", 145 | "special.1.gdpcap", 146 | "special.2.sec.agriculture", 147 | "special.3.sec.energy", 148 | "special.4.sec.industry", 149 | "special.5.sec.construction", 150 | "special.6.sec.services.venta", 151 | "special.7.sec.services.nonventa", 152 | "special.8.popdens", 153 | ], 154 | ) 155 | self.treatment_time = 1975 156 | self.pvalue = 0.16666666666666666 157 | self.att = {"att": -0.6995647842110987, "se": 0.07078092130438395} 158 | self.att_time_period = range(1975, 1998) 159 | self.mspe = 0.008864544955047298 160 | self.mape = 0.016928135318837897 161 | self.mae = 0.08777554288632104 162 | 163 | def test_weights(self): 164 | synth = Synth() 165 | synth.fit( 166 | dataprep=self.dataprep, 167 | optim_method=self.optim_method, 168 | optim_initial=self.optim_initial, 169 | ) 170 | weights = pd.Series(self.weights, name="weights") 171 | # Allow a tolerance of 2.5% 172 | pd.testing.assert_series_equal( 173 | weights, synth.weights(round=9), check_exact=False, atol=0.025 174 | ) 175 | pd.testing.assert_frame_equal( 176 | self.summary, synth.summary(round=9), check_exact=False, atol=0.025 177 | ) 178 | 179 | def test_placebo_weights(self): 180 | synth = Synth() 181 | placebo_test = PlaceboTest() 182 | placebo_test.fit( 183 | dataprep=self.dataprep, 184 | scm=synth, 185 | scm_options={ 186 | "optim_method": self.optim_method, 187 | "optim_initial": self.optim_initial, 188 | }, 189 | ) 190 | 191 | placebo_gaps = pd.DataFrame.from_dict(self.placebo_gaps).rename_axis( 192 | index="year" 193 | ) 194 | regions = self.placebo_gaps.keys() 195 | years = list(self.placebo_gaps["Cataluna"].keys()) 196 | pd.testing.assert_frame_equal( 197 | placebo_gaps, 198 | placebo_test.gaps[regions].loc[years], 199 | check_exact=False, 200 | atol=0.025, 201 | ) 202 | self.assertAlmostEqual( 203 | self.pvalue, 204 | placebo_test.pvalue(treatment_time=self.treatment_time), 205 | places=3, 206 | ) 207 | 208 | def test_att(self): 209 | synth = Synth() 210 | synth.fit( 211 | dataprep=self.dataprep, 212 | optim_method=self.optim_method, 213 | optim_initial=self.optim_initial, 214 | ) 215 | synth_att = synth.att(time_period=self.att_time_period) 216 | 217 | # Allow a tolerance of 2.5% 218 | att_perc_delta = abs(1.0 - self.att["att"] / synth_att["att"]) 219 | self.assertLessEqual(att_perc_delta, 0.025) 220 | 221 | # Allow a tolerance of 2.5% 222 | se_perc_delta = abs(1.0 - self.att["se"] / synth_att["se"]) 223 | self.assertLessEqual(se_perc_delta, 0.025) 224 | 225 | def test_metric_values(self): 226 | synth = Synth() 227 | synth.fit( 228 | dataprep=self.dataprep, 229 | optim_method=self.optim_method, 230 | optim_initial=self.optim_initial, 231 | ) 232 | 233 | # Allow a tolerance of 2.5% 234 | mspe_perc_delta = abs(1.0 - self.mspe / synth.mspe()) 235 | self.assertLessEqual(mspe_perc_delta, 0.025) 236 | mape_perc_delta = abs(1.0 - self.mape / synth.mape()) 237 | self.assertLessEqual(mape_perc_delta, 0.025) 238 | mae_perc_delta = abs(1.0 - self.mae / synth.mae()) 239 | self.assertLessEqual(mae_perc_delta, 0.025) 240 | -------------------------------------------------------------------------------- /tests/test_synth_germany.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from pysyncon import Dataprep, Synth 5 | 6 | 7 | class TestSynthGermany(unittest.TestCase): 8 | def setUp(self): 9 | df = pd.read_csv("./data/germany.csv") 10 | dataprep_train = 
Dataprep( 11 | foo=df, 12 | predictors=["gdp", "trade", "infrate"], 13 | predictors_op="mean", 14 | time_predictors_prior=range(1971, 1981), 15 | special_predictors=[ 16 | ("industry", range(1971, 1981), "mean"), 17 | ("schooling", [1970, 1975], "mean"), 18 | ("invest70", [1980], "mean"), 19 | ], 20 | dependent="gdp", 21 | unit_variable="country", 22 | time_variable="year", 23 | treatment_identifier="West Germany", 24 | controls_identifier=[ 25 | "USA", 26 | "UK", 27 | "Austria", 28 | "Belgium", 29 | "Denmark", 30 | "France", 31 | "Italy", 32 | "Netherlands", 33 | "Norway", 34 | "Switzerland", 35 | "Japan", 36 | "Greece", 37 | "Portugal", 38 | "Spain", 39 | "Australia", 40 | "New Zealand", 41 | ], 42 | time_optimize_ssr=range(1981, 1991), 43 | ) 44 | synth_train = Synth() 45 | synth_train.fit( 46 | dataprep=dataprep_train, optim_method="Nelder-Mead", optim_initial="equal" 47 | ) 48 | self.custom_V = synth_train.V 49 | 50 | self.dataprep = Dataprep( 51 | foo=df, 52 | predictors=["gdp", "trade", "infrate"], 53 | predictors_op="mean", 54 | time_predictors_prior=range(1981, 1991), 55 | special_predictors=[ 56 | ("industry", range(1981, 1991), "mean"), 57 | ("schooling", [1980, 1985], "mean"), 58 | ("invest80", [1980], "mean"), 59 | ], 60 | dependent="gdp", 61 | unit_variable="country", 62 | time_variable="year", 63 | treatment_identifier="West Germany", 64 | controls_identifier=[ 65 | "USA", 66 | "UK", 67 | "Austria", 68 | "Belgium", 69 | "Denmark", 70 | "France", 71 | "Italy", 72 | "Netherlands", 73 | "Norway", 74 | "Switzerland", 75 | "Japan", 76 | "Greece", 77 | "Portugal", 78 | "Spain", 79 | "Australia", 80 | "New Zealand", 81 | ], 82 | time_optimize_ssr=range(1960, 1990), 83 | ) 84 | 85 | self.optim_method = "Nelder-Mead" 86 | self.optim_initial = "equal" 87 | self.weights = { 88 | "USA": 0.21624982, 89 | "UK": 0.0, 90 | "Austria": 0.414522077, 91 | "Belgium": 0.0, 92 | "Denmark": 0.0, 93 | "France": 0.0, 94 | "Italy": 0.0, 95 | "Netherlands": 0.09841208, 96 | "Norway": 0.0, 97 | "Switzerland": 0.107654851, 98 | "Japan": 0.163161172, 99 | "Greece": 0.0, 100 | "Portugal": 0.0, 101 | "Spain": 0.0, 102 | "Australia": 0.0, 103 | "New Zealand": 0.0, 104 | } 105 | self.att = {"att": -1555.1346777620479, "se": 317.6469306023242} 106 | self.att_time_period = range(1990, 2004) 107 | self.cis = { 108 | "value": { 109 | 1991: 279.09685975333196, 110 | 1992: 99.76203427529981, 111 | 1993: -631.5437231770848, 112 | 1994: -1050.2679900905205, 113 | 1995: -1205.2549226793199, 114 | 1996: -1467.2491625958974, 115 | 1997: -1954.3741689815615, 116 | 1998: -2008.3960300490326, 117 | 1999: -2160.627036515649, 118 | 2000: -2620.7330909274606, 119 | }, 120 | "lower_ci": { 121 | 1991: 43.148688105431994, 122 | 1992: -136.18613737260014, 123 | 1993: -867.4918948249846, 124 | 1994: -1286.2161617384206, 125 | 1995: -1441.20309432722, 126 | 1996: -1703.1973342437975, 127 | 1997: -2190.3223406294615, 128 | 1998: -2244.3442016969325, 129 | 1999: -2396.5752081635487, 130 | 2000: -2856.6812625753605, 131 | }, 132 | "upper_ci": { 133 | 1991: 515.0450314012319, 134 | 1992: 335.7102059231998, 135 | 1993: -395.59555152918483, 136 | 1994: -814.3198184426207, 137 | 1995: -969.3067510314198, 138 | 1996: -1231.3009909479972, 139 | 1997: -1718.4259973336614, 140 | 1998: -1772.4478584011324, 141 | 1999: -1924.6788648677486, 142 | 2000: -2384.7849192795607, 143 | }, 144 | } 145 | self.ci_args = { 146 | "alpha": 0.05, 147 | "time_periods": [ 148 | 1991, 149 | 1992, 150 | 1993, 151 | 1994, 152 | 1995, 153 | 1996, 154 | 1997, 155 | 

    def test_weights(self):
        synth = Synth()
        synth.fit(
            dataprep=self.dataprep,
            optim_method=self.optim_method,
            optim_initial=self.optim_initial,
            custom_V=self.custom_V,
        )
        weights = pd.Series(self.weights, name="weights")
        # Allow a tolerance of 2.5%
        pd.testing.assert_series_equal(
            weights, synth.weights(round=9), check_exact=False, atol=0.025
        )

    def test_att(self):
        synth = Synth()
        synth.fit(
            dataprep=self.dataprep,
            optim_method=self.optim_method,
            optim_initial=self.optim_initial,
            custom_V=self.custom_V,
        )
        synth_att = synth.att(time_period=self.att_time_period)

        # Allow a tolerance of 2.5%
        att_perc_delta = abs(1.0 - self.att["att"] / synth_att["att"])
        self.assertLessEqual(att_perc_delta, 0.025)

        # Allow a tolerance of 2.5%
        se_perc_delta = abs(1.0 - self.att["se"] / synth_att["se"])
        self.assertLessEqual(se_perc_delta, 0.025)

    def test_cis(self):
        synth = Synth()
        synth.fit(
            dataprep=self.dataprep,
            optim_method=self.optim_method,
            optim_initial=self.optim_initial,
            custom_V=self.custom_V,
        )

        cis = pd.DataFrame.from_dict(self.cis)
        cis.index.name = "time"
        # Allow a tolerance of 2.5%
        pd.testing.assert_frame_equal(
            cis,
            synth.confidence_interval(custom_V=self.custom_V, **self.ci_args),
            check_exact=False,
            atol=0.025,
        )
--------------------------------------------------------------------------------
/tests/test_synth_texas.py:
--------------------------------------------------------------------------------
import unittest
import pandas as pd

from pysyncon import Dataprep, Synth


class TestSynthTexas(unittest.TestCase):
    def setUp(self):
        df = pd.read_csv("./data/texas.csv")
        self.dataprep = Dataprep(
            foo=df,
            predictors=["income", "ur", "poverty"],
            predictors_op="mean",
            time_predictors_prior=range(1985, 1994),
            special_predictors=[
                ("bmprison", [1988], "mean"),
                ("bmprison", [1990], "mean"),
                ("bmprison", [1991], "mean"),
                ("bmprison", [1992], "mean"),
                ("alcohol", [1990], "mean"),
                ("aidscapita", [1990], "mean"),
                ("aidscapita", [1991], "mean"),
                ("black", [1990], "mean"),
                ("black", [1991], "mean"),
                ("black", [1992], "mean"),
                ("perc1519", [1990], "mean"),
            ],
            dependent="bmprison",
            unit_variable="state",
            time_variable="year",
            treatment_identifier="Texas",
            controls_identifier=[
                "Alabama",
                "Alaska",
                "Arizona",
                "Arkansas",
                "California",
                "Colorado",
                "Connecticut",
                "Delaware",
                "District of Columbia",
                "Florida",
                "Georgia",
                "Hawaii",
                "Idaho",
                "Illinois",
                "Indiana",
                "Iowa",
                "Kansas",
                "Kentucky",
                "Louisiana",
                "Maine",
                "Maryland",
                "Massachusetts",
                "Michigan",
                "Minnesota",
                "Mississippi",
                "Missouri",
                "Montana",
                "Nebraska",
                "Nevada",
                "New Hampshire",
                "New Jersey",
                "New Mexico",
                "New York",
                "North Carolina",
                "North Dakota",
                "Ohio",
                "Oklahoma",
                "Oregon",
                "Pennsylvania",
                "Rhode Island",
                "South Carolina",
                "South Dakota",
                "Tennessee",
                "Utah",
                "Vermont",
                "Virginia",
                "Washington",
                "West Virginia",
                "Wisconsin",
                "Wyoming",
            ],
            time_optimize_ssr=range(1985, 1994),
        )
        self.optim_method = "BFGS"
        self.optim_initial = "ols"
        self.weights = {
            "Alabama": 0.0,
            "Alaska": 0.0,
            "Arizona": 0.0,
            "Arkansas": 0.0,
            "California": 0.407651414,
            "Colorado": 0.0,
            "Connecticut": 0.0,
            "Delaware": 0.0,
            "District of Columbia": 0.0,
            "Florida": 0.110543548,
            "Georgia": 0.0,
            "Hawaii": 0.0,
            "Idaho": 0.0,
            "Illinois": 0.36027434,
            "Indiana": 0.0,
            "Iowa": 0.0,
            "Kansas": 0.0,
            "Kentucky": 0.0,
            "Louisiana": 0.121530698,
            "Maine": 0.0,
            "Maryland": 0.0,
            "Massachusetts": 0.0,
            "Michigan": 0.0,
            "Minnesota": 0.0,
            "Mississippi": 0.0,
            "Missouri": 0.0,
            "Montana": 0.0,
            "Nebraska": 0.0,
            "Nevada": 0.0,
            "New Hampshire": 0.0,
            "New Jersey": 0.0,
            "New Mexico": 0.0,
            "New York": 0.0,
            "North Carolina": 0.0,
            "North Dakota": 0.0,
            "Ohio": 0.0,
            "Oklahoma": 0.0,
            "Oregon": 0.0,
            "Pennsylvania": 0.0,
            "Rhode Island": 0.0,
            "South Carolina": 0.0,
            "South Dakota": 0.0,
            "Tennessee": 0.0,
            "Utah": 0.0,
            "Vermont": 0.0,
            "Virginia": 0.0,
            "Washington": 0.0,
            "West Virginia": 0.0,
            "Wisconsin": 0.0,
            "Wyoming": 0.0,
        }
        self.att = {"att": 20339.375838131393, "se": 3190.4946788704715}
        self.att_time_period = range(1993, 2001)
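
    # Each single-year special predictor above, e.g. ("bmprison", [1988], "mean"),
    # pins that year's value of the series as a separate matching variable. A
    # purely illustrative sketch of generating such entries programmatically:
    #
    #     yearly = [("bmprison", [y], "mean") for y in (1988, 1990, 1991, 1992)]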
"Wisconsin", 82 | "Wyoming", 83 | ], 84 | time_optimize_ssr=range(1985, 1994), 85 | ) 86 | self.optim_method = "BFGS" 87 | self.optim_initial = "ols" 88 | self.weights = { 89 | "Alabama": 0.0, 90 | "Alaska": 0.0, 91 | "Arizona": 0.0, 92 | "Arkansas": 0.0, 93 | "California": 0.407651414, 94 | "Colorado": 0.0, 95 | "Connecticut": 0.0, 96 | "Delaware": 0.0, 97 | "District of Columbia": 0.0, 98 | "Florida": 0.110543548, 99 | "Georgia": 0.0, 100 | "Hawaii": 0.0, 101 | "Idaho": 0.0, 102 | "Illinois": 0.36027434, 103 | "Indiana": 0.0, 104 | "Iowa": 0.0, 105 | "Kansas": 0.0, 106 | "Kentucky": 0.0, 107 | "Louisiana": 0.121530698, 108 | "Maine": 0.0, 109 | "Maryland": 0.0, 110 | "Massachusetts": 0.0, 111 | "Michigan": 0.0, 112 | "Minnesota": 0.0, 113 | "Mississippi": 0.0, 114 | "Missouri": 0.0, 115 | "Montana": 0.0, 116 | "Nebraska": 0.0, 117 | "Nevada": 0.0, 118 | "New Hampshire": 0.0, 119 | "New Jersey": 0.0, 120 | "New Mexico": 0.0, 121 | "New York": 0.0, 122 | "North Carolina": 0.0, 123 | "North Dakota": 0.0, 124 | "Ohio": 0.0, 125 | "Oklahoma": 0.0, 126 | "Oregon": 0.0, 127 | "Pennsylvania": 0.0, 128 | "Rhode Island": 0.0, 129 | "South Carolina": 0.0, 130 | "South Dakota": 0.0, 131 | "Tennessee": 0.0, 132 | "Utah": 0.0, 133 | "Vermont": 0.0, 134 | "Virginia": 0.0, 135 | "Washington": 0.0, 136 | "West Virginia": 0.0, 137 | "Wisconsin": 0.0, 138 | "Wyoming": 0.0, 139 | } 140 | self.att = {"att": 20339.375838131393, "se": 3190.4946788704715} 141 | self.att_time_period = range(1993, 2001) 142 | 143 | def test_weights(self): 144 | synth = Synth() 145 | synth.fit( 146 | dataprep=self.dataprep, 147 | optim_method=self.optim_method, 148 | optim_initial=self.optim_initial, 149 | ) 150 | weights = pd.Series(self.weights, name="weights") 151 | # Allow a tolerance of 2.5% 152 | pd.testing.assert_series_equal( 153 | weights, synth.weights(round=9), check_exact=False, atol=0.025 154 | ) 155 | 156 | def test_att(self): 157 | synth = Synth() 158 | synth.fit( 159 | dataprep=self.dataprep, 160 | optim_method=self.optim_method, 161 | optim_initial=self.optim_initial, 162 | ) 163 | synth_att = synth.att(time_period=self.att_time_period) 164 | 165 | # Allow a tolerance of 2.5% 166 | att_perc_delta = abs(1.0 - self.att["att"] / synth_att["att"]) 167 | self.assertLessEqual(att_perc_delta, 0.025) 168 | 169 | # Allow a tolerance of 2.5% 170 | se_perc_delta = abs(1.0 - self.att["se"] / synth_att["se"]) 171 | self.assertLessEqual(se_perc_delta, 0.025) 172 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from pysyncon import Dataprep, Synth 7 | from pysyncon.utils import HoldoutSplitter, CrossValidationResult, PlaceboTest 8 | 9 | 10 | class TestHoldoutSplitter(unittest.TestCase): 11 | def test_values(self): 12 | cases = [(3, 3, 1), (3, 3, 2), (5, 1, 1), (5, 1, 2)] 13 | for case in cases: 14 | with self.subTest(case=case): 15 | rows, columns, holdout = case 16 | df = pd.DataFrame(np.random.random(size=(rows, columns))) 17 | ser = pd.Series(np.random.random(size=rows)) 18 | 19 | iter_len = 0 20 | for df_, df_h, ser_, ser_h in HoldoutSplitter( 21 | df=df, ser=ser, holdout_len=holdout 22 | ): 23 | self.assertIsInstance(df_, pd.DataFrame) 24 | pd.testing.assert_frame_equal( 25 | df_, 26 | df.drop(index=df.index[iter_len : iter_len + holdout,]), 27 | ) 28 | 29 | 


class TestCrossValidationResult(unittest.TestCase):
    def test_best_lambda(self):
        cases = [1, 2, 3, 10]
        for case in cases:
            with self.subTest(case=case):
                cv_result = CrossValidationResult(
                    lambdas=np.random.random(size=case),
                    errors_mean=np.random.random(size=case),
                    errors_se=np.random.random(size=case),
                )

                best_lambda = cv_result.best_lambda()
                min_mean = cv_result.errors_mean.min()
                min_mean_idx = cv_result.errors_mean.argmin()
                min_mean_se = cv_result.errors_se[min_mean_idx]
                self.assertEqual(
                    best_lambda,
                    cv_result.lambdas[cv_result.errors_mean <= min_mean + min_mean_se]
                    .max()
                    .item(),
                )

                best_lambda = cv_result.best_lambda(min_1se=False)
                min_mean_idx = cv_result.errors_mean.argmin()
                self.assertEqual(best_lambda, cv_result.lambdas[min_mean_idx].item())

    @patch("pysyncon.utils.plt")
    def test_result_plot(self, mock_plt: Mock):
        cv_result = CrossValidationResult(
            lambdas=np.random.random(size=10),
            errors_mean=np.random.random(size=10),
            errors_se=np.random.random(size=10),
        )
        cv_result.plot()

        self.assertEqual(mock_plt.errorbar.call_count, 1)
        _, kwargs = mock_plt.errorbar.call_args
        self.assertEqual(kwargs["ecolor"], "black")
        self.assertEqual(kwargs["capsize"], 2)

        mock_plt.xlabel.assert_called_with("Lambda")
        mock_plt.ylabel.assert_called_with("Mean error")
        mock_plt.xscale.assert_called_with("log")
        mock_plt.yscale.assert_called_with("log")
        mock_plt.title.assert_called_with("Cross validation result")
        mock_plt.grid.assert_called()
        mock_plt.show.assert_called()
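

def _one_se_lambda_demo() -> float:
    """Illustrative sketch only (helper name hypothetical, not used by the
    test suite): ``best_lambda`` applies the one-standard-error rule checked
    in the test above -- it returns the largest lambda whose mean error is
    within one standard error of the smallest mean error."""
    cv = CrossValidationResult(
        lambdas=np.array([0.1, 1.0, 10.0]),
        errors_mean=np.array([0.5, 0.4, 0.45]),
        errors_se=np.array([0.1, 0.1, 0.1]),
    )
    # Minimum mean error is 0.4 with SE 0.1, so any lambda with mean error
    # <= 0.5 qualifies; the largest such lambda is 10.0.
    return cv.best_lambda()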


class TestPlaceboTests(unittest.TestCase):
    def setUp(self):
        # 1 -> treated, (2, 3) -> controls
        self.dataprep = Dataprep(
            foo=pd.DataFrame(
                {
                    "time": [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4],
                    "name": [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3],
                    "dependent": np.random.random(12),
                    "predictor1": np.random.random(12),
                    "predictor2": np.random.random(12),
                }
            ),
            predictors=["predictor1"],
            predictors_op="mean",
            dependent="dependent",
            unit_variable="name",
            time_variable="time",
            treatment_identifier=1,
            controls_identifier=[2, 3],
            time_predictors_prior=[2, 3],
            time_optimize_ssr=[1, 2, 3],
            special_predictors=[
                ("predictor1", [2], "mean"),
                ("predictor2", [1, 2], "median"),
                ("predictor2", [1, 2], "std"),
            ],
        )
        self.synth = Synth()
        self.synth.fit(dataprep=self.dataprep)

        self.placebo_test = PlaceboTest()
        self.placebo_test.fit(dataprep=self.dataprep, scm=self.synth)

    @patch("pysyncon.utils.plt")
    def test_gaps_plot(self, mock_plt: Mock):
        self.placebo_test.gaps_plot()

        self.assertEqual(mock_plt.plot.call_count, 2)
        _, kwargs = mock_plt.plot.call_args
        self.assertEqual(kwargs["color"], "black")
        self.assertEqual(kwargs["alpha"], 1.0)
        mock_plt.axvline.assert_not_called()
        mock_plt.grid.assert_called()

    @patch("pysyncon.utils.plt")
    def test_gaps_plot_axvline(self, mock_plt: Mock):
        self.placebo_test.gaps_plot(treatment_time=3)

        mock_plt.axvline.assert_called()
        _, kwargs = mock_plt.axvline.call_args
        self.assertEqual(kwargs["ymin"], 0.05)
        self.assertEqual(kwargs["ymax"], 0.95)
        self.assertEqual(kwargs["linestyle"], "dashed")

    @patch("pysyncon.utils.plt")
    def test_gaps_plot_mspe_threshold(self, mock_plt: Mock):
        self.placebo_test.gaps_plot(treatment_time=3, mspe_threshold=1)
--------------------------------------------------------------------------------