├── .all-contributorsrc ├── .gitattributes ├── .github └── workflows │ ├── build-and-relase.yaml │ ├── ci-tests.yaml │ └── extended_tests.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE.md ├── README.md ├── benchmarks ├── benchmarks_readme.md ├── data_generation.r ├── gpu_benchmarks.ipynb ├── gpu_big_benchmarks.ipynb ├── gpu_results.csv ├── gpu_runtime_res.csv ├── lets-plot-images │ ├── benchmarks_ols.svg │ └── benchmarks_poisson.svg ├── results_all.txt ├── results_py.csv ├── run_benchmarks.ipynb └── visualise_benchmarks.ipynb ├── codecov.yml ├── coverage.xml ├── docs ├── .gitignore ├── .nojekyll ├── _quarto.yml ├── _sidebar.yml ├── changelog.qmd ├── compare-fixest-pyfixest.qmd ├── contributing.qmd ├── difference-in-differences.qmd ├── figures │ ├── benchmarks_ols.svg │ ├── benchmarks_poisson.svg │ ├── gpu_benchmarks.png │ └── pyfixest-logo.png ├── latexdocs │ ├── SampleTableDoc.pdf │ ├── SampleTableDoc.tex │ ├── SampleTableDoc2.pdf │ ├── SampleTableDoc2.tex │ ├── SampleTableDoc3.pdf │ └── SampleTableDoc3.tex ├── marginaleffects.qmd ├── multiple_testing.ipynb ├── pyfixest.md ├── pyfixest_gpu.ipynb ├── quarto_example │ └── QuartoExample.qmd ├── quickstart.ipynb ├── quickstart.qmd ├── regression_decomposition.ipynb ├── replicating-the-effect.qmd ├── ssc.qmd ├── stata-2-pyfixest.qmd └── table-layout.qmd ├── figures ├── benchmarks_ols.svg ├── benchmarks_poisson.svg └── pyfixest-logo.png ├── pixi.lock ├── pyfixest ├── __init__.py ├── core │ ├── __init__.py │ ├── _core_impl.pyi │ ├── collinear.py │ ├── crv1.py │ ├── demean.py │ ├── nested_fixed_effects.py │ └── py.typed ├── did │ ├── __init__.py │ ├── data │ │ ├── df_het.csv │ │ └── lpdidtestdata1.dta │ ├── did.py │ ├── did2s.py │ ├── estimation.py │ ├── lpdid.py │ ├── saturated_twfe.py │ ├── twfe.py │ └── visualize.py ├── errors │ └── __init__.py ├── estimation │ ├── FixestMulti_.py │ ├── FormulaParser.py │ ├── __init__.py │ ├── backends.py │ ├── ccv.py │ ├── decomposition.py │ ├── demean_.py │ ├── demean_jax_.py │ ├── detect_singletons_.py │ ├── detect_singletons_jax.py │ ├── estimation.py │ ├── fegaussian_.py │ ├── feglm_.py │ ├── feiv_.py │ ├── felogit_.py │ ├── feols_.py │ ├── feols_compressed_.py │ ├── fepois_.py │ ├── feprobit_.py │ ├── jax │ │ ├── demean_jax_.py │ │ └── detect_singletons_jax.py │ ├── literals.py │ ├── model_matrix_fixest_.py │ ├── multcomp.py │ ├── numba │ │ ├── find_collinear_variables_nb.py │ │ └── nested_fixef_nb.py │ ├── prediction.py │ ├── ritest.py │ ├── solvers.py │ └── vcov_utils.py ├── report │ ├── __init__.py │ ├── summarize.py │ ├── utils.py │ └── visualize.py └── utils │ ├── __init__.py │ ├── _exceptions.py │ ├── check_r_install.py │ ├── dev_utils.py │ ├── dgps.py │ ├── set_rpy2_path.py │ └── utils.py ├── pyproject.toml ├── r_test_requirements.R ├── scripts └── run_notebooks.py ├── src ├── collinear.rs ├── crv1.rs ├── demean.rs ├── lib.rs └── nested_fixed_effects.rs └── tests ├── .coverage ├── __init__.py ├── data ├── all_did2s_dfs.csv ├── all_did2s_dfs_weights.csv ├── gelbach.csv ├── gelbach.dta ├── gelbach.txt ├── ppmlhdfe_separation_examples │ ├── 01.csv │ ├── 02.csv │ ├── 03.csv │ ├── 04.csv │ ├── 05.csv │ ├── 06.csv │ ├── 07.csv │ ├── 08.csv │ ├── 09.csv │ ├── 10.csv │ ├── 11.csv │ ├── 12.csv │ ├── 13.csv │ ├── 14.csv │ ├── 15.csv │ ├── 16.csv │ ├── 17.csv │ ├── 18.csv │ └── readme.md └── ritest_results.csv ├── r_test_comparisons.R ├── readme.md ├── test_api.py ├── test_ccv.py ├── test_collinearity.py ├── test_confint.py 
├── test_count_fixef_fully_nested.py ├── test_crv1.py ├── test_decomposition.py ├── test_demean.py ├── test_detect_singletons.py ├── test_dgps.py ├── test_did.py ├── test_dtable.py ├── test_errors.py ├── test_event_study.py ├── test_exceptions.py ├── test_feols_compressed.py ├── test_feols_feglm_internally.py ├── test_formulas.py ├── test_i.py ├── test_iv.py ├── test_model_matrix.py ├── test_multcomp.py ├── test_multicollinearity.py ├── test_others.py ├── test_plots.py ├── test_poisson.py ├── test_predict_resid_fixef.py ├── test_ritest.py ├── test_ses.py ├── test_solvers.py ├── test_summarise.py ├── test_visualize.py ├── test_vs_fixest.py ├── test_wald_test.py ├── test_wildboottest.py ├── test_wls_types.py └── texfiles └── test.tex /.gitattributes: -------------------------------------------------------------------------------- 1 | # GitHub syntax highlighting 2 | pixi.lock linguist-language=YAML linguist-generated=true 3 | -------------------------------------------------------------------------------- /.github/workflows/extended_tests.yaml: -------------------------------------------------------------------------------- 1 | name: Extended Tests (Optional) 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | test: 10 | name: Tests Core 11 | runs-on: ${{ matrix.os }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | os: ["ubuntu-latest"] 16 | python-version: ["3.12"] 17 | pytest_opts: ["--workers 4 --tests-per-worker 1"] 18 | 19 | steps: 20 | - name: Checkout source 21 | uses: actions/checkout@v4 22 | 23 | - name: Setup python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | architecture: x64 28 | 29 | - uses: prefix-dev/setup-pixi@v0.8.3 30 | with: 31 | pixi-version: v0.41.4 32 | cache: true 33 | 34 | - name: Set numba parallel flags 35 | run: echo "NUMBA_NUM_THREADS=1" >> $GITHUB_ENV 36 | 37 | - name: Setup r2u 38 | uses: eddelbuettel/github-actions/r2u-setup@master 39 | 40 | - name: Print R version 41 | run: Rscript -e 'R.version' 42 | 43 | - name: Install R packages 44 | run: | 45 | R_LIB_PATH="${{ github.workspace }}/.pixi/envs/dev/lib/R/library" 46 | mkdir -p $R_LIB_PATH 47 | Rscript -e ".libPaths('$R_LIB_PATH'); install.packages(c('did2s', 'wildrwolf', 'ivDiag'), lib='/home/runner/work/pyfixest/pyfixest/.pixi/envs/dev/lib/R/library', repos = c('https://cran.rstudio.com', 'https://s3alfisc.r-universe.dev'))" 48 | Rscript -e ".libPaths('$R_LIB_PATH'); install.packages('ritest', lib='/home/runner/work/pyfixest/pyfixest/.pixi/envs/dev/lib/R/library', repos = c('https://grantmcdermott.r-universe.dev'))" 49 | 50 | - name: Compile Rust extension (no wheel) 51 | run: | 52 | pixi r maturin-develop 53 | 54 | - name: Run long tests with coverage 55 | run: pixi run tests-extended 56 | 57 | - name: Upload coverage to Codecov (partial) 58 | uses: codecov/codecov-action@v4 59 | with: 60 | token: ${{ secrets.CODECOV_TOKEN }} 61 | files: coverage.xml 62 | flags: tests-extended 63 | partial: true 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | pyfixest/debug.py 6 | pyfixest/profile.py 7 | site 8 | dist 9 | .mypy_cache 10 | .pytest_cache 11 | .vscode 12 | .coverage* 13 | *.pyc 14 | __pycache__/ 15 | #docs/figures 16 | docs/news_files 17 | readme_files 18 | benchmarks/data/_STATA/* 19 | benchmarks/data/data/* 20 | readme.ipynb 21 | /.quarto/ 22 
| #objects.json 23 | #docs/site_libs 24 | #site_libs 25 | tests/.coverage 26 | .venv/ 27 | # pixi environments 28 | .pixi 29 | *.egg-info 30 | pyfixest/did/data/mpdata.csv 31 | target 32 | # Ignore compiled Python extensions from Rust (PyO3/maturin/pyo3-pack) 33 | *.so 34 | *.pyd 35 | *.dll 36 | *.dylib 37 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | ci: 2 | autofix_prs: false 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v5.0.0 7 | hooks: 8 | - id: debug-statements 9 | - id: trailing-whitespace 10 | - id: end-of-file-fixer 11 | - id: check-yaml 12 | args: [--allow-multiple-documents] 13 | - id: check-toml 14 | - id: check-added-large-files 15 | exclude: pixi.lock 16 | - repo: https://github.com/python-jsonschema/check-jsonschema 17 | rev: 0.33.0 18 | hooks: 19 | - id: check-github-workflows 20 | args: ["--verbose"] 21 | 22 | - repo: https://github.com/astral-sh/ruff-pre-commit 23 | rev: v0.11.11 24 | hooks: 25 | - id: ruff 26 | args: ["--fix", "--output-format=full"] 27 | - id: ruff-format 28 | 29 | - repo: https://github.com/nbQA-dev/nbQA 30 | rev: 1.9.1 31 | hooks: 32 | - id: nbqa-ruff 33 | args: ["--fix", "--output-format=full"] 34 | files: ^docs/.*\.ipynb$ 35 | 36 | - repo: https://github.com/pre-commit/mirrors-mypy 37 | rev: v1.15.0 38 | hooks: 39 | - id: mypy 40 | args: [--ignore-missing-imports] 41 | files: ^pyfixest/ 42 | additional_dependencies: [numpy>=1.20, pandas-stubs] 43 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, on all formal and 54 | informal events, and also applies when an individual is officially representing 55 | the community in public spaces. Examples of representing our community include 56 | - using an official e-mail address 57 | - posting via an official social media account 58 | - acting as a representative at an online or offline event 59 | - acting as a representative surrounding an event 60 | 61 | ## Enforcement 62 | 63 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 64 | reported to the community leaders responsible for enforcement. You can contact 65 | [Alexander Fischer](https://github.com/s3alfisc) 66 | (alexander-fischer1801@t-online.de). All complaints will be reviewed and investigated 67 | promptly and fairly. 68 | 69 | All community leaders are obligated to respect the privacy and security of the 70 | reporter of any incident. 71 | 72 | ## Enforcement Guidelines 73 | 74 | Community leaders will follow these Community Impact Guidelines in determining 75 | the consequences for any action they deem in violation of this Code of Conduct: 76 | 77 | ### 1. Correction 78 | 79 | **Community Impact**: Use of inappropriate language or other behavior deemed 80 | unprofessional or unwelcome in the community. 81 | 82 | **Consequence**: A private, written warning from community leaders, providing 83 | clarity around the nature of the violation and an explanation of why the 84 | behavior was inappropriate. A public apology may be requested. 85 | 86 | ### 2. Warning 87 | 88 | **Community Impact**: A violation through a single incident or series 89 | of actions. 90 | 91 | **Consequence**: A warning with consequences for continued behavior. 
No 92 | interaction with the people involved, including unsolicited interaction with 93 | those enforcing the Code of Conduct, for a specified period of time. This 94 | includes avoiding interactions in community spaces as well as external channels 95 | like social media. Violating these terms may lead to a temporary or 96 | permanent ban. 97 | 98 | ### 3. Temporary Ban 99 | 100 | **Community Impact**: A serious violation of community standards, including 101 | sustained inappropriate behavior. 102 | 103 | **Consequence**: A temporary ban from any sort of interaction or public 104 | communication with the community for a specified period of time. No public or 105 | private interaction with the people involved, including unsolicited interaction 106 | with those enforcing the Code of Conduct, is allowed during this period. 107 | Violating these terms may lead to a permanent ban. 108 | 109 | ### 4. Permanent Ban 110 | 111 | **Community Impact**: Demonstrating a pattern of violation of community 112 | standards, including sustained inappropriate behavior, harassment of an 113 | individual, or aggression toward or disparagement of classes of individuals. 114 | 115 | **Consequence**: A permanent ban from any sort of public interaction within 116 | the community. 117 | 118 | ## Attribution 119 | 120 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 121 | version 2.0, available at 122 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 123 | 124 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 125 | enforcement ladder](https://github.com/mozilla/diversity). 126 | 127 | [homepage]: https://www.contributor-covenant.org 128 | 129 | For answers to common questions about this code of conduct, see the FAQ at 130 | https://www.contributor-covenant.org/faq. Translations are available at 131 | https://www.contributor-covenant.org/translations. 132 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to pyfixest 2 | 3 | Thanks for taking the time to contribute! We appreciate all contributions, from reporting bugs to implementing new features. 4 | 5 | Please refer to the [contributing section](https://py-econometrics.github.io/pyfixest/contributing.html) of our documentation to get started. 6 | 7 | We look forward to your contributions! 
8 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pyfixest_core" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | [lib] 8 | name = "_core_impl" 9 | crate-type = ["cdylib"] 10 | 11 | [dependencies] 12 | pyo3 = { version = "0.18.3", features = ["extension-module"] } 13 | ndarray = { version = "0.15", features = ["rayon"] } 14 | rayon = "1.9" 15 | numpy = "0.18" 16 | thiserror = "2.0.12" 17 | 18 | [profile.release] 19 | opt-level = 3 # Maximize performance 20 | lto = "fat" # Full link-time optimization 21 | codegen-units = 1 # Whole-program optimization 22 | panic = "abort" # Smaller binary, no unwind support 23 | strip = true # Remove symbol table 24 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2022 pyfixest authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /benchmarks/benchmarks_readme.md: -------------------------------------------------------------------------------- 1 | ## Readme 2 | 3 | All benchmarks follow `fixest`'s benchmarks, which you can find [here](https://github.com/lrberge/fixest/tree/master/_BENCHMARK). 4 | `PyFixest` benchmarks were run on an Intel(R) Core(TM) i7-10510U CPU @ 1.80GHz, 2304 MHz, 4 Core(s), 8 Logical Processor(s). 5 | Timings for the `R`, `Stata`, and `Julia` programs are taken from `fixest`'s benchmarks. 6 | 7 | To run the Python benchmarks, you need to install the following packages: 8 | - `pyfixest` 9 | - `pandas` 10 | - `numpy` 11 | - `tqdm` 12 | First, you need to create the data by running the `data_generation.r` script. This will populate the `_STATA` and `data` folders. 13 | Then, you can run the `run_benchmarks.ipynb` notebook to run the benchmarks. This will populate the `results_py.csv` file. 14 | Finally, you can run the `visualise_benchmarks.ipynb` notebook to generate the plots. 
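For a quick sanity check before running the full notebooks, a minimal timing sketch along the following lines can be used. This is a hypothetical snippet, assuming `data_generation.r` has already written `data/base_10M.csv` with the columns `ln_y`, `X1` and `dum_1`-`dum_3`:

```python
# Minimal sketch: time a single OLS benchmark model on the 10M-row dataset.
import time

import pandas as pd
import pyfixest as pf

df = pd.read_csv("data/base_10M.csv")

start = time.perf_counter()
fit = pf.feols("ln_y ~ X1 | dum_1 + dum_2 + dum_3", data=df)
print(f"feols with three fixed effects took {time.perf_counter() - start:.2f}s")
```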
15 | -------------------------------------------------------------------------------- /benchmarks/data_generation.r: -------------------------------------------------------------------------------- 1 | #----------------------------------------------# 2 | # Author: Laurent Berge 3 | # Date creation: Fri Oct 18 17:05:15 2019 4 | # ~: Benchmarking: data generation 5 | #----------------------------------------------# 6 | 7 | #### 8 | #### SIMULATION #### 9 | #### 10 | setwd("/Users/marc/Documents/pyfixest/benchmarks/DATA/") 11 | 12 | library(MASS) 13 | library(here) 14 | library(data.table) 15 | library(haven) 16 | 17 | # Some constants 18 | DATA_DIR <- "/Users/marc/Documents/pyfixest/benchmarks/DATA/data" 19 | STATA_DIR <- "/Users/marc/Documents/pyfixest/benchmarks/DATA/_STATA" 20 | RESULTS_DIR <- "/Users/marc/Documents/pyfixest/benchmarks/DATA/results" 21 | 22 | 23 | dir.create(DATA_DIR, showWarnings = FALSE) 24 | dir.create(STATA_DIR, showWarnings = FALSE) 25 | dir.create(RESULTS_DIR, showWarnings = FALSE) 26 | 27 | # We simulate databases 28 | 29 | all_n <- 1000 * 10**(0:3) 30 | all_rep <- 1:10 31 | 32 | # META parameters 33 | a <- 1 34 | b <- 0.05 35 | 36 | # Array of lists to store the results 37 | base_all <- array(data.frame(), dim = c(length(all_n), length(all_rep))) 38 | 39 | for (i in 1:length(all_n)) { 40 | cat("i =", i) 41 | n <- all_n[i] 42 | 43 | dum_all <- list() 44 | 45 | nb_dum <- c(n / 20, floor(sqrt(n)), floor(n**.33)) 46 | N <- nb_dum**3 47 | dum_all[[1]] <- sample(nb_dum[1], n, TRUE) 48 | dum_all[[2]] <- sample(nb_dum[2], n, TRUE) 49 | dum_all[[3]] <- sample(nb_dum[3], n, TRUE) 50 | 51 | for (r in all_rep) { 52 | cat(".") 53 | 54 | X1 <- rnorm(n) 55 | X2 <- X1**2 56 | 57 | mu <- a * X1 + b * X2 58 | 59 | for (m in 1:3) { 60 | coef_dum <- rnorm(nb_dum[m]) 61 | mu <- mu + coef_dum[dum_all[[m]]] 62 | } 63 | 64 | mu <- exp(mu) 65 | y <- rnegbin(mu, theta = 0.5) 66 | 67 | base <- data.frame(y, X1, ln_y = log(y + 1)) 68 | 69 | for (m in 1:3) { 70 | base[[paste0("dum_", m)]] <- dum_all[[m]] 71 | } 72 | 73 | base_all[i, r][[1]] <- base 74 | } 75 | cat("\n") 76 | } 77 | 78 | save(base_all, file = here(DATA_DIR, "base_all_simulations.Rdata")) 79 | 80 | # 81 | # Data with 10M observation for OLS (just one to save size) 82 | # 83 | 84 | n <- 1e7 85 | dum_all <- list() 86 | 87 | nb_dum <- c(n / 20, floor(sqrt(n)), floor(n**.33)) 88 | N <- nb_dum**3 89 | dum_all[[1]] <- sample(nb_dum[1], n, TRUE) 90 | dum_all[[2]] <- sample(nb_dum[2], n, TRUE) 91 | dum_all[[3]] <- sample(nb_dum[3], n, TRUE) 92 | 93 | X1 <- rnorm(n) 94 | X2 <- X1**2 95 | 96 | mu <- a * X1 + b * X2 97 | 98 | for (m in 1:3) { 99 | coef_dum <- rnorm(nb_dum[m]) 100 | mu <- mu + coef_dum[dum_all[[m]]] 101 | } 102 | 103 | mu <- exp(mu) 104 | y <- rnegbin(mu, theta = 0.5) 105 | 106 | base <- data.frame(ln_y = log(y + 1), X1) 107 | 108 | for (m in 1:3) { 109 | base[[paste0("dum_", m)]] <- dum_all[[m]] 110 | } 111 | 112 | fwrite(base, here(DATA_DIR, "base_10M.csv")) 113 | 114 | 115 | 116 | # 117 | # Exportation to stata 118 | # 119 | 120 | load(here(DATA_DIR, "base_all_simulations.Rdata")) 121 | 122 | # base_all: 4 sizes, 2 groups, 10 replications 123 | 124 | for (size in 1:4) { 125 | for (replication in 1:10) { 126 | cat(".") 127 | stata_name <- paste0("_STATA/base_s", size, "_r", replication, ".dta") 128 | write_dta(as.data.frame(base_all[size, replication]), stata_name) 129 | } 130 | } 131 | 132 | 133 | #### 134 | #### Difficult Data #### 135 | #### 136 | 137 | # This benchmark data set is an adaptation of a benchmark of 138 
| # the authors of the Julia FixedEffectModels.jl software 139 | 140 | set.seed(1) # for replication 141 | base_all_diff <- list() 142 | 143 | for (pow in 4:7) { 144 | cat(".") 145 | n <- 10**pow 146 | nb_indiv <- n / 20 147 | nb_firm <- round(n / 160) 148 | nb_year <- round(n**.3) 149 | 150 | id_indiv <- sample(1:nb_indiv, n, TRUE) 151 | id_firm <- pmin(sample(0:20, n, TRUE) + pmax(1, id_indiv %/% 8 - 10), nb_firm) 152 | id_year <- sample(nb_year, n, TRUE) 153 | 154 | x1 <- 5 * cos(id_indiv) + 5 * sin(id_firm) + 5 * sin(id_year) + runif(n) 155 | x2 <- cos(id_indiv) + sin(id_firm) + sin(id_year) + rnorm(n) 156 | y <- 3 * x1 + 5 * x2 + cos(id_indiv) + cos(id_firm)^2 + sin(id_year) + rnorm(n) 157 | df <- data.frame(id_indiv = id_indiv, id_firm = id_firm, id_year = id_year, x1 = x1, x2 = x2, y = y) 158 | 159 | base_all_diff[[length(base_all_diff) + 1]] <- df 160 | 161 | if (pow < 7) { 162 | stata_name <- paste0("_STATA/base_diff_e", pow, ".dta") 163 | haven::write_dta(df, stata_name) 164 | } 165 | } 166 | 167 | save(base_all_diff, file = here(DATA_DIR, "base_all_diff.Rdata")) 168 | -------------------------------------------------------------------------------- /benchmarks/gpu_results.csv: -------------------------------------------------------------------------------- 1 | ,method,demeaner_backend,k,G,n_obs,full_feols_timing,demean_timing 2 | 0,feols,jax,1,1,10000,0.22883999347686768,0.08213257789611816 3 | 1,feols,jax,1,1,100000,0.27838222980499266,0.6599355697631836 4 | 2,feols,jax,1,2,10000,0.2594131946563721,0.07008242607116699 5 | 3,feols,jax,1,2,100000,0.5564646482467651,0.6762551069259644 6 | 4,feols,jax,1,3,10000,0.29452004432678225,0.08089225292205811 7 | 5,feols,jax,1,3,100000,0.7971727609634399,0.7361236095428467 8 | 6,feols,jax,10,1,10000,0.23609719276428223,0.08923766613006592 9 | 7,feols,jax,10,1,100000,0.35655946731567384,0.8534042596817016 10 | 8,feols,jax,10,2,10000,0.2677977800369263,0.07926526069641113 11 | 9,feols,jax,10,2,100000,0.7200102090835572,0.880342435836792 12 | 10,feols,jax,10,3,10000,0.30958850383758546,0.0926206350326538 13 | 11,feols,jax,10,3,100000,0.9418511629104614,0.8934823036193847 14 | 12,feols,numba,1,1,10000,0.2558188199996948,0.0025653839111328125 15 | 13,feols,numba,1,1,100000,0.26205954551696775,0.025234317779541014 16 | 14,feols,numba,1,2,10000,0.23201637268066405,0.0021322011947631837 17 | 15,feols,numba,1,2,100000,0.33342432975769043,0.03563899993896484 18 | 16,feols,numba,1,3,10000,0.23092274665832518,0.0023543596267700194 19 | 17,feols,numba,1,3,100000,0.25670347213745115,0.021407365798950195 20 | 18,feols,numba,10,1,10000,0.4105666160583496,0.0046176910400390625 21 | 19,feols,numba,10,1,100000,0.32753868103027345,0.08355832099914551 22 | 20,feols,numba,10,2,10000,0.23588848114013672,0.005055141448974609 23 | 21,feols,numba,10,2,100000,0.3476848602294922,0.0862342357635498 24 | 22,feols,numba,10,3,10000,0.25297954082489016,0.005847930908203125 25 | 23,feols,numba,10,3,100000,0.3176179170608521,0.056423068046569824 26 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | status: 3 | project: 4 | default: 5 | # basic 6 | target: auto 7 | threshold: 2% 8 | base: auto 9 | paths: 10 | - "pyfixest" 11 | # advanced settings 12 | branches: 13 | - master 14 | if_ci_failed: error 15 | informational: false 16 | only_pulls: false 17 | ignore: 18 | - "pyfixest/utils/dgps.py" 19 | - "pyfixest/utils/_exceptions.py" 20 | 
# cannot compute codecov for numba files (but all are tested) 21 | - "pyfixest/estimation/numba/*" 22 | 23 | comment: 24 | show_diff_only: false 25 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | /.quarto/ 2 | site_libs/ 3 | _site/ 4 | search.json 5 | objects.json 6 | reference/ 7 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/docs/.nojekyll -------------------------------------------------------------------------------- /docs/_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: website 3 | output-dir: _site 4 | 5 | execute: 6 | # point quarto to the correct python environment 7 | python: "/pyfixest/.pixi/envs/docs/Scripts/python.exe" 8 | 9 | metadata-files: 10 | - _sidebar.yml 11 | 12 | website: 13 | navbar: 14 | page_navigation: true 15 | favicon: "figures/pyfixest-logo.png" 16 | #page-footer: 17 | # center: | 18 | # Developed by [Alexander Fischer](https://github.com/py-econometrics) and [Styfen Schär](https://github.com/styfenschaer) 19 | search: true 20 | right: 21 | - icon: github 22 | href: https://github.com/py-econometrics/pyfixest/ 23 | left: 24 | - text: "PyFixest" 25 | file: pyfixest.md 26 | - text: "Quickstart" 27 | file: quickstart.qmd 28 | - text: "Function Reference" 29 | file: reference/index.qmd 30 | - text: "Changelog" 31 | file: changelog.qmd 32 | - text: "Contributing" 33 | file: contributing.qmd 34 | - text: Learn more 35 | menu: 36 | - text: "Regression Tables and Summary Statistics" 37 | file: table-layout.qmd 38 | - text: "Hypothesis Testing and Marginal Effects" 39 | file: marginaleffects.qmd 40 | - text: "Difference-in-Differences Estimation" 41 | file: difference-in-differences.qmd 42 | - file: multiple_testing.ipynb 43 | text: "Multiple Testing Corrections" 44 | - file: regression_decomposition.ipynb 45 | text: "Regression Decomposition" 46 | - file: ssc.qmd 47 | text: "On Small Sample Corrections" 48 | - text: "Compare fixest & PyFixest" 49 | file: compare-fixest-pyfixest.qmd 50 | - text: "Compare Stata & PyFixest" 51 | file: stata-2-pyfixest.qmd 52 | - text: "PyFixest on the GPU" 53 | file: pyfixest_gpu.ipynb 54 | - text: "Replicating 'The Effect' with PyFixest" 55 | file: replicating-the-effect.qmd 56 | 57 | 58 | quartodoc: 59 | package: pyfixest 60 | title: "PyFixest Function Reference" 61 | parser: numpy 62 | rewrite_all_pages: False 63 | sidebar: _sidebar.yml 64 | 65 | sections: 66 | - title: Estimation Functions 67 | desc: | 68 | User facing estimation functions 69 | contents: 70 | - estimation.estimation.feols 71 | - estimation.estimation.fepois 72 | - estimation.estimation.feglm 73 | - did.estimation.did2s 74 | - did.estimation.lpdid 75 | - did.estimation.event_study 76 | - estimation.bonferroni 77 | - estimation.rwolf 78 | - title: Estimation Classes 79 | desc: | 80 | Details on Methods and Attributes 81 | contents: 82 | - estimation.feols_.Feols 83 | - estimation.fepois_.Fepois 84 | - estimation.feiv_.Feiv 85 | - estimation.feglm_.Feglm 86 | - estimation.felogit_.Felogit 87 | - estimation.feprobit_.Feprobit 88 | - estimation.fegaussian_.Fegaussian 89 | - estimation.feols_compressed_.FeolsCompressed 90 | #- did.did.DID 91 | 
#- did.did2s.DID2s 92 | #- did.lpdid.LPDID 93 | #- did.twfe.TWFE 94 | - title: Summarize and Visualize 95 | desc: | 96 | Post-Processing of Estimation Results 97 | contents: 98 | - did.visualize.panelview 99 | - report.summary 100 | - report.etable 101 | - report.dtable 102 | - report.coefplot 103 | - report.iplot 104 | - title: Misc / Utilities 105 | desc: | 106 | PyFixest internals and utilities 107 | contents: 108 | - estimation.demean 109 | - estimation.detect_singletons 110 | - estimation.model_matrix_fixest 111 | -------------------------------------------------------------------------------- /docs/_sidebar.yml: -------------------------------------------------------------------------------- 1 | website: 2 | sidebar: 3 | - contents: 4 | - reference/index.qmd 5 | - contents: 6 | - reference/estimation.estimation.feols.qmd 7 | - reference/estimation.estimation.fepois.qmd 8 | - reference/estimation.estimation.feglm.qmd 9 | - reference/did.estimation.did2s.qmd 10 | - reference/did.estimation.lpdid.qmd 11 | - reference/did.estimation.event_study.qmd 12 | - reference/estimation.bonferroni.qmd 13 | - reference/estimation.rwolf.qmd 14 | section: Estimation Functions 15 | - contents: 16 | - reference/estimation.feols_.Feols.qmd 17 | - reference/estimation.fepois_.Fepois.qmd 18 | - reference/estimation.feiv_.Feiv.qmd 19 | - reference/estimation.feglm_.Feglm.qmd 20 | - reference/estimation.felogit_.Felogit.qmd 21 | - reference/estimation.feprobit_.Feprobit.qmd 22 | - reference/estimation.fegaussian_.Fegaussian.qmd 23 | - reference/estimation.feols_compressed_.FeolsCompressed.qmd 24 | section: Estimation Classes 25 | - contents: 26 | - reference/did.visualize.panelview.qmd 27 | - reference/report.summary.qmd 28 | - reference/report.etable.qmd 29 | - reference/report.dtable.qmd 30 | - reference/report.coefplot.qmd 31 | - reference/report.iplot.qmd 32 | section: Summarize and Visualize 33 | - contents: 34 | - reference/estimation.demean.qmd 35 | - reference/estimation.detect_singletons.qmd 36 | - reference/estimation.model_matrix_fixest.qmd 37 | section: Misc / Utilities 38 | id: reference 39 | - id: dummy-sidebar 40 | -------------------------------------------------------------------------------- /docs/figures/gpu_benchmarks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/docs/figures/gpu_benchmarks.png -------------------------------------------------------------------------------- /docs/figures/pyfixest-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/docs/figures/pyfixest-logo.png -------------------------------------------------------------------------------- /docs/latexdocs/SampleTableDoc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/docs/latexdocs/SampleTableDoc.pdf -------------------------------------------------------------------------------- /docs/latexdocs/SampleTableDoc.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article}% 2 | \usepackage[T1]{fontenc}% 3 | \usepackage[utf8]{inputenc}% 4 | \usepackage{lmodern}% 5 | \usepackage{textcomp}% 
6 | \usepackage{lastpage}% 7 | \usepackage{booktabs}% 8 | \usepackage{threeparttable}% 9 | \usepackage{makecell}% 10 | % 11 | % 12 | % 13 | \begin{document}% 14 | \normalsize% 15 | \section{A PyFixest LaTeX Table}% 16 | \label{sec:APyFixestLaTeXTable}% 17 | 18 | 19 | \begin{table}[htbp]% 20 | \renewcommand\cellalign{t} 21 | \begin{threeparttable} 22 | \begin{tabular}{lccccccc} 23 | \toprule 24 | & \multicolumn{3}{c}{Y} & \multicolumn{3}{c}{Y2} \\ 25 | \cmidrule(lr){2-4} \cmidrule(lr){5-7} 26 | & (1) & (2) & (3) & (4) & (5) & (6) \\ 27 | \midrule 28 | \addlinespace 29 | X1 & \makecell{-0.95*** \\ (0.07)} & \makecell{-0.92*** \\ (0.06)} & \makecell{-0.92*** \\ (0.06)} & \makecell{-1.27*** \\ (0.17)} & \makecell{-1.23*** \\ (0.19)} & \makecell{-1.23*** \\ (0.19)} \\ 30 | X2 & \makecell{-0.17*** \\ (0.02)} & \makecell{-0.17*** \\ (0.01)} & \makecell{-0.19*** \\ (0.03)} & \makecell{-0.13*** \\ (0.04)} & \makecell{-0.12*** \\ (0.04)} & \makecell{-0.07 \\ (0.10)} \\ 31 | X1:X2 & & & \makecell{0.01 \\ (0.02)} & & & \makecell{-0.04 \\ (0.08)} \\ 32 | \midrule 33 | \addlinespace 34 | f2 & - & x & x & - & x & x \\ 35 | f1 & x & x & x & x & x & x \\ 36 | \midrule 37 | \addlinespace 38 | Observations & 997 & 997 & 997 & 998 & 998 & 998 \\ 39 | S.E. type & by: f1 & by: f1 & by: f1 & by: f1 & by: f1 & by: f1 \\ 40 | $R^2$ & 0.49 & 0.66 & 0.66 & 0.12 & 0.17 & 0.17 \\ 41 | \bottomrule 42 | \end{tabular} 43 | \footnotesize 44 | \end{threeparttable}% 45 | \end{table} 46 | 47 | % 48 | \end{document} 49 | -------------------------------------------------------------------------------- /docs/latexdocs/SampleTableDoc2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/docs/latexdocs/SampleTableDoc2.pdf -------------------------------------------------------------------------------- /docs/latexdocs/SampleTableDoc2.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article}% 2 | \usepackage[T1]{fontenc}% 3 | \usepackage[utf8]{inputenc}% 4 | \usepackage{lmodern}% 5 | \usepackage{textcomp}% 6 | \usepackage{lastpage}% 7 | \usepackage{booktabs}% 8 | \usepackage{threeparttable}% 9 | \usepackage{makecell}% 10 | % 11 | % 12 | % 13 | \begin{document}% 14 | \normalsize% 15 | \section{A PyFixest LaTeX Table}% 16 | \label{sec:APyFixestLaTeXTable}% 17 | 18 | 19 | \begin{table}[htbp]% 20 | \renewcommand\cellalign{t} 21 | \begin{threeparttable} 22 | \begin{tabular}{lccccccc} 23 | \toprule 24 | & \multicolumn{2}{c}{US} & \multicolumn{2}{c}{China} & \multicolumn{2}{c}{EU} \\ 25 | \cmidrule(lr){2-3} \cmidrule(lr){4-5} \cmidrule(lr){6-7} 26 | & Wage & Wealth & Wage & Wealth & Wage & Wealth \\ 27 | & (1) & (2) & (3) & (4) & (5) & (6) \\ 28 | \midrule 29 | \addlinespace 30 | Age & \makecell{-0.950*** \\ (0.067)} & \makecell{-1.267*** \\ (0.174)} & \makecell{-0.924*** \\ (0.061)} & \makecell{-1.232*** \\ (0.192)} & \makecell{-0.924*** \\ (0.061)} & \makecell{-1.231*** \\ (0.192)} \\ 31 | Years of Schooling & \makecell{-0.174*** \\ (0.018)} & \makecell{-0.131** \\ (0.042)} & \makecell{-0.174*** \\ (0.015)} & \makecell{-0.118** \\ (0.042)} & \makecell{-0.185*** \\ (0.025)} & \makecell{-0.074 \\ (0.104)} \\ 32 | Age $\times$ Years of Schooling & & & & & \makecell{0.011 \\ (0.018)} & \makecell{-0.041 \\ (0.081)} \\ 33 | \midrule 34 | \addlinespace 35 | Year & - & - & x & x & x & x \\ 36 | Industry & x & x & x & x & x & x \\ 37 | \midrule 38 | 
\addlinespace 39 | Number of Clusters & 42 & 42 & 42 & 37 & 37 & 37 \\ 40 | Observations & 997 & 998 & 997 & 998 & 997 & 998 \\ 41 | $R^2$ & 0.489 & 0.120 & 0.659 & 0.172 & 0.659 & 0.172 \\ 42 | \bottomrule 43 | \end{tabular} 44 | \footnotesize Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 45 | \end{threeparttable}% 46 | \end{table} 47 | 48 | % 49 | \end{document} 50 | -------------------------------------------------------------------------------- /docs/latexdocs/SampleTableDoc3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/docs/latexdocs/SampleTableDoc3.pdf -------------------------------------------------------------------------------- /docs/latexdocs/SampleTableDoc3.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article}% 2 | \usepackage[T1]{fontenc}% 3 | \usepackage[utf8]{inputenc}% 4 | \usepackage{lmodern}% 5 | \usepackage{textcomp}% 6 | \usepackage{lastpage}% 7 | \usepackage{booktabs}% 8 | \usepackage{threeparttable}% 9 | \usepackage{makecell}% 10 | % 11 | % 12 | % 13 | \begin{document}% 14 | \normalsize% 15 | \section{A PyFixest LaTeX Table}% 16 | \label{sec:APyFixestLaTeXTable}% 17 | 18 | 19 | \begin{table}[htbp]% 20 | \renewcommand\cellalign{t} 21 | \begin{threeparttable} 22 | \begin{tabular}{lccccccc} 23 | \toprule 24 | & \multicolumn{3}{c}{EU} & \multicolumn{3}{c}{US} \\ 25 | \cmidrule(lr){2-4} \cmidrule(lr){5-7} 26 | & N & Mean & Std. Dev. & N & Mean & Std. Dev. \\ 27 | \midrule 28 | \addlinespace 29 | \emph{Blue collar} \\ 30 | \addlinespace 31 | Wage & 242 & -0.09 & 2.40 & 241 & -0.18 & 2.36 \\ 32 | Wealth & 242 & -0.38 & 5.64 & 241 & 0.10 & 5.08 \\ 33 | Age & 242 & 1.07 & 0.81 & 241 & 1.04 & 0.82 \\ 34 | Years of Schooling & 242 & 0.08 & 3.01 & 241 & -0.02 & 3.03 \\ 35 | \addlinespace 36 | \midrule 37 | \addlinespace 38 | \emph{White collar} \\ 39 | \addlinespace 40 | Wage & 275 & -0.08 & 2.17 & 239 & -0.16 & 2.32 \\ 41 | Wealth & 275 & -0.26 & 5.64 & 239 & -0.75 & 5.96 \\ 42 | Age & 275 & 1.05 & 0.81 & 239 & 1.01 & 0.78 \\ 43 | Years of Schooling & 275 & -0.36 & 3.12 & 239 & -0.19 & 3.02 \\ 44 | \addlinespace 45 | \bottomrule 46 | \end{tabular} 47 | \footnotesize 48 | \end{threeparttable}% 49 | \end{table} 50 | 51 | % 52 | \end{document} 53 | -------------------------------------------------------------------------------- /docs/marginaleffects.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Marginal Effects and Hypothesis Tests via `marginaleffects` 3 | format: 4 | html: 5 | html-table-processing: none 6 | toc: true 7 | toc-title: "On this page" 8 | toc-location: left 9 | --- 10 | 11 | We can compute marginal effects and linear and non-linear hypothesis tests via the excellent [marginaleffects](https://github.com/vincentarelbundock/pymarginaleffects) package. 12 | 13 | 14 | ```{python} 15 | from marginaleffects import hypotheses 16 | 17 | import pyfixest as pf 18 | 19 | data = pf.get_data() 20 | fit = pf.feols("Y ~ X1 + X2", data=data) 21 | 22 | fit.tidy() 23 | ``` 24 | 25 | 26 | Suppose we were interested in testing the hypothesis that $X_{1} = X_{2}$. 
Given the relatively large differences in coefficients and 27 | small standard errors, we will likely reject the null that the two parameters are equal. 28 | 29 | We can run the formal test via the `hypotheses` function from the `marginaleffects` package. 30 | 31 | 32 | ```{python} 33 | hypotheses(fit, "X1 - X2 = 0") 34 | ``` 35 | 36 | 37 | And indeed, we reject the null of equality of coefficients: we get a p-value of zero and a confidence interval that does not contain 0. 38 | 39 | ## Non-Linear Hypothesis Tests: Ratio Estimates 40 | 41 | We can also test non-linear hypotheses, in which case `marginaleffects` will automatically compute correct standard errors 42 | based on the estimated covariance matrix and the Delta method. This is, for example, useful for computing inferential 43 | statistics for the "relative uplift" in an AB test. 44 | 45 | For the moment, let's assume that $X_{1}$ is a randomly assigned treatment variable. As before, $Y$ is our variable / KPI of interest. 46 | 47 | Under randomization, the model intercept measures the "baseline", i.e. the population average of $Y$ in the absence of treatment. To compute a relative uplift, we might compute 48 | 49 | 50 | ```{python} 51 | (fit.coef().xs("X1") / fit.coef().xs("Intercept") - 1) * 100 52 | ``` 53 | 54 | 55 | So we have a really big negative treatment effect of around minus 212%! To conduct correct inference on this 56 | ratio statistic, we need to use the delta method. 57 | 58 | 59 | ### The Multivariate Delta Method 60 | 61 | In a nutshell, the delta method provides a way to approximate the asymptotic distribution of any non-linear transformation $g()$ of one or more random variables. 62 | 63 | In the case of the ratio statistic, this non-linear transformation can be denoted as $g(\theta_{1}, \theta_{2}) = \theta_{1} / \theta_{2}$. 64 | 65 | Here's the **Delta Method theorem**: 66 | 67 | First, we define $\theta = (\theta_{1}, \theta_{2})'$ and $\mu = (\mu_{1}, \mu_{2})'$. 68 | 69 | By the central limit theorem, we know that 70 | 71 | $$ 72 | \sqrt{N} (\theta - \mu) \rightarrow_{d} N(0_{2}, \Sigma_{2,2}) \text{ if } N \rightarrow \infty. 73 | $$ 74 | 75 | By the **Delta Method**, we can then approximate the limit distribution of $g(\theta)$ as 76 | 77 | 78 | $$ 79 | \sqrt{N} (g(\theta) - g(\mu)) \rightarrow_{d} N(0_{1}, \nabla g(\mu)' \Sigma \nabla g(\mu)) \text{ if } N \rightarrow \infty. 80 | $$ 81 | 82 | [Here's a long derivation of how to use the delta method for inference of ratio statistics](https://stats.stackexchange.com/questions/291594/estimation-of-population-ratio-using-delta-method). The key step in the formula above is to derive the expression for the asymptotic variance $\nabla g(\mu)' \Sigma \nabla g(\mu)$. 83 | 84 | But hey - we're lucky, because marginaleffects will do all this work for us: we don't have to derive analytic gradients ourselves =) 85 | 86 | ### Using the Delta Method via `marginaleffects` 87 | 88 | We can employ the Delta Method in `marginaleffects` via the `hypotheses` function: 89 | 90 | 91 | ```{python} 92 | hypotheses(fit, "(X1 / Intercept - 1) * 100 = 0") 93 | ``` 94 | 95 | As before, we get an estimate of around -212%. Additionally, we obtain a 95% CI via the Delta Method of [-228%, -195%]. 96 | 97 | Besides hypothesis testing, you can do a range of other cool things with the `marginaleffects` package. 98 | For example (and likely unsurprisingly), you can easily compute all sorts of marginal effects for your regression models. 
99 | For all the details, we highly recommend taking a look 100 | at the [marginaleffects zoo book](https://marginaleffects.com/index.html)! 101 | -------------------------------------------------------------------------------- /docs/pyfixest_gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## `PyFixest` on professional-tier GPUs \n", 8 | "\n", 9 | "`PyFixest` allows you to run the fixed effects demeaning on the GPU via the `demeaner_backend` argument. \n", 10 | "To do so, you will have to install `jax` and `jaxlib`, for example by typing `pip install pyfixest[jax]`.\n", 11 | "\n", 12 | "We test two back-ends for the iterative alternating-projections component of the fixed-effects regression on an Nvidia A100 GPU with 40 GB VRAM (a data-center GPU, not the kind one would typically install in consumer hardware for graphics-intensive video games). `numba` benchmarks are run on a 12-core Xeon CPU. \n", 13 | "\n", 14 | "The JAX backend exhibits major performance improvements **on the GPU** over `numba` on large problems. " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "![](figures/gpu_benchmarks.png)\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "On the **CPU**, by contrast, we find that `numba` outperforms the JAX backend. You can find details in the [benchmark section](https://github.com/py-econometrics/pyfixest/tree/master/benchmarks) of the GitHub repo. " 29 | ] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "dev", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.12.8" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 2 53 | } 54 | -------------------------------------------------------------------------------- /docs/quarto_example/QuartoExample.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Quarto and PyFixest" 3 | jupyter: python3 4 | format: 5 | pdf: 6 | number-sections: true 7 | include-in-header: 8 | - text: | 9 | \usepackage{booktabs} 10 | \usepackage{makecell} 11 | \usepackage{threeparttable} 12 | execute: 13 | echo: false 14 | warning: false 15 | author: 16 | - Peter Pan^[University of Neverland] 17 | date: last-modified 18 | abstract: | 19 | \ 20 | \ 21 | We study the effect of X1 and X2 on Y and Y2 using PyFixest. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 22 | \ 23 | \ 24 | **Keywords**: Regressions, PyFixest, Tables, Quarto 25 | --- 26 | 27 | {{< pagebreak >}} 28 | 29 | # Introduction 30 | 31 | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. 
32 | 33 | ```{python} 34 | #| echo: false 35 | #| output: false 36 | import pandas as pd 37 | import pyfixest as pf 38 | 39 | data = pf.get_data() 40 | 41 | fit1 = pf.feols("Y ~ X1 + X2 | f1", data = data) 42 | fit2 = pf.feols("Y ~ X1 + X2 | f1 + f2", data = data) 43 | fit3 = pf.feols("Y ~ X1*X2 | f1 + f2", data = data) 44 | fit4 = pf.feols("Y2 ~ X1 + X2 | f1", data = data) 45 | fit5 = pf.feols("Y2 ~ X1 + X2 | f1 + f2", data = data) 46 | fit6 = pf.feols("Y2 ~ X1*X2 | f1 + f2", data = data) 47 | 48 | labels = { 49 | "Y": "Wage", 50 | "Y2": "Wealth", 51 | "X1": "Age", 52 | "X2": "Years of Schooling", 53 | "f1": "Industry", 54 | "f2": "Year" 55 | } 56 | 57 | ``` 58 | 59 | # A PyFixest Regression Table 60 | As @tbl-main shows, LaTeX tables generated by PyFixest can be easily integrated into Quarto documents when rendered as PDF. 61 | 62 | ```{python} 63 | #| label: tbl-main 64 | #| tbl-cap: A PyFixest Regression Table 65 | #| output: asis 66 | #| echo: false 67 | #| tbl-pos: H 68 | 69 | mynotes="Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet." 70 | 71 | tab=pf.etable([fit1, fit2, fit4, fit5], 72 | labels=labels, notes=mynotes, type="tex", 73 | model_heads=["US", "China", "US", "China"]) 74 | 75 | print(tab) 76 | ``` 77 | -------------------------------------------------------------------------------- /docs/replicating-the-effect.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Replicating Examples from "The Effect" 3 | format: 4 | html: 5 | html-table-processing: none 6 | toc: true 7 | toc-title: "On this page" 8 | toc-location: left 9 | --- 10 | 11 | This notebook replicates code examples from Nick Huntington-Klein's book on causal inference, [The Effect](https://theeffectbook.net/). 12 | 13 | 14 | ```{python} 15 | from causaldata import Mroz, gapminder, organ_donations, restaurant_inspections 16 | 17 | import pyfixest as pf 18 | 19 | %load_ext watermark 20 | %watermark --iversions 21 | ``` 22 | 23 | 24 | ## Chapter 4: Describing Relationships 25 | 26 | 27 | ```{python} 28 | # Read in data 29 | dt = Mroz.load_pandas().data 30 | # Keep just working women 31 | dt = dt.query("lfp") 32 | # Create unlogged earnings 33 | dt.loc[:, "earn"] = dt["lwg"].apply("exp") 34 | 35 | # 5. 
Run multiple linear regression models by successively adding controls 36 | fit = pf.feols(fml="lwg ~ csw(inc, wc, k5)", data=dt, vcov="iid") 37 | pf.etable(fit) 38 | ``` 39 | 40 | ## Chapter 13: Regression 41 | 42 | ### Example 1 43 | 44 | 45 | ```{python} 46 | res = restaurant_inspections.load_pandas().data 47 | res.inspection_score = res.inspection_score.astype(float) 48 | res.NumberofLocations = res.NumberofLocations.astype(float) 49 | res.dtypes 50 | 51 | fit = pf.feols(fml="inspection_score ~ NumberofLocations", data=res) 52 | pf.etable([fit]) 53 | ``` 54 | 55 | 56 | ### Example 2 57 | 58 | 59 | ```{python} 60 | df = restaurant_inspections.load_pandas().data 61 | 62 | fit1 = pf.feols( 63 | fml="inspection_score ~ NumberofLocations + I(NumberofLocations^2) + Year", data=df 64 | ) 65 | fit2 = pf.feols(fml="inspection_score ~ NumberofLocations*Weekend + Year", data=df) 66 | 67 | pf.etable([fit1, fit2]) 68 | ``` 69 | 70 | 71 | 72 | ### Example 3: HC Standard Errors 73 | 74 | 75 | ```{python} 76 | pf.feols(fml="inspection_score ~ Year + Weekend", data=df, vcov="HC3").summary() 77 | ``` 78 | 79 | 80 | ### Example 4: Clustered Standard Errors 81 | 82 | 83 | ```{python} 84 | pf.feols( 85 | fml="inspection_score ~ Year + Weekend", data=df, vcov={"CRV1": "Weekend"} 86 | ).tidy() 87 | ``` 88 | 89 | ### Example 5: Bootstrap Inference 90 | 91 | 92 | ```{python} 93 | fit = pf.feols(fml="inspection_score ~ Year + Weekend", data=df) 94 | fit.wildboottest(reps=999, param="Year") 95 | ``` 96 | 97 | 98 | ## Chapter 16: Fixed Effects 99 | 100 | ### Example 1 101 | 102 | tba 103 | 104 | ### Example 2 105 | 106 | 107 | ```{python} 108 | gm = gapminder.load_pandas().data 109 | gm["logGDPpercap"] = gm["gdpPercap"].apply("log") 110 | 111 | fit = pf.feols(fml="lifeExp ~ C(country) + np.log(gdpPercap)", data=gm) 112 | fit.tidy().head() 113 | ``` 114 | 115 | 116 | ### Example 3: TWFE 117 | 118 | 119 | ```{python} 120 | # Specify the individual (country) and time (year) fixed effects in the formula 121 | fit = pf.feols(fml="lifeExp ~ np.log(gdpPercap) | country + year", data=gm) 122 | fit.summary() 123 | ``` 124 | 125 | 126 | ## Chapter 18: Difference-in-Differences 127 | 128 | ### Example 1 129 | 130 | 131 | ```{python} 132 | od = organ_donations.load_pandas().data 133 | 134 | # Create Treatment Variable 135 | od["California"] = od["State"] == "California" 136 | od["After"] = od["Quarter_Num"] > 3 137 | od["Treated"] = 1 * (od["California"] & od["After"]) 138 | 139 | did = pf.feols(fml="Rate ~ Treated | State + Quarter", data=od) 140 | did.summary() 141 | ``` 142 | 143 | 144 | ### Example 3: Dynamic Treatment Effect 145 | 146 | 147 | ```{python} 148 | od = organ_donations.load_pandas().data 149 | 150 | # Create Treatment Variable 151 | od["California"] = od["State"] == "California" 152 | # od["Quarter_Num"] = pd.Categorical(od.Quarter_Num) 153 | od["California"] = od.California.astype(float) 154 | 155 | did2 = pf.feols( 156 | fml="Rate ~ i(Quarter_Num, California, ref=3) | State + Quarter_Num", data=od 157 | ) 158 | 159 | did2.tidy() 160 | ``` 161 | -------------------------------------------------------------------------------- /docs/ssc.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: On Small Sample Corrections 3 | format: 4 | html: 5 | html-table-processing: none 6 | toc: true 7 | toc-title: "On this page" 8 | toc-location: left 9 | --- 10 | 11 | The `fixest` R package provides various options for small sample corrections. 
While it has an excellent [vignette](https://cran.r-project.org/web/packages/fixest/vignettes/standard_errors.html) on the topic, reproducing its behavior in `pyfixest` took more time than expected. So that future developers (and my future self) can stay sane, I’ve compiled all of my hard-earned understanding of how small sample adjustments work in `fixest` and how they are implemented in `pyfixest` in this document. 12 | 13 | In both `fixest` and `pyfixest`, small sample corrections are controlled by the `ssc` function. In `pyfixest`, `ssc` accepts four arguments: `adj`, `cluster_adj`, `fixef_k` and `cluster_df`. 14 | 15 | Based on these inputs, the adjusted variance-covariance matrix is computed as: 16 | 17 | ``` 18 | vcov_adj = adj_val(N, dof_k) if adj else 1 19 | * cluster_adj_val(G, cluster_df) if cluster_adj else 1 20 | * vcov 21 | ``` 22 | 23 | Where: 24 | 25 | - **`adj`**: Enables or disables the first scalar adjustment. 26 | - **`cluster_adj`**: Enables or disables the second scalar adjustment. 27 | - **`vcov`**: The unadjusted variance-covariance matrix. 28 | - **`dof_k`**: The number of estimated parameters considered in the first adjustment. Impacts `adj_val`. 29 | - **`fixef_k`**: Determines how `dof_k` is computed (how fixed effects are counted). 30 | - **`cluster_df`**: Determines how `cluster_adj_val` is computed (only relevant for multi-way clustering). 31 | - **`G`**: The number of unique clusters (`G = N` for heteroskedastic errors). 32 | 33 | Outside of this formula, we have **`df_t`**, which is the degrees of freedom used for p-values and confidence intervals: 34 | 35 | - `df_t = N - dof_k` for IID or heteroskedastic errors. 36 | - `df_t = G - 1` for clustered errors. 37 | 38 | --- 39 | 40 | # Small Sample Adjustments 41 | 42 | ## `adj = True` 43 | 44 | If `adj = True`, the adjustment factor is: 45 | 46 | `adj_val = (N - 1) / (N - dof_k)` 47 | 48 | If `adj = False`, no adjustment is applied. 49 | 50 | --- 51 | 52 | ## `fixef_k` 53 | 54 | The `fixef_k` argument controls how fixed effects contribute to `dof_k`, and thus to `adj_val`. It supports three options: 55 | 56 | - **`"none"`** 57 | - **`"full"`** 58 | - **`"nested"`** 59 | 60 | ### `fixef_k = "none"` 61 | 62 | Fixed effects are ignored when counting parameters: 63 | 64 | - **Example**: 65 | - `Y ~ X1 | f1` → `k = 1` 66 | - `Y ~ X1 + X2 | f1` → `k = 2` 67 | 68 | ### `fixef_k = "full"` 69 | 70 | Fixed effects are fully counted. For `n_fe` total fixed effects and each fixed effect `f_i`, we set `dof_k = k + k_fe`, where: 71 | 72 | 73 | - If there is **more than one** fixed effect, we drop one level from each fixed effect except the first (to avoid multicollinearity): 74 | `k_fe = sum_{i=1}^{n_fe} levels(f_i) - (n_fe - 1)` 75 | 76 | - If there is **only one** fixed effect: 77 | `k_fe = sum_{i=1}^{n_fe} levels(f_i) = levels(f_1)` 78 | 79 | ### `fixef_k = "nested"` 80 | 81 | Fixed effects may be **nested** within cluster variables (e.g., district FEs nested in state clusters). If `fixef_k = "nested"`, nested fixed effects do not count toward `k_fe`: 82 | 83 | `k_fe = sum_{i=1}^{n_fe} levels(f_i) - k_fe_nested - (n_fe - 1)` 84 | 85 | where `k_fe_nested` is the count of nested fixed effects. For cluster fixed effects, `k_fe_nested = G`, the number of clusters. 86 | 87 | > ⚠️ *Note:* If you already subtracted a level from a nested FE, you may need to add it back. 
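The sketch below illustrates how the `fixef_k` options are passed to estimation via `ssc`. It is a minimal, hypothetical example (assuming the toy columns `Y`, `X1`, `f1` from `pf.get_data()` used throughout the pyfixest docs), not a definitive recipe:

```python
import pyfixest as pf

data = pf.get_data()

# fixef_k="none": the levels of f1 do not enter dof_k, so adj_val stays close to 1.
fit_none = pf.feols("Y ~ X1 | f1", data=data, ssc=pf.ssc(adj=True, fixef_k="none"))

# fixef_k="full": all levels of f1 enter dof_k, which shrinks N - dof_k and
# therefore inflates adj_val = (N - 1) / (N - dof_k) and the standard errors.
fit_full = pf.feols("Y ~ X1 | f1", data=data, ssc=pf.ssc(adj=True, fixef_k="full"))

print(fit_none.se(), fit_full.se())
```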
88 | 89 | --- 90 | 91 | ## `cluster_adj` 92 | 93 | If `cluster_adj = True`, we apply a second correction: 94 | 95 | `cluster_adj_val = G / (G - 1)` 96 | 97 | Where: 98 | 99 | - `G` is the number of clusters for clustered errors, or `N` for heteroskedastic errors. 100 | - This follows the approach in R’s `sandwich` package, interpreting heteroskedastic errors as “singleton clusters.” 101 | 102 | > *Tip:* If `cluster_adj = True` for IID errors, `cluster_adj_val` defaults to `1`. For *heteroskedastic errors*, despite its name, `cluster_adj=True` will apply an adjustment of N / (N-1), as there are $G = N$ singleton clusters. 103 | 104 | --- 105 | 106 | ## `cluster_df` 107 | 108 | Relevant only for **multi-way clustering**. Two-way clustering, for example, can be written as: 109 | 110 | `vcov = ssc_A * vcov_A + ssc_B * vcov_B - ssc_AB * vcov_AB` 111 | 112 | where `A` and `B` are clustering dimensions, with `G_AB > G_A > G_B`. 113 | 114 | - If `cluster_df = "min"`, then G is set to the minimum value of `G_A`, `G_B`, and `G_AB`. 115 | - If `cluster_df = "conventional"`, each clustering dimension uses its own cluster count (`G_A`, `G_B`, etc.) for its respective adjustment. 116 | 117 | --- 118 | 119 | # More on Inference 120 | 121 | For computing critical values: 122 | 123 | - **OLS and IV**: use t-statistics with `df_t = N - dof_k` (non-clustered) or `df_t = G - 1` (clustered). 124 | - **GLMs**: use z-statistics (normal approximation). 125 | 126 | For multi-way clustering: 127 | 128 | - **Two-way**: `df_t = min(G_1 - 1, G_2 - 1)` 129 | - **Three-way**: `df_t = min(G_1 - 1, G_2 - 1, G_3 - 1)` *(not currently supported)* 130 | 131 | See [this implementation](https://github.com/py-econometrics/pyfixest/blob/864da9c0d1797aff70e3f5b420e4c73f7256642d/pyfixest/estimation/feols_.py#L851) for details. 132 | 133 | # In Code 134 | 135 | All of the above logic is implemented [here](https://github.com/py-econometrics/pyfixest/blob/69acf9d22eab4300853d80264ee6d01bc4bdcb35/pyfixest/utils/utils.py#L108). 
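To make the `df_t` rules above concrete, here is a small worked sketch with made-up values for `N`, `dof_k` and the cluster counts:

```python
from scipy.stats import t

N, dof_k = 1_000, 5
G_1, G_2 = 40, 25

df_iid = N - dof_k                 # IID / heteroskedastic: t with N - dof_k dof
df_oneway = G_1 - 1                # one-way clustering: t with G - 1 dof
df_twoway = min(G_1 - 1, G_2 - 1)  # two-way clustering: smallest cluster dimension

for df in (df_iid, df_oneway, df_twoway):
    print(df, t.ppf(0.975, df))   # two-sided 5% critical value
```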
136 | -------------------------------------------------------------------------------- /figures/pyfixest-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/figures/pyfixest-logo.png -------------------------------------------------------------------------------- /pyfixest/__init__.py: -------------------------------------------------------------------------------- 1 | # Import modules 2 | from pyfixest import ( 3 | did, 4 | errors, 5 | estimation, 6 | report, 7 | utils, 8 | ) 9 | from pyfixest.did import ( 10 | SaturatedEventStudy, 11 | did2s, 12 | event_study, 13 | lpdid, 14 | panelview, 15 | ) 16 | 17 | # Import frequently used functions and classes 18 | from pyfixest.estimation import ( 19 | bonferroni, 20 | feglm, 21 | feols, 22 | fepois, 23 | rwolf, 24 | wyoung, 25 | ) 26 | from pyfixest.report import coefplot, dtable, etable, iplot, make_table, summary 27 | from pyfixest.utils import ( 28 | get_data, 29 | get_ssc, 30 | ssc, 31 | ) 32 | 33 | __all__ = [ 34 | "SaturatedEventStudy", 35 | "bonferroni", 36 | "coefplot", 37 | "did", 38 | "did2s", 39 | "dtable", 40 | "errors", 41 | "estimation", 42 | "etable", 43 | "event_study", 44 | "feglm", 45 | "feols", 46 | "fepois", 47 | "get_data", 48 | "get_ssc", 49 | "iplot", 50 | "lpdid", 51 | "make_table", 52 | "panelview", 53 | "report", 54 | "rwolf", 55 | "ssc", 56 | "summary", 57 | "utils", 58 | "wyoung", 59 | ] 60 | 61 | from importlib.metadata import PackageNotFoundError, version 62 | 63 | try: 64 | __version__ = version("pyfixest") 65 | except PackageNotFoundError: 66 | __version__ = "unknown" 67 | -------------------------------------------------------------------------------- /pyfixest/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .collinear import find_collinear_variables 2 | from .crv1 import crv1_meat_loop 3 | from .demean import demean 4 | from .nested_fixed_effects import count_fixef_fully_nested_all 5 | 6 | __all__ = [ 7 | "count_fixef_fully_nested_all", 8 | "crv1_meat_loop", 9 | "demean", 10 | "find_collinear_variables", 11 | ] 12 | -------------------------------------------------------------------------------- /pyfixest/core/_core_impl.pyi: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | def _find_collinear_variables_rs(x: NDArray[np.float64], tol: float = 1e-10): ... 5 | def _crv1_meat_loop_rs( 6 | scores: NDArray[np.float64], 7 | clustid: NDArray[np.float64], 8 | cluster_col: NDArray[np.float64], 9 | ) -> NDArray: ... 10 | def _demean_rs( 11 | x: NDArray[np.float64], 12 | flist: NDArray[np.uint], 13 | weights: NDArray[np.float64], 14 | tol: float = 1e-08, 15 | maxiter: int = 100_000, 16 | ) -> tuple[np.ndarray, bool]: ... 17 | def _count_fixef_fully_nested_all_rs( 18 | all_fixef_array: NDArray, 19 | cluster_colnames: NDArray, 20 | cluster_data: NDArray[np.uint], 21 | fe_data: NDArray[np.uint], 22 | ) -> tuple[np.ndarray, int]: ... 
23 | -------------------------------------------------------------------------------- /pyfixest/core/collinear.py: -------------------------------------------------------------------------------- 1 | from ._core_impl import ( 2 | _find_collinear_variables_rs as find_collinear_variables, # noqa: F401 3 | ) 4 | -------------------------------------------------------------------------------- /pyfixest/core/crv1.py: -------------------------------------------------------------------------------- 1 | from ._core_impl import _crv1_meat_loop_rs as crv1_meat_loop # noqa: F401 2 | -------------------------------------------------------------------------------- /pyfixest/core/demean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.typing import NDArray 3 | 4 | from ._core_impl import _demean_rs 5 | 6 | 7 | def demean( 8 | x: NDArray[np.float64], 9 | flist: NDArray[np.uint], 10 | weights: NDArray[np.float64], 11 | tol: float = 1e-08, 12 | maxiter: int = 100_000, 13 | ) -> tuple[NDArray, bool]: 14 | """ 15 | Demean an array. 16 | 17 | Workhorse for demeaning an input array `x` based on the specified fixed 18 | effects and weights via the alternating projections algorithm. 19 | 20 | Parameters 21 | ---------- 22 | x : numpy.ndarray 23 | Input array of shape (n_samples, n_features). Needs to be of type float. 24 | flist : numpy.ndarray 25 | Array of shape (n_samples, n_factors) specifying the fixed effects. 26 | Needs to already be converted to integers. 27 | weights : numpy.ndarray 28 | Array of shape (n_samples,) specifying the weights. 29 | tol : float, optional 30 | Tolerance criterion for convergence. Defaults to 1e-08. 31 | maxiter : int, optional 32 | Maximum number of iterations. Defaults to 100_000. 33 | 34 | Returns 35 | ------- 36 | tuple[numpy.ndarray, bool] 37 | A tuple containing the demeaned array of shape (n_samples, n_features) 38 | and a boolean indicating whether the algorithm converged successfully. 39 | 40 | Examples 41 | -------- 42 | ```{python} 43 | import numpy as np 44 | import pyfixest as pf 45 | from pyfixest.utils.dgps import get_blw 46 | from pyfixest.estimation.demean_ import demean 47 | from formulaic import model_matrix 48 | 49 | fml = "y ~ treat | state + year" 50 | 51 | data = get_blw() 52 | data.head() 53 | 54 | Y, rhs = model_matrix(fml, data) 55 | X = rhs[0].drop(columns="Intercept") 56 | fe = rhs[1].drop(columns="Intercept") 57 | YX = np.concatenate([Y, X], axis=1) 58 | 59 | # to numpy 60 | Y = Y.to_numpy() 61 | X = X.to_numpy() 62 | YX = np.concatenate([Y, X], axis=1) 63 | fe = fe.to_numpy().astype(int) # demean requires fixed effects as ints! 
64 | 
65 |     YX_demeaned, success = demean(YX, fe, weights = np.ones(YX.shape[0]))
66 |     Y_demeaned = YX_demeaned[:, 0]
67 |     X_demeaned = YX_demeaned[:, 1:]
68 | 
69 |     print(np.linalg.lstsq(X_demeaned, Y_demeaned, rcond=None)[0])
70 |     print(pf.feols(fml, data).coef())
71 |     ```
72 |     """
73 |     return _demean_rs(x, flist.astype(np.uint), weights, tol, maxiter)
74 | 
--------------------------------------------------------------------------------
/pyfixest/core/nested_fixed_effects.py:
--------------------------------------------------------------------------------
1 | from ._core_impl import (
2 |     _count_fixef_fully_nested_all_rs as count_fixef_fully_nested_all,  # noqa: F401
3 | )
4 | 
--------------------------------------------------------------------------------
/pyfixest/core/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/pyfixest/core/py.typed
--------------------------------------------------------------------------------
/pyfixest/did/__init__.py:
--------------------------------------------------------------------------------
1 | from pyfixest.did.estimation import (
2 |     did2s,
3 |     event_study,
4 |     lpdid,
5 | )
6 | from pyfixest.did.saturated_twfe import SaturatedEventStudy
7 | from pyfixest.did.visualize import (
8 |     panelview,
9 | )
10 | 
11 | __all__ = ["SaturatedEventStudy", "did2s", "event_study", "lpdid", "panelview"]
12 | 
--------------------------------------------------------------------------------
/pyfixest/did/data/lpdidtestdata1.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/pyfixest/did/data/lpdidtestdata1.dta
--------------------------------------------------------------------------------
/pyfixest/did/did.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Optional
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | 
8 | class DID(ABC):
9 |     """
10 |     A class used to represent the DID (Differences-in-Differences) model.
11 | 
12 |     Attributes
13 |     ----------
14 |     data : pandas.DataFrame
15 |         The DataFrame containing all variables.
16 |     yname : str
17 |         The name of the dependent variable.
18 |     idname : str
19 |         The name of the identifier variable.
20 |     tname : str
21 |         Variable name for calendar period. Must be an integer in the format
22 |         YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'.
23 |         Datetime variables are currently not accepted.
24 |     gname : str
25 |         Unit-specific time of initial treatment. Must be an integer in the format
26 |         YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'.
27 |         Datetime variables are currently not accepted. Never treated units must
28 |         have a value of 0.
29 |     xfml : str
30 |         The formula for the covariates.
31 |     att : bool
32 |         Whether to estimate the average treatment effect on the treated (ATT) or
33 |         the canonical event study design with all leads and lags. Default is True.
34 |     cluster : Optional[str]
35 |         The name of the cluster variable.
36 | """ 37 | 38 | @abstractmethod 39 | def __init__( 40 | self, 41 | data: pd.DataFrame, 42 | yname: str, 43 | idname: str, 44 | tname: str, 45 | gname: str, 46 | cluster: Optional[str] = None, 47 | xfml: Optional[str] = None, 48 | att: bool = True, 49 | ): 50 | # do some checks here 51 | 52 | self._data = data.copy() 53 | self._yname = yname 54 | self._idname = idname 55 | self._tname = tname 56 | self._gname = gname 57 | self._xfml = xfml 58 | self._att = att 59 | self._cluster = cluster 60 | 61 | # check if tname and gname are of type int (either int 64, 32, 8) 62 | 63 | for var in [self._tname, self._gname]: 64 | if self._data[var].dtype not in [ 65 | "int64", 66 | "int32", 67 | "int8", 68 | "float64", 69 | "float32", 70 | ]: 71 | raise ValueError( 72 | f"""The variable {var} must be of a numeric type, and more 73 | specifically, in the format YYYYMMDDHHMMSS. I.e. either 2012, 2013, 74 | etc. or 201201, 201202, 201203 etc.""" 75 | ) 76 | 77 | # create a treatment variable 78 | self._data["is_treated"] = ( 79 | self._data[self._tname] >= self._data[self._gname] 80 | ) * (self._data[self._gname] > 0) 81 | self._data = self._data.merge( 82 | self._data.assign( 83 | first_treated_period=self._data[self._tname] * self._data["is_treated"] 84 | ) 85 | .groupby(self._idname)["first_treated_period"] 86 | .apply(lambda x: x[x > 0].min()), 87 | on=self._idname, 88 | ) 89 | self._data["rel_time"] = ( 90 | self._data[self._tname] - self._data["first_treated_period"] 91 | ) 92 | self._data["first_treated_period"] = ( 93 | self._data["first_treated_period"].replace(np.nan, 0).astype(int) 94 | ) 95 | self._data["rel_time"] = self._data["rel_time"].replace(np.nan, np.inf) 96 | 97 | @abstractmethod 98 | def estimate(self): # noqa: D102 99 | pass 100 | 101 | @abstractmethod 102 | def vcov(self): # noqa: D102 103 | pass 104 | 105 | @abstractmethod 106 | def iplot(self): # noqa: D102 107 | pass 108 | 109 | @abstractmethod 110 | def tidy(self): # noqa: D102 111 | pass 112 | 113 | @abstractmethod 114 | def summary(self): # noqa: D102 115 | pass 116 | -------------------------------------------------------------------------------- /pyfixest/did/twfe.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, cast 2 | 3 | import pandas as pd 4 | 5 | from pyfixest.did.did import DID 6 | from pyfixest.estimation.estimation import feols 7 | from pyfixest.estimation.feols_ import Feols 8 | 9 | 10 | class TWFE(DID): 11 | """ 12 | Estimate a Two-way Fixed Effects model. 13 | 14 | Estimate a Difference-in-Differences model using the two-way fixed effects 15 | estimator. 16 | 17 | Attributes 18 | ---------- 19 | data: pandas.DataFrame 20 | The DataFrame containing all variables. 21 | yname: str 22 | The name of the dependent variable. 23 | idname: str 24 | The name of the id variable. 25 | tname: str 26 | Variable name for calendar period. Must be an integer in the format 27 | YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'. 28 | Datetime variables are currently not accepted. 29 | gname: str 30 | Unit-specific time of initial treatment. Must be an integer in the format 31 | YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'. 32 | Datetime variables are currently not accepted. Never treated units 33 | must have a value of 0. 34 | xfml: str 35 | The formula for the covariates. 
36 |     att: bool
37 |         Whether to estimate the average treatment effect on the treated (ATT) or the
38 |         canonical event study design with all leads and lags. Default is True.
39 |     cluster: Optional[str]
40 |         The name of the cluster variable.
41 |     """
42 | 
43 |     def __init__(
44 |         self,
45 |         data: pd.DataFrame,
46 |         yname: str,
47 |         idname: str,
48 |         tname: str,
49 |         gname: str,
50 |         xfml: Optional[str] = None,
51 |         att: bool = True,
52 |         cluster: Optional[str] = "idname",
53 |     ) -> None:
54 |         super().__init__(
55 |             data=data,
56 |             yname=yname,
57 |             idname=idname,
58 |             tname=tname,
59 |             gname=gname,
60 |             xfml=xfml,
61 |             att=att,
62 |             cluster=cluster,
63 |         )
64 | 
65 |         self._estimator = "twfe"
66 | 
67 |         if self._xfml is not None:
68 |             self._fml = f"{yname} ~ is_treated + {xfml} | {idname} + {tname}"
69 |         else:
70 |             self._fml = f"{yname} ~ is_treated | {idname} + {tname}"
71 | 
72 |     def estimate(self):
73 |         """Estimate the TWFE model."""
74 |         _fml = self._fml
75 |         _data = self._data
76 | 
77 |         fit = cast(Feols, feols(fml=_fml, data=_data))
78 |         self._fit = fit
79 | 
80 |         return fit
81 | 
82 |     def vcov(self):
83 |         """
84 |         Variance-covariance matrix.
85 | 
86 |         The vcov matrix is calculated via the [Feols](/reference/Feols.qmd) object.
87 | 
88 |         Notes
89 |         -----
90 |         Method not needed.
91 |         """
92 |         pass
93 | 
94 |     def iplot(
95 |         self,
96 |         alpha: float = 0.05,
97 |         figsize: tuple[int, int] = (500, 300),
98 |         yintercept: Optional[int] = None,
99 |         xintercept: Optional[int] = None,
100 |         rotate_xticks: int = 0,
101 |         title: str = "TWFE Event Study Estimate",
102 |         coord_flip: bool = False,
103 |     ):
104 |         """Plot TWFE estimates."""
105 |         self._fit.iplot(  # delegate to the fitted Feols object to avoid infinite recursion
106 |             alpha=alpha,
107 |             figsize=figsize,
108 |             yintercept=yintercept,
109 |             xintercept=xintercept,
110 |             rotate_xticks=rotate_xticks,
111 |             title=title,
112 |             coord_flip=coord_flip,
113 |         )
114 | 
115 |     def tidy(self):  # noqa: D102
116 |         return self._fit.tidy()  # delegate to the fitted Feols object
117 | 
118 |     def summary(self):  # noqa: D102
119 |         return self._fit.summary()  # delegate to the fitted Feols object
120 | 
--------------------------------------------------------------------------------
/pyfixest/errors/__init__.py:
--------------------------------------------------------------------------------
1 | class FixedEffectInteractionError(Exception):  # noqa: D101
2 |     pass
3 | 
4 | 
5 | class CovariateInteractionError(Exception):  # noqa: D101
6 |     pass
7 | 
8 | 
9 | class DuplicateKeyError(Exception):  # noqa: D101
10 |     pass
11 | 
12 | 
13 | class EndogVarsAsCovarsError(Exception):  # noqa: D101
14 |     pass
15 | 
16 | 
17 | class InstrumentsAsCovarsError(Exception):  # noqa: D101
18 |     pass
19 | 
20 | 
21 | class UnderDeterminedIVError(Exception):  # noqa: D101
22 |     pass
23 | 
24 | 
25 | class UnsupportedMultipleEstimationSyntax(Exception):  # noqa: D101
26 |     pass
27 | 
28 | 
29 | class VcovTypeNotSupportedError(Exception):  # noqa: D101
30 |     pass
31 | 
32 | 
33 | class NanInClusterVarError(Exception):  # noqa: D101
34 |     pass
35 | 
36 | 
37 | class DepvarIsNotNumericError(Exception):  # noqa: D101
38 |     pass
39 | 
40 | 
41 | class NonConvergenceError(Exception):  # noqa: D101
42 |     pass
43 | 
44 | 
45 | class MatrixNotFullRankError(Exception):  # noqa: D101
46 |     pass
47 | 
48 | 
49 | class EmptyDesignMatrixError(Exception):  # noqa: D101
50 |     pass
51 | 
52 | 
53 | class FeatureDeprecationError(Exception):  # noqa: D101
54 |     pass
55 | 
56 | 
57 | class EmptyVcovError(Exception):  # noqa: D101
58 |     pass
59 | 
60 | 
61 | __all__ = [
62 |     "CovariateInteractionError",
63 |     "DepvarIsNotNumericError",
64 |     "DuplicateKeyError",
65 |     "EmptyDesignMatrixError",
66 |     "EmptyVcovError",
67 | 
"EndogVarsAsCovarsError", 68 | "FeatureDeprecationError", 69 | "FixedEffectInteractionError", 70 | "InstrumentsAsCovarsError", 71 | "MatrixNotFullRankError", 72 | "NanInClusterVarError", 73 | "NonConvergenceError", 74 | "UnderDeterminedIVError", 75 | "UnsupportedMultipleEstimationSyntax", 76 | "VcovTypeNotSupportedError", 77 | ] 78 | -------------------------------------------------------------------------------- /pyfixest/estimation/__init__.py: -------------------------------------------------------------------------------- 1 | from pyfixest.estimation import literals 2 | from pyfixest.estimation.demean_ import ( 3 | demean, 4 | ) 5 | from pyfixest.estimation.detect_singletons_ import ( 6 | detect_singletons, 7 | ) 8 | from pyfixest.estimation.estimation import ( 9 | feglm, 10 | feols, 11 | fepois, 12 | ) 13 | from pyfixest.estimation.fegaussian_ import Fegaussian 14 | from pyfixest.estimation.feiv_ import ( 15 | Feiv, 16 | ) 17 | from pyfixest.estimation.felogit_ import Felogit 18 | from pyfixest.estimation.feols_ import ( 19 | Feols, 20 | ) 21 | from pyfixest.estimation.fepois_ import ( 22 | Fepois, 23 | ) 24 | from pyfixest.estimation.feprobit_ import Feprobit 25 | from pyfixest.estimation.FixestMulti_ import ( 26 | FixestMulti, 27 | ) 28 | from pyfixest.estimation.model_matrix_fixest_ import ( 29 | model_matrix_fixest, 30 | ) 31 | from pyfixest.estimation.multcomp import ( 32 | bonferroni, 33 | rwolf, 34 | wyoung, 35 | ) 36 | 37 | __all__ = [ 38 | "Fegaussian", 39 | "Feiv", 40 | "Felogit", 41 | "Feols", 42 | "Fepois", 43 | "Feprobit", 44 | "FixestMulti", 45 | "bonferroni", 46 | "demean", 47 | "detect_singletons", 48 | "feglm", 49 | "feols", 50 | "fepois", 51 | "literals", 52 | "model_matrix_fixest", 53 | "rwolf", 54 | "wyoung", 55 | ] 56 | -------------------------------------------------------------------------------- /pyfixest/estimation/backends.py: -------------------------------------------------------------------------------- 1 | from pyfixest.core.collinear import find_collinear_variables 2 | from pyfixest.core.crv1 import crv1_meat_loop 3 | from pyfixest.core.demean import demean 4 | from pyfixest.core.nested_fixed_effects import count_fixef_fully_nested_all 5 | from pyfixest.estimation.demean_ import demean as demean_nb 6 | from pyfixest.estimation.jax.demean_jax_ import demean_jax as demean_jax_fn 7 | from pyfixest.estimation.numba.find_collinear_variables_nb import ( 8 | _find_collinear_variables_nb as find_collinear_variables_nb, 9 | ) 10 | from pyfixest.estimation.numba.nested_fixef_nb import ( 11 | _count_fixef_fully_nested_all as count_fixef_fully_nested_all_nb, 12 | ) 13 | from pyfixest.estimation.vcov_utils import _crv1_meat_loop as crv1_meat_loop_nb 14 | 15 | find_collinear_variables_jax = find_collinear_variables_nb 16 | crv1_meat_loop_jax = crv1_meat_loop_nb 17 | count_fixef_fully_nested_all_jax = count_fixef_fully_nested_all_nb 18 | 19 | BACKENDS = { 20 | "numba": { 21 | "demean": demean_nb, 22 | "collinear": find_collinear_variables_nb, 23 | "crv1_meat": crv1_meat_loop_nb, 24 | "nested": count_fixef_fully_nested_all_nb, 25 | }, 26 | "rust": { 27 | "demean": demean, 28 | "collinear": find_collinear_variables, 29 | "crv1_meat": crv1_meat_loop, 30 | "nested": count_fixef_fully_nested_all, 31 | }, 32 | "jax": { 33 | "demean": demean_jax_fn, 34 | "collinear": find_collinear_variables_jax, 35 | "crv1_meat": crv1_meat_loop_jax, 36 | "nested": count_fixef_fully_nested_all_jax, 37 | }, 38 | } 39 | 
-------------------------------------------------------------------------------- /pyfixest/estimation/ccv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from numpy.random import Generator 4 | 5 | from pyfixest.estimation.estimation import feols 6 | 7 | 8 | def _compute_CCV( 9 | fml: str, 10 | Y: np.ndarray, 11 | X: np.ndarray, 12 | W: np.ndarray, 13 | rng: Generator, 14 | data: pd.DataFrame, 15 | treatment: str, 16 | cluster_vec: np.ndarray, 17 | pk: float, 18 | tau_full: float, 19 | ) -> float: 20 | """ 21 | Compute the causal cluster variance estimator following Abadie et al (QJE 2023). 22 | 23 | Parameters 24 | ---------- 25 | fml : str 26 | Formula of the regression model. 27 | Y : np.array 28 | Array with the dependent variable. 29 | X : np.array 30 | Array of the regression design matrix. 31 | W : np.array 32 | Array with the treatment variable. 33 | rng : np.random.default_rng 34 | Random number generator. 35 | data : pd.DataFrame 36 | Dataframe with the data. 37 | treatment : str 38 | Name of the treatment variable. 39 | cluster_vec : np.array 40 | Array with unique cluster identifiers. 41 | pk : float between 0 and 1. 42 | The proportion of clusters sampled. 43 | Default is 1, which means all clusters are sampled. 44 | tau_full : float 45 | The treatment effect estimate for the full sample. 46 | """ 47 | unique_clusters = np.unique(cluster_vec) 48 | N = data.shape[0] 49 | G = len(unique_clusters) 50 | 51 | Z = rng.choice([False, True], size=N) 52 | # compute alpha, tau using Z == 0 53 | fit_split1 = feols(fml, data[Z]) 54 | coefs_split = fit_split1.coef().to_numpy() 55 | tau = fit_split1.coef().xs(treatment) 56 | 57 | # estimate treatment effect for each cluster 58 | # for both the full sample and the subsample 59 | pk_term = 0.0 60 | tau_ms = np.zeros(G) 61 | N = 0 62 | for i, m in enumerate(unique_clusters): 63 | ind_m = cluster_vec == m 64 | Nm = np.sum(ind_m) 65 | N += Nm 66 | ind_m_and_split = ind_m & Z 67 | 68 | treatment_nested_in_cluster = data.loc[ind_m, treatment].nunique() == 1 69 | treatment_nested_in_cluster_split = ( 70 | data.loc[ind_m_and_split, treatment].nunique() == 1 71 | ) 72 | 73 | if treatment_nested_in_cluster: 74 | aux_tau_full = tau_full 75 | else: 76 | fit_m_full = feols(fml, data[ind_m]) 77 | aux_tau_full = float(fit_m_full.coef().xs(treatment)) 78 | 79 | # treatment effect in cluster for subsample 80 | if treatment_nested_in_cluster_split: 81 | aux_tau = tau 82 | else: 83 | fit_m = feols(fml, data[ind_m_and_split]) 84 | aux_tau = fit_m.coef().xs(treatment) 85 | tau_ms[i] = aux_tau 86 | 87 | # compute the pk term in Z0 88 | aux_pk = Nm * ((aux_tau_full - tau) ** 2) 89 | pk_term += aux_pk 90 | 91 | pk_term *= (1 - pk) / N 92 | uhat = Y - X @ coefs_split 93 | Wbar = np.mean(W[Z]) 94 | Zavg = 1 - np.mean(Z) 95 | Zavg_squared = Zavg**2 96 | n_adj = N * (Wbar**2) * ((1 - Wbar) ** 2) 97 | 98 | vcov_ccv = 0 99 | for i, m in enumerate(unique_clusters): 100 | ind_m = cluster_vec == m 101 | 102 | res_term = (W[ind_m & ~Z] - Wbar) * uhat[ind_m & ~Z] 103 | tau_term = (tau_ms[i] - tau) * Wbar * (1.0 - Wbar) 104 | diff = res_term - tau_term 105 | sq_sum = np.sum(diff) ** 2 106 | sum_sq = np.sum(diff**2) 107 | vcov_ccv += ( 108 | (1.0 / (Zavg**2)) * sq_sum 109 | - ((1.0 - Zavg) / (Zavg_squared)) * sum_sq 110 | + n_adj * pk_term 111 | ) 112 | 113 | return vcov_ccv / n_adj 114 | -------------------------------------------------------------------------------- 
/pyfixest/estimation/demean_jax_.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import jax 4 | import jax.numpy as jnp 5 | import numpy as np 6 | from jax import config 7 | 8 | 9 | @partial(jax.jit, static_argnames=("n_groups", "tol", "maxiter")) 10 | def _demean_jax_impl( 11 | x: jnp.ndarray, 12 | flist: jnp.ndarray, 13 | weights: jnp.ndarray, 14 | n_groups: int, 15 | tol: float, 16 | maxiter: int, 17 | ) -> tuple[jnp.ndarray, bool]: 18 | """JIT-compiled implementation of demeaning.""" 19 | n_factors = flist.shape[1] 20 | 21 | @jax.jit 22 | def _apply_factor(carry, j): 23 | """Process a single factor.""" 24 | x = carry 25 | factor_ids = flist[:, j] 26 | wx = x * weights[:, None] 27 | 28 | # Compute group weights and weighted sums 29 | group_weights = jnp.bincount(factor_ids, weights=weights, length=n_groups) 30 | group_sums = jax.vmap( 31 | lambda col: jnp.bincount(factor_ids, weights=col, length=n_groups) 32 | )(wx.T).T 33 | 34 | # Compute and subtract means 35 | means = group_sums / group_weights[:, None] 36 | return x - means[factor_ids], None 37 | 38 | @jax.jit 39 | def _demean_step(x_curr): 40 | """Single demeaning step for all factors.""" 41 | # Process all factors using scan 42 | result, _ = jax.lax.scan(_apply_factor, x_curr, jnp.arange(n_factors)) 43 | return result 44 | 45 | @jax.jit 46 | def _body_fun(state): 47 | """Body function for while_loop.""" 48 | i, x_curr, x_prev, converged = state 49 | x_new = _demean_step(x_curr) 50 | max_diff = jnp.max(jnp.abs(x_new - x_curr)) 51 | has_converged = max_diff < tol 52 | return i + 1, x_new, x_curr, has_converged 53 | 54 | @jax.jit 55 | def _cond_fun(state): 56 | """Condition function for while_loop.""" 57 | i, _, _, converged = state 58 | return jnp.logical_and(i < maxiter, jnp.logical_not(converged)) 59 | 60 | # Run the iteration loop using while_loop 61 | init_state = (0, x, x - 1.0, False) 62 | final_i, final_x, _, converged = jax.lax.while_loop( 63 | _cond_fun, _body_fun, init_state 64 | ) 65 | 66 | return final_x, converged 67 | 68 | 69 | def demean_jax( 70 | x: np.ndarray, 71 | flist: np.ndarray, 72 | weights: np.ndarray, 73 | tol: float = 1e-08, 74 | maxiter: int = 100_000, 75 | ) -> tuple[np.ndarray, bool]: 76 | """Fast and reliable JAX implementation with static shapes.""" 77 | # Enable float64 precision 78 | config.update("jax_enable_x64", True) 79 | 80 | # Compute n_groups before JIT 81 | n_groups = int(np.max(flist) + 1) 82 | 83 | # Convert inputs to JAX arrays 84 | x_jax = jnp.asarray(x, dtype=jnp.float64) 85 | flist_jax = jnp.asarray(flist, dtype=jnp.int32) 86 | weights_jax = jnp.asarray(weights, dtype=jnp.float64) 87 | 88 | # Call the JIT-compiled implementation 89 | result_jax, converged = _demean_jax_impl( 90 | x_jax, flist_jax, weights_jax, n_groups, tol, maxiter 91 | ) 92 | return np.array(result_jax), converged 93 | -------------------------------------------------------------------------------- /pyfixest/estimation/detect_singletons_.py: -------------------------------------------------------------------------------- 1 | import numba as nb 2 | import numpy as np 3 | from numba.extending import overload 4 | 5 | 6 | def _prepare_fixed_effects(ary): 7 | pass 8 | 9 | 10 | @overload(_prepare_fixed_effects) 11 | def _ol_preproc_fixed_effects(ary): 12 | # If array is already an F-array we tolerate 13 | # any dtype because it saves us a copy 14 | if ary.layout == "F": 15 | return lambda ary: ary 16 | 17 | if not isinstance(ary.dtype, 
nb.types.Integer): 18 | raise nb.TypingError("Fixed effects must be integers") 19 | 20 | max_nbits = 32 21 | nbits = min(max_nbits, ary.dtype.bitwidth) 22 | dtype = nb.types.Integer.from_bitwidth(nbits, signed=False) 23 | 24 | def impl(ary): 25 | n, m = ary.shape 26 | out = np.empty((m, n), dtype=dtype).T 27 | out[:] = ary[:] 28 | return out 29 | 30 | return impl 31 | 32 | 33 | @nb.njit 34 | def detect_singletons(ids: np.ndarray) -> np.ndarray: 35 | """ 36 | Detect singleton fixed effects in a dataset. 37 | 38 | This function iterates over the columns of a 2D numpy array representing 39 | fixed effects to identify singleton fixed effects. 40 | An observation is considered a singleton if it is the only one in its group 41 | (fixed effect identifier). 42 | 43 | Parameters 44 | ---------- 45 | ids : np.ndarray 46 | A 2D numpy array representing fixed effects, with a shape of (n_samples, 47 | n_features). 48 | Elements should be non-negative integers representing fixed effect identifiers. 49 | 50 | Returns 51 | ------- 52 | numpy.ndarray 53 | A boolean array of shape (n_samples,), indicating which observations have 54 | a singleton fixed effect. 55 | 56 | Notes 57 | ----- 58 | The algorithm iterates over columns to identify fixed effects. After each 59 | column is processed, it updates the record of non-singleton rows. This approach 60 | accounts for the possibility that removing an observation in one column can 61 | lead to the emergence of new singletons in subsequent columns. 62 | 63 | For performance reasons, the input array should be in column-major order. 64 | Operating on a row-major array can lead to significant performance losses. 65 | """ 66 | ids = _prepare_fixed_effects(ids) 67 | n_samples, n_features = ids.shape 68 | 69 | max_fixef = np.max(ids) 70 | counts = np.empty(max_fixef + 1, dtype=np.uint32) 71 | 72 | n_non_singletons = n_samples 73 | non_singletons = np.arange(n_non_singletons, dtype=np.uint32) 74 | 75 | while True: 76 | n_non_singletons_curr = n_non_singletons 77 | 78 | for j in range(n_features): 79 | col = ids[:, j] 80 | 81 | counts[:] = 0 82 | n_singletons = 0 83 | for i in range(n_non_singletons): 84 | e = col[non_singletons[i]] 85 | c = counts[e] 86 | # Branchless version of: 87 | # 88 | # if counts[e] == 1: 89 | # n_singletons -= 1 90 | # elif counts[e] == 0: 91 | # n_singletons += 1 92 | # 93 | n_singletons += (c == 0) - (c == 1) 94 | counts[e] += 1 95 | 96 | if not n_singletons: 97 | continue 98 | 99 | cnt = 0 100 | for i in range(n_non_singletons): 101 | e = col[non_singletons[i]] 102 | if counts[e] != 1: 103 | non_singletons[cnt] = non_singletons[i] 104 | cnt += 1 105 | 106 | n_non_singletons = cnt 107 | 108 | if n_non_singletons_curr == n_non_singletons: 109 | break 110 | 111 | is_singleton = np.ones(n_samples, dtype=np.bool_) 112 | for i in range(n_non_singletons): 113 | is_singleton[non_singletons[i]] = False 114 | 115 | return is_singleton 116 | -------------------------------------------------------------------------------- /pyfixest/estimation/detect_singletons_jax.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import jax 4 | import jax.numpy as jnp 5 | import numpy as np 6 | 7 | 8 | @partial(jax.jit, static_argnames=("n_samples", "n_features", "max_fixef")) 9 | def _process_features_jax( 10 | ids, non_singletons, n_non_singletons, n_samples, n_features, max_fixef 11 | ): 12 | """JIT-compiled inner loop for processing features with static shapes.""" 13 | 14 | def 
process_feature(carry, j): 15 | non_singletons, n_non_singletons = carry 16 | col = ids[:, j] 17 | 18 | # Initialize counts array 19 | counts = jnp.zeros(max_fixef + 1, dtype=jnp.int32) 20 | 21 | # Count occurrences and track singletons 22 | def count_loop(i, state): 23 | counts, n_singletons = state 24 | e = col[non_singletons[i]] 25 | c = counts[e] 26 | # Exactly match Numba: n_singletons += (c == 0) - (c == 1) 27 | n_singletons = n_singletons + (c == 0) - (c == 1) 28 | counts = counts.at[e].add(1) 29 | return (counts, n_singletons) 30 | 31 | counts, n_singletons = jax.lax.fori_loop( 32 | 0, n_non_singletons, count_loop, (counts, 0) 33 | ) 34 | 35 | # Early return if no singletons found 36 | def no_singletons(_): 37 | return (non_singletons, n_non_singletons) 38 | 39 | # Update non_singletons if singletons found 40 | def update_singletons(_): 41 | def update_loop(i, state): 42 | new_non_singletons, cnt = state 43 | e = col[non_singletons[i]] 44 | keep = counts[e] != 1 45 | # Exactly match Numba's update logic 46 | new_non_singletons = jax.lax.cond( 47 | keep, 48 | lambda x: x[0].at[x[1]].set(non_singletons[i]), 49 | lambda x: x[0], 50 | (new_non_singletons, cnt), 51 | ) 52 | return (new_non_singletons, cnt + keep) 53 | 54 | new_non_singletons = jnp.zeros_like(non_singletons) 55 | new_non_singletons, new_cnt = jax.lax.fori_loop( 56 | 0, n_non_singletons, update_loop, (new_non_singletons, 0) 57 | ) 58 | return (new_non_singletons, new_cnt) 59 | 60 | return jax.lax.cond( 61 | n_singletons == 0, no_singletons, update_singletons, None 62 | ), None 63 | 64 | return jax.lax.scan( 65 | process_feature, (non_singletons, n_non_singletons), jnp.arange(n_features) 66 | )[0] 67 | 68 | 69 | def detect_singletons_jax(ids: np.ndarray) -> np.ndarray: 70 | """ 71 | JAX implementation of singleton detection in fixed effects. 72 | 73 | Parameters 74 | ---------- 75 | ids : numpy.ndarray 76 | A 2D numpy array representing fixed effects, with shape (n_samples, n_features). 77 | Elements should be non-negative integers representing fixed effect identifiers. 78 | 79 | Returns 80 | ------- 81 | numpy.ndarray 82 | A boolean array of shape (n_samples,), indicating which observations have 83 | a singleton fixed effect. 
84 | """ 85 | # Get dimensions and max_fixef before JIT 86 | n_samples, n_features = ids.shape 87 | max_fixef = int(np.max(ids)) # Use numpy.max instead of jax.numpy.max 88 | 89 | # Convert input to JAX array 90 | ids = jnp.array(ids, dtype=jnp.int32) 91 | 92 | # Initialize with all indices as non-singletons 93 | init_non_singletons = jnp.arange(n_samples) 94 | init_n_non_singletons = n_samples 95 | 96 | @partial(jax.jit, static_argnames=("n_samples", "n_features", "max_fixef")) 97 | def _singleton_detection_loop( 98 | ids, non_singletons, n_non_singletons, n_samples, n_features, max_fixef 99 | ): 100 | def cond_fun(state): 101 | prev_n, curr_carry = state 102 | return prev_n != curr_carry[1] 103 | 104 | def body_fun(state): 105 | prev_n, curr_carry = state 106 | new_carry = _process_features_jax( 107 | ids, curr_carry[0], curr_carry[1], n_samples, n_features, max_fixef 108 | ) 109 | return (curr_carry[1], new_carry) 110 | 111 | init_state = (n_samples + 1, (non_singletons, n_non_singletons)) 112 | final_state = jax.lax.while_loop(cond_fun, body_fun, init_state) 113 | return final_state[1] 114 | 115 | # Run iterations until convergence 116 | final_non_singletons, final_n = _singleton_detection_loop( 117 | ids, 118 | init_non_singletons, 119 | init_n_non_singletons, 120 | n_samples, 121 | n_features, 122 | max_fixef, 123 | ) 124 | 125 | # Create final boolean mask 126 | is_singleton = jnp.ones(n_samples, dtype=jnp.bool_) 127 | 128 | @jax.jit 129 | def _mark_non_singletons(is_singleton, final_non_singletons, final_n): 130 | def mark_non_singleton(i, acc): 131 | return acc.at[final_non_singletons[i]].set(False) 132 | 133 | return jax.lax.fori_loop(0, final_n, mark_non_singleton, is_singleton) 134 | 135 | is_singleton = _mark_non_singletons(is_singleton, final_non_singletons, final_n) 136 | 137 | return np.array(is_singleton) 138 | -------------------------------------------------------------------------------- /pyfixest/estimation/fegaussian_.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Mapping 2 | from typing import Any, Literal, Optional, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from pyfixest.estimation.feglm_ import Feglm 8 | from pyfixest.estimation.FormulaParser import FixestFormula 9 | 10 | 11 | class Fegaussian(Feglm): 12 | "Class for the estimation of a fixed-effects GLM with normal errors." 
13 | 14 | def __init__( 15 | self, 16 | FixestFormula: FixestFormula, 17 | data: pd.DataFrame, 18 | ssc_dict: dict[str, Union[str, bool]], 19 | drop_singletons: bool, 20 | drop_intercept: bool, 21 | weights: Optional[str], 22 | weights_type: Optional[str], 23 | collin_tol: float, 24 | fixef_tol: float, 25 | lookup_demeaned_data: dict[str, pd.DataFrame], 26 | tol: float, 27 | maxiter: int, 28 | solver: Literal[ 29 | "np.linalg.lstsq", 30 | "np.linalg.solve", 31 | "scipy.linalg.solve", 32 | "scipy.sparse.linalg.lsqr", 33 | "jax", 34 | ], 35 | store_data: bool = True, 36 | copy_data: bool = True, 37 | lean: bool = False, 38 | sample_split_var: Optional[str] = None, 39 | sample_split_value: Optional[Union[str, int]] = None, 40 | separation_check: Optional[list[str]] = None, 41 | context: Union[int, Mapping[str, Any]] = 0, 42 | ): 43 | super().__init__( 44 | FixestFormula=FixestFormula, 45 | data=data, 46 | ssc_dict=ssc_dict, 47 | drop_singletons=drop_singletons, 48 | drop_intercept=drop_intercept, 49 | weights=weights, 50 | weights_type=weights_type, 51 | collin_tol=collin_tol, 52 | fixef_tol=fixef_tol, 53 | lookup_demeaned_data=lookup_demeaned_data, 54 | tol=tol, 55 | maxiter=maxiter, 56 | solver=solver, 57 | store_data=store_data, 58 | copy_data=copy_data, 59 | lean=lean, 60 | sample_split_var=sample_split_var, 61 | sample_split_value=sample_split_value, 62 | separation_check=separation_check, 63 | context=context, 64 | ) 65 | 66 | self._method = "feglm-gaussian" 67 | 68 | def _check_dependent_variable(self) -> None: 69 | pass 70 | 71 | def _get_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: 72 | return np.sum((y - mu) ** 2) 73 | 74 | def _get_dispersion_phi(self, theta: np.ndarray) -> float: 75 | return np.var(theta) 76 | 77 | def _get_b(self, theta: np.ndarray) -> np.ndarray: 78 | return theta**2 / 2 79 | 80 | def _get_mu(self, theta: np.ndarray) -> np.ndarray: 81 | return theta 82 | 83 | def _get_link(self, mu: np.ndarray) -> np.ndarray: 84 | return mu 85 | 86 | def _update_detadmu(self, mu: np.ndarray) -> np.ndarray: 87 | return np.ones_like(mu) 88 | 89 | def _get_theta(self, mu: np.ndarray) -> np.ndarray: 90 | return mu 91 | 92 | def _get_V(self, mu: np.ndarray) -> np.ndarray: 93 | return np.ones_like(mu) 94 | 95 | def _vcov_iid(self): 96 | _N = self._N 97 | _u_hat = self._u_hat 98 | _bread = self._bread 99 | sigma2 = np.sum(_u_hat.flatten() ** 2) / (_N) 100 | _vcov = _bread * sigma2 101 | 102 | return _vcov 103 | 104 | def _get_score( 105 | self, y: np.ndarray, X: np.ndarray, mu: np.ndarray, eta: np.ndarray 106 | ) -> np.ndarray: 107 | return (y - mu)[:, None] * X 108 | -------------------------------------------------------------------------------- /pyfixest/estimation/felogit_.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Mapping 2 | from typing import Any, Literal, Optional, Union 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from pyfixest.estimation.feglm_ import Feglm 8 | from pyfixest.estimation.FormulaParser import FixestFormula 9 | 10 | 11 | class Felogit(Feglm): 12 | "Class for the estimation of a fixed-effects logit model." 
13 | 14 | def __init__( 15 | self, 16 | FixestFormula: FixestFormula, 17 | data: pd.DataFrame, 18 | ssc_dict: dict[str, Union[str, bool]], 19 | drop_singletons: bool, 20 | drop_intercept: bool, 21 | weights: Optional[str], 22 | weights_type: Optional[str], 23 | collin_tol: float, 24 | fixef_tol: float, 25 | lookup_demeaned_data: dict[str, pd.DataFrame], 26 | tol: float, 27 | maxiter: int, 28 | solver: Literal[ 29 | "np.linalg.lstsq", 30 | "np.linalg.solve", 31 | "scipy.linalg.solve", 32 | "scipy.sparse.linalg.lsqr", 33 | "jax", 34 | ], 35 | store_data: bool = True, 36 | copy_data: bool = True, 37 | lean: bool = False, 38 | sample_split_var: Optional[str] = None, 39 | sample_split_value: Optional[Union[str, int]] = None, 40 | separation_check: Optional[list[str]] = None, 41 | context: Union[int, Mapping[str, Any]] = 0, 42 | ): 43 | super().__init__( 44 | FixestFormula=FixestFormula, 45 | data=data, 46 | ssc_dict=ssc_dict, 47 | drop_singletons=drop_singletons, 48 | drop_intercept=drop_intercept, 49 | weights=weights, 50 | weights_type=weights_type, 51 | collin_tol=collin_tol, 52 | fixef_tol=fixef_tol, 53 | lookup_demeaned_data=lookup_demeaned_data, 54 | tol=tol, 55 | maxiter=maxiter, 56 | solver=solver, 57 | store_data=store_data, 58 | copy_data=copy_data, 59 | lean=lean, 60 | sample_split_var=sample_split_var, 61 | sample_split_value=sample_split_value, 62 | separation_check=separation_check, 63 | context=context, 64 | ) 65 | 66 | self._method = "feglm-logit" 67 | 68 | def _check_dependent_variable(self) -> None: 69 | "Check if the dependent variable is binary with values 0 and 1." 70 | Y_unique = np.unique(self._Y) 71 | if len(Y_unique) != 2: 72 | raise ValueError("The dependent variable must have two unique values.") 73 | if np.any(~np.isin(Y_unique, [0, 1])): 74 | raise ValueError("The dependent variable must be binary (0 or 1).") 75 | 76 | def _get_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: 77 | return -2 * np.sum(y * np.log(mu) + (1 - y) * np.log(1 - mu)) 78 | 79 | def _get_dispersion_phi(self, theta: np.ndarray) -> float: 80 | return 1.0 81 | 82 | def _get_b(self, theta: np.ndarray) -> np.ndarray: 83 | return np.log(1 + np.exp(theta)) 84 | 85 | def _get_mu(self, theta: np.ndarray) -> np.ndarray: 86 | return np.exp(theta) / (1 + np.exp(theta)) 87 | 88 | def _get_link(self, mu: np.ndarray) -> np.ndarray: 89 | return np.log(mu / (1 - mu)) 90 | 91 | def _update_detadmu(self, mu: np.ndarray) -> np.ndarray: 92 | return 1 / (mu * (1 - mu)) 93 | 94 | def _get_theta(self, mu: np.ndarray) -> np.ndarray: 95 | return np.log(mu / (1 - mu)) 96 | 97 | def _get_V(self, mu: np.ndarray) -> np.ndarray: 98 | return mu * (1 - mu) 99 | 100 | def _get_score( 101 | self, y: np.ndarray, X: np.ndarray, mu: np.ndarray, eta: np.ndarray 102 | ) -> np.ndarray: 103 | return (y - mu)[:, None] * X 104 | -------------------------------------------------------------------------------- /pyfixest/estimation/feprobit_.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections.abc import Mapping 3 | from typing import Any, Literal, Optional, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.stats import norm 8 | 9 | from pyfixest.estimation.feglm_ import Feglm 10 | from pyfixest.estimation.FormulaParser import FixestFormula 11 | 12 | 13 | class Feprobit(Feglm): 14 | "Class for the estimation of a fixed-effects probit model." 
15 | 16 | def __init__( 17 | self, 18 | FixestFormula: FixestFormula, 19 | data: pd.DataFrame, 20 | ssc_dict: dict[str, Union[str, bool]], 21 | drop_singletons: bool, 22 | drop_intercept: bool, 23 | weights: Optional[str], 24 | weights_type: Optional[str], 25 | collin_tol: float, 26 | fixef_tol: float, 27 | lookup_demeaned_data: dict[str, pd.DataFrame], 28 | tol: float, 29 | maxiter: int, 30 | solver: Literal[ 31 | "np.linalg.lstsq", 32 | "np.linalg.solve", 33 | "scipy.linalg.solve", 34 | "scipy.sparse.linalg.lsqr", 35 | "jax", 36 | ], 37 | store_data: bool = True, 38 | copy_data: bool = True, 39 | lean: bool = False, 40 | sample_split_var: Optional[str] = None, 41 | sample_split_value: Optional[Union[str, int]] = None, 42 | separation_check: Optional[list[str]] = None, 43 | context: Union[int, Mapping[str, Any]] = 0, 44 | ): 45 | super().__init__( 46 | FixestFormula=FixestFormula, 47 | data=data, 48 | ssc_dict=ssc_dict, 49 | drop_singletons=drop_singletons, 50 | drop_intercept=drop_intercept, 51 | weights=weights, 52 | weights_type=weights_type, 53 | collin_tol=collin_tol, 54 | fixef_tol=fixef_tol, 55 | lookup_demeaned_data=lookup_demeaned_data, 56 | tol=tol, 57 | maxiter=maxiter, 58 | solver=solver, 59 | store_data=store_data, 60 | copy_data=copy_data, 61 | lean=lean, 62 | sample_split_var=sample_split_var, 63 | sample_split_value=sample_split_value, 64 | separation_check=separation_check, 65 | context=context, 66 | ) 67 | 68 | self._method = "feglm-probit" 69 | 70 | def _check_dependent_variable(self) -> None: 71 | "Check if the dependent variable is binary with values 0 and 1." 72 | Y_unique = np.unique(self._Y) 73 | if len(Y_unique) != 2: 74 | raise ValueError("The dependent variable must have two unique values.") 75 | if np.any(~np.isin(Y_unique, [0, 1])): 76 | raise ValueError("The dependent variable must be binary (0 or 1).") 77 | 78 | def _get_deviance(self, y: np.ndarray, mu: np.ndarray) -> np.ndarray: 79 | ll_fitted = np.sum(y * np.log(mu) + (1 - y) * np.log(1 - mu)) 80 | 81 | # divide by zero warnings because of the log(0) terms 82 | with warnings.catch_warnings(): 83 | warnings.simplefilter("ignore") 84 | ll_saturated = np.sum( 85 | np.where(y == 0, 0, y * np.log(y)) 86 | + np.where(y == 1, 0, (1 - y) * np.log(1 - y)) 87 | ) 88 | 89 | return -2.0 * (ll_fitted - ll_saturated) 90 | 91 | def _get_dispersion_phi(self, theta: np.ndarray) -> float: 92 | return 1.0 93 | 94 | def _get_b(self, theta: np.ndarray) -> np.ndarray: 95 | raise ValueError("The function _get_b is not implemented for the probit model.") 96 | return None 97 | 98 | def _get_mu(self, theta: np.ndarray) -> np.ndarray: 99 | return norm.cdf(theta) 100 | 101 | def _get_link(self, mu: np.ndarray) -> np.ndarray: 102 | return norm.ppf(mu) 103 | 104 | def _update_detadmu(self, mu: np.ndarray) -> np.ndarray: 105 | return 1 / norm.pdf(norm.ppf(mu)) 106 | 107 | def _get_theta(self, mu: np.ndarray) -> np.ndarray: 108 | return norm.ppf(mu) 109 | 110 | def _get_V(self, mu: np.ndarray) -> np.ndarray: 111 | return mu * (1 - mu) 112 | 113 | def _get_score( 114 | self, y: np.ndarray, X: np.ndarray, mu: np.ndarray, eta: np.ndarray 115 | ) -> np.ndarray: 116 | residual = (y - mu) / (mu * (1 - mu)) * norm.pdf(eta) 117 | return residual[:, None] * X 118 | -------------------------------------------------------------------------------- /pyfixest/estimation/jax/demean_jax_.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import jax 4 | import jax.numpy as jnp 5 
| import numpy as np 6 | from jax import config 7 | 8 | 9 | @partial(jax.jit, static_argnames=("n_groups", "tol", "maxiter")) 10 | def _demean_jax_impl( 11 | x: jnp.ndarray, 12 | flist: jnp.ndarray, 13 | weights: jnp.ndarray, 14 | n_groups: int, 15 | tol: float, 16 | maxiter: int, 17 | ) -> tuple[jnp.ndarray, bool]: 18 | """JIT-compiled implementation of demeaning.""" 19 | n_factors = flist.shape[1] 20 | 21 | @jax.jit 22 | def _apply_factor(carry, j): 23 | """Process a single factor.""" 24 | x = carry 25 | factor_ids = flist[:, j] 26 | wx = x * weights[:, None] 27 | 28 | # Compute group weights and weighted sums 29 | group_weights = jnp.bincount(factor_ids, weights=weights, length=n_groups) 30 | group_sums = jax.vmap( 31 | lambda col: jnp.bincount(factor_ids, weights=col, length=n_groups) 32 | )(wx.T).T 33 | 34 | # Compute and subtract means 35 | means = group_sums / group_weights[:, None] 36 | return x - means[factor_ids], None 37 | 38 | @jax.jit 39 | def _demean_step(x_curr): 40 | """Single demeaning step for all factors.""" 41 | # Process all factors using scan 42 | result, _ = jax.lax.scan(_apply_factor, x_curr, jnp.arange(n_factors)) 43 | return result 44 | 45 | @jax.jit 46 | def _body_fun(state): 47 | """Body function for while_loop.""" 48 | i, x_curr, x_prev, converged = state 49 | x_new = _demean_step(x_curr) 50 | max_diff = jnp.max(jnp.abs(x_new - x_curr)) 51 | has_converged = max_diff < tol 52 | return i + 1, x_new, x_curr, has_converged 53 | 54 | @jax.jit 55 | def _cond_fun(state): 56 | """Condition function for while_loop.""" 57 | i, _, _, converged = state 58 | return jnp.logical_and(i < maxiter, jnp.logical_not(converged)) 59 | 60 | # Run the iteration loop using while_loop 61 | init_state = (0, x, x - 1.0, False) 62 | final_i, final_x, _, converged = jax.lax.while_loop( 63 | _cond_fun, _body_fun, init_state 64 | ) 65 | 66 | return final_x, converged 67 | 68 | 69 | def demean_jax( 70 | x: np.ndarray, 71 | flist: np.ndarray, 72 | weights: np.ndarray, 73 | tol: float = 1e-08, 74 | maxiter: int = 100_000, 75 | ) -> tuple[np.ndarray, bool]: 76 | """Fast and reliable JAX implementation with static shapes.""" 77 | # Enable float64 precision 78 | config.update("jax_enable_x64", True) 79 | 80 | # Compute n_groups before JIT 81 | n_groups = int(np.max(flist) + 1) 82 | 83 | # Convert inputs to JAX arrays 84 | x_jax = jnp.asarray(x, dtype=jnp.float64) 85 | flist_jax = jnp.asarray(flist, dtype=jnp.int32) 86 | weights_jax = jnp.asarray(weights, dtype=jnp.float64) 87 | 88 | # Call the JIT-compiled implementation 89 | result_jax, converged = _demean_jax_impl( 90 | x_jax, flist_jax, weights_jax, n_groups, tol, maxiter 91 | ) 92 | return np.array(result_jax), converged 93 | -------------------------------------------------------------------------------- /pyfixest/estimation/jax/detect_singletons_jax.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import jax 4 | import jax.numpy as jnp 5 | import numpy as np 6 | 7 | 8 | @partial(jax.jit, static_argnames=("n_samples", "n_features", "max_fixef")) 9 | def _process_features_jax( 10 | ids, non_singletons, n_non_singletons, n_samples, n_features, max_fixef 11 | ): 12 | """JIT-compiled inner loop for processing features with static shapes.""" 13 | 14 | def process_feature(carry, j): 15 | non_singletons, n_non_singletons = carry 16 | col = ids[:, j] 17 | 18 | # Initialize counts array 19 | counts = jnp.zeros(max_fixef + 1, dtype=jnp.int32) 20 | 21 | # Count occurrences and 
track singletons 22 | def count_loop(i, state): 23 | counts, n_singletons = state 24 | e = col[non_singletons[i]] 25 | c = counts[e] 26 | # Exactly match Numba: n_singletons += (c == 0) - (c == 1) 27 | n_singletons = n_singletons + (c == 0) - (c == 1) 28 | counts = counts.at[e].add(1) 29 | return (counts, n_singletons) 30 | 31 | counts, n_singletons = jax.lax.fori_loop( 32 | 0, n_non_singletons, count_loop, (counts, 0) 33 | ) 34 | 35 | # Early return if no singletons found 36 | def no_singletons(_): 37 | return (non_singletons, n_non_singletons) 38 | 39 | # Update non_singletons if singletons found 40 | def update_singletons(_): 41 | def update_loop(i, state): 42 | new_non_singletons, cnt = state 43 | e = col[non_singletons[i]] 44 | keep = counts[e] != 1 45 | # Exactly match Numba's update logic 46 | new_non_singletons = jax.lax.cond( 47 | keep, 48 | lambda x: x[0].at[x[1]].set(non_singletons[i]), 49 | lambda x: x[0], 50 | (new_non_singletons, cnt), 51 | ) 52 | return (new_non_singletons, cnt + keep) 53 | 54 | new_non_singletons = jnp.zeros_like(non_singletons) 55 | new_non_singletons, new_cnt = jax.lax.fori_loop( 56 | 0, n_non_singletons, update_loop, (new_non_singletons, 0) 57 | ) 58 | return (new_non_singletons, new_cnt) 59 | 60 | return jax.lax.cond( 61 | n_singletons == 0, no_singletons, update_singletons, None 62 | ), None 63 | 64 | return jax.lax.scan( 65 | process_feature, (non_singletons, n_non_singletons), jnp.arange(n_features) 66 | )[0] 67 | 68 | 69 | def detect_singletons_jax(ids: np.ndarray) -> np.ndarray: 70 | """ 71 | JAX implementation of singleton detection in fixed effects. 72 | 73 | Parameters 74 | ---------- 75 | ids : numpy.ndarray 76 | A 2D numpy array representing fixed effects, with shape (n_samples, n_features). 77 | Elements should be non-negative integers representing fixed effect identifiers. 78 | 79 | Returns 80 | ------- 81 | numpy.ndarray 82 | A boolean array of shape (n_samples,), indicating which observations have 83 | a singleton fixed effect. 
84 | """ 85 | # Get dimensions and max_fixef before JIT 86 | n_samples, n_features = ids.shape 87 | max_fixef = int(np.max(ids)) # Use numpy.max instead of jax.numpy.max 88 | 89 | # Convert input to JAX array 90 | ids = jnp.array(ids, dtype=jnp.int32) 91 | 92 | # Initialize with all indices as non-singletons 93 | init_non_singletons = jnp.arange(n_samples) 94 | init_n_non_singletons = n_samples 95 | 96 | @partial(jax.jit, static_argnames=("n_samples", "n_features", "max_fixef")) 97 | def _singleton_detection_loop( 98 | ids, non_singletons, n_non_singletons, n_samples, n_features, max_fixef 99 | ): 100 | def cond_fun(state): 101 | prev_n, curr_carry = state 102 | return prev_n != curr_carry[1] 103 | 104 | def body_fun(state): 105 | prev_n, curr_carry = state 106 | new_carry = _process_features_jax( 107 | ids, curr_carry[0], curr_carry[1], n_samples, n_features, max_fixef 108 | ) 109 | return (curr_carry[1], new_carry) 110 | 111 | init_state = (n_samples + 1, (non_singletons, n_non_singletons)) 112 | final_state = jax.lax.while_loop(cond_fun, body_fun, init_state) 113 | return final_state[1] 114 | 115 | # Run iterations until convergence 116 | final_non_singletons, final_n = _singleton_detection_loop( 117 | ids, 118 | init_non_singletons, 119 | init_n_non_singletons, 120 | n_samples, 121 | n_features, 122 | max_fixef, 123 | ) 124 | 125 | # Create final boolean mask 126 | is_singleton = jnp.ones(n_samples, dtype=jnp.bool_) 127 | 128 | @jax.jit 129 | def _mark_non_singletons(is_singleton, final_non_singletons, final_n): 130 | def mark_non_singleton(i, acc): 131 | return acc.at[final_non_singletons[i]].set(False) 132 | 133 | return jax.lax.fori_loop(0, final_n, mark_non_singleton, is_singleton) 134 | 135 | is_singleton = _mark_non_singletons(is_singleton, final_non_singletons, final_n) 136 | 137 | return np.array(is_singleton) 138 | -------------------------------------------------------------------------------- /pyfixest/estimation/literals.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Literal, get_args 2 | 3 | PredictionType = Literal["response", "link"] 4 | VcovTypeOptions = Literal["iid", "hetero", "HC1", "HC2", "HC3"] 5 | WeightsTypeOptions = Literal["aweights", "fweights"] 6 | FixedRmOptions = Literal["singleton", "none"] 7 | SolverOptions = Literal[ 8 | "np.linalg.lstsq", 9 | "np.linalg.solve", 10 | "scipy.linalg.solve", 11 | "scipy.sparse.linalg.lsqr", 12 | "jax", 13 | ] 14 | DemeanerBackendOptions = Literal["numba", "jax", "rust"] 15 | PredictionErrorOptions = Literal["prediction"] 16 | 17 | 18 | def _validate_literal_argument(arg: Any, literal: Any) -> None: 19 | """ 20 | Validate if the given argument matches one of the allowed literal types. 21 | 22 | This function checks whether the provided `arg` is among the valid types 23 | returned by `get_args(literal)`. If not, it raises a ValueError with an 24 | appropriate error message. 25 | 26 | Parameters 27 | ---------- 28 | arg : Any 29 | The argument to validate. 30 | literal : Any 31 | A Literal type that defines the allowed values for `arg`. 32 | 33 | Raises 34 | ------ 35 | TypeError 36 | If `literal` does not have valid types. 37 | ValueError 38 | If `arg` is not one of the valid types defined by `literal`. 39 | """ 40 | valid_types = get_args(literal) 41 | 42 | if len(valid_types) < 1: 43 | raise TypeError( 44 | f"{literal} must be a Literal[...] 
type argument with at least one type"
45 |         )
46 | 
47 |     if arg not in valid_types:
48 |         raise ValueError(f"Invalid argument. Expecting one of {valid_types}. Got {arg}")
49 | 
--------------------------------------------------------------------------------
/pyfixest/estimation/numba/find_collinear_variables_nb.py:
--------------------------------------------------------------------------------
1 | import numba as nb
2 | import numpy as np
3 | 
4 | 
5 | @nb.njit(parallel=False)
6 | def _find_collinear_variables_nb(
7 |     X: np.ndarray, tol: float = 1e-10
8 | ) -> tuple[np.ndarray, int, bool]:
9 |     """
10 |     Detect multicollinear variables.
11 | 
12 |     Detect multicollinear variables, replicating Laurent Berge's C++ implementation
13 |     from the fixest package. See the fixest repo [here](https://github.com/lrberge/fixest/blob/a4d1a9bea20aa7ab7ab0e0f1d2047d8097971ad7/src/lm_related.cpp#L130).
14 | 
15 |     Parameters
16 |     ----------
17 |     X : numpy.ndarray
18 |         A symmetric matrix X used to check for multicollinearity.
19 |     tol : float
20 |         The tolerance level for the multicollinearity check.
21 | 
22 |     Returns
23 |     -------
24 |     - id_excl (numpy.ndarray): A boolean array, where True indicates a collinear
25 |       variable.
26 |     - n_excl (int): The number of collinear variables.
27 |     - all_removed (bool): True if all variables are identified as collinear.
28 |     """
29 |     K = X.shape[1]
30 |     R = np.zeros((K, K))
31 |     id_excl = np.zeros(K, dtype=np.int32)
32 |     n_excl = 0
33 |     min_norm = X[0, 0]
34 | 
35 |     for j in range(K):
36 |         R_jj = X[j, j]
37 |         for k in range(j):
38 |             if id_excl[k]:
39 |                 continue
40 |             R_jj -= R[k, j] * R[k, j]
41 | 
42 |         if R_jj < tol:
43 |             n_excl += 1
44 |             id_excl[j] = 1
45 | 
46 |             if n_excl == K:
47 |                 all_removed = True
48 |                 return id_excl.astype(np.bool_), n_excl, all_removed
49 | 
50 |             continue
51 | 
52 |         if min_norm > R_jj:
53 |             min_norm = R_jj
54 | 
55 |         R_jj = np.sqrt(R_jj)
56 |         R[j, j] = R_jj
57 | 
58 |         for i in range(j + 1, K):
59 |             value = X[i, j]
60 |             for k in range(j):
61 |                 if id_excl[k]:
62 |                     continue
63 |                 value -= R[k, i] * R[k, j]
64 |             R[j, i] = value / R_jj
65 | 
66 |     return id_excl.astype(np.bool_), n_excl, False
67 | 
--------------------------------------------------------------------------------
/pyfixest/estimation/numba/nested_fixef_nb.py:
--------------------------------------------------------------------------------
1 | import numba as nb
2 | import numpy as np
3 | 
4 | 
5 | @nb.njit(parallel=True)
6 | def _count_fixef_fully_nested_all(
7 |     all_fixef_array: np.ndarray,
8 |     cluster_colnames: np.ndarray,
9 |     cluster_data: np.ndarray,
10 |     fe_data: np.ndarray,
11 | ) -> tuple[np.ndarray, int]:
12 |     """
13 | 
14 |     Compute the number of nested fixed effects over all fixed effects.
15 | 
16 |     Parameters
17 |     ----------
18 |     all_fixef_array : np.ndarray
19 |         A 1D array with the names of all fixed effects in the model.
20 |     cluster_colnames : np.ndarray
21 |         A 1D array with the names of all cluster variables in the model.
22 |     cluster_data : np.ndarray
23 |         A 2D array with the cluster data.
24 |     fe_data : np.ndarray
25 |         A 2D array with the fixed effects.
26 | 
27 |     Returns
28 |     -------
29 |     k_fe_nested_flag : np.ndarray
30 |         A numpy array with shape (all_fixef_array.size, ) containing boolean values that
31 |         indicate whether a given fixed effect is fully nested within a cluster or not.
32 |     n_fe_fully_nested : int
33 |         The number of fixed effects that are fully nested within a cluster.
34 | """ 35 | k_fe_nested_flag = np.zeros(all_fixef_array.size, dtype=np.bool_) 36 | n_fe_fully_nested = 0 37 | 38 | for fi in nb.prange(all_fixef_array.size): 39 | this_fe_name = all_fixef_array[fi] 40 | 41 | found_in_cluster = False 42 | for col_i in range(cluster_colnames.size): 43 | if this_fe_name == cluster_colnames[col_i]: 44 | found_in_cluster = True 45 | k_fe_nested_flag[fi] = True 46 | n_fe_fully_nested += 1 47 | break 48 | 49 | if not found_in_cluster: 50 | for col_j in range(cluster_colnames.size): 51 | clusters_col = cluster_data[:, col_j] 52 | fe_col = fe_data[:, fi] 53 | is_fully_nested = _count_fixef_fully_nested(clusters_col, fe_col) 54 | if is_fully_nested: 55 | k_fe_nested_flag[fi] = True 56 | n_fe_fully_nested += 1 57 | break 58 | 59 | return k_fe_nested_flag, n_fe_fully_nested 60 | 61 | 62 | @nb.njit 63 | def _count_fixef_fully_nested(clusters: np.ndarray, f: np.ndarray) -> bool: 64 | """ 65 | Check if a given fixed effect is fully nested within a given cluster. 66 | 67 | Parameters 68 | ---------- 69 | clusters : np.ndarray 70 | A vector of cluster assignments. 71 | f : np.ndarray 72 | A matrix of fixed effects. 73 | 74 | Returns 75 | ------- 76 | np.array(np.bool_) 77 | An array of booleans indicating whether each fixed effect is fully nested within clusters. 78 | True if the fixed effect is fully nested within clusters, False otherwise. 79 | """ 80 | unique_vals = np.unique(f) 81 | n_unique_vals = len(unique_vals) 82 | counts = 0 83 | for val in unique_vals: 84 | mask = f == val 85 | distinct_clusters = np.unique(clusters[mask]) 86 | if len(distinct_clusters) == 1: 87 | counts += 1 88 | is_fe_nested = counts == n_unique_vals 89 | 90 | return is_fe_nested 91 | -------------------------------------------------------------------------------- /pyfixest/estimation/solvers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.linalg import solve 3 | from scipy.sparse.linalg import lsqr 4 | from typing_extensions import Literal 5 | 6 | 7 | def solve_ols( 8 | tZX: np.ndarray, 9 | tZY: np.ndarray, 10 | solver: Literal[ 11 | "np.linalg.lstsq", 12 | "np.linalg.solve", 13 | "scipy.linalg.solve", 14 | "scipy.sparse.linalg.lsqr", 15 | "jax", 16 | ], 17 | ) -> np.ndarray: 18 | """ 19 | Solve the ordinary least squares problem using the specified solver. 20 | 21 | Parameters 22 | ---------- 23 | tZX (array-like): Z'X. 24 | tZY (array-like): Z'Y. 25 | solver (str): The solver to use. Supported solvers are "np.linalg.lstsq", 26 | "np.linalg.solve", "scipy.linalg.solve", "scipy.sparse.linalg.lsqr" and "jax". 27 | 28 | Returns 29 | ------- 30 | array-like: The solution to the ordinary least squares problem. 31 | 32 | Raises 33 | ------ 34 | ValueError: If the specified solver is not supported. 
35 | """ 36 | if solver == "np.linalg.lstsq": 37 | return np.linalg.lstsq(tZX, tZY, rcond=None)[0].flatten() 38 | elif solver == "np.linalg.solve": 39 | return np.linalg.solve(tZX, tZY).flatten() 40 | elif solver == "scipy.linalg.solve": 41 | return solve(tZX, tZY, assume_a="pos").flatten() 42 | elif solver == "scipy.sparse.linalg.lsqr": 43 | return lsqr(tZX, tZY)[0].flatten() 44 | elif solver == "jax": 45 | import jax.numpy as jnp 46 | 47 | return jnp.linalg.lstsq(tZX, tZY, rcond=None)[0].flatten() 48 | else: 49 | raise ValueError(f"Solver {solver} not supported.") 50 | -------------------------------------------------------------------------------- /pyfixest/estimation/vcov_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numba as nb 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from pyfixest.errors import NanInClusterVarError 8 | from pyfixest.utils.dev_utils import _narwhals_to_pandas 9 | 10 | 11 | def _compute_bread( 12 | _is_iv: bool, 13 | _tXZ: np.ndarray, 14 | _tZZinv: np.ndarray, 15 | _tZX: np.ndarray, 16 | _hessian: np.ndarray, 17 | ): 18 | return np.linalg.inv(_tXZ @ _tZZinv @ _tZX) if _is_iv else np.linalg.inv(_hessian) 19 | 20 | 21 | def _get_cluster_df(data: pd.DataFrame, clustervar: list[str]): 22 | if not data.empty: 23 | data_pandas = _narwhals_to_pandas(data) 24 | cluster_df = data_pandas[clustervar].copy() 25 | else: 26 | raise AttributeError( 27 | """The input data set needs to be stored in the model object if 28 | you call `vcov()` post estimation with a novel cluster variable. 29 | Please set the function argument `store_data=True` when calling 30 | the regression. 31 | """ 32 | ) 33 | 34 | return cluster_df 35 | 36 | 37 | def _check_cluster_df(cluster_df: pd.DataFrame, data: pd.DataFrame): 38 | if np.any(cluster_df.isna().any()): 39 | raise NanInClusterVarError( 40 | "CRV inference not supported with missing values in the cluster variable." 41 | "Please drop missing values before running the regression." 42 | ) 43 | 44 | N = data.shape[0] 45 | if cluster_df.shape[0] != N: 46 | raise ValueError( 47 | "The cluster variable must have the same length as the data set." 48 | ) 49 | 50 | 51 | def _count_G_for_ssc_correction( 52 | cluster_df: pd.DataFrame, ssc_dict: dict[str, Union[str, bool]] 53 | ): 54 | G = [] 55 | for col in cluster_df.columns: 56 | G.append(cluster_df[col].nunique()) 57 | 58 | if ssc_dict["cluster_df"] == "min": 59 | G = [min(G)] * 3 60 | 61 | return G 62 | 63 | 64 | def _get_vcov_type( 65 | vcov: Union[str, dict[str, str], None], fval: str 66 | ) -> Union[str, dict[str, str]]: 67 | """ 68 | Pass the specified vcov type. 69 | 70 | Passes the specified vcov type. If no vcov type specified, sets the default 71 | vcov type as iid if no fixed effect is included in the model, and CRV1 72 | clustered by the first fixed effect if a fixed effect is included in the model. 73 | 74 | Parameters 75 | ---------- 76 | vcov : Union[str, dict[str, str], None] 77 | The specified vcov type. 78 | fval : str 79 | The specified fixed effects. (i.e. "X1+X2") 80 | 81 | Returns 82 | ------- 83 | str 84 | vcov_type (str) : The specified vcov type. 
85 | """ 86 | if vcov is None: 87 | # iid if no fixed effects 88 | if fval == "0": 89 | vcov_type = "iid" # type: ignore 90 | else: 91 | # CRV1 inference, clustered by first fixed effect 92 | first_fe = fval.split("+")[0] 93 | vcov_type = {"CRV1": first_fe} # type: ignore 94 | else: 95 | vcov_type = vcov # type: ignore 96 | 97 | return vcov_type # type: ignore 98 | 99 | 100 | def _prepare_twoway_clustering(clustervar: list, cluster_df: pd.DataFrame): 101 | cluster_one = clustervar[0] 102 | cluster_two = clustervar[1] 103 | cluster_df_one_str = cluster_df[cluster_one].astype(str) 104 | cluster_df_two_str = cluster_df[cluster_two].astype(str) 105 | cluster_df.loc[:, "cluster_intersection"] = cluster_df_one_str.str.cat( 106 | cluster_df_two_str, sep="-" 107 | ) 108 | 109 | return cluster_df 110 | 111 | 112 | # CODE from Styfen Schaer (@styfenschaer) 113 | @nb.njit(parallel=False) 114 | def bucket_argsort(arr: np.ndarray) -> tuple[np.ndarray, np.ndarray]: 115 | """ 116 | Sorts the input array using the bucket sort algorithm. 117 | 118 | Parameters 119 | ---------- 120 | arr : array_like 121 | An array_like object that needs to be sorted. 122 | 123 | Returns 124 | ------- 125 | array_like 126 | A sorted copy of the input array. 127 | 128 | Raises 129 | ------ 130 | ValueError 131 | If the input is not an array_like object. 132 | 133 | Notes 134 | ----- 135 | The bucket sort algorithm works by distributing the elements of an array 136 | into a number of buckets. Each bucket is then sorted individually, either 137 | using a different sorting algorithm, or by recursively applying the bucket 138 | sorting algorithm. 139 | """ 140 | counts = np.zeros(arr.max() + 1, dtype=np.uint32) 141 | for i in range(arr.size): 142 | counts[arr[i]] += 1 143 | 144 | locs = np.empty(counts.size + 1, dtype=np.uint32) 145 | locs[0] = 0 146 | pos = np.empty(counts.size, dtype=np.uint32) 147 | for i in range(counts.size): 148 | locs[i + 1] = locs[i] + counts[i] 149 | pos[i] = locs[i] 150 | 151 | args = np.empty(arr.size, dtype=np.uint32) 152 | for i in range(arr.size): 153 | e = arr[i] 154 | args[pos[e]] = i 155 | pos[e] += 1 156 | 157 | return args, locs 158 | 159 | 160 | # CODE from Styfen Schaer (@styfenschaer) 161 | @nb.njit(parallel=False) 162 | def _crv1_meat_loop( 163 | scores: np.ndarray, 164 | clustid: np.ndarray, 165 | cluster_col: np.ndarray, 166 | ) -> np.ndarray: 167 | k = scores.shape[1] 168 | dtype = scores.dtype 169 | meat = np.zeros((k, k), dtype=dtype) 170 | 171 | g_indices, g_locs = bucket_argsort(cluster_col) 172 | 173 | score_g = np.empty((k, 1), dtype=dtype) 174 | meat_i = np.empty((k, k), dtype=dtype) 175 | 176 | for i in range(clustid.size): 177 | g = clustid[i] 178 | start = g_locs[g] 179 | end = g_locs[g + 1] 180 | g_index = g_indices[start:end] 181 | score_g = scores[g_index, :].sum(axis=0) 182 | np.outer(score_g, score_g, out=meat_i) 183 | meat += meat_i 184 | 185 | return meat 186 | -------------------------------------------------------------------------------- /pyfixest/report/__init__.py: -------------------------------------------------------------------------------- 1 | from pyfixest.report.summarize import ( 2 | dtable, 3 | etable, 4 | make_table, 5 | summary, 6 | ) 7 | from pyfixest.report.visualize import ( 8 | coefplot, 9 | iplot, 10 | ) 11 | 12 | __all__ = [ 13 | "coefplot", 14 | "dtable", 15 | "etable", 16 | "iplot", 17 | "make_table", 18 | "summary", 19 | ] 20 | -------------------------------------------------------------------------------- /pyfixest/utils/__init__.py: 
--------------------------------------------------------------------------------
1 | from pyfixest.utils.utils import (
2 | get_data,
3 | get_ssc,
4 | ssc,
5 | )
6 | 
7 | __all__ = [
8 | "get_data",
9 | "get_ssc",
10 | "ssc",
11 | ]
12 | 
-------------------------------------------------------------------------------- /pyfixest/utils/_exceptions.py: --------------------------------------------------------------------------------
1 | import inspect
2 | import os
3 | 
4 | 
5 | def find_stack_level() -> int:
6 | """Find the first place in the stack that is not inside pyfixest."""
7 | import pyfixest as pf
8 | 
9 | pkg_dir = os.path.dirname(pf.__file__)
10 | test_dir = os.path.join(pkg_dir, "tests")
11 | 
12 | # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
13 | frame = inspect.currentframe()
14 | n = 0
15 | while frame:
16 | fname = inspect.getfile(frame)
17 | if fname.startswith(pkg_dir) and not fname.startswith(test_dir):
18 | frame = frame.f_back
19 | n += 1
20 | else:
21 | break
22 | return n
23 | 
-------------------------------------------------------------------------------- /pyfixest/utils/check_r_install.py: --------------------------------------------------------------------------------
1 | from rpy2.robjects.packages import importr
2 | 
3 | 
4 | def _catch_import_issue(name: str, strict: bool) -> bool:
5 | if strict:
6 | raise ImportError(
7 | f"{name} package not found. Make sure the extended R environment is installed."
8 | )
9 | else:
10 | print(
11 | f"Warning: {name} is not installed. Extended R tests will be unable to run."
12 | )
13 | return False
14 | 
15 | 
16 | def check_r_install(package_names: str | list[str], strict: bool = False) -> bool:
17 | "Check that the R packages in `package_names` are installed; raise ImportError for missing packages if `strict` is True, otherwise return True only if all packages are available."
18 | utils = importr("utils")
19 | package_list = package_names if isinstance(package_names, list) else [package_names]
20 | installed_packages = utils.installed_packages()
21 | 
22 | package_status = []
23 | for package in package_list:
24 | if package not in installed_packages:
25 | package_status.append(_catch_import_issue(package, strict))
26 | else:
27 | package_status.append(True)
28 | return all(package_status)
29 | 
-------------------------------------------------------------------------------- /pyfixest/utils/dev_utils.py: --------------------------------------------------------------------------------
1 | import re
2 | from typing import Optional, Union
3 | 
4 | import narwhals.stable.v1 as nw
5 | import numpy as np
6 | import pandas as pd
7 | from narwhals.typing import IntoDataFrame
8 | 
9 | DataFrameType = IntoDataFrame
10 | 
11 | 
12 | def _narwhals_to_pandas(data: IntoDataFrame) -> pd.DataFrame:  # type: ignore
13 | return nw.from_native(data, eager_or_interchange_only=True).to_pandas()
14 | 
15 | 
16 | def _create_rng(seed: Optional[int] = None) -> np.random.Generator:
17 | """
18 | Create a random number generator.
19 | 
20 | Parameters
21 | ----------
22 | seed : int, optional
23 | The seed of the random number generator. If None, a random seed is chosen.
24 | 
25 | Returns
26 | -------
27 | numpy.random.Generator
28 | A random number generator.
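Examples
--------
A small usage sketch; the same seed yields reproducible draws:

>>> rng = _create_rng(seed=42)
>>> draws = rng.normal(size=3)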
29 | """ 30 | if seed is None: 31 | seed = np.random.randint(100_000_000) 32 | return np.random.default_rng(seed) 33 | 34 | 35 | def _select_order_coefs( 36 | coefs: list, 37 | keep: Optional[Union[list, str]] = None, 38 | drop: Optional[Union[list, str]] = None, 39 | exact_match: Optional[bool] = False, 40 | ): 41 | r""" 42 | Select and order the coefficients based on the pattern. 43 | 44 | Parameters 45 | ---------- 46 | coefs: list 47 | Coefficient names to be selected and ordered. 48 | keep: str or list of str, optional 49 | The pattern for retaining coefficient names. You can pass a string (one 50 | pattern) or a list (multiple patterns). Default is keeping all coefficients. 51 | You should use regular expressions to select coefficients. 52 | "age", # would keep all coefficients containing age 53 | r"^tr", # would keep all coefficients starting with tr 54 | r"\\d$", # would keep all coefficients ending with number 55 | Output will be in the order of the patterns. 56 | drop: str or list of str, optional 57 | The pattern for excluding coefficient names. You can pass a string (one 58 | pattern) or a list (multiple patterns). Syntax is the same as for `keep`. 59 | Default is keeping all coefficients. Parameter `keep` and `drop` can be 60 | used simultaneously. 61 | exact_match: bool, optional 62 | Whether to use exact match for `keep` and `drop`. Default is False. 63 | If True, the pattern will be matched exactly to the coefficient name 64 | instead of using regular expressions. 65 | 66 | Returns 67 | ------- 68 | res: list 69 | The filtered and ordered coefficient names. 70 | """ 71 | if keep is None: 72 | keep = [] 73 | if drop is None: 74 | drop = [] 75 | 76 | if isinstance(keep, str): 77 | keep = [keep] 78 | if isinstance(drop, str): 79 | drop = [drop] 80 | 81 | coefs = list(coefs) 82 | res = [] if keep else coefs[:] # Store matched coefs 83 | for pattern in keep: 84 | _coefs = [] # Store remaining coefs 85 | for coef in coefs: 86 | if (exact_match and pattern == coef) or ( 87 | exact_match is False and re.findall(pattern, coef) 88 | ): 89 | res.append(coef) 90 | else: 91 | _coefs.append(coef) 92 | coefs = _coefs 93 | 94 | for pattern in drop: 95 | _coefs = [] 96 | for coef in res: # Remove previously matched coefs that match the drop pattern 97 | if (exact_match and pattern == coef) or ( 98 | exact_match is False and re.findall(pattern, coef) 99 | ): 100 | continue 101 | else: 102 | _coefs.append(coef) 103 | res = _coefs 104 | 105 | return res 106 | 107 | 108 | def docstring_from(func, custom_doc=""): 109 | """Copy the docstring of another function.""" 110 | 111 | def decorator(target_func): 112 | target_func.__doc__ = custom_doc + "\n\n" + func.__doc__ 113 | return target_func 114 | 115 | return decorator 116 | 117 | 118 | def _check_series_or_dataframe(x: Union[pd.Series, pd.DataFrame]): 119 | if not isinstance(x, (pd.Series, pd.DataFrame)): 120 | raise TypeError("Input must be a pandas Series or DataFrame") 121 | else: 122 | return x 123 | 124 | 125 | def _to_list(x): 126 | if x is not None and not isinstance(x, list): 127 | return [x] 128 | return x 129 | 130 | 131 | def _drop_cols(_data: pd.DataFrame, na_index: np.ndarray): 132 | """ 133 | Drop columns from data based on the indices in na_index. 134 | 135 | Parameters 136 | ---------- 137 | _data : pd.DataFrame 138 | The input DataFrame. 139 | na_index : np.ndarray 140 | An array of indices to drop. 141 | 142 | Returns 143 | ------- 144 | pd.DataFrame 145 | The input DataFrame with NAs dropped. 
146 | """ 147 | if na_index.size > 0: 148 | all_indices = np.arange(_data.shape[0]) 149 | max_index = all_indices.max() + 1 150 | keep = np.ones(max_index, dtype=bool) 151 | keep[na_index] = False 152 | return _data[keep] 153 | else: 154 | return _data 155 | 156 | 157 | def _extract_variable_level(fe_string: str): 158 | """ 159 | Extract the variable and level from a given string. 160 | 161 | Parameters 162 | ---------- 163 | fe_string: str 164 | The string encapsulating the fixed effect factor variable and level. 165 | 166 | Returns 167 | ------- 168 | tuple 169 | A tuple containing the extracted variable and level for the fixed 170 | effect. 171 | """ 172 | pattern = r"C\(([^)]*)\)\[(?:T\.)?(.*)\]$" 173 | match = re.search(pattern, fe_string) 174 | if not match: 175 | raise ValueError(f"Cannot parse: {fe_string}") 176 | 177 | variable = match.group(1) 178 | level = match.group(2) 179 | 180 | return f"C({variable})", level 181 | -------------------------------------------------------------------------------- /pyfixest/utils/set_rpy2_path.py: -------------------------------------------------------------------------------- 1 | import rpy2.robjects as robjects 2 | from rpy2.robjects.packages import importr 3 | 4 | 5 | def update_r_paths(): 6 | "Get current R library paths." 7 | current_r_paths = robjects.r(".libPaths()") 8 | 9 | # Define your custom paths 10 | custom_paths = robjects.StrVector( 11 | [ 12 | "/home/runner/work/pyfixest/pyfixest/.pixi/envs/dev/lib/R/library", 13 | "/usr/local/lib/R/site-library", 14 | "/usr/lib/R/site-library", 15 | "/usr/lib/R/library", 16 | ] 17 | ) 18 | 19 | # Combine current R paths with custom paths (avoiding duplicates) 20 | new_lib_paths = robjects.StrVector(list(set(custom_paths).union(current_r_paths))) 21 | 22 | # Set the combined library paths in the R environment 23 | robjects.r[".libPaths"](new_lib_paths) 24 | 25 | 26 | def _check_update_r_paths(): 27 | update_r_paths() 28 | try: 29 | importr("did2s") 30 | print("did2s package imported successfully.") 31 | except Exception as e: 32 | print(f"Error importing did2s: {e}") 33 | 34 | 35 | _check_update_r_paths() 36 | -------------------------------------------------------------------------------- /r_test_requirements.R: -------------------------------------------------------------------------------- 1 | # note: R, fixest, sandwich, broom are installed via conda 2 | install.packages( 3 | c('did2s', 'reticulate', 'ivDiag'), 4 | repos='https://cran.rstudio.com' 5 | ); 6 | install.packages( 7 | c('collapse', 'summclust', 'wildrwolf'), 8 | repos = c('https://s3alfisc.r-universe.dev', 'https://cloud.r-project.org', 'https://fastverse.r-universe.dev') 9 | ); 10 | install.packages( 11 | 'ritest', 12 | repos = c('https://grantmcdermott.r-universe.dev', 'https://cloud.r-project.org') 13 | ); 14 | -------------------------------------------------------------------------------- /scripts/run_notebooks.py: -------------------------------------------------------------------------------- 1 | """Script to run all notebooks example notebooks. 
2 | 3 | Taken from: https://github.com/pymc-labs/pymc-marketing/blob/main/scripts/_run_notebooks/runner.py 4 | """ 5 | 6 | import logging 7 | from pathlib import Path 8 | 9 | import papermill 10 | from joblib import Parallel, delayed 11 | from tqdm import tqdm 12 | 13 | KERNEL_NAME: str = "python3" 14 | DOCS = Path("docs") 15 | NOTEBOOKS: list[Path] = [ 16 | # DOCS / "compare-fixest-pyfixest.ipynb", # needs R 17 | DOCS / "difference-in-differences.ipynb", 18 | DOCS / "marginaleffects.ipynb", 19 | DOCS / "quickstart.ipynb", 20 | DOCS / "replicating-the-effect.ipynb", 21 | # DOCS / "stargazer.ipynb", # failing notebook 22 | ] 23 | 24 | 25 | def _setup_logging() -> None: 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", 29 | ) 30 | 31 | 32 | def _get_cwd_from_notebook_path(notebook_path: Path) -> str: 33 | return str(notebook_path).rsplit("/", 1)[0] 34 | 35 | 36 | def _run_notebook(notebook_path: Path) -> None: 37 | cwd = _get_cwd_from_notebook_path(notebook_path) 38 | logging.info(f"Running notebook: {notebook_path.name}") 39 | papermill.execute_notebook( 40 | input_path=str(notebook_path), 41 | output_path=None, 42 | kernel_name=KERNEL_NAME, 43 | cwd=cwd, 44 | ) 45 | 46 | 47 | if __name__ == "__main__": 48 | _setup_logging() 49 | logging.info("Starting notebook runner") 50 | logging.info(f"Notebooks to run: {NOTEBOOKS}") 51 | Parallel(n_jobs=-1)( 52 | delayed(_run_notebook)(notebook_path) for notebook_path in tqdm(NOTEBOOKS) 53 | ) 54 | 55 | logging.info("Notebooks run successfully!") 56 | -------------------------------------------------------------------------------- /src/collinear.rs: -------------------------------------------------------------------------------- 1 | use ndarray::{Array1, Array2, ArrayView2}; 2 | use numpy::IntoPyArray; 3 | use numpy::{PyArray1, PyReadonlyArray2}; 4 | use pyo3::exceptions::PyValueError; 5 | use pyo3::prelude::*; 6 | use thiserror::Error; 7 | 8 | 9 | #[derive(Debug, Error)] 10 | enum CollinearityError { 11 | #[error("Input matrix must be square, got {rows}x{cols}")] 12 | NonSquareMatrix { rows: usize, cols: usize }, 13 | 14 | #[error("Tolerance must be positive and finite, got {value}")] 15 | InvalidTolerance { value: f64 }, 16 | } 17 | 18 | 19 | /// Detect collinear (linearly dependent) columns in a symmetric matrix. 20 | /// 21 | /// Parameters 22 | /// ---------- 23 | /// x : ndarray-like of shape (p, p), dtype float64 24 | /// Symmetric (Gram) matrix `X.T @ X`. 25 | /// tol : float 26 | /// Multicollinearity threshold. 27 | /// 28 | /// Returns 29 | /// ------- 30 | /// mask : ndarray of bool, shape (p,) 31 | /// Boolean indicator of collinear columns 32 | /// n_excl : int 33 | /// Number of columns flagged as collinear 34 | /// all_collinear : bool 35 | /// `True` if all columns are collinear. 36 | /// 37 | /// * `x` - Input matrix (must be square, typically X'X in a regression model, where X is the N x k design matrix) 38 | /// * `tol` - Tolerance for detecting collinearity (smaller values require closer to exact linear dependence) 39 | /// 40 | /// Notes 41 | /// ----- 42 | /// 43 | /// The detection order depends on the original column ordering; an 44 | /// order-independent variant would add **column pivoting** (choose, at each 45 | /// step, the remaining column with the largest residual variance). 
46 | 
47 | fn find_collinear_variables_impl(
48 | x: ArrayView2<f64>,
49 | tol: f64,
50 | ) -> Result<(Array1<bool>, usize, bool), CollinearityError> {
51 | // Validate tolerance
52 | if tol <= 0.0 {
53 | return Err(CollinearityError::InvalidTolerance { value: tol });
54 | }
55 | 
56 | 
57 | let k = x.ncols();
58 | if !x.is_square() {
59 | return Err(CollinearityError::NonSquareMatrix {rows: x.nrows(), cols: k})
60 | }
61 | 
62 | let mut r = Array2::<f64>::zeros((k, k));
63 | let mut id_excl = vec![false; k];
64 | let mut n_excl = 0usize;
65 | 
66 | for j in 0..k {
67 | let mut r_jj = x[(j,j)];
68 | for k in 0..j {
69 | if id_excl[k] { continue; }
70 | let r_kj = r[(k,j)];
71 | r_jj -= r_kj * r_kj;
72 | }
73 | 
74 | if r_jj < tol {
75 | id_excl[j] = true;
76 | n_excl += 1;
77 | if n_excl == k {
78 | let arr = Array1::from_vec(id_excl);
79 | return Ok((arr, n_excl, true));
80 | }
81 | continue;
82 | }
83 | 
84 | let rjj_sqrt = r_jj.sqrt();
85 | r[(j,j)] = rjj_sqrt;
86 | 
87 | for i in (j+1)..k {
88 | let mut value = x[(i,j)];
89 | for k in 0..j {
90 | if id_excl[k] { continue; }
91 | value -= r[(k,i)] * r[(k,j)];
92 | }
93 | r[(j,i)] = value / rjj_sqrt;
94 | }
95 | }
96 | 
97 | let arr = Array1::from_vec(id_excl);
98 | Ok((arr, n_excl, false))
99 | }
100 | 
101 | /// Detect collinear (linearly dependent) columns in a square matrix.
102 | ///
103 | /// Uses a Cholesky-based algorithm to identify variables (columns) that are collinear or nearly collinear,
104 | /// based on a user-specified tolerance.
105 | ///
106 | /// Parameters
107 | /// ----------
108 | /// x : numpy.ndarray (float64)
109 | /// A square 2D array (n x n) whose columns will be checked for collinearity.
110 | /// tol : float, optional
111 | /// Threshold below which a variable is considered collinear (default is 1e-10).
112 | ///
113 | /// Returns
114 | /// -------
115 | /// mask : numpy.ndarray (bool)
116 | /// Boolean array of length `n`. `True` indicates that the column is collinear and should be excluded.
117 | /// n_excluded : int
118 | /// Number of columns detected as collinear.
119 | /// all_collinear : bool
120 | /// `True` if all columns are collinear (e.g., zero or singular matrix), else `False`.
121 | ///
122 | /// Raises
123 | /// ------
124 | /// ValueError
125 | /// If the input matrix is not square, or if the tolerance is not positive.
126 | ///
127 | /// Notes: This function is a translation of Laurent Bergé's c++ implementation in
128 | /// the fixest package.
129 | 
130 | #[pyfunction]
131 | #[pyo3(signature = (x, tol=1e-10))]
132 | pub fn _find_collinear_variables_rs(
133 | py: Python,
134 | x: PyReadonlyArray2<f64>,
135 | tol: f64,
136 | ) -> PyResult<(Py<PyArray1<bool>>, usize, bool)> {
137 | let x = x.as_array();
138 | // Call the implementation and convert any errors to Python ValueError
139 | match find_collinear_variables_impl(x, tol) {
140 | Ok((arr, n_excl, flag)) => Ok((arr.into_pyarray(py).to_owned(), n_excl, flag)),
141 | Err(err) => {
142 | // Convert Rust errors to Python ValueError
143 | Err(PyValueError::new_err(err.to_string()))
144 | }
145 | }
146 | }
147 | 
-------------------------------------------------------------------------------- /src/crv1.rs: --------------------------------------------------------------------------------
1 | use ndarray::{Array2, ArrayView1, ArrayView2, Axis};
2 | use numpy::{IntoPyArray, PyArray2, PyReadonlyArray1, PyReadonlyArray2};
3 | use pyo3::prelude::*;
4 | 
5 | fn bucket_argsort_rs(arr: &ArrayView1<usize>) -> (Vec<usize>, Vec<usize>) {
6 | // 1.
Count frequencies
7 | let maxv = *arr.iter().max().unwrap_or(&0);
8 | let counts = {
9 | let mut counts = vec![0usize; maxv + 1];
10 | arr.iter().for_each(|&v| counts[v] += 1);
11 | counts
12 | };
13 | 
14 | // 2. Compute the prefix sums of the counts vector
15 | let prefix_sum_iterator = counts.iter().scan(0, |acc, &count| {
16 | *acc += count;
17 | Some(*acc)
18 | });
19 | 
20 | // Prepend the prefix sums with 0 and collect
21 | let locs: Vec<usize> = std::iter::once(0).chain(prefix_sum_iterator).collect();
22 | 
23 | // 3. Copy locs to track insertion positions
24 | let mut pos = locs[..counts.len()].to_vec();
25 | 
26 | // 4. Build argsort result
27 | let mut args = vec![0usize; arr.len()];
28 | for (i, &v) in arr.iter().enumerate() {
29 | args[pos[v]] = i;
30 | pos[v] += 1;
31 | }
32 | 
33 | (args, locs)
34 | }
35 | 
36 | fn crv1_meat_loop_imp(
37 | scores: &ArrayView2<f64>,
38 | clustid: &ArrayView1<usize>,
39 | cluster_col: &ArrayView1<usize>,
40 | ) -> Array2<f64> {
41 | let k = scores.ncols();
42 | let (g_indices, g_locs) = bucket_argsort_rs(cluster_col);
43 | 
44 | // Compute cluster contributions
45 | let create_cluster_contrib = |&g: &usize| -> Array2<f64> {
46 | // Extract cluster indices
47 | let start = g_locs[g];
48 | let end = g_locs[g + 1];
49 | let col_indices = &g_indices[start..end];
50 | 
51 | // Sum cluster scores
52 | let score_g = scores.select(Axis(0), col_indices).sum_axis(Axis(0));
53 | 
54 | // Create the outer product
55 | let x = score_g.view().insert_axis(Axis(1));
56 | let x_t = score_g.view().insert_axis(Axis(0));
57 | x.dot(&x_t)
58 | };
59 | 
60 | clustid
61 | .iter()
62 | .map(create_cluster_contrib)
63 | .fold(Array2::zeros((k, k)), |mut acc, x| {
64 | acc += &x;
65 | acc
66 | })
67 | }
68 | 
69 | /// Compute the CRV1 meat matrix for cluster-robust standard errors.
70 | ///
71 | /// Parameters
72 | /// ----------
73 | /// scores : numpy.ndarray (float64), shape (n_obs, k)
74 | /// The score matrix, typically X' * u, where X is the design matrix and u are
75 | /// the residuals from the model fit. Rows correspond to observations, columns to parameters.
76 | /// clustid : numpy.ndarray (usize), shape (n_clusters,)
77 | /// Array of unique cluster identifiers (one for each cluster).
78 | /// cluster_col : numpy.ndarray (usize), shape (n_obs,)
79 | /// Cluster assignment for each observation; each entry must match a value in `clustid`.
80 | ///
81 | /// Returns
82 | /// -------
83 | /// meat : numpy.ndarray (float64), shape (k, k)
84 | /// The CRV1 meat matrix (sum of cluster outer products), a square matrix where
85 | /// k is the number of regression coefficients.
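///
/// Notes
/// -----
/// For each cluster g with summed score vector s_g, the returned matrix is
/// the CRV1 "meat" sum_g s_g s_g' (an outer-product accumulation), computed
/// here by bucketing observations by cluster rather than sorting them.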
86 | #[pyfunction]
87 | pub fn _crv1_meat_loop_rs(
88 | py: Python,
89 | scores: PyReadonlyArray2<f64>,
90 | clustid: PyReadonlyArray1<usize>,
91 | cluster_col: PyReadonlyArray1<usize>,
92 | ) -> PyResult<Py<PyArray2<f64>>> {
93 | let meat = crv1_meat_loop_imp(
94 | &scores.as_array(),
95 | &clustid.as_array(),
96 | &cluster_col.as_array(),
97 | );
98 | Ok(meat.into_pyarray(py).to_owned())
99 | }
100 | 
-------------------------------------------------------------------------------- /src/lib.rs: --------------------------------------------------------------------------------
1 | use pyo3::prelude::*;
2 | 
3 | mod collinear;
4 | mod crv1;
5 | mod demean;
6 | mod nested_fixed_effects;
7 | 
8 | #[pymodule]
9 | fn _core_impl(_py: Python, m: &PyModule) -> PyResult<()> {
10 | m.add_function(wrap_pyfunction!(collinear::_find_collinear_variables_rs, m)?)?;
11 | m.add_function(wrap_pyfunction!(crv1::_crv1_meat_loop_rs, m)?)?;
12 | m.add_function(wrap_pyfunction!(demean::_demean_rs, m)?)?;
13 | m.add_function(wrap_pyfunction!(
14 | nested_fixed_effects::_count_fixef_fully_nested_all_rs,
15 | m
16 | )?)?;
17 | Ok(())
18 | }
19 | 
-------------------------------------------------------------------------------- /src/nested_fixed_effects.rs: --------------------------------------------------------------------------------
1 | use ndarray::{Array1, ArrayView1, ArrayView2, Axis};
2 | use numpy::{IntoPyArray, PyArray1, PyReadonlyArray2};
3 | use pyo3::prelude::*;
4 | use std::collections::hash_map::Entry;
5 | use std::collections::{HashMap, HashSet};
6 | 
7 | #[inline]
8 | fn count_fixef_fully_nested(clusters: ArrayView1<usize>, f: ArrayView1<usize>) -> bool {
9 | let mut first_cluster: HashMap<usize, usize> = HashMap::new();
10 | for (&cl, &fv) in clusters.iter().zip(f.iter()) {
11 | match first_cluster.entry(fv) {
12 | Entry::Vacant(e) => {
13 | e.insert(cl);
14 | }
15 | Entry::Occupied(e) => {
16 | if *e.get() != cl {
17 | return false;
18 | }
19 | }
20 | }
21 | }
22 | true
23 | }
24 | 
25 | fn count_fixef_fully_nested_impl(
26 | all_fe: &[String],
27 | cluster_names: &[String],
28 | cdata: ArrayView2<usize>,
29 | fdata: ArrayView2<usize>,
30 | ) -> (Array1<bool>, usize) {
31 | let cluster_name_set: HashSet<&String> = cluster_names.iter().collect();
32 | 
33 | // An FE counts as nested if it is itself a cluster variable, or if it is fully nested within any cluster column.
34 | let mut count = 0;
35 | let mask = all_fe
36 | .iter()
37 | .enumerate()
38 | .map(|(fi, fe_name)| {
39 | let is_nested = cluster_name_set.contains(fe_name) || {
40 | cdata
41 | .axis_iter(Axis(1))
42 | .any(|cluster_col| count_fixef_fully_nested(cluster_col, fdata.column(fi)))
43 | };
44 | if is_nested {
45 | count += 1
46 | }
47 | is_nested
48 | })
49 | .collect();
50 | 
51 | (mask, count)
52 | }
53 | 
54 | 
55 | /// Compute which fixed effect columns are fully nested within any cluster variable,
56 | /// and count the number of such columns.
57 | ///
58 | /// Parameters
59 | /// ----------
60 | /// all_fixef_array : list of str
61 | /// Names of all fixed effect variables in the model.
62 | /// cluster_colnames : list of str
63 | /// Names of all cluster variables in the model.
64 | /// cluster_data : np.ndarray[usize]
65 | /// 2D array of cluster assignments (rows x cluster variables).
66 | /// fe_data : np.ndarray[usize]
67 | /// 2D array of fixed effect values (rows x fixed effects).
68 | ///
69 | /// Returns
70 | /// -------
71 | /// (np.ndarray[bool], int)
72 | /// Tuple of (mask indicating which FEs are fully nested, count of such FEs).
73 | ///
74 | /// Notes
75 | /// -----
76 | /// A fixed effect column is "fully nested" if for every unique value in that column,
77 | /// all rows with that value share the same cluster assignment (for any cluster variable).
78 | 
79 | #[pyfunction]
80 | pub fn _count_fixef_fully_nested_all_rs(
81 | py: Python<'_>,
82 | all_fixef_array: &PyAny,
83 | cluster_colnames: &PyAny,
84 | cluster_data: PyReadonlyArray2<usize>,
85 | fe_data: PyReadonlyArray2<usize>,
86 | ) -> PyResult<(Py<PyArray1<bool>>, usize)> {
87 | // Extract Python data into Rust types
88 | let all_fe: Vec<String> = all_fixef_array.extract()?;
89 | let cluster_names: Vec<String> = cluster_colnames.extract()?;
90 | let cdata = cluster_data.as_array();
91 | let fdata = fe_data.as_array();
92 | 
93 | // Call the pure Rust implementation
94 | let (mask, count) = count_fixef_fully_nested_impl(&all_fe, &cluster_names, cdata, fdata);
95 | 
96 | // Convert back to Python objects
97 | let py_mask: Py<PyArray1<bool>> = mask.into_pyarray(py).to_owned();
98 | Ok((py_mask, count))
99 | }
100 | 
-------------------------------------------------------------------------------- /tests/.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/tests/.coverage -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/tests/__init__.py -------------------------------------------------------------------------------- /tests/data/gelbach.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/py-econometrics/pyfixest/1eb5f18bdceeece42db77e2337526fc64eff2346/tests/data/gelbach.dta -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/01.csv: --------------------------------------------------------------------------------
1 | y,x1,x2,id1,id2,separated
2 | 0.0000000000,-0.9303550124,1,1,4,1
3 | 0.0000000000,0.1835959703,1,2,1,1
4 | 0.0000000000,-0.6371972561,0,2,6,0
5 | 0.0000000000,-0.4237562418,0,2,7,0
6 | 0.1527670026,-1.1799178123,0,8,4,0
7 | 0.1553160399,0.8860545158,0,1,7,0
8 | 0.1734523475,1.0502026081,0,8,3,0
9 | 0.2217264324,-0.2490162849,0,9,1,0
10 | 0.2260344625,0.9635434151,0,7,6,0
11 | 0.2283350676,0.5023207068,0,3,5,0
12 | 0.2368061543,0.9141282439,0,10,1,0
13 | 0.2410950512,-1.3616287708,0,4,5,0
14 | 0.2541858852,0.5753656030,0,3,3,0
15 | 0.2637400925,-0.6333113909,0,3,10,0
16 | 0.2677916288,1.0411013365,0,6,9,0
17 | 0.2768439949,-1.1694648266,0,8,10,0
18 | 0.2934476137,-0.7940499187,0,4,6,0
19 | 0.3290584087,0.5041465163,0,2,4,0
20 | 0.3606268466,-3.0584282875,0,3,8,0
21 | 0.4013363719,0.6099517941,0,7,5,0
22 | 0.4354907870,0.9624704719,0,6,1,0
23 | 0.4908127189,-0.7442333698,0,5,2,0
24 | 0.4976674914,-0.5138924718,0,6,4,0
25 | 0.5012444854,-1.3591595888,0,9,7,0
26 | 0.5456602573,0.0567612983,0,5,5,0
27 | 0.5634447336,1.2903038263,0,7,8,0
28 | 0.5983847380,-0.6872945428,0,6,5,0
29 | 0.6183075905,0.7253564000,0,1,5,0
30 | 0.6413634419,1.6118478775,0,4,3,0
31 | 0.6482065916,1.2488127947,0,7,1,0
32 | 0.6522977948,-0.4748489261,0,6,6,0
33 | 0.6631931663,0.4219789803,0,4,8,0
34 | 0.6953295469,-1.0251801014,0,10,6,0
35 | 0.6986964941,-0.3038678169,0,9,9,0
36 | 0.8503285050,1.8723217249,0,8,2,0
37 | 
0.9026033878,-1.0245078802,0,10,10,0 38 | 0.9204394221,0.4229967892,0,6,10,0 39 | 0.9228412509,0.4940861166,0,1,1,0 40 | 0.9359286427,1.3081433773,0,9,2,0 41 | 0.9685080647,1.2934249640,0,2,10,0 42 | 0.9945486188,-0.5332730412,0,5,1,0 43 | 1.0105472803,-0.1284428090,0,9,3,0 44 | 1.0721468925,-1.5399883986,0,6,8,0 45 | 1.1205748320,0.6894677877,0,8,5,0 46 | 1.1252909899,-1.2204582691,0,1,10,0 47 | 1.1561176777,-0.9787744284,0,9,8,0 48 | 1.1946246624,-0.0799055845,0,10,8,0 49 | 1.2046658993,-0.8231971860,0,6,7,0 50 | 1.2189750671,0.5437637568,0,3,4,0 51 | 1.2277959585,1.3177309036,0,9,10,0 52 | 1.2413842678,0.6673717499,0,8,9,0 53 | 1.2569460869,-0.0167010967,0,6,3,0 54 | 1.2587834597,-0.4196293950,0,1,8,0 55 | 1.2782599926,-0.6420007348,0,8,1,0 56 | 1.2911227942,1.1136496067,0,9,4,0 57 | 1.2973045111,-0.3824758530,0,7,9,0 58 | 1.3675237894,1.2361305952,0,5,9,0 59 | 1.3778325319,-1.0304020643,0,5,4,0 60 | 1.3857760429,0.3235974312,0,2,3,0 61 | 1.3960508108,-0.4157371819,0,2,5,0 62 | 1.4190907478,0.9920675159,0,1,2,0 63 | 1.4420653582,-0.9114651084,0,4,1,0 64 | 1.5038720369,-1.0453398228,0,3,2,0 65 | 1.5394419432,-0.1935533732,0,4,4,0 66 | 1.5747014284,0.0698969364,0,9,6,0 67 | 1.6199581623,1.3169367313,0,4,2,0 68 | 1.6392902136,-0.3978092670,0,7,10,0 69 | 1.6421631575,-0.7466211319,0,5,8,0 70 | 1.6952790022,-0.0158907417,0,5,6,0 71 | 1.7640979290,1.0598815680,0,7,4,0 72 | 1.9505974054,0.0092241317,0,10,7,0 73 | 2.0685675144,0.1434842199,0,8,8,0 74 | 2.1190843582,0.6173521280,0,3,1,0 75 | 2.1889939308,-1.9780639410,0,3,7,0 76 | 2.2176725864,-1.5379956961,0,7,3,0 77 | 2.2831020355,0.5082080960,0,2,2,0 78 | 2.3055832386,1.0296376944,0,7,2,0 79 | 2.3692295551,2.1091823578,0,10,2,0 80 | 2.7510018349,0.2632481158,0,2,9,0 81 | 2.7675759792,-0.0022486539,0,8,6,0 82 | 2.7777233124,-1.3771806955,0,10,4,0 83 | 2.7846245766,0.1415781677,0,10,5,0 84 | 2.7860391140,-2.2442505360,0,4,9,0 85 | 2.9671635628,0.2927849889,0,5,3,0 86 | 2.9819300175,-0.8325243592,0,4,10,0 87 | 3.1186814308,-0.4090226293,0,2,8,0 88 | 3.2802021503,-0.4062994719,0,4,7,0 89 | 3.4179122448,0.0959109142,0,8,7,0 90 | 3.6803083420,2.3073217869,0,1,9,0 91 | 3.7194297314,0.3930145800,0,3,9,0 92 | 3.7777581215,0.2952103019,0,6,2,0 93 | 4.1290211678,-1.4121559858,0,5,7,0 94 | 4.2730326653,-0.5140260458,0,10,9,0 95 | 4.2883334160,-1.0160779953,0,7,7,0 96 | 4.3528537750,0.7201576829,0,10,3,0 97 | 4.9981565475,0.2207358032,0,3,6,0 98 | 5.0979351997,0.7166025043,0,9,5,0 99 | 7.3969793320,2.1998977661,0,5,10,0 100 | 8.4651517868,0.1178035960,0,1,6,0 101 | 9.8326959610,0.7707119584,0,1,3,0 102 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/02.csv: -------------------------------------------------------------------------------- 1 | y,id1,id2,separated 2 | 0,1,1,0 3 | 1,1,1,0 4 | 0,2,1,1 5 | 0,2,2,0 6 | 1,2,2,0 7 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/03.csv: -------------------------------------------------------------------------------- 1 | y,id1,id2,id3,separated 2 | 0,5,3,6,1 3 | 0,5,5,7,1 4 | 1,1,3,1,0 5 | 1,1,5,2,0 6 | 1,2,4,1,0 7 | 1,2,6,2,0 8 | 1,3,1,3,0 9 | 1,3,7,4,0 10 | 1,4,2,3,0 11 | 1,4,8,4,0 12 | 1,5,1,5,0 13 | 1,5,7,8,0 14 | 1,6,2,5,0 15 | 1,6,4,6,0 16 | 1,6,6,7,0 17 | 1,6,8,8,0 18 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/04.csv: 
-------------------------------------------------------------------------------- 1 | y,id1,id2,separated 2 | 0,1,1,1 3 | 0,1,1,1 4 | 0,1,34,1 5 | 0,2,1,1 6 | 0,2,2,1 7 | 0,2,2,1 8 | 0,3,3,1 9 | 0,3,3,1 10 | 0,4,3,1 11 | 0,5,4,1 12 | 0,5,5,1 13 | 0,5,5,1 14 | 0,6,5,1 15 | 0,6,6,0 16 | 0,7,6,1 17 | 0,7,7,0 18 | 0,8,7,1 19 | 0,8,8,1 20 | 0,8,8,1 21 | 0,9,8,1 22 | 0,9,9,1 23 | 0,9,9,1 24 | 0,10,10,1 25 | 0,10,10,1 26 | 0,11,10,1 27 | 0,11,11,1 28 | 0,11,11,1 29 | 0,12,11,1 30 | 0,12,12,1 31 | 0,12,12,1 32 | 0,13,12,1 33 | 0,13,13,1 34 | 0,13,13,1 35 | 0,14,13,1 36 | 0,14,14,1 37 | 0,14,14,1 38 | 0,15,14,1 39 | 0,15,15,1 40 | 0,15,15,1 41 | 0,16,15,1 42 | 0,16,16,1 43 | 0,16,16,1 44 | 0,17,16,1 45 | 0,17,17,1 46 | 0,17,17,1 47 | 0,18,17,1 48 | 0,18,18,0 49 | 0,19,18,1 50 | 0,19,19,1 51 | 0,19,19,1 52 | 0,20,19,1 53 | 0,20,20,0 54 | 0,21,20,1 55 | 0,21,21,1 56 | 0,21,21,1 57 | 0,22,21,1 58 | 0,22,22,1 59 | 0,22,22,1 60 | 0,23,22,1 61 | 0,23,23,1 62 | 0,23,23,1 63 | 0,24,23,1 64 | 0,24,24,1 65 | 0,24,24,1 66 | 0,25,24,1 67 | 0,25,25,1 68 | 0,25,25,1 69 | 0,26,25,1 70 | 0,26,26,1 71 | 0,26,26,1 72 | 0,27,26,1 73 | 0,27,27,1 74 | 0,27,27,1 75 | 0,28,27,1 76 | 0,28,28,0 77 | 0,29,28,1 78 | 0,29,29,1 79 | 0,29,29,1 80 | 0,30,29,1 81 | 0,30,30,0 82 | 0,31,30,1 83 | 0,31,31,1 84 | 0,31,31,1 85 | 0,32,31,1 86 | 0,32,32,1 87 | 0,32,32,1 88 | 0,33,32,1 89 | 0,33,33,1 90 | 0,33,33,1 91 | 1,3,2,0 92 | 1,4,4,0 93 | 1,4,4,0 94 | 1,6,6,0 95 | 1,7,7,0 96 | 1,10,9,0 97 | 1,18,18,0 98 | 1,20,20,0 99 | 1,28,28,0 100 | 1,30,30,0 101 | 1,34,33,0 102 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/05.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,x4,separated 2 | 0,5,11,2,2,0 3 | 0,5,2,11,2,0 4 | 0,5,2,2,11,0 5 | 0,0,-1,-1,-1,1 6 | 1,5,5,5,5,0 7 | 2,4,4,4,4,0 8 | 3,3,3,3,3,0 9 | 4,2,2,2,2,0 10 | 5,1,1,1,1,0 11 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/06.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,x4,separated 2 | 0,0,0,0,0,0 3 | 0,0,0,0,0,0 4 | 0,5,11,2,2,0 5 | 0,5,2,11,2,0 6 | 0,5,2,2,11,0 7 | 0,0,-1,-1,-1,1 8 | 1,5,5,5,5,0 9 | 2,4,4,4,4,0 10 | 3,3,3,3,3,0 11 | 4,2,2,2,2,0 12 | 5,1,1,1,1,0 13 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/07.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,id1,id2,separated 2 | 0,0,0,1,1,0 3 | 0,0,0,1,1,0 4 | 0,0,0,1,2,0 5 | 0,0,0,1,3,0 6 | 0,2,0,1,3,0 7 | 0,0,1,2,2,1 8 | 0,0,0,2,2,0 9 | 0,0,0,2,2,0 10 | 0,1,0,2,2,0 11 | 0,1,2,2,3,1 12 | 0,0,1,2,4,1 13 | 0,0,0,4,2,0 14 | 0,0,1,5,2,1 15 | 1,0,0,1,1,0 16 | 1,1,0,4,3,0 17 | 1,0,0,5,4,0 18 | 2,0,0,1,2,0 19 | 2,0,0,2,1,0 20 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/08.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,id1,id2,separated 2 | 0,1,0,1,2,1 3 | 0,1,0,2,2,0 4 | 1,2,0,1,3,0 5 | 1,0,0,2,1,0 6 | 1,1,0,2,2,0 7 | 1,0,1,2,3,0 8 | 2,0,0,2,1,0 9 | 2,0,1,2,2,0 10 | 2,1,0,2,3,0 11 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/09.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,separated 2 | 
0.0000000000,0,0,1,0 3 | 0.0000000000,0,0,2,0 4 | 0.0000000000,0,0,1,0 5 | 0.9788354000,3,2,1,0 6 | 2.2596662000,1,1,2,0 7 | 2.4177196000,2,2,2,0 8 | 2.6114680000,4,2,1,0 9 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/10.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,separated 2 | 0.0000000000,-5,0,1,1 3 | 0.0000000000,-2,0,1,1 4 | 0.0000000000,0,2,1,1 5 | 0.0000000000,0,0,2,0 6 | 0.0000000000,0,0,3,1 7 | 0.0000000000,2,3,4,1 8 | 0.5000000000,2,2,2,0 9 | 1.1000000000,1,1,2,0 10 | 2.1000000000,4,2,4,0 11 | 3.3000000000,3,3,2,0 12 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/12.csv: -------------------------------------------------------------------------------- 1 | y,id1,id2,separated 2 | 0,3,3,0 3 | 0,4,3,1 4 | 0,4,4,0 5 | 0,6,6,0 6 | 0,7,7,0 7 | 0,8,8,0 8 | 0,9,9,0 9 | 1,1,1,0 10 | 1,2,2,0 11 | 1,2,2,0 12 | 1,3,3,0 13 | 1,4,4,0 14 | 1,5,5,0 15 | 1,6,6,0 16 | 1,7,7,0 17 | 1,8,8,0 18 | 1,9,9,0 19 | 1,10,10,0 20 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/13.csv: -------------------------------------------------------------------------------- 1 | y,id1,id2,separated 2 | 0,3,3,0 3 | 0,4,3,1 4 | 0,4,4,0 5 | 0,6,6,0 6 | 0,7,7,0 7 | 0,8,8,0 8 | 0,9,9,0 9 | 1,1,1,0 10 | 1,2,2,0 11 | 1,2,2,0 12 | 1,3,3,0 13 | 1,4,4,0 14 | 1,5,5,0 15 | 1,6,6,0 16 | 1,7,7,0 17 | 1,8,8,0 18 | 1,9,9,0 19 | 1,10,10,0 20 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/14.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,separated 2 | 0,1,0,0 3 | 0,1,0,0 4 | 0,-1,0,0 5 | 0,0,1,0 6 | 0,0,1,0 7 | 0,0,1,0 8 | 0,0,-1,0 9 | 0,0,-1,0 10 | 1,0,0,0 11 | 2,0,0,0 12 | 3,0,0,0 13 | 4,0,0,0 14 | 5,0,0,0 15 | 6,0,0,0 16 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/15.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,separated 2 | 0,1,0,0,0 3 | 0,1,0,0,0 4 | 0,-1,0,0,0 5 | 0,0,1,0,0 6 | 0,0,1,0,0 7 | 0,0,1,0,0 8 | 0,0,-1,0,0 9 | 0,0,-1,0,0 10 | 0,0,0,1,1 11 | 0,0,0,2,1 12 | 0,0,0,3,1 13 | 0,0,0,4,1 14 | 0,0,0,5,1 15 | 0,0,0,6,1 16 | 0,0,0,7,1 17 | 0,0,0,8,1 18 | 0,0,0,1000,1 19 | 1,0,0,0,0 20 | 2,0,0,0,0 21 | 3,0,0,0,0 22 | 4,0,0,0,0 23 | 5,0,0,0,0 24 | 6,0,0,0,0 25 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/16.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,separated 2 | 0,3,-1,2,1 3 | 0,5,2,-9,1 4 | 0,5,4,0,1 5 | 0,4.5,3,0,1 6 | 0,-3,-13,1,1 7 | 0,6.5,6,2,1 8 | 0,5,2,10,1 9 | 0,1.5,-3,-2,1 10 | 0,1,-4,1,1 11 | 0,4,0,-7,1 12 | 0,8.5,11,5,1 13 | 0,4,2,9,1 14 | 0,.5,-6,7,1 15 | 0,2,-3,-3,1 16 | 0,4.5,3,7,1 17 | 0,.5,-5,-29,1 18 | 0,4,2,7,1 19 | 0,6,6,-19,1 20 | 0,3.5,-1,2,1 21 | 0,2.5,-1,-17,1 22 | 0,2.5,0,3,1 23 | 0,.5,-6,-44,1 24 | 0,5,2,1,1 25 | 0,3.5,0,-8,1 26 | 0,1,-4,0,1 27 | 0,7.5,8,-36,1 28 | 0,2,-3,-6,1 29 | 0,3,-3,-41,1 30 | 0,4.5,1,1,1 31 | 0,0,-4,-5,0 32 | 0,1.5,-1,8,0 33 | 0,.5,-3,-28,0 34 | 0,-2,-8,-5,0 35 | 0,4.5,5,4,0 36 | 0,-1,-6,1,0 37 | 0,4.5,5,-31,0 38 | 0,4,4,-2,0 39 | 0,3.5,3,-35,0 40 | 0,-2.5,-9,-9,0 41 | 0,-1.5,-7,-17,0 42 | 0,3.5,3,-2,0 43 | 
0,3.5,3,-25,0 44 | 0,3,2,-2,0 45 | 0,0,-4,-25,0 46 | 0,3.5,3,-5,0 47 | 0,-.5,-5,-33,0 48 | 0,3,2,-4,0 49 | 0,0,-4,-23,0 50 | 0,4.5,5,-6,0 51 | 0,6,8,-54,0 52 | 0,0,-4,-4,0 53 | 0,4.5,5,-30,0 54 | 0,2,0,-12,0 55 | 0,-.5,-5,-14,0 56 | 0,-2,-8,8,0 57 | 0,1.5,-1,-3,0 58 | 0,-1,-6,-7,0 59 | 0,2.5,1,-51,0 60 | 0,2.5,1,8,0 61 | 0,-1,-6,-28,0 62 | 0,1,-2,-2,0 63 | 0,1,-2,-27,0 64 | 0,2.5,1,4,0 65 | 0,1,-2,-33,0 66 | 0,2.5,1,-3,0 67 | 0,.5,-3,-36,0 68 | 0,2,0,-4,0 69 | 0,5,6,-25,0 70 | 0,-4.5,-13,9,0 71 | 0,5,6,-30,0 72 | 0,1,-2,6,0 73 | 0,1,-2,-44,0 74 | 0,1,-2,-3,0 75 | 0,-.5,-5,-13,0 76 | 0,-1.5,-7,-4,0 77 | 0,0,-4,-18,0 78 | 0,0,-4,4,0 79 | 0,-1,-6,-9,0 80 | 0,1,-2,-3,0 81 | 0,0,-4,-24,0 82 | 0,3.5,3,-2,0 83 | 0,-2,-8,-29,0 84 | 0,2.5,1,-4,0 85 | 0,3,2,-10,0 86 | 0,0,-4,-2,0 87 | .149277,.5,-3,-33,0 88 | .190858,4,4,4,0 89 | .341605,3,2,1,0 90 | .3791031,2,0,-4,0 91 | .3860548,4,4,-30,0 92 | .4186196,7,10,4,0 93 | .535357,2.5,1,-53,0 94 | .7474937,8,12,-3,0 95 | 1.188081,2,0,-36,0 96 | 1.461509,4.5,5,11,0 97 | 1.809239,2,0,-40,0 98 | 1.940061,5.5,7,-6,0 99 | 2.535522,2.5,1,-23,0 100 | 2.784749,1.5,-1,2,0 101 | 5.984444,7,10,-45,0 102 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/17.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,separated 2 | 0,-3,5,4,1 3 | 0,4,3.5,0,1 4 | 0,-5,5,5,1 5 | 0,-2,3,-3,1 6 | 0,5,6,3,1 7 | 0,3,2.5,-2,1 8 | 0,5,5.5,5,1 9 | 0,-25,1.5,-3,1 10 | 0,4,4,0,1 11 | 0,-16,6,6,1 12 | 0,-3,5,2,1 13 | 0,-25,6.5,8,1 14 | 0,3,2,-4,1 15 | 0,-44,-1,-8,1 16 | 0,0,3.5,1,1 17 | 0,-2,7,6,1 18 | 0,-6,3.5,1,1 19 | 0,-14,7,6,1 20 | 0,-1,1,-5,1 21 | 0,-20,5,3,1 22 | 0,-4,6.5,4,1 23 | 0,-13,6,3,1 24 | 0,-6,5,3,1 25 | 0,-3,5.5,3,1 26 | 0,2,6.5,6,1 27 | 0,-17,3.5,0,1 28 | 0,1,3,0,1 29 | 0,-24,4.5,4,1 30 | 0,-5,4.5,3,1 31 | 0,-36,1.5,-1,0 32 | 0,6,0,-4,0 33 | 0,-33,1,-2,0 34 | 0,-4,1.5,-1,0 35 | 0,-17,4,4,0 36 | 0,4,3.5,3,0 37 | 0,-6,.5,-3,0 38 | 0,-8,-3,-10,0 39 | 0,-34,-3.5,-11,0 40 | 0,12,-.5,-5,0 41 | 0,-22,1.5,-1,0 42 | 0,4,3,2,0 43 | 0,-3,.5,-3,0 44 | 0,8,.5,-3,0 45 | 0,-37,3,2,0 46 | 0,1,-1,-6,0 47 | 0,-24,3,2,0 48 | 0,2,-2,-8,0 49 | 0,2,-2,-8,0 50 | 0,2,4.5,5,0 51 | 0,-41,6,8,0 52 | 0,-5,4,4,0 53 | 0,-23,1.5,-1,0 54 | 0,-6,-1,-6,0 55 | 0,-42,-1,-6,0 56 | 0,-3,3,2,0 57 | 0,-45,1.5,-1,0 58 | 0,0,4,4,0 59 | 0,2,2.5,1,0 60 | 0,-6,5,6,0 61 | 0,-34,1.5,-1,0 62 | 0,-4,4.5,5,0 63 | 0,-15,3.5,3,0 64 | 0,6,2,0,0 65 | 0,-16,1.5,-1,0 66 | 0,6,2,0,0 67 | 0,-37,1.5,-1,0 68 | 0,0,1.5,-1,0 69 | 0,-9,-6.5,-17,0 70 | 0,6,5.5,7,0 71 | 0,-10,1,-2,0 72 | 0,1,5.5,7,0 73 | 0,-35,.5,-3,0 74 | 0,-7,5.5,7,0 75 | 0,-39,2.5,1,0 76 | 0,-16,3.5,3,0 77 | 0,-12,1,-2,0 78 | 0,-5,2.5,1,0 79 | 0,-37,2.5,1,0 80 | 0,-8,-1,-6,0 81 | 0,-19,7,10,0 82 | 0,2,0,-4,0 83 | 0,-43,3,2,0 84 | 0,2,4.5,5,0 85 | 0,-15,-2,-8,0 86 | 0,-2,1.5,-1,0 87 | 0,-19,2,0,0 88 | 0,1,3,2,0 89 | 0,-17,0,-4,0 90 | 0,-8,-2,-8,0 91 | 0,-37,1,-2,0 92 | 0,3,4.5,5,0 93 | 0,-45,3,2,0 94 | 0,1,1,-2,0 95 | 0,-29,1.5,-1,0 96 | 0,2,0,-4,0 97 | .3131518,-8,-.5,-5,0 98 | .3476705,-6,0,-4,0 99 | .8587969,-19,1,-2,0 100 | 3.179766,-4,-.5,-5,0 101 | 4.394069,-22,1.5,-1,0 102 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/18.csv: -------------------------------------------------------------------------------- 1 | y,x1,x2,x3,separated 2 | 0,4.5,3,-8,1 3 | 0,5,3,4,1 4 | 0,5.5,3,-1,1 5 | 0,0,-6,-1,1 6 | 0,1.5,-4,-4,1 7 | 0,-1.5,-10,7,1 8 | 0,3.5,-1,-8,1 9 | 
0,-1,-8,-2,1 10 | 0,7.5,8,4,1 11 | 0,-2,-9,-6,1 12 | 0,2.5,-1,3,1 13 | 0,4.5,1,-40,1 14 | 0,3,-2,3,1 15 | 0,0,-8,-22,1 16 | 0,4.5,0,-2,1 17 | 0,3.5,-1,-33,1 18 | 0,3.5,1,-1,1 19 | 0,2,-2,-56,1 20 | 0,3,0,-2,1 21 | 0,4,2,-39,1 22 | 0,5.5,4,6,1 23 | 0,4.5,2,2,1 24 | 0,1.5,-4,-9,1 25 | 0,-.5,-8,-34,1 26 | 0,5.5,2,-9,1 27 | 0,1,-4,-5,1 28 | 0,1.5,-5,-9,1 29 | 0,8,8,-34,1 30 | 0,2.5,-3,0,1 31 | 0,.5,-3,-43,0 32 | 0,2.5,1,-1,0 33 | 0,2,0,-6,0 34 | 0,2,0,11,0 35 | 0,1.5,-1,-44,0 36 | 0,2.5,1,4,0 37 | 0,2.5,1,-37,0 38 | 0,.5,-3,0,0 39 | 0,2.5,1,-52,0 40 | 0,.5,-3,-4,0 41 | 0,0,-4,-25,0 42 | 0,2.5,1,2,0 43 | 0,3.5,3,-45,0 44 | 0,4.5,5,-1,0 45 | 0,4.5,5,-47,0 46 | 0,2.5,1,9,0 47 | 0,-2,-8,-44,0 48 | 0,4,4,-3,0 49 | 0,5,6,-34,0 50 | 0,0,-4,-7,0 51 | 0,3,2,-32,0 52 | 0,4,4,-1,0 53 | 0,5,6,-37,0 54 | 0,-1,-6,-3,0 55 | 0,.5,-3,-48,0 56 | 0,3,2,-1,0 57 | 0,3,2,-14,0 58 | 0,0,-4,0,0 59 | 0,6,8,-14,0 60 | 0,1.5,-1,5,0 61 | 0,2.5,1,-11,0 62 | 0,.5,-3,-2,0 63 | 0,-.5,-5,-21,0 64 | 0,1,-2,-5,0 65 | 0,-.5,-5,-52,0 66 | 0,-1.5,-7,7,0 67 | 0,-1,-6,-56,0 68 | 0,2.5,1,-8,0 69 | 0,3,2,-39,0 70 | 0,1.5,-1,-5,0 71 | 0,-2.5,-9,-48,0 72 | 0,.5,-3,7,0 73 | 0,4,4,-48,0 74 | 0,2,0,2,0 75 | 0,4,4,0,0 76 | 0,1,-2,0,0 77 | 0,1.5,-1,-42,0 78 | 0,2.5,1,-2,0 79 | 0,1,-2,-13,0 80 | 0,5.5,7,6,0 81 | 0,6.5,9,-54,0 82 | 0,5.5,7,-4,0 83 | 0,4,4,-24,0 84 | 0,1.5,-1,1,0 85 | 0,1.5,-1,-27,0 86 | 0,4.5,5,3,0 87 | 0,6.5,9,-36,0 88 | 0,3,2,12,0 89 | 0,-3,-10,-9,0 90 | 0,1.5,-1,-9,0 91 | 0,4.5,5,-14,0 92 | .2704671,3,2,3,0 93 | .3969807,1,-2,-28,0 94 | .4520356,-2,-8,9,0 95 | .6291217,1,-2,-42,0 96 | .7097536,0,-4,7,0 97 | 1.218813,-2,-8,-7,0 98 | 1.41596,0,-4,1,0 99 | 1.53977,2.5,1,-46,0 100 | 2.074442,-.5,-5,-1,0 101 | 3.495168,-1,-6,-46,0 102 | -------------------------------------------------------------------------------- /tests/data/ppmlhdfe_separation_examples/readme.md: -------------------------------------------------------------------------------- 1 | ## Separation Data Sets 2 | 3 | All files in this document stem from the [pplmhdfe test suite](https://github.com/sergiocorreia/ppmlhdfe/tree/master/test/separation_datasets), 4 | published under MIT license. 
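Each file carries a `separated` column that flags the observations a separation check should detect. A minimal sketch of how a test might consume one of these sets (paths and column names as above; the pyfixest-side separation check itself is not shown):

```python
import pandas as pd

df = pd.read_csv("tests/data/ppmlhdfe_separation_examples/02.csv")
expected = df["separated"].astype(bool)  # ground-truth flags from ppmlhdfe
```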
5 | -------------------------------------------------------------------------------- /tests/data/ritest_results.csv: -------------------------------------------------------------------------------- 1 | "formula","resampvar","cluster","pval","se","ci_lower" 2 | "Y~X1+f3","X1","none",0.012,0.00179109271688753,0.00905391464842118 3 | "Y~X1+f3","X1","group_id",0.0128,0.0018490835319912,0.00975852824586804 4 | "Y~X1+f3","f3","none",7e-04,0.000435056775142265,-1.56047145225662e-05 5 | "Y~X1+f3","f3","group_id",0,0,0 6 | "Y~X1+f3","X1=-0.75","none",0,0,0 7 | "Y~X1+f3","X1=-0.75","group_id",0,0,0 8 | "Y~X1+f3","f3>0.05","none",0,0,0 9 | "Y~X1+f3","f3>0.05","group_id",0,0,0 10 | "Y~X1+f3|f1","X1","none",0.0015,0.000636602935248383,0.000452881353028745 11 | "Y~X1+f3|f1","X1","group_id",0.0019,0.000716329440884561,0.000721742921068911 12 | "Y~X1+f3|f1","f3","none",1e-04,0.000164485362695147,-0.000170554345409541 13 | "Y~X1+f3|f1","f3","group_id",0,0,0 14 | "Y~X1+f3|f1","X1=-0.75","none",0,0,0 15 | "Y~X1+f3|f1","X1=-0.75","group_id",0,0,0 16 | "Y~X1+f3|f1","f3>0.05","none",0,0,0 17 | "Y~X1+f3|f1","f3>0.05","group_id",0,0,0 18 | "Y~X1+f3|f1+f2","X1","none",0.0027,0.000853579083127533,0.00129598734922777 19 | "Y~X1+f3|f1+f2","X1","group_id",0.0042,0.00106379924748561,0.00245020594942505 20 | "Y~X1+f3|f1+f2","f3","none",0,0,0 21 | "Y~X1+f3|f1+f2","f3","group_id",0,0,0 22 | "Y~X1+f3|f1+f2","X1=-0.75","none",0,0,0 23 | "Y~X1+f3|f1+f2","X1=-0.75","group_id",0,0,0 24 | "Y~X1+f3|f1+f2","f3>0.05","none",0,0,0 25 | "Y~X1+f3|f1+f2","f3>0.05","group_id",0,0,0 26 | -------------------------------------------------------------------------------- /tests/r_test_comparisons.R: -------------------------------------------------------------------------------- 1 | # Load necessary libraries 2 | library(fixest) 3 | library(ritest) 4 | library(reticulate) 5 | 6 | set.seed(1232) 7 | 8 | # Import the pyfixest package 9 | pyfixest <- import("pyfixest") 10 | data <- pyfixest$get_data(N = as.integer(1000), seed = as.integer(2999)) 11 | 12 | # Define the function to run the tests 13 | run_tests_ritest <- function(data) { 14 | # Print the column names of the data 15 | print(names(data)) 16 | 17 | # Define the formulas, resampling variables, and clusters 18 | formulas <- c("Y~X1+f3", "Y~X1+f3|f1", "Y~X1+f3|f1+f2") 19 | resampvars <- c("X1", "f3", "X1=-0.75", "f3>0.05") 20 | clusters <- c(NA, "group_id") 21 | reps <- 10000 22 | seed <- 123 23 | 24 | # Initialize an empty data frame to store results 25 | results <- data.frame() 26 | 27 | # Loop through each combination of formula, resampvar, and cluster 28 | for (fml in formulas) { 29 | for (resampvar in resampvars) { 30 | for (cluster in clusters) { 31 | fit <- feols(as.formula(fml), data = data) 32 | 33 | if (!is.na(cluster)) { 34 | res_r <- ritest(object = fit, resampvar = resampvar, cluster = cluster, reps = reps, seed = seed) 35 | } else { 36 | res_r <- ritest(object = fit, resampvar = resampvar, reps = reps, seed = seed) 37 | } 38 | 39 | results <- rbind(results, data.frame( 40 | formula = fml, 41 | resampvar = resampvar, 42 | cluster = ifelse(is.na(cluster), "none", cluster), 43 | pval = res_r$pval, 44 | se = res_r$se, 45 | ci_lower = res_r$ci[1] 46 | )) 47 | } 48 | } 49 | } 50 | 51 | # Save the results to a CSV file 52 | write.csv(results, "tests/data/ritest_results.csv", row.names = FALSE) 53 | } 54 | 55 | # Run the tests 56 | run_tests_ritest(data) 57 | -------------------------------------------------------------------------------- /tests/readme.md: 
--------------------------------------------------------------------------------
1 | ## Readme
2 | 
3 | - [Check how closely `PyFixest` reproduces standard errors produced via `fixest` and `stats::glm`](https://github.com/py-econometrics/pyfixest/tree/master/tests/check-crv-diffs-fixest-pyfixest-glm.qmd)
4 | - [Test `PyFixest` against `fixest`](https://github.com/py-econometrics/pyfixest/tree/master/tests/test_vs_fixest.py)
5 | - `pandas` needs to be a version lower than `1.5.3` to be compatible with `rpy2`, else you'll run into [this error](https://stackoverflow.com/questions/76404811/attributeerror-dataframe-object-has-no-attribute-iteritems). The GitHub Actions workflows for testing ensure that `pandas` is of a version lower than `1.5.3`.
6 | 
-------------------------------------------------------------------------------- /tests/test_api.py: --------------------------------------------------------------------------------
1 | import duckdb
2 | import numpy as np
3 | import pandas as pd
4 | import pytest
5 | from formulaic.errors import FactorEvaluationError
6 | 
7 | import pyfixest as pf
8 | from pyfixest.utils.utils import get_data
9 | 
10 | 
11 | def test_api():
12 | df1 = get_data()
13 | df2 = get_data(model="Fepois")
14 | 
15 | fit1 = pf.feols("Y ~ X1 + X2 | f1", data=df1)
16 | fit2 = pf.estimation.fepois(
17 | "Y ~ X1 + X2 + f2 | f1", data=df2, vcov={"CRV1": "f1+f2"}
18 | )
19 | fit_multi = pf.feols("Y + Y2 ~ X1", data=df2)
20 | 
21 | pf.summary(fit1)
22 | pf.report.summary(fit2)
23 | pf.etable([fit1, fit2])
24 | pf.coefplot([fit1, fit2])
25 | 
26 | pf.summary(fit_multi)
27 | pf.etable(fit_multi)
28 | pf.coefplot(fit_multi)
29 | 
30 | 
31 | def test_feols_args():
32 | """
33 | Check feols function arguments.
34 | 
35 | Arguments to check:
36 | - copy_data
37 | - store_data
38 | - fixef_tol
39 | - solver
40 | """
41 | df = pf.get_data()
42 | 
43 | fit1 = pf.feols(fml="Y ~ X1 | f1 + f2", data=df, copy_data=True)
44 | fit2 = pf.feols(fml="Y ~ X1 | f1 + f2", data=df, copy_data=False)
45 | 
46 | assert (fit1.coef() == fit2.coef()).all()
47 | 
48 | fit3 = pf.feols(fml="Y ~ X1 | f1 + f2", data=df, store_data=False, fixef_tol=1e-02)
49 | if hasattr(fit3, "_data"):
50 | raise AttributeError(
51 | "The 'fit3' object has the attribute '_data', which should not be present."
52 | )
53 | 
54 | assert fit1.coef().xs("X1") != fit3.coef().xs("X1")
55 | assert np.abs(fit1.coef().xs("X1") - fit3.coef().xs("X1")) < 0.01
56 | 
57 | fit4 = pf.feols(fml="Y ~ X1 | f1 + f2", data=df, solver="np.linalg.solve")
58 | fit5 = pf.feols(fml="Y ~ X1 | f1 + f2", data=df, solver="np.linalg.lstsq")
59 | 
60 | assert (fit4.coef() == fit5.coef()).all()
61 | 
62 | 
63 | def test_fepois_args():
64 | """
65 | Check fepois function arguments.
66 | 
67 | Arguments to check:
68 | - copy_data
69 | - store_data
70 | - fixef_tol
71 | - solver
72 | """
73 | df = pf.get_data(model="Fepois")
74 | 
75 | fit1 = pf.fepois(fml="Y ~ X1 | f1 + f2", data=df, copy_data=True)
76 | fit2 = pf.fepois(fml="Y ~ X1 | f1 + f2", data=df, copy_data=False)
77 | 
78 | assert (fit1.coef() == fit2.coef()).all()
79 | 
80 | fit3 = pf.fepois(fml="Y ~ X1 | f1 + f2", data=df, store_data=False, fixef_tol=1e-02)
81 | if hasattr(fit3, "_data"):
82 | raise AttributeError(
83 | "The 'fit3' object has the attribute '_data', which should not be present."
84 | ) 85 | 86 | assert fit1.coef().xs("X1") != fit3.coef().xs("X1") 87 | assert np.abs(fit1.coef().xs("X1") - fit3.coef().xs("X1")) < 0.01 88 | 89 | fit4 = pf.fepois(fml="Y ~ X1 | f1 + f2", data=df, solver="np.linalg.solve") 90 | fit5 = pf.fepois(fml="Y ~ X1 | f1 + f2", data=df, solver="np.linalg.lstsq") 91 | 92 | np.testing.assert_allclose(fit4.coef(), fit5.coef(), rtol=1e-12) 93 | 94 | 95 | def test_lean(): 96 | data = pf.get_data() 97 | fit = pf.feols("Y ~ X1 + X2 | f1", data=data, lean=True) 98 | 99 | assert not hasattr(fit, "_data") 100 | assert not hasattr(fit, "_X") 101 | assert not hasattr(fit, "_Y") 102 | 103 | 104 | def test_duckdb_input(): 105 | data_pandas = pf.get_data() 106 | data_duckdb = duckdb.query("SELECT * FROM data_pandas") 107 | fit_pandas = pf.feols("Y ~ X1 | f1 + f2", data=data_pandas) 108 | fit_duckdb = pf.feols("Y ~ X1 | f1 + f2", data=data_duckdb) 109 | assert type(fit_pandas) is type(fit_duckdb) 110 | np.testing.assert_allclose(fit_pandas.coef(), fit_duckdb.coef(), rtol=1e-12) 111 | np.testing.assert_allclose(fit_pandas.se(), fit_duckdb.se(), rtol=1e-12) 112 | 113 | 114 | def _lspline(series: pd.Series, knots: list[float]) -> np.ndarray: 115 | """Generate a linear spline design matrix for the input series based on knots.""" 116 | vector = series.values 117 | columns = [] 118 | 119 | for i, knot in enumerate(knots): 120 | column = np.minimum(vector, knot if i == 0 else knot - knots[i - 1]) 121 | columns.append(column) 122 | vector = vector - column 123 | 124 | # Add the remainder as the last column 125 | columns.append(vector) 126 | 127 | # Combine columns into a design matrix 128 | return np.column_stack(columns) 129 | 130 | 131 | @pytest.fixture 132 | def spline_data(): 133 | """Fixture to prepare data with spline splits.""" 134 | data = pf.get_data() 135 | data["Y"] = np.where(data["Y"] > data["Y"].median(), 1, 0) 136 | spline_split = _lspline(data["X2"], [0, 1]) 137 | data["X2_0"], data["0_X2_1"], data["1_X2"] = spline_split.T 138 | return data 139 | 140 | 141 | @pytest.mark.parametrize( 142 | "method,family", 143 | [ 144 | ("feols", None), 145 | ("feglm", "logit"), 146 | ("feglm", "probit"), 147 | ("feglm", "gaussian"), 148 | ], 149 | ) 150 | def test_context_capture(spline_data, method, family): 151 | method_kwargs = {"data": spline_data} 152 | if family: 153 | method_kwargs["family"] = family 154 | 155 | explicit_fit = getattr(pf, method)("Y ~ X2_0 + 0_X2_1 + 1_X2", **method_kwargs) 156 | context_captured_fit = getattr(pf, method)( 157 | "Y ~ _lspline(X2,[0,1])", context=0, **method_kwargs 158 | ) 159 | context_captured_fit_map = getattr(pf, method)( 160 | "Y ~ _lspline(X2,[0,1])", context={"_lspline": _lspline}, **method_kwargs 161 | ) 162 | 163 | for context_fit in [context_captured_fit, context_captured_fit_map]: 164 | np.testing.assert_allclose(context_fit.coef(), explicit_fit.coef(), rtol=1e-12) 165 | np.testing.assert_allclose(context_fit.se(), explicit_fit.se(), rtol=1e-12) 166 | 167 | # FactorEvaluationError for `feols` when context is not set 168 | if method == "feols": 169 | with pytest.raises( 170 | FactorEvaluationError, match="Unable to evaluate factor `_lspline" 171 | ): 172 | pf.feols("Y ~ _lspline(X2,[0,1]) | f1 + f2", data=spline_data) 173 | -------------------------------------------------------------------------------- /tests/test_collinearity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | 5 | from
pyfixest.core import find_collinear_variables 6 | from pyfixest.estimation.numba.find_collinear_variables_nb import ( 7 | _find_collinear_variables_nb, 8 | ) 9 | 10 | 11 | @pytest.mark.parametrize("fn", [find_collinear_variables, _find_collinear_variables_nb]) 12 | def test_find_collinear_variables(benchmark, fn): 13 | """Test the find_collinear_variables function with various test cases.""" 14 | # ========================================================================= 15 | # Test Case 1: Simple collinearity 16 | # ========================================================================= 17 | # Create a matrix with a simple collinearity: the appended last column is the sum of two existing columns (indices 1 and 2) 18 | N = 100 19 | dim = 1000 20 | X1 = np.random.RandomState(495).randn(dim, N) 21 | X1 = np.concatenate([X1, X1[:, [1]] + X1[:, [2]]], axis=1) 22 | X1 = X1.T @ X1 23 | # Test with default tolerance 24 | collinear_flags, n_collinear, all_collinear = benchmark(fn, X1) 25 | 26 | # The appended (last) column should be flagged as collinear 27 | expected_flags = np.array(N * [False] + [True]) 28 | assert_array_equal(collinear_flags, expected_flags) 29 | assert n_collinear == 1 30 | assert not all_collinear 31 | -------------------------------------------------------------------------------- /tests/test_confint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from pyfixest.estimation.estimation import feols 8 | from pyfixest.utils.utils import get_data 9 | 10 | 11 | def test_confint(): 12 | """Test the confint method of the feols class.""" 13 | data = get_data() 14 | fit = feols("Y ~ X1 + X2 + C(f1)", data=data) 15 | confint = fit.confint() 16 | 17 | np.testing.assert_allclose(confint, fit.confint(alpha=0.05)) 18 | assert np.all(confint.loc[:, "2.5%"] == fit.confint(alpha=0.05).loc[:, "2.5%"]) 19 | assert np.all(confint.loc[:, "97.5%"] == fit.confint(alpha=0.05).loc[:, "97.5%"]) 20 | assert np.all(confint.loc[:, "2.5%"] < fit.confint(alpha=0.10).loc[:, "5.0%"]) 21 | assert np.all(confint.loc[:, "97.5%"] > fit.confint(alpha=0.10).loc[:, "95.0%"]) 22 | 23 | # test keep, drop, and exact_match 24 | assert fit.confint(keep="X1", exact_match=True).shape[0] == 1 25 | assert ( 26 | fit.confint(drop=["X2"], exact_match=True).shape[0] == len(fit._coefnames) - 1 27 | ) 28 | assert fit.confint(keep="X").shape[0] == 2 29 | 30 | # simultaneous CIs are always wider than pointwise CIs 31 | for _ in range(5): 32 | assert np.all( 33 | confint.loc[:, "2.5%"] > fit.confint(alpha=0.05, joint=True).loc[:, "2.5%"] 34 | ) 35 | assert np.all( 36 | confint.loc[:, "97.5%"] 37 | < fit.confint(alpha=0.05, joint=True).loc[:, "97.5%"] 38 | ) 39 | 40 | # test seeds 41 | confint1 = fit.confint(joint=True, seed=1) 42 | confint2 = fit.confint(joint=True, seed=1) 43 | confint3 = fit.confint(joint=True, seed=2) 44 | 45 | assert np.all(confint1 == confint2) 46 | assert np.all(confint1 != confint3) 47 | 48 | 49 | @pytest.mark.skipif(sys.version_info >= (3, 12), reason="requires python3.11 or lower.") 50 | def test_against_doubleml(): 51 | """Test joint CIs against DoubleML.""" 52 | import doubleml as dml 53 | from sklearn.base import clone 54 | from sklearn.linear_model import LinearRegression 55 | 56 | rng = np.random.default_rng(2002) 57 | n_obs = 5_000 58 | n_vars = 100 59 | X = rng.normal(size=(n_obs, n_vars)) 60 | theta = np.array([3.0, 3.0, 3.0]) 61 | y = np.dot(X[:, :3], theta) + rng.standard_normal(size=(n_obs,)) 62 | 63 | dml_data =
dml.DoubleMLData.from_arrays(X[:, 10:], y, X[:, :10]) 64 | learner = LinearRegression() 65 | ml_l = clone(learner) 66 | ml_m = clone(learner) 67 | dml_plr = dml.DoubleMLPLR(dml_data, ml_l, ml_m) 68 | dml_res = dml_plr.fit().bootstrap(n_rep_boot=10_000).confint(joint=True) 69 | 70 | df = pd.DataFrame( 71 | np.c_[y, X], columns=["y"] + ["X_" + str(x) for x in range(n_vars)] 72 | ) 73 | m = feols( 74 | f"y ~ -1 + {'+'.join(['X_' + str(x) for x in range(n_vars)])}", 75 | df, 76 | vcov="hetero", 77 | ) 78 | pyfixest_res = m.confint(keep="X_.$", reps=10_000, joint=True) 79 | 80 | assert np.all(np.abs(dml_res.values - pyfixest_res.values) < 1e-2) 81 | -------------------------------------------------------------------------------- /tests/test_count_fixef_fully_nested.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from numpy.testing import assert_array_equal 4 | 5 | from pyfixest.core import count_fixef_fully_nested_all 6 | 7 | # numba reference implementation of the same routine 8 | from pyfixest.estimation.numba.nested_fixef_nb import ( 9 | _count_fixef_fully_nested_all as count_fixef_fully_nested_all_nb, 10 | ) 11 | 12 | 13 | @pytest.mark.parametrize( 14 | "fn", [count_fixef_fully_nested_all, count_fixef_fully_nested_all_nb] 15 | ) 16 | def test_count_fixef_fully_nested_basic(benchmark, fn): 17 | """Basic test for the count_fixef_fully_nested_all function.""" 18 | # Setup test data 19 | all_fe = np.array(["fe1", "fe2", "cluster1"]) 20 | cluster_names = np.array(["cluster1"]) 21 | 22 | # Fixed effects data where fe1 is nested in cluster1 23 | fe_data = np.array( 24 | [ 25 | [1, 5, 0], # row 1 - fe1=1, fe2=5, cluster1=0 26 | [1, 6, 1], # row 2 - fe1=1, fe2=6, cluster1=1 27 | [2, 5, 0], # row 3 - fe1=2, fe2=5, cluster1=0 28 | [2, 6, 1], # row 4 - fe1=2, fe2=6, cluster1=1 29 | ], 30 | dtype=np.uintp, 31 | ) 32 | 33 | # Cluster data - one column for the "cluster1" variable 34 | cluster_data = np.array( 35 | [ 36 | [10], # fe1=1 always maps to cluster=10 37 | [10], 38 | [20], # fe1=2 always maps to cluster=20 39 | [20], 40 | ], 41 | dtype=np.uintp, 42 | ) 43 | 44 | # Call function 45 | mask, count = benchmark(fn, all_fe, cluster_names, cluster_data, fe_data) 46 | 47 | # Expected results 48 | expected_mask = np.array( 49 | [True, False, True] 50 | ) # fe1 is nested, fe2 is not, cluster1 is a cluster 51 | expected_count = 2 52 | 53 | # Assertions 54 | assert_array_equal(np.array(mask), expected_mask) 55 | assert count == expected_count 56 | -------------------------------------------------------------------------------- /tests/test_crv1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pyfixest.core import crv1_meat_loop as crv1_meat_loop_rs 5 | from pyfixest.estimation.vcov_utils import _crv1_meat_loop 6 | 7 | 8 | @pytest.mark.parametrize("func", [_crv1_meat_loop, crv1_meat_loop_rs]) 9 | def test_crv1_meat_loop(benchmark, func): 10 | # Input data 11 | scores = np.array( 12 | [ 13 | [1.0, 2.0], 14 | [3.0, 4.0], 15 | [5.0, 6.0], 16 | [7.0, 8.0], 17 | ] 18 | ) 19 | clustid = np.array([0, 1]) 20 | cluster_col = np.array([0, 0, 1, 1]) 21 | 22 | # Expected: 23 | # For group 0: indices [0, 1], sum = [4.0, 6.0] 24 | # outer = [[16, 24], [24, 36]] 25 | # For group 1: indices [2, 3], sum = [12.0, 14.0] 26 | # outer = [[144, 168], [168, 196]] 27 | # Total = sum of the two 28 | expected = np.array([[160, 192], [192, 232]]) 29 | 30 | result =
benchmark( 31 | func, scores, clustid.astype(np.uint), cluster_col.astype(np.uint) 32 | ) 33 | 34 | assert np.allclose(result, expected), f"Expected {expected}, got {result}" 35 | -------------------------------------------------------------------------------- /tests/test_detect_singletons.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pyfixest.estimation.detect_singletons_ import detect_singletons 5 | from pyfixest.estimation.jax.detect_singletons_jax import detect_singletons_jax 6 | 7 | input1 = np.array([[0, 2, 1], [0, 2, 1], [0, 1, 3], [0, 1, 2], [0, 1, 2]]) 8 | solution1 = np.array([False, False, True, False, False]) 9 | 10 | input2 = np.array([[0, 2, 1], [0, 2, 1], [3, 1, 2], [0, 1, 1], [0, 1, 2]]) 11 | solution2 = np.array([False, False, True, True, True]) 12 | 13 | input3 = np.array([[0, 2, 1], [0, 2, 1], [0, 1, 1], [0, 1, 2], [0, 1, 2]]) 14 | solution3 = np.array([False, False, False, False, False]) 15 | 16 | 17 | @pytest.mark.parametrize( 18 | argnames="input, solution", 19 | argvalues=[(input1, solution1), (input2, solution2), (input3, solution3)], 20 | ) 21 | @pytest.mark.parametrize( 22 | argnames="detection_function", 23 | argvalues=[detect_singletons, detect_singletons_jax], 24 | ids=["numba", "jax"], 25 | ) 26 | def test_correctness(input, solution, detection_function): 27 | assert np.array_equal(detection_function(input), solution) 28 | 29 | 30 | @pytest.mark.parametrize( 31 | argnames="detection_function", 32 | argvalues=[detect_singletons, detect_singletons_jax], 33 | ids=["numba", "jax"], 34 | ) 35 | def test_single_column(detection_function): 36 | """Test with a single fixed effect column.""" 37 | input_data = np.array([[0], [0], [1], [2], [2]]) 38 | expected = np.array([False, False, True, False, False]) 39 | result = detection_function(input_data) 40 | assert np.array_equal(result, expected) 41 | 42 | 43 | @pytest.mark.parametrize( 44 | argnames="detection_function", 45 | argvalues=[detect_singletons, detect_singletons_jax], 46 | ids=["numba", "jax"], 47 | ) 48 | def test_all_singletons(detection_function): 49 | """Test when all observations are singletons.""" 50 | input_data = np.array([[0, 1], [1, 2], [2, 3], [3, 4]]) 51 | expected = np.array([True, True, True, True]) 52 | result = detection_function(input_data) 53 | assert np.array_equal(result, expected) 54 | 55 | 56 | @pytest.mark.parametrize( 57 | argnames="detection_function", 58 | argvalues=[detect_singletons, detect_singletons_jax], 59 | ids=["numba", "jax"], 60 | ) 61 | def test_no_singletons(detection_function): 62 | """Test when there are no singletons.""" 63 | input_data = np.array([[0, 0], [0, 0], [1, 1], [1, 1]]) 64 | expected = np.array([False, False, False, False]) 65 | result = detection_function(input_data) 66 | assert np.array_equal(result, expected) 67 | 68 | 69 | @pytest.mark.parametrize( 70 | argnames="detection_function", 71 | argvalues=[detect_singletons, detect_singletons_jax], 72 | ids=["numba", "jax"], 73 | ) 74 | def test_large_input(detection_function): 75 | """Test with a larger input to check performance and correctness.""" 76 | rng = np.random.default_rng(42) 77 | N = 10000 78 | input_data = np.column_stack( 79 | [ 80 | rng.integers(0, N // 10, N), 81 | rng.integers(0, N // 5, N), 82 | rng.integers(0, N // 2, N), 83 | ] 84 | ) 85 | 86 | # For large input, we compare against the Numba implementation as reference 87 | reference = detect_singletons(input_data) 88 | result = 
detection_function(input_data) 89 | 90 | assert np.array_equal(result, reference) 91 | assert len(result) == N 92 | assert result.dtype == np.bool_ 93 | -------------------------------------------------------------------------------- /tests/test_exceptions.py: -------------------------------------------------------------------------------- 1 | from pyfixest.utils._exceptions import find_stack_level 2 | 3 | 4 | def test_find_stack_level(): 5 | # The function returns at least 1 when called from test code, as we're at 6 | # least one level away from the pyfixest package 7 | level = find_stack_level() 8 | assert level >= 1, "Stack level should be at least 1 when called from tests" 9 | 10 | def wrapper_function(): 11 | # Should return a higher level as we're nested deeper 12 | return find_stack_level() 13 | 14 | nested_level = wrapper_function() 15 | assert nested_level >= level, "Nested call should return same or higher stack level" 16 | 17 | 18 | def test_find_stack_level_imports(): 19 | # Test that we can properly import and access the package 20 | import pyfixest as pf 21 | 22 | assert pf is not None 23 | assert hasattr(pf, "__file__") 24 | -------------------------------------------------------------------------------- /tests/test_feols_feglm_internally.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import pyfixest as pf 5 | 6 | 7 | def check_absolute_diff(x1, x2, tol, msg=None): 8 | "Check for absolute differences." 9 | if isinstance(x1, (int, float)): 10 | x1 = np.array([x1]) 11 | if isinstance(x2, (int, float)): 12 | x2 = np.array([x2]) 13 | msg = "" if msg is None else msg 14 | 15 | # handle nan values 16 | nan_mask_x1 = np.isnan(x1) 17 | nan_mask_x2 = np.isnan(x2) 18 | 19 | if not np.array_equal(nan_mask_x1, nan_mask_x2): 20 | raise AssertionError(f"{msg}: NaN positions do not match") 21 | 22 | valid_mask = ~nan_mask_x1  # Mask for non-NaN elements (same for x1 and x2) 23 | assert np.all(np.abs(x1[valid_mask] - x2[valid_mask]) < tol), msg 24 | 25 | 26 | fml_list = [ 27 | ("Y ~ X1 + C(f1)", "Y~X1 | f1"), 28 | ("Y ~ X1 + C(f1) + C(f2)", "Y~X1 | f1 + f2"), 29 | ] 30 | 31 | fml_ols_vs_gaussian = ["Y ~ X1", "Y ~ X1 + C(f1)", "Y ~ X1 * X2"] 32 | 33 | 34 | @pytest.mark.parametrize("fml", fml_ols_vs_gaussian) 35 | @pytest.mark.parametrize("inference", ["iid", "hetero", {"CRV1": "f1"}]) 36 | @pytest.mark.parametrize("dropna", [True]) 37 | def test_ols_vs_gaussian_glm(fml, inference, dropna): 38 | data = pf.get_data() 39 | if dropna: 40 | data = data.dropna() 41 | 42 | fit_ols = pf.feols(fml=fml, data=data, vcov=inference) 43 | fit_gaussian = pf.feglm(fml=fml, data=data, family="gaussian", vcov=inference) 44 | 45 | check_absolute_diff( 46 | fit_ols.coef().xs("X1"), fit_gaussian.coef().xs("X1"), tol=1e-10 47 | ) 48 | check_absolute_diff(fit_ols._weights[0:5], fit_gaussian._weights[0:5], tol=1e-10) 49 | check_absolute_diff(fit_ols._u_hat[0:5], fit_gaussian._u_hat[0:5], tol=1e-10) 50 | check_absolute_diff(fit_ols._scores[0, :], fit_gaussian._scores[0, :], tol=1e-10) 51 | 52 | if inference != "iid": 53 | # iid inference differs: feglm follows the iid-glm convention (bread only, not bread x sigma2) 54 | check_absolute_diff( 55 | fit_ols.se().xs("X1"), fit_gaussian.se().xs("X1"), tol=1e-10 56 | ) 57 | check_absolute_diff(fit_ols._vcov[0, 0], fit_gaussian._vcov[0, 0], tol=1e-10) 58 | 59 | 60 | @pytest.mark.skip("Fixed effects are not yet supported.") 61 | @pytest.mark.parametrize("fml", fml_list) 62 | @pytest.mark.parametrize("family",
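# only the gaussian family is compared here; the test itself is skipped above until fixed effects are supported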
["gaussian"]) 63 | def test_feols_feglm_internally(fml, family): 64 | data = pf.get_data() 65 | data["Y"] = np.where(data["Y"] > 0, 1, 0) 66 | 67 | fml1, fml2 = fml 68 | 69 | fit1 = pf.feglm( 70 | fml=fml1, data=data, family=family, ssc=pf.ssc(adj=False, cluster_adj=False) 71 | ) 72 | fit2 = pf.feglm( 73 | fml=fml2, data=data, family=family, ssc=pf.ssc(adj=False, cluster_adj=False) 74 | ) 75 | 76 | assert fit1.coef().xs("X1") == fit2.coef().xs("X1"), ( 77 | f"Test failed for fml = {fml} and family = gaussian" 78 | ) 79 | assert fit1.se().xs("X1") == fit2.se().xs("X1"), ( 80 | f"Test failed for fml = {fml} and family = gaussian" 81 | ) 82 | assert fit1._u_hat[0:5] 83 | -------------------------------------------------------------------------------- /tests/test_formulas.py: -------------------------------------------------------------------------------- 1 | from pyfixest.estimation.FormulaParser import ( 2 | _dict_to_list_of_formulas, 3 | _find_multiple_estimation_syntax, 4 | _input_formula_to_dict, 5 | ) 6 | 7 | 8 | def test_input_formula_to_dict(): 9 | assert _input_formula_to_dict("a+b+c") == {"constant": ["a", "b", "c"]} 10 | 11 | assert _input_formula_to_dict("sw(x,y)") == {"constant": [], "sw": ["x", "y"]} 12 | 13 | assert _input_formula_to_dict("a+sw0(x,y)+d") == { 14 | "constant": ["a", "d"], 15 | "sw0": ["x", "y"], 16 | } 17 | 18 | assert _input_formula_to_dict("csw(x,y)") == {"constant": [], "csw": ["x", "y"]} 19 | 20 | assert _input_formula_to_dict("csw0(x,y,z)") == { 21 | "constant": [], 22 | "csw0": ["x", "y", "z"], 23 | } 24 | 25 | assert _input_formula_to_dict("a+b+csw0(x,y,z)") == { 26 | "constant": ["a", "b"], 27 | "csw0": ["x", "y", "z"], 28 | } 29 | 30 | 31 | def test_dict_to_list_of_formulas(): 32 | # assert _dict_to_list_of_formulas([]) == [] 33 | 34 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "sw0": ["a", "b"]}) == [ 35 | "x+y", 36 | "x+y+a", 37 | "x+y+b", 38 | ] 39 | assert _dict_to_list_of_formulas({"constant": [], "sw0": ["a", "b"]}) == [ 40 | "0", 41 | "a", 42 | "b", 43 | ] 44 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "sw0": []}) == ["x+y"] 45 | 46 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "sw": ["a", "b"]}) == [ 47 | "x+y+a", 48 | "x+y+b", 49 | ] 50 | assert _dict_to_list_of_formulas({"constant": [], "sw": ["a", "b"]}) == ["a", "b"] 51 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "sw": []}) == ["x+y"] 52 | 53 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "csw0": ["a", "b"]}) == [ 54 | "x+y", 55 | "x+y+a", 56 | "x+y+a+b", 57 | ] 58 | assert _dict_to_list_of_formulas({"constant": [], "csw0": ["a", "b"]}) == [ 59 | "0", 60 | "a", 61 | "a+b", 62 | ] 63 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "csw0": []}) == ["x+y"] 64 | 65 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "csw": ["a", "b"]}) == [ 66 | "x+y+a", 67 | "x+y+a+b", 68 | ] 69 | assert _dict_to_list_of_formulas({"constant": [], "csw": ["a", "b"]}) == [ 70 | "a", 71 | "a+b", 72 | ] 73 | assert _dict_to_list_of_formulas({"constant": ["x", "y"], "csw": []}) == ["x+y"] 74 | 75 | 76 | def test_find_multiple_estimation_syntax_no_match(): 77 | x = "a + b + c" 78 | assert _find_multiple_estimation_syntax(x) == (x, None) 79 | 80 | 81 | def test_find_multiple_estimation_syntax_sw(): 82 | x = "sw(a, b, c)" 83 | expected = (["a", " b", " c"], "sw") 84 | assert _find_multiple_estimation_syntax(x) == expected 85 | 86 | 87 | def test_find_multiple_estimation_syntax_csw(): 88 | x = "csw(a, b, c)" 89 | expected = 
(["a", " b", " c"], "csw") 90 | assert _find_multiple_estimation_syntax(x) == expected 91 | 92 | 93 | def test_find_multiple_estimation_syntax_sw0(): 94 | x = "sw0(a, b, c)" 95 | expected = (["a", " b", " c"], "sw0") 96 | assert _find_multiple_estimation_syntax(x) == expected 97 | 98 | 99 | def test_find_multiple_estimation_syntax_csw0(): 100 | x = "csw0(a, b, c)" 101 | expected = (["a", " b", " c"], "csw0") 102 | assert _find_multiple_estimation_syntax(x) == expected 103 | -------------------------------------------------------------------------------- /tests/test_i.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | import rpy2.robjects as ro 5 | from rpy2.robjects import pandas2ri 6 | 7 | # rpy2 imports 8 | from rpy2.robjects.packages import importr 9 | 10 | from pyfixest.estimation.estimation import feols 11 | 12 | pandas2ri.activate() 13 | 14 | fixest = importr("fixest") 15 | stats = importr("stats") 16 | broom = importr("broom") 17 | 18 | 19 | @pytest.mark.against_r_core 20 | def test_i(): 21 | df_het = pd.read_csv("pyfixest/did/data/df_het.csv") 22 | df_het["X"] = np.random.normal(size=len(df_het)) 23 | 24 | if ( 25 | "C(rel_year)[T.1.0]" 26 | in feols("dep_var~i(rel_year, ref = 1.0)", df_het)._coefnames 27 | ): 28 | raise AssertionError("C(rel_year)[T.1.0] should not be in the column names.") 29 | if ( 30 | "C(rel_year)[T.-2.0]" 31 | in feols("dep_var~i(rel_year,ref=-2.0)", df_het)._coefnames 32 | ): 33 | raise AssertionError("C(rel_year)[T.-2.0] should not be in the column names.") 34 | 35 | if ( 36 | "C(rel_year)[T.1.0]:treat" 37 | in feols("dep_var~i(rel_year, treat, ref=1.0)", df_het)._coefnames 38 | ): 39 | raise AssertionError( 40 | "C(rel_year)[T.1.0]:treat should not be in the column names." 41 | ) 42 | if ( 43 | "C(rel_year)[T.-2.0]:treat" 44 | in feols("dep_var~i(rel_year, treat,ref=-2.0)", df_het)._coefnames 45 | ): 46 | raise AssertionError( 47 | "C(rel_year)[T.-2.0]:treat should not be in the column names." 
48 | ) 49 | 50 | with pytest.raises(ValueError): 51 | feols("dep_var~i(rel_year, ref = [1.0, 'a'])", df_het) 52 | 53 | 54 | @pytest.mark.against_r_core 55 | def test_i_vs_fixest(): 56 | df_het = pd.read_csv("pyfixest/did/data/df_het.csv") 57 | df_het = df_het[df_het["year"] >= 2010] 58 | # ------------------------------------------------------------------------ # 59 | # no fixed effects 60 | 61 | # no references 62 | fit_py = feols("dep_var~i(treat)", df_het) 63 | fit_r = fixest.feols(ro.Formula("dep_var~i(treat)"), df_het) 64 | np.testing.assert_allclose( 65 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 66 | ) 67 | 68 | fit_py = feols("dep_var~i(rel_year)", df_het) 69 | fit_r = fixest.feols(ro.Formula("dep_var~i(rel_year)"), df_het) 70 | np.testing.assert_allclose( 71 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 72 | ) 73 | 74 | # with references 75 | fit_py = feols("dep_var~i(treat, ref = False)", df_het) 76 | fit_r = fixest.feols(ro.Formula("dep_var~i(treat, ref = FALSE)"), df_het) 77 | np.testing.assert_allclose( 78 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 79 | ) 80 | 81 | fit_py = feols("dep_var~i(rel_year, ref = 1.0)", df_het) 82 | fit_r = fixest.feols(ro.Formula("dep_var~i(rel_year, ref = c(1))"), df_het) 83 | np.testing.assert_allclose( 84 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 85 | ) 86 | 87 | # ------------------------------------------------------------------------ # 88 | # with fixed effects 89 | 90 | # no references 91 | fit_py = feols("dep_var~i(treat) | year", df_het) 92 | fit_r = fixest.feols(ro.Formula("dep_var~i(treat)|year"), df_het) 93 | np.testing.assert_allclose( 94 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 95 | ) 96 | 97 | fit_py = feols("dep_var~i(rel_year) | year", df_het) 98 | fit_r = fixest.feols(ro.Formula("dep_var~i(rel_year)|year"), df_het) 99 | np.testing.assert_allclose( 100 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 101 | ) 102 | 103 | # with references 104 | fit_py = feols("dep_var~i(treat,ref=False) | year", df_het) 105 | fit_r = fixest.feols(ro.Formula("dep_var~i(treat, ref = FALSE)|year"), df_het) 106 | np.testing.assert_allclose( 107 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 108 | ) 109 | 110 | fit_py = feols("dep_var~i(rel_year,ref=1.0) | year", df_het) 111 | fit_r = fixest.feols(ro.Formula("dep_var~i(rel_year, ref = c(1))|year"), df_het) 112 | np.testing.assert_allclose( 113 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 114 | ) 115 | 116 | 117 | @pytest.mark.against_r_core 118 | @pytest.mark.parametrize( 119 | "fml", 120 | [ 121 | "dep_var ~ i(state)", 122 | "dep_var ~ i(state, ref = 1)", 123 | "dep_var ~ i(state, year)", 124 | "dep_var ~ i(state, year, ref = 1)", 125 | "dep_var ~ i(state, year) | state", 126 | "dep_var ~ i(state, year, ref = 1) | state", 127 | ], 128 | ) 129 | def test_i_interacted_fixest(fml): 130 | df_het = pd.read_csv("pyfixest/did/data/df_het.csv") 131 | df_het["X"] = np.random.normal(size=df_het.shape[0]) 132 | 133 | fit_py = feols(fml, df_het) 134 | fit_r = fixest.feols(ro.Formula(fml), df_het) 135 | np.testing.assert_allclose( 136 | fit_py.coef().values, np.array(fit_r.rx2("coefficients")) 137 | ) 138 | -------------------------------------------------------------------------------- /tests/test_model_matrix.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pyfixest as pf 4 | 5 | 6 | # Define the fixture to provide data 7 |
@pytest.fixture 8 | def data(): 9 | return pf.get_data() 10 | 11 | 12 | # Parameterize the test function directly with formulas 13 | @pytest.mark.parametrize( 14 | "fml", 15 | [ 16 | "Y ~ i(f1)", 17 | "Y ~ i(f1, ref = 1.0)", 18 | "Y ~ i(f1, X1)", 19 | "Y ~ i(f1, X1, ref = 2.0)", 20 | "Y ~ i(f1) + X2", 21 | "Y ~ i(f1, ref = 1.0) + X2", 22 | "Y ~ i(f1, X1) + X2", 23 | "Y ~ i(f1, X1, ref = 2.0) + X2", 24 | ], 25 | ) 26 | def test_get_icovars(data, fml): 27 | # Use the data and fml from the fixture and parameterization 28 | fit = pf.feols(fml, data=data) 29 | assert len(fit._icovars) > 0, "No icovars found" 30 | assert "X2" not in fit._icovars, "X2 is found in _icovars" 31 | -------------------------------------------------------------------------------- /tests/test_multicollinearity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from pyfixest.estimation.estimation import feols 5 | 6 | 7 | def test_multicollinearity_error(): 8 | rng = np.random.default_rng(4) 9 | 10 | N = 10000 11 | X1 = rng.normal(0, 1, N) 12 | Y = rng.normal(0, 1, N) 13 | f1 = rng.choice([0, 1, 2, 3, 4], N, True) 14 | f2 = f1.copy() 15 | f3 = f1.copy() 16 | f3 = np.where(f3 == 1, 0, f3) 17 | 18 | data = pd.DataFrame( 19 | { 20 | "Y": Y, 21 | "X1": X1, 22 | "X2": X1 + rng.normal(0, 0.00000000001, N), 23 | "f1": f1, 24 | "f2": f2, 25 | "f3": f3, 26 | } 27 | ) 28 | 29 | fit = feols("Y ~ X1 + X2", data=data) 30 | assert fit._coefnames == ["Intercept", "X1"] 31 | 32 | fit = feols("Y ~ X1 + f1 + X2 + f2", data=data) 33 | assert fit._coefnames == ["Intercept", "X1", "f1"] 34 | 35 | fit = feols("Y ~ X1 + f1 + X2 | f2", data=data) 36 | assert fit._coefnames == ["X1"] 37 | -------------------------------------------------------------------------------- /tests/test_poisson.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import os 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import pytest 7 | import rpy2.robjects as ro 8 | from rpy2.robjects import pandas2ri 9 | 10 | # rpy2 imports 11 | from rpy2.robjects.packages import importr 12 | 13 | import pyfixest as pf 14 | from pyfixest.estimation.estimation import fepois 15 | 16 | pandas2ri.activate() 17 | 18 | fixest = importr("fixest") 19 | 20 | 21 | def test_separation(): 22 | """Test separation detection.""" 23 | example1 = pd.DataFrame.from_dict( 24 | { 25 | "Y": [0, 0, 0, 1, 2, 3], 26 | "fe1": ["a", "a", "b", "b", "b", "c"], 27 | "fe2": ["c", "c", "d", "d", "d", "e"], 28 | "X": np.random.normal(0, 1, 6), 29 | } 30 | ) 31 | with pytest.warns( 32 | UserWarning, match="2 observations removed because of separation." 33 | ): 34 | fepois("Y ~ X | fe1", data=example1, vcov="hetero", separation_check=["fe"]) 35 | 36 | if False: 37 | # this example is taken from ppmlhdfe's primer on separation https://github.com/sergiocorreia/ppmlhdfe/blob/master/guides/separation_primer.md 38 | # disabled because we currently do not perform separation checks if no fixed effects are provided 39 | # TODO: enable once separation checks without fixed effects are enabled 40 | example2 = pd.DataFrame.from_dict( 41 | { 42 | "Y": [0, 0, 0, 1, 2, 3], 43 | "X1": [2, -1, 0, 0, 5, 6], 44 | "X2": [5, 10, 0, 0, -10, -12], 45 | } 46 | ) 47 | 48 | with pytest.warns( 49 | UserWarning, match="2 observations removed because of separation." 
50 | ): 51 | fepois("Y ~ X1 + X2", data=example2, vcov="hetero", separation_check=["ir"]) 52 | 53 | # ppmlhdfe test data sets (check readme in data/ppmlhdfe_separation_examples) 54 | path = os.path.dirname(os.path.abspath(__file__)) 55 | folder = r"data/ppmlhdfe_separation_examples" 56 | fns = sorted( 57 | [fn for fn in os.listdir(os.path.join(path, folder)) if fn.endswith(".csv")] 58 | ) 59 | for fn in fns: 60 | if fn in ["07.csv"]: 61 | # this case fails but is not tested in ppmlhdfe 62 | # https://github.com/sergiocorreia/ppmlhdfe/blob/master/test/validate_tagsep.do#L27 63 | continue 64 | data = pd.read_csv(os.path.join(path, folder, fn)) 65 | # build formula dynamically from dataframe 66 | # datasets have fixed structure of the form (y, x1, ..., xN, id1, ..., idM, separated) 67 | fml = "y"  # dependent variable y 68 | regressors = data.columns[ 69 | data.columns.str.startswith("x") 70 | ]  # regressors x1,...,xN 71 | fixed_effects = data.columns[ 72 | data.columns.str.startswith("id") 73 | ]  # fixed effects id1,...,idM 74 | 75 | if regressors.empty: 76 | # TODO: formulae with just a constant term and fixed effects throw error in FIT.get_fit(), e.g., for 03.csv and Y ~ 1 | id1 + id2 + id3? 77 | continue 78 | fml += f" ~ {' + '.join(regressors)}" 79 | 80 | if fixed_effects.empty: 81 | # TODO: separation checks are currently disabled if no fixed effects are specified; enable tests once we run separation check without fixed effects 82 | continue 83 | else: 84 | fml += f" | {' + '.join(fixed_effects)}" 85 | 86 | with ( 87 | pytest.warns( 88 | UserWarning, 89 | match=f"{data.separated.sum()} observations removed because of separation.", 90 | ) as record, 91 | contextlib.suppress(Exception), 92 | ): 93 | pf.fepois(fml, data=data, separation_check=["ir"]) 94 | 95 | # if no separation, no warning is raised 96 | if data.separated.sum() == 0: 97 | assert len(record) == 0 98 | 99 | 100 | @pytest.mark.against_r_core 101 | @pytest.mark.parametrize("fml", ["Y ~ X1", "Y ~ X1 | f1"]) 102 | def test_against_fixest(fml): 103 | data = pf.get_data(model="Fepois") 104 | iwls_tol = 1e-12 105 | 106 | # vcov = "hetero" 107 | vcov = "hetero" 108 | fit = pf.fepois(fml, data=data, vcov=vcov, iwls_tol=iwls_tol) 109 | fit_r = fixest.fepois(ro.Formula(fml), data=data, vcov=vcov, glm_tol=iwls_tol) 110 | 111 | np.testing.assert_allclose( 112 | fit_r.rx2("irls_weights").reshape(-1, 1), fit._weights, atol=1e-08, rtol=1e-07 113 | ) 114 | np.testing.assert_allclose( 115 | fit_r.rx2("linear.predictors").reshape(-1, 1), 116 | fit._Xbeta, 117 | atol=1e-08, 118 | rtol=1e-07, 119 | ) 120 | np.testing.assert_allclose( 121 | fit_r.rx2("scores").reshape(-1, 1), 122 | fit._scores.reshape(-1, 1), 123 | atol=1e-08, 124 | rtol=1e-07, 125 | ) 126 | 127 | np.testing.assert_allclose( 128 | fit_r.rx2("hessian"), fit._hessian, atol=1e-08, rtol=1e-07 129 | ) 130 | 131 | np.testing.assert_allclose( 132 | fit_r.rx2("deviance"), fit.deviance, atol=1e-08, rtol=1e-07 133 | ) 134 | -------------------------------------------------------------------------------- /tests/test_ritest.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import numpy as np 3 | import pandas as pd 4 | import pytest 5 | 6 | import pyfixest as pf 7 | 8 | matplotlib.use("Agg")  # Use a non-interactive backend 9 | 10 | 11 | @pytest.mark.extended 12 | @pytest.mark.parametrize("fml", ["Y~X1+f3", "Y~X1+f3|f1", "Y~X1+f3|f1+f2"]) 13 | @pytest.mark.parametrize("resampvar", ["X1", "f3"]) 14 | @pytest.mark.parametrize("reps",
[111, 212]) 15 | @pytest.mark.parametrize("cluster", [None, "group_id"]) 16 | def test_algos_internally(data, fml, resampvar, reps, cluster): 17 | fit = pf.feols(fml, data=data) 18 | 19 | rng1 = np.random.default_rng(1234) 20 | rng2 = np.random.default_rng(1234) 21 | 22 | kwargs = { 23 | "resampvar": resampvar, 24 | "reps": reps, 25 | "type": "randomization-c", 26 | "store_ritest_statistics": True, 27 | "cluster": cluster, 28 | } 29 | 30 | kwargs1 = kwargs.copy() 31 | kwargs2 = kwargs.copy() 32 | 33 | kwargs1["choose_algorithm"] = "slow" 34 | kwargs1["rng"] = rng1 35 | kwargs2["choose_algorithm"] = "fast" 36 | kwargs2["rng"] = rng2 37 | 38 | res1 = fit.ritest(**kwargs1) 39 | ritest_stats1 = fit._ritest_statistics.copy() 40 | 41 | res2 = fit.ritest(**kwargs2) 42 | ritest_stats2 = fit._ritest_statistics.copy() 43 | 44 | assert np.allclose(res1.Estimate, res2.Estimate, atol=1e-8, rtol=1e-8) 45 | assert np.allclose(res1["Pr(>|t|)"], res2["Pr(>|t|)"], atol=1e-8, rtol=1e-8) 46 | assert np.allclose(ritest_stats1, ritest_stats2, atol=1e-8, rtol=1e-8) 47 | 48 | 49 | @pytest.mark.extended 50 | @pytest.mark.parametrize("fml", ["Y~X1+f3", "Y~X1+f3|f1"]) 51 | @pytest.mark.parametrize("resampvar", ["X1"]) 52 | @pytest.mark.parametrize("cluster", [None, "group_id"]) 53 | def test_randomization_t_vs_c(fml, resampvar, cluster): 54 | data = pf.get_data(N=300) 55 | 56 | fit1 = pf.feols(fml, data=data) 57 | fit2 = pf.feols(fml, data=data) 58 | 59 | rng1 = np.random.default_rng(12354) 60 | rng2 = np.random.default_rng(12354) 61 | 62 | fit1.ritest( 63 | resampvar="X1", 64 | type="randomization-c", 65 | rng=rng1, 66 | cluster=cluster, 67 | store_ritest_statistics=True, 68 | reps=100, 69 | ) 70 | fit2.ritest( 71 | resampvar="X1", 72 | type="randomization-t", 73 | rng=rng2, 74 | cluster=cluster, 75 | store_ritest_statistics=True, 76 | reps=100, 77 | ) 78 | 79 | # just a weak test that both are somewhat close 80 | assert np.abs(fit1._ritest_pvalue - fit2._ritest_pvalue) < ( 81 | 0.03 82 | if cluster is None 83 | else 0.06 84 | ), ( 85 | f"P-values are too different for randomization-c and randomization-t tests for {fml} and {resampvar} and {cluster}."
86 | ) 87 | 88 | 89 | @pytest.fixture 90 | def ritest_results(): 91 | # Load the CSV file into a pandas DataFrame 92 | file_path = "tests/data/ritest_results.csv" 93 | results_df = pd.read_csv(file_path) 94 | results_df.set_index(["formula", "resampvar", "cluster"], inplace=True) 95 | return results_df 96 | 97 | 98 | @pytest.fixture 99 | def data(): 100 | return pf.get_data(N=1000, seed=2999) 101 | 102 | 103 | @pytest.mark.extended 104 | @pytest.mark.parametrize("fml", ["Y~X1+f3", "Y~X1+f3|f1", "Y~X1+f3|f1+f2"]) 105 | @pytest.mark.parametrize("resampvar", ["X1", "f3", "X1=-0.75", "f3>0.05"]) 106 | @pytest.mark.parametrize("cluster", [None, "group_id"]) 107 | def test_vs_r(data, fml, resampvar, cluster, ritest_results): 108 | fit = pf.feols(fml, data=data) 109 | reps = 4000 110 | 111 | rng1 = np.random.default_rng(1234) 112 | 113 | kwargs = { 114 | "resampvar": resampvar, 115 | "reps": reps, 116 | "type": "randomization-c", 117 | "cluster": cluster, 118 | } 119 | 120 | kwargs1 = kwargs.copy() 121 | 122 | kwargs1["choose_algorithm"] = "fast" 123 | kwargs1["rng"] = rng1 124 | 125 | res1 = fit.ritest(**kwargs1) 126 | 127 | if cluster is not None: 128 | pval = ritest_results.xs( 129 | (fml, resampvar, cluster), level=("formula", "resampvar", "cluster") 130 | )["pval"].to_numpy() 131 | se = ritest_results.xs( 132 | (fml, resampvar, cluster), level=("formula", "resampvar", "cluster") 133 | )["se"].to_numpy() 134 | ci_lower = ritest_results.xs( 135 | (fml, resampvar, cluster), level=("formula", "resampvar", "cluster") 136 | )["ci_lower"].to_numpy() 137 | else: 138 | pval = ritest_results.xs( 139 | (fml, resampvar, "none"), level=("formula", "resampvar", "cluster") 140 | )["pval"].to_numpy() 141 | se = ritest_results.xs( 142 | (fml, resampvar, "none"), level=("formula", "resampvar", "cluster") 143 | )["se"].to_numpy() 144 | ci_lower = ritest_results.xs( 145 | (fml, resampvar, "none"), level=("formula", "resampvar", "cluster") 146 | )["ci_lower"].to_numpy() 147 | 148 | assert np.allclose(res1["Pr(>|t|)"], pval, rtol=0.005, atol=0.005) 149 | assert np.allclose(res1["Std. 
Error (Pr(>|t|))"], se, rtol=0.005, atol=0.005) 150 | assert np.allclose(res1["2.5% (Pr(>|t|))"], ci_lower, rtol=0.005, atol=0.005) 151 | 152 | 153 | @pytest.mark.extended 154 | def test_fepois_ritest(): 155 | data = pf.get_data(model="Fepois") 156 | fit = pf.fepois("Y ~ X1*f3", data=data) 157 | fit.ritest(resampvar="f3", reps=2000, store_ritest_statistics=True) 158 | 159 | assert fit._ritest_statistics is not None 160 | assert np.allclose(fit.pvalue().xs("f3"), fit._ritest_pvalue, rtol=0.01, atol=0.01) 161 | 162 | 163 | @pytest.fixture 164 | def data_r_vs_t(): 165 | return pf.get_data(N=5000, seed=2999) 166 | -------------------------------------------------------------------------------- /tests/test_solvers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pyfixest.estimation.solvers import solve_ols 5 | 6 | 7 | def test_solve_ols_simple_2x2(): 8 | # Test case 1: Simple 2x2 system 9 | tZX = np.array([[4, 2], [2, 3]]) 10 | tZY = np.array([10, 8]) 11 | solver = "scipy.linalg.solve" 12 | solution = solve_ols(tZX, tZY, solver) 13 | assert np.allclose(solution, np.array([1.75, 1.5])) 14 | # Verify solution satisfies the system 15 | assert np.allclose(tZX @ solution, tZY) 16 | 17 | 18 | def test_solve_ols_identity(): 19 | # Test case 2: Identity matrix 20 | tZX = np.eye(2) 21 | tZY = np.array([1, 2]) 22 | solver = "scipy.linalg.solve" 23 | assert np.allclose(solve_ols(tZX, tZY, solver), tZY) 24 | 25 | 26 | @pytest.mark.parametrize( 27 | argnames="solver", 28 | argvalues=[ 29 | "scipy.linalg.solve", 30 | "np.linalg.lstsq", 31 | "np.linalg.solve", 32 | "scipy.sparse.linalg.lsqr", 33 | "jax", 34 | ], 35 | ids=[ 36 | "scipy.linalg.solve", 37 | "np.linalg.lstsq", 38 | "np.linalg.solve", 39 | "scipy.sparse.linalg.lsqr", 40 | "jax", 41 | ], 42 | ) 43 | def test_solve_ols_different_solvers(solver): 44 | # Test case 3: Test different solvers give same result 45 | tZX = np.array([[4, 2], [2, 3]]) 46 | tZY = np.array([10, 8]) 47 | solution = solve_ols(tZX, tZY, solver) 48 | assert np.allclose(solution, np.array([1.75, 1.5])) 49 | # Verify solution satisfies the system 50 | assert np.allclose(tZX @ solution, tZY) 51 | 52 | 53 | def test_solve_ols_invalid_solver(): 54 | # Test case 4: Invalid solver 55 | tZX = np.array([[1, 2], [3, 4]]) 56 | tZY = np.array([5, 6]) 57 | with pytest.raises(ValueError): 58 | solve_ols(tZX, tZY, "invalid_solver") 59 | -------------------------------------------------------------------------------- /tests/test_summarise.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | import statsmodels.formula.api as smf 4 | 5 | import pyfixest as pf 6 | from pyfixest.estimation.estimation import feols, fepois 7 | from pyfixest.report.summarize import _select_order_coefs, etable, summary 8 | from pyfixest.utils.utils import get_data 9 | 10 | 11 | def test_summary(): 12 | """Just run etable() and summary() on a few models.""" 13 | df1 = get_data() 14 | df1 = pd.concat( 15 | [df1, df1], axis=0 16 | ) # Make it a bit larger, for examining the thousands separator 17 | df2 = get_data(model="Fepois") 18 | 19 | fit1 = feols("Y ~ X1 + X2 | f1", data=df1) 20 | fit1a = feols("Y ~ X1 + X2 + f1", data=df1) 21 | fit2 = fepois("Y ~ X1 + X2 + f2 | f1", data=df2, vcov={"CRV1": "f1+f2"}) 22 | fit3 = feols("Y ~ X1", data=df1) 23 | fit4 = feols("Y ~ X1", data=df1, weights="weights") 24 | fit5 = feols("Y ~ 1 | Z1 ~ X1", data=df1) 25 | 26 | 
summary(fit1) 27 | summary(fit2) 28 | summary([fit1, fit2]) 29 | summary([fit4]) 30 | fit5.summary() 31 | 32 | etable(fit1) 33 | etable(fit2) 34 | etable([fit1, fit2]) 35 | 36 | etable([fit3]) 37 | etable([fit1, fit2, fit3]) 38 | 39 | fit_iv = feols("Y ~ X2 | f1 | X1 ~ Z1", data=df1) 40 | etable([fit_iv, fit1]) 41 | 42 | fit_multi = feols("Y + Y2 ~ X1 + X2 | f1", data=df1) 43 | etable(fit_multi.to_list()) 44 | 45 | # Test significance code 46 | etable([fit1, fit2], signif_code=[0.01, 0.05, 0.1]) 47 | etable([fit1, fit2], signif_code=[0.02, 0.06, 0.1]) 48 | 49 | # Test coefficient format 50 | etable([fit1, fit2], coef_fmt="b (se)\nt [p]") 51 | 52 | # Test custom statistics 53 | etable( 54 | models=[fit1, fit2], 55 | custom_stats={ 56 | "conf_int_lb": [fit1._conf_int[0], fit2._conf_int[0]], 57 | "conf_int_ub": [fit1._conf_int[1], fit2._conf_int[1]], 58 | }, 59 | coef_fmt="b [conf_int_lb, conf_int_ub]", 60 | ) 61 | 62 | # Test scientific notation 63 | etable( 64 | models=[fit1], 65 | custom_stats={ 66 | "test_digits": [[0.1, 12300]], 67 | }, 68 | coef_fmt="b [test_digits]", 69 | digits=2, 70 | ) 71 | 72 | # Test scientific notation, thousands separator 73 | etable( 74 | models=[fit1], 75 | custom_stats={ 76 | "test_digits": [[0.1, 12300]], 77 | }, 78 | coef_fmt="b [test_digits]", 79 | digits=2, 80 | scientific_notation=False, 81 | thousands_sep=True, 82 | ) 83 | 84 | # Test select / order coefficients 85 | etable([fit1, fit2, fit3], coef_fmt="b (se)\nt [p]") 86 | etable([fit1, fit2, fit3], coef_fmt="b (se)\nt [p]", keep=["X1", "cep"]) 87 | etable([fit1, fit2, fit3], coef_fmt="b (se)\nt [p]", drop=[r"\d$"]) 88 | etable([fit1, fit2, fit3], coef_fmt="b (se)\nt [p]", keep=[r"\d"], drop=["f"]) 89 | etable([fit1, fit2, fit3], coef_fmt="b (se)\nt [p]", keep="X") 90 | etable([fit1, fit2, fit3], coef_fmt="b (se)\nt [p]", drop=r"\d$") 91 | 92 | # test labels, felabels args 93 | etable([fit1, fit1a], labels={"X1": "X1_label"}, felabels={"f1": "f1_label"}) 94 | etable( 95 | [fit1, fit1a], labels={"X1": "X1_label"}, felabels={"f1": "f1_label"}, keep="X1" 96 | ) 97 | etable( 98 | [fit1, fit1a], labels={"X1": "X1_label"}, felabels={"f1": "f1_label"}, drop="X1" 99 | ) 100 | etable([fit1, fit1a], felabels={"f1": "f1_renamed2"}, keep=["f1"]) 101 | 102 | cols = ["x1", "x2", "x11", "x21"] 103 | assert _select_order_coefs(cols, keep=["x1"]) == ["x1", "x11"] 104 | assert _select_order_coefs(cols, drop=["x1"]) == ["x2", "x21"] 105 | assert _select_order_coefs(cols, keep=["x1"], exact_match=True) == ["x1"] 106 | assert _select_order_coefs(cols, drop=["x1"], exact_match=True) == [ 107 | "x2", 108 | "x11", 109 | "x21", 110 | ] 111 | 112 | # API tests for new tex args 113 | 114 | etable([fit1, fit2], type="tex") 115 | etable([fit1, fit2], type="tex", print_tex=True) 116 | 117 | etable([fit1, fit2], type="tex", notes="You can add notes here.") 118 | etable([fit1, fit2], type="md", notes="You can add notes here.") 119 | 120 | etable([fit1, fit2], type="tex", model_heads=["Model 1", "Model 2"]) 121 | etable( 122 | [fit1, fit2], type="tex", model_heads=["Model 1", "Model 2"], head_order="dh" 123 | ) 124 | etable( 125 | [fit1, fit2], type="tex", model_heads=["Model 1", "Model 2"], head_order="hd" 126 | ) 127 | etable([fit1, fit2], type="tex", model_heads=["Model 1", "Model 2"], head_order="d") 128 | etable([fit1, fit2], type="tex", model_heads=["Model 1", "Model 2"], head_order="h") 129 | etable([fit1, fit2], type="tex", model_heads=["Model 1", "Model 2"], head_order="") 130 | etable([fit1, fit2], type="tex", 
filename="tests/texfiles/test.tex") 131 | 132 | 133 | @pytest.mark.skip("Pyfixest PR is not yet merged into stargazer.") 134 | def test_stargazer(): 135 | data = pf.get_data() 136 | 137 | fit = pf.feols("Y ~ X1", data=data) 138 | fit_smf = smf.ols("Y ~ X1", data=data).fit() 139 | 140 | pf.Stargazer([fit, fit_smf]) 141 | -------------------------------------------------------------------------------- /tests/test_wildboottest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import pyfixest as pf 5 | from pyfixest.utils.utils import get_data, ssc 6 | 7 | 8 | @pytest.fixture 9 | def data(): 10 | return get_data(N=2_000, seed=9) 11 | 12 | 13 | # note - tests currently fail because of ssc adjustments 14 | @pytest.mark.parametrize("fml", ["Y~X1", "Y~X1|f1", "Y~X1|f1+f2"]) 15 | def test_hc_equivalence(data, fml): 16 | ssc = pf.ssc(adj=False, cluster_adj=False) 17 | # note: cannot turn off ssc for wildboottest HC 18 | fixest = pf.feols(fml=fml, data=data, ssc=ssc, vcov="hetero") 19 | tstat = fixest.tstat().xs("X1") 20 | boot = fixest.wildboottest(param="X1", reps=999) 21 | boot_tstat = boot["t value"] 22 | ssc = boot["ssc"] 23 | 24 | # cannot test for equality because of ssc adjustments 25 | np.testing.assert_allclose(tstat / boot_tstat, np.sqrt(ssc)) 26 | 27 | 28 | @pytest.mark.parametrize("fml", ["Y~X1", "Y~X1|f1", "Y~X1|f1+f2"]) 29 | def test_crv1_equivalence(data, fml): 30 | fixest = pf.feols( 31 | fml, data=data, vcov={"CRV1": "group_id"}, ssc=ssc(adj=False, cluster_adj=False) 32 | ) 33 | tstat = fixest.tstat().xs("X1") 34 | boot_tstat = fixest.wildboottest( 35 | param="X1", reps=999, adj=False, cluster_adj=False 36 | )["t value"] 37 | 38 | np.testing.assert_allclose(tstat, boot_tstat) 39 | -------------------------------------------------------------------------------- /tests/test_wls_types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | import pyfixest as pf 5 | 6 | 7 | # @pytest.mark.skip(reason="Bug for fweights and heteroskedastic errors.") 8 | def test_fweights_ols(): 9 | "Test that the fweights are correctly implemented for OLS models." 10 | # Fepois model for discrete Y 11 | data = pf.get_data(model="Fepois") 12 | data2_w = ( 13 | data[["Y", "X1"]] 14 | .groupby(["Y", "X1"]) 15 | .size() 16 | .reset_index() 17 | .rename(columns={0: "count"}) 18 | ) 19 | data3_w = ( 20 | data[["Y", "X1", "f1"]] 21 | .groupby(["Y", "X1", "f1"]) 22 | .size() 23 | .reset_index() 24 | .rename(columns={0: "count"}) 25 | ) 26 | 27 | fit1 = pf.feols("Y ~ X1", data=data, ssc=pf.ssc(adj=False, cluster_adj=False)) 28 | fit2 = pf.feols( 29 | "Y ~ X1", 30 | data=data2_w, 31 | weights="count", 32 | weights_type="fweights", 33 | ssc=pf.ssc(adj=False, cluster_adj=False), 34 | ) 35 | 36 | assert fit1._N == fit2._N, "Number of observations is not the same."
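# The tidy() comparison guarded by `if False:` below is currently disabled: fweights disagree with the expanded-data results for heteroskedastic errors (see the commented-out skip marker at the top of this file).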
37 | 38 | if False: 39 | np.testing.assert_allclose(fit1.tidy().values, fit2.tidy().values) 40 | 41 | np.testing.assert_allclose(fit1.vcov("HC1")._vcov, fit2.vcov("HC1")._vcov) 42 | np.testing.assert_allclose(fit1.vcov("HC2")._vcov, fit2.vcov("HC2")._vcov) 43 | np.testing.assert_allclose(fit1.vcov("HC3")._vcov, fit2.vcov("HC3")._vcov) 44 | 45 | fit3 = pf.feols("Y ~ X1 | f1", data=data) 46 | fit4 = pf.feols( 47 | "Y ~ X1 | f1", data=data3_w, weights="count", weights_type="fweights" 48 | ) 49 | np.testing.assert_allclose(fit3.tidy().values, fit4.tidy().values) 50 | np.testing.assert_allclose( 51 | fit3.vcov({"CRV3": "f1"})._vcov, fit4.vcov({"CRV3": "f1"})._vcov 52 | ) 53 | np.testing.assert_allclose(fit1.vcov("HC1")._vcov, fit2.vcov("HC1")._vcov) 54 | np.testing.assert_allclose(fit1.vcov("HC2")._vcov, fit2.vcov("HC2")._vcov) 55 | np.testing.assert_allclose(fit1.vcov("HC3")._vcov, fit2.vcov("HC3")._vcov) 56 | 57 | 58 | @pytest.mark.skip(reason="Not implemented yet.") 59 | def test_fweights_iv(): 60 | data = pf.get_data() 61 | data2_w = ( 62 | data[["Y", "X1", "Z1"]] 63 | .groupby(["Y", "X1", "Z1"]) 64 | .size() 65 | .reset_index() 66 | .rename(columns={0: "count"}) 67 | ) 68 | 69 | fit1 = pf.feols("Y ~ 1 | X1 ~ Z1", data=data) 70 | fit2 = pf.feols( 71 | "Y ~ 1 | X1 ~ Z1", data=data2_w, weights="count", weights_type="fweights" 72 | ) 73 | np.testing.assert_allclose(fit1.tidy().values, fit2.tidy().values) 74 | 75 | data3_w = ( 76 | data[["Y", "X1", "Z1", "f1"]] 77 | .groupby(["Y", "X1", "Z1", "f1"]) 78 | .size() 79 | .reset_index() 80 | .rename(columns={0: "count"}) 81 | ) 82 | 83 | fit3 = pf.feols("Y ~ 1 | f1 | X1 ~ Z1 ", data=data.dropna(), vcov={"CRV1": "f1"}) 84 | fit4 = pf.feols( 85 | "Y ~ 1 | f1 | X1 ~ Z1", 86 | data=data3_w.dropna(), 87 | weights="count", 88 | weights_type="fweights", 89 | vcov={"CRV1": "f1"}, 90 | ) 91 | np.testing.assert_allclose(fit3.tidy().values, fit4.tidy().values) 92 | 93 | 94 | def test_aweights(): 95 | data = pf.get_data() 96 | data["weights"] = np.ones(data.shape[0]) 97 | 98 | fit1 = pf.feols("Y ~ X1", data=data) 99 | fit2 = pf.feols("Y ~ X1", data=data, weights_type="aweights") 100 | fit3 = pf.feols("Y ~ X1", data=data, weights="weights", weights_type="aweights") 101 | 102 | np.testing.assert_allclose(fit1.tidy().values, fit2.tidy().values) 103 | np.testing.assert_allclose(fit1.tidy().values, fit3.tidy().values) 104 | -------------------------------------------------------------------------------- /tests/texfiles/test.tex: -------------------------------------------------------------------------------- 1 | \renewcommand\cellalign{t} 2 | \begin{threeparttable} 3 | \begin{tabular}{lcc} 4 | \toprule 5 | & \multicolumn{2}{c}{Y} \\ 6 | \cmidrule(lr){2-3} 7 | & (1) & (2) \\ 8 | \midrule 9 | X1 & \makecell{-0.950*** \\ (0.067)} & \makecell{0.004 \\ (0.033)} \\ 10 | X2 & \makecell{-0.174*** \\ (0.018)} & \makecell{-0.014 \\ (0.011)} \\ 11 | f2 & & \makecell{0.003 \\ (0.004)} \\ 12 | \midrule 13 | f1 & x & x \\ 14 | \midrule 15 | Observations & 1994 & 997 \\ 16 | S.E. type & by: f1 & by: f1+f2 \\ 17 | $R^2$ & 0.489 & - \\ 18 | \bottomrule 19 | \end{tabular} 20 | \footnotesize Significance levels: $*$ p $<$ 0.05, $**$ p $<$ 0.01, $***$ p $<$ 0.001. Format of coefficient cell: Coefficient 21 | (Std. Error) 22 | \end{threeparttable} 23 | --------------------------------------------------------------------------------