├── requirements-test.txt ├── requirements.txt ├── .gitignore ├── Dockerfile ├── hdfe ├── __init__.py ├── multicollinearity.py ├── groupby.py └── hdfe.py ├── README.md ├── setup.py ├── tests ├── test_multicollinearity_funcs.py ├── test_groupby.py ├── test_hdfe_funcs.py └── profile_groupby.py └── LICENSE /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.cppimporthash 2 | *.swp 3 | *.pyc 4 | .rendered.*.cpp 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | ADD . /home/jovyan/hdfe 4 | WORKDIR /home/jovyan/hdfe 5 | 6 | USER root 7 | RUN python setup.py install 8 | RUN pip install -r requirements-test.txt 9 | CMD ["pytest"] 10 | -------------------------------------------------------------------------------- /hdfe/__init__.py: -------------------------------------------------------------------------------- 1 | from .groupby import Groupby 2 | from .multicollinearity import remove_collinear_cols, find_collinear_cols 3 | from .hdfe import make_lags 4 | 5 | __all__ = ["Groupby", "remove_collinear_cols", "find_collinear_cols", "make_lags"] 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This package contains functionality useful for econometric work with panel data. 2 | Its name, originally standing for "high-dimensional fixed effects," is now misleading. 3 | 4 | Useful features are 5 | * Groupby: A class allowing for fast operations similar to Pandas groupby-apply and groupby-transform 6 | functionality, but performing significantly faster with user-written functions. See 7 | documentation [here](http://esantorella.com/2016/06/16/groupby/). 8 | * find_collinear_cols and remove_collinear_cols: Functions 9 | for dealing with multicollinearity which operate quickly on CSC matrices. 
10 | * make_lags: Makes within-group lags (frequently useful with panel data) 11 | 12 | You can install hdfe through pip: "pip install hdfe" 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | packages=["hdfe"], 8 | install_requires=["numpy", "pandas>=0.25.0", "scipy"], 9 | long_description=long_description, 10 | long_description_content_type="text/markdown", 11 | name="hdfe", 12 | version="0.0.4", 13 | description="Econometric tools for working with panel data and fixed effects", 14 | url="https://github.com/esantorella/hdfe/", 15 | author="Elizabeth Santorella", 16 | author_email="elizabeth.santorella@gmail.com", 17 | license="MIT", 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /tests/test_multicollinearity_funcs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse as sps 3 | from hdfe.multicollinearity import ( 4 | remove_cols_from_csc, 5 | find_collinear_cols, 6 | remove_collinear_cols, 7 | ) 8 | 9 | 10 | def test_remove_cols_from_csc() -> None: 11 | x = sps.eye(4, dtype=int).tocsc() 12 | cols_to_remove = [1, 2] 13 | result = remove_cols_from_csc(x, cols_to_remove) 14 | expected_result = np.array([[1, 0], [0, 0], [0, 0], [0, 1]]) 15 | np.testing.assert_equal(result.A, expected_result) 16 | 17 | 18 | def test_find_collinear_cols() -> None: 19 | x = np.array([[1, 1], [0, 0]]) 20 | collinear, not_collinear = find_collinear_cols(x) 21 | assert collinear == [1] 22 | assert not_collinear == [0] 23 | 24 | 25 | def test_remove_collinear_cols() -> None: 26 | x = np.array([[1, 1], [0, 0]]) 27 | res = remove_collinear_cols(x) 28 | expected = np.array([[1], [0]]) 29 | np.testing.assert_equal(res, expected) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /tests/test_groupby.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from hdfe import Groupby 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def df() -> pd.DataFrame: 9 | np.random.seed(0) 10 | n_obs = 100 11 | n_categories = 10 12 | return pd.DataFrame( 13 | { 14 | "first category": np.random.choice(n_categories, n_obs), 15 | "y": np.random.normal(0, 1, n_obs), 16 | } 17 | ) 18 | 19 | 20 | def test_groupby_apply_mean(df: pd.DataFrame) -> None: 21 | pandas_results = df.groupby("first category")[["y"]].mean() 22 | groupby_results = Groupby(df["first category"]).apply( 23 | np.mean, df["y"], broadcast=False, as_dataframe=True 24 | ) 25 | pd.testing.assert_frame_equal(pandas_results, groupby_results) 26 | 27 | 28 | def test_groupby_transform_mean(df: pd.DataFrame) -> None: 29 | pandas_results = df.groupby("first category")["y"].transform("mean") 30 | groupby_results = Groupby(df["first category"]).apply( 31 | np.mean, df["y"], broadcast=True, as_dataframe=True 32 | ) 33 | np.testing.assert_almost_equal( 34 | pandas_results.values, np.squeeze(groupby_results.values) 35 | ) 36 | -------------------------------------------------------------------------------- /tests/test_hdfe_funcs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from hdfe.hdfe import make_dummies, get_all_dummies 4 | 5 | 6 | def test_make_dummies_arr() -> None: 7 | x = np.array([1, 0, 0]) 8 | results = make_dummies(x, False) 9 | expected = np.array([[0, 1], [1, 0], [1, 0]], dtype=float) 10 | np.testing.assert_almost_equal(results.A, expected) 11 | 12 | 13 | def test_make_dummies_ser() -> None: 14 | x = pd.Series([1, 0, 0]) 15 | results = make_dummies(x, False) 16 | expected = np.array([[0, 1], [1, 0], [1, 0]], dtype=float) 17 | np.testing.assert_almost_equal(results.A, expected) 18 | 19 | 20 | def test_make_dummies_cat() -> None: 21 | x = pd.Series(["horse", "cat", "cat"]).astype("category") 22 | results = make_dummies(x, False) 23 | expected = np.array([[0, 1], [1, 0], [1, 0]], dtype=float) 24 | np.testing.assert_almost_equal(results.A, expected) 25 | 26 | 27 | def test_make_dummies_arr_drop() -> None: 28 | x = np.array([1, 0, 0]) 29 | results = make_dummies(x, True) 30 | expected = np.array([[0], [1], [1]], dtype=float) 31 | np.testing.assert_almost_equal(results.A, expected) 32 | 33 | 34 | def test_make_dummies_ser_drop() -> None: 35 | x = pd.Series([1, 0, 0]) 36 | results = make_dummies(x, True) 37 | expected = np.array([[0], [1], [1]], dtype=float) 38 | np.testing.assert_almost_equal(results.A, expected) 39 | 40 | 41 | def test_make_dummies_cat_drop() -> None: 42 | x = pd.Series(["horse", "cat", "cat"]).astype("category") 43 | results = make_dummies(x, True) 44 | expected = np.array([[0], [1], [1]], dtype=float) 45 | np.testing.assert_almost_equal(results.A, expected) 46 | 47 | 48 | def test_get_all_dummies() -> None: 49 | x = np.array([[0, 0], [1, 0], [0, 1]]) 50 | result = get_all_dummies(x) 51 | expected = np.array([[1, 0, 1], [0, 1, 1], [1, 0, 0]], dtype=float) 52 | np.testing.assert_almost_equal(result.A, expected) 53 | -------------------------------------------------------------------------------- /hdfe/multicollinearity.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Iterable, 
List, Tuple, Union 3 | 4 | import numpy as np 5 | import scipy.sparse as sps 6 | 7 | 8 | def remove_cols_from_csc( 9 | x: sps.csc_matrix, cols_to_remove: Iterable[int] 10 | ) -> sps.spmatrix: 11 | """ 12 | Efficiently removes columns from a CSC sparse matrix by efficiently editing the 13 | underlying data. 14 | :param x: CSC sparse matrix 15 | :param cols_to_remove: 16 | :return: CSC sparse matrix 17 | 18 | >>> from scipy import sparse as sps 19 | >>> x = sps.eye(3, dtype=int).tocsc() 20 | >>> cols_to_remove = [1] 21 | >>> remove_cols_from_csc(x, cols_to_remove).A 22 | array([[1, 0], 23 | [0, 0], 24 | [0, 1]]) 25 | """ 26 | 27 | if not sps.issparse(x): 28 | raise ValueError 29 | 30 | if not sps.isspmatrix_csc(x): 31 | raise ValueError("Can only remove columns from a csc matrix.") 32 | 33 | def remove_one_col(idx: List[int], ptr_: np.ndarray, data_: List[int], col_: int): 34 | n_elts_to_remove = ptr_[col_ + 1] - ptr_[col_] 35 | idx = idx[: ptr_[col_]] + idx[ptr_[col_ + 1] :] 36 | data_ = data_[: ptr_[col_]] + data_[ptr_[col_ + 1] :] 37 | ptr_ = np.concatenate((ptr_[:col_], ptr_[col_ + 1 :] - n_elts_to_remove)) 38 | return data_, idx, ptr_ 39 | 40 | indices = list(x.indices) 41 | ptr = x.indptr 42 | data = list(x.data) 43 | 44 | for i, col in enumerate(cols_to_remove): 45 | data, indices, ptr = remove_one_col(indices, ptr, data, col - i) 46 | 47 | return sps.csc_matrix((data, indices, ptr)) 48 | 49 | 50 | def find_collinear_cols( 51 | x: Union[np.ndarray, sps.spmatrix], tol: float = 10 ** (-12), verbose: bool = False 52 | ) -> Tuple[List[int], List[int]]: 53 | """ 54 | Identifies a minimal subset of columns of x that, when removed, make x full rank. 55 | Note that there may be many such subsets. This function relies on a QR decomposition 56 | and may be numerically unstable. 57 | 58 | :param x: Numpy array or something that can be converted to a Numpy array. It will 59 | be converted to a Numpy array. 60 | :param tol: A higher tolerance leads to erring on the side of identifying more 61 | columns as collinear. 
62 |     :param verbose: If True, print diagnostic information.
63 |     :return: List of columns that when removed make x full rank, and a list of all of
64 |         the other columns
65 | 
66 |     >>> x = np.array([[1, 1], [0, 0]])
67 |     >>> x
68 |     array([[1, 1],
69 |            [0, 0]])
70 |     >>> find_collinear_cols(x)
71 |     ([1], [0])
72 |     """
73 |     k = x.shape[1]
74 |     x = x.A if sps.issparse(x) else np.asarray(x)
75 |     if x.shape[0] == k:
76 |         rank = np.linalg.matrix_rank(x)
77 |     else:
78 |         rank = np.linalg.matrix_rank((x.T.dot(x)))
79 |     full_rank = rank == k
80 | 
81 |     if full_rank:
82 |         if verbose:
83 |             print("Full rank")
84 |         return [], list(range(k))
85 | 
86 |     _, r = np.linalg.qr(x)
87 |     row = 0
88 | 
89 |     non_collinear_cols = []
90 |     collinear_cols = []
91 |     min_not_deleted = 1
92 |     for col in range(r.shape[1]):
93 |         if row >= r.shape[0]:
94 |             collinear_cols += list(range(col, r.shape[1]))
95 |             break
96 |         if abs(r[row, col]) < tol:
97 |             collinear_cols.append(col)
98 |         else:
99 |             non_collinear_cols.append(col)
100 |             min_not_deleted = min(min_not_deleted, abs(r[row, col]))
101 |             row += 1
102 |     if verbose:
103 |         print("Minimum not deleted:", min_not_deleted)
104 |         print("Number collinear", len(collinear_cols))
105 |     if len(non_collinear_cols) != rank:
106 |         warnings.warn(f"Rank is {rank}, but {len(non_collinear_cols)} non-collinear columns remain.")
107 | 
108 |     return collinear_cols, non_collinear_cols
109 | 
110 | 
111 | def remove_collinear_cols(
112 |     x: Union[sps.spmatrix, np.ndarray], verbose: bool = False
113 | ) -> Union[sps.spmatrix, np.ndarray]:
114 |     """
115 |     Removes a minimal subset of columns from x such that x becomes full rank. Note that
116 |     these columns are not uniquely defined.
117 | 
118 |     >>> x = np.array([[1, 1], [0, 0]])
119 |     >>> remove_collinear_cols(x)
120 |     array([[1],
121 |            [0]])
122 |     """
123 |     collinear, not_collinear = find_collinear_cols(x, verbose=verbose)
124 |     if len(collinear) == 0:
125 |         if verbose:
126 |             print("No collinear columns")
127 |         return x
128 |     if verbose:
129 |         print("Number of collinear columns:", len(collinear))
130 |         print("Number of non-collinear columns:", len(not_collinear))
131 | 
132 |     if isinstance(x, sps.coo_matrix):
133 |         x = x.tocsc()
134 |     if isinstance(x, sps.csc_matrix):
135 |         return remove_cols_from_csc(x, collinear)
136 |     if isinstance(x, np.ndarray):
137 |         return x[:, not_collinear]
138 |     raise TypeError(f"Not implemented for type {type(x)}")
139 | 
--------------------------------------------------------------------------------
/tests/profile_groupby.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | from hdfe import Groupby
3 | import numpy as np
4 | import pandas as pd
5 | import time
6 | 
7 | """
8 | Generates the results in blog post
9 | http://esantorella.com/2016/06/16/groupby/
10 | """
11 | 
12 | n_iters = 1000
13 | n_decimals = 4
14 | 
15 | 
16 | def print_results(pandas_1, pandas_100, groupby_1, groupby_100):
17 |     print(
18 |         "time to compute group means once with Pandas: {0}".format(
19 |             round(pandas_1, n_decimals)
20 |         )
21 |     )
22 |     print(
23 |         "time to compute group means {0} times with Pandas: {1}".format(
24 |             n_iters, round(pandas_100, n_decimals)
25 |         )
26 |     )
27 |     print(
28 |         "time to compute group means once with Grouped: {0}".format(
29 |             round(groupby_1, n_decimals)
30 |         )
31 |     )
32 |     print(
33 |         "time to compute group means {0} times with Grouped: {1}".format(
34 |             n_iters, round(groupby_100, n_decimals)
35 |         )
36 |     )
37 |     print("Improvement", groupby_100 / pandas_100)
38 |     return
39 | 
40 | 
41 | def get_transform_comparisons(f: Callable, data: pd.DataFrame):
42 |     start = time.perf_counter()
43 |     pandas_answer = data.groupby("first category")["y"].transform(f)
44 |     pandas_1 = time.perf_counter() - start
45 | 
46 |     start = time.perf_counter()
47 |     grouped = data.groupby("first category")["y"]
48 |     for i in range(n_iters):
49 |         grouped.transform(f)
50 | 
51 |     pandas_100 = time.perf_counter() - start
52 | 
53 |     # Compute group means using Grouped class
54 |     start = time.perf_counter()
55 |     y = data["y"].values
56 |     first_category = data["first category"].values
57 |     group_means = Groupby(first_category).apply(f, y)
58 |     groupby_one = time.perf_counter() - start
59 |     np.testing.assert_almost_equal(pandas_answer.values, group_means)
60 | 
61 |     start = time.perf_counter()
62 |     grouped = Groupby(first_category)
63 |     for _ in range(n_iters):
64 |         grouped.apply(f, y)
65 | 
66 |     groupby_100 = time.perf_counter() - start
67 |     return pandas_1, pandas_100, groupby_one, groupby_100
68 | 
69 | 
70 | def get_apply_comparisons(f: Callable, data: pd.DataFrame):
71 |     start = time.perf_counter()
72 |     pandas_answer = data.groupby("first category")["y"].apply(f)
73 |     pandas_1 = time.perf_counter() - start
74 | 
75 |     start = time.perf_counter()
76 |     grouped = data.groupby("first category")["y"]
77 |     if f == np.mean:
78 |         for i in range(n_iters):
79 |             grouped.mean()
80 |     else:
81 |         for i in range(n_iters):
82 |             grouped.apply(f)
83 | 
84 |     pandas_100 = time.perf_counter() - start
85 | 
86 |     # Compute group means using Grouped class
87 |     start = time.perf_counter()
88 |     first_category = data["first category"].values
89 |     y = data["y"].values
90 |     group_means = Groupby(first_category).apply(f, y, broadcast=False)
91 |     groupby_one = time.perf_counter() - start
92 | 
93 |     np.testing.assert_almost_equal(pandas_answer.values, group_means)
94 | 
95 |     start = time.perf_counter()
96 |     grouped = Groupby(first_category)
97 |     for _ in range(n_iters):
98 |         grouped.apply(f, y, broadcast=False)
99 | 
100 |     groupby_100 = time.perf_counter() - start
101 |     return pandas_1, pandas_100, groupby_one, groupby_100
102 | 
103 | 
104 | def f(x):
105 |     return np.mean(x)
106 | 
107 | 
108 | def make_result_df(df: pd.DataFrame):
109 | 
110 |     result_df = pd.DataFrame(
111 |         columns=["Pandas", "Groupby"],
112 |         index=pd.MultiIndex.from_product(
113 |             (["Apply", "Transform"], ["Cython", "Python"])
114 |         ),
115 |         data=np.zeros((4, 2)),
116 |     )
117 | 
118 |     print(
119 |         """\nTransform, np.mean: With the np.mean function, Pandas uses Cython and
120 | does great"""
121 |     )
122 |     results = get_transform_comparisons(np.mean, df)
123 |     print_results(*results)
124 |     result_df.loc[("Transform", "Cython"), :] = [results[1], results[3]]
125 | 
126 |     print("\nTransform, user-defined: Without Cython, Pandas is terrible")
127 |     results = get_transform_comparisons(f, df)
128 |     print_results(*results)
129 |     result_df.loc[("Transform", "Python"), :] = [results[1], results[3]]
130 | 
131 |     print(
132 |         "\nApply, np.mean: With the np.mean function, Pandas uses Cython and does great"
133 |     )
134 |     results = get_apply_comparisons(np.mean, df)
135 |     print_results(*results)
136 |     result_df.loc[("Apply", "Cython"), :] = [results[1], results[3]]
137 | 
138 |     print("\nApply, user-defined: Without Cython, Pandas is terrible")
139 |     results = get_apply_comparisons(f, df)
140 |     print_results(*results)
141 |     result_df.loc[("Apply", "Python"), :] = [results[1], results[3]]
142 | 
143 |     result_df /= result_df.values[0, 0]
144 |     return result_df.apply(lambda x: np.round(x, 1))
145 | 
146 | 
147 | 
def main(): 148 | # Compute group means using Pandas groupby 149 | np.random.seed(int("hi", 36)) 150 | n_obs = 10 ** 4 151 | n_categories = 10 ** 2 152 | 153 | df = pd.DataFrame( 154 | { 155 | "first category": np.random.choice(n_categories, n_obs), 156 | "y": np.random.normal(0, 1, n_obs), 157 | } 158 | ) 159 | assert not Groupby(df["first category"]).already_sorted 160 | result_table = make_result_df(df) 161 | print(result_table) 162 | 163 | # Try again when already sorted 164 | df.sort_values("first category", inplace=True) 165 | assert Groupby(df["first category"]).already_sorted 166 | result_table = make_result_df(df) 167 | print(result_table) 168 | return 169 | 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /hdfe/groupby.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, Callable, Any 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | # TODO: Reimplent CGroupby 7 | class Groupby: 8 | def __init__(self, keys: Union[np.ndarray, pd.Series]): 9 | """ 10 | 11 | :param keys: List of group identifiers. Both __init__ and apply will run 12 | much faster if keys is already sorted. 13 | """ 14 | self.keys = keys 15 | try: 16 | already_sorted = np.issubdtype(keys.dtype, np.number) and ( 17 | np.all(np.diff(keys) >= 0) 18 | ) 19 | except ValueError: 20 | already_sorted = False 21 | if already_sorted: 22 | keys = np.squeeze(keys) 23 | if keys.ndim > 1: 24 | raise ValueError("keys should be 1-dimensional") 25 | 26 | self.already_sorted = True 27 | new_idx = np.concatenate(([1], np.diff(keys) != 0)) 28 | self.first_occurrences = np.where(new_idx)[0] 29 | self.keys_as_int: np.ndarray = np.cumsum(new_idx) - 1 30 | assert isinstance(self.keys_as_int, np.ndarray) 31 | self.n_keys = self.keys_as_int[-1] + 1 32 | 33 | else: 34 | self.already_sorted = False 35 | _, self.first_occurrences, self.keys_as_int = np.unique( 36 | keys, return_index=True, return_inverse=True 37 | ) 38 | self.n_keys = max(self.keys_as_int) + 1 39 | self.indices = self._set_indices() 40 | 41 | def _set_indices(self): 42 | if self.already_sorted: 43 | indices = [ 44 | slice(i, j) 45 | for i, j in zip(self.first_occurrences[:-1], self.first_occurrences[1:]) 46 | ] 47 | indices.append(slice(self.first_occurrences[-1], len(self.keys_as_int))) 48 | indices = np.array(indices) 49 | else: 50 | indices = [[] for _ in range(self.n_keys)] 51 | for i, k in enumerate(self.keys_as_int): 52 | indices[k].append(i) 53 | indices = np.array([np.array(elt) for elt in indices]) 54 | return indices 55 | 56 | def apply( 57 | self, 58 | function_: Callable[[np.ndarray], Any], 59 | array: Union[np.ndarray, pd.Series], 60 | broadcast: bool = True, 61 | shape: Tuple = None, 62 | order: str = "c", 63 | as_dataframe: bool = False, 64 | ): 65 | """ 66 | Applies a function to each group, where groups are defined by self.keys_as_int 67 | (or, equivalently, as the argument of __init__.) 68 | If broadcast=True, first dimension of output will equal first dimension of 69 | "array", as in Pandas "transform". 70 | If broadcast=False, first dimension of output equals self.n_keys, as in Pandas 71 | "groupby". 72 | 73 | :param function_: function to be applied to each group 74 | :param array: np.ndarray or similar. Should have same first dimension as 75 | self.keys_as_int. 76 | :param broadcast: bool 77 | :param shape: Shape of output. Can be up to 3-dimensional. 
78 | First dimension must be array.shape[0] (if broadcast=True) 79 | or self.n_keys (if broadcast=False). Default is for output to be 80 | one-dimensional. 81 | :param order: Should output be c-ordered or fortran-ordered? 82 | :param as_dataframe: if False, returns output as ndarray; if True, returns 83 | output 84 | as DataFrame with keys as indices 85 | :return: 86 | """ 87 | if isinstance(array, pd.Series): 88 | names = [array.name] 89 | array = np.asarray(array) 90 | elif isinstance(array, pd.DataFrame): 91 | names = array.columns 92 | array = array.values 93 | else: 94 | names = [None] 95 | 96 | assert isinstance(array, np.ndarray) 97 | 98 | if broadcast: 99 | result = np.zeros(array.shape[0] if shape is None else shape, order=order) 100 | assert result.shape[0] == array.shape[0] 101 | 102 | # np.take doesn't allow slice arguments, so this has to be more verbose 103 | # than when not already sorted 104 | if self.already_sorted: 105 | if array.ndim == 1: 106 | for idx in self.indices: 107 | result[idx] = function_(array[idx]) 108 | elif array.ndim == 2: 109 | for idx in self.indices: 110 | result[idx] = function_(array[idx, :]) 111 | elif array.ndim == 3: 112 | for idx in self.indices: 113 | result[idx] = function_(array[idx, :, :]) 114 | else: 115 | raise NotImplementedError("Can't have more than 3 dims") 116 | else: 117 | for idx in self.indices: 118 | result[idx] = function_(np.take(array, idx, 0)) 119 | if as_dataframe: 120 | return pd.DataFrame(index=self.keys, data=result) 121 | return result 122 | 123 | result = np.zeros(self.n_keys if shape is None else shape, order=order) 124 | assert result.shape[0] == self.n_keys 125 | if self.already_sorted: 126 | if array.ndim == 1: 127 | for k, idx in enumerate(self.indices): 128 | result[k] = function_(array[idx]) 129 | elif array.ndim == 2: 130 | for k, idx in enumerate(self.indices): 131 | result[k] = function_(array[idx, :]) 132 | elif array.ndim == 3: 133 | for k, idx in enumerate(self.indices): 134 | result[k] = function_(array[idx, :, :]) 135 | else: 136 | raise NotImplementedError("Can't have more than 3 dims") 137 | 138 | else: 139 | for first_occurrence, idx in zip(self.first_occurrences, self.indices): 140 | result[self.keys_as_int[first_occurrence]] = function_( 141 | np.take(array, idx, 0) 142 | ) 143 | 144 | if as_dataframe: 145 | return pd.DataFrame( 146 | index=self.keys[self.first_occurrences], data=result, columns=names 147 | ) 148 | return result 149 | -------------------------------------------------------------------------------- /hdfe/hdfe.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | from typing import Iterable, Tuple, List, Union, Dict 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import scipy.linalg 7 | import scipy.sparse as sps 8 | 9 | from .groupby import Groupby 10 | from .multicollinearity import find_collinear_cols, remove_cols_from_csc 11 | 12 | 13 | # TODO: update link on personal website 14 | def make_dummies(elt: Union[np.ndarray, pd.Series], drop_col: bool) -> sps.spmatrix: 15 | try: 16 | if elt.dtype == "category": 17 | elt = elt.cat.codes 18 | except TypeError: 19 | pass 20 | already_sorted = ( 21 | np.issubdtype(elt.dtype, np.integer) 22 | and np.min(elt) == 0 23 | and np.max(elt) == len(set(elt)) - 1 24 | ) 25 | if not already_sorted: 26 | _, elt = np.unique(elt, return_inverse=True) 27 | 28 | dummies = sps.csc_matrix((np.ones(len(elt)), (range(len(elt)), elt))) 29 | if drop_col: 30 | return dummies[:, :-1] 31 | 
else: 32 | return dummies 33 | 34 | 35 | def get_all_dummies(categorical_data: Union[np.ndarray, sps.spmatrix]) -> sps.spmatrix: 36 | if len(categorical_data.shape) == 1 or categorical_data.shape[1] == 1: 37 | return make_dummies(categorical_data, False) 38 | 39 | num_fes = categorical_data.shape[1] 40 | first = make_dummies(categorical_data[:, 0], False) 41 | others = [make_dummies(categorical_data[:, col], True) for col in range(1, num_fes)] 42 | others = sps.hstack(others) 43 | return sps.hstack((first, others)) 44 | 45 | 46 | # TODO: return variance estimate if desired 47 | # TODO: verbose option 48 | # TODO: write tests and use Pandas groupby 49 | def estimate( 50 | data: pd.DataFrame, 51 | y: np.ndarray, 52 | x: np.ndarray, 53 | categorical_controls: List, 54 | check_rank=False, 55 | estimate_variance=False, 56 | get_residual=False, 57 | cluster=None, 58 | tol=None, 59 | within_if_fe=True, 60 | ): 61 | """ Automatically picks best method for least squares. y must be 2d. """ 62 | if not y.ndim == 2: 63 | raise ValueError 64 | # Use within estimator even when more than one set of fixed effects 65 | 66 | if categorical_controls is None or len(categorical_controls) == 0: 67 | b = np.linalg.lstsq(x, y)[0] 68 | assert b.ndim == 2 69 | if estimate_variance or get_residual: 70 | error = y - x.dot(b) 71 | assert error.shape == y.shape 72 | # within estimator 73 | elif len(categorical_controls) == 1 or within_if_fe: 74 | if len(categorical_controls) > 1: 75 | dummies = sps.hstack( 76 | [make_dummies(data[col], True) for col in categorical_controls[1:]] 77 | ) 78 | x = np.hstack((x, dummies.A)) 79 | 80 | x_df = pd.DataFrame( 81 | data=np.hstack((data[categorical_controls[0]].values[:, None], x)), 82 | columns=list(range(x.shape[1] + 1)), 83 | ) 84 | pandas_grouped = x_df.groupby(0) 85 | x_demeaned = ( 86 | x - pandas_grouped[list(range(1, x_df.shape[1]))].transform(np.mean).values 87 | ) 88 | assert x_demeaned.shape == x.shape 89 | 90 | if check_rank: 91 | if tol is not None: 92 | _, not_collinear = find_collinear_cols( 93 | x_demeaned, verbose=True, tol=tol 94 | ) 95 | else: 96 | _, not_collinear = find_collinear_cols(x_demeaned, verbose=True) 97 | 98 | not_collinear = np.array(not_collinear) 99 | x = x[:, not_collinear] 100 | x_demeaned = x_demeaned[:, not_collinear] 101 | 102 | # k x n_outcomes 103 | b = np.linalg.lstsq(x_demeaned, y)[0] 104 | assert b.ndim == 2 105 | error = y - x.dot(b) 106 | assert error.shape == y.shape 107 | error_df = pd.DataFrame( 108 | data=np.hstack((data[categorical_controls[0]].values[:, None], error)), 109 | columns=list(range(error.shape[1] + 1)), 110 | ) 111 | pandas_grouped = error_df.groupby(0) 112 | # n_teachers x n_outcomes 113 | fixed_effects = pandas_grouped[list(range(1, error_df.shape[1]))].mean().values 114 | assert fixed_effects.ndim == 2 115 | # (n_teachers + k) x n_outcomes 116 | b = np.concatenate((fixed_effects, b)) 117 | x = sps.hstack((make_dummies(data[categorical_controls[0]], False), x)).tocsr() 118 | assert b.shape[0] == x.shape[1] 119 | if estimate_variance or get_residual: 120 | error -= fixed_effects[data[categorical_controls[0]].values] 121 | else: 122 | dummies = get_all_dummies(data[categorical_controls].values) 123 | x = sps.hstack((dummies, sps.csc_matrix(x))) 124 | assert sps.issparse(x) 125 | assert type(x) is sps.csc_matrix 126 | if check_rank: 127 | collinear, _ = find_collinear_cols(x.T.dot(x).A) 128 | x = remove_cols_from_csc(x, collinear) 129 | if y.ndim == 1 or y.shape[1] == 1: 130 | b = sps.linalg.lsqr(x, y)[0] 131 | else: 
132 | # TODO: there's a function for doing this all at once 133 | b = np.zeros((x.shape[1], y.shape[1]), order="F") 134 | for i in range(y.shape[1]): 135 | b[:, i] = sps.linalg.lsqr(x, y[:, i], atol=1e-10)[0] 136 | 137 | if estimate_variance or get_residual: 138 | if b.ndim == 1: 139 | b = b[:, None] 140 | assert b.ndim == 2 141 | predicted = x.dot(b) 142 | assert y.shape == predicted.shape 143 | error = y - predicted 144 | assert error.shape == y.shape 145 | 146 | assert np.all(np.isfinite(b)) 147 | if not estimate_variance and not get_residual: 148 | return b, x 149 | 150 | if get_residual: 151 | return b, x, error 152 | 153 | if estimate_variance: 154 | assert b.shape[0] == x.shape[1] 155 | _, r = np.linalg.qr(x if type(x) is np.array else x.A) 156 | 157 | inv_r = scipy.linalg.solve_triangular(r, np.eye(r.shape[0])) 158 | inv_x_prime_x = inv_r.dot(inv_r.T) 159 | if cluster is not None: 160 | grouped = Groupby(data[cluster]) 161 | 162 | def f(mat): 163 | return mat[:, 1:].T.dot(mat[:, 0]) 164 | 165 | V = [] 166 | for i in range(y.shape[1]): 167 | u_ = grouped.apply( 168 | f, 169 | np.hstack((error[:, i, None], x.A)), 170 | shape=(grouped.n_keys, x.shape[1]), 171 | broadcast=False, 172 | ) 173 | 174 | inner = u_.T.dot(u_) 175 | V.append(inv_x_prime_x.dot(inner).dot(inv_x_prime_x)) 176 | else: 177 | error_sums = np.sum(error ** 2, 0) 178 | assert len(error_sums) == y.shape[1] 179 | V = [inv_x_prime_x * es / (len(y) - x.shape[1]) for es in error_sums] 180 | 181 | return b, x, error, V 182 | 183 | 184 | def make_one_lag( 185 | array: np.ndarray, lag: int, axis: int, fill_missing: bool = False 186 | ) -> np.ndarray: 187 | if len(array.shape) == 1: 188 | array = np.expand_dims(array, 0) 189 | assert axis == 1 190 | 191 | # I have no idea why this is here, but it doesn't apply 192 | # for usual data format 193 | if abs(lag) > array.shape[axis]: 194 | if fill_missing: 195 | lags = np.zeros(array.shape) 196 | missing = np.ones(array.shape) 197 | if axis == 1: 198 | return np.vstack((lags, missing)) 199 | else: 200 | return np.hstack((lags, missing)) 201 | else: 202 | return np.full(array.shape, np.nan) 203 | 204 | # (1, 5) when starting with an array of size (93,5) and lag 1 205 | missing_shape = ( 206 | (array.shape[0], abs(lag)) if axis == 1 else (abs(lag), array.shape[1]) 207 | ) 208 | # (92, 5) when starting with an array of size (93, 5) and lag 1 209 | other_shape = ( 210 | (array.shape[0], array.shape[1] - abs(lag)) 211 | if axis == 1 212 | else (array.shape[0] - abs(lag), array.shape[1]) 213 | ) 214 | 215 | if fill_missing: 216 | missing_ind = np.ones(missing_shape) 217 | missing_zero = np.zeros(missing_shape) 218 | not_missing = np.zeros(other_shape) 219 | 220 | if axis == 1: 221 | if lag > 0: 222 | lags = np.hstack((missing_zero, array[:, :-lag])) 223 | missing = np.hstack((missing_ind, not_missing)) 224 | if lag < 0: 225 | lags = np.hstack((array[:, -lag:], missing_zero)) 226 | missing = np.hstack((not_missing, missing_ind)) 227 | return np.vstack((lags, missing)) 228 | else: 229 | if lag > 0: 230 | # So with one lag, first row is zeros 231 | lags = np.vstack((missing_zero, array[:-lag, :])) 232 | missing = np.vstack((missing_ind, not_missing)) 233 | if lag < 0: 234 | lags = np.vstack((array[-lag:, :], missing_zero)) 235 | missing = np.vstack((not_missing, missing_ind)) 236 | 237 | return np.hstack((lags, missing)) 238 | 239 | else: 240 | missing_nan = np.full(missing_shape, np.nan) 241 | if axis == 1: 242 | if lag > 0: 243 | return np.hstack((missing_nan, array[:, :-lag])) 244 | if lag 
< 0:
245 |                 return np.hstack((array[:, -lag:], missing_nan))
246 |         else:
247 |             if lag > 0:
248 |                 return np.vstack((missing_nan, array[:-lag, :]))
249 |             if lag < 0:
250 |                 return np.vstack((array[-lag:, :], missing_nan))
251 | 
252 | 
253 | def make_lags(
254 |     df: pd.DataFrame,
255 |     n_lags_back: int,
256 |     n_lags_forward: int,
257 |     outcomes: List[str],
258 |     groupby: Iterable,
259 |     fill_zeros: bool,
260 | ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
261 |     lags = list(range(-1 * n_lags_forward, 0)) + list(range(1, n_lags_back + 1))
262 |     grouped = Groupby(df[groupby].values)
263 |     outcome_data = df[outcomes].values
264 | 
265 |     for lag in lags:
266 | 
267 |         def f(x):
268 |             return make_one_lag(x, lag, 0, fill_zeros)
269 | 
270 |         width = 2 * len(outcomes) if fill_zeros else len(outcomes)
271 | 
272 |         new_data = grouped.apply(f, outcome_data, True, shape=(len(df), width))
273 |         new_cols = [out + "_lag_" + str(lag) for out in outcomes]
274 |         if fill_zeros:
275 |             new_cols += [out + "_lag_" + str(lag) + "_mi" for out in outcomes]
276 | 
277 |         for i, c in enumerate(new_cols):
278 |             df.loc[:, c] = new_data[:, i]
279 | 
280 |     if fill_zeros:
281 |         lag_vars = {
282 |             out: list(
283 |                 chain(
284 |                     *(
285 |                         [out + "_lag_" + str(lag), out + "_lag_" + str(lag) + "_mi"]
286 |                         for lag in lags
287 |                     )
288 |                 )
289 |             )
290 |             for out in outcomes
291 |         }
292 |         for out in outcomes:
293 |             for lag in lags:
294 |                 name = out + "_lag_" + str(lag)
295 |                 missing = pd.isnull(df[name]) | (df[name + "_mi"] == 1)
296 |                 df.loc[missing, name] = 0
297 |                 df.loc[missing, name + "_mi"] = 1
298 | 
299 |     else:
300 |         lag_vars = {out: [out + "_lag_" + str(lag) for lag in lags] for out in outcomes}
301 | 
302 |     return df, lag_vars
303 | 
--------------------------------------------------------------------------------
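
Usage sketch (not a file in the repository). The listings above show the package's pieces but no end-to-end example, so the sketch below strings together the three features the README advertises, using only functions and signatures that appear above (Groupby.apply, find_collinear_cols, remove_collinear_cols, make_lags). The DataFrame, the column names "first category" and "y", and all parameter values are invented for illustration.

import numpy as np
import pandas as pd

from hdfe import Groupby, find_collinear_cols, make_lags, remove_collinear_cols

np.random.seed(0)
df = pd.DataFrame(
    {
        "first category": np.random.choice(5, 20),
        "y": np.random.normal(0, 1, 20),
    }
)

# Groupby with broadcast=False: one row per group, like Pandas groupby-apply.
group_means = Groupby(df["first category"]).apply(
    np.mean, df["y"], broadcast=False, as_dataframe=True
)

# Groupby with broadcast=True (the default): one row per observation, like
# Pandas groupby-transform.
demeaned = df["y"].values - Groupby(df["first category"]).apply(np.mean, df["y"])

# Collinearity helpers: the second column duplicates the first, so it is flagged
# as collinear and dropped; columns 0 and 2 remain.
x = np.column_stack((df["y"].values, df["y"].values, np.ones(len(df))))
collinear, not_collinear = find_collinear_cols(x)
x_full_rank = remove_collinear_cols(x)

# make_lags: within-group lags. With fill_zeros=True, each lag column ("y_lag_1")
# is paired with a missing-data indicator ("y_lag_1_mi").
df, lag_vars = make_lags(
    df,
    n_lags_back=1,
    n_lags_forward=0,
    outcomes=["y"],
    groupby="first category",
    fill_zeros=True,
)

print(group_means)
print(collinear, x_full_rank.shape)
print(lag_vars["y"])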