├── requirements-test.txt ├── requirements.txt ├── .gitignore ├── Dockerfile ├── hdfe ├── __init__.py ├── multicollinearity.py ├── groupby.py └── hdfe.py ├── README.md ├── setup.py ├── tests ├── test_multicollinearity_funcs.py ├── test_groupby.py ├── test_hdfe_funcs.py └── profile_groupby.py └── LICENSE /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scipy 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.cppimporthash 2 | *.swp 3 | *.pyc 4 | .rendered.*.cpp 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | ADD . /home/jovyan/hdfe 4 | WORKDIR /home/jovyan/hdfe 5 | 6 | USER root 7 | RUN python setup.py install 8 | RUN pip install -r requirements-test.txt 9 | CMD ["pytest"] 10 | -------------------------------------------------------------------------------- /hdfe/__init__.py: -------------------------------------------------------------------------------- 1 | from .groupby import Groupby 2 | from .multicollinearity import remove_collinear_cols, find_collinear_cols 3 | from .hdfe import make_lags 4 | 5 | __all__ = ["Groupby", "remove_collinear_cols", "find_collinear_cols", "make_lags"] 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This package contains functionality useful for econometric work with panel data. 2 | Its name, originally standing for "high-dimensional fixed effects," is now misleading. 3 | 4 | Useful features are 5 | * Groupby: A class allowing for fast operations similar to Pandas groupby-apply and groupby-transform 6 | functionality, but performing significantly faster with user-written functions. See 7 | documentation [here](http://esantorella.com/2016/06/16/groupby/). 8 | * find_collinear_cols and remove_collinear_cols: Functions 9 | for dealing with multicollinearity which operate quickly on CSC matrices. 
10 | * make_lags: Makes within-group lags (frequently useful with panel data) 11 | 12 | You can install hdfe through pip: "pip install hdfe" 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | packages=["hdfe"], 8 | install_requires=["numpy", "pandas>=0.25.0", "scipy"], 9 | long_description=long_description, 10 | long_description_content_type="text/markdown", 11 | name="hdfe", 12 | version="0.0.4", 13 | description="Econometric tools for working with panel data and fixed effects", 14 | url="https://github.com/esantorella/hdfe/", 15 | author="Elizabeth Santorella", 16 | author_email="elizabeth.santorella@gmail.com", 17 | license="MIT", 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | ) 24 | -------------------------------------------------------------------------------- /tests/test_multicollinearity_funcs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse as sps 3 | from hdfe.multicollinearity import ( 4 | remove_cols_from_csc, 5 | find_collinear_cols, 6 | remove_collinear_cols, 7 | ) 8 | 9 | 10 | def test_remove_cols_from_csc() -> None: 11 | x = sps.eye(4, dtype=int).tocsc() 12 | cols_to_remove = [1, 2] 13 | result = remove_cols_from_csc(x, cols_to_remove) 14 | expected_result = np.array([[1, 0], [0, 0], [0, 0], [0, 1]]) 15 | np.testing.assert_equal(result.A, expected_result) 16 | 17 | 18 | def test_find_collinear_cols() -> None: 19 | x = np.array([[1, 1], [0, 0]]) 20 | collinear, not_collinear = find_collinear_cols(x) 21 | assert collinear == [1] 22 | assert not_collinear == [0] 23 | 24 | 25 | def test_remove_collinear_cols() -> None: 26 | x = np.array([[1, 1], [0, 0]]) 27 | res = remove_collinear_cols(x) 28 | expected = np.array([[1], [0]]) 29 | np.testing.assert_equal(res, expected) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /tests/test_groupby.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from hdfe import Groupby 4 | import pytest 5 | 6 | 7 | @pytest.fixture 8 | def df() -> pd.DataFrame: 9 | np.random.seed(0) 10 | n_obs = 100 11 | n_categories = 10 12 | return pd.DataFrame( 13 | { 14 | "first category": np.random.choice(n_categories, n_obs), 15 | "y": np.random.normal(0, 1, n_obs), 16 | } 17 | ) 18 | 19 | 20 | def test_groupby_apply_mean(df: pd.DataFrame) -> None: 21 | pandas_results = df.groupby("first category")[["y"]].mean() 22 | groupby_results = Groupby(df["first category"]).apply( 23 | np.mean, df["y"], broadcast=False, as_dataframe=True 24 | ) 25 | pd.testing.assert_frame_equal(pandas_results, groupby_results) 26 | 27 | 28 | def test_groupby_transform_mean(df: pd.DataFrame) -> None: 29 | pandas_results = df.groupby("first category")["y"].transform("mean") 30 | groupby_results = Groupby(df["first category"]).apply( 31 | np.mean, df["y"], broadcast=True, as_dataframe=True 32 | ) 33 | np.testing.assert_almost_equal( 34 | pandas_results.values, np.squeeze(groupby_results.values) 35 | ) 36 | -------------------------------------------------------------------------------- /tests/test_hdfe_funcs.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from hdfe.hdfe import make_dummies, get_all_dummies 4 | 5 | 6 | def test_make_dummies_arr() -> None: 7 | x = np.array([1, 0, 0]) 8 | results = make_dummies(x, False) 9 | expected = np.array([[0, 1], [1, 0], [1, 0]], dtype=float) 10 | np.testing.assert_almost_equal(results.A, expected) 11 | 12 | 13 | def test_make_dummies_ser() -> None: 14 | x = pd.Series([1, 0, 0]) 15 | results = make_dummies(x, False) 16 | expected = np.array([[0, 1], [1, 0], [1, 0]], dtype=float) 17 | np.testing.assert_almost_equal(results.A, expected) 18 | 19 | 20 | def test_make_dummies_cat() -> None: 21 | x = pd.Series(["horse", "cat", "cat"]).astype("category") 22 | results = make_dummies(x, False) 23 | expected = np.array([[0, 1], [1, 0], [1, 0]], dtype=float) 24 | np.testing.assert_almost_equal(results.A, expected) 25 | 26 | 27 | def test_make_dummies_arr_drop() -> None: 28 | x = np.array([1, 0, 0]) 29 | results = make_dummies(x, True) 30 | expected = np.array([[0], [1], [1]], dtype=float) 31 | np.testing.assert_almost_equal(results.A, expected) 32 | 33 | 34 | def test_make_dummies_ser_drop() -> None: 35 | x = pd.Series([1, 0, 0]) 36 | results = make_dummies(x, True) 37 | expected = np.array([[0], [1], [1]], dtype=float) 38 | np.testing.assert_almost_equal(results.A, expected) 39 | 40 | 41 | def test_make_dummies_cat_drop() -> None: 42 | x = pd.Series(["horse", "cat", "cat"]).astype("category") 43 | results = make_dummies(x, True) 44 | expected = np.array([[0], [1], [1]], dtype=float) 45 | np.testing.assert_almost_equal(results.A, expected) 46 | 47 | 48 | def test_get_all_dummies() -> None: 49 | x = np.array([[0, 0], [1, 0], [0, 1]]) 50 | result = get_all_dummies(x) 51 | expected = np.array([[1, 0, 1], [0, 1, 1], [1, 0, 0]], dtype=float) 52 | np.testing.assert_almost_equal(result.A, expected) 53 | -------------------------------------------------------------------------------- /hdfe/multicollinearity.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Iterable, 
List, Tuple, Union 3 | 4 | import numpy as np 5 | import scipy.sparse as sps 6 | 7 | 8 | def remove_cols_from_csc( 9 | x: sps.csc_matrix, cols_to_remove: Iterable[int] 10 | ) -> sps.spmatrix: 11 | """ 12 | Efficiently removes columns from a CSC sparse matrix by efficiently editing the 13 | underlying data. 14 | :param x: CSC sparse matrix 15 | :param cols_to_remove: 16 | :return: CSC sparse matrix 17 | 18 | >>> from scipy import sparse as sps 19 | >>> x = sps.eye(3, dtype=int).tocsc() 20 | >>> cols_to_remove = [1] 21 | >>> remove_cols_from_csc(x, cols_to_remove).A 22 | array([[1, 0], 23 | [0, 0], 24 | [0, 1]]) 25 | """ 26 | 27 | if not sps.issparse(x): 28 | raise ValueError 29 | 30 | if not sps.isspmatrix_csc(x): 31 | raise ValueError("Can only remove columns from a csc matrix.") 32 | 33 | def remove_one_col(idx: List[int], ptr_: np.ndarray, data_: List[int], col_: int): 34 | n_elts_to_remove = ptr_[col_ + 1] - ptr_[col_] 35 | idx = idx[: ptr_[col_]] + idx[ptr_[col_ + 1] :] 36 | data_ = data_[: ptr_[col_]] + data_[ptr_[col_ + 1] :] 37 | ptr_ = np.concatenate((ptr_[:col_], ptr_[col_ + 1 :] - n_elts_to_remove)) 38 | return data_, idx, ptr_ 39 | 40 | indices = list(x.indices) 41 | ptr = x.indptr 42 | data = list(x.data) 43 | 44 | for i, col in enumerate(cols_to_remove): 45 | data, indices, ptr = remove_one_col(indices, ptr, data, col - i) 46 | 47 | return sps.csc_matrix((data, indices, ptr)) 48 | 49 | 50 | def find_collinear_cols( 51 | x: Union[np.ndarray, sps.spmatrix], tol: float = 10 ** (-12), verbose: bool = False 52 | ) -> Tuple[List[int], List[int]]: 53 | """ 54 | Identifies a minimal subset of columns of x that, when removed, make x full rank. 55 | Note that there may be many such subsets. This function relies on a QR decomposition 56 | and may be numerically unstable. 57 | 58 | :param x: Numpy array or something that can be converted to a Numpy array. It will 59 | be converted to a Numpy array. 60 | :param tol: A higher tolerance leads to erring on the side of identifying more 61 | columns as collinear. 
62 |     :param verbose: If True, print diagnostic information.
63 |     :return: List of columns that when removed make x full rank, and a list of all of
64 |         the other columns
65 | 
66 |     >>> x = np.array([[1, 1], [0, 0]])
67 |     >>> x
68 |     array([[1, 1],
69 |            [0, 0]])
70 |     >>> find_collinear_cols(x)
71 |     ([1], [0])
72 |     """
73 |     k = x.shape[1]
74 |     x = x.A if sps.issparse(x) else np.asarray(x)
75 |     if x.shape[0] == k:
76 |         rank = np.linalg.matrix_rank(x)
77 |     else:
78 |         rank = np.linalg.matrix_rank((x.T.dot(x)))
79 |     full_rank = rank == k
80 | 
81 |     if full_rank:
82 |         if verbose:
83 |             print("Full rank")
84 |         return [], list(range(k))
85 | 
86 |     _, r = np.linalg.qr(x)
87 |     row = 0
88 | 
89 |     non_collinear_cols = []
90 |     collinear_cols = []
91 |     min_not_deleted = 1
92 |     for col in range(r.shape[1]):
93 |         if row >= r.shape[0]:
94 |             collinear_cols += list(range(col, r.shape[1]))
95 |             break
96 |         if abs(r[row, col]) < tol:
97 |             collinear_cols.append(col)
98 |         else:
99 |             non_collinear_cols.append(col)
100 |             min_not_deleted = min(min_not_deleted, abs(r[row, col]))
101 |             row += 1
102 |     if verbose:
103 |         print("Minimum not deleted:", min_not_deleted)
104 |         print("Number collinear", len(collinear_cols))
105 |     if len(non_collinear_cols) != rank:
106 |         warnings.warn(f"Rank is {rank}, but {len(non_collinear_cols)} non-collinear columns remain.")
107 | 
108 |     return collinear_cols, non_collinear_cols
109 | 
110 | 
111 | def remove_collinear_cols(
112 |     x: Union[sps.spmatrix, np.ndarray], verbose: bool = False
113 | ) -> Union[sps.spmatrix, np.ndarray]:
114 |     """
115 |     Removes a minimal subset of columns from x such that x becomes full rank. Note that
116 |     these columns are not uniquely defined.
117 | 
118 |     >>> x = np.array([[1, 1], [0, 0]])
119 |     >>> remove_collinear_cols(x)
120 |     array([[1],
121 |            [0]])
122 |     """
123 |     collinear, not_collinear = find_collinear_cols(x, verbose=verbose)
124 |     if len(collinear) == 0:
125 |         if verbose:
126 |             print("No collinear columns")
127 |         return x
128 |     if verbose:
129 |         print("Number of collinear columns:", len(collinear))
130 |         print("Number of non-collinear columns:", len(not_collinear))
131 | 
132 |     if isinstance(x, sps.coo_matrix):
133 |         x = x.tocsc()
134 |     if isinstance(x, sps.csc_matrix):
135 |         return remove_cols_from_csc(x, collinear)
136 |     if isinstance(x, np.ndarray):
137 |         return x[:, not_collinear]
138 |     raise TypeError(f"Not implemented for type {type(x)}")
139 | 
--------------------------------------------------------------------------------
/tests/profile_groupby.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | from hdfe import Groupby
3 | import numpy as np
4 | import pandas as pd
5 | import time
6 | 
7 | """
8 | Generates the results in blog post
9 | http://esantorella.com/2016/06/16/groupby/
10 | """
11 | 
12 | n_iters = 1000
13 | n_decimals = 4
14 | 
15 | 
16 | def print_results(pandas_1, pandas_100, groupby_1, groupby_100):
17 |     print(
18 |         "time to compute group means once with Pandas: {0}".format(
19 |             round(pandas_1, n_decimals)
20 |         )
21 |     )
22 |     print(
23 |         "time to compute group means {0} times with Pandas: {1}".format(
24 |             n_iters, round(pandas_100, n_decimals)
25 |         )
26 |     )
27 |     print(
28 |         "time to compute group means once with Grouped: {0}".format(
29 |             round(groupby_1, n_decimals)
30 |         )
31 |     )
32 |     print(
33 |         "time to compute group means {0} times with Grouped: {1}".format(
34 |             n_iters, round(groupby_100, n_decimals)
35 |         )
36 |     )
37 |     print("Improvement", groupby_100 / pandas_100)
38 |     return
39 | 
40 | 
41 | def get_transform_comparisons(f: Callable, data: pd.DataFrame):
42 |     start = time.perf_counter()
43 |     pandas_answer = data.groupby("first category")["y"].transform(f)
44 |     pandas_1 = time.perf_counter() - start
45 | 
46 |     start = time.perf_counter()
47 |     grouped = data.groupby("first category")["y"]
48 |     for i in range(n_iters):
49 |         grouped.transform(f)
50 | 
51 |     pandas_100 = time.perf_counter() - start
52 | 
53 |     # Compute group means using Grouped class
54 |     start = time.perf_counter()
55 |     y = data["y"].values
56 |     first_category = data["first category"].values
57 |     group_means = Groupby(first_category).apply(f, y)
58 |     groupby_one = time.perf_counter() - start
59 |     np.testing.assert_almost_equal(pandas_answer.values, group_means)
60 | 
61 |     start = time.perf_counter()
62 |     grouped = Groupby(first_category)
63 |     for _ in range(n_iters):
64 |         grouped.apply(f, y)
65 | 
66 |     groupby_100 = time.perf_counter() - start
67 |     return pandas_1, pandas_100, groupby_one, groupby_100
68 | 
69 | 
70 | def get_apply_comparisons(f: Callable, data: pd.DataFrame):
71 |     start = time.perf_counter()
72 |     pandas_answer = data.groupby("first category")["y"].apply(f)
73 |     pandas_1 = time.perf_counter() - start
74 | 
75 |     start = time.perf_counter()
76 |     grouped = data.groupby("first category")["y"]
77 |     if f == np.mean:
78 |         for i in range(n_iters):
79 |             grouped.mean()
80 |     else:
81 |         for i in range(n_iters):
82 |             grouped.apply(f)
83 | 
84 |     pandas_100 = time.perf_counter() - start
85 | 
86 |     # Compute group means using Grouped class
87 |     start = time.perf_counter()
88 |     first_category = data["first category"].values
89 |     y = data["y"].values
90 |     group_means = Groupby(first_category).apply(f, y, broadcast=False)
91 |     groupby_one = time.perf_counter() - start
92 | 
93 |     np.testing.assert_almost_equal(pandas_answer.values, group_means)
94 | 
95 |     start = time.perf_counter()
96 |     grouped = Groupby(first_category)
97 |     for _ in range(n_iters):
98 |         grouped.apply(f, y, broadcast=False)
99 | 
100 |     groupby_100 = time.perf_counter() - start
101 |     return pandas_1, pandas_100, groupby_one, groupby_100
102 | 
103 | 
104 | def f(x):
105 |     return np.mean(x)
106 | 
107 | 
108 | def make_result_df(df: pd.DataFrame):
109 | 
110 |     result_df = pd.DataFrame(
111 |         columns=["Pandas", "Groupby"],
112 |         index=pd.MultiIndex.from_product(
113 |             (["Apply", "Transform"], ["Cython", "Python"])
114 |         ),
115 |         data=np.zeros((4, 2)),
116 |     )
117 | 
118 |     print(
119 |         """\nTransform, np.mean: With the np.mean function, Pandas uses Cython and
120 | does great"""
121 |     )
122 |     results = get_transform_comparisons(np.mean, df)
123 |     print_results(*results)
124 |     result_df.loc[("Transform", "Cython"), :] = [results[1], results[3]]
125 | 
126 |     print("\nTransform, user-defined: Without Cython, Pandas is terrible")
127 |     results = get_transform_comparisons(f, df)
128 |     print_results(*results)
129 |     result_df.loc[("Transform", "Python"), :] = [results[1], results[3]]
130 | 
131 |     print(
132 |         "\nApply, np.mean: With the np.mean function, Pandas uses Cython and does great"
133 |     )
134 |     results = get_apply_comparisons(np.mean, df)
135 |     print_results(*results)
136 |     result_df.loc[("Apply", "Cython"), :] = [results[1], results[3]]
137 | 
138 |     print("\nApply, user-defined: Without Cython, Pandas is terrible")
139 |     results = get_apply_comparisons(f, df)
140 |     print_results(*results)
141 |     result_df.loc[("Apply", "Python"), :] = [results[1], results[3]]
142 | 
143 |     result_df /= result_df.values[0, 0]
144 |     return result_df.apply(lambda x: np.round(x, 1))
145 | 
146 | 
147 | 
def main(): 148 | # Compute group means using Pandas groupby 149 | np.random.seed(int("hi", 36)) 150 | n_obs = 10 ** 4 151 | n_categories = 10 ** 2 152 | 153 | df = pd.DataFrame( 154 | { 155 | "first category": np.random.choice(n_categories, n_obs), 156 | "y": np.random.normal(0, 1, n_obs), 157 | } 158 | ) 159 | assert not Groupby(df["first category"]).already_sorted 160 | result_table = make_result_df(df) 161 | print(result_table) 162 | 163 | # Try again when already sorted 164 | df.sort_values("first category", inplace=True) 165 | assert Groupby(df["first category"]).already_sorted 166 | result_table = make_result_df(df) 167 | print(result_table) 168 | return 169 | 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /hdfe/groupby.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, Callable, Any 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | # TODO: Reimplent CGroupby 7 | class Groupby: 8 | def __init__(self, keys: Union[np.ndarray, pd.Series]): 9 | """ 10 | 11 | :param keys: List of group identifiers. Both __init__ and apply will run 12 | much faster if keys is already sorted. 13 | """ 14 | self.keys = keys 15 | try: 16 | already_sorted = np.issubdtype(keys.dtype, np.number) and ( 17 | np.all(np.diff(keys) >= 0) 18 | ) 19 | except ValueError: 20 | already_sorted = False 21 | if already_sorted: 22 | keys = np.squeeze(keys) 23 | if keys.ndim > 1: 24 | raise ValueError("keys should be 1-dimensional") 25 | 26 | self.already_sorted = True 27 | new_idx = np.concatenate(([1], np.diff(keys) != 0)) 28 | self.first_occurrences = np.where(new_idx)[0] 29 | self.keys_as_int: np.ndarray = np.cumsum(new_idx) - 1 30 | assert isinstance(self.keys_as_int, np.ndarray) 31 | self.n_keys = self.keys_as_int[-1] + 1 32 | 33 | else: 34 | self.already_sorted = False 35 | _, self.first_occurrences, self.keys_as_int = np.unique( 36 | keys, return_index=True, return_inverse=True 37 | ) 38 | self.n_keys = max(self.keys_as_int) + 1 39 | self.indices = self._set_indices() 40 | 41 | def _set_indices(self): 42 | if self.already_sorted: 43 | indices = [ 44 | slice(i, j) 45 | for i, j in zip(self.first_occurrences[:-1], self.first_occurrences[1:]) 46 | ] 47 | indices.append(slice(self.first_occurrences[-1], len(self.keys_as_int))) 48 | indices = np.array(indices) 49 | else: 50 | indices = [[] for _ in range(self.n_keys)] 51 | for i, k in enumerate(self.keys_as_int): 52 | indices[k].append(i) 53 | indices = np.array([np.array(elt) for elt in indices]) 54 | return indices 55 | 56 | def apply( 57 | self, 58 | function_: Callable[[np.ndarray], Any], 59 | array: Union[np.ndarray, pd.Series], 60 | broadcast: bool = True, 61 | shape: Tuple = None, 62 | order: str = "c", 63 | as_dataframe: bool = False, 64 | ): 65 | """ 66 | Applies a function to each group, where groups are defined by self.keys_as_int 67 | (or, equivalently, as the argument of __init__.) 68 | If broadcast=True, first dimension of output will equal first dimension of 69 | "array", as in Pandas "transform". 70 | If broadcast=False, first dimension of output equals self.n_keys, as in Pandas 71 | "groupby". 72 | 73 | :param function_: function to be applied to each group 74 | :param array: np.ndarray or similar. Should have same first dimension as 75 | self.keys_as_int. 76 | :param broadcast: bool 77 | :param shape: Shape of output. Can be up to 3-dimensional. 
78 | First dimension must be array.shape[0] (if broadcast=True) 79 | or self.n_keys (if broadcast=False). Default is for output to be 80 | one-dimensional. 81 | :param order: Should output be c-ordered or fortran-ordered? 82 | :param as_dataframe: if False, returns output as ndarray; if True, returns 83 | output 84 | as DataFrame with keys as indices 85 | :return: 86 | """ 87 | if isinstance(array, pd.Series): 88 | names = [array.name] 89 | array = np.asarray(array) 90 | elif isinstance(array, pd.DataFrame): 91 | names = array.columns 92 | array = array.values 93 | else: 94 | names = [None] 95 | 96 | assert isinstance(array, np.ndarray) 97 | 98 | if broadcast: 99 | result = np.zeros(array.shape[0] if shape is None else shape, order=order) 100 | assert result.shape[0] == array.shape[0] 101 | 102 | # np.take doesn't allow slice arguments, so this has to be more verbose 103 | # than when not already sorted 104 | if self.already_sorted: 105 | if array.ndim == 1: 106 | for idx in self.indices: 107 | result[idx] = function_(array[idx]) 108 | elif array.ndim == 2: 109 | for idx in self.indices: 110 | result[idx] = function_(array[idx, :]) 111 | elif array.ndim == 3: 112 | for idx in self.indices: 113 | result[idx] = function_(array[idx, :, :]) 114 | else: 115 | raise NotImplementedError("Can't have more than 3 dims") 116 | else: 117 | for idx in self.indices: 118 | result[idx] = function_(np.take(array, idx, 0)) 119 | if as_dataframe: 120 | return pd.DataFrame(index=self.keys, data=result) 121 | return result 122 | 123 | result = np.zeros(self.n_keys if shape is None else shape, order=order) 124 | assert result.shape[0] == self.n_keys 125 | if self.already_sorted: 126 | if array.ndim == 1: 127 | for k, idx in enumerate(self.indices): 128 | result[k] = function_(array[idx]) 129 | elif array.ndim == 2: 130 | for k, idx in enumerate(self.indices): 131 | result[k] = function_(array[idx, :]) 132 | elif array.ndim == 3: 133 | for k, idx in enumerate(self.indices): 134 | result[k] = function_(array[idx, :, :]) 135 | else: 136 | raise NotImplementedError("Can't have more than 3 dims") 137 | 138 | else: 139 | for first_occurrence, idx in zip(self.first_occurrences, self.indices): 140 | result[self.keys_as_int[first_occurrence]] = function_( 141 | np.take(array, idx, 0) 142 | ) 143 | 144 | if as_dataframe: 145 | return pd.DataFrame( 146 | index=self.keys[self.first_occurrences], data=result, columns=names 147 | ) 148 | return result 149 | -------------------------------------------------------------------------------- /hdfe/hdfe.py: -------------------------------------------------------------------------------- 1 | from itertools import chain 2 | from typing import Iterable, Tuple, List, Union, Dict 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import scipy.linalg 7 | import scipy.sparse as sps 8 | 9 | from .groupby import Groupby 10 | from .multicollinearity import find_collinear_cols, remove_cols_from_csc 11 | 12 | 13 | # TODO: update link on personal website 14 | def make_dummies(elt: Union[np.ndarray, pd.Series], drop_col: bool) -> sps.spmatrix: 15 | try: 16 | if elt.dtype == "category": 17 | elt = elt.cat.codes 18 | except TypeError: 19 | pass 20 | already_sorted = ( 21 | np.issubdtype(elt.dtype, np.integer) 22 | and np.min(elt) == 0 23 | and np.max(elt) == len(set(elt)) - 1 24 | ) 25 | if not already_sorted: 26 | _, elt = np.unique(elt, return_inverse=True) 27 | 28 | dummies = sps.csc_matrix((np.ones(len(elt)), (range(len(elt)), elt))) 29 | if drop_col: 30 | return dummies[:, :-1] 31 | 
else: 32 | return dummies 33 | 34 | 35 | def get_all_dummies(categorical_data: Union[np.ndarray, sps.spmatrix]) -> sps.spmatrix: 36 | if len(categorical_data.shape) == 1 or categorical_data.shape[1] == 1: 37 | return make_dummies(categorical_data, False) 38 | 39 | num_fes = categorical_data.shape[1] 40 | first = make_dummies(categorical_data[:, 0], False) 41 | others = [make_dummies(categorical_data[:, col], True) for col in range(1, num_fes)] 42 | others = sps.hstack(others) 43 | return sps.hstack((first, others)) 44 | 45 | 46 | # TODO: return variance estimate if desired 47 | # TODO: verbose option 48 | # TODO: write tests and use Pandas groupby 49 | def estimate( 50 | data: pd.DataFrame, 51 | y: np.ndarray, 52 | x: np.ndarray, 53 | categorical_controls: List, 54 | check_rank=False, 55 | estimate_variance=False, 56 | get_residual=False, 57 | cluster=None, 58 | tol=None, 59 | within_if_fe=True, 60 | ): 61 | """ Automatically picks best method for least squares. y must be 2d. """ 62 | if not y.ndim == 2: 63 | raise ValueError 64 | # Use within estimator even when more than one set of fixed effects 65 | 66 | if categorical_controls is None or len(categorical_controls) == 0: 67 | b = np.linalg.lstsq(x, y)[0] 68 | assert b.ndim == 2 69 | if estimate_variance or get_residual: 70 | error = y - x.dot(b) 71 | assert error.shape == y.shape 72 | # within estimator 73 | elif len(categorical_controls) == 1 or within_if_fe: 74 | if len(categorical_controls) > 1: 75 | dummies = sps.hstack( 76 | [make_dummies(data[col], True) for col in categorical_controls[1:]] 77 | ) 78 | x = np.hstack((x, dummies.A)) 79 | 80 | x_df = pd.DataFrame( 81 | data=np.hstack((data[categorical_controls[0]].values[:, None], x)), 82 | columns=list(range(x.shape[1] + 1)), 83 | ) 84 | pandas_grouped = x_df.groupby(0) 85 | x_demeaned = ( 86 | x - pandas_grouped[list(range(1, x_df.shape[1]))].transform(np.mean).values 87 | ) 88 | assert x_demeaned.shape == x.shape 89 | 90 | if check_rank: 91 | if tol is not None: 92 | _, not_collinear = find_collinear_cols( 93 | x_demeaned, verbose=True, tol=tol 94 | ) 95 | else: 96 | _, not_collinear = find_collinear_cols(x_demeaned, verbose=True) 97 | 98 | not_collinear = np.array(not_collinear) 99 | x = x[:, not_collinear] 100 | x_demeaned = x_demeaned[:, not_collinear] 101 | 102 | # k x n_outcomes 103 | b = np.linalg.lstsq(x_demeaned, y)[0] 104 | assert b.ndim == 2 105 | error = y - x.dot(b) 106 | assert error.shape == y.shape 107 | error_df = pd.DataFrame( 108 | data=np.hstack((data[categorical_controls[0]].values[:, None], error)), 109 | columns=list(range(error.shape[1] + 1)), 110 | ) 111 | pandas_grouped = error_df.groupby(0) 112 | # n_teachers x n_outcomes 113 | fixed_effects = pandas_grouped[list(range(1, error_df.shape[1]))].mean().values 114 | assert fixed_effects.ndim == 2 115 | # (n_teachers + k) x n_outcomes 116 | b = np.concatenate((fixed_effects, b)) 117 | x = sps.hstack((make_dummies(data[categorical_controls[0]], False), x)).tocsr() 118 | assert b.shape[0] == x.shape[1] 119 | if estimate_variance or get_residual: 120 | error -= fixed_effects[data[categorical_controls[0]].values] 121 | else: 122 | dummies = get_all_dummies(data[categorical_controls].values) 123 | x = sps.hstack((dummies, sps.csc_matrix(x))) 124 | assert sps.issparse(x) 125 | assert type(x) is sps.csc_matrix 126 | if check_rank: 127 | collinear, _ = find_collinear_cols(x.T.dot(x).A) 128 | x = remove_cols_from_csc(x, collinear) 129 | if y.ndim == 1 or y.shape[1] == 1: 130 | b = sps.linalg.lsqr(x, y)[0] 131 | else: 
132 | # TODO: there's a function for doing this all at once 133 | b = np.zeros((x.shape[1], y.shape[1]), order="F") 134 | for i in range(y.shape[1]): 135 | b[:, i] = sps.linalg.lsqr(x, y[:, i], atol=1e-10)[0] 136 | 137 | if estimate_variance or get_residual: 138 | if b.ndim == 1: 139 | b = b[:, None] 140 | assert b.ndim == 2 141 | predicted = x.dot(b) 142 | assert y.shape == predicted.shape 143 | error = y - predicted 144 | assert error.shape == y.shape 145 | 146 | assert np.all(np.isfinite(b)) 147 | if not estimate_variance and not get_residual: 148 | return b, x 149 | 150 | if get_residual: 151 | return b, x, error 152 | 153 | if estimate_variance: 154 | assert b.shape[0] == x.shape[1] 155 | _, r = np.linalg.qr(x if type(x) is np.array else x.A) 156 | 157 | inv_r = scipy.linalg.solve_triangular(r, np.eye(r.shape[0])) 158 | inv_x_prime_x = inv_r.dot(inv_r.T) 159 | if cluster is not None: 160 | grouped = Groupby(data[cluster]) 161 | 162 | def f(mat): 163 | return mat[:, 1:].T.dot(mat[:, 0]) 164 | 165 | V = [] 166 | for i in range(y.shape[1]): 167 | u_ = grouped.apply( 168 | f, 169 | np.hstack((error[:, i, None], x.A)), 170 | shape=(grouped.n_keys, x.shape[1]), 171 | broadcast=False, 172 | ) 173 | 174 | inner = u_.T.dot(u_) 175 | V.append(inv_x_prime_x.dot(inner).dot(inv_x_prime_x)) 176 | else: 177 | error_sums = np.sum(error ** 2, 0) 178 | assert len(error_sums) == y.shape[1] 179 | V = [inv_x_prime_x * es / (len(y) - x.shape[1]) for es in error_sums] 180 | 181 | return b, x, error, V 182 | 183 | 184 | def make_one_lag( 185 | array: np.ndarray, lag: int, axis: int, fill_missing: bool = False 186 | ) -> np.ndarray: 187 | if len(array.shape) == 1: 188 | array = np.expand_dims(array, 0) 189 | assert axis == 1 190 | 191 | # I have no idea why this is here, but it doesn't apply 192 | # for usual data format 193 | if abs(lag) > array.shape[axis]: 194 | if fill_missing: 195 | lags = np.zeros(array.shape) 196 | missing = np.ones(array.shape) 197 | if axis == 1: 198 | return np.vstack((lags, missing)) 199 | else: 200 | return np.hstack((lags, missing)) 201 | else: 202 | return np.full(array.shape, np.nan) 203 | 204 | # (1, 5) when starting with an array of size (93,5) and lag 1 205 | missing_shape = ( 206 | (array.shape[0], abs(lag)) if axis == 1 else (abs(lag), array.shape[1]) 207 | ) 208 | # (92, 5) when starting with an array of size (93, 5) and lag 1 209 | other_shape = ( 210 | (array.shape[0], array.shape[1] - abs(lag)) 211 | if axis == 1 212 | else (array.shape[0] - abs(lag), array.shape[1]) 213 | ) 214 | 215 | if fill_missing: 216 | missing_ind = np.ones(missing_shape) 217 | missing_zero = np.zeros(missing_shape) 218 | not_missing = np.zeros(other_shape) 219 | 220 | if axis == 1: 221 | if lag > 0: 222 | lags = np.hstack((missing_zero, array[:, :-lag])) 223 | missing = np.hstack((missing_ind, not_missing)) 224 | if lag < 0: 225 | lags = np.hstack((array[:, -lag:], missing_zero)) 226 | missing = np.hstack((not_missing, missing_ind)) 227 | return np.vstack((lags, missing)) 228 | else: 229 | if lag > 0: 230 | # So with one lag, first row is zeros 231 | lags = np.vstack((missing_zero, array[:-lag, :])) 232 | missing = np.vstack((missing_ind, not_missing)) 233 | if lag < 0: 234 | lags = np.vstack((array[-lag:, :], missing_zero)) 235 | missing = np.vstack((not_missing, missing_ind)) 236 | 237 | return np.hstack((lags, missing)) 238 | 239 | else: 240 | missing_nan = np.full(missing_shape, np.nan) 241 | if axis == 1: 242 | if lag > 0: 243 | return np.hstack((missing_nan, array[:, :-lag])) 244 | if lag 
< 0:
245 |                 return np.hstack((array[:, -lag:], missing_nan))
246 |         else:
247 |             if lag > 0:
248 |                 return np.vstack((missing_nan, array[:-lag, :]))
249 |             if lag < 0:
250 |                 return np.vstack((array[-lag:, :], missing_nan))
251 | 
252 | 
253 | def make_lags(
254 |     df: pd.DataFrame,
255 |     n_lags_back: int,
256 |     n_lags_forward: int,
257 |     outcomes: List[str],
258 |     groupby: Iterable,
259 |     fill_zeros: bool,
260 | ) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
261 |     lags = list(range(-1 * n_lags_forward, 0)) + list(range(1, n_lags_back + 1))
262 |     grouped = Groupby(df[groupby].values)
263 |     outcome_data = df[outcomes].values
264 | 
265 |     for lag in lags:
266 | 
267 |         def f(x):
268 |             return make_one_lag(x, lag, 0, fill_zeros)
269 | 
270 |         width = 2 * len(outcomes) if fill_zeros else len(outcomes)
271 | 
272 |         new_data = grouped.apply(f, outcome_data, True, shape=(len(df), width))
273 |         new_cols = [out + "_lag_" + str(lag) for out in outcomes]
274 |         if fill_zeros:
275 |             new_cols += [out + "_lag_" + str(lag) + "_mi" for out in outcomes]
276 | 
277 |         for i, c in enumerate(new_cols):
278 |             df.loc[:, c] = new_data[:, i]
279 | 
280 |     if fill_zeros:
281 |         lag_vars = {
282 |             out: list(
283 |                 chain(
284 |                     *(
285 |                         [out + "_lag_" + str(lag), out + "_lag_" + str(lag) + "_mi"]
286 |                         for lag in lags
287 |                     )
288 |                 )
289 |             )
290 |             for out in outcomes
291 |         }
292 |         for out in outcomes:
293 |             for lag in lags:
294 |                 name = out + "_lag_" + str(lag)
295 |                 missing = pd.isnull(df[name]) | (df[name + "_mi"] == 1)
296 |                 df.loc[missing, name] = 0
297 |                 df.loc[missing, name + "_mi"] = 1
298 | 
299 |     else:
300 |         lag_vars = {out: [out + "_lag_" + str(lag) for lag in lags] for out in outcomes}
301 | 
302 |     return df, lag_vars
303 | 
--------------------------------------------------------------------------------
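
Usage sketch (not a file in the repository). The listings above show the package's pieces but no end-to-end example, so the sketch below strings together the three features the README advertises, using only functions and signatures that appear above (Groupby.apply, find_collinear_cols, remove_collinear_cols, make_lags). The DataFrame, the column names "first category" and "y", and all parameter values are invented for illustration.

import numpy as np
import pandas as pd

from hdfe import Groupby, find_collinear_cols, make_lags, remove_collinear_cols

np.random.seed(0)
df = pd.DataFrame(
    {
        "first category": np.random.choice(5, 20),
        "y": np.random.normal(0, 1, 20),
    }
)

# Groupby with broadcast=False: one row per group, like Pandas groupby-apply.
group_means = Groupby(df["first category"]).apply(
    np.mean, df["y"], broadcast=False, as_dataframe=True
)

# Groupby with broadcast=True (the default): one row per observation, like
# Pandas groupby-transform.
demeaned = df["y"].values - Groupby(df["first category"]).apply(np.mean, df["y"])

# Collinearity helpers: the second column duplicates the first, so it is flagged
# as collinear and dropped; columns 0 and 2 remain.
x = np.column_stack((df["y"].values, df["y"].values, np.ones(len(df))))
collinear, not_collinear = find_collinear_cols(x)
x_full_rank = remove_collinear_cols(x)

# make_lags: within-group lags. With fill_zeros=True, each lag column ("y_lag_1")
# is paired with a missing-data indicator ("y_lag_1_mi").
df, lag_vars = make_lags(
    df,
    n_lags_back=1,
    n_lags_forward=0,
    outcomes=["y"],
    groupby="first category",
    fill_zeros=True,
)

print(group_means)
print(collinear, x_full_rank.shape)
print(lag_vars["y"])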