├── .gitattributes
├── .gitignore
├── LICENSE
├── README.html
├── README.md
├── README.pdf
├── code
│   ├── 1_Download_WRDS_Data.py
│   ├── 2_Process_WRDS_Data.py
│   ├── 3_Calculate_Kappas.py
│   ├── firminfo.py
│   ├── investors.py
│   ├── kappas.py
│   ├── our_plot_config.py
│   ├── plots10_kappa_comparison_appendix.py
│   ├── plots11_profit_simulations.py
│   ├── plots1_basic_descriptives.py
│   ├── plots2_kappa_official.py
│   ├── plots3_big_three_four.py
│   ├── plots4_investor_similarity.py
│   ├── plots5_airlines_cereal.py
│   ├── plots6_sole_vs_shared.py
│   ├── plots7_short_interest_coverage.py
│   ├── plots8_individual_firm_coverage.py
│   ├── plots9_blackrock_vanguard.py
│   ├── table3_variance_decomp.py
│   ├── table4_kappa_correlation.py
│   ├── utilities
│   │   ├── date_util.py
│   │   ├── matlab_util.py
│   │   └── quantiles.py
│   ├── wrds_checks.py
│   ├── wrds_cleaning.py
│   └── wrds_downloads.py
├── data
│   ├── checks
│   │   └── .keep
│   ├── derived
│   │   └── .keep
│   ├── public
│   │   ├── .gitattributes
│   │   ├── .keep
│   │   ├── DLE_markups_fig_v2.csv
│   │   ├── airlines.parquet
│   │   ├── big4.csv
│   │   ├── cereal.parquet
│   │   ├── manager_consolidations.csv
│   │   ├── out_scrape.parquet
│   │   └── permno_drops.csv
│   └── wrds
│       └── .keep
├── figures
│   └── .keep
├── requirements.txt
├── run_all.bat
├── run_all.sh
├── tables
│   └── .keep
└── wrds_constituents.pdf
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # celery beat schedule file
95 | celerybeat-schedule
96 |
97 | # SageMath parsed files
98 | *.sage.py
99 |
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 |
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 |
113 | # Rope project settings
114 | .ropeproject
115 |
116 | # mkdocs documentation
117 | /site
118 |
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 |
124 | # Pyre type checker
125 | .pyre/
126 |
127 | # Project Data files
128 | *.parquet
129 | !airlines.parquet
130 | !cereal.parquet
131 |
132 | *.xlsx
133 | *.pickle
134 | markup-simulations.csv
135 |
136 | # Tex Files
137 | *.tex
138 | *.aux
139 | *.log
140 |
141 | # Figures
142 | *.pdf
143 | !README.pdf
144 | !wrds_constituents.pdf
145 |
146 | # Mac Garbage
147 | .DS_Store
148 | code.zip
149 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 chrisconlon
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.html:
--------------------------------------------------------------------------------
(HTML rendering of the README; its content duplicates /README.md below and is omitted here.)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Replication Instructions for: Common Ownership in America: 1980-2017
2 | Backus, Conlon and Sinkinson (2020)
3 | AEJMicro-2019-0389
4 | openicpsr-120083
5 | A copy of the paper is here: https://chrisconlon.github.io/site/common_owner.pdf
6 |
7 |
8 | ### Open ICPSR Install Instructions
9 | 1. Download and unzip the repository.
10 | 2. All required files are included or are downloaded programmatically from WRDS (see notes below).
11 |
12 | ### GitHub Install Instructions
13 | To download the repo simply type:
14 |
15 | git clone https://github.com/chrisconlon/CommonOwnerReplication
16 |
17 | You will need the Git Large File Storage (LFS) extension installed (which you probably do not have by default).
18 |
19 | To install this extension, follow the directions at:
20 | https://git-lfs.github.com
21 |
22 | ### Dataset Size and Memory
23 | 1. We recommend that you have at least 64GB of RAM available.
24 | 2. All of the datasets saved will take up about 14 GB of drive space.
25 | 3. NumPy is used extensively for the calculations and is multithreaded (so more cores will help).
26 | 4. The computation of the $\kappa_{fg}$ terms is parallelized quarter by quarter explicitly (so cores will help a lot here).
27 | 5. But most of the time is spent merging and filtering data in pandas (more cores don't help much).
28 | 6. Total runtime on a 2015 iMac with 64GB of RAM is around 3 hours.
29 | 7. WRDS download time is about an hour (depending on internet speed) and the total download is > 10GB.
30 |
31 | ### Downloading from WRDS
32 | You must provide your own WRDS account. You will be prompted for your WRDS username and password when running 1_Download_WRDS_Data.py.
33 |
34 | To request an account, please visit:
35 | https://wrds-www.wharton.upenn.edu/register/
36 |
37 | If you do not have API access, you will need to consult the wrds_constituents.pdf document for instructions on using the WRDS web interface. This is strongly NOT RECOMMENDED. Because you cannot apply complex filters to the SQL queries as we do programmatically, you will also need much more disk space (on the order of a terabyte) to save the entire Thomson-Reuters s34 13F database.
38 |
39 | If you are running this as a batch job (not interactively), such as on an HPC cluster, you will need to pre-enter your WRDS password by creating a pgpass file.
40 |
41 | As an example:
42 |
43 | ```
44 | import wrds
45 | db = wrds.Connection(wrds_username='joe')
46 | db.create_pgpass_file()
47 | ```
48 |
49 | If you encounter a problem, it might be that your pgpass file is not accessible by your batch job.
50 |
51 | For more information, please see: [https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/python-from-your-computer/](https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/python-from-your-computer/)
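
As a quick sanity check (a minimal sketch, not part of the replication scripts; the username 'joe' is a placeholder as in the example above), you can confirm that the stored credentials work and that your account can see the libraries this package queries:

```
import wrds

# With ~/.pgpass in place, this connects without prompting for a password.
db = wrds.Connection(wrds_username='joe')

# The download script pulls from CRSP (crsp), Compustat (comp), and Thomson-Reuters (tfn).
print([lib for lib in db.list_libraries() if lib in ('crsp', 'comp', 'tfn')])
db.close()
```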
53 |
54 | ### Python dependencies
55 | Our run_all.sh bash script should install all of the required Python dependencies (assuming Python itself is installed correctly and you have the necessary access to install packages).
56 |
57 | To install those dependencies manually (such as on a shared server), you may need to do the following.
58 |
59 | Python (version 3.8 or above) - install dependencies with
60 |
61 | pip3 install -r requirements.txt
62 |
63 | numpy, pandas, matplotlib, pyarrow, brotli, seaborn, wrds, scikit-learn, pyhdfe, pyblp, statsmodels
64 |
65 | We anticipate most users will be running this replication package from within an Anaconda environment. To avoid making changes to your base environment, you will want to create a separate environment for this replication package. To do that:
66 |
67 | ```
68 | conda create --name common_owner --file requirements.txt
69 | conda activate common_owner
70 | ```
71 |
72 | ## How to run the code
73 | Change to the directory containing this file and run "./run_all.sh" on the terminal. The code should take approximately 3-10 hours to run. Tables and figures will be produced as described below.
74 |
75 | ```
76 | cd code
77 | ./run_all.sh
78 | ```
79 |
80 | ### Windows Warning
81 | Windows Users: instead use "run_all.bat" from the command prompt.
82 |
83 | There are known conflicts between Windows 10 and core Python DLLs in Python versions < 3.7.3. If you are running on Windows 10, all Python programs will run best with Python 3.8 or later (see: https://bugs.python.org/issue35797).
84 |
85 |
86 | ## File of origin for tables and figures
87 |
88 | | Table/Figure Number | Generating File |
89 | | --- |---|
90 | | Table 1 | (by hand) |
91 | | Table 2 | (by hand) |
92 | | Table 3 | table3_variance_decomp.py |
93 | | Table 4 | table4_kappa_correlation.py |
94 | | Figure 1 | plots2_kappa_official.py |
95 | | Figure 2 | plots1_basic_descriptives.py |
96 | | Figure 3 | plots1_basic_descriptives.py |
97 | | Figure 4 | plots1_basic_descriptives.py |
98 | | Figure 5 | plots3_big_three_four.py |
99 | | Figure 6 | plots2_kappa_official.py |
100 | | Figure 7 | plots2_kappa_official.py |
101 | | Figure 8 | plots4_investor_similarity.py |
102 | | Figure 9 | plots2_kappa_official.py |
103 | | Figure 10 | plots11_profit_simulations.py |
104 | | Figure 11 | plots11_profit_simulations.py |
105 | | Figure 12 | plots9_blackrock_vanguard.py |
106 | | Figure 13 | plots2_kappa_official.py |
107 | | Figure 14 | plots2_kappa_official.py |
108 | | Figure 15 | plots2_kappa_official.py |
109 | | Figure 16 | plots5_airlines_cereal.py |
110 | | Figure 17 | plots6_sole_vs_shared.py |
111 | | Figure A1 | plots1_basic_descriptives.py |
112 | | Figure A2 | plots8_individual_firm_coverage.py |
113 | | Figure A3 | plots10_kappa_comparison_appendix.py |
114 | | Figure A4 | plots7_short_interest_coverage.py |
115 | | Figure A5 | plots7_short_interest_coverage.py |
116 | | Figure A6 | plots2_kappa_official.py |
117 | | Figure A7 | plots2_kappa_official.py |
118 | | Figure A8 | plots4_investor_similarity.py |
119 |
120 |
121 | ## Within-File Dependencies:
122 | 1_Download_WRDS_Data.py:
123 |
124 | wrds_downloads
125 |
126 | 2_Process_WRDS_Data.py:
127 |
128 | wrds_cleaning
129 | wrds_checks
130 |
131 | 3_Calculate_Kappas.py:
132 |
133 | kappas
134 | investors
135 | firminfo
136 | utilities/quantiles
137 |
138 | plots3_big_three_four.py:
139 |
140 | kappas
141 | investors
142 |
143 | plots5_airlines_cereal.py:
144 |
145 | kappas
146 |
147 | plots9_blackrock_vanguard.py:
148 |
149 | kappas
150 |
151 | plots10_kappa_comparison_appendix.py:
152 |
153 | utilities/matlab_util
154 |
155 |
156 | ## Files Provided and Data Access Statements
157 |
158 | ### WRDS
159 |
160 | We use several data sources from WRDS. These are accessed programmatically through the WRDS API, so we are not able to include the individual files in this replication package. (See terms: https://wrds-www.wharton.upenn.edu/users/tou/).
161 |
162 | They include:
163 | A. CRSP: data on securities prices and shares outstanding; list of S&P 500 constituents.
164 | B. Compustat: business fundamentals, short interest, business segment info.
165 | C. Thomson-Reuters: s34 database of 13f filings/ownership.
166 |
167 | ### Author Constructed files
168 | data/public:
169 |
170 | The files below are publicly available CSVs constructed by the authors. These are drops, consolidations, and manager identifiers that are used in our project. They are distributed with this code package (a short loading sketch follows the list).
171 |
172 | 1. manager_consolidations.csv: lists consolidated manager numbers (several manager numbers actually correspond to a single manager)
173 | 2. permno_drops.csv: lists dropped permno IDs with reasons why they are dropped
174 | 3. big4.csv: lists manager numbers for BlackRock, Fidelity, State Street, and Vanguard
175 |
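These are plain CSV files; here is a minimal loading sketch (paths follow our_plot_config.py, which points raw_dir at data/public):

```
import pandas as pd
from our_plot_config import raw_dir

# big4.csv maps 13F manager numbers (mgrno) to the four large investors;
# it is merged onto the holdings data on mgrno (see calc_big4 in investors.py).
big4 = pd.read_csv(raw_dir / 'big4.csv')

# permno_drops.csv lists the permnos we exclude, along with the reason for each drop.
drops = pd.read_csv(raw_dir / 'permno_drops.csv')
```
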
176 | The markups from DLEU 2020 can be reproduced by running the replication package:
177 |
178 | ### DeLoecker Eeckhout Unger Markups
179 | 4. DLE_markups_fig_v2.csv: markups from Figure 10 of DeLoecker Eeckhout Unger (QJE 2020)
180 |
181 | De Loecker, Jan; Eeckhout, Jan; Unger, Gabriel, 2020,
182 | "Replication Data for: 'The Rise of Market Power and the Macroeconomic Implications'", https://doi.org/10.7910/DVN/5GH8XO, Harvard Dataverse, V1
183 |
184 | That replication package requires access to WRDS. A subset of the markups (and no additional data) is being made publicly available here.
185 |
186 | ### Scraped 13f filings
187 | The original source data are the publicly available SEC 13f filing data from EDGAR: https://www.sec.gov/edgar/searchedgar/companysearch.html
188 |
189 | Most users instead access the Thomson-Reuters S34 database from WRDS (as our script above does). We've also scraped the original source documents from EDGAR and compiled them into an easy-to-use format. We provide the entire universe of 13f filings as a separate dataset. For the purposes of replicating this paper, we use three smaller extracts as parquet files:
190 |
191 | 5. cereal.parquet: extract 13F Filings for firms within the cereal industry (includes small cap)
192 | 6. airlines.parquet: extract 13F Filings for firms within the airline industry (includes small cap)
193 | 7. out_scrape.parquet: extract 13F Filings for LARGE cap firms (a superset of the S&P 500) from 1999-2017 (300MB).
194 |
195 | Each file contains:
196 | - 13F filings beginning in 1999 and ending in late 2017 (the data period for this paper).
197 |
198 | The full set of scraped 13f filings and a detailed description of how extracts were created are available in two places:
199 |
200 | 1. The live version of the 13f scraping project is [https://sites.google.com/view/msinkinson/research/common-ownership-data?](https://sites.google.com/view/msinkinson/research/common-ownership-data?)
201 |
202 | 2. The permanent archived version (including these extracts) is available to the public at Harvard Dataverse (doi:10.7910/DVN/ZRH3EU):
203 | https://doi.org/10.7910/DVN/ZRH3EU
204 |
205 | Backus, Matthew; Conlon, Christopher T; Sinkinson, Michael; 2020, "Common Ownership Data: Scraped SEC form 13F filings for 1999-2017", https://doi.org/10.7910/DVN/ZRH3EU, Harvard Dataverse, V1.1
206 |
207 |
208 | ### Description of .parquet file format
209 | We use the parquet format for:
210 |
211 | - Large data inputs (above)
212 | - Most intermediary datasets
213 |
214 | Parquet files are compressed columnar storage binaries that are readable by several software packages (R, Python, Stata, Julia, C++, etc.) and platforms. The goal of the parquet project is to maintain good performance for large datasets as well as interoperability.
215 |
216 | The storage method is stable and maintained by the Apache Foundation.
217 | https://parquet.apache.org/documentation/latest/
218 |
219 | We use the Python package "pyarrow" to read parquet files and the package "brotli" for compression (both listed in requirements.txt).
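
For example, the provided extracts can be read directly with pandas (a minimal sketch; the copy written to data/derived is purely illustrative):

```
import pandas as pd

# Read one of the provided extracts (with pyarrow installed, pandas uses it as the parquet engine).
airlines = pd.read_parquet('data/public/airlines.parquet')

# Intermediate outputs in this package are written the same way, brotli-compressed.
airlines.to_parquet('data/derived/airlines_copy.parquet', compression='brotli')
```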
220 |
--------------------------------------------------------------------------------
/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chrisconlon/CommonOwnerReplication/c430d4c174bed9e2d4a01cec1582ce7841781c0f/README.pdf
--------------------------------------------------------------------------------
/code/1_Download_WRDS_Data.py:
--------------------------------------------------------------------------------
1 | # Step 1: Download Data from WRDS
2 | # Note: you will need a WRDS account for wrds.Connection() to work
3 | import pandas as pd
4 | import wrds
5 | from our_plot_config import wrds_dir
6 | from wrds_downloads import clean_wrds, get_names, get_crosswalk
7 | from wrds_downloads import get_fundamentals, get_short_interest
8 | from wrds_downloads import get_segments, get_msf, get_s34
9 |
10 | # raw data pulls -- save in "WRDS" directory
11 | f_raw_s34 = wrds_dir / 'raw_s34.parquet'
12 | f_splist = wrds_dir / 'sp500_list.parquet'
13 | f_crsp_names = wrds_dir / 'crsp_names.parquet'
14 | f_msf_data = wrds_dir / 'crsp_msf.parquet'
15 | f_short = wrds_dir / 'short_interest.parquet'
16 | f_fundamentals = wrds_dir / 'fundamentals_data.parquet'
17 | f_segments = wrds_dir / 'wrds_segments.parquet'
18 | f_managers = wrds_dir / 'manager_list.parquet'
19 | f_managers_all = wrds_dir / 'manager_list_all.parquet'
20 | f_names = wrds_dir / 'all_names.parquet'
21 |
22 | # Pull the Data from WRDS ~20 min (ENTIRE FILE)
23 | # This file requires about 48GB of RAM available
24 |
25 | db = wrds.Connection()
26 |
27 | # Pull the ID/Crosswalk Tables
28 | # - Pull the S&P 500 Constituents List (CRSP)
29 | # - Pull the "names" file: this maps permno to CUSIP, NCUSIP (current period), and SIC code by date
30 | # - Pull the Compustat link file : Construct a unique mapping from gvkey (Compustat) to Permno (CRSP)
31 | # - Save the raw (un-filtered by time or S&P membership) files
32 |
33 | # This block is < 1m
34 | df_sp500 = clean_wrds(db.get_table('crsp', 'DSP500LIST'))
35 | df_sp500.to_parquet(f_splist)
36 | print("First File Done: WRDS connection is probably ok")
37 |
38 |
39 | # Filter S&P List: Ignore pre-1980 components
40 | df_sp500 = df_sp500[df_sp500.ending > '1979-12-31']
41 |
42 | df_names = get_names(db)
43 | df_names.to_parquet(f_crsp_names)
44 |
45 | # Grab all possible CUSIPS by Permno
46 | df_names2 = pd.merge(df_sp500, df_names, on='permno')
47 | df_names2 = df_names2[~((df_names2['ending'] < df_names2['st_date']) | (
48 | df_names2['start'] > df_names2['end_date']))]
49 |
50 | # Get unique list of CUSIPs and Permno's for SQL queries
51 | all_cusips = list(set(df_names2.cusip).union(df_names2.ncusip))
52 | all_permnos = list(df_names2.permno.unique().astype(int))
53 |
54 | crosswalk = get_crosswalk(db, all_permnos)
55 |
56 | # Pull the CRSP and Compustat Data Files (< 1m)
57 | # Pull the Compustat Short Interest File
58 | # - Add permno's to short interest table
59 | # - Convert Short interest table to quarterly observations
60 | # - Take last observations within each Permno, Quarter
61 | #
62 | # Pull the Compustat Fundamentals Data
63 | # - Add permnos and CUSIPS to the Fundamentals data
64 | #
65 | # Pull the Compustat Business Segments Data
66 | # - Just count the number of segments
67 | # - Add permnos to number of segments
68 | #
69 | # Pull the CRSP Price and Shares Outstanding MSF Data
70 | # - Save to parquet (around 2MB compressed)
71 | # - Use this to get a single price, shares_outstanding for each security quarter
72 |
73 | df_fund = get_fundamentals(db, crosswalk)
74 | df_fund.to_parquet(f_fundamentals)
75 |
76 | df_short = get_short_interest(db, crosswalk)
77 | df_short.to_parquet(f_short, compression='brotli')
78 |
81 |
82 | df_seg = get_segments(db, crosswalk)
83 | df_seg.to_parquet(f_segments, compression='brotli')
84 |
85 | df_msf2 = get_msf(db, all_permnos, False)
86 | df_msf2.to_parquet(f_msf_data, compression='brotli')
87 |
88 |
89 | # Get Managers and stock names
90 | df_m = db.get_table('tfn', 's34type1')
91 | df_m.to_parquet(f_managers_all, compression='brotli')
92 |
93 | names = db.get_table('crsp', 'stocknames')
94 | names.to_parquet(f_names)
95 |
96 | # #### Pull the S-34 Data -- This is SLOW don't re-run ~15m
97 | # - Only get for 8-digit CUSIPs in our S&P dataset
98 | # - This is VERY slow and around 5.5 GB (320MB on disk)
99 | # - Use this to get holdings for each 13-F investor (Don't trust self reported prices or shares outstanding)
100 |
101 | print("Starting s34 Download...")
102 | s34_data = get_s34(db, all_cusips)
103 | s34_data.to_parquet(f_raw_s34, compression='brotli')
104 | print("S34 Complete!")
105 |
106 |
107 | # unique list of manager names
108 | mgr_list = s34_data.groupby(
109 | ['mgrno'])['mgrname'].agg(
110 | pd.Series.mode).reset_index()
111 | mgr_list['mgrname'] = mgr_list['mgrname'].astype(str)
112 | mgr_list.to_parquet(f_managers, compression='brotli')
113 |
--------------------------------------------------------------------------------
/code/2_Process_WRDS_Data.py:
--------------------------------------------------------------------------------
1 | from our_plot_config import raw_dir, wrds_dir, derived_dir, checks_dir
2 | import pandas as pd
3 |
4 | from wrds_cleaning import expand_names, make_cusip_list, construct_fundamentals, get_sp_quarters, read_s34
5 | from wrds_cleaning import construct_bus_segments, consolidate_mgrs, filter_s34
6 | from wrds_cleaning import compute_betas, add_drops, process_scraped, blackrock_fix
7 | from wrds_cleaning import add_stock_splits, dedup_s34, combine_betas
8 |
9 | from wrds_checks import check_bigbeta, check_s34, check_names, check_blackrock
10 | from wrds_checks import check_s34_coverage, check_multiple_cusip, check_fundamental_coverage
11 |
12 |
13 | # Public (hand) inputs
14 | f_scrape = raw_dir / 'out_scrape.parquet' # CRM renamed
15 | f_big4 = raw_dir / 'big4.csv'
16 |
17 | # raw data pulls
18 | # CRM update: calling these WRDS
19 | f_raw_s34 = wrds_dir / 'raw_s34.parquet'
20 | f_splist = wrds_dir / 'sp500_list.parquet'
21 | f_crsp_names = wrds_dir / 'crsp_names.parquet'
22 | f_msf_data = wrds_dir / 'crsp_msf.parquet'
23 | f_short = wrds_dir / 'short_interest.parquet'
24 | f_fundamentals = wrds_dir / 'fundamentals_data.parquet'
25 | f_segments = wrds_dir / 'wrds_segments.parquet'
26 |
27 | # drops and consolidations
28 | f_permno_drops = raw_dir / 'permno_drops.csv'
29 | f_mgr_consolidations = raw_dir / 'manager_consolidations.csv'
30 |
31 | # Outputs
32 | # other info
33 | f_comp_info = derived_dir / 'compustat_info.parquet'
34 | f_names_expanded = derived_dir / 'expanded_names.parquet'
35 |
36 | # Betas
37 | f_betas_unfiltered = derived_dir / '13f_sp500_unfiltered.parquet'
38 | f_betas_scraped = derived_dir / '13f_scraped.parquet'
39 | f_frankenbetas = derived_dir / '13f_sp500_frankenbeta.parquet'
40 |
41 | # Read in the raw parquet files from SQL queries
42 | df_sp500 = pd.read_parquet(f_splist)
43 | df_names = pd.read_parquet(f_crsp_names)
44 | df_msf2 = pd.read_parquet(f_msf_data)
45 | df_short = pd.read_parquet(f_short)
46 |
47 | # Match the names file against the S&P list and expand to quarters
48 | df_names2 = expand_names(df_names, df_sp500)
49 | df_names2.to_parquet(f_names_expanded, compression='brotli')
50 |
51 | # Do Compustat (Fundamentals, Bus Segments, etc.)
52 | # make sure that fundamentals data is unique permno-quarter
53 | cusip_list = make_cusip_list(df_names)
54 | df_fund = construct_fundamentals(pd.read_parquet(f_fundamentals), df_names2)
55 | df_bus = construct_bus_segments(pd.read_parquet(f_segments), df_sp500)
56 | df_fund2 = pd.merge(df_fund, df_bus, on=['permno', 'quarter'], how='outer')
57 | df_fund2.to_parquet(f_comp_info, compression='brotli')
58 |
59 | # List of S&P permno, cusip, quarters
60 | sp_df = get_sp_quarters(df_sp500, cusip_list)
61 |
62 |
63 | # ### Merge and Drops ~ 5m
64 | # - Merge: Permno information from CRSP names file to 13-F filings
65 | # - Drop: Non S&P 500 component filings from 13-f's
66 | # - Fix: Adjust Blackrock dates because of known reporting issue (see https://wrds-www.wharton.upenn.edu/pages/support/research-wrds/research-guides/research-note-regarding-thomson-reuters-ownership-data-issues/)
67 | # - Merge: stock split information from MSF file (cfacshr) (https://wrds-support.wharton.upenn.edu/hc/en-us/articles/115003101112-Adjusting-Splits-Using-CRSP-Data)
68 | # - Fix: Select a single Filing Date (Fdate) for each Rdate.
69 | # - 24,432,318 Obs have single observation
70 | # - 2,608,149 Obs have multiple filings with same shares (different prices)
71 | # - 84,159 Obs have a known share split: take the first filing (before share split)
72 | # - 44,874 Obs have no known share split: take the last filing (assume these are corrections)
73 | # - Merge and Consolidate: Managers using consolidation file (Blackrock, Inc --> Blackrock, etc.)
74 | # - Calculate $\beta_{fs}$ for each quarter in LONG format.
75 | # - Add possible drops: by permno (dual-class shares, ADRs, etc.), share class (ADRs, REITs, etc.)
76 |
77 | # Process Thomson-Reuters $\beta$
78 | # this needs about 20 GB of RAM
79 | # 1. Apply fixes and merges described above
80 |
81 | s34_data = filter_s34(read_s34(f_raw_s34), sp_df)
82 | main_df = consolidate_mgrs(
83 | dedup_s34(
84 | add_stock_splits(
85 | s34_data,
86 | df_msf2)),
87 | f_mgr_consolidations)
88 | df1 = compute_betas(main_df, df_msf2)
89 | df1 = add_drops(df1, f_permno_drops, df_names2)
90 | df1.to_parquet(f_betas_unfiltered, compression='brotli')
91 |
92 | # Process Scraped 13F's ~3min
93 | # 1. Append it to the existing dataset
94 | # 2. Add the drops
95 |
96 | dfs = process_scraped(f_scrape, f_big4)
97 | dfs = add_drops(dfs, f_permno_drops, df_names2)
98 | dfs.to_parquet(f_betas_scraped, compression='brotli')
99 |
100 | # Combine Both Sets of $\beta$s
101 | # - Use TR data before 2001
102 | # - Use scraped data after 2001
103 | # - Save the combined FrankenBeta file
104 |
105 | # use TR before cut-date and scraped data after
106 | df = combine_betas(df1, dfs, cut_date='2000-01-01')
107 | df.to_parquet(f_frankenbetas, compression='brotli')
108 |
109 | # Checks
110 | # 1. Tabulate: Missing Shares Outstanding (TR), Missing Price Info (TR), Duplicate Observations within an Fdate/Rdate and Permno, Manager
111 | # 2. Tabulate: 18 cases where firms exist in S&P500 but not in the names file (yet).
112 | # 3. 1057 Observations (Firm-Quarter) in S&P500 but not in S34 Data (959 after 2010).
113 | # 4. 924 Observations with multiple CUSIPS in same period for same firm
114 | # (these are filings with typos, weird share classes, etc.)
115 |
116 | print(checks_dir)
117 | # Define the Checks
118 | f_notin_crsp = checks_dir / 'compustat-notin-crsp.xlsx'
119 | f_shares_out = checks_dir / 's34-no-shares.xlsx'
120 | f_prc_zero = checks_dir / 's34-zero-price.xlsx'
121 | f_duplicates = checks_dir / 's34_duplicate_permno.xlsx'
122 | f_names_missing = checks_dir / 'unmatched-names-splist.xlsx'
123 | f_s34_coverage = checks_dir / 'coverage_s34.xlsx'
124 | f_multiple_cusips = checks_dir / 'multiple_cusips.xlsx'
125 | f_multiple_cusips_summary = checks_dir / 'multiple_cusips_summary.xlsx'
126 | f_missing_betas = checks_dir / 'missing_betas.xlsx'
127 | f_missing_atq = checks_dir / 'missing_atq.xlsx'
128 | f_missing_segments = checks_dir / 'missing_segments.xlsx'
129 | f_bigbeta_1 = checks_dir / 'big_betas_tr.xlsx'
130 | f_bigbeta_2 = checks_dir / 'big_betas_scrape.xlsx'
131 |
132 |
133 | # Run the Checks
134 | check_s34(s34_data, f_shares_out, f_prc_zero, f_duplicates)
135 | check_names(df_sp500, df_names, f_names_missing)
136 | check_s34_coverage(df1, df_sp500, df_names, f_s34_coverage)
137 | check_multiple_cusip(s34_data, f_multiple_cusips, f_multiple_cusips_summary)
138 | check_bigbeta(df1, f_bigbeta_1)
139 | check_bigbeta(dfs, f_bigbeta_2)
140 | check_fundamental_coverage(
141 | df,
142 | df_fund2,
143 | df_names2,
144 | f_missing_betas,
145 | f_missing_atq,
146 | f_missing_segments)
147 |
--------------------------------------------------------------------------------
/code/3_Calculate_Kappas.py:
--------------------------------------------------------------------------------
1 | from our_plot_config import derived_dir, raw_dir
2 | import pandas as pd
3 |
4 | from kappas import process_beta, beta_to_kappa, calc_chhis, fix_scrape_cols
5 | from investors import compute_investor_info, calc_big4, do_one_firm_similarity
6 | from firminfo import regression_merge, firm_info_merge, kappa_in_out
7 |
8 | from utilities.quantiles import weighted_quantile
9 |
10 | # Inputs
11 | # Betas
12 | f_betas = derived_dir / '13f_sp500_frankenbeta.parquet'
13 | f_betas_tr = derived_dir / '13f_sp500_unfiltered.parquet'
14 | f_betas_sc = derived_dir / '13f_scraped.parquet'
15 |
16 | # Other inputs
17 | f_names_expanded = derived_dir / 'expanded_names.parquet'
18 | f_comp_info = derived_dir / 'compustat_info.parquet'
19 | f_big4 = raw_dir / 'big4.csv'
20 |
21 | # Outputs
22 | # main outputs (kappas)
23 | f_kappas = derived_dir / 'official-kappas.parquet'
24 | f_kappas_tr = derived_dir / 'appendix_kappa_tr.parquet'
25 | f_kappas_scrape = derived_dir / 'appendix_kappa_scrape.parquet'
26 | f_kappas_combined = derived_dir / 'appendix_kappa_combined.parquet'
27 |
28 | # Firm and Investor Output
29 | f_investor_info = derived_dir / 'investor-info.parquet'
30 | f_firm_info = derived_dir / 'firm-info.parquet'
31 | f_regression = derived_dir / 'regression_data.parquet'
32 |
33 | # Calculate $\kappa$ for combined $\beta$ (Frankenstein version)
34 | # - Apply the $\kappa$ calculations period by period
35 | # - This includes (L2, L1, Sole/Shared, and various options for gamma)
36 | # - Save the output to a new parquet file
37 | df = process_beta(f_betas)
38 | df_kappa = beta_to_kappa(df)
39 | df_kappa.to_parquet(f_kappas, compression='brotli')
40 |
41 | # Calculate alternate Kappas (these are for Appendix)
42 | # - Apply $\kappa$ calculations period by period
43 | # - Do this for the pure TR data and pure scrape data
44 | total_dft = beta_to_kappa(process_beta(f_betas_tr))
45 | total_dfs = beta_to_kappa(process_beta(f_betas_sc))
46 | final_df = pd.merge(total_dft, fix_scrape_cols(total_dfs),
47 | on=['from', 'to', 'quarter'], how='outer')
48 |
49 | total_dft.to_parquet(f_kappas_tr, compression='brotli')
50 | total_dfs.to_parquet(f_kappas_scrape, compression='brotli')
51 | final_df.to_parquet(f_kappas_combined, compression='brotli')
52 |
53 | # save some memory
54 | del total_dft, total_dfs, final_df
55 |
56 | # Investor Info: How indexed is each manager? (including big4 information)
57 | df_investor = compute_investor_info(df, f_big4)
58 | df_investor.to_parquet(f_investor_info, compression='brotli')
59 |
60 | # Do the Firm-Level Descriptives
61 | # - Build the fundamentals, names, and business segments for all S&P entries
62 | # - Compute the firm level similarity measure
63 | # - Compute CHHI, IHHI from Betas
64 | # - Combine everything in the firm (permno-quarter) info file
65 | # - Write the file for regressions (merged firm info and kappa)
66 |
67 | df_fund2 = pd.read_parquet(f_comp_info)
68 | df_names2 = pd.read_parquet(f_names_expanded)
69 | firm_similarity = df.groupby(['quarter']).apply(
70 | do_one_firm_similarity).reset_index(drop=True)
71 | big4 = calc_big4(df, pd.read_csv(f_big4))
72 | chhi = calc_chhis(df)
73 |
74 | df_firm2 = firm_info_merge(df_names2, df_fund2, firm_similarity, big4, chhi)
75 | df_firm2.to_parquet(f_firm_info, compression='brotli')
76 |
77 | df_reg = regression_merge(df_kappa, df_firm2)
78 | df_reg.to_parquet(f_regression, compression='brotli')
79 |
80 | # add in-bound and outbound kappa --this isn't in final draft of paper
81 | # df_firm3=kappa_in_out(df_kappa,df_firm2)
82 |
--------------------------------------------------------------------------------
/code/firminfo.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | def regression_merge(df_kappas, df_firm):
5 | firm_cols = ['permno', 'quarter', 'saleq', 'cogsq', 'normalized_l2',
6 | 'retail_share', 'market_cap', 'beta_BlackRock', 'beta_Vanguard', 'beta_StateStreet']
7 | keep_cols = ['from', 'to', 'quarter', 'kappa', 'cosine', 'retail_share', 'market_cap', 'marginsq', 'saleq', 'cogsq', 'normalized_l2',
8 | 'big3', 'beta_BlackRock', 'beta_Vanguard', 'beta_StateStreet']
9 |
10 | # Read things in and Merge
11 | df = pd.merge(
12 | df_kappas.loc[(df_kappas['from'] != df_kappas['to']) & (
13 | df_kappas['quarter'] <= '2017-10-01'), ['from', 'to', 'kappa', 'quarter', 'cosine']],
14 | df_firm[firm_cols], left_on=['from', 'quarter'], right_on=['permno', 'quarter'], how='left'
15 | ).reset_index(drop=True)
16 |
17 | # Calculate derived columns
18 | df['big3'] = df['beta_BlackRock'] + \
19 | df['beta_Vanguard'] + df['beta_StateStreet']
20 | df['marginsq'] = (df['saleq'] - df['cogsq']) / df['saleq']
21 | return df[keep_cols]
22 |
23 | # merge it all together
24 |
25 |
26 | def firm_info_merge(df_names2, df_fund2, firm_similarity, big4, chhi):
27 | df_firm2 = pd.merge(pd.merge(pd.merge(pd.merge(
28 | df_names2, df_fund2, on=['permno', 'quarter'], how='inner'),
29 | firm_similarity, on=['permno', 'quarter'], how='left'),
30 | big4, on=['permno', 'quarter'], how='left'),
31 | chhi, on=['permno', 'quarter'], how='left'
32 | )
33 | df_firm2['market_cap'] = df_firm2['shares_outstanding'] * df_firm2['price']
34 | df_firm2[['beta_BlackRock',
35 | 'beta_Vanguard',
36 | 'beta_StateStreet',
37 | 'beta_Fidelity']] = df_firm2[['beta_BlackRock',
38 | 'beta_Vanguard',
39 | 'beta_StateStreet',
40 | 'beta_Fidelity']].fillna(0)
41 | return df_firm2[(df_firm2.quarter >= '1980-01-01') &
42 | (df_firm2.quarter <= '2017-10-01')].drop_duplicates()
43 |
44 |
45 | # This block is for incoming and outgoing kappa
46 | # note: not sure this made it into the paper (keep the code anyway)
47 | def weighted_from(df):
48 | a1 = np.ma.average(df['kappa'].values, weights=df['saleq_x'].values)
49 | return pd.Series({'kappa_in': a1})
50 |
51 |
52 | def weighted_to(df):
53 | a1 = np.ma.average(df['kappa'].values, weights=df['saleq_y'].values)
54 | return pd.Series({'kappa_out': a1})
55 |
56 |
57 | def kappa_in_out(df, df_firm):
58 | dfk = df.loc[df['from'] != df['to'], ['from', 'to', 'quarter', 'kappa']]
59 | tmp = pd.merge(pd.merge(dfk,
60 | df_firm[['permno', 'quarter', 'saleq']], left_on=['from', 'quarter'], right_on=['permno', 'quarter']),
61 | df_firm[['permno', 'quarter', 'saleq']], left_on=['to', 'quarter'], right_on=['permno', 'quarter']
62 | ).fillna(0)
63 |
64 | g1 = tmp.groupby(['quarter', 'to']).apply(weighted_from)
65 | g2 = tmp.groupby(['quarter', 'from']).apply(weighted_to)
66 |
67 | return pd.merge(pd.merge(df_firm,
68 | g1, left_on=['quarter', 'permno'], right_on=['quarter', 'to'], how='left'),
69 | g2, left_on=['quarter', 'permno'], right_on=['quarter', 'from'], how='left')
70 |
--------------------------------------------------------------------------------
/code/investors.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from utilities.matlab_util import matlab_sparse
4 | from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances
5 |
6 |
7 | def compute_investor_info(df, f_big4):
8 | tmp = df.groupby(['quarter']).apply(
9 | do_one_investor_similarity).reset_index(drop=True)
10 | return pd.merge(tmp, pd.read_csv(f_big4), how='left', on=['mgrno'])
11 |
12 |
13 | def calc_big4(df, big4):
14 | df2 = pd.merge(df, big4, on=['mgrno'], how='inner').groupby(
15 | ['quarter', 'permno', 'InvestorName'])['beta'].sum().unstack()
16 | df2.columns = [
17 | 'beta_BlackRock',
18 | 'beta_Fidelity',
19 | 'beta_StateStreet',
20 | 'beta_Vanguard']
21 | return df2[['beta_BlackRock', 'beta_Vanguard',
22 | 'beta_StateStreet', 'beta_Fidelity']].fillna(0)
23 |
24 |
25 | def investor_helper(betas):
26 | # weights for market portfolio
27 | mkt = betas.sum(axis=0) / betas.sum()
28 | # "AUM" weights to aggregate market portfolio
29 | x = betas.sum(axis=1)
30 | aum = x / x.sum()
31 | nbetas = betas / x[:, None]
32 |
33 | # distance to AUM weighted market portfolio
34 | l2 = cosine_similarity(X=betas, Y=np.expand_dims(mkt, axis=0)).flatten()
35 | l1 = 1 - manhattan_distances(X=nbetas,
36 | Y=np.expand_dims(mkt,
37 | axis=0),
38 | sum_over_features=True).flatten() / 2
39 | return(aum, l2, l1)
40 |
41 |
42 | def do_one_investor_similarity(df):
43 | [betas, mgr_keys, permno_keys] = matlab_sparse(
44 | df.mgrno, df.permno, df.beta)
45 | # Market portfolio weights
46 | (aum, l2, l1) = investor_helper(betas)
47 | out_df = pd.DataFrame({'mgrno': mgr_keys.astype(int),
48 | 'aum_weight': aum,
49 | 'l2_similarity': l2,
50 | 'l1_similarity': l1,
51 | 'cov_aum_l1': np.cov(l1,
52 | aum)[1][0]})
53 | out_df['quarter'] = df.quarter.iloc[0]
54 | return out_df
55 |
56 |
57 | def do_one_firm_similarity(df):
58 | [betas, mgr_keys, permno_keys] = matlab_sparse(
59 | df.mgrno, df.permno, df.beta)
60 | (aum, l2, l1) = investor_helper(betas)
61 |
62 | norm_l2 = (l2 @ (betas / betas.sum(0)))
63 | norm_l1 = (l1 @ (betas / betas.sum(0)))
64 | nonnorm_l2 = (l2 @ betas)
65 | nonnorm_l1 = (l1 @ betas)
66 |
67 | out_df = pd.DataFrame({'permno': permno_keys.astype(int),
68 | 'normalized_l1': norm_l1,
69 | 'nonnormalized_l1': nonnorm_l1,
70 | 'normalized_l2': norm_l2,
71 | 'nonnormalized_l2': nonnorm_l2})
72 | out_df['quarter'] = df.quarter.iloc[0]
73 | return out_df
74 |
--------------------------------------------------------------------------------
/code/kappas.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from utilities.matlab_util import matlab_sparse
4 | from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances
5 |
6 |
7 | def fix_scrape_cols(df):
8 | # fix the names in the scraped data
9 | df = df.set_index(['from', 'to', 'quarter'])
10 | df.columns = ['s' + x for x in df.columns]
11 | return df.reset_index()
12 |
13 |
14 | def process_beta(fn):
15 | df = pd.read_parquet(fn)
16 | df['mgrno'] = df['mgrno'].astype(int)
17 | return df[(df.permno_drop == False) & (
18 | df.sharecode_drop == False) & (df.beta < 0.5)]
19 |
20 | # This is the main function
21 | def beta_to_kappa(df):
22 | df = df[(df.quarter >= '1980-01-01')]
23 |
24 | df.loc[df.price < 0, 'price'] = 0
25 | df['mkt_cap'] = df['shares_outstanding'] * df['price']
26 | df_m = df.groupby(['permno', 'quarter'])['mkt_cap'].median()
27 |
28 | total_df = df.groupby(['quarter']).apply(do_one_period)
29 | total_df3 = df[(df.quarter >= '1999-01-01')].groupby(['quarter']).apply(do_one_robustness)
30 |
31 | # merge and clean up missings
32 | total_df = pd.merge(
33 | total_df, total_df3, on=[
34 | 'quarter', 'from', 'to'], how='left')
35 | total_df[['kappa',
36 | 'kappa_CLWY',
37 | 'kappa_pow2',
38 | 'kappa_pow3',
39 | 'kappa_sqrt',
40 | 'cosine',
41 | 'kappa_sole',
42 | 'kappa_soleshared']] = total_df[['kappa',
43 | 'kappa_CLWY',
44 | 'kappa_pow2',
45 | 'kappa_pow3',
46 | 'kappa_sqrt',
47 | 'cosine',
48 | 'kappa_sole',
49 | 'kappa_soleshared']].fillna(0)
50 |
51 | # Add the market cap
52 | total_df = pd.merge(pd.merge(total_df,
53 | df_m, left_on=['from', 'quarter'], right_on=['permno', 'quarter']),
54 | df_m, left_on=['to', 'quarter'], right_on=['permno', 'quarter']
55 | ).rename(columns={'mkt_cap_x': 'mkt_cap_from', 'mkt_cap_y': 'mkt_cap_to'}).reset_index()
56 |
57 | return total_df
58 |
59 |
60 | def do_one_robustness(df):
61 | [betas_soleshared, mgr_keys, permno_keys] = matlab_sparse(
62 | df.mgrno, df.permno, df.beta_soleshared, compress=False)
63 | [betas_sole, mgr_keys, permno_keys] = matlab_sparse(
64 | df.mgrno, df.permno, df.beta_sole, compress=False)
65 |
66 | [betas, mgr_keys, permno_keys] = matlab_sparse(
67 | df.mgrno, df.permno, df.beta, compress=False)
68 |
69 | kappa_sole = raw_kappa(betas, betas_sole)
70 | kappa_soleshared = raw_kappa(betas, betas_soleshared)
71 | kappa_all = raw_kappa(betas, betas)
72 | # kappa_drop=raw_kappa(betas_drop,betas_drop)
73 |
74 | idx = kappa_all.nonzero()
75 | return pd.DataFrame({'from': permno_keys[idx[0]], 'to': permno_keys[idx[1]], 'kappa_all': kappa_all[idx].flatten(),
76 | 'kappa_sole': kappa_sole[idx].flatten(), 'kappa_soleshared': kappa_soleshared[idx].flatten()})
77 |
78 |
79 | def beta_to_kappa_merger_breakup(df):
80 | return df.groupby(['quarter']).apply(do_one_merger_breakup).reset_index(drop=True)
81 |
82 |
83 | def do_one_merger_breakup(df2):
84 | # breakup in three blocks
85 | blockA = df2.loc[~df2['InvestorName'].isnull(), [
86 | 'mgrno', 'permno', 'beta']]
87 | blockB = df2.loc[df2['InvestorName'].isnull(), ['mgrno', 'permno', 'beta']]
88 | blockA.beta = 0.5 * blockA.beta
89 | blockC = blockA.copy()
90 | blockC.mgrno = -blockC.mgrno
91 | df3 = pd.concat([blockA, blockB, blockC], axis=0, ignore_index=True)
92 |
93 | # first do the regular case
94 | [betas, mgr_keys, permno_keys] = matlab_sparse(
95 | df2.mgrno, df2.permno, df2.beta)
96 | k1 = calc_kappa(betas)
97 |
98 | # now do the breakup case using the augmented data
99 | [betas_b, mgr_keys_b, permno_keys_b] = matlab_sparse(
100 | df3.mgrno, df3.permno, df3.beta)
101 | k2 = calc_kappa(betas_b)
102 |
103 | df4 = df2.groupby(['mgrno_merger', 'permno']).sum().reset_index()
104 | # finally do the merger using the merger mgrno's instead of the real ones
105 | [betas_m, mgr_keys_m, permno_keys_m] = matlab_sparse(
106 | df4.mgrno_merger, df4.permno, df4.beta)
107 | k3 = calc_kappa(betas_m)
108 |
109 | # Ignore BlackRock+Vanguard
110 | df4 = df2[~(df2['InvestorName'].isin(['BlackRock', 'Vanguard']))]
111 | [betas_drop, mgr_keys_drop, permno_keys_drop] = matlab_sparse(
112 | df4.mgrno, df4.permno, df4.beta, compress=False)
113 | k4 = calc_kappa(betas_drop)
114 |
115 | # put it all together and return
116 | idx = k1.nonzero()
117 | out_df = pd.DataFrame({'from': permno_keys[idx[0]], 'to': permno_keys[idx[1]], 'kappa': k1[idx].flatten(),
118 | 'kappa_breakup': k2[idx].flatten(), 'kappa_merger': k3[idx].flatten(), 'kappa_drop': k4[idx].flatten()})
119 | out_df['quarter'] = df2.quarter.iloc[0]
120 | return out_df
121 |
122 | # handler for L2 Measures (Rotemberg Weights, CLWY Weights, etc.)
123 | # input: long dataframe of Manager, Firm, Beta_fs
124 | # Output: long dataframe of Quarter, Firm_from, Firm_to, kappa_fg, ihhi_f,
125 | # ihhi_g, cosine_fg
126 |
127 |
128 | def do_one_period(df):
129 | [betas, mgr_keys, permno_keys] = matlab_sparse(
130 | df.mgrno, df.permno, df.beta)
131 | kappa = calc_kappa(betas)
132 | kappa2 = calc_kappa(betas, 2)
133 | kappa3 = calc_kappa(betas, 3)
134 | kappa4 = calc_kappa(betas, 0.5)
135 | kappa5 = calc_kappa(betas, 'CLWY')
136 | cosine = cosine_similarity(betas.transpose())
137 | # this is a bit slow
138 | l1_measure = calc_l1_measure(betas)
139 |
140 | idx = kappa.nonzero()
141 | return pd.DataFrame({'from': permno_keys[idx[0]], 'to': permno_keys[idx[1]], 'kappa': kappa[idx].flatten(),
142 | 'kappa_pow2': kappa2[idx].flatten(), 'kappa_pow3': kappa3[idx].flatten(), 'kappa_sqrt': kappa4[idx].flatten(),
143 | 'kappa_CLWY': kappa5[idx].flatten(), 'cosine': cosine[idx].flatten(), 'l1_measure': l1_measure[idx].flatten()})
144 |
145 | # This does the work for L1 measure
146 | # Input beta: S x F matrix
147 | # Output L1: F x F matrix
148 | # Subtract beta_f from each column of beta and sum of absolute deviations,
149 | # stack for L1.
150 | def calc_l1_measure(betas):
151 | y = manhattan_distances(betas.transpose())
152 | tot = betas.sum(axis=0)
153 | return (-y + tot[np.newaxis, :] + tot[:, np.newaxis]) / 2
154 |
155 | # Calculate Summary Stats of Control Weights
156 | # Compute Convex Power Gamma:
157 | # CHHI: Control HHI
158 | # IHHI: Investor HHI
159 | # Retail Share
160 | #
161 | # This is the main function that takes a DF of betas and calculates all of
162 | # the CHHI measures
163 |
164 | def calc_chhis(df):
165 | # apply to multiple groups here
166 | df['inv_total'] = df.groupby(['mgrno', 'quarter'])['beta'].transform(sum)
167 | y = df[['permno', 'quarter', 'beta', 'inv_total']].groupby(
168 | ['permno', 'quarter']).apply(agg_chhi)
169 | x = df.groupby(['permno', 'quarter']).agg(
170 | {'shares_outstanding': np.max, 'price': np.median})
171 | return pd.merge(x, y, left_index=True, right_index=True, how='outer')
172 |
173 | # this is unitary function that takes in a vector Beta_f that is S x 1
174 | def chhi(beta, power):
175 | gamma = (beta**power)
176 | # scalar adjustment factor
177 | adj = 10000 * ((beta.sum() / gamma.sum())**2)
178 | return (gamma**2).sum() * adj
179 |
180 | # This calculates all of the CHHI measures and returns a (horizontal) series
181 | def agg_chhi(x):
182 | out = [chhi(x['beta'], a) for a in [0.5, 1, 2, 3, 4]]
183 | tmp = x['beta'] / x['inv_total']
184 | clwy = chhi(tmp, 1)
185 | clwy_alt = 10000 * (tmp**2).sum()
186 |
187 | names = {
188 | 'retail_share': 1 - x['beta'].sum(),
189 | 'chhi_05': out[0],
190 | 'ihhi': out[1],
191 | 'chhi_2': out[2],
192 | 'chhi_3': out[3],
193 | 'chhi_4': out[4],
194 | 'chhi_clwy': clwy,
195 | 'chhi_clwy2': clwy_alt
196 | }
197 | return pd.Series(names, index=['retail_share', 'ihhi', 'chhi_05',
198 | 'chhi_2', 'chhi_3', 'chhi_4', 'chhi_clwy', 'chhi_clwy2'])
199 |
200 |
201 | # This calculates profit weights
202 | #
203 | # Input beta: S x F matrix
204 | # Output kappa: F x F matrix
205 | # Options: Gamma 'CLWY', 'default' (Rotemberg), numeric: convexity
206 | # parameter "a" for gamma=beta^a
207 | def calc_kappa(betas, gamma_type='default'):
208 | # CLWY normalize the gammas
209 | if gamma_type == 'CLWY':
210 | gamma = betas / np.maximum(betas.sum(axis=1), 1e-10)[:, None]
211 | elif isinstance(gamma_type, (int, float)):
212 | if gamma_type > 0:
213 | tmp = betas**(gamma_type)
214 | gamma = tmp # *(betas.sum(axis=0)/tmp.sum(axis=0))
215 | else:
216 | print("Must provide Positive Parameter")
217 | # proportional control: do we normalize to sum to one?
218 | else:
219 | gamma = betas # /betas.sum(axis=0)
220 |
221 | return raw_kappa(betas, gamma)
222 |
223 |
224 | # This is the ratio of inner products for kappas: kappa[f, g] = (gamma_f . beta_g) / (gamma_f . beta_f)
225 | def raw_kappa(betas, gamma):
226 | # F x F matrix
227 | numer = gamma.T @ betas
228 | # F x 1 vector
229 | denom = np.diag(numer)
230 | # this is an F x F matrix
231 | return numer / denom[:, None]
232 |
--------------------------------------------------------------------------------
/code/our_plot_config.py:
--------------------------------------------------------------------------------
1 | # For files and paths
2 | import pathlib
3 | import os
4 |
5 |
6 | # File Directories
7 | # cc modified to parent
8 | proj_dir = pathlib.Path.cwd().parent
9 | data_dir = proj_dir / 'data'
10 | raw_dir = data_dir / 'public'
11 | wrds_dir = data_dir / 'wrds'
12 | checks_dir = data_dir / 'checks'
13 | derived_dir = data_dir / 'derived'
14 |
15 | fig_dir = proj_dir / 'figures'
16 | tab_dir = proj_dir / 'tables'
17 |
18 |
19 | # For plotting
20 | #import matplotlib
21 | #import matplotlib.pyplot as plt
22 | #from cycler import cycler
23 | #import seaborn as sns
24 |
25 | # Plot Configuration
26 | def setplotstyle():
27 | from cycler import cycler
28 | import seaborn as sns
29 | import matplotlib
30 | import matplotlib.pyplot as plt
31 | matplotlib.style.use('seaborn-whitegrid')
32 |
33 | matplotlib.rcParams.update({'font.size': 24})
34 | plt.rc('font', size=24) # controls default text sizes
35 | plt.rc('axes', titlesize=24) # fontsize of the axes title
36 | plt.rc('axes', labelsize=24) # fontsize of the x and y labels
37 | plt.rc('xtick', labelsize=24) # fontsize of the tick labels
38 | plt.rc('ytick', labelsize=24) # fontsize of the tick labels
39 | plt.rc('legend', fontsize=24) # legend fontsize
40 | plt.rc('figure', titlesize=24)
41 | plt.rc(
42 | 'axes',
43 | prop_cycle=cycler(
44 | color=[
45 | '#252525',
46 | '#636363',
47 | '#969696',
48 | '#bdbdbd']) *
49 | cycler(
50 | linestyle=[
51 | '-',
52 | ':',
53 | '--',
54 | '-.']))
55 | plt.rc('lines', linewidth=3)
56 |
--------------------------------------------------------------------------------
/code/plots10_kappa_comparison_appendix.py:
--------------------------------------------------------------------------------
1 | # %%
2 | from utilities.matlab_util import coalesce
3 | import pandas as pd
4 | import numpy as np
5 | import pathlib
6 |
7 | import matplotlib
8 | import matplotlib.pyplot as plt
9 | from our_plot_config import derived_dir, fig_dir, setplotstyle
10 |
11 | setplotstyle()
12 |
13 | # %%
14 |
15 |
16 | # Input file
17 | f_kappas = derived_dir / 'appendix_kappa_combined.parquet'
18 | f_firms = derived_dir / 'firm-information.parquet'
19 |
20 | # Figures
21 | f_profitweights_comp1 = fig_dir / 'appfigure_a3.pdf'
22 |
23 | # %%
24 | # ### Read in the (Cleaned) Parquet File
25 | # - Apply the $\kappa$ calculations period by period
26 | # - Save the output to a new parquet file
27 |
28 | total_df = pd.read_parquet(f_kappas)
29 | total_df['tunnel'] = (total_df['skappa'].combine_first(total_df['kappa']) > 1)
30 | total_df = total_df[total_df['from'] != total_df['to']]
31 | qtr_mean = total_df.groupby(['quarter']).mean()
32 |
35 | qtr_mean = qtr_mean[qtr_mean.index < '2019-01-01']
36 |
37 | # %%
38 |
39 |
40 | col_list = [
41 | 'l1_measure',
42 | 'kappa',
43 | 'kappa_pow2',
44 | 'kappa_pow3',
45 | 'kappa_sqrt',
46 | 'kappa_CLWY']
47 | qtr_mean = coalesce(qtr_mean, col_list, 's', method='left')
48 |
49 | # %%
50 |
51 | # ## Make the plots
52 | # ### Comparisons
53 | # - Compare TR Data (Solid) and Scraped 13-F Data (Dashed)
54 |
55 |
56 | qtr_mean[['kappa', 'skappa']].plot(
57 | figsize=(20, 10), style=['-', '--'], color=['navy', 'maroon'])
58 | plt.xlabel("")
59 | plt.ylabel(r"$\kappa$ weight")
60 | plt.legend([r'TR Data', 'Scraped Data', ])
61 | plt.ylim(0, 1.2)
62 | plt.savefig(f_profitweights_comp1, bbox_inches='tight')
63 |
--------------------------------------------------------------------------------
/code/plots11_profit_simulations.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import pandas as pd
3 | import numpy as np
4 | import pathlib
5 | import pyblp
6 | import matplotlib
7 | import matplotlib.pyplot as plt
8 |
9 | from our_plot_config import derived_dir, fig_dir, raw_dir, setplotstyle
10 |
11 | setplotstyle()
12 |
13 |
14 | pyblp.options.collinear_atol = pyblp.options.collinear_rtol = 0
15 | pyblp.options.verbose = False
16 |
17 | # jan and jan markups input
18 | f_jj_markups = raw_dir / 'DLE_markups_fig_v2.csv'
19 |
20 | # temp input
21 | f_quarter_mean = derived_dir / 'tmp-quarter-mean.pickle'
22 | f_markup_out = derived_dir / 'markup-simulations.csv'
23 |
24 | fig_markups = fig_dir / 'macro-simulated-markups.pdf'
25 | fig_markups_jj = fig_dir / 'figure10_markups.pdf'
26 | fig_profits = fig_dir / 'figure11_profits.pdf'
27 |
28 |
29 | def combine64(years, months=1, days=1, weeks=None, hours=None, minutes=None,
30 | seconds=None, milliseconds=None, microseconds=None, nanoseconds=None):
31 | years = np.asarray(years) - 1970
32 | months = np.asarray(months) - 1
33 | days = np.asarray(days) - 1
34 | types = ('