├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── depreciated ├── equity_db_pricing_portal.py └── equity_db_sector_portal.py ├── ntiles ├── __init__.py ├── backtest │ ├── __init__.py │ ├── ntile_kicker.py │ ├── periods.py │ ├── plotter.py │ ├── portals │ │ ├── __init__.py │ │ ├── base_portal.py │ │ ├── pricing_portal.py │ │ └── sector_portal.py │ ├── stats.py │ ├── tears │ │ ├── __init__.py │ │ ├── backtest_tear.py │ │ ├── base_tear.py │ │ ├── ic_tear.py │ │ ├── inspection_tear.py │ │ ├── tilts_backtest_tear.py │ │ └── turnover_tear.py │ └── utils.py ├── examples │ ├── ic_ac.png │ ├── inspection_1.png │ ├── inspection_2.png │ ├── return_1.png │ └── return_2.png ├── tests │ ├── __init__.py │ ├── constitute_adjustment_test.py │ └── ml_factor_calculation_test.py └── toolbox │ ├── __init__.py │ ├── constitutes │ ├── __init__.py │ └── constitute_adjustment.py │ ├── db │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ └── sql_connection.py │ ├── read │ │ ├── __init__.py │ │ ├── cached_query.py │ │ ├── db_functions.py │ │ ├── query_constructor.py │ │ └── universe.py │ ├── settings.py │ └── write │ │ ├── __init__.py │ │ ├── create_tables.py │ │ └── make_universes.py │ └── utils │ ├── __init__.py │ ├── date_config.py │ ├── format_data_alphalens.py │ ├── handle_data.py │ ├── ml_factor_calculation.py │ └── utils.py └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | .idea/ 29 | .DS_Store 30 | **/.DS_Store 31 | MANIFEST 32 | *.ipynb 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # celery beat schedule file 99 | celerybeat-schedule 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | test/ 132 | depreciated/ -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Alex DiCarlo 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ```python 2 | pip install ntiles 3 | ``` 4 | 5 | ### API 6 | ```python 7 | from ntiles import Ntile, PricingPortal, SectorPortal 8 | 9 | # getting the asset pricing data 10 | pricing_portal = PricingPortal(assets=my_universe, start='2017-01-01', end='2021-01-01') 11 | # getting the group data, this is optional 12 | group_portal = SectorPortal(assets=my_universe) 13 | 14 | # generating tearsheets 15 | tile = Ntile(pricing_portal=pricing_portal, group_portal=group_portal) 16 | tile.full_tear(factor=my_factor, ntiles=5, holding_period=20) 17 | ``` 18 | 19 | ### Example Tearsheet 20 | ![](ntiles/examples/inspection_1.png) 21 | ![](ntiles/examples/inspection_2.png) 22 | ![](ntiles/examples/return_1.png) 23 | ![](ntiles/examples/return_2.png) 24 | ![](ntiles/examples/ic_ac.png) -------------------------------------------------------------------------------- /depreciated/equity_db_pricing_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List, Optional 3 | 4 | import pandas as pd 5 | 6 | try: 7 | from equity_db import MongoAPI, ReadDB 8 | except ImportError: 9 | pass 10 | 11 | # from .ntiles.portals.base_portal import BaseDeltaPortal 12 | from .base_portal import BaseDeltaPortal 13 | """ 14 | No Longer used. See toolbox for the pricing portal. 15 | """ 16 | 17 | 18 | class PricingPortal(BaseDeltaPortal, ABC): 19 | """ 20 | Object to query and cache pricing data 21 | """ 22 | 23 | def __init__(self, assets: List[str], start: str, end: str, search_by: str = 'lpermno', 24 | pricing_field: str = 'prccd', adjustor_field: str = 'ajexdi', db: str = 'equity', 25 | collection: str = 'crsp', trading_calender='NYSE'): 26 | """ 27 | :param assets: The assets to get data for 28 | :param start: start of period to query 29 | :param end: end of period to query 30 | :param pricing_field: what field to use for pricing data 31 | :param adjustor_field: What field to use for adjusting the pricing data 32 | :param db: the data base to use 33 | :param collection: the _collection to query 34 | :param trading_calender: the trading calendar to use to verify dates 35 | """ 36 | super().__init__(assets, pd.Period(start, 'D'), pd.Period(end, 'D')) 37 | 38 | self._pricing_field = pricing_field 39 | 40 | self._adjusted_pricing: Optional[pd.DataFrame] = None 41 | self._period_delta: Optional[pd.DataFrame] = None 42 | self._query_adjusted_pricing(db, collection, assets, start, end, search_by, pricing_field, adjustor_field, 43 | trading_calender) 44 | 45 | @property 46 | def delta_data(self): 47 | """ 48 | :return: unstacked daily asset returns 49 | col: _asset_id; index: pd.period; values: daily asset returns 50 | """ 51 | if self._period_delta is None: 52 | self._period_delta = self.raw_data.unstack().pct_change(1).iloc[1:].fillna(0) 53 | 54 | return self._period_delta 55 | 56 | @property 57 | def raw_data(self) -> pd.Series: 58 | """ 59 | adjustments to AssetQuery: 60 | 1) Turns date column from pd.Timestamp into pd.Period 61 | 2) Turns the lpermno into an int 62 | 3) Adjusts the pricing_field: pricing_field / adjustor_field 63 | :return: Series of the adjusted pricing data indexed by date, lpermno 64 | """ 65 | if self._adjusted_pricing is None: 66 | raise ValueError('adjusted pricing is not set') 67 | 68 | return self._adjusted_pricing[self._pricing_field] 
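    # --- Illustrative sketch (not part of the original module) -----------------
    # The adjustment documented in `raw_data` boils down to three pandas steps.
    # A minimal, self-contained example with made-up values; the field names
    # prccd / ajexdi / lpermno simply mirror the defaults used in __init__:
    #
    #   import pandas as pd
    #   px = pd.DataFrame({'date': pd.to_datetime(['2021-01-04', '2021-01-05']),
    #                      'lpermno': ['14593', '14593'],
    #                      'prccd': [200.0, 101.0],
    #                      'ajexdi': [2.0, 1.0]})
    #   px['prccd'] = px['prccd'] / px['ajexdi']           # 1) split/dividend adjustment
    #   px['date'] = px['date'].dt.to_period(freq='D')     # 2) pd.Timestamp -> pd.Period
    #   px['id'] = px['lpermno'].astype(int)               # 3) permno string -> int id
    #   adjusted = px.set_index(['date', 'id'])['prccd']   # same shape as raw_data's output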
69 | 70 | @property 71 | def assets(self) -> List[int]: 72 | """ 73 | casting to int due to _db problem must fix 74 | :return: The id's of assets we have pricing data for 75 | """ 76 | return self._adjusted_pricing.index.get_level_values('id').astype(int).unique().tolist() 77 | 78 | @property 79 | def periods(self) -> List[pd.Period]: 80 | """ 81 | :return: the unique periods for which we have pricing data 82 | """ 83 | return self._adjusted_pricing.index.get_level_values('date').unique().tolist() 84 | 85 | def _query_adjusted_pricing(self, db, collection, assets, start, end, search_by, pricing_field, adjustor_field, 86 | trading_calender) -> None: 87 | """ 88 | Makes query the pricing data 89 | Performs adjustments defined in self.daily_pricing 90 | Then caches the adjusted pricing in self._adjusted_pricing 91 | self._adjusted_pricing columns: self._pricing_field, self._adjustor_field, Index: date, lpermno 92 | :return: None, mutates self._adjusted_pricing to contain adjusted pricing 93 | """ 94 | # querying pricing 95 | reader = ReadDB(MongoAPI(db, collection)) 96 | query_df = reader.get_asset_data(assets, search_by=search_by, start=pd.Timestamp(start), end=pd.Timestamp(end), 97 | fields=[pricing_field, adjustor_field]) 98 | pricing_df = query_df.set_calendar(trading_calender).df.reset_index() 99 | 100 | # possibly send command to close mongo to free up memory 101 | 102 | # adjusting data frame 103 | pricing_df[pricing_field] = pricing_df[pricing_field] / pricing_df[adjustor_field] 104 | pricing_df['date'] = pricing_df['date'].dt.to_period(freq='D') 105 | 106 | # currently code is requiring lpermno input wont work with tickers need to fix _db 107 | pricing_df['id'] = pricing_df['lpermno'].astype(int) 108 | pricing_df = pricing_df.set_index(['date', 'id']) 109 | 110 | self._adjusted_pricing = pricing_df 111 | 112 | self._query_summary(assets) # this can be cleaner 113 | 114 | def _query_summary(self, assets): 115 | """ 116 | prints a summary of query and tells you what id's were not able to be found in the query 117 | :return: None 118 | """ 119 | query_assets = self._adjusted_pricing.index.get_level_values(1).astype(str).unique().tolist() 120 | not_found_assets = set(assets) - set(query_assets) 121 | if len(not_found_assets) == 0: 122 | print('All assets retrieved in query!') 123 | else: 124 | print(f'Unable to find {len(not_found_assets)} assets: {not_found_assets}') 125 | -------------------------------------------------------------------------------- /depreciated/equity_db_sector_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List 3 | 4 | import pandas as pd 5 | 6 | try: 7 | from equity_db import MongoAPI, ReadDB 8 | except ImportError: 9 | pass 10 | 11 | from ntiles.portals.base_portal import BaseGrouperPortalConstant 12 | 13 | 14 | class SectorPortal(BaseGrouperPortalConstant, ABC): 15 | def __init__(self, passed_assets: List[str], asset_id: str = 'lpermno', db: str = 'equity', 16 | collection: str = 'crsp'): 17 | """ 18 | :param asset_id: the assets to get the sector data for 19 | :param asset_id: what is the id of the asset, must be recognised by equity_db 20 | :param db: name of the db 21 | :param collection: name of the collection 22 | """ 23 | super().__init__(passed_assets, 'GIC Sector') 24 | self._passed_assets = passed_assets 25 | self._asset_id = asset_id 26 | self._db = db 27 | self._collection = collection 28 | 29 | self._sectors = None 30 | self._set_sectors() 31 | 32 | @property 
33 | def group_information(self) -> pd.Series: 34 | """ 35 | gets the gic _sectors for the give assets 36 | :return: DataFrame of GIC _sectors for the given assets 37 | """ 38 | if self._sectors is not None: 39 | return self._sectors 40 | 41 | self._set_sectors() 42 | return self._sectors 43 | 44 | @property 45 | def group_mapping(self): 46 | """ 47 | :return: dict mapping for the group 48 | """ 49 | return self.group_information.to_dict() 50 | 51 | def _set_sectors(self) -> None: 52 | """ 53 | Sets the _sectors in the class 54 | :return: None 55 | """ 56 | reader = ReadDB(MongoAPI(db=self._db, collection=self._collection)) 57 | query = reader.get_asset_data(self._passed_assets, search_by=self._asset_id, fields=['gsector']) 58 | self._sectors = query.df['gsector'] 59 | self._sectors.index = self._sectors.index.astype(str) 60 | 61 | @property 62 | def assets(self) -> List[int]: 63 | return self._sectors.reset_index().lpermno.astype(int).tolist() 64 | -------------------------------------------------------------------------------- /ntiles/__init__.py: -------------------------------------------------------------------------------- 1 | from ntiles import toolbox 2 | from ntiles import backtest 3 | -------------------------------------------------------------------------------- /ntiles/backtest/__init__.py: -------------------------------------------------------------------------------- 1 | from .ntile_kicker import Ntile 2 | from .portals.pricing_portal import PricingPortal 3 | from .portals.sector_portal import SectorPortal 4 | 5 | __all__ = [ 6 | 'Ntile', 7 | 'PricingPortal', 8 | 'SectorPortal', 9 | ] 10 | -------------------------------------------------------------------------------- /ntiles/backtest/ntile_kicker.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Dict, Iterable, Optional 3 | 4 | import pandas as pd 5 | import duckdb 6 | 7 | from .portals.base_portal import BaseGrouperPortalConstant 8 | from .portals.pricing_portal import PricingPortal 9 | from .tears.base_tear import BaseTear 10 | from .tears.ic_tear import ICHorizonTear, ICTear 11 | from .tears.inspection_tear import InspectionTear 12 | from .tears.tilts_backtest_tear import TiltsBacktestTear 13 | from .tears.turnover_tear import TurnoverTear 14 | 15 | 16 | class Ntile: 17 | def __init__(self, pricing_portal: PricingPortal, group_portal: Optional[BaseGrouperPortalConstant] = None): 18 | """ 19 | :param pricing_portal: the pricing portal which holds pricing data for all assets with factor values 20 | :param group_portal: group portal which holds grouping information for all assets with factor values 21 | if this is None then no group statistics will be calculated 22 | """ 23 | self._pricing_portal: PricingPortal = pricing_portal 24 | self._group_portal = group_portal 25 | 26 | self._factor_data = None 27 | self._ntile_matrix = None 28 | self._formatted_returns = None 29 | 30 | def _input_checks(self, factor_series) -> None: 31 | """ 32 | checks the factor series to ensure it meet requirements to run a tearsheet 33 | 34 | Requirements: 35 | 1) series must have MultiIndex with 2 levels 36 | 2) First level must be of type pd.Period 37 | 3) PricingPortal must have data for all Period dates in the series 38 | 4) There can only be one observations for a single asset on a single day 39 | 4) The factor and pricing have to have the same freq 40 | 41 | :param factor_series: the series we are checking 42 | :return: None 43 | :raise ValueError: if one of the 
requirements are not met 44 | """ 45 | 46 | # checking for series with multi index, possibly also check types for multi index 47 | if not isinstance(factor_series.index, pd.MultiIndex) or factor_series.index.nlevels != 2: 48 | raise ValueError('Factor input must have MultiIndex of period, id') 49 | 50 | # ensure the index level zero is date 51 | if not isinstance(factor_series.index.get_level_values(0), pd.PeriodIndex): 52 | raise ValueError('Factor input must have MultiIndex with the first level being a period ' 53 | f'current factor dtype is {type(factor_series.index.get_level_values(0))}') 54 | 55 | # we will check id when looking for overlapping portal names 56 | no_pricing_for = set(factor_series.index.get_level_values(1)).difference( 57 | self._pricing_portal.assets) 58 | if len(no_pricing_for) != 0: 59 | # raise ValueError(f'PricingPortal does not have data for: {no_pricing_for}') 60 | warnings.warn(f'PricingPortal does not have data for: {no_pricing_for}') 61 | 62 | # make sure pricing portal dates match up with factor 63 | overlapping_periods = set(factor_series.index.get_level_values(0).drop_duplicates()).intersection( 64 | self._pricing_portal.periods) 65 | if len(overlapping_periods) == 0: 66 | raise ValueError('No overlap between PricingPortal dates and factor dates') 67 | if len(overlapping_periods) < 100: 68 | warnings.warn(f'Only {len(overlapping_periods)} common periods between PricingPortal and factor') 69 | 70 | # check for multiple observations on a single day for a single asset 71 | if factor_series.index.duplicated().any(): 72 | raise ValueError('Multiple factor observations on single day for a single asset') 73 | 74 | # check the pricing and factor freq are the same 75 | if factor_series.index.get_level_values('date').freq != self._pricing_portal.delta_data.index.freq: 76 | raise ValueError('Factor and pricing dont have the same freq!') 77 | 78 | def _set_ntiles_and_returns(self, factor_data: pd.Series, ntiles: int): 79 | """ 80 | Sets self._formatted_returns and self._formatted_ntile 81 | :param factor_data: the factor data 82 | :param ntiles: amount of ntiles 83 | :return: None 84 | """ 85 | self._ntile_factor_sql(factor_data, ntiles) 86 | self._align_ntiles_pricing() 87 | 88 | # can see what % of the dataframe is null here 89 | self._make_null_summary(factor_data) 90 | 91 | def _align_ntiles_pricing(self) -> None: 92 | """ 93 | ensures ntiled matrix and daily returns matrix have the same column and row order 94 | sets self._formatted_returns and self._ntile_matrix 95 | :return: None 96 | """ 97 | ntile_factor = self._factor_data['ntile'].unstack() 98 | daily_returns = self._pricing_portal.delta_data 99 | 100 | factor_date = ntile_factor.index.get_level_values('date') 101 | self._formatted_returns = daily_returns[(daily_returns.index >= factor_date.min()) & 102 | (daily_returns.index <= factor_date.max())] 103 | 104 | # reindexing the ntiles data so that you have pricing and ntiles matching up 105 | self._ntile_matrix = ntile_factor.reindex_like(self._formatted_returns) 106 | 107 | def _make_null_summary(self, raw_factor_data) -> None: 108 | """ 109 | making a summary of how much factor data we matched to pricing data 110 | :param raw_factor_data: the raw unstacked factor data 111 | """ 112 | length_og_factor_data = len(raw_factor_data) 113 | # seeing what % of factor data is missing 114 | num_na_data_points = raw_factor_data.isnull().sum() 115 | pct_na_data_points = num_na_data_points / length_og_factor_data 116 | 117 | # amount of data droped because of non 
aligned factor and returns dates: 118 | # above should be non null length of ntiles before reindexing 119 | # non null length of ntiles after indexing 120 | number_of_finite_ntiles = length_og_factor_data - num_na_data_points 121 | binary_if_ntile_data = self._ntile_matrix.notnull() 122 | number_of_finite_ntiles_no_overlap_returns = number_of_finite_ntiles - binary_if_ntile_data.sum().sum() 123 | pct_missing_ntile_no_overlap = number_of_finite_ntiles_no_overlap_returns / number_of_finite_ntiles 124 | 125 | # amount of data we dont have returns for given we have overlapping pricing and factor 126 | # should ffill ntile by holdign period since we need return data holding_period days out 127 | binary_if_return_data = self._formatted_returns.notnull() 128 | # should forward fill by holding period to make sure we have pricing for when we will be holding the stock 129 | missing_from_no_returns_given_overlap = (number_of_finite_ntiles 130 | - (binary_if_ntile_data * binary_if_return_data).sum().sum()) 131 | pct_missing_data_no_returns_given_overlap = missing_from_no_returns_given_overlap / number_of_finite_ntiles 132 | 133 | # total number of unusable factor data points due to null or no maped returns 134 | num_bad = (num_na_data_points 135 | + number_of_finite_ntiles_no_overlap_returns 136 | + missing_from_no_returns_given_overlap 137 | ) 138 | 139 | pct_bad = num_bad / length_og_factor_data 140 | 141 | print(f"Unusable Factor Data: {(round(pct_bad, 4)) * 100}%") 142 | print(f"NA Factor Values: {(round(pct_na_data_points, 4)) * 100}%") 143 | print(f"No Overlapping Returns: {(round(pct_missing_ntile_no_overlap, 4)) * 100}%") 144 | print(f"Missing Returns Given Overlap: {(round(pct_missing_data_no_returns_given_overlap, 4)) * 100}%") 145 | 146 | def _ntile_factor(self, factor: pd.Series, ntiles: int) -> None: 147 | """ 148 | This is slow replaced by 149 | Universe relative Quantiles of a factor by day _ntile_factor_sql 150 | 151 | pd.DataFrame of ntiled factor 152 | index: (pd.Period, _asset_id) 153 | Columns: (factor, ntile) 154 | Values: (factor value, Ntile corresponding to factor value) 155 | 156 | :param factor: same var as ntile_return_tearsheet 157 | :param ntiles: same var as ntile_return_tearsheet 158 | """ 159 | # add a filter for if a day has less than 20% factor data then just put bin as -1 for all assets 160 | # unstack the frame, percentile rank each row, divide whole matrix buy 1/ntiles, take the floor of every number 161 | factor = factor[~factor.isnull()].to_frame('factor') 162 | 163 | try: 164 | factor['ntile'] = factor.groupby('date').transform( 165 | lambda date_data: ntiles - pd.qcut(date_data, ntiles, labels=False) 166 | ).sort_index() 167 | except Exception as e: 168 | print('Hit error while binning data. Need to push the histogram') 169 | print('Your data is mighty sus we can\'t Ntile it. 
This is normally due to bad data') 170 | 171 | # forcing a histogram out 172 | import matplotlib.pyplot as plt 173 | factor.groupby('date').count().plot() 174 | plt.show() 175 | 176 | raise e 177 | 178 | self._factor_data = factor 179 | 180 | def _ntile_factor_sql(self, factor: pd.Series, ntiles: int) -> None: 181 | """ 182 | Universe relative Quantiles of a factor by day 183 | Around 100X faster than pandas groupby qcut 184 | 185 | pd.DataFrame of ntiled factor 186 | index: (pd.Period, _asset_id) 187 | Columns: (factor, ntile) 188 | Values: (factor value, Ntile corresponding to factor value) 189 | 190 | :param factor: same var as ntile_return_tearsheet 191 | :param ntiles: same var as ntile_return_tearsheet 192 | """ 193 | factor_freq = factor.index.get_level_values('date').freq 194 | factor = factor.to_frame('factor').reset_index() 195 | factor['date'] = factor['date'].dt.to_timestamp() 196 | 197 | sql_quantile = f"""SELECT *, NTILE({ntiles}) OVER(PARTITION BY date ORDER BY factor.factor DESC) as ntile 198 | FROM factor 199 | WHERE factor.factor IS NOT NULL""" 200 | con = duckdb.connect(':memory:') 201 | factor = con.execute(sql_quantile).df() 202 | factor['date'] = factor['date'].dt.to_period(freq=factor_freq) 203 | factor = factor.set_index(['date', 'id']) 204 | 205 | self._factor_data = factor 206 | 207 | # 208 | # Start up methods 209 | # 210 | def _prep_for_run(self, factor: pd.Series, ntiles: int) -> None: 211 | """ 212 | prepares the ntiles class to run a tear sheet 213 | :param factor: factor for tear sheet 214 | :param ntiles: num ntiles for sheet 215 | :return: None 216 | """ 217 | # checking to see if we have series or data frame 218 | if isinstance(factor, pd.DataFrame): 219 | if factor.shape[1] > 1: # there is a df passed with multible columns 220 | raise ValueError('There are multiple columns in the passed DataFrame') 221 | 222 | factor_series = factor.iloc[:, 0] 223 | else: 224 | factor_series = factor.copy() 225 | 226 | self._input_checks(factor_series) 227 | 228 | factor_series.index.names = ['date', 'id'] 229 | self.kick_tears(factor_series, ntiles) 230 | 231 | self._print_start_end_dates() 232 | 233 | def _print_start_end_dates(self): 234 | """ 235 | prints the start and end date of the backtest 236 | """ 237 | date = self._factor_data.index.get_level_values(0) 238 | print(f'\nStart Date: {date.min()}') 239 | print(f'End Date: {date.max()}\n') 240 | 241 | def kick_tears(self, factor_series: pd.Series, ntiles: int) -> None: 242 | """ 243 | Clears the object of all factor and tear data. 
244 | Reruns Ntiling of factor 245 | :param factor_series: the user passed factor 246 | :param ntiles: the number of ntiles 247 | :return: None 248 | """ 249 | self._clear() 250 | self._set_ntiles_and_returns(factor_series, ntiles) 251 | 252 | def _clear(self) -> None: 253 | """ 254 | clears all data points in the object except the pricing portal 255 | :return: None 256 | """ 257 | self._factor_data = None 258 | self._ntile_matrix = None 259 | self._formatted_returns = None 260 | 261 | @staticmethod 262 | def _run(tears: Dict[str, BaseTear]) -> None: 263 | """ 264 | Runs all tear sheets that are set in the class 265 | :return: None 266 | """ 267 | for tear in tears.values(): 268 | tear.compute_plot() 269 | 270 | # 271 | # Tear Sheets Below 272 | # 273 | def full_tear(self, factor: pd.Series, ntiles: int, holding_period: int, long_short: bool = True, 274 | market_neutral=True, show_uni=False, show_ntile_tilts=False) -> Dict[str, BaseTear]: 275 | """ 276 | Creates basic visualizations of the factor data distribution by ntile and how complete the data is 277 | Creates a fan chart of cumulative returns for the given factor values. 278 | Creates a IC time series for the factor value and the forward returns 279 | Createa a turnover sheet showing how often the factor data will turn over 280 | 281 | The in the cumulative return plot, each value represents the cumulative return up to that days close. 282 | Returns are not shifted each value represents portfolios value on the close of that day. 283 | 284 | A set of weights is generated for each day based off factor quantile. 285 | The portfolio is rebalanced daily, each days 1/holding_period of the portfolio is rebalanced. 286 | All positions are equally weighted. 287 | 288 | :param factor: The factor values being tested. 289 | index: (pd.Period, _asset_id) 290 | values: (factor_value) 291 | :param holding_period: How long we want to hold positions for, represents days 292 | :param ntiles: amount of bins we are testing (1 is high factor value n is low value) 293 | :param long_short: show we compute the spread between ntiles: (1 - n) 294 | :param market_neutral: subtract out the universe returns from the ntile returns? 295 | :return: plots showing the return profile of the factor 296 | :param show_uni: Should universe return be shown in the spread plot? 297 | :param show_ntile_tilts: should we show each ntiles tilts? 298 | """ 299 | self._prep_for_run(factor, ntiles) 300 | tears = {'inspection_tear': InspectionTear(factor_data=self._factor_data), 301 | 'backtest_tear': TiltsBacktestTear(ntile_matrix=self._ntile_matrix, 302 | daily_returns=self._formatted_returns, ntiles=ntiles, 303 | holding_period=holding_period, long_short=long_short, 304 | market_neutral=market_neutral, 305 | show_uni=show_uni, factor_data=self._factor_data, 306 | group_portal=self._group_portal, 307 | show_ntile_tilts=show_ntile_tilts), 308 | 'ic_tear': ICTear(factor_data=self._factor_data, daily_returns=self._formatted_returns, 309 | holding_period=holding_period), 310 | 'turnover_tear': TurnoverTear(factor_data=self._factor_data, holding_period=holding_period)} 311 | self._run(tears) 312 | return tears 313 | 314 | def ntile_backtest_tear(self, factor: pd.Series, ntiles: int, holding_period: int, long_short: bool = True, 315 | market_neutral=True, show_uni=False, show_ntile_tilts=False) -> Dict[str, BaseTear]: 316 | """ 317 | Creates a fan chart of cumulative returns for the given factor values. 
318 | The factor values are ntile'd into ntiles number of bins 319 | 320 | The in the cumulative return plot, each value represents the cumulative return up to that days close. 321 | Returns are not shifted each value represents portfolios value on the close of that day. 322 | 323 | A set of weights is generated for each day based off factor quantile. 324 | The portfolio is rebalanced daily, each days 1/holding_period of the portfolio is rebalanced. 325 | All positions are equally weighted. 326 | 327 | :param factor: The factor values being tested. 328 | index: (pd.Period, _asset_id) 329 | values: (factor_value) 330 | :param holding_period: How long we want to hold positions for, represents days 331 | :param ntiles: amount of bins we are testing (1 is high factor value n is low value) 332 | :param long_short: show we compute the spread between ntiles: (1 - n) 333 | :param market_neutral: subtract out the universe returns from the ntile returns? 334 | :return: plots showing the return profile of the factor 335 | :param show_uni: Should universe return be shown in the spread plot? 336 | :param show_ntile_tilts: should we show each ntiles tilts? 337 | """ 338 | self._prep_for_run(factor, ntiles) 339 | tears = {'backtest_tear': 340 | TiltsBacktestTear(ntile_matrix=self._ntile_matrix, daily_returns=self._formatted_returns, 341 | ntiles=ntiles, holding_period=holding_period, long_short=long_short, 342 | market_neutral=market_neutral, show_uni=show_uni, factor_data=self._factor_data, 343 | group_portal=self._group_portal, show_ntile_tilts=show_ntile_tilts) 344 | } 345 | self._run(tears) 346 | return tears 347 | 348 | def ntile_inspection_tear(self, factor: pd.Series, ntiles: int) -> Dict[str, BaseTear]: 349 | """ 350 | creates visuals showing the factor data over time 351 | only calculates IC for when the asset is in the universe 352 | :param factor: The factor values being tested. 353 | index: (pd.Period, _asset_id) 354 | values: (factor_value) 355 | :param ntiles: the number of ntiles 356 | :return: Dict of InspectionTear 357 | """ 358 | self._prep_for_run(factor, ntiles) 359 | tears = {'inspection_tear': InspectionTear(factor_data=self._factor_data)} 360 | self._run(tears) 361 | return tears 362 | 363 | def ntile_ic_tear(self, factor: pd.Series, holding_period: int) -> Dict[str, BaseTear]: 364 | """ 365 | creates visuals showing the ic over time 366 | :param factor: The factor values being tested. 367 | index: (pd.Period, _asset_id) 368 | values: (factor_value) 369 | :param holding_period: How long we want to hold positions for, represents days 370 | :return: Dict of ICTear 371 | """ 372 | self._prep_for_run(factor, 1) 373 | tears = {'ic_tear': ICTear(factor_data=self._factor_data, daily_returns=self._formatted_returns, 374 | holding_period=holding_period)} 375 | self._run(tears) 376 | return tears 377 | 378 | def ntile_turnover_tear(self, factor: pd.Series, ntiles: int, holding_period: int) -> Dict[str, BaseTear]: 379 | """ 380 | Creates visuals showing the turnover over time 381 | :param factor: The factor values being tested. 
382 | index: (pd.Period, _asset_id) 383 | values: (factor_value) 384 | :param ntiles: the number of ntiles 385 | :param holding_period: How long we want to hold positions for, represents days 386 | :return: Dict of TurnoverTear 387 | """ 388 | self._prep_for_run(factor, ntiles) 389 | tears = {'turnover_tear': TurnoverTear(factor_data=self._factor_data, holding_period=holding_period)} 390 | self._run(tears) 391 | return tears 392 | 393 | def ntile_ic_horizon(self, factor: pd.Series, intervals: Iterable[int], show_individual: bool = False) -> \ 394 | Dict[str, BaseTear]: 395 | """ 396 | Shows the curve of the information coefficient over various holding periods 397 | 398 | :param factor: The factor values being tested. 399 | index: (pd.Period, _asset_id) 400 | values: (factor_value) 401 | :param intervals: an iterable that contains the holding periods we would like to make the IC frontier for 402 | :param show_individual: should each individual IC time series be show for every interval 403 | :return: Dict of ICHorizonTear 404 | """ 405 | self._prep_for_run(factor, 1) 406 | tears = { 407 | 'ic_horizon_tear': ICHorizonTear(factor_data=self._factor_data, daily_returns=self._formatted_returns, 408 | intervals=intervals, show_individual=show_individual)} 409 | self._run(tears) 410 | return tears 411 | -------------------------------------------------------------------------------- /ntiles/backtest/periods.py: -------------------------------------------------------------------------------- 1 | # 2 | # Taken from https://github.com/empyrical/blob/master/empyrical/periods.py 3 | # 4 | from typing import Union 5 | 6 | import pandas as pd 7 | 8 | APPROX_BDAYS_PER_MONTH = 21 9 | APPROX_BDAYS_PER_YEAR = 252 10 | 11 | MONTHS_PER_YEAR = 12 12 | WEEKS_PER_YEAR = 52 13 | QTRS_PER_YEAR = 4 14 | 15 | DAILY = 'daily' 16 | WEEKLY = 'weekly' 17 | MONTHLY = 'monthly' 18 | QUARTERLY = 'quarterly' 19 | YEARLY = 'yearly' 20 | 21 | PANDAS_PERIOD_TO_PERIOD_STRING = { 22 | 'D': DAILY, 23 | 'W': WEEKLY, 24 | 'M': MONTHLY, 25 | 'Q': QUARTERLY, 26 | 'Y': YEARLY 27 | } 28 | 29 | ANNUALIZATION_FACTORS = { 30 | DAILY: APPROX_BDAYS_PER_YEAR, 31 | WEEKLY: WEEKS_PER_YEAR, 32 | MONTHLY: MONTHS_PER_YEAR, 33 | QUARTERLY: QTRS_PER_YEAR, 34 | YEARLY: 1 35 | } 36 | 37 | 38 | def get_period_string(dates: Union[pd.PeriodIndex, pd.Series]) -> str: 39 | """ 40 | Gets the string definition of a period from a pandas.PeriodIndex or pandas Series 41 | :param dates: Pandas period index or columns of period we are getting the frequency for 42 | :return: a period string defined above 43 | """ 44 | if isinstance(dates, pd.Series): 45 | dates = dates.dt 46 | 47 | freq = dates.freq.name 48 | if freq not in PANDAS_PERIOD_TO_PERIOD_STRING: 49 | raise ValueError(f'Unknown frequency: {freq}') 50 | 51 | return PANDAS_PERIOD_TO_PERIOD_STRING[freq] 52 | 53 | 54 | def get_period_annualization(dates: Union[pd.PeriodIndex, pd.Series]) -> int: 55 | """ 56 | Gets the annualization factor that corresponds to the frequency of the given pandas.PeriodIndex or pandas Series 57 | :param dates: Pandas period index or columns of period we are getting the frequency for 58 | :return: The number of observations of the given date frequency in a year 59 | """ 60 | return ANNUALIZATION_FACTORS[get_period_string(dates)] 61 | -------------------------------------------------------------------------------- /ntiles/backtest/plotter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import 
matplotlib as mpl 5 | import matplotlib.pyplot as plt 6 | from IPython.core.display import display 7 | 8 | RETURN_COLOR_MAP = mpl.cm.get_cmap('jet') 9 | TILTS_COLOR_MAP = mpl.cm.get_cmap('tab20') 10 | IC_COLOR_MAP = mpl.cm.get_cmap('tab10') 11 | 12 | LARGE_FIGSIZE = 20, 10 13 | MEDIUM_FIGSIZE = 15, 8 14 | 15 | 16 | def ntile_return_plot(cum_ntile_returns: pd.DataFrame, title: str): 17 | """ 18 | generates cumulative return plot for a ntiles returns series 19 | if cols are empty list returns None 20 | :param cum_ntile_returns: cumulative returns we want to plot 21 | :param title: title of the plot 22 | :return: matplotlib axis with the return plot on it 23 | """ 24 | 25 | fig, ax = plt.subplots(1, 1, figsize=LARGE_FIGSIZE) 26 | 27 | cum_ntile_returns.plot(lw=2, ax=ax, cmap=RETURN_COLOR_MAP) 28 | ax.set(ylabel='Log Cumulative Returns', title=title, xlabel='', 29 | yscale='symlog') 30 | 31 | ax.legend(loc="center left", bbox_to_anchor=(1, .5)) 32 | ax.set_yscale('log', base=2) 33 | ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.2f')) 34 | ax.axhline(1, linestyle='-', color='black', lw=1) 35 | fig.autofmt_xdate() 36 | 37 | plt.show() 38 | return ax 39 | 40 | 41 | def ntile_annual_return_bars(avg_annual_ret: pd.Series, period: int, freq: str): 42 | """ 43 | generates a box plot of the yearly CAGR for each ntile 44 | :return: matplotlib axis 45 | """ 46 | num_ntiles = len(avg_annual_ret) 47 | 48 | _, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 49 | ax.set(ylabel='% Return', 50 | title=f'Annual Return, {period}{freq} Holding period', 51 | xlabel='') 52 | 53 | colors = [RETURN_COLOR_MAP(i) for i in np.linspace(0, 1, num_ntiles)] 54 | ax.bar(avg_annual_ret.index, avg_annual_ret.to_numpy(), color=colors) 55 | ax.axhline(0, linestyle='-', color='black', lw=1) 56 | 57 | plt.show() 58 | return ax 59 | 60 | 61 | def plot_inspection_data(table: pd.DataFrame, title: str, ylabel: str, decimals: int = 0) -> None: 62 | """ 63 | plots the inspection data for inspection tear sheets 64 | :param table: the table to plot 65 | :param title: the title for the plot 66 | :param ylabel: y label for plot 67 | :param decimals: amount of decimals to display on the Y axis 68 | :return: None 69 | """ 70 | 71 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 72 | ax.set(title=title, ylabel=ylabel) 73 | table.plot(lw=2, ax=ax, cmap=RETURN_COLOR_MAP) 74 | ax.legend(loc="center left", bbox_to_anchor=(1, .5)) 75 | # ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%m-%Y')) 76 | ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter(f'%.{decimals}f')) 77 | fig.autofmt_xdate() 78 | 79 | if isinstance(table, pd.Series): 80 | ax.get_legend().remove() 81 | 82 | plt.show() 83 | 84 | 85 | def plot_tilts(frame: pd.DataFrame, ntile: str, group_name: str, ax=None): 86 | """ 87 | Plots the timeseries group tilts for a single ntile 88 | :param frame: frame containing the tilts per day, columns: group, index: pd.Period 89 | :param ntile: the Ntile we are plotting for 90 | :param group_name: the name of the group 91 | :param ax: axis to plot on 92 | :return: None 93 | """ 94 | if ax is None: 95 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 96 | 97 | ax.set(title=f'{ntile}, {group_name}'.title(), ylabel='Weight In Ntile') 98 | frame.plot(lw=2, ax=ax, cmap=TILTS_COLOR_MAP, legend=None) 99 | ax.axhline(0, linestyle='-', color='black', lw=1) 100 | # ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%m-%Y')) 101 | ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter(f'%.2f')) 102 | plt.show() 
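# --- Hypothetical usage sketch (not part of the original module) ---------------
# plot_tilts expects one row per period and one column per group, i.e. the
# per-day weight of each group inside a single ntile. The group names and
# numbers below are invented purely for illustration:
#
#   import pandas as pd
#   idx = pd.period_range('2021-01-04', periods=3, freq='D')
#   tilts = pd.DataFrame({'Energy': [0.02, 0.01, 0.03],
#                         'Technology': [-0.02, -0.01, -0.03]}, index=idx)
#   plot_tilts(tilts, ntile='Ntile: 1', group_name='GIC Sector')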
103 | 104 | 105 | def plot_tilt_hist(series, ntile: str, group_name: str, extra_space: bool = True): 106 | """ 107 | Plots the histogram group tilts for a single ntile 108 | :param series: frame containing the avg tilts, columns: group, index: pd.Period 109 | :param ntile: the Ntile we are plotting for 110 | :param group_name: the name of the group 111 | :return: None 112 | """ 113 | if extra_space: 114 | fig, ax = plt.subplots(1, 2, figsize=LARGE_FIGSIZE) 115 | else: 116 | _, ax = plt.subplots(1, 1, figsize=(4.5, 4.5)) 117 | 118 | title = 'Weight Relative to Universe' if 'Ntile' in group_name else 'Group Exposure' 119 | plotter_frame = series.to_frame('weight') 120 | plotter_frame['colors'] = [TILTS_COLOR_MAP(i) for i in np.linspace(0, 1, len(series))] 121 | plotter_frame = plotter_frame.sort_values('weight') 122 | 123 | ax[0].barh(plotter_frame.index.tolist(), plotter_frame['weight'].tolist(), align='center', 124 | color=plotter_frame['colors'].tolist()) 125 | ax[0].set(title=f'{ntile}, {group_name}'.title(), ylabel='Group', xlabel=title) 126 | ax[0].axvline(0, linestyle='-', color='black', lw=1) 127 | 128 | if extra_space: 129 | return ax[1] 130 | 131 | plt.show() 132 | 133 | 134 | def plot_timeseries_ic(ic_frame: pd.DataFrame, holding_period: int): 135 | """ 136 | plots the daily time series IC 137 | :param ic_frame: frame of IC to plot index: pd.Period 138 | :param holding_period: how long the holding period is for the IC 139 | :return: None 140 | """ 141 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 142 | ic_frame.plot(ax=ax, title=f'IC {holding_period} {ic_frame.index.freq.name} Holding Period') 143 | ax.get_lines()[1].set_linewidth(3) 144 | ax.axhline(0, linestyle='-', color='black', lw=1) 145 | fig.autofmt_xdate() 146 | plt.show() 147 | 148 | 149 | def plot_auto_corr(ac_series: pd.Series, holding_period: int) -> None: 150 | """ 151 | plots the daily time series IC 152 | :param ac_series: series of auto corr to plot index: pd.Period 153 | :param holding_period: how long the holding period is for the IC 154 | :return: None 155 | """ 156 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 157 | ac_series.plot(ax=ax, title=f'Autocorrelation {holding_period}{ac_series.index.freq.name} Holding Period') 158 | ax.axhline(ac_series.median(), linestyle=(0, (5, 10)), color='black', lw=1) 159 | fig.autofmt_xdate() 160 | plt.show() 161 | 162 | 163 | def plot_turnover(turn_frame: pd.Series, holding_period: int) -> None: 164 | """ 165 | plots the daily time series IC 166 | :param turn_frame: dataframe of turnover to plot index: pd.Period 167 | :param holding_period: how long the holding period is for the IC 168 | :return: None 169 | """ 170 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 171 | colors = [RETURN_COLOR_MAP(i) for i in np.linspace(0, 1, turn_frame.columns.max())] 172 | 173 | for col in turn_frame.columns: 174 | ax.plot(turn_frame.index.to_timestamp(), turn_frame[col], color=colors[col - 1], label=f'Ntile: {col}') 175 | ax.axhline(turn_frame[col].median(), linestyle=(0, (5, 10)), color=colors[col - 1], lw=5) 176 | 177 | ax.set(ylabel='% Turnover', title=f'Turnover {holding_period}{turn_frame.index.freq.name} Holding Period', 178 | xlabel='') 179 | ax.legend(loc="center left", bbox_to_anchor=(1, .5)) 180 | fig.autofmt_xdate() 181 | plt.show() 182 | 183 | 184 | def plot_ic_horizon(horizon_frame: pd.DataFrame): 185 | ax_tuple = plt.subplots(2, 2, figsize=LARGE_FIGSIZE)[1].flatten() 186 | colors = [IC_COLOR_MAP(i) for i in np.linspace(0, 1, 4)] 187 | 188 | for i in 
range(horizon_frame.shape[1]): 189 | plot_me = horizon_frame.iloc[:, i] 190 | plot_me.plot(ax=ax_tuple[i], color=colors[i], title=plot_me.name) 191 | plt.show() 192 | 193 | 194 | def render_heat_table(frame: pd.DataFrame) -> None: 195 | """ 196 | renders a dataframe as a heatmap 197 | :param frame: the frame to render 198 | :return: None 199 | """ 200 | cm = mpl.cm.get_cmap('RdYlGn') 201 | styled = frame.style.background_gradient(cmap=cm, axis=0).format('{:.2f}').set_properties( 202 | **{'text-align': 'center'}) 203 | render_table(styled) 204 | 205 | 206 | def render_table(table: pd.DataFrame, output: str = None) -> None: 207 | """ 208 | displays a table to the user 209 | :param table: the table to display 210 | :param output: the output we should render 211 | :return: None 212 | """ 213 | if output: 214 | print(output) 215 | display(table) 216 | -------------------------------------------------------------------------------- /ntiles/backtest/portals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/backtest/portals/__init__.py -------------------------------------------------------------------------------- /ntiles/backtest/portals/base_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Union 3 | 4 | import pandas as pd 5 | 6 | 7 | class BasePortal(ABC): 8 | """ 9 | Base class for the data portal object 10 | """ 11 | 12 | def __init__(self, assets: List[Union[str, int]]): 13 | """ 14 | :param assets: the assets we are querying for 15 | """ 16 | self._assets = assets 17 | 18 | @property 19 | @abstractmethod 20 | def assets(self) -> List[Union[str, int]]: 21 | """ 22 | returns the assets property 23 | """ 24 | return self._assets 25 | 26 | 27 | class BaseTimeSeriesPortal(BasePortal): 28 | """portal for time series data""" 29 | 30 | def __init__(self, assets: List[Union[str, int]], start: pd.Period, end: pd.Period, freq: str): 31 | """ 32 | :param assets: the assets we are querying for 33 | :param start: start date for the query 34 | :param end: end date for the query 35 | :param freq: frequency of the data 36 | """ 37 | super().__init__(assets) 38 | self._start = start 39 | self._end = end 40 | self._freq = freq 41 | 42 | @property 43 | @abstractmethod 44 | def periods(self) -> List[pd.Period]: 45 | """ 46 | :return: the unique periods for which we have data 47 | """ 48 | pass 49 | 50 | 51 | class BaseRawPortal(BaseTimeSeriesPortal, ABC): 52 | def __init__(self, assets: List[Union[str, int]], start: pd.Period, end: pd.Period): 53 | """ 54 | :param assets: the assets we are querying for 55 | :param start: start date for the query 56 | :param end: end date for the query 57 | """ 58 | super().__init__(assets, start, end) 59 | 60 | @property 61 | @abstractmethod 62 | def raw_data(self) -> pd.DataFrame: 63 | """ 64 | returns the raw data held by the portal 65 | :return: Index: Id, pd.Period; Columns: 'data'; Values: data 66 | """ 67 | pass 68 | 69 | 70 | class BaseDeltaPortal(BaseTimeSeriesPortal, ABC): 71 | """ 72 | a portal which fetches and calculates the raw data long with delta or percent delta of a variable. 
73 | Useful for fetching and calculating returns 74 | """ 75 | 76 | @property 77 | @abstractmethod 78 | def delta_data(self): 79 | """ 80 | returns the delta of the data held by the portal 81 | :return: Index: Id, pd.Period; Columns: 'delta'; Values: data 82 | """ 83 | pass 84 | 85 | 86 | class BaseGrouperPortalConstant(BasePortal, ABC): 87 | """ 88 | A portal which fetches grouping data 89 | """ 90 | 91 | def __init__(self, assets: List[Union[str, int]], group_name: str): 92 | """ 93 | :param assets: the assets we are querying for 94 | :param group_name: the name of the grouping 95 | """ 96 | super().__init__(assets) 97 | self.group_name = group_name 98 | 99 | @property 100 | def name(self): 101 | """ 102 | :return: Name of group 103 | """ 104 | return self.group_name 105 | 106 | @property 107 | @abstractmethod 108 | def group_information(self) -> pd.Series: 109 | """ 110 | Holds group information from the portal 111 | :return: Index: Id; Columns: 'group'; Values: group 112 | """ 113 | pass 114 | 115 | @property 116 | @abstractmethod 117 | def group_mapping(self): 118 | """ 119 | :return: dict mapping for the group 120 | """ 121 | pass 122 | 123 | 124 | class BaseGrouperPortalTimeSeries(BaseTimeSeriesPortal, ABC): 125 | """ 126 | a portal which returns grouping information over a time period 127 | """ 128 | 129 | def __init__(self, assets: List[Union[str, int]], start: pd.Period, end: pd.Period, group_name: str): 130 | """ 131 | :param assets: the assets we are querying for 132 | :param start: start date for the query 133 | :param end: end date for the query 134 | :param group_name: the name of the grouping 135 | """ 136 | super().__init__(assets, start, end) 137 | self.group_name = group_name 138 | 139 | @property 140 | def name(self): 141 | """ 142 | :return: Name of group 143 | """ 144 | return self.group_name 145 | 146 | @property 147 | @abstractmethod 148 | def periods(self) -> List[pd.Period]: 149 | """ 150 | :return: the unique periods for which we have data 151 | """ 152 | pass 153 | 154 | @property 155 | @abstractmethod 156 | def group_information(self) -> pd.DataFrame: 157 | """ 158 | Holds a timeseries of group information from the portal 159 | :return: Index: Id, pd.Period; Columns: 'group'; Values: group 160 | """ 161 | pass 162 | -------------------------------------------------------------------------------- /ntiles/backtest/portals/pricing_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Iterable, List, Union 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from .base_portal import BaseDeltaPortal 7 | 8 | from ntiles.toolbox import QueryConstructor, SQLConnection 9 | 10 | 11 | class PricingPortal(BaseDeltaPortal, ABC): 12 | """ 13 | Pulls pricing from database 14 | """ 15 | 16 | def __init__(self, 17 | assets: Union[Iterable, str], 18 | search_by: str, 19 | start_date: str, 20 | end_date: str, 21 | field: str = 'prc', 22 | table: str = 'CRSP.sd', 23 | con: SQLConnection = None, 24 | freq: str = 'D', 25 | ): 26 | """ 27 | :param assets: The assets we want ti search for. Can be list of ids or a code eg "ETF_SPY". 28 | :param search_by: The name of the asset ids we are searching the database by. 29 | :param start_date: The date to start getting pricing. Format: %Y-%m-%d 30 | :param end_date: The date to stop getting pricing. Format: %Y-%m-%d 31 | :param field: The pricing field to get from the database. Default: 'prc' 32 | :param table: The table to get the pricing from. 
Default: 'CRSP.sd' 33 | :param con: A SQLConnection object to use to connect to the database. Default: None 34 | :param freq: The frequency of the pricing. Default: 'D' 35 | """ 36 | super().__init__(assets=assets, 37 | start=pd.Period(start_date), 38 | end=min(pd.Timestamp(end_date), pd.Timestamp('today')).to_period('D'), 39 | freq=freq) 40 | self._search_by = search_by 41 | self._field = field 42 | self._table = table 43 | self._con = con 44 | self._freq = freq 45 | 46 | self._pricing = None 47 | self._get_pricing() 48 | 49 | @property 50 | def assets(self) -> List[any]: 51 | return self._pricing.columns.tolist() 52 | 53 | @property 54 | def delta_data(self) -> pd.DataFrame: 55 | """ 56 | returns the delta of the data held by the portal 57 | :return: Index: Id, pd.Period; Columns: 'delta'; Values: data 58 | """ 59 | return self._pricing 60 | 61 | @property 62 | def periods(self) -> List[pd.Period]: 63 | return self._pricing.index.drop_duplicates().to_list() 64 | 65 | def _get_pricing(self): 66 | df = (QueryConstructor(sql_con=self._con, freq=self._freq) 67 | .query_timeseries_table(self._table, assets=self._assets, 68 | start_date=str(self._start), end_date=str(self._end), 69 | search_by=self._search_by, fields=[self._field]) 70 | .distinct() 71 | .set_calendar('NYSE') 72 | .order_by('date') 73 | .dropna(self._field) 74 | .df) 75 | 76 | self._pricing = df[self._field].unstack().pct_change(1).iloc[1:]. \ 77 | fillna(0).replace([np.inf, -np.inf], 0).clip(-.75, 1.5) 78 | -------------------------------------------------------------------------------- /ntiles/backtest/portals/sector_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Iterable, List, Union 3 | 4 | import pandas as pd 5 | 6 | from ...toolbox import QueryConstructor 7 | from .base_portal import BaseGrouperPortalConstant 8 | 9 | 10 | class SectorPortal(BaseGrouperPortalConstant, ABC): 11 | def __init__(self, assets: Union[Iterable, str], search_by: str = 'permno', field='gsector', con=None, 12 | start_date=None, end_date=None, ): 13 | """ 14 | :param assets: the assets or universe to get the sector data for 15 | :param search_by: what is the id of the asset 16 | :param field: name of field we want to get 17 | """ 18 | super().__init__(assets, 'GIC Sector') 19 | self._search_by = search_by 20 | self._field = field 21 | self._con = con 22 | self._start_date = start_date 23 | self._end_date = end_date 24 | 25 | self._group = None 26 | self._set_sectors() 27 | 28 | @property 29 | def group_information(self) -> pd.Series: 30 | """ 31 | gets the gic _sectors for the give assets 32 | :return: DataFrame of GIC _sectors for the given assets 33 | """ 34 | return self._group 35 | 36 | @property 37 | def group_mapping(self): 38 | """ 39 | :return: dict mapping for the group 40 | """ 41 | return self.group_information.to_dict() 42 | 43 | def _set_sectors(self) -> None: 44 | """ 45 | Sets the _sectors in the class 46 | :return: None 47 | """ 48 | self._group = (QueryConstructor(self._con) 49 | .query_no_date_table(table='link.crsp_cstat_link', fields=[self._field, 'lpermno as permno'], 50 | assets=self._assets, search_by=self._search_by, start_date=self._start_date, 51 | end_date=self._end_date) 52 | .df)[self._field].fillna(-1) 53 | 54 | @property 55 | def assets(self) -> List[int]: 56 | return self._group.index.tolist() 57 | -------------------------------------------------------------------------------- /ntiles/backtest/stats.py: 
--------------------------------------------------------------------------------
1 | import empyrical
2 | import pandas as pd
3 | import numpy as np
4 | 
5 | from . import plotter
6 | from .periods import get_period_annualization, get_period_string
7 | 
8 | 
9 | def generate_return_stats(period_returns, flip_mdd) -> None:
10 |     """
11 |     generates the following return statistics for each Ntile:
12 |     - Sharpe
13 |     - Annual Return
14 |     - Annual Vol
15 |     - % Periods Up
16 |     - Max Drawdown (flips for top and bottom bins, excluding middle bin)
17 | 
18 |     If long_short = True:
19 |     - All above calculated on spread and universe
20 |     - Annual Tracking Error
21 |     - Information Ratio
22 |     :param period_returns: the returns we are calculating stats for
23 |     :param flip_mdd: should max draw down be flipped around the center?
24 |     """
25 |     ntile_funcs = {
26 |         'sharpe': sharpe_ratio,
27 |         'CAGR': simple_returns_CGAR,
28 |         'Vol': annual_volatility,
29 |         'Max Drawdown': lambda x: max_drawdown(x, flip_mdd),
30 |         '% Periods Up': percent_periods_up,
31 |     }
32 | 
33 |     calculated_stats = [func(period_returns) for func in ntile_funcs.values()]
34 |     render_me = pd.DataFrame(calculated_stats).transpose()
35 |     plotter.render_heat_table(render_me)
36 | 
37 | 
38 | def compute_ntile_stats(name, func, ntile_returns) -> pd.Series:
39 |     """
40 |     apply a function to each column of ntile_returns
41 |     :param name: name of the function
42 |     :param func: the function to apply
43 |     :param ntile_returns: the returns we are applying the function to
44 |     :return: pd.Series, index: Ntile; Name: name;
45 |     """
46 |     return ntile_returns.apply(func, axis=0).rename(name)
47 | 
48 | 
49 | def max_drawdown(period_returns, flip_bottom) -> pd.Series:
50 |     """
51 |     computes the max drawdown for each column
52 |     flips the drawdown from downside to upside for ntiles that should be negative
53 |     :param period_returns: the returns we are getting the drawdown for
54 |     :param flip_bottom: should the drawdown sign be flipped for the bottom half of the ntiles?
55 |     :return: pd.Series, index: Ntile; Values: drawdown
56 |     """
57 |     adj_ret = period_returns.copy()  # gets rid of setting on copy warning
58 |     num_cols = period_returns.shape[1]
59 | 
60 |     if flip_bottom:
61 |         mid_pos = int(round(num_cols / 2 + .5)) - 1
62 |         adj_ret.iloc[:, mid_pos:] = adj_ret.iloc[:, mid_pos:] * -1
63 |         out = compute_ntile_stats('Max Drawdown', empyrical.max_drawdown, adj_ret)
64 | 
65 |         if num_cols % 2 == 1:  # if the number of columns is odd, blank out the middle ntile
66 |             out.iloc[mid_pos] = None
67 | 
68 |         return out * 100
69 | 
70 |     return compute_ntile_stats('Max Drawdown', empyrical.max_drawdown, adj_ret) * 100
71 | 
72 | 
73 | def percent_periods_up(period_returns) -> pd.Series:
74 |     """
75 |     computes the percent of periods where return is > 0
76 |     :param period_returns: the returns we are getting the % of periods up for
77 |     :return: pd.Series, index: Ntile; Values: % periods up
78 |     """
79 |     periods_up = period_returns.copy()
80 |     periods_up.iloc[:] = np.where(period_returns.values > 0, 1, 0)
81 |     return (periods_up.sum(axis=0) / periods_up.shape[0]).rename('% Periods Up')
82 | 
83 | 
84 | def annual_volatility(period_returns) -> pd.Series:
85 |     """
86 |     computes the annual volatility of each column
87 |     :param period_returns: the returns we are getting the vol for
88 |     :return: pd.Series, index: Ntile; Values: annual vol
89 |     """
90 |     vol_func = wrap_emprical_period(empyrical.annual_volatility)
91 |     return compute_ntile_stats('Annual Vol', vol_func, period_returns) * 100
92 | 
93 | 
94 | def sharpe_ratio(period_returns) -> pd.Series:
95 |     """
96 |     computes the sharpe ratio for each column
97 |     :param period_returns: the returns we are getting the sharpe of
98 |     :return: pd.Series, index: Ntile; Values: sharpe
99 |     """
100 |     sharpe_func = wrap_emprical_period(empyrical.sharpe_ratio)
101 |     return compute_ntile_stats('Sharpe', sharpe_func, period_returns)
102 | 
103 | 
104 | def simple_returns_CGAR(period_returns) -> pd.Series:
105 |     """
106 |     computes the CAGR from simple returns
107 |     :param period_returns: the simple returns to compute the CAGR from
108 |     :return: series with index: cum_returns.columns; values: corresponding average return in percent
109 |     """
110 |     return CAGR(cum_returns(period_returns))
111 | 
112 | 
113 | def CAGR(cum_returns_df: pd.DataFrame) -> pd.Series:
114 |     """
115 |     calculates the geometric average yearly ntile returns from the given cumulative returns
116 |     Assumed the data is in daily format
117 |     :param cum_returns_df: can be the full cumulative returns or just a subset of its columns
118 |     :return: series with index: cum_returns_df.columns; values: corresponding average return in percent
119 |     """
120 |     ann_factor = get_period_annualization(cum_returns_df.index)
121 |     return ((cum_returns_df.iloc[-1] ** (1 / (cum_returns_df.shape[0] / ann_factor)) - 1) * 100).rename('CAGR')
122 | 
123 | 
124 | def cum_returns(simple_returns: pd.DataFrame) -> pd.DataFrame:
125 |     """
126 |     Calculates the cumulative returns from the simple returns.
127 |     wraps empyrical.cum_returns
128 |     :param simple_returns: returns used to calculate the cumulative returns
129 |     :return: cumulative returns
130 |     """
131 |     return empyrical.cum_returns(simple_returns, starting_value=1)
132 | 
133 | 
134 | def wrap_emprical_period(func):
135 |     """
136 |     Wraps an empyrical function that takes in returns and a period
137 |     :param func: the empyrical function to wrap
138 |     :return: function that takes in returns and infers the frequency of the data
139 |         and passes a period to the empyrical function
140 |     """
141 | 
142 |     def inner_wrapper(period_returns):
143 |         return func(returns=period_returns, period=get_period_string(period_returns.index))
144 | 
145 |     return inner_wrapper
146 | 
--------------------------------------------------------------------------------
/ntiles/backtest/tears/backtest_tear.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from abc import ABC
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from ntiles.backtest.tears.base_tear import BaseTear
8 | from ntiles.backtest import plotter, stats, utils
9 | 
10 | 
11 | class BacktestTear(BaseTear, ABC):
12 |     """
13 |     Computes returns and stats from the given factor and pricing data
14 | 
15 |     Upgrades:
16 |         Have a cash account for when a security gets delisted and we own it
17 |         One day holding period
18 | 
19 |     """
20 | 
21 |     def __init__(self, ntile_matrix: pd.DataFrame, daily_returns: pd.DataFrame, ntiles, holding_period: int,
22 |                  long_short: bool, market_neutral: bool, show_uni: bool):
23 |         """
24 |         :param ntile_matrix: unstacked and formatted ntiles prepared by Ntiles
25 |         :param daily_returns: unstacked and formatted daily returns from Ntiles
26 |         :param holding_period: How long we want to hold positions for, represents days
27 |         :param ntiles: amount of bins we are testing (1 is high factor value n is
low value) 28 | :param long_short: show we compute the spread between ntiles: (1 - n) 29 | :param market_neutral: subtract out the universe returns from the ntile returns? 30 | :param show_uni: suhould universe return be shown in the spread plot? 31 | """ 32 | 33 | super().__init__() 34 | 35 | self.ntile_matrix = ntile_matrix 36 | self.daily_returns = daily_returns 37 | self.ntiles = ntiles 38 | self.holding_period = holding_period 39 | self.long_short = long_short 40 | self.market_neutral = market_neutral 41 | self.show_uni = show_uni 42 | 43 | self.daily_weights = {} 44 | self.weighted_returns = {} 45 | self._daily_tile_returns = None 46 | 47 | def compute(self) -> None: 48 | """ 49 | method to run the backtest 50 | """ 51 | self.kick_backtest() 52 | 53 | def plot(self): 54 | """ 55 | method to plot the data for the backtest 56 | """ 57 | self.kick_visuals() 58 | 59 | # 60 | # Vectorized Ntile Backtest 61 | # 62 | def kick_backtest(self): 63 | """ 64 | Calculates the daily returns of each ntile 65 | Saves the daily returns in self._daily_tile_returns 66 | index: pd.Period 67 | columns: Ntile: {ntile} 68 | Values: Daily close ntile returns on corresponding day 69 | :return: None 70 | """ 71 | 72 | daily_ntile_returns = self._get_ntile_returns_helper() 73 | 74 | if self.long_short: 75 | daily_ntile_returns[f'1 vs {self.ntiles}'] = (daily_ntile_returns.iloc[:, 0] - 76 | daily_ntile_returns.loc[:, f'Ntile: {self.ntiles}']) / 2 77 | 78 | if self.ntiles > 3: 79 | daily_ntile_returns[f'2 vs {self.ntiles - 1}'] = (daily_ntile_returns.iloc[:, 1] - 80 | daily_ntile_returns.loc[:, 81 | f'Ntile: {self.ntiles - 1}']) / 2 82 | 83 | self._daily_tile_returns = daily_ntile_returns 84 | 85 | def _get_ntile_returns_helper(self) -> pd.DataFrame: 86 | """ 87 | Helper to get the returns for each ntile on each day 88 | :return: data frame index: pd.period; columns: Ntile; values: daily returns 89 | """ 90 | np_ntile_matrix = self.ntile_matrix.to_numpy() 91 | np_asset_returns_matrix = self.daily_returns.to_numpy() 92 | 93 | out = {} 94 | for ntile in range(1, self.ntiles + 1): 95 | out[f'Ntile: {ntile}'] = self._compute_daily_ntile_returns(np_ntile_matrix, np_asset_returns_matrix, ntile, 96 | self.holding_period) 97 | 98 | universe_ntile_matrix = np.where(np.isfinite(np_ntile_matrix), 1, np.nan)[self.holding_period - 1:] 99 | universe_returns_matrix = np_asset_returns_matrix[self.holding_period - 1:] 100 | 101 | out['universe'] = self._compute_daily_ntile_returns(universe_ntile_matrix, universe_returns_matrix, 1, 1) 102 | 103 | if self.holding_period != 1: 104 | index_values = self.ntile_matrix.index[self.holding_period - 2:] 105 | else: 106 | second_date = self.ntile_matrix.index[0] 107 | index_values = ([second_date - 1] + self.ntile_matrix.index.tolist()) 108 | 109 | out = pd.DataFrame(out, index=index_values) 110 | 111 | if self.market_neutral: 112 | # subtracting out universe returns 113 | ntile_cols = utils.get_ntile_cols(out) 114 | out.loc[:, ntile_cols] = out.loc[:, ntile_cols].subtract(out['universe'], axis=0) 115 | 116 | if not self.show_uni: 117 | out.drop('universe', axis=1, inplace=True) 118 | 119 | return out 120 | 121 | def _compute_daily_ntile_returns(self, ntile_matrix: np.array, asset_returns_matrix: np.array, ntile: int, 122 | holding_period: int) -> np.array: 123 | """ 124 | Computes the daily returns for a ntile 125 | :param ntile_matrix: the matrix of ntiles 126 | :param asset_returns_matrix: the matrix for returns 127 | :param ntile: the amount of ntiles we have computed 128 | :param 
holding_period: how long we are holding the assets for 129 | :return: 1d np.array of the daily return for the ntile 130 | """ 131 | 132 | # 133 | # Calculating the asset weight per day 134 | # 135 | weight_per_day = 1 / np.count_nonzero(ntile_matrix == ntile, axis=1) / holding_period 136 | if (weight_per_day > .05).any(): 137 | warnings.warn(f'We have {(weight_per_day > .05).sum()} assets in ntile {ntile} ' 138 | f'with daily weight over 5%.' 139 | f'Max weight is {round(weight_per_day.max(), 3)}') 140 | # weight_per_day = np.minimum(weight_per_day, np.full(weight_per_day.shape, .05)) 141 | 142 | raw_daily_weights = np.where(ntile_matrix == ntile, np.expand_dims(weight_per_day, axis=1), 0) 143 | daily_weights = utils.rolling_sum(raw_daily_weights, holding_period) 144 | 145 | weighted_asset_returns = daily_weights * asset_returns_matrix[holding_period - 1:, :] 146 | daily_returns = np.insert(np.sum(weighted_asset_returns, axis=1), 0, 0) 147 | 148 | self.record_backtest_components(ntile, daily_weights, weighted_asset_returns) 149 | 150 | return daily_returns 151 | 152 | def record_backtest_components(self, ntile, daily_weights, weighted_asset_returns): 153 | """ 154 | records the components to compute the backtest for a specific ntile 155 | :param ntile: the ntile the data is for 156 | :param daily_weights: the weights of each asset on the corresponding day 157 | :param weighted_asset_returns: the weighted returns of each asset 158 | :return: None 159 | """ 160 | self.daily_weights[f'Ntile: {ntile}'] = \ 161 | pd.DataFrame(daily_weights, index=self.daily_returns.index[self.holding_period - 1:], 162 | columns=self.daily_returns.columns) 163 | 164 | self.weighted_returns[f'Ntile: {ntile}'] = \ 165 | pd.DataFrame(weighted_asset_returns, index=self.daily_returns.index[self.holding_period - 1:], 166 | columns=self.daily_returns.columns) 167 | 168 | # 169 | # Visuals 170 | # 171 | def kick_visuals(self) -> None: 172 | """ 173 | controls displaying visuals to the user 174 | :return: None 175 | """ 176 | print('Ntile Backtest') 177 | cum_ret = stats.cum_returns(self._daily_tile_returns) 178 | 179 | # ntile stats 180 | ntile_cols = utils.get_ntile_cols(self._daily_tile_returns) 181 | ntile_daily_ret = self._daily_tile_returns[ntile_cols] 182 | ntile_cum_ret = cum_ret[ntile_cols] 183 | avg_annual_ret = stats.CAGR(ntile_cum_ret) 184 | # ntile plotting 185 | stats.generate_return_stats(ntile_daily_ret, self.market_neutral) 186 | freq = ntile_cum_ret.index.freq.name 187 | plotter.ntile_return_plot(ntile_cum_ret, f'Ntile Returns {self.holding_period}{freq} Holding Period') 188 | plotter.ntile_annual_return_bars(avg_annual_ret, self.holding_period, freq) 189 | 190 | if self.long_short: 191 | # spread stats 192 | spread_cols = utils.get_non_ntile_cols(self._daily_tile_returns) 193 | long_short_frame = self._daily_tile_returns[spread_cols] 194 | # spread plotting 195 | stats.generate_return_stats(long_short_frame, False) 196 | plotter.ntile_return_plot(cum_ret[spread_cols], 197 | f'Long Short Returns {self.holding_period}{freq} Holding Period') 198 | 199 | # 200 | # Data methods 201 | # 202 | def cum_ret_to_clipboard(self) -> None: 203 | """ 204 | write cumulative returns to clipboard 205 | :return: None 206 | """ 207 | stats.cum_returns(self._daily_tile_returns).to_clipboard() 208 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/base_tear.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, 
abstractmethod
2 | 
3 | 
4 | class BaseTear(ABC):
5 |     """
6 |     The base class for all tearsheets
7 |     """
8 | 
9 |     def __init__(self):
10 |         """
11 |         empty constructor
12 |         """
13 | 
14 |     def compute_plot(self) -> None:
15 |         """
16 |         method which calculates stats and plots the data for the tearsheet
17 |         :return: None
18 |         """
19 |         self.compute()
20 |         self.plot()
21 | 
22 |     @abstractmethod
23 |     def compute(self) -> None:
24 |         """
25 |         method which calculates stats for the tearsheet
26 |         :return: None
27 |         """
28 |         pass
29 | 
30 |     @abstractmethod
31 |     def plot(self) -> None:
32 |         """
33 |         method which plots data for the tearsheet
34 |         :return: None
35 |         """
36 |         pass
37 | 
--------------------------------------------------------------------------------
/ntiles/backtest/tears/ic_tear.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 | from typing import Iterable
3 | 
4 | import pandas as pd
5 | 
6 | from ntiles.backtest import plotter
7 | from ntiles.backtest.tears.base_tear import BaseTear
8 | from ntiles.backtest import utils
9 | 
10 | 
11 | class ICTear(BaseTear, ABC):
12 |     """
13 |     Computes IC from the given factor and returns
14 | 
15 |     Currently will only measure IC for days a company is in the universe
16 |     Example: AAPL is in the universe on 1/10 but not in the universe on 11/10; if we have a greater than 10 day holding period
17 |     that asset won't count in the IC calculation
18 |     """
19 | 
20 |     def __init__(self, factor_data: pd.DataFrame, daily_returns: pd.DataFrame, holding_period: int):
21 |         """
22 |         :param factor_data: factor data to look at, must be from Ntiles
23 |         :param daily_returns: daily returns we are calculating the IC on, must be from Ntiles
24 |         :param holding_period: Holding period we are calculating IC for
25 |         """
26 |         super().__init__()
27 |         self.factor_data = factor_data
28 |         self.daily_returns = daily_returns
29 |         self.holding_period = holding_period
30 | 
31 |         self.daily_ic = None
32 |         self.ic_stats = None
33 | 
34 |     #
35 |     # Calculation
36 |     #
37 |     def compute(self) -> None:
38 |         """
39 |         master function for computing the IC
40 |         :return: None
41 |         """
42 |         self.compute_daily_ic()
43 |         self.calculate_ic_table()
44 | 
45 |     def compute_daily_ic(self) -> None:
46 |         """
47 |         calculates and sets the daily IC for the holding period
48 |         :return: None
49 |         """
50 |         self.factor_data.index.names = ['date', 'id']
51 | 
52 |         # slicing off factor values we don't have forward return data for
53 |         factor_unstacked = self.factor_data['factor'].unstack()#.iloc[:-self.holding_period]
54 |         forward_returns = self.compute_forward_returns().reindex_like(factor_unstacked)
55 | 
56 |         ic_array = utils.correlation_2d(factor_unstacked.to_numpy(), forward_returns.to_numpy())
57 |         self.daily_ic = pd.Series(ic_array, index=forward_returns.index).to_frame('IC')
58 |         if self.daily_ic.index.freq.name == 'D':
59 |             self.daily_ic['1 Month Avg IC'] = self.daily_ic.rolling(21).mean()
60 |         else:
61 |             self.daily_ic['1 Year Avg IC'] = self.daily_ic.rolling(12).mean()
62 | 
63 |     def compute_forward_returns(self) -> pd.DataFrame:
64 |         """
65 |         Calculates self.holding_period forward returns from daily returns
66 |         :return: index: date; columns: asset; values: self.holding_period forward returns
67 |         """
68 |         # must pad an extra day due to cumprod making the first date nan
69 |         daily_ret = self.daily_returns  # utils.pad_extra_day(self.daily_returns, 0)
70 |         return daily_ret.add(1).cumprod().pct_change(self.holding_period).shift(-self.holding_period)
71 | 
72 |     def
calculate_ic_table(self) -> None: 73 | """ 74 | calculates summary stats for the IC data 75 | :return: None, sets self.ic_stats 76 | """ 77 | mean_ic = self.daily_ic['IC'].mean() 78 | std_ic = self.daily_ic['IC'].std() 79 | stats = { 80 | 'IC Mean': mean_ic, 81 | 'IC Median': self.daily_ic['IC'].median(), 82 | 'IC Std': std_ic, 83 | 'Risk Adjusted IC': mean_ic / std_ic, 84 | 'IC Skew': self.daily_ic['IC'].skew() 85 | } 86 | 87 | self.ic_stats = pd.Series(stats).round(3).to_frame(f'{self.holding_period}D').transpose() 88 | 89 | # 90 | # Plotting 91 | # 92 | def plot(self) -> None: 93 | """ 94 | plots the IC data in self.daily_ic 95 | :return: None 96 | """ 97 | print('Information Coefficient') 98 | plotter.render_table(self.ic_stats) 99 | plotter.plot_timeseries_ic(self.daily_ic, self.holding_period) 100 | # plotter.plot_ic_qq(self.daily_ic) 101 | # plotter.plot_ic_hist(self.daily_ic) 102 | 103 | # 104 | # To clipboard functions 105 | # 106 | def ic_to_clipboard(self) -> None: 107 | """ 108 | writes ic to the clipboard 109 | :return: None 110 | """ 111 | self.daily_ic.to_clipboard() 112 | 113 | 114 | class ICHorizonTear(BaseTear, ABC): 115 | """ 116 | Computes the IC horizon tear 117 | Will give insight into optimal holding periods for the factor 118 | """ 119 | 120 | def __init__(self, factor_data: pd.DataFrame, daily_returns: pd.DataFrame, intervals: Iterable[int], 121 | show_individual): 122 | """ 123 | :param factor_data: The factor values being tested, must be from Ntiles 124 | :param daily_returns: matrix of returns from Ntiles 125 | :param intervals: an iterable that contains the holding periods we would like to make the IC frontier for 126 | """ 127 | super().__init__() 128 | self._factor_data = factor_data 129 | self._daily_returns = daily_returns 130 | self._intervals = sorted(list(intervals)) 131 | self._show_individual = show_individual 132 | 133 | self.tears = {} 134 | self._ic_horizon = None 135 | 136 | def compute(self) -> None: 137 | """ 138 | runs a IC tear for all the periods we want to test over 139 | """ 140 | for interval in self._intervals: 141 | self.tears[interval] = ICTear(self._factor_data, self._daily_returns, interval) 142 | self.tears[interval].compute() 143 | 144 | self._ic_horizon = pd.concat([tear.ic_stats for tear in self.tears.values()]) 145 | 146 | def plot(self) -> None: 147 | """ 148 | plots the IC frontier and the Time series IC 149 | """ 150 | plotter.plot_ic_horizon(self._ic_horizon.drop(['IC Skew'], axis=1)) 151 | plotter.render_table(self._ic_horizon) 152 | if self._show_individual: 153 | for ic_tear in self.tears.values(): 154 | ic_tear.plot() 155 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/inspection_tear.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from ntiles.backtest.tears.base_tear import BaseTear 4 | from ntiles.backtest import plotter 5 | 6 | 7 | class InspectionTear(BaseTear, ABC): 8 | """ 9 | creates a data inspection sheet 10 | """ 11 | 12 | def __init__(self, factor_data): 13 | """ 14 | :param factor_data: factor_data from Ntiles 15 | """ 16 | super().__init__() 17 | self._factor_data = factor_data 18 | 19 | def compute(self) -> None: 20 | """ 21 | kicks off the tearsheet 22 | :return: None 23 | """ 24 | self.make_summary() 25 | 26 | def plot(self) -> None: 27 | """ 28 | plots the tearsheet 29 | """ 30 | self.summary_plots() 31 | 32 | def make_summary(self) -> None: 33 | """ 34 | calculates the summary 
statics for the factor by Ntile 35 | """ 36 | quantile_stats = self._factor_data.groupby('ntile').agg(['median', 'std', 'min', 'max', 'count']).factor 37 | quantile_stats['count %'] = quantile_stats['count'] / quantile_stats['count'].sum() * 100 38 | 39 | # aesthetics 40 | quantile_stats = quantile_stats.round(2) 41 | quantile_stats.columns = [col.title() for col in quantile_stats.columns] 42 | quantile_stats.index.name = 'Ntile:' 43 | 44 | plotter.render_table(quantile_stats, 'Quantiles Statistics') 45 | 46 | def summary_plots(self) -> None: 47 | """ 48 | plots the the summary of the factor 49 | """ 50 | no_index_factor_data = self._factor_data.reset_index().dropna() 51 | date_agg = no_index_factor_data.groupby('date') 52 | date_ntile_agg = no_index_factor_data.groupby(['date', 'ntile']) 53 | 54 | plotter.plot_inspection_data(date_agg.factor.count(), 'Universe Count Of Factor Per Period', 'Count') 55 | plotter.plot_inspection_data(date_ntile_agg.factor.count().unstack(), 'Ntile Count of Factor Per Period', 56 | 'Count') 57 | plotter.plot_inspection_data(date_ntile_agg.factor.median().unstack(), 'Median Factor Value by Ntile', 'Median', 58 | 2) 59 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/tilts_backtest_tear.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | import pandas as pd 5 | 6 | from .backtest_tear import BacktestTear 7 | from .. import plotter 8 | from .. import utils 9 | 10 | from ..portals.base_portal import BaseGrouperPortalConstant 11 | 12 | 13 | class TiltsBacktestTear(BacktestTear, ABC): 14 | """ 15 | generates a tear sheet which shows the sector exposures of a strategy 16 | Must be run after the backtest tear 17 | """ 18 | 19 | def __init__(self, ntile_matrix: pd.DataFrame, daily_returns: pd.DataFrame, ntiles, holding_period: int, 20 | long_short: bool, market_neutral: bool, show_uni: bool, factor_data: pd.DataFrame, 21 | group_portal: Optional[BaseGrouperPortalConstant], show_ntile_tilts: bool): 22 | """ 23 | :param ntile_matrix: unstacked and formatted ntiles prepared by Ntiles 24 | :param daily_returns: unstacked and formatted daily returns from Ntiles 25 | :param holding_period: How long we want to hold positions for, represents days 26 | :param ntiles: amount of bins we are testing (1 is high factor value n is low value) 27 | :param long_short: show we compute the spread between ntiles: (1 - n) 28 | :param market_neutral: subtract out the universe returns from the ntile returns? 29 | :param show_uni: should universe return be shown in the spread plot? 30 | :param factor_data: the factor data from Ntiles 31 | :param group_portal: the group portal holding the groups. If this is None then the exposures will not be shown 32 | :param show_ntile_tilts: Should we show the exposures for each individual ntile? 
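(Illustrative worked example: tilts are computed as an ntile's weight in a group minus that group's weight in the whole universe, so if Tech makes up 20% of the universe but 35% of Ntile 1, the reported Ntile 1 tilt for Tech is +0.15.)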
33 | """ 34 | 35 | super().__init__(ntile_matrix, daily_returns, ntiles, holding_period, long_short, market_neutral, show_uni) 36 | self._factor_data = factor_data 37 | self._group_portal = group_portal 38 | self._show_ntile_tilts = show_ntile_tilts 39 | 40 | self._daily_group_weights = {} 41 | self._full_group_tilt_avg = {} 42 | 43 | def compute(self) -> None: 44 | """ 45 | master function for the tear sheet 46 | :return: None 47 | """ 48 | super().compute() 49 | 50 | if (self._group_portal is not None) and (self._show_ntile_tilts or self.long_short): 51 | self.compute_tilts() 52 | 53 | def plot(self) -> None: 54 | """ 55 | plots the tear sheet 56 | """ 57 | super().plot() 58 | if (self._group_portal is not None) and (self._show_ntile_tilts or self.long_short): 59 | self.make_plots() 60 | 61 | def compute_tilts(self): 62 | """ 63 | computes the daily tilt data for each group 64 | :return: None 65 | """ 66 | self.compute_group_weights() 67 | if self.long_short: 68 | self.calculate_long_short_tilts() 69 | 70 | def compute_group_weights(self): 71 | """ 72 | computes the weights by group for each ntile 73 | currently computes data but work because need a time series data adjusted for index constitutes 74 | have to use self.factor_data 75 | :return: None 76 | """ 77 | group_info = self._group_portal.group_information 78 | center_weight = group_info.groupby(group_info).count() / group_info.shape[0] 79 | center_weight = utils.remove_cat_index(center_weight) 80 | 81 | if self._show_ntile_tilts: 82 | ntile_keys = self.daily_weights.keys() 83 | else: 84 | ntile_keys = [min(self.daily_weights.keys()), max(self.daily_weights.keys())] 85 | 86 | new_col = self.daily_weights[ntile_keys[0]].columns.astype(str).map(self._group_portal.group_mapping) 87 | 88 | for ntile in ntile_keys: 89 | frame = self.daily_weights[ntile] 90 | frame.columns = new_col 91 | frame = self.daily_weights[ntile].stack().to_frame('weight') 92 | frame.index.names = ['date', 'group'] 93 | 94 | weights_unstacked = frame.groupby(['date', 'group']).sum().sub(center_weight, level=1, axis=0).unstack() 95 | weights_unstacked.columns = weights_unstacked.columns.droplevel(0) 96 | 97 | self._daily_group_weights[ntile] = weights_unstacked 98 | self._full_group_tilt_avg[ntile] = (frame.groupby('group').sum().weight 99 | / frame.index.levels[0].unique().shape[0] 100 | - center_weight) 101 | 102 | def calculate_long_short_tilts(self): 103 | """ 104 | calculates the time series tilts for the long short portfolio 105 | :return: None 106 | """ 107 | ntile_n = max(self._daily_group_weights.keys()) 108 | self._daily_group_weights['Long Short'] = (self._daily_group_weights['Ntile: 1'] 109 | - self._daily_group_weights[ntile_n]) 110 | self._full_group_tilt_avg['Long Short'] = self._daily_group_weights['Long Short'].stack().groupby( 111 | 'group').mean() 112 | 113 | def make_plots(self): 114 | print('Weights By Group') 115 | for ntile in self._daily_group_weights.keys(): 116 | if 'Long Short' == ntile and not self.long_short: 117 | continue 118 | if 'Ntile' in ntile and not self._show_ntile_tilts: 119 | continue 120 | ax = plotter.plot_tilt_hist(self._full_group_tilt_avg[ntile], ntile, self._group_portal.name) 121 | plotter.plot_tilts(self._daily_group_weights[ntile], ntile, self._group_portal.name, ax) 122 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/turnover_tear.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 
from abc import ABC 4 | 5 | import duckdb 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from .base_tear import BaseTear 10 | from .. import plotter, utils 11 | 12 | 13 | class TurnoverTear(BaseTear, ABC): 14 | """ 15 | Shows the turnover for a factor 16 | """ 17 | 18 | def __init__(self, factor_data: pd.DataFrame, holding_period: Union[int, List[int]]): 19 | super().__init__() 20 | self._factor_data = factor_data 21 | self._holding_period = holding_period 22 | 23 | self._auto_corr = None 24 | self._turnover = None 25 | self._summary_stats = dict() 26 | 27 | def compute(self) -> None: 28 | """ 29 | calculates the data for the tear 30 | """ 31 | 32 | self.calculate_autocorrelation() 33 | self.calculate_turnover() 34 | 35 | self.calculate_summary_stats() 36 | 37 | def plot(self) -> None: 38 | """ 39 | plots the tear 40 | """ 41 | self.plot_turnover() 42 | 43 | def calculate_autocorrelation(self) -> None: 44 | """ 45 | Calculates the auto correlation of the factor with a lag of self._holding_period 46 | 47 | calculates the autocorrelation of n and n - holding period 48 | """ 49 | factor_unstacked = self._factor_data['factor'].unstack() 50 | auto_corr_arr = utils.correlation_2d(factor_unstacked.to_numpy(), 51 | factor_unstacked.shift(self._holding_period).to_numpy()) 52 | 53 | self._auto_corr = pd.Series(auto_corr_arr, index=factor_unstacked.index) 54 | 55 | def calculate_turnover(self): 56 | """ 57 | Calculates the turnover of the top and bottom bin with a lag of self._holding_period 58 | 59 | calculates the turnover of n and n - holding period 60 | """ 61 | # getting frame of only the top and bottom bin 62 | max_ntile = self._factor_data['ntile'].max() 63 | turnover_frame = self._factor_data[['ntile']][self._factor_data['ntile'].isin([1, max_ntile])] 64 | turnover_frame['ntile_shifted'] = turnover_frame['ntile'].unstack().shift(self._holding_period).stack() 65 | turnover_frame['changed'] = turnover_frame['ntile'] != turnover_frame['ntile_shifted'] 66 | 67 | # fd = self._factor_data[['ntile']].reset_index() 68 | # fd['date'] = fd['date'].dt.to_timestamp() 69 | # max_ntile = fd['ntile'].max() 70 | # turnover_sql = f"""SELECT "date", "ntile", 71 | # "ntile" != lag("ntile", {self._holding_period}) OVER (PARTITION BY id ORDER BY "date") as "changed" 72 | # FROM fd 73 | # WHERE "ntile" in (1, {max_ntile})""" 74 | # 75 | # con = duckdb.connect(':memory:') 76 | # turnover_frame = con.execute(turnover_sql).df() 77 | # con.close() 78 | 79 | final_turnover = turnover_frame.groupby(['date', 'ntile']).changed.agg(sum=sum, count=len) 80 | 81 | self._turnover = (final_turnover['sum'] / final_turnover['count']).unstack() 82 | 83 | def calculate_summary_stats(self) -> None: 84 | """ 85 | sets the summary stats for the autocorelation and the turnover 86 | """ 87 | self._summary_stats['auto'] = self._auto_corr.agg( 88 | {'Mean AC': np.mean, 'Median AC': np.median, 'Std AC': np.std}).round(3).to_frame( 89 | f'{self._holding_period}D').transpose() 90 | 91 | self._summary_stats['turnover'] = self._turnover.stack().groupby('ntile').agg( 92 | **{'Mean Turnover': np.mean, 'Median Turnover': np.median, 'Std Turnover': np.std}).round(3) 93 | 94 | def plot_turnover(self) -> None: 95 | """ 96 | plots the time series data in self.auto_corr 97 | """ 98 | print('Autocorrelation') 99 | plotter.render_table(self._summary_stats['auto']) 100 | plotter.plot_auto_corr(self._auto_corr, self._holding_period) 101 | 102 | print('Turnover') 103 | plotter.render_table(self._summary_stats['turnover']) 104 | 
plotter.plot_turnover(self._turnover, self._holding_period) 105 | -------------------------------------------------------------------------------- /ntiles/backtest/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import numba as nb 4 | import duckdb 5 | from typing import Optional, List, Union 6 | 7 | 8 | def subset_frame(frame: pd.DataFrame, columns: Optional[List[str]]): 9 | """ 10 | subsets the given data frame by the given columns 11 | if the columns are none then the whole frame is returned 12 | :param frame: the dataframe to subset 13 | :param columns: the columns we are going to subset by, if none then nothing is done 14 | :return: Given frame subset by the given columns 15 | """ 16 | if columns: 17 | return frame[columns] 18 | return frame 19 | 20 | 21 | def get_ntile_cols(frame: pd.DataFrame) -> List[str]: 22 | """ 23 | :param frame: data frame to get columns of 24 | :return: all columns in the frame that contain 'Ntile' 25 | """ 26 | return [col for col in frame.columns if 'Ntile' in col] 27 | 28 | 29 | def get_non_ntile_cols(frame: pd.DataFrame) -> List[str]: 30 | """ 31 | :param frame: data frame to get columns of 32 | :return: all columns in the frame that dont contain 'Ntile' 33 | """ 34 | return [col for col in frame.columns if 'Ntile' not in col] 35 | 36 | 37 | def make_nan_inf_summary(df: Union[pd.DataFrame, pd.Series], max_loss: float, print_good: bool = True) -> pd.DataFrame: 38 | """ 39 | makes a summary fot the the amount of nan and infinity values in the given data frame 40 | will throw a ValueError if the percent of nan and inf is greater than the given threshold 41 | prints a summary of the nan's and inf of there are any 42 | :param df: the data frame we are checking 43 | :param max_loss: max decimal percent of nan and inf we are allowing the df to contain 44 | :param print_good: should we print the output if we dropped less then the threshold? 45 | :return: pandas data frame with the nan and inf dropped 46 | """ 47 | df_numpy = df.to_numpy() 48 | nan_array = np.isnan(df_numpy) 49 | finite_array = np.logical_or(np.isinf(df_numpy), np.isneginf(df_numpy)) 50 | 51 | if nan_array.any() or (not finite_array.all()): 52 | factor_length = len(df) 53 | amount_nan = nan_array.sum() 54 | amount_inf = finite_array.sum() 55 | total_percent_dropped = (amount_nan + amount_inf) / factor_length 56 | 57 | outString = f'Dropped {round(total_percent_dropped * 100, 2)}% of data. ' \ 58 | f'{round((amount_nan / factor_length) * 100, 2)}% due to nan, ' \ 59 | f'{round((amount_inf / factor_length) * 100, 2)}% of inf values. Threshold: {max_loss * 100}%\n' 60 | 61 | if total_percent_dropped > max_loss: 62 | raise ValueError('Exceeded Nan Infinity Threshold. 
' + outString)
63 | 
64 |         # print out string as a summary
65 |         if print_good:
66 |             print(outString)
67 | 
68 |         # dropping the nans and the infinity values
69 |         df = df.replace([np.inf, -np.inf], np.nan).dropna()
70 | 
71 |     elif print_good:
72 |         print('Dropped 0% of data')
73 | 
74 |     return df
75 | 
76 | 
77 | def rolling_sum(a, n):
78 |     """
79 |     rolling sum, column wise
80 |     :param a: array to roll and sum
81 |     :param n: length of rolling window
82 |     :return: a[n:, :] of rolling sum
83 |     """
84 |     if n == 1:
85 |         return a
86 | 
87 |     cum_sum = np.cumsum(a, axis=0)
88 |     cum_sum[n:, :] = cum_sum[n:, :] - cum_sum[:-n, :]
89 |     return cum_sum[n - 1:, :]
90 | 
91 | 
92 | @nb.njit(parallel=True)
93 | def correlation_2d(factor: np.array, returns: np.array) -> np.array:
94 |     """
95 |     calculates a timeseries of correlation for the given factor and forward returns
96 |     factor and returns must have EXACTLY the same structure and order of assets/days
97 |     think of each row as a group and we calculate the correlation by groups
98 | 
99 |     :param factor: 2d np.array, each row represents factor values for different assets on same day
100 |     :param returns: 2d np.array, each row represents forward returns for different assets on same day
101 |     :return: 1d np.array representing the time series of correlations between factor and forward returns
102 |     """
103 |     if factor.shape != returns.shape:
104 |         raise ValueError('Factor and returns dont represent same information')
105 | 
106 |     num_rows = factor.shape[0]
107 |     out = np.empty(shape=num_rows)
108 | 
109 |     for i in nb.prange(num_rows):
110 |         finite_mask = np.isfinite(factor[i]) & np.isfinite(returns[i])
111 |         out[i] = np.corrcoef(factor[i][finite_mask], returns[i][finite_mask])[0][1]
112 | 
113 |     return out
114 | 
115 | 
116 | def pad_extra_day(matrix_df: pd.DataFrame, pad_value: any) -> pd.DataFrame:
117 |     """
118 |     pads an unstacked frame with a single extra row at the start of the data frame
119 |     :param matrix_df: df to pad, index: pd.Period, columns: any, values: any
120 |     :param pad_value: constant value to insert into a row
121 |     :return: matrix_df with a padded value
122 |     """
123 |     out = matrix_df.copy()
124 |     new_period = (out.index.min().to_timestamp() - pd.DateOffset(1)).to_period('D')
125 |     out.loc[new_period, :] = np.full(shape=out.shape[1], fill_value=pad_value)
126 |     return out.sort_index()  # can make this function better without a sort
127 | 
128 | 
129 | def remove_cat_index(frame: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
130 |     """
131 |     if the frame has a categorical index it will remove it
132 |     :return: frame with the categorical index removed
133 |     """
134 |     if frame.index.is_categorical():
135 |         frame.index = frame.index.astype(str)
136 | 
137 |     return frame
138 | 
139 | 
140 | def convert_date_to_period(frame: Union[pd.DataFrame, pd.Series], freq: str = 'D', **kwargs) -> Union[
141 |     pd.DataFrame, pd.Series]:
142 |     """
143 |     converts the date column to a period if the date column is of type timestamp
144 |     if the 'date' column is a period then nothing will be changed
145 |     date can be in the index or columns
146 | 
147 |     :param frame: the frame containing the date column
148 |     :param freq: the freq for the period
149 |     :return: the same frame that was passed but with 'date' as a period.
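Example (illustrative, assuming a frame whose 'date' index level holds pd.Timestamps): convert_date_to_period(my_frame, freq='M') returns the same frame with 'date' converted to a monthly pd.Period.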
150 |     """
151 |     index_names = list(frame.index.names)
152 |     frame = frame.reset_index()
153 | 
154 |     if 'date' in frame.columns:
155 |         frame['date'] = frame['date'].dt.to_period(freq)
156 |         frame = frame.set_index(index_names)
157 |         return frame
158 | 
159 |     raise ValueError('"date" not found in data frame')
160 | 
161 | 
162 | def ntile(factor: pd.Series, ntiles: int) -> pd.DataFrame:
163 |     """
164 |     Universe relative Quantiles of a factor by day
165 |     Around 100X faster than pandas groupby qcut
166 | 
167 |     pd.DataFrame of ntiled factor
168 |     index: (pd.Period, _asset_id)
169 |     Columns: (factor, ntile)
170 |     Values: (factor value, Ntile corresponding to factor value)
171 | 
172 |     :param factor: same var as ntile_return_tearsheet
173 |     :param ntiles: same var as ntile_return_tearsheet
174 |     """
175 |     factor = factor.to_frame('factor').reset_index()
176 |     index_names = factor.columns.tolist()
177 |     index_names.remove('factor')
178 | 
179 |     date_is_period = isinstance(factor.date.dtype, pd.core.dtypes.dtypes.PeriodDtype)
180 |     if date_is_period:
181 |         factor['date'] = factor['date'].dt.to_timestamp()
182 | 
183 |     sql_quantile = f"""SELECT *, NTILE({ntiles}) OVER(PARTITION BY date ORDER BY factor.factor DESC) as ntile
184 |                        FROM factor
185 |                        WHERE factor.factor IS NOT NULL"""
186 |     con = duckdb.connect(':memory:')
187 |     factor_ntile = con.execute(sql_quantile).df()
188 |     con.close()
189 | 
190 |     if date_is_period:
191 |         factor_ntile['date'] = factor_ntile['date'].dt.to_period(freq='D')
192 | 
193 |     factor_ntile = factor_ntile.set_index(index_names)
194 |     return factor_ntile
195 | 
--------------------------------------------------------------------------------
/ntiles/examples/ic_ac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/ic_ac.png
--------------------------------------------------------------------------------
/ntiles/examples/inspection_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/inspection_1.png
--------------------------------------------------------------------------------
/ntiles/examples/inspection_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/inspection_2.png
--------------------------------------------------------------------------------
/ntiles/examples/return_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/return_1.png
--------------------------------------------------------------------------------
/ntiles/examples/return_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/return_2.png
--------------------------------------------------------------------------------
/ntiles/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/ntiles/tests/constitute_adjustment_test.py:
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pandas import ( 4 | Timestamp, 5 | DataFrame, 6 | concat, 7 | MultiIndex 8 | ) 9 | 10 | from ntiles.toolbox.constitutes.constitute_adjustment import ConstituteAdjustment 11 | from ntiles.toolbox.utils.date_config import DateConfig 12 | 13 | 14 | class ConstituteAdjustmentTest(unittest.TestCase): 15 | 16 | def examples(self): 17 | self.foo_constitutes = DataFrame(data=[ 18 | # symbol entered exited 19 | ['BOB', '2009-01-01', '2012-01-01'], # whole thing 20 | ['LARY', '2010-01-05', '2010-01-07'], # added and then exited 21 | ['JEFF', '2011-03-02', '2020-03-02']], # added too late 22 | columns=['symbol', 'from', 'thru'] 23 | ) 24 | self.date_config = DateConfig(freq='D', date_format='%Y-%m-%d', target_data_type='timestamp') 25 | self.ca = ConstituteAdjustment(id_col='symbol', date_config=self.date_config) 26 | self.ca.add_universe_info(universe=self.foo_constitutes, start_date='2010-01-04', end_date='2010-01-12', ) 27 | 28 | self.foo_data = DataFrame( 29 | data=[['BOB', '2010-01-04', 50], 30 | ['BOB', '2010-01-05', 51], 31 | ['BOB', '2010-01-06', 52], 32 | ['BOB', '2010-01-07', 53], 33 | # ['BOB', '2010-01-08', 54], this will be missing data 34 | ['BOB', '2010-01-11', 55], 35 | ['BOB', '2010-01-12', 56], 36 | ['LARY', '2010-01-04', 20], # should not be included 37 | ['LARY', '2010-01-05', 21], 38 | ['LARY', '2010-01-06', 22], 39 | ['LARY', '2010-01-07', 23], 40 | ['LARY', '2010-01-08', 24], # should not be included 41 | ['LARY', '2010-01-11', 25], # should not be included 42 | ['LARY', '2010-01-12', 26], # should not be included 43 | ['LARY', '2010-01-13', 27], # should not be included 44 | ['FOO', '2010-01-08', 0]], # should be ignored 45 | columns=['symbol', 'date', 'factor']) 46 | 47 | self.adjusted_foo = DataFrame( 48 | data=[['BOB', Timestamp('2010-01-04'), 50], 49 | ['BOB', Timestamp('2010-01-05'), 51], 50 | ['BOB', Timestamp('2010-01-06'), 52], 51 | ['BOB', Timestamp('2010-01-07'), 53], 52 | ['BOB', Timestamp('2010-01-08'), None], 53 | ['BOB', Timestamp('2010-01-11'), 55], 54 | ['BOB', Timestamp('2010-01-12'), 56], 55 | ['LARY', Timestamp('2010-01-05'), 21], 56 | ['LARY', Timestamp('2010-01-06'), 22], 57 | ['LARY', Timestamp('2010-01-07'), 23]], 58 | columns=['symbol', 'date', 'factor']).set_index(['date', 'symbol']) 59 | 60 | pricing_data = DataFrame( 61 | data=[['LARY', Timestamp('2010-01-08'), 24], 62 | ['LARY', Timestamp('2010-01-11'), 25], 63 | ['LARY', Timestamp('2010-01-12'), 26]], 64 | columns=['symbol', 'date', 'factor']).set_index(['date', 'symbol']) 65 | 66 | self.adjusted_pricing = concat([pricing_data, self.adjusted_foo]).sort_values(['symbol', 'date']) 67 | 68 | # 69 | # ************************************ add_universe_info ************************************ 70 | # 71 | 72 | def test_factor_add_universe_info(self): 73 | """ 74 | testing the index generation in add_universe_info 75 | has missing data (None), data that should not be included (yet to be added, has been removed) and 76 | irrelevant symbols 77 | """ 78 | self.examples() 79 | 80 | # for factors 81 | factor_components = [(Timestamp('2010-01-04'), 'BOB'), 82 | (Timestamp('2010-01-05'), 'BOB'), 83 | (Timestamp('2010-01-06'), 'BOB'), 84 | (Timestamp('2010-01-07'), 'BOB'), 85 | (Timestamp('2010-01-08'), 'BOB'), 86 | (Timestamp('2010-01-11'), 'BOB'), 87 | (Timestamp('2010-01-12'), 'BOB'), 88 | (Timestamp('2010-01-05'), 'LARY'), 89 | (Timestamp('2010-01-06'), 'LARY'), 90 | (Timestamp('2010-01-07'), 
'LARY')] 91 | 92 | self.assertTrue(MultiIndex.from_tuples(factor_components).equals(self.ca.factor_components)) 93 | 94 | def test_throw_column_error(self): 95 | """ 96 | ensuring a error will be thrown when the correct columns are not supplied 97 | """ 98 | self.examples() 99 | 100 | with self.assertRaises(ValueError) as em: 101 | self.ca.add_universe_info(start_date='2010-01-04', 102 | end_date='2010-01-12', 103 | universe=DataFrame(columns=['foo', 'foo1', 'foo2'])) 104 | self.assertEqual('Required column "symbol" is not present', str(em.exception)) 105 | 106 | def test_duplicate_symbols(self): 107 | """ 108 | Ensuring that passing a df with duplicate symbols will raise a ValueError 109 | """ 110 | self.examples() 111 | 112 | self.foo_constitutes.iat[1, 0] = 'BOB' 113 | 114 | with self.assertRaises(ValueError) as em: 115 | self.ca.add_universe_info(start_date='2010-01-04', 116 | end_date='2010-01-12', 117 | universe=self.foo_constitutes) 118 | self.assertEqual('The column symbol is 0.333 duplicates, 1 rows\n', str(em.exception)) 119 | 120 | # 121 | # ************************************ adjust_data_for_membership ************************************ 122 | # 123 | 124 | def test_adjust_data_for_membership(self): 125 | """ 126 | ensuring adjust_data_for_membership return the correct data frame 127 | data given has good data to index, not seen bad tickers, and tickers with dates out of bounds 128 | """ 129 | self.examples() 130 | filtered = self.ca.adjust_data_for_membership(data=self.foo_data) 131 | self.assertTrue(self.adjusted_foo['factor'].sort_index().equals(filtered.sort_index())) 132 | 133 | def test_throw_error_adjust_data_for_membership(self): 134 | """ 135 | ensuring adjust_data_for_membership throws error when not given symbols or date 136 | """ 137 | self.examples() 138 | 139 | with self.assertRaises(ValueError) as em: 140 | self.ca.adjust_data_for_membership(data=DataFrame(columns=['foo', 'notSymbol', 'factor'])) 141 | self.assertEqual('Required column "date" is not present', str(em.exception)) 142 | 143 | def test_no_index_set_adjust_data_for_membership(self): 144 | """ 145 | ensuring adjust_data_for_membership throws error when there is no index set 146 | AKA add_universe_info was never called 147 | """ 148 | self.examples() 149 | 150 | with self.assertRaises(ValueError) as em: 151 | ConstituteAdjustment().adjust_data_for_membership(data=self.foo_data) 152 | self.assertEqual('Universe is not set', str(em.exception)) 153 | 154 | 155 | if __name__ == '__main__': 156 | unittest.main() 157 | -------------------------------------------------------------------------------- /ntiles/tests/ml_factor_calculation_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from abc import ABC 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from toolbox.utils.ml_factor_calculation import ModelWrapper, calc_ml_factor, generate_indexes 8 | from toolbox.utils.ml_factor_calculation import SliceHolder 9 | 10 | 11 | class MyTestCase(unittest.TestCase): 12 | 13 | def examples(self): 14 | # index includes non trading days 15 | # exactly 60 occurrences of each ticker 16 | first = pd.Timestamp(year=2010, month=1, day=1) 17 | self.date_index = pd.MultiIndex.from_product( 18 | [pd.date_range(start=first, end=pd.Timestamp(year=2010, month=3, day=1)), 19 | ['BOB', 'JEFF', 'CARL']], names=['date', 'symbol']) 20 | 21 | self.expected_index_e5_10_30 = [ 22 | (SliceHolder(first, first + pd.Timedelta(days=29)), 23 | SliceHolder(first + 
pd.Timedelta(days=40), first + pd.Timedelta(days=44))), 24 | 25 | (SliceHolder(first, first + pd.Timedelta(days=34)), 26 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=49))), 27 | 28 | (SliceHolder(first, first + pd.Timedelta(days=39)), 29 | SliceHolder(first + pd.Timedelta(days=50), first + pd.Timedelta(days=54))), 30 | 31 | (SliceHolder(first, first + pd.Timedelta(days=44)), 32 | SliceHolder(first + pd.Timedelta(days=55), first + pd.Timedelta(days=59))) 33 | ] 34 | 35 | self.expected_index_e7_8_30 = [ 36 | (SliceHolder(first, first + pd.Timedelta(days=29)), 37 | SliceHolder(first + pd.Timedelta(days=37), first + pd.Timedelta(days=44))), 38 | 39 | (SliceHolder(first, first + pd.Timedelta(days=37)), 40 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=52))), 41 | 42 | (SliceHolder(first, first + pd.Timedelta(days=45)), 43 | SliceHolder(first + pd.Timedelta(days=53), first + pd.Timedelta(days=59))), 44 | ] 45 | 46 | self.expected_index_e5_10_30 = self.turn_to_datetime64(self.expected_index_e5_10_30) 47 | self.expected_index_e7_8_30 = self.turn_to_datetime64(self.expected_index_e7_8_30) 48 | 49 | self.expected_index_r5_10_30 = [ 50 | (SliceHolder(first, first + pd.Timedelta(days=29)), 51 | SliceHolder(first + pd.Timedelta(days=40), first + pd.Timedelta(days=44))), 52 | 53 | (SliceHolder(first + pd.Timedelta(days=5), first + pd.Timedelta(days=34)), 54 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=49))), 55 | 56 | (SliceHolder(first + pd.Timedelta(days=10), first + pd.Timedelta(days=39)), 57 | SliceHolder(first + pd.Timedelta(days=50), first + pd.Timedelta(days=54))), 58 | 59 | (SliceHolder(first + pd.Timedelta(days=15), first + pd.Timedelta(days=44)), 60 | SliceHolder(first + pd.Timedelta(days=55), first + pd.Timedelta(days=59))) 61 | ] 62 | 63 | self.expected_index_r7_8_30 = [ 64 | (SliceHolder(first, first + pd.Timedelta(days=29)), 65 | SliceHolder(first + pd.Timedelta(days=37), first + pd.Timedelta(days=44))), 66 | 67 | (SliceHolder(first + pd.Timedelta(days=8), first + pd.Timedelta(days=37)), 68 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=52))), 69 | 70 | (SliceHolder(first + pd.Timedelta(days=16), first + pd.Timedelta(days=45)), 71 | SliceHolder(first + pd.Timedelta(days=53), first + pd.Timedelta(days=59))), 72 | ] 73 | 74 | self.expected_index_r5_10_30 = self.turn_to_datetime64(self.expected_index_r5_10_30) 75 | self.expected_index_r7_8_30 = self.turn_to_datetime64(self.expected_index_r7_8_30) 76 | 77 | class FooModel(ModelWrapper, ABC): 78 | def fit_model(self, tf: pd.DataFrame, tt: pd.Series): 79 | pass 80 | 81 | self.fooModel = FooModel() 82 | 83 | self.foo_target = pd.Series(index=self.date_index, dtype='float64') 84 | self.foo_target.loc[:] = 0 85 | self.fooFeatures = pd.DataFrame(index=self.date_index) 86 | self.fooFeatures.loc[:] = 0 87 | 88 | # 89 | # ************************************ generate_indexes ************************************ 90 | # 91 | 92 | def test_expanding_generateIndexes(self): 93 | """ 94 | testing generate indexes using the expanding param 95 | Turning slice lists to string. 
Comparing equality of np.datetime64 is annoying 96 | """ 97 | self.examples() 98 | 99 | # no left over days all even slices 100 | returnedIndexesE10_5_30 = list( 101 | generate_indexes(data_index=self.date_index, eval_days=10, refit_every=5, expanding=30)) 102 | self.assertEqual(str(self.expected_index_e5_10_30), str(returnedIndexesE10_5_30)) 103 | 104 | # left over days last slice will be of size 1 105 | returnedIndexesE7_8_30 = list( 106 | generate_indexes(data_index=self.date_index, eval_days=7, refit_every=8, expanding=30)) 107 | self.assertEqual(str(self.expected_index_e7_8_30), str(returnedIndexesE7_8_30)) 108 | 109 | def test_rolling_generateIndexes(self): 110 | """ 111 | testing generate indexes using the rolling param 112 | Turning slice lists to string. Comparing equality of np.datetime64 is annoying 113 | """ 114 | self.examples() 115 | # no left over days all even slices 116 | returnedIndexesR10_5_30 = list( 117 | generate_indexes(data_index=self.date_index, eval_days=10, refit_every=5, rolling=30)) 118 | self.assertEqual(str(self.expected_index_r5_10_30), str(returnedIndexesR10_5_30)) 119 | 120 | # left over days last slice will be of size 1 121 | returnedIndexesR7_8_30 = list( 122 | generate_indexes(data_index=self.date_index, eval_days=7, refit_every=8, rolling=30)) 123 | 124 | self.assertEqual(str(self.expected_index_r7_8_30), str(returnedIndexesR7_8_30)) 125 | 126 | # 127 | # ************************************ calcMlFactor ************************************ 128 | # 129 | 130 | def test_negative_calcMlFactor(self): 131 | """ 132 | testing for error when eval_days, refit_every, expanding, rolling is less than one 133 | this also tests generate_indexes 134 | """ 135 | self.examples() 136 | 137 | # eval_days 138 | with self.assertRaises(ValueError) as em: 139 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=0, 140 | refit_every=1, expanding=1) 141 | self.assertEqual('eval_days and/or refit_every must be greater than zero', str(em.exception)) 142 | 143 | # refit_every 144 | with self.assertRaises(ValueError) as em: 145 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 146 | refit_every=0, expanding=1) 147 | self.assertEqual('eval_days and/or refit_every must be greater than zero', str(em.exception)) 148 | 149 | # expanding 150 | with self.assertRaises(ValueError) as em: 151 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 152 | refit_every=1, expanding=0) 153 | self.assertEqual('expanding must be greater than zero', str(em.exception)) 154 | 155 | # rolling 156 | with self.assertRaises(ValueError) as em: 157 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 158 | refit_every=1, rolling=0) 159 | self.assertEqual('rolling must be greater than zero', str(em.exception)) 160 | 161 | def test_rollingAndExpanding_calcMlFactor(self): 162 | """ 163 | testing for error when rolling days and expanding are both defined and not defined 164 | """ 165 | self.examples() 166 | 167 | with self.assertRaises(ValueError) as em: 168 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 169 | refit_every=1, rolling=1, expanding=1) 170 | self.assertEqual('minTrainDays and rollingDays can not both be defined', str(em.exception)) 171 | 172 | with self.assertRaises(ValueError) as em: 173 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, 
target=self.foo_target, eval_days=1, 174 | refit_every=1) 175 | self.assertEqual('minTrainDays or rollingDays must be defined', str(em.exception)) 176 | 177 | def test_contain_bad_val_calc_ml_factor(self): 178 | """ 179 | testing for when the given features and target have nan values 180 | """ 181 | self.examples() 182 | # features has a nan 183 | with self.assertRaises(ValueError) as em: 184 | self.fooFeatures[0] = 0.0 185 | self.fooFeatures.iat[1, 0] = np.nan 186 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 187 | refit_every=1) 188 | self.assertEqual('There are nan or inf values in the features', str(em.exception)) 189 | 190 | # features has a inf 191 | self.examples() 192 | with self.assertRaises(ValueError) as em: 193 | self.fooFeatures[0] = 0.0 194 | self.fooFeatures.iat[1, 0] = np.inf 195 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 196 | refit_every=1) 197 | self.assertEqual('There are nan or inf values in the features', str(em.exception)) 198 | 199 | # target has a nan 200 | self.examples() 201 | with self.assertRaises(ValueError) as em: 202 | self.foo_target.iat[1] = np.nan 203 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 204 | refit_every=1) 205 | self.assertEqual('There are nan or inf values in the target', str(em.exception)) 206 | 207 | # target has a inf 208 | self.examples() 209 | with self.assertRaises(ValueError) as em: 210 | self.foo_target.iat[1] = np.inf 211 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 212 | refit_every=1) 213 | self.assertEqual('There are nan or inf values in the target', str(em.exception)) 214 | 215 | @staticmethod 216 | def turn_to_datetime64(convert): 217 | """ 218 | helper converts SliceHolder of pd.Timestamp to SliceHolder of np.datetime64 219 | """ 220 | return [(SliceHolder(s[0].start.to_datetime64(), s[0].end.to_datetime64()), 221 | SliceHolder(s[1].start.to_datetime64(), s[1].end.to_datetime64())) 222 | for s in convert] 223 | 224 | 225 | if __name__ == '__main__': 226 | unittest.main() 227 | -------------------------------------------------------------------------------- /ntiles/toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | # constitutes 2 | from .constitutes.constitute_adjustment import ConstituteAdjustment 3 | 4 | # utils 5 | from .utils.format_data_alphalens import price_format_for_alphalens, factor_format_for_alphalens 6 | from .utils.ml_factor_calculation import calc_ml_factor 7 | from .utils.ml_factor_calculation import ModelWrapper 8 | from .utils.utils import factorize, rank, ntile 9 | from .utils.date_config import DateConfig 10 | 11 | # db functions 12 | from .db.read.query_constructor import QueryConstructor 13 | from .db.api.sql_connection import SQLConnection 14 | from .db.read.db_functions import table_info, db_tables 15 | from .db.write.create_tables import IngestDataBase 16 | from .db.read.universe import (ETFUniverse, 17 | clear_etf_universes, 18 | clear_built_universes, 19 | BuiltUniverse, 20 | dispatch_universe_path) 21 | 22 | __all__ = [ 23 | 'ConstituteAdjustment', 24 | 'price_format_for_alphalens', 25 | 'factor_format_for_alphalens', 26 | 'calc_ml_factor', 27 | 'ModelWrapper', 28 | 'factorize', 29 | 'rank', 30 | 'ntile', 31 | 'QueryConstructor', 32 | 'SQLConnection', 33 | 'table_info', 34 | 'IngestDataBase', 35 | 'ETFUniverse', 36 | 
'clear_etf_universes', 37 | 'clear_built_universes', 38 | 'BuiltUniverse', 39 | 'dispatch_universe_path', 40 | 'db_tables', 41 | 'DateConfig', 42 | ] 43 | -------------------------------------------------------------------------------- /ntiles/toolbox/constitutes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/constitutes/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/constitutes/constitute_adjustment.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | import duckdb 4 | import pandas as pd 5 | 6 | from ntiles.toolbox.utils.date_config import DateConfig 7 | from ..db.read.query_constructor import QueryConstructor 8 | from ..db.api.sql_connection import SQLConnection 9 | from ..utils.handle_data import handle_duplicates 10 | 11 | # this allows compatibility with python 3.6 12 | try: 13 | import pandas_market_calendars as mcal 14 | except ImportError as e: 15 | pass 16 | 17 | 18 | class ConstituteAdjustment: 19 | """ 20 | Provides the functionality of indexing a data to match a universe 21 | Correctly identifying on what day which asset should be in/not in the data set based on given universe data 22 | """ 23 | 24 | def __init__(self, 25 | id_col: str = 'permno', 26 | date_config: DateConfig = None 27 | ): 28 | """ 29 | constructor for ConstituteAdjustment 30 | :param id_col: the asset identifier column for the data that will be passed 31 | :param date_type: should the date be outputted as a pd.Period or a pd.Timestamp? 32 | self._universe_factor: holds the index constitutes for the factor in a MultiIndex of date, 33 | self._id_col 34 | self._universe_pricing: holds the index constitutes for the pricing in a MultiIndex of date, 35 | self._id_col 36 | """ 37 | self._id_col = id_col 38 | self._date_config = date_config 39 | 40 | self._universe_factor: Optional[pd.MultiIndex] = None 41 | 42 | def add_universe_info(self, 43 | universe: pd.DataFrame, 44 | start_date: str, 45 | end_date: str, 46 | calender: str = 'NYSE' 47 | ) -> None: 48 | """ 49 | Adds universe data to the ConstituteAdjustment object 50 | Creates a factors index which is simply the range of "from" to "thru" 51 | 52 | :param universe: a pandas data frame containing index component information. 53 | MUST HAVE COLUMNS: self._id_col representing the asset identifier, 54 | 'from' start trading date on the index, 55 | 'thru' end trading date on the index, 56 | If 'from', 'thru' are not pd.TimeStamps than a date_format MUST BE PASSED. 
57 | if no date_format is passed its assumed that they are in a pd.TimeStamp object 58 | :param start_date: The first date we want to get data for 59 | :param end_date: The last first date we want to get data for 60 | :param calender: The trading calender we want to use to get the dates 61 | :return: None 62 | """ 63 | # making sure date and self._id_col are in the columns 64 | universe = _check_columns([self._id_col, 'from', 'thru'], universe) 65 | 66 | # will throw an error if there are duplicate self._id_col 67 | handle_duplicates(df=universe, out_type='ValueError', name=f'The column {self._id_col}', 68 | drop=False, subset=[self._id_col]) 69 | 70 | # making sure the dates are in the correct format 71 | universe = (self._date_config 72 | .copy(target_data_type='timestamp', resample=False, grouper_keys=[]) 73 | .configure_dates(universe, ['from', 'thru'])) 74 | 75 | relevant_cal = (mcal.get_calendar(calender) 76 | .valid_days(start_date=start_date, end_date=end_date) 77 | .to_frame(name='date')) 78 | relevant_cal = (self._date_config 79 | .copy(target_data_type='timestamp', resample=True, grouper_keys=[]) 80 | .configure_dates(relevant_cal, 'date') 81 | .set_index('date') 82 | .rename({'index': 'date'}, axis=1)['date']) 83 | 84 | # making a list of series to eventually concat 85 | indexes_factor: List[pd.Series] = [] 86 | 87 | for row in universe.iterrows(): 88 | symbol = row[1][self._id_col] 89 | 90 | # getting the relevant dates for the factor 91 | date_range_factors: pd.Series = relevant_cal.loc[row[1]['from']: row[1]['thru']] 92 | 93 | # converting to frame and then stacking gives us a df with the index we are making, also speed improvement 94 | indexes_factor.append( 95 | date_range_factors.to_frame(symbol).stack() 96 | ) 97 | 98 | # getting the index of the concatenated Series 99 | self._universe_factor = pd.concat(indexes_factor).index.set_names(['date', self._id_col]) 100 | 101 | def add_universe_info_from_db(self, 102 | assets: str, 103 | start_date: str, 104 | end_date: str, 105 | sql_con=None 106 | ) -> None: 107 | """ 108 | Same as add_universe_info but takes in universe info from the database, 109 | :param assets: The assets we want to get data for, Ex 'ETF_SPY' 110 | :param start_date: The first date we want to get data for string in %Y-%m-%d 111 | :param end_date: The last first date we want to get data for string in %Y-%m-%d 112 | :param sql_con: A connection to the sql database if not provided then will use default connection 113 | :return: None 114 | """ 115 | over_con = sql_con is None 116 | if sql_con is None: 117 | sql_con = SQLConnection(':memory:', close_key=self.__class__.__name__) 118 | raw_uni = (QueryConstructor(sql_con=sql_con, cache=False, freq=None) 119 | .query_universe_table(assets, fields=[self._id_col], start_date=start_date, 120 | end_date=end_date, override_sql_con=over_con) 121 | .order_by('date') 122 | .df) 123 | sql_con.close_with_key(self.__class__.__name__) 124 | self.add_universe_info_long(raw_uni, start_date, end_date) 125 | 126 | # raw_uni = (self._date_config 127 | # .copy(target_data_type='timestamp') 128 | # .configure_dates(raw_uni, 'date') 129 | # .set_index(['date', self._id_col])) 130 | # 131 | # missing_id_for = raw_uni.index.to_frame()[self._id_col].isnull().sum() / len(raw_uni) 132 | # print(f"Universe missing \"{self._id_col}\" for {round(missing_id_for * 100, 2)}% of data points") 133 | # 134 | # self._universe_factor = raw_uni.index.dropna() 135 | 136 | def add_universe_info_long(self, 137 | universe: pd.DataFrame, 138 | 
start_date: Union[pd.Timestamp, str] = None, 139 | end_date: Union[pd.Timestamp, str] = None 140 | ) -> None: 141 | """ 142 | Adds universe data to the ConstituteAdjustment object from a table with long format. 143 | :param universe: a pandas data frame containing universe component information. 144 | :param start_date: The first date we want to get data for 145 | :param end_date: The last first date we want to get data for 146 | :return: None 147 | """ 148 | universe = _check_columns([self._id_col, 'date'], universe)[['date', self._id_col]] 149 | universe = (self._date_config 150 | .copy(target_data_type='timestamp') 151 | .configure_dates(universe, 'date')) 152 | universe = universe[(universe['date'] > start_date) 153 | & (universe['date'] < end_date)] 154 | self._universe_factor = universe.set_index(['date', self._id_col]).index 155 | 156 | def adjust_data_for_membership(self, 157 | data: pd.DataFrame, 158 | ) -> pd.DataFrame: 159 | """ 160 | adjusts the data set accounting for when assets are a member of the index defined in add_universe_info. 161 | 162 | factor: 163 | Ex: AAPl joined S&P500 on 2012-01-01 and leaves 2015-01-01. GOOGL joined S&P500 on 2014-01-01 and is still 164 | in the index at the time of end_date passed in add_index_info. When passing data to the 165 | adjust_data_for_membership method it will only return AAPL factor data in range 166 | 2012-01-01 to 2015-01-01 and google data in the range of 2014-01-01 to the end_date. 167 | 168 | :param data: A pandas dataframe to be filtered. 169 | Must contain columns named self._id_col, 'date' otherwise can have as may columns as desired 170 | :param adjust_dates: If True then will adjust dates as depicted in date_config but will force timestamp output 171 | :return: An indexed data frame adjusted for when assets are in the universe 172 | """ 173 | # if the add_index_info is not defined then throw error 174 | if self._universe_factor is None: 175 | raise ValueError('Universe is not set') 176 | 177 | # making sure date and self._id_col are in the columns 178 | data = _check_columns(['date', self._id_col], data, False) 179 | 180 | # if adjust_dates: 181 | data = (self._date_config 182 | .copy(resample=False, target_data_type='timestamp') 183 | .configure_dates(data, 'date')) 184 | 185 | # dropping duplicates and throwing a warning if there are any 186 | data = handle_duplicates(df=data, out_type='Warning', name='Data', drop=True, subset=['date', self._id_col]) 187 | 188 | reindex_frame = self._fast_reindex(self._universe_factor, data) 189 | 190 | # if we have dataframe with 1 column then return series 191 | if reindex_frame.shape[1] == 1: 192 | return reindex_frame.iloc[:, 0] 193 | 194 | return reindex_frame 195 | 196 | def _fast_reindex(self, 197 | reindex_by: pd.MultiIndex, 198 | frame_to_reindex: pd.DataFrame 199 | ) -> pd.DataFrame: 200 | """ 201 | Quickly reindex a pandas dataframe using a join in duckdb 202 | :param reindex_by:Desired pandas Multiindex 203 | :param frame_to_reindex: Frame we are reindexing data from 204 | :return: Reindexed Dataframe 205 | """ 206 | reindex_by = reindex_by.to_frame() 207 | 208 | id_cols = f'reindex_by.date, reindex_by.{self._id_col}' 209 | factor_cols = ', '.join([col for col in frame_to_reindex.columns if col not in ['date', self._id_col]]) 210 | query = duckdb.query(f""" 211 | SELECT {id_cols}, {factor_cols} 212 | FROM reindex_by 213 | left join frame_to_reindex on (reindex_by.date = frame_to_reindex.date) 214 | and (reindex_by.{self._id_col} = frame_to_reindex.{self._id_col}); 215 | """) 
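        # duckdb.query() resolves `reindex_by` and `frame_to_reindex` by scanning the
        # calling frame for pandas DataFrames with those names (duckdb's replacement
        # scan), so the frames do not need to be registered explicitly. The LEFT JOIN
        # keeps every (date, id) pair in the universe index and attaches factor values
        # where they exist, leaving NULLs elsewhere, which is the fast "reindex" this
        # method provides.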
216 | 217 | return self._set_dates(query.to_df()).set_index(['date', self._id_col]) 218 | 219 | def _set_dates(self, 220 | df: pd.DataFrame 221 | ) -> pd.DataFrame: 222 | """ 223 | adjusts the date column according to the self._date_type 224 | :param df: the Dataframe which we are adjusting the 'date column' for 225 | :return: df with date columns adjusted 226 | """ 227 | return self._date_config.copy(resample=False).configure_dates(df, 'date') 228 | 229 | @property 230 | def factor_components(self) -> Optional[pd.MultiIndex]: 231 | """ 232 | :return: Mutable list of tuples which represent the factor index constitutes 233 | """ 234 | return self._universe_factor 235 | 236 | 237 | def _check_columns(needed: List[str], 238 | df: pd.DataFrame, 239 | index_columns: bool = True 240 | ) -> pd.DataFrame: 241 | """ 242 | helper to check if the required columns are present 243 | raises value error if a col in needed is not in givenCols 244 | :param needed: list of needed columns 245 | :param df: df of the factor data for the given data 246 | :param index_columns: should we index the columns specified in needed when returning the df 247 | :return: Given dataframe with the correct columns and range index 248 | """ 249 | if not isinstance(df.index, pd.core.indexes.range.RangeIndex): 250 | df = df.reset_index() 251 | 252 | for col in needed: 253 | if col not in df.columns: 254 | raise ValueError(f'Required column \"{col}\" is not present') 255 | 256 | if index_columns: 257 | return df[needed] 258 | 259 | return df 260 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/__init__.py: -------------------------------------------------------------------------------- 1 | from .api.sql_connection import SQLConnection 2 | from .read.query_constructor import QueryConstructor 3 | from .write.create_tables import IngestDataBase 4 | from .write.make_universes import compustat_us_universe, crsp_us_universe 5 | from .read.db_functions import table_info 6 | from .read.universe import clear_built_universes, clear_etf_universes 7 | from .read.cached_query import clear_cache 8 | 9 | __all__ = [ 10 | 'SQLConnection', 11 | 'QueryConstructor', 12 | 'IngestDataBase', 13 | 'compustat_us_universe', 14 | 'crsp_us_universe', 15 | 'table_info', 16 | 'clear_built_universes', 17 | 'clear_etf_universes', 18 | 'clear_cache' 19 | ] 20 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/db/api/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/db/api/sql_connection.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import duckdb 4 | 5 | from ntiles.toolbox.db.settings import DB_CONNECTION_STRING 6 | 7 | 8 | class SQLConnection: 9 | """ 10 | Provides a lazy connection to a duckdb database 11 | """ 12 | 13 | def __init__(self, connection_string: Optional[str] = None, read_only: bool = True, close_key=None) -> None: 14 | """ 15 | if the connection is a memory connection then read_only will be False 16 | :param connection_string: the path to the duck db database 17 | If not passed then will look in settings.py for the string 18 | :param close_key: the key to be passed in order to close the connection in 
self.close_with_key() 19 | :return: None 20 | """ 21 | self._read_only: bool = False if connection_string == ':memory:' else read_only 22 | self._close_key = close_key 23 | 24 | self._connection_string: str = self._get_connection_string(connection_string) 25 | self._db_connection: Optional[duckdb.DuckDBPyConnection] = None 26 | 27 | @staticmethod 28 | def _get_connection_string(connection_string: Optional[str]) -> str: 29 | """ 30 | Gets the connection string for the duckdb database 31 | defaults to the passed connection_string; if that is not given it falls back to settings.py 32 | :param connection_string: the path to the duckdb database 33 | :return: connection string to the duckdb database 34 | :raise ValueError: if the param connection_string and DB_CONNECTION_STRING are both None 35 | """ 36 | if connection_string is None: 37 | if DB_CONNECTION_STRING is None: 38 | raise ValueError('Must pass a connection string or set a connection string in settings.py') 39 | return DB_CONNECTION_STRING 40 | 41 | return connection_string 42 | 43 | def _get_db_connection(self) -> None: 44 | """ 45 | sets the connection to the duckdb database; if a connection is currently open it will be closed first 46 | :return: None 47 | """ 48 | if self._db_connection: 49 | self._db_connection.close() 50 | 51 | self._db_connection = duckdb.connect(database=self._connection_string, read_only=self._read_only) 52 | 53 | @property 54 | def con(self) -> duckdb.DuckDBPyConnection: 55 | """ 56 | :return: connection to duckdb database 57 | """ 58 | if self._db_connection is None: 59 | self._get_db_connection() 60 | 61 | return self._db_connection 62 | 63 | @property 64 | def read_only(self) -> bool: 65 | """ 66 | :return: Is the connection read only? 67 | """ 68 | return self._read_only 69 | 70 | def connection_string(self) -> str: 71 | """ 72 | returns the connection string 73 | """ 74 | return self._connection_string 75 | 76 | def set_read_only(self, read_only: bool) -> None: 77 | """ 78 | setter for read only 79 | will cause the old connection to be closed and a new connection to be created 80 | if the passed read_only != self._read_only 81 | :param read_only: should the database be read only?
82 | :return: None 83 | """ 84 | if read_only != self.read_only: 85 | self._read_only = read_only 86 | 87 | if self._db_connection: 88 | self._db_connection.close() 89 | self._db_connection = None 90 | 91 | def close(self) -> None: 92 | """ 93 | will close the sql connection regardless of self._close_key 94 | """ 95 | if self._db_connection: 96 | self._db_connection.close() 97 | self._db_connection = None 98 | 99 | def close_with_key(self, close_key: str): 100 | """ 101 | will close the sql connection if the passed close_key equals self._close_key 102 | """ 103 | if close_key == self._close_key and close_key is not None: 104 | self.close() 105 | 106 | def execute(self, sql: str, **kwargs) -> duckdb.DuckDBPyConnection: 107 | """ 108 | wrapper for self.con.execute(sql) 109 | :param sql: query to run 110 | :return: raw duckdb object containing the results of the query 111 | """ 112 | return self.con.execute(sql, **kwargs) 113 | 114 | def set_threads(self, num_threads: int) -> None: 115 | """ 116 | sets the number of threads duckdb should use 117 | :return: None 118 | """ 119 | self.con.execute(f'PRAGMA threads={num_threads};') 120 | 121 | def return_other_if_open(self, other, connection_string=None, read_only=None, close_key=None): 122 | """ 123 | returns other if other is not None and matches all given conditions, else returns self 124 | if a condition arg is None then that condition is not checked 125 | can remove the current connection from the program's scope if other is not None 126 | """ 127 | if other is None: 128 | return self 129 | if connection_string and other.connection_string() != connection_string: 130 | return self 131 | if read_only and other.read_only != read_only: 132 | return self 133 | if close_key and other._close_key != close_key: 134 | return self 135 | return other 136 | 137 | 138 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/db/read/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/cached_query.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import hashlib 3 | import os 4 | 5 | import pandas as pd 6 | 7 | from datetime import datetime 8 | 9 | from ntiles.toolbox.db.settings import CACHE_DIRECTORY 10 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 11 | 12 | 13 | class CachedQuery: 14 | """ 15 | Functionality to cache results of a QueryConstructor 16 | """ 17 | 18 | def __init__(self, query: str): 19 | """ 20 | :param query: the query we are looking at 21 | """ 22 | self._query = query 23 | self._query_hash = hashlib.sha224(query.encode()).hexdigest() 24 | # what the path should be to the cache file 25 | self._path = f'{CACHE_DIRECTORY}/{self._query_hash.upper()}.parquet' 26 | 27 | def is_query_cached(self) -> bool: 28 | """ 29 | checks to see if the query is cached 30 | """ 31 | return os.path.isfile(self._path) 32 | 33 | def cache_query(self, results: pd.DataFrame): 34 | """ 35 | caches the given results 36 | If the index is not a range index then the index will be written as a column, not an index 37 | """ 38 | if not isinstance(results.index, pd.RangeIndex): 39 | results = results.reset_index() 40 | 41 | # if any columns are a period type change them to timestamp 42 | con =
SQLConnection(':memory:') 43 | con.execute(f"COPY results TO '{self._path}' (FORMAT 'parquet')") 44 | con.close() 45 | print(f'Cached Query') 46 | 47 | def get_cached_query_path(self) -> str: 48 | """ 49 | gets the path to the cached query will rase ValueError if the query is not cached 50 | """ 51 | if self.is_query_cached(): 52 | return self._path 53 | raise ValueError('Query is not cached!') 54 | 55 | def get_cached_query_df(self) -> pd.DataFrame: 56 | """ 57 | gets the DataFrame contents of the cached query will rase ValueError if the query is not cached 58 | The index will be a default range index 59 | """ 60 | path = f"'{self.get_cached_query_path()}'" 61 | 62 | con = SQLConnection(':memory:') 63 | cached_results = con.execute(f"SELECT * FROM {path}").df() 64 | con.close() 65 | 66 | file_creation = datetime.fromtimestamp(os.stat(self._path).st_birthtime) 67 | file_age = (datetime.now() - file_creation).days 68 | 69 | print(f'Using {file_age} Day Old Cache') 70 | return cached_results 71 | 72 | 73 | def clear_cache(): 74 | files = glob.glob(f'{CACHE_DIRECTORY}/*.parquet') 75 | for f in files: 76 | os.remove(f) 77 | print('Cleared Cache') 78 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/db_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 4 | 5 | 6 | def table_info(table_name: str, con=None) -> pd.DataFrame: 7 | """ 8 | runs the table info PRAGMA query 9 | """ 10 | con = con if con else SQLConnection(close_key='table_info') 11 | info_df = con.execute(f"PRAGMA table_info('{table_name}');").fetchdf() 12 | con.close_with_key('table_info') 13 | return info_df 14 | 15 | 16 | def db_tables(con=None) -> pd.DataFrame: 17 | """ 18 | runs PRAGMA query to get all table names 19 | """ 20 | con = con if con else SQLConnection(close_key='db_tables') 21 | info_df = con.execute(f"PRAGMA show_tables;").fetchdf() 22 | con.close_with_key('db_tables') 23 | return info_df 24 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/universe.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path 3 | 4 | import pandas as pd 5 | 6 | from typing import Union 7 | 8 | from ntiles.toolbox.db.settings import ADD_ALL_LINKS_TO_PERMNO, ETF_UNI_DIRECTORY, BUILT_UNI_DIRECTORY 9 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 10 | 11 | # this allows compatibility with python 3.6 12 | try: 13 | import pandas_market_calendars as mcal 14 | except ImportError as e: 15 | pass 16 | 17 | MAP_ETF_SYMBOL_ID = {'SPY': 1021980, 18 | 'IWM': 1025818, 19 | 'IWV': 1025817} 20 | 21 | 22 | def dispatch_universe_path(uni_name, add_quotes=False, sql_con=None) -> str: 23 | """ 24 | gets the path to the given universe. 25 | Can pass any universe and will figure out the correct path to the universe. 26 | Assumes that the universe name is valid. 27 | 28 | :param uni_name: the name of the universe 29 | :param sql_con: a connection to the database 30 | :param add_quotes: should we add single quotes around the path? 
31 | :return: path to the universe 32 | """ 33 | # user passes a etf to use as universe 34 | if 'ETF' in uni_name: 35 | out = ETFUniverse(con=sql_con).get_universe_path_parse(uni_name) 36 | 37 | # user passes a built universe 38 | else: 39 | out = BuiltUniverse().get_universe_path(uni_name) 40 | 41 | if add_quotes: 42 | return f"'{out}'" 43 | 44 | return out 45 | 46 | 47 | class ETFUniverse: 48 | """ 49 | CLass to build universes from etf holdings. 50 | Will cache the universes in parquet files to be read into duckdb instances 51 | """ 52 | 53 | def __init__(self, con: SQLConnection = None): 54 | """ 55 | If the user would like to class this class mutable times then they must pass a connection to con 56 | :param con: A connection to the database, if not passed then will open a new connection. 57 | """ 58 | self._con = con if con else SQLConnection(close_key=self.__class__.__name__) 59 | 60 | def get_universe_df(self, ticker: str = None, crsp_portno: int = None, start_date: str = '2000', 61 | end_date: str = '2023') -> Union[pd.DataFrame, str]: 62 | """ 63 | gets the universe constitutes for an etf 64 | either ticker or crsp_portno must be passed but not both 65 | :param ticker: the ticker of the etf we are getting holdings for 66 | :param crsp_portno: the crsp_portno of the etf we are getting holdings for 67 | :param start_date: the date to start getting holdings for YYYY-MM-DD, no effect when caching 68 | :param end_date: the date to stop getting holdings YYYY-MM-DD, no effect when caching 69 | :return: pd.Dataframe index: int_range; columns: date, permno; 70 | """ 71 | 72 | self._input_checks(ticker=ticker, crsp_portno=crsp_portno) 73 | 74 | asset_id = self._get_crsp_portno(ticker=ticker, crsp_portno=crsp_portno) 75 | 76 | if not self._is_cached_etf(crsp_portno=asset_id): 77 | etf_uni = self._cache_etf(crsp_portno=asset_id) 78 | 79 | else: 80 | etf_uni = self._get_cached_etf(crsp_portno=asset_id) 81 | 82 | return etf_uni[(etf_uni['date'] > start_date) & (etf_uni['date'] < end_date)] 83 | 84 | def get_universe_path(self, ticker: str = None, crsp_portno: int = None): 85 | """ 86 | gets the SQL code to read cached universe constitutes for an etf 87 | either ticker or crsp_portno must be passed but not both 88 | if etf isn't cached then will cache the etf 89 | :param ticker: the ticker of the etf we are getting holdings for 90 | :param crsp_portno: the crsp_portno of the etf we are getting holdings for 91 | :return: SQL code to read the cached parquet 92 | """ 93 | self._input_checks(ticker=ticker, crsp_portno=crsp_portno) 94 | 95 | asset_id = self._get_crsp_portno(ticker=ticker, crsp_portno=crsp_portno) 96 | 97 | if not self._is_cached_etf(crsp_portno=asset_id): 98 | self._cache_etf(crsp_portno=asset_id) 99 | 100 | return self._get_cached_path(asset_id) 101 | 102 | def get_universe_path_parse(self, to_parse): 103 | """ 104 | wrapper that parses a string for get_universe_path, can tell if user passed a symbol or crsp_portno 105 | format: 106 | ticker: 107 | 'ETF_SPY' 108 | crsp_portno: 109 | 'ETF_5648362' 110 | """ 111 | to_parse = to_parse.upper() 112 | param_dict = self._parse_etf_uni_string(to_parse, param_dict={}) 113 | return self.get_universe_path(**param_dict) 114 | 115 | def get_universe_df_parse(self, to_parse, start_date: str = '2000', end_date: str = '2023'): 116 | """ 117 | wrapper that parses a string for get_universe_path, can tell if user passed a symbol or crsp_portno 118 | format: 119 | ticker: 120 | 'ETF_SPY' 121 | crsp_portno: 122 | 'ETF_5648362' 123 | """ 124 | param_dict 
= {'start_date': start_date, 'end_date': end_date} 125 | param_dict = self._parse_etf_uni_string(to_parse, param_dict=param_dict) 126 | return self.get_universe_df(**param_dict) 127 | 128 | def _cache_etf(self, crsp_portno) -> pd.DataFrame: 129 | """ 130 | gets and caches an etf holdings query 131 | will cache etf in temp directory of the computer 132 | :return: pd.Dataframe index: int_range; columns: date, permno; 133 | """ 134 | print('Caching ETF Holdings') 135 | 136 | sql_for_holdings = f""" 137 | SELECT DISTINCT date, permno 138 | FROM crsp.portfolio_holdings 139 | WHERE crsp_portno = {crsp_portno} AND 140 | permno IS NOT NULL 141 | """ 142 | raw_etf_holdings = self._con.execute(sql_for_holdings).fetchdf() 143 | self._con.close_with_key(close_key=self.__class__.__name__) 144 | 145 | df_of_holdings = raw_etf_holdings.set_index('date').groupby('date')['permno'].apply( 146 | lambda grp: list(grp.value_counts().index)) 147 | 148 | end_date = pd.Timestamp.now().date().strftime('%Y-%m-%d') 149 | start_date = df_of_holdings.index.min() 150 | 151 | full_cal = pd.date_range(start=start_date, end=end_date, freq='D').tz_localize(None) 152 | 153 | trading_cal = mcal.get_calendar( 154 | 'NYSE').valid_days(start_date=start_date, end_date=end_date).tz_localize( 155 | None) 156 | 157 | universe = df_of_holdings.reindex(full_cal.tolist()).ffill().reindex(trading_cal.tolist()).reset_index() 158 | 159 | relational_format = [(row[0], permno) for row in universe.values for permno in row[1]] 160 | uni_df = pd.DataFrame(relational_format, columns=['date', 'permno']) 161 | uni_df = self._link_to_ids(uni_df) 162 | 163 | self._cache_helper(uni_df=uni_df, crsp_portno=crsp_portno) 164 | 165 | return uni_df 166 | 167 | def _link_to_ids(self, uni_df: pd.DataFrame) -> pd.DataFrame: 168 | """ 169 | join cstat and ibes links to current universe df 170 | """ 171 | columns = ', '.join(['date', 'uni.permno', 'lpermco as permco', 'gvkey', 'liid as iid', 'ticker', 'cusip', 172 | "CASE WHEN gvkey NOT NULL THEN CONCAT(gvkey, '_', liid) ELSE NULL END as id"]) 173 | from_start = " uni_df as uni " 174 | 175 | sql_code = ADD_ALL_LINKS_TO_PERMNO.replace('--columns', columns).replace('--from', from_start) 176 | 177 | return self._con.con.execute(sql_code).fetchdf() 178 | 179 | def _get_crsp_portno(self, ticker, crsp_portno) -> int: 180 | """ 181 | if ticker is not none then will map the ticker to a crsp_portno 182 | :return: crsp_portno passed or the crsp_portno mapped to the symbol 183 | """ 184 | if crsp_portno: 185 | return crsp_portno 186 | 187 | mapped_id = self._con.execute(f"""SELECT distinct crsp_portno 188 | FROM crsp.fund_summary 189 | WHERE ticker = '{ticker}' AND 190 | crsp_portno IS NOT NULL""").fetchall() 191 | 192 | if len(mapped_id) == 0: 193 | self._con.close_with_key(close_key=self.__class__.__name__) 194 | raise ValueError(f"Ticker '{ticker}' is not valid cant map to crsp_portno") 195 | 196 | if len({x[0] for x in mapped_id}) > 1: 197 | # getting metadata of the portno's that mtched 198 | mapped_funds = self._con.execute(f"""SELECT DISTINCT crsp_portno, fund_name, m_fund, et_flag 199 | FROM crsp.fund_summary 200 | WHERE ticker = '{ticker}' AND 201 | crsp_portno IS NOT NULL""").fetchdf() 202 | self._con.close_with_key(close_key=self.__class__.__name__) 203 | 204 | raise ValueError(f"Ticker '{ticker}' mapped to {len(mapped_id)} crsp_portno's {mapped_id}. 
" 205 | f"Please specify the crsp_portno to build this etf's history\n" + 206 | mapped_funds.to_string(index=False)) 207 | 208 | return int(mapped_id[0][0]) 209 | 210 | @staticmethod 211 | def _input_checks(ticker, crsp_portno) -> None: 212 | """ 213 | input check for ticker and crsp_portno 214 | """ 215 | if ticker is None and crsp_portno is None: 216 | raise ValueError('Must pass a ticker or crsp_portno') 217 | 218 | if ticker is not None and crsp_portno is not None: 219 | raise ValueError('Must pass a ticker or crsp_portno, not both!') 220 | 221 | def _is_cached_etf(self, crsp_portno) -> bool: 222 | """ 223 | is the etf cached? 224 | """ 225 | return os.path.isfile(self._get_cached_path(crsp_portno)) 226 | 227 | def _cache_helper(self, uni_df, crsp_portno) -> None: 228 | """ 229 | Writes a parquet file to the user specified temp directory on a computer 230 | """ 231 | path = self._get_cached_path(crsp_portno) 232 | uni_df.to_parquet(path) 233 | print(f'Cached {crsp_portno} in {path}') 234 | 235 | def _get_cached_etf(self, crsp_portno) -> pd.DataFrame: 236 | """ 237 | returns a dataframe of the cached universe 238 | """ 239 | return pd.read_parquet(self._get_cached_path(crsp_portno)) 240 | 241 | @staticmethod 242 | def _get_cached_path(crsp_portno): 243 | """ 244 | :return: path to the cached file 245 | """ 246 | return f'{ETF_UNI_DIRECTORY}/etf_uni_{crsp_portno}.parquet' 247 | 248 | @staticmethod 249 | def _parse_etf_uni_string(to_parse: str, param_dict: dict) -> dict: 250 | """ 251 | adds 'ticker' or 'crsp_portno' a parameter dict 252 | :params to_parse: 253 | :params param_dict: dict which we will add 'ticker' or 'crsp_portno' to 254 | :return: param_dict with 'ticker' or 'crsp_portno' added 255 | """ 256 | to_parse = to_parse.upper() 257 | if 'ETF_' in to_parse: 258 | id_etf = to_parse.split('_')[-1] 259 | if id_etf.isdigit(): 260 | param_dict['crsp_portno'] = to_parse.split('_')[-1] 261 | else: 262 | param_dict['ticker'] = to_parse.split('_')[-1] 263 | else: 264 | raise ValueError(f"Can't parse {to_parse}") 265 | 266 | return param_dict 267 | 268 | 269 | class BuiltUniverse: 270 | """ 271 | Gets and validates path to a built universe 272 | """ 273 | 274 | def get_universe_path(self, uni_name) -> str: 275 | """ 276 | gets the path to the parquet file of the given universe 277 | :param uni_name: the name of the universe ex: CRSP_US_1000 278 | :return: the full path to the given universe 279 | :raises: ValueError if given uni_name is invalid 280 | """ 281 | self._ensure_universe_exists(uni_name) 282 | return self._get_path(uni_name) 283 | 284 | def _ensure_universe_exists(self, uni_name): 285 | """ 286 | checks to see if the universe exisis 287 | """ 288 | if not os.path.isfile(self._get_path(uni_name)): 289 | raise ValueError(f'Universe {uni_name} does not exist!') 290 | 291 | @staticmethod 292 | def _get_path(uni_name): 293 | """ 294 | creates what the path should be to the universe file 295 | """ 296 | return f'{BUILT_UNI_DIRECTORY}/{uni_name.upper()}.parquet' 297 | 298 | 299 | def clear_etf_universes(): 300 | """ 301 | Clears all parquet files in the ETF_UNI_DIRECTORY path 302 | """ 303 | files = glob.glob(f'{ETF_UNI_DIRECTORY}/*.parquet') 304 | for f in files: 305 | os.remove(f) 306 | print('Cleared ETF Universes') 307 | 308 | 309 | def clear_built_universes(): 310 | """ 311 | Clears all parquet files in the BUILT_UNI_DIRECTORY path 312 | """ 313 | files = glob.glob(f'{BUILT_UNI_DIRECTORY}/*.parquet') 314 | for f in files: 315 | os.remove(f) 316 | print('Cleared Built Universes') 
317 | 318 | 319 | if __name__ == '__main__': 320 | print(ETFUniverse().get_universe_df_parse(to_parse='ETF_1021980', start_date='2017')) 321 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define Global Settings 3 | """ 4 | 5 | DB_CONNECTION_STRING = '/Users/alex/Desktop/DB/wrds.duckdb' # the directory to the sql database 6 | CACHE_DIRECTORY = '/tmp' # the directory to cache files, QueryConstructor gets cached here 7 | ETF_UNI_DIRECTORY = '/tmp' # '/Users/alex/Desktop/DB/universes/etf' # the directory to save ETF Universes 8 | BUILT_UNI_DIRECTORY = '/Users/alex/Desktop/DB/universes/built' # directory to save custom-built universes 9 | 10 | DB_ADJUSTOR_FIELDS = { 11 | 'cstat.sd': [ 12 | { 13 | 'adjustor': 'ajexdi', 14 | 'fields_to_adjust': ['prccd', 'prcod', 'prchd', 'prcld', 'eps'], 15 | 'operation': '/' 16 | }, 17 | { 18 | 'adjustor': 'ajexdi', 19 | 'fields_to_adjust': ['cshoc', 'cshtrd'], 20 | 'operation': '*' 21 | } 22 | ], 23 | 'crsp.sd': [ 24 | { 25 | 'adjustor': 'cfacpr', 26 | 'fields_to_adjust': ['prc', 'openprc', 'askhi', 'bidlo', 'bid', 'ask'], 27 | 'operation': '/', 28 | 'function': 'ABS' 29 | }, 30 | { 31 | 'adjustor': 'cfacshr', 32 | 'fields_to_adjust': ['vol', 'shrout'], 33 | 'operation': '*' 34 | } 35 | 36 | ], 37 | 'crsp.sm': [ 38 | { 39 | 'adjustor': 'cfacpr', 40 | 'fields_to_adjust': ['prc', 'openprc', 'askhi', 'bidlo', 'bid', 'ask', 'altprc'], 41 | 'operation': '/', 42 | 'function': 'ABS' 43 | }, 44 | { 45 | 'adjustor': 'cfacshr', 46 | 'fields_to_adjust': ['vol', 'shrout'], 47 | 'operation': '*' 48 | } 49 | 50 | ], 51 | 'cstat.sm': [ 52 | { 53 | 'adjustor': 'ajexm', 54 | 'fields_to_adjust': ['prccm', 'prchm', 'prclm'], 55 | 'operation': '/' 56 | }, 57 | { 58 | 'adjustor': 'ajexm', 59 | 'fields_to_adjust': ['cshom', 'cshtrm'], 60 | 'operation': '*' 61 | } 62 | ], 63 | 'cstat.funda': [ 64 | {'fields_to_adjust': []} 65 | ], 66 | 67 | 'wrds.firm_ratios': [ 68 | {'fields_to_adjust': []} 69 | ], 70 | 'ibes.summary_price_target': [ 71 | {'fields_to_adjust': []} 72 | ] 73 | } 74 | 75 | DB_ADJUSTOR_FIELDS['sd'] = DB_ADJUSTOR_FIELDS['cstat.sd'] 76 | DB_ADJUSTOR_FIELDS['main.sd'] = DB_ADJUSTOR_FIELDS['cstat.sd'] 77 | 78 | # sql code to link permno to cstat and ibes 79 | ADD_ALL_LINKS_TO_PERMNO = """ 80 | ( 81 | SELECT --columns 82 | FROM 83 | --from LEFT JOIN link.crsp_cstat_link AS ccm ON (uni.permno = ccm.lpermno AND uni.date >= ccm.linkdt 84 | AND uni.date <= ccm.linkenddt AND (ccm.linktype = 'LU' OR ccm.linktype = 'LC')) 85 | LEFT JOIN link.crsp_ibes_link AS crib ON (uni.permno = crib.permno AND uni.date >= crib.sdate 86 | AND uni.date <= crib.edate) 87 | ) 88 | """ 89 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/write/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/db/write/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/db/write/create_tables.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List 3 | 4 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 5 | 6 | logging.basicConfig(format='%(message)s ::: %(asctime)s', datefmt='%I:%M:%S %p', level=logging.INFO) 7 
| 8 | 9 | class IngestDataBase: 10 | def __init__(self, connection_string: str = None): 11 | """ 12 | :param connection_string: optional string connection to the database 13 | if none is given then will fall back onto path in settings.py 14 | """ 15 | self._sql_api = SQLConnection(connection_string=connection_string, read_only=False) 16 | 17 | def ingest(self, to_insert: List[Dict[str, str]], overwrite: bool = False, rows_to_interpret: int = 5_000, 18 | close: bool = True) -> None: 19 | """ 20 | will ingest the files specified by to_insert 21 | :param to_insert: A dictionary containing the schema, tablename and file path for a 22 | table that should be inserted into the db 23 | [{ 24 | 'schema': 'sch1', 25 | 'table': 'tbl1', 26 | 'file_path': 'full/path/to/file', 27 | 'custom': "UPDATE sch1.tbl1 SET LINKENDDT=99991231 WHERE LINKENDDT = 'E';", 28 | 'rename': {'datadate': 'date'}, 29 | 'alter_type': {'gsector': 'VARCHAR', 'date': ['timestamp', '%Y%m%d']}, 30 | 'index': [{'name': 'ixd2', 'column': 'col1'}, {'name': 'idx2', 'column': 'col2'}] 31 | 'where': "date > '2000'" 32 | 'from': "AS data JOIN crsp.crsp_cstat_link as link on data.permno = link.lpermno" 33 | 'rows_to_interpret': 500_000 34 | }] 35 | :param overwrite: should the tables be overwritten if they exist? 36 | :param rows_to_interpret: how many rows should we read to determine the types 37 | :param close: should we close the sql connection after everything is inserted? 38 | :return: None 39 | """ 40 | try: 41 | for tbl_to_create in to_insert: 42 | logging.info(f'Inserting {tbl_to_create["schema"]}.{tbl_to_create["table"]}') 43 | self._create_schema(tbl_to_create) # creates schema 44 | self._drop(tbl_to_create, overwrite) # drops tbl if user wants to 45 | self._create_tbl(tbl_to_create, rows_to_interpret) # writing table 46 | self._custom_sql(tbl_to_create) # letting the user run any sql code 47 | self._rename_columns(tbl_to_create) # renaming columns 48 | self._alter_types(tbl_to_create) # changing types of data 49 | self._to_lowercase(tbl_to_create) # making all column names lowercase 50 | self._create_index(tbl_to_create) # making indexes 51 | 52 | except Exception as e: 53 | self._sql_api.close() 54 | raise e 55 | 56 | if close: 57 | self._sql_api.close() 58 | logging.info('Closed SQL Connection') 59 | 60 | def _create_schema(self, tbl_to_create) -> None: 61 | """ 62 | :param tbl_to_create: dict defining the table we want to create 63 | :return: None 64 | """ 65 | sql_query = f"""CREATE SCHEMA IF NOT EXISTS {tbl_to_create['schema']};""" 66 | self._sql_api.execute(sql_query) 67 | 68 | def _drop(self, tbl_to_create, overwrite) -> None: 69 | """ 70 | Drpos a table if it exists and the user wants to drop the table 71 | :param tbl_to_create: dict defining the table we want to drop 72 | :param overwrite: should we drop the table? 
73 | :return: None 74 | """ 75 | if overwrite: 76 | tbl_name = self._get_table_name(tbl_to_create) 77 | sql_query = f"""DROP TABLE IF EXISTS {tbl_name};""" 78 | self._sql_api.execute(sql_query) 79 | 80 | def _create_tbl(self, tbl_to_create, rows_to_interpret) -> None: 81 | """ 82 | inserts a table into the specified schema and table name 83 | no adjustments are done to the table or types declared 84 | :param tbl_to_create: dict defining the table we want to create 85 | :return: None 86 | """ 87 | tbl_name = self._get_table_name(tbl_to_create) 88 | 89 | rows_to_interpret = tbl_to_create[ 90 | 'rows_to_interpret'] if 'rows_to_interpret' in tbl_to_create else rows_to_interpret 91 | 92 | where_clause = f"WHERE {tbl_to_create.get('where')}" if tbl_to_create.get('where') else '' 93 | from_clause = tbl_to_create.get('from') if tbl_to_create.get('from') else '' 94 | 95 | sql_query = f""" 96 | CREATE TABLE {tbl_name} AS 97 | SELECT * 98 | FROM read_csv_auto('{tbl_to_create['file_path']}', SAMPLE_SIZE={rows_to_interpret}) {from_clause} 99 | {where_clause}""" 100 | 101 | self._sql_api.execute(sql_query) 102 | 103 | logging.info(f'\tCreated table {tbl_to_create["schema"]}.{tbl_to_create["table"]}') 104 | 105 | def _custom_sql(self, tbl_to_create): 106 | """ 107 | lets the user run any sql code they want 108 | :param tbl_to_create: dict defining the table we want to create 109 | :return: None 110 | """ 111 | 112 | if 'custom' not in tbl_to_create: 113 | return 114 | 115 | self._sql_api.execute(tbl_to_create['custom']) 116 | 117 | logging.info('\tRan custom sql code') 118 | 119 | def _rename_columns(self, tbl_to_create) -> None: 120 | """ 121 | renames the columns specified by the user 122 | :param tbl_to_create: dict defining the table we want to create 123 | :return: None 124 | """ 125 | # if there are no columns to rename then return 126 | if 'rename' not in tbl_to_create: 127 | return 128 | 129 | tbl_name = self._get_table_name(tbl_to_create) 130 | 131 | for col_to_rename in tbl_to_create['rename']: 132 | sql_query = f"""ALTER TABLE {tbl_name} RENAME COLUMN 133 | {col_to_rename} TO {tbl_to_create['rename'][col_to_rename]};""" 134 | self._sql_api.execute(sql_query) 135 | logging.info(f'\tRenamed {col_to_rename} -> {tbl_to_create["rename"][col_to_rename]}') 136 | 137 | def _alter_types(self, tbl_to_create) -> None: 138 | """ 139 | alters the types of columns according to the user 140 | :param tbl_to_create: dict defining the table we want to create 141 | :return: None 142 | """ 143 | # if there are no columns to alter types then return 144 | if 'alter_type' not in tbl_to_create: 145 | return 146 | 147 | tbl_name = self._get_table_name(tbl_to_create) 148 | 149 | for col_to_alter in tbl_to_create['alter_type']: 150 | # should we do a timestamp parse? 
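            # as shown in the ingest() docstring, an alter_type entry like
            # {'date': ['timestamp', '%Y%m%d']} is converted to a TIMESTAMP via strptime,
            # while a plain string entry like {'gsector': 'VARCHAR'} falls through to the
            # generic ALTER ... TYPE branch below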
151 | if tbl_to_create['alter_type'][col_to_alter][0] == 'timestamp': 152 | date_format = tbl_to_create['alter_type'][col_to_alter][1] 153 | sql_query = f"""ALTER TABLE {tbl_name} ALTER {col_to_alter} TYPE varchar; 154 | ALTER TABLE {tbl_name} ALTER {col_to_alter} SET DATA TYPE 155 | TIMESTAMP USING strptime({col_to_alter}, '{date_format}')""" 156 | 157 | else: 158 | sql_query = f"""ALTER TABLE {tbl_name} ALTER {col_to_alter} TYPE 159 | {tbl_to_create['alter_type'][col_to_alter]};""" 160 | 161 | self._sql_api.execute(sql_query) 162 | logging.info(f'\tAltered column {col_to_alter}') 163 | 164 | def _create_index(self, tbl_to_create) -> None: 165 | """ 166 | creates indexes for a table 167 | :param tbl_to_create: dict defining the table we want to create 168 | :return: None 169 | """ 170 | 171 | # if there are no columns to index then return 172 | if 'index' not in tbl_to_create: 173 | return 174 | 175 | tbl_name = self._get_table_name(tbl_to_create) 176 | 177 | for idx in tbl_to_create['index']: 178 | sql_query = f"""CREATE INDEX {idx['name']} ON {tbl_name} ({idx['column']});""" 179 | self._sql_api.execute(sql_query) 180 | 181 | logging.info(f'\tCreated index {idx["name"]} using {idx["column"]}') 182 | 183 | def _to_lowercase(self, tbl_to_create) -> None: 184 | """ 185 | turns all columns in a table to lowercase 186 | :param tbl_to_create: dict defining the table we want to create 187 | :return: None 188 | """ 189 | tbl_name = self._get_table_name(tbl_to_create) 190 | 191 | cols = self._sql_api.execute(f'PRAGMA table_info({tbl_name})').fetchdf().name 192 | 193 | for col in cols: 194 | self._sql_api.execute(f"""ALTER TABLE {tbl_name} RENAME COLUMN "{col}" TO "{col.lower()}";""") 195 | 196 | logging.info('\tSuccessfully made all columns lowercase') 197 | 198 | @staticmethod 199 | def _get_table_name(tbl_to_create) -> str: 200 | """ 201 | gets the table name for a tbl_to_create 202 | :param tbl_to_create: dict defining the table we want to create 203 | :return: table name 204 | """ 205 | return f"{tbl_to_create['schema']}.{tbl_to_create['table']}" 206 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/write/make_universes.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | 5 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 6 | from ntiles.toolbox.db.settings import ADD_ALL_LINKS_TO_PERMNO, BUILT_UNI_DIRECTORY 7 | 8 | # this allows compatibility with python 3.6 9 | try: 10 | import pandas_market_calendars as mcal 11 | except ImportError as e: 12 | pass 13 | 14 | logging.basicConfig(format='%(message)s ::: %(asctime)s', datefmt='%I:%M:%S %p', level=logging.INFO) 15 | 16 | 17 | def compustat_us_universe(max_rank: int, min_rank: int = 1, start_date: str = '2000', 18 | rebuild_mc_ranking: bool = False) -> None: 19 | """ 20 | generates US daily indexes for compustat daily security file 21 | only will use the primary share for a company 22 | will generate a table called universe.US_min_rank_max_rank, ex US_0_3000 23 | :param max_rank: the max market cap rank for a company to be in the universe 24 | :param min_rank: the min market cap rank for a company in the universe 25 | :param start_date: the minimum date for creating the universe 26 | :param set_indexes: Should we index the universe by 27 | :return: None 28 | """ 29 | 30 | table_name = f'CSTAT_US{"" if min_rank == 1 else "_" + str(min_rank)}_{max_rank}' 31 | write_path = 
f'{BUILT_UNI_DIRECTORY}/{table_name}.parquet' 32 | 33 | if rebuild_mc_ranking: 34 | _make_cstat_us_universe_base_table() 35 | else: 36 | logging.info(f'Using Prior Build of universe.cstat_mc_rank') 37 | 38 | logging.info(f'Creating table {table_name}') 39 | 40 | sql_make_universe_table = f""" 41 | COPY 42 | ( 43 | SELECT date, gvkey, iid, id, ttm_min_prccd, ttm_mc, ttm_mc_rank 44 | FROM universe.temp_rank_cstat_mc 45 | WHERE ttm_mc_rank >= {min_rank} AND 46 | ttm_mc_rank <= {max_rank} AND 47 | date > '{start_date}' 48 | ) 49 | TO '{BUILT_UNI_DIRECTORY}/{table_name}.parquet' (FORMAT 'parquet') 50 | """ 51 | # making the db connection 52 | con = SQLConnection(read_only=False).con 53 | 54 | con.execute(sql_make_universe_table) 55 | con.close() 56 | 57 | logging.info(f'Wrote Table {table_name} To {write_path}') 58 | 59 | 60 | def crsp_us_universe(max_rank: int, min_rank: int = 1, start_date: str = '1980', 61 | rebuild_mc_ranking: bool = False, link: bool = True) -> None: 62 | """ 63 | Generates a universe of the top N stocks domiciled in the US by market cap 64 | Will only use companies primary share 65 | :param max_rank: the max market cap rank for a company to be in the universe 66 | :param min_rank: the min market cap rank for a company in the universe 67 | :param start_date: the minimum date for creating the universe 68 | :param set_indexes: Should we index the universe by 69 | :param rebuild_mc_ranking: should we rebuild the ranking table universe.crsp_mc_rank? 70 | :param link: should we link to cstat and ibes 71 | :return: None 72 | """ 73 | # getting the trading calendar so we dont have bad dates 74 | trading_cal = mcal.get_calendar( 75 | 'NYSE').valid_days(start_date=start_date, end_date=pd.to_datetime('today')).to_series().to_frame('trading_days') 76 | 77 | table_name = f'CRSP_US{"" if min_rank == 1 else "_" + str(min_rank)}_{max_rank}' 78 | write_path = f'{BUILT_UNI_DIRECTORY}/{table_name}.parquet' 79 | 80 | if rebuild_mc_ranking: 81 | _make_crsp_us_universe_base_table() 82 | else: 83 | logging.info(f'Using Prior Build of universe.crsp_mc_rank') 84 | 85 | logging.info(f'Creating table {table_name}') 86 | 87 | sql_make_universe_table = f""" 88 | ( 89 | SELECT date, permno, permco, ttm_min_prc, ttm_mc, ttm_mc_rank 90 | FROM universe.temp_rank_crsp_mc 91 | WHERE ttm_mc_rank >= {min_rank} AND 92 | ttm_mc_rank <= {max_rank} AND 93 | date > '{start_date}' 94 | ) as uni 95 | """ 96 | 97 | # will add linking tables 98 | if link: 99 | columns = ', '.join(['uni.*', 'gvkey', 'liid as iid, ''ticker', 'cusip', 100 | "CASE WHEN gvkey NOT NULL THEN CONCAT(gvkey, '_', liid) ELSE NULL END as id"]) 101 | sql_make_universe_table = '(' + (ADD_ALL_LINKS_TO_PERMNO 102 | .replace('--columns', columns) 103 | .replace('--from', sql_make_universe_table)) + ')' 104 | 105 | sql_make_universe_table = f"""COPY 106 | {sql_make_universe_table} 107 | TO '{write_path}' (FORMAT 'parquet')""" 108 | 109 | # making the db connection 110 | con = SQLConnection(read_only=False).con 111 | con.execute(sql_make_universe_table) 112 | con.close() 113 | 114 | logging.info(f'Wrote Table {table_name} To {write_path}') 115 | 116 | 117 | def _make_cstat_us_universe_base_table(): 118 | """ 119 | Makes the base table with market cap ranks for each asset. 
Should be deleted after its done being used 120 | """ 121 | table_name = 'universe.temp_rank_cstat_mc' 122 | logging.info(f'Creating Ranking Table {table_name}') 123 | 124 | # getting the trading calendar so we dont have bad dates 125 | trading_cal = mcal.get_calendar( 126 | 'NYSE').valid_days(start_date='1980', end_date=pd.to_datetime('today')).to_series().to_frame('trading_days') 127 | 128 | sql_ensure_schema_open = f'CREATE SCHEMA IF NOT EXISTS universe;' 129 | sql_ensure_table_open = f'DROP TABLE IF EXISTS {table_name};' 130 | sql_make_rank_universe_table = f""" 131 | CREATE TABLE {table_name} 132 | AS 133 | ( 134 | SELECT date, gvkey, iid, id, ttm_min_prccd, ttm_mc, 135 | row_number() OVER (PARTITION BY (date) ORDER BY ttm_mc desc) AS ttm_mc_rank 136 | FROM 137 | ( 138 | SELECT * 139 | FROM 140 | ( 141 | SELECT date, gvkey, iid, id, 142 | AVG(ABS(prccd) * cshoc) OVER ( 143 | PARTITION BY id ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_mc, 144 | MIN(ABS(prccd)) OVER ( 145 | PARTITION BY id ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_min_prccd 146 | FROM 147 | ( 148 | SELECT date, gvkey, iid, id, priusa, fic, tpci, curcdd, 149 | lag(prccd, 1, NULL) OVER lagDays AS prccd, 150 | lag(cshoc, 1, NULL) OVER lagDays AS cshoc 151 | FROM main.sd AS sd RIGHT JOIN trading_cal cal ON sd.date = cal.trading_days 152 | WINDOW lagDays AS (PARTITION BY id ORDER BY date) 153 | ) 154 | WHERE fic = 'USA' AND 155 | tpci = '0' AND 156 | curcdd = 'USD' AND 157 | priusa = (CASE WHEN regexp_full_match(iid, '^[0-9]*$') THEN CAST(iid AS INTEGER) end) 158 | ) 159 | WHERE ttm_mc > 0 AND 160 | ttm_min_prccd > 3 161 | ) 162 | ) 163 | ORDER BY date 164 | """ 165 | 166 | # making the db connection 167 | con = SQLConnection(read_only=False).con 168 | con.execute(sql_ensure_schema_open) 169 | con.execute(sql_ensure_table_open) 170 | con.execute(sql_make_rank_universe_table) 171 | con.close() 172 | 173 | logging.info(f'Finished Ranking Table {table_name}') 174 | 175 | 176 | def _make_crsp_us_universe_base_table(): 177 | """ 178 | Makes the base table with market cap ranks for each asset. 
Should be deleted after its done being used 179 | """ 180 | table_name = 'universe.temp_rank_crsp_mc' 181 | logging.info(f'Creating Ranking Table {table_name}') 182 | 183 | trading_cal = mcal.get_calendar( 184 | 'NYSE').valid_days(start_date='1925', end_date=pd.to_datetime('today')).to_series().to_frame('trading_days') 185 | 186 | sql_ensure_schema_open = f'CREATE SCHEMA IF NOT EXISTS universe;' 187 | sql_ensure_table_open = f'DROP TABLE IF EXISTS {table_name};' 188 | 189 | sql_make_rank_universe_table = f""" 190 | CREATE TABLE {table_name} 191 | AS 192 | SELECT date, permno, permco, ttm_min_prc, ttm_mc, 193 | row_number() OVER (PARTITION BY (date) ORDER BY ttm_mc desc) AS ttm_mc_rank 194 | FROM 195 | ( 196 | SELECT date, permno, permco, ttm_min_prc, ttm_mc 197 | FROM 198 | ( 199 | SELECT date, permno, permco, shrcd, 200 | AVG(ABS(prc) * shrout) OVER ( 201 | PARTITION BY permno ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_mc, 202 | MIN(ABS(prc)) OVER ( 203 | PARTITION BY permno ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_min_prc 204 | FROM 205 | ( 206 | SELECT date, permno, permco, shrcd, 207 | lag(prc, 1, NULL) OVER lagDays AS prc, 208 | lag(shrout, 1, NULL) OVER lagDays AS shrout 209 | FROM 210 | ( 211 | SELECT distinct date, permno, permco, shrcd, prc, shrout 212 | FROM crsp.sd as sd RIGHT JOIN trading_cal cal on sd.date = cal.trading_days 213 | ) 214 | WINDOW lagDays AS ( 215 | PARTITION BY permno 216 | ORDER BY date 217 | ) 218 | ) 219 | WHERE shrcd = 11 220 | ) 221 | WHERE ttm_mc IS NOT NULL AND 222 | ttm_min_prc > 3 223 | ) 224 | ORDER BY date 225 | """ 226 | 227 | # making the db connection 228 | con = SQLConnection(read_only=False).con 229 | con.execute(sql_ensure_schema_open) 230 | con.execute(sql_ensure_table_open) 231 | con.execute(sql_make_rank_universe_table) 232 | con.close() 233 | 234 | logging.info(f'Finished Ranking Table {table_name}') 235 | 236 | 237 | def clear_master_ranking_table(): 238 | """ 239 | Wipes the ranking tables made by _make_crsp_us_universe_base_table and _make_cstat_us_universe_base_table 240 | """ 241 | logging.info('Deleting Ranking Tables') 242 | 243 | con = SQLConnection(read_only=False) 244 | con.execute("DROP SCHEMA universe CASCADE;") 245 | con.close() 246 | 247 | logging.info('Finished Deleting Ranking Tables') 248 | 249 | 250 | if __name__ == '__main__': 251 | # crsp_us_universe(max_rank=500, rebuild_mc_ranking=True, link=True) 252 | # crsp_us_universe(max_rank=1000, link=True) 253 | # crsp_us_universe(max_rank=3000, link=True) 254 | # crsp_us_universe(min_rank=1000, max_rank=3000, link=True) 255 | # 256 | # # building compustat universes 257 | # compustat_us_universe(max_rank=500, rebuild_mc_ranking=True) 258 | # compustat_us_universe(max_rank=1000) 259 | # compustat_us_universe(max_rank=3000) 260 | # compustat_us_universe(min_rank=1000, max_rank=3000) 261 | 262 | # clear_master_ranking_table() 263 | pass 264 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/utils/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/utils/date_config.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import pandas as pd 4 | 5 | 6 | class 
DateConfig: 7 | """ 8 | Configures the dates for a dataframe 9 | 10 | This class can be used to align dates across pricing and factor data 11 | Once the class is configured it can be used an unlimited number of times to align dates of dataframes 12 | """ 13 | 14 | def __init__(self, 15 | freq: str, 16 | target_data_type: str = 'period', 17 | resample: bool = False, 18 | resample_key: str = None, 19 | grouper_keys: List[str] = None, 20 | date_format: str = None 21 | ) -> None: 22 | """ 23 | :param freq: the frequency we want to align to 24 | :param target_data_type: the type of date we want as output (timestamp, period) 25 | :param resample: whether to resample the data when changing frequencies 26 | :param resample_key: The column we are using to down sample the data 27 | Will keep the last value of the period 28 | :param grouper_keys: The columns we are using to group the data 29 | will be used in conjunction with resample_key 30 | can be none if we are not grouping 31 | This would be where asset_ids go 32 | :param date_format: the date format to use when converting from a string to period 33 | """ 34 | 35 | self._target_freq = freq 36 | self._target_data_type = target_data_type 37 | self._date_format = date_format 38 | self._resample = resample 39 | self._resample_key = resample_key 40 | self._grouper_keys = [] if grouper_keys is None else grouper_keys 41 | self._resample_master = f'old_{self._resample_key}_{self.__class__.__name__}' 42 | self._validate_inputs() 43 | 44 | def _validate_inputs(self) -> None: 45 | """ 46 | Ensures the inputs are valid 47 | :throws: ValueError if inputs are invalid 48 | """ 49 | if self._target_freq not in ['D', 'B', 'W', 'M', 'Q', 'A']: 50 | raise ValueError(f'Invalid target_freq: {self._target_freq}') 51 | 52 | # if not self._resample and len(self._grouper_keys) != 0: 53 | # raise ValueError(f'Cannot use grouper_keys without resampling') 54 | 55 | if self._target_data_type not in ['timestamp', 'period']: 56 | raise ValueError(f'Invalid target_data_type: {self._target_data_type}') 57 | 58 | def configure_dates(self, 59 | df: pd.DataFrame, 60 | date_columns: Union[List[str], str] 61 | ) -> pd.DataFrame: 62 | """ 63 | Adjusts the dates of the dataframe according to the configuration passed at initiation 64 | Can adjust columns as well as the index 65 | :param df: the dataframe to adjust 66 | :param date_columns: the date columns to adjust 67 | :return: the dataframe with the configured dates 68 | """ 69 | df = df.copy() 70 | 71 | if isinstance(date_columns, str): 72 | date_columns = [date_columns] 73 | 74 | index = None 75 | if not isinstance(df.index, pd.RangeIndex): 76 | index = df.index.name 77 | df = df.reset_index() 78 | 79 | self._validate_df(df, date_columns) 80 | df = self._prep_df(df, date_columns) 81 | for date_column in date_columns: 82 | df[date_column] = self._configure_dates(df[date_column]) 83 | df = self._resample_data(df, date_columns) 84 | df = self._clean_df(df) 85 | df = self._alter_types(df, date_columns) 86 | 87 | if index: 88 | df = df.set_index(index) 89 | return df 90 | 91 | def _alter_types(self, 92 | df: pd.DataFrame, 93 | date_columns: Union[List[str], str] 94 | ) -> pd.DataFrame: 95 | """ 96 | Alters the types of the dates to the target_data_type 97 | """ 98 | if self._target_data_type == 'timestamp': 99 | for date_column in date_columns: 100 | df[date_column] = df[date_column].dt.to_timestamp() 101 | return df 102 | 103 | def _clean_df(self, 104 | df: pd.DataFrame 105 | ) -> pd.DataFrame: 106 | """ 107 | Cleans the df after the
dates have been adjusted 108 | """ 109 | return df.drop(self._resample_master, axis=1, errors='ignore') 110 | 111 | def _prep_df(self, 112 | df: pd.DataFrame, 113 | date_columns 114 | ) -> pd.DataFrame: 115 | """ 116 | Preps the df for the dates to be adjusted 117 | Currently preps for a frequency conversion and subsequent down-sample or up-sample 118 | :throws: ValueError if the correct parameters are not passed at construction to do the resample 119 | """ 120 | 121 | if self._resample: 122 | if self._resample_key is None: 123 | df[self._resample_master] = df[date_columns[0]] 124 | else: 125 | df[self._resample_master] = df[self._resample_key] 126 | return df 127 | 128 | def _resample_data(self, 129 | df: pd.DataFrame, 130 | date_columns 131 | ) -> pd.DataFrame: 132 | """ 133 | Upsamples the data if resample is True 134 | """ 135 | if self._resample and len(self._grouper_keys) == 0 and len(df) > 10_000: 136 | print('Warning you are resampling a large dataframe without grouping.') 137 | 138 | if self._resample: 139 | date_key = date_columns[0] if self._resample_key is None else self._resample_key 140 | groupby_keys = self._grouper_keys.copy() + [date_key] 141 | df = df.sort_values(self._resample_master).groupby(groupby_keys).last().reset_index() 142 | 143 | return df 144 | 145 | def _validate_df(self, 146 | df, 147 | date_columns 148 | ) -> None: 149 | """ 150 | Ensures the inputs are valid for down sampling 151 | :throws: ValueError if inputs are invalid 152 | """ 153 | if self._resample and len(date_columns) != 1 and self._resample_key is None: 154 | raise ValueError(f'Cannot down sample multiple date columns: {date_columns}. Must pass resample_key.') 155 | 156 | if self._resample and self._resample_key is not None and self._resample_key not in date_columns: 157 | raise ValueError(f'resample_key: {self._resample_key} not in date_columns: {date_columns}') 158 | 159 | def _configure_dates(self, 160 | dates: pd.Series 161 | ) -> pd.Series: 162 | """ 163 | Adjusts the dates according to the configuration passed at initiation 164 | """ 165 | if not (pd.api.types.is_period_dtype(dates) or pd.api.types.is_datetime64_any_dtype(dates)): 166 | dates = self._to_datetime(dates) 167 | 168 | dates = self._configure_freq(dates) 169 | return dates 170 | 171 | def _configure_freq(self, 172 | dates: pd.Series 173 | ) -> pd.Series: 174 | """ 175 | Configures the frequency of the dates 176 | """ 177 | if pd.api.types.is_datetime64_any_dtype(dates): 178 | if dates.dt.tz: 179 | dates = dates.dt.tz_localize(None) 180 | return dates.dt.to_period(self._target_freq) 181 | if pd.api.types.is_period_dtype(dates): 182 | if dates.dt.freq != self._target_freq: 183 | return dates.dt.asfreq(self._target_freq) 184 | else: 185 | return dates 186 | else: 187 | raise ValueError(f'Invalid date date type: {dates.dtype}') 188 | 189 | def _to_datetime(self, 190 | dates 191 | ) -> pd.Series: 192 | """ 193 | Takes in a series of strings and parses them to dates 194 | :throws: ValueError if date_format is not passed at initiation 195 | """ 196 | if self._date_format is None: 197 | raise ValueError('date_format must be passed at initiation to parse dates from strings') 198 | return pd.to_datetime(dates, format=self._date_format) 199 | 200 | def copy(self, 201 | **kwargs 202 | ) -> 'DateConfig': 203 | """ 204 | Creates a copy of the object 205 | :param kwargs: the parameters to override when doing the copy 206 | """ 207 | base_kwargs = {'freq': self._target_freq, 208 | 'date_format': self._date_format, 209 | 'target_data_type': 
self._target_data_type, 210 | 'resample': self._resample, 211 | 'resample_key': self._resample_key, 212 | 'grouper_keys': self._grouper_keys} 213 | base_kwargs.update(kwargs) 214 | return self.__class__(**base_kwargs) 215 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/format_data_alphalens.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ntiles.toolbox.utils.handle_data import handle_duplicates, make_nan_inf_summary 4 | 5 | 6 | def price_format_for_alphalens(data: pd.DataFrame, factor: str, date_format: str = '', 7 | id_col: str = 'symbol') -> pd.DataFrame: 8 | """ 9 | formats the price data into the expected format by get_clean_factor_and_forward_returns 10 | out format of the data frame: index: 'date', columns: id_col 11 | data must contain 'date', id_col, can take in a dataframe with unlimited columns 12 | given df the 2 required columns names: 'date', id_col 13 | 14 | does not mutate the given dataframe 15 | 16 | :param data: the data to be turned into the format expected by prices field in get_clean_factor_and_forward_returns 17 | :param factor: the name of the factor column in the passed data 18 | :param date_format: the format to parse the date column in pd.datetime 19 | ` dont pass anything if no date conversion is wanted 20 | :param id_col: the asset identifier column for the data 21 | :return: data frame with data in format required by factor field in get_clean_factor_and_forward_returns 22 | """ 23 | data: pd.DataFrame = data.copy() 24 | 25 | _check_columns(data, id_col) 26 | _convert_to_date_time(data, date_format) 27 | 28 | pivot_table: pd.DataFrame = data.pivot_table(index='date', columns=id_col, values=factor) 29 | 30 | return pivot_table 31 | 32 | 33 | def factor_format_for_alphalens(data: pd.DataFrame, factor: str, date_format: str = '', max_loss: float = .1, 34 | id_col: str = 'symbol') -> pd.DataFrame: 35 | """ 36 | formats the alpha factor data into the expected format by get_clean_factor_and_forward_returns 37 | data must contain 'date', id_col, can take in a dataframe with unlimited columns 38 | out format of the data frame: index: ('date', id_col), columns: 'factor' 39 | given df the 1 required columns names: 'date' 40 | 41 | does not mutate the given data frame 42 | 43 | :param data: the data to be turned into the format expected by factor field in get_clean_factor_and_forward_returns 44 | :param factor: the name of the factor column in the passed data 45 | :param date_format: the format to parse the date column in pd.datetime 46 | ` pass nothing if no date conversion is wanted 47 | :param max_loss: the decimal percent of the factor that can be nan or infinity before we throw an error 48 | :param id_col: the asset identifier column for the data 49 | :return: data frame with data in required format by factor field in get_clean_factor_and_forward_returns 50 | """ 51 | data: pd.DataFrame = data.copy() 52 | 53 | _check_columns(data, id_col) 54 | _convert_to_date_time(data, date_format) 55 | 56 | # setting the index 57 | alpha_factor = data[['date', id_col, factor]].set_index(['date', id_col]) 58 | # dropping duplicates and printing a warning 59 | alpha_factor = handle_duplicates(df=alpha_factor, out_type='Warning', name='Given Factor', drop=True) 60 | # making a nan and inf summary along with dropping nan's 61 | alpha_factor = make_nan_inf_summary(df=alpha_factor, max_loss=max_loss) 62 | 63 | return alpha_factor 64 | 65 | 66 | def 
_check_columns(data: pd.DataFrame, id_col: str) -> None: 67 | """ 68 | checking to make sure the columns contain 'date' & id_col 69 | :param data: the data frame to check 70 | :param id_col: the identifier column we are checking for 71 | :return: Void, throws ValueError if the columns are bad 72 | """ 73 | # checking for the columns 'date' & id_col 74 | for needed in ['date', id_col]: 75 | if needed not in data.columns: 76 | raise ValueError(f'given df must have required columns \'date\' \'{id_col}\'') 77 | 78 | 79 | def _convert_to_date_time(data: pd.DataFrame, date_format: str) -> None: 80 | """ 81 | MUTATES the given dataframe 82 | converts the date column to a pandas datetime object. 83 | If the date_format is an empty string then nothing is changed 84 | :param data: the data frame to have the date changed 85 | :param date_format: the format of the date time string 86 | :return: Void 87 | """ 88 | 89 | if date_format != '': 90 | data['date'] = pd.to_datetime(data['date'].to_numpy(), format=date_format, utc=True) 91 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/handle_data.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def handle_duplicates(df: pd.DataFrame, out_type: str, name: str, drop: bool = False, 8 | subset: List[any] = None) -> pd.DataFrame: 9 | """ 10 | Checking to see if there are duplicates in the given data frame 11 | if there are duplicates, out_type will be used 12 | Ex: give a Warning or raise ValueError 13 | :param df: The data we are checking 14 | :param name: the name of the data to give as output 15 | :param out_type: what to do if there are duplicates.
Currently supports "Warning", "ValueError" 16 | :param drop: boolean to drop the duplicates or not 17 | if False no data frame will be returned and vice verse 18 | this param will not matter if outType is a ValueError 19 | :param subset: subset of df columns we should check duplicates for 20 | :return: the given df with duplicates dropped according to drop 21 | """ 22 | # seeing if there are duplicates in the factor 23 | dups = df.duplicated(subset=subset) 24 | 25 | if dups.any(): 26 | amount_of_dups = dups.sum() 27 | out_string = f'{name} is {round(amount_of_dups / len(df), 3)} duplicates, {amount_of_dups} rows\n' 28 | if out_type == 'Warning': 29 | Warning(out_string) 30 | elif out_type == 'ValueError': 31 | raise ValueError(out_string) 32 | else: 33 | raise ValueError(f'out_type {out_type} not recognised') 34 | 35 | # dropping the duplicates 36 | if drop: 37 | return df.drop_duplicates(subset=subset, keep='first') 38 | 39 | if drop: 40 | return df 41 | 42 | 43 | def make_nan_inf_summary(df: pd.DataFrame, max_loss: float) -> pd.DataFrame: 44 | """ 45 | makes a summary fot the the amount of nan and infinity values in the given data frame 46 | will throw a ValueError if the percent of nan and inf is greater than the given threshold 47 | prints a summary of the nan's and inf of there are any 48 | :param df: the data frame we are checking 49 | :param max_loss: max decimal percent of nan and inf we are allowing the df to contain 50 | :return: pandas data frame with the nan and inf dropped 51 | """ 52 | df_numpy = df.to_numpy() 53 | nan_array = np.isnan(df_numpy) 54 | finite_array = np.logical_or(np.isinf(df_numpy), np.isneginf(df_numpy)) 55 | 56 | if nan_array.any() or (not finite_array.all()): 57 | factor_length = len(df) 58 | amount_nan = nan_array.sum() 59 | amount_inf = finite_array.sum() 60 | total_percent_dropped = (amount_nan + amount_inf) / factor_length 61 | 62 | outString = f'Dropped {round(total_percent_dropped * 100, 2)}% of data. ' \ 63 | f'{round((amount_nan / factor_length) * 100, 2)}% due to nan, ' \ 64 | f'{round((amount_inf / factor_length) * 100, 2)}% of inf values. Threshold: {max_loss * 100}%\n' 65 | 66 | if total_percent_dropped > max_loss: 67 | raise ValueError('Exceeded Nan Infinity Threshold. ' + outString) 68 | 69 | # print out string as a summary 70 | print(outString) 71 | 72 | # dropping the nans and the infinity values 73 | df = df.replace([np.inf, -np.inf], np.nan).dropna() 74 | 75 | else: 76 | print('Dropped 0% of data') 77 | 78 | return df 79 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/ml_factor_calculation.py: -------------------------------------------------------------------------------- 1 | import gc 2 | from abc import ABC, abstractmethod 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from typing import Generator, Tuple, List 8 | 9 | from tqdm import tqdm 10 | 11 | 12 | class ModelWrapper(ABC): 13 | """ 14 | Wraps a model for calc_ml_factor. 15 | """ 16 | @abstractmethod 17 | def fit_model(self, train_features: pd.DataFrame, train_target: pd.Series) -> any: 18 | """ 19 | Wraps a model for use by the calcMlFactor function. 20 | Fits a model to the given features. then returns the fit model. 21 | If the fit model does not contain a "predict" method then predict mut be overwritten. 22 | 23 | :param train_features: the features to train the model on 24 | Must have the same index as train_target 25 | :param train_target: the target for the train_features. 
26 | Must have the same index as train_features 27 | :return: a model fit to the given features and targets 28 | """ 29 | pass 30 | 31 | @staticmethod 32 | def transform_data(train_features: pd.DataFrame, train_target: pd.Series, predict: pd.DataFrame) -> \ 33 | Tuple[pd.DataFrame, pd.DataFrame]: 34 | """ 35 | *** Do not fit any transformations on the predict data. That WILL result in lookahead bias. *** 36 | Only manipulate the predict data with transformations fit with the train_features 37 | 38 | This method is used to preprocess the training and predicting data before they are passed to the model 39 | 40 | The indexes must not be changed. However columns can be dropped and altered. 41 | Any change to the train_target must also be done to the predict data. 42 | 43 | Example use: fit a PCA to the train_features then transform the train_features and predict data using said PCA, 44 | or use RFE to reduce dimensionality 45 | 46 | :param train_features: the features to train the model on 47 | :param train_target: the target for the train_features 48 | :param predict: The data to make predictions on 49 | :return: the transformed (train_features, predict) with no index changes. 50 | """ 51 | return train_features, predict 52 | 53 | def predict(self, train_features: pd.DataFrame, train_target: pd.Series, predict: pd.DataFrame) -> pd.Series: 54 | """ 55 | fits a model to the given training data and then makes predictions with the fitted model 56 | fits a model by calling "fit_model". 57 | assumes the "fit_model" returns a model with a "predict" method. 58 | 59 | :param train_features: the features to train the model on 60 | Must have the same index as train_target 61 | :param train_target: the target for the train_features. 62 | Must have the same index as train_features 63 | :param predict: The data to make predictions on 64 | :return: a pandas Series of predictions indexed the same as the predict data 65 | """ 66 | # checks the index but is very slow 67 | # if not train_features.index.equals(train_target.index): 68 | # raise ValueError('The index for the features and target is different') 69 | 70 | # allowing the user to adjust the data before fitting, assuming that the user does not mess up the indexes 71 | transformed_features, transformed_predict = self.transform_data(train_features, train_target, predict) 72 | 73 | # fitting and making predictions with user defined model 74 | model: any = self.fit_model(transformed_features, train_target) 75 | predicted: pd.Series = pd.Series(data=model.predict(transformed_predict), index=predict.index) 76 | 77 | del model, train_features, train_target, predict, transformed_features, transformed_predict 78 | gc.collect() 79 | 80 | return predicted 81 | 82 | 83 | class SliceHolder: 84 | """ 85 | holds information on the start and end indexes for a slice. 86 | assumes start and end are immutable references 87 | """ 88 | 89 | def __init__(self, start, end): 90 | self.__start = start 91 | self.__end = end 92 | 93 | @property 94 | def start(self): 95 | return self.__start 96 | 97 | @property 98 | def end(self): 99 | return self.__end 100 | 101 | def __str__(self): 102 | return str(self.__start) + ', ' + str(self.__end) 103 | 104 | def __repr__(self): 105 | return self.__str__() 106 | 107 | 108 | def calc_ml_factor(model: ModelWrapper, features: pd.DataFrame, target: pd.Series, eval_days: int, refit_every: int, 109 | expanding: int = None, rolling: int = None) -> pd.Series: 110 | """ 111 | Calculates an alpha factor using an ML factor combination method.
112 | The model is fit and predictions are made in a ModelWrapper 113 | This function organizes the data so the model can make unbiased predictions 114 | on what would have been point in time data. 115 | 116 | This function assumes that the data passed has all trading days in it (first level of index). 117 | Ex: if the data is missing a trading day then the day-based train and predict windows will be misaligned. 118 | 119 | :param model: the ModelWrapper that will be used to make predictions. 120 | :param features: the features to train the model on 121 | there cannot be null values 122 | must have a multi index of (pd.Timestamp, symbol) 123 | :param target: the target we are going to fit the model to 124 | there cannot be null values 125 | must have a multi index of (pd.Timestamp, symbol) 126 | :param eval_days: IF INCORRECT THERE WILL BE LOOK AHEAD BIAS 127 | the amount of days it takes to know the prediction's outcome 128 | this number should simply be the length of return we are trying to predict 129 | :param refit_every: the amount of consecutive days to predict using a single model 130 | this is essentially saying refit the model every x days 131 | :param expanding: the minimum amount of days of data to train on 132 | if rolling is passed then this should not be passed 133 | if this value is passed then the model will be trained with an expanding window of data 134 | :param rolling: the amount of rolling days to fit a model to 135 | if expanding is passed then this should not be passed 136 | :return: pandas series of predictions. The index will be the same as "features" 137 | """ 138 | 139 | features_copy: pd.DataFrame = features.copy().sort_index() 140 | target_copy: pd.Series = target.copy().sort_index() 141 | 142 | if not np.isfinite(features_copy.values).all(): 143 | raise ValueError('There are nan or inf values in the features') 144 | if not np.isfinite(target_copy.values).all(): 145 | raise ValueError('There are nan or inf values in the target') 146 | if not isinstance(features_copy.index, pd.MultiIndex): 147 | raise ValueError('Features and target must have a pd.MultiIndex of (pd.Timestamp, str)') 148 | if not isinstance(features_copy.index.get_level_values(0), pd.DatetimeIndex): 149 | raise ValueError('Features and target must have index level 0 of pd.DatetimeIndex') 150 | if not features_copy.index.equals(target_copy.index): 151 | raise ValueError('The index for the features and target is different') 152 | 153 | train_predict_slices: Generator[Tuple[SliceHolder, SliceHolder], None, None] = \ 154 | generate_indexes(features_copy.index, eval_days, refit_every, expanding, rolling) 155 | 156 | ml_alpha: List[pd.Series] = [] 157 | for train_slice, predict_slice in tqdm(train_predict_slices): 158 | features_train = features_copy.loc[train_slice.start:train_slice.end] 159 | target_train = target_copy.loc[train_slice.start:train_slice.end] 160 | predict = features_copy.loc[predict_slice.start:predict_slice.end] 161 | ml_alpha.append(model.predict(features_train, target_train, predict)) 162 | 163 | del features_copy, target_copy 164 | gc.collect() 165 | 166 | return pd.concat(ml_alpha) 167 | 168 | 169 | def generate_indexes(data_index: pd.MultiIndex, eval_days: int, refit_every: int, expanding: int = None, 170 | rolling: int = None) -> Generator[Tuple[SliceHolder, SliceHolder], None, None]: 171 | """ 172 | Generates the slice indexes for the training and predicting periods.
173 | The function is designed to work with dates in level 0, however this is not enforced anywhere 174 | 175 | :param data_index: MultiIndex of the data we are generating integer indexes for 176 | :param eval_days: IF INCORRECT THERE WILL BE LOOK AHEAD BIAS 177 | the amount of days it takes to know the prediction's outcome 178 | this number should simply be the length of return we are trying to predict 179 | :param refit_every: the amount of consecutive days to predict using a single model 180 | this is essentially saying refit the model every x days 181 | :param expanding: the minimum amount of days of data to train on 182 | if rolling is passed then this should not be passed 183 | if this value is passed then the model will be trained with an expanding window of data 184 | :param rolling: the amount of rolling days to fit a model to 185 | if expanding is passed then this should not be passed 186 | :return: a generator with each iteration containing a tuple of two SliceHolders of dates. 187 | Slice One: training indexes 188 | Slice Two: predicting indexes 189 | """ 190 | 191 | if (eval_days < 1) or (refit_every < 1): 192 | raise ValueError('eval_days and/or refit_every must be greater than zero') 193 | if rolling is not None and (rolling < 1): 194 | raise ValueError('rolling must be greater than zero') 195 | if expanding is not None and (expanding < 1): 196 | raise ValueError('expanding must be greater than zero') 197 | if (not bool(expanding)) and (not bool(rolling)): 198 | raise ValueError('expanding or rolling must be defined') 199 | if bool(expanding) & bool(rolling): 200 | raise ValueError('expanding and rolling cannot both be defined') 201 | 202 | dates: np.array = data_index.get_level_values(0).drop_duplicates().to_numpy() 203 | 204 | start_place = expanding if expanding else rolling 205 | # don't have to ceil this because it won't matter with a < operator 206 | amount_of_loops: float = (len(dates) - start_place - eval_days) / refit_every 207 | 208 | i: int = 0 209 | while i < amount_of_loops: 210 | # .loc[] is inclusive in a slice, so everything here is inclusive 211 | train_end_index: int = (i * refit_every) + (start_place - 1) 212 | train_start_index: int = train_end_index - rolling + 1 if rolling else 0 213 | train_slice: SliceHolder = SliceHolder(dates[train_start_index], dates[train_end_index]) 214 | 215 | predict_start_index: int = train_end_index + eval_days + 1 216 | predict_end_index: int = predict_start_index + refit_every - 1 217 | # accounting for when the ending predicted index is out of bounds on the last loop 218 | if predict_end_index >= len(dates) - 1: 219 | predict_end_index: int = len(dates) - 1 220 | 221 | predict_slice: SliceHolder = SliceHolder(dates[predict_start_index], dates[predict_end_index]) 222 | 223 | i += 1 224 | yield train_slice, predict_slice 225 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import duckdb 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def calculate_ic(y_true: np.array, y_pred: np.array) -> float: 9 | """ 10 | computes the information coefficient for the predicted and true variables. 11 | This function can be given to a sklearn.model_selection hyper-parameter optimizer.
12 | 13 | Example use in sklearn: 14 | scoring = make_scorer(calculate_ic, greater_is_better=True) 15 | 16 | :param y_true: the true value of the target 17 | :param y_pred: the predicted value of the target 18 | :return: the information coefficient of the y_pred 19 | """ 20 | return np.corrcoef(y_true, y_pred)[0][1] 21 | 22 | 23 | def factorize(df: pd.DataFrame, partition_by: List[str], exclude=None): 24 | """ 25 | Factorizes each column of the given dataframe except for the partition_by columns and the exclude columns 26 | Will preserve indexes and period data types 27 | 28 | Calculates the centered zscore 29 | 30 | In the future we would like to winsorize at the 2.5% and 97.5% percentiles, but that is hard to do in SQL 31 | 32 | Won't rename the columns; will overwrite them 33 | 34 | :param df: the dataframe we are factorizing 35 | :param partition_by: What to partition by for calculating median and std; will normally be date and sector 36 | :param exclude: columns to exclude in the factorization process 37 | """ 38 | if exclude is None: 39 | exclude = [] 40 | 41 | return _duck_db_edits(df, _factorize(df, partition_by, exclude)) 42 | 43 | 44 | def _factorize(df: pd.DataFrame, partition_by: List[str], exclude: List[str]): 45 | select = partition_by + exclude 46 | for col in set(df.columns) - set(partition_by) - set(exclude): 47 | select.append( 48 | f'({col} - median({col}) OVER factorize_partition) / stddev({col}) OVER factorize_partition AS {col}') 49 | sql = f"""SELECT {', '.join(select)} 50 | FROM df 51 | WINDOW factorize_partition AS (PARTITION BY {', '.join(partition_by)}) 52 | ORDER BY {', '.join(partition_by)} 53 | """ 54 | return sql 55 | 56 | 57 | def rank(df: pd.DataFrame, partition_by: List[str], exclude=None, rank_type: str = 'percent_rank'): 58 | """ 59 | Ranks each column of the given dataframe except for the partition_by columns and the exclude columns 60 | Will preserve indexes and period data types 61 | Won't rename the columns; will overwrite them 62 | 63 | :param df: the dataframe we are ranking 64 | :param partition_by: What to partition by for calculating rank; will normally be date and sector 65 | :param exclude: columns to exclude in the ranking process 66 | :param rank_type: the type of rank we are performing 67 | """ 68 | if exclude is None: 69 | exclude = [] 70 | 71 | return _duck_db_edits(df, _rank(df, partition_by, exclude, rank_type)) 72 | 73 | 74 | def _rank(df: pd.DataFrame, partition_by: List[str], exclude: List[str], rank_type: str): 75 | select = partition_by + exclude 76 | for col in set(df.columns) - set(partition_by) - set(exclude): 77 | select.append( 78 | f"CASE WHEN {col} is NULL THEN NULL ELSE {rank_type}() OVER (PARTITION BY {', '.join(partition_by)} " 79 | f"ORDER BY {col}) END AS {col}") 80 | sql = f"""SELECT {', '.join(select)} 81 | FROM df 82 | ORDER BY {', '.join(partition_by)} 83 | """ 84 | return sql 85 | 86 | 87 | def ntile(df: pd.DataFrame, ntiles: int, partition_by: List[str], exclude=None): 88 | """ 89 | Ntiles each column of the given dataframe except for the partition_by columns and the exclude columns 90 | Will preserve indexes and period data types 91 | Won't rename the columns; will overwrite them 92 | :param df: the dataframe we are ntiling 93 | :param ntiles: the number of buckets to split each column into 94 | :param partition_by: What to partition by for calculating ntiles; will normally be date and sector 95 | :param exclude: columns to exclude in the ntiling process 96 | """ 97 | if exclude is None: 98 | exclude = [] 99 | 100 | return _duck_db_edits(df, _ntile(df, ntiles, partition_by, exclude)) 101 | 102 |
103 | def _ntile(df, ntiles, partition_by, exclude): 104 | select = partition_by + exclude 105 | for col in set(df.columns) - set(partition_by) - set(exclude): 106 | select.append( 107 | f" NTILE({ntiles}) OVER(PARTITION BY {', '.join(partition_by)} ORDER BY {col} DESC) as {col} ") 108 | sql = f"""SELECT {', '.join(select)} 109 | FROM df 110 | ORDER BY {', '.join(partition_by)} 111 | """ 112 | return sql 113 | 114 | 115 | def _duck_db_edits(df, sql): 116 | index_cols = None 117 | if not isinstance(df.index, pd.RangeIndex): 118 | index_cols = df.index.names 119 | df = df.reset_index() 120 | 121 | convert_to_period = [] 122 | for col in df.columns: 123 | if isinstance(df[col].dtype, pd.PeriodDtype): 124 | df[col] = df[col].dt.to_timestamp() 125 | convert_to_period.append(col) 126 | 127 | df = duckdb.query(sql).df() 128 | for col in convert_to_period: 129 | df[col] = df[col].dt.to_period('D') 130 | df = df.set_index(index_cols) if index_cols else df 131 | return df 132 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | from setuptools import find_packages 4 | 5 | setup( 6 | name='ntiles', 7 | version='0.1.5.1', 8 | packages=find_packages(), 9 | license='Apache License 2.0', 10 | description='Vectorized quantile backtester.', 11 | url='https://github.com/Alexd14/ntiles-backtester', 12 | download_url='https://github.com/Alexd14/ntiles/archive/refs/tags/v1.5.1.tar.gz', 13 | keywords=['factor', 'backtesting', 'alphalens', 'vectorized backtesting', 'equity trading'], 14 | install_requires=[ 15 | 'numba', 16 | 'pandas', 17 | 'numpy', 18 | 'matplotlib', 19 | 'empyrical', 20 | 'factor_toolbox', 21 | # 'equity-db' 22 | ], 23 | classifiers=[ 24 | 'License :: OSI Approved :: Apache Software License', 25 | 'Programming Language :: Python :: 3.7', 26 | 'Programming Language :: Python :: 3.8', 27 | 'Programming Language :: Python :: 3.9', 28 | 'Programming Language :: Python :: 3.10', 29 | ], 30 | ) 31 | --------------------------------------------------------------------------------
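Illustrative usage: a minimal sketch of how the toolbox utilities listed above might be combined. The column names ('date', 'symbol', 'my_factor') and the toy data are assumptions for illustration only, not anything defined in the repository.

import pandas as pd

from ntiles.toolbox.utils.date_config import DateConfig
from ntiles.toolbox.utils.utils import factorize, ntile

# toy long-format factor data; 'date', 'symbol' and 'my_factor' are hypothetical names
raw = pd.DataFrame({
    'date': ['2021-01-04', '2021-01-04', '2021-01-05', '2021-01-05'],
    'symbol': ['AAA', 'BBB', 'AAA', 'BBB'],
    'my_factor': [1.2, -0.4, 0.9, 0.3],
})

# parse the string dates and align them to daily periods, as described in DateConfig's docstring
config = DateConfig(freq='D', target_data_type='period', date_format='%Y-%m-%d')
aligned = config.configure_dates(raw, date_columns='date')

# cross-sectionally z-score the factor within each date, then bucket it into 2 ntiles
zscored = factorize(aligned, partition_by=['date'], exclude=['symbol'])
bucketed = ntile(zscored, ntiles=2, partition_by=['date'], exclude=['symbol'])
print(bucketed)

The same long-format frame could then be reshaped with factor_format_for_alphalens(aligned, factor='my_factor', id_col='symbol') for alphalens-style analysis.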