├── .gitattributes ├── .gitignore ├── LICENSE.txt ├── README.md ├── depreciated ├── equity_db_pricing_portal.py └── equity_db_sector_portal.py ├── ntiles ├── __init__.py ├── backtest │ ├── __init__.py │ ├── ntile_kicker.py │ ├── periods.py │ ├── plotter.py │ ├── portals │ │ ├── __init__.py │ │ ├── base_portal.py │ │ ├── pricing_portal.py │ │ └── sector_portal.py │ ├── stats.py │ ├── tears │ │ ├── __init__.py │ │ ├── backtest_tear.py │ │ ├── base_tear.py │ │ ├── ic_tear.py │ │ ├── inspection_tear.py │ │ ├── tilts_backtest_tear.py │ │ └── turnover_tear.py │ └── utils.py ├── examples │ ├── ic_ac.png │ ├── inspection_1.png │ ├── inspection_2.png │ ├── return_1.png │ └── return_2.png ├── tests │ ├── __init__.py │ ├── constitute_adjustment_test.py │ └── ml_factor_calculation_test.py └── toolbox │ ├── __init__.py │ ├── constitutes │ ├── __init__.py │ └── constitute_adjustment.py │ ├── db │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ └── sql_connection.py │ ├── read │ │ ├── __init__.py │ │ ├── cached_query.py │ │ ├── db_functions.py │ │ ├── query_constructor.py │ │ └── universe.py │ ├── settings.py │ └── write │ │ ├── __init__.py │ │ ├── create_tables.py │ │ └── make_universes.py │ └── utils │ ├── __init__.py │ ├── date_config.py │ ├── format_data_alphalens.py │ ├── handle_data.py │ ├── ml_factor_calculation.py │ └── utils.py └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | .idea/ 29 | .DS_Store 30 | **/.DS_Store 31 | MANIFEST 32 | *.ipynb 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # celery beat schedule file 99 | celerybeat-schedule 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | test/ 132 | depreciated/ -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2021 Alex DiCarlo 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ```python 2 | pip install ntiles 3 | ``` 4 | 5 | ### API 6 | ```python 7 | from ntiles import Ntile, PricingPortal, SectorPortal 8 | 9 | # getting the asset pricing data 10 | pricing_portal = PricingPortal(assets=my_universe, start='2017-01-01', end='2021-01-01') 11 | # getting the group data, this is optional 12 | group_portal = SectorPortal(assets=my_universe) 13 | 14 | # generating tearsheets 15 | tile = Ntile(pricing_portal=pricing_portal, group_portal=group_portal) 16 | tile.full_tear(factor=my_factor, ntiles=5, holding_period=20) 17 | ``` 18 | 19 | ### Example Tearsheet 20 | ![](ntiles/examples/inspection_1.png) 21 | ![](ntiles/examples/inspection_2.png) 22 | ![](ntiles/examples/return_1.png) 23 | ![](ntiles/examples/return_2.png) 24 | ![](ntiles/examples/ic_ac.png) -------------------------------------------------------------------------------- /depreciated/equity_db_pricing_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List, Optional 3 | 4 | import pandas as pd 5 | 6 | try: 7 | from equity_db import MongoAPI, ReadDB 8 | except ImportError: 9 | pass 10 | 11 | # from .ntiles.portals.base_portal import BaseDeltaPortal 12 | from .base_portal import BaseDeltaPortal 13 | """ 14 | No Longer used. See toolbox for the pricing portal. 15 | """ 16 | 17 | 18 | class PricingPortal(BaseDeltaPortal, ABC): 19 | """ 20 | Object to query and cache pricing data 21 | """ 22 | 23 | def __init__(self, assets: List[str], start: str, end: str, search_by: str = 'lpermno', 24 | pricing_field: str = 'prccd', adjustor_field: str = 'ajexdi', db: str = 'equity', 25 | collection: str = 'crsp', trading_calender='NYSE'): 26 | """ 27 | :param assets: The assets to get data for 28 | :param start: start of period to query 29 | :param end: end of period to query 30 | :param pricing_field: what field to use for pricing data 31 | :param adjustor_field: What field to use for adjusting the pricing data 32 | :param db: the data base to use 33 | :param collection: the _collection to query 34 | :param trading_calender: the trading calendar to use to verify dates 35 | """ 36 | super().__init__(assets, pd.Period(start, 'D'), pd.Period(end, 'D')) 37 | 38 | self._pricing_field = pricing_field 39 | 40 | self._adjusted_pricing: Optional[pd.DataFrame] = None 41 | self._period_delta: Optional[pd.DataFrame] = None 42 | self._query_adjusted_pricing(db, collection, assets, start, end, search_by, pricing_field, adjustor_field, 43 | trading_calender) 44 | 45 | @property 46 | def delta_data(self): 47 | """ 48 | :return: unstacked daily asset returns 49 | col: _asset_id; index: pd.period; values: daily asset returns 50 | """ 51 | if self._period_delta is None: 52 | self._period_delta = self.raw_data.unstack().pct_change(1).iloc[1:].fillna(0) 53 | 54 | return self._period_delta 55 | 56 | @property 57 | def raw_data(self) -> pd.Series: 58 | """ 59 | adjustments to AssetQuery: 60 | 1) Turns date column from pd.Timestamp into pd.Period 61 | 2) Turns the lpermno into an int 62 | 3) Adjusts the pricing_field: pricing_field / adjustor_field 63 | :return: Series of the adjusted pricing data indexed by date, lpermno 64 | """ 65 | if self._adjusted_pricing is None: 66 | raise ValueError('adjusted pricing is not set') 67 | 68 | return self._adjusted_pricing[self._pricing_field] 
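    # --- Illustrative sketch (not part of the original module) -----------------
    # The adjustment documented in `raw_data` boils down to three pandas steps.
    # A minimal, self-contained example with made-up values; the field names
    # prccd / ajexdi / lpermno simply mirror the defaults used in __init__:
    #
    #   import pandas as pd
    #   px = pd.DataFrame({'date': pd.to_datetime(['2021-01-04', '2021-01-05']),
    #                      'lpermno': ['14593', '14593'],
    #                      'prccd': [200.0, 101.0],
    #                      'ajexdi': [2.0, 1.0]})
    #   px['prccd'] = px['prccd'] / px['ajexdi']           # 1) split/dividend adjustment
    #   px['date'] = px['date'].dt.to_period(freq='D')     # 2) pd.Timestamp -> pd.Period
    #   px['id'] = px['lpermno'].astype(int)               # 3) permno string -> int id
    #   adjusted = px.set_index(['date', 'id'])['prccd']   # same shape as raw_data's output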
69 | 70 | @property 71 | def assets(self) -> List[int]: 72 | """ 73 | casting to int due to _db problem must fix 74 | :return: The id's of assets we have pricing data for 75 | """ 76 | return self._adjusted_pricing.index.get_level_values('id').astype(int).unique().tolist() 77 | 78 | @property 79 | def periods(self) -> List[pd.Period]: 80 | """ 81 | :return: the unique periods for which we have pricing data 82 | """ 83 | return self._adjusted_pricing.index.get_level_values('date').unique().tolist() 84 | 85 | def _query_adjusted_pricing(self, db, collection, assets, start, end, search_by, pricing_field, adjustor_field, 86 | trading_calender) -> None: 87 | """ 88 | Makes query the pricing data 89 | Performs adjustments defined in self.daily_pricing 90 | Then caches the adjusted pricing in self._adjusted_pricing 91 | self._adjusted_pricing columns: self._pricing_field, self._adjustor_field, Index: date, lpermno 92 | :return: None, mutates self._adjusted_pricing to contain adjusted pricing 93 | """ 94 | # querying pricing 95 | reader = ReadDB(MongoAPI(db, collection)) 96 | query_df = reader.get_asset_data(assets, search_by=search_by, start=pd.Timestamp(start), end=pd.Timestamp(end), 97 | fields=[pricing_field, adjustor_field]) 98 | pricing_df = query_df.set_calendar(trading_calender).df.reset_index() 99 | 100 | # possibly send command to close mongo to free up memory 101 | 102 | # adjusting data frame 103 | pricing_df[pricing_field] = pricing_df[pricing_field] / pricing_df[adjustor_field] 104 | pricing_df['date'] = pricing_df['date'].dt.to_period(freq='D') 105 | 106 | # currently code is requiring lpermno input wont work with tickers need to fix _db 107 | pricing_df['id'] = pricing_df['lpermno'].astype(int) 108 | pricing_df = pricing_df.set_index(['date', 'id']) 109 | 110 | self._adjusted_pricing = pricing_df 111 | 112 | self._query_summary(assets) # this can be cleaner 113 | 114 | def _query_summary(self, assets): 115 | """ 116 | prints a summary of query and tells you what id's were not able to be found in the query 117 | :return: None 118 | """ 119 | query_assets = self._adjusted_pricing.index.get_level_values(1).astype(str).unique().tolist() 120 | not_found_assets = set(assets) - set(query_assets) 121 | if len(not_found_assets) == 0: 122 | print('All assets retrieved in query!') 123 | else: 124 | print(f'Unable to find {len(not_found_assets)} assets: {not_found_assets}') 125 | -------------------------------------------------------------------------------- /depreciated/equity_db_sector_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import List 3 | 4 | import pandas as pd 5 | 6 | try: 7 | from equity_db import MongoAPI, ReadDB 8 | except ImportError: 9 | pass 10 | 11 | from ntiles.portals.base_portal import BaseGrouperPortalConstant 12 | 13 | 14 | class SectorPortal(BaseGrouperPortalConstant, ABC): 15 | def __init__(self, passed_assets: List[str], asset_id: str = 'lpermno', db: str = 'equity', 16 | collection: str = 'crsp'): 17 | """ 18 | :param asset_id: the assets to get the sector data for 19 | :param asset_id: what is the id of the asset, must be recognised by equity_db 20 | :param db: name of the db 21 | :param collection: name of the collection 22 | """ 23 | super().__init__(passed_assets, 'GIC Sector') 24 | self._passed_assets = passed_assets 25 | self._asset_id = asset_id 26 | self._db = db 27 | self._collection = collection 28 | 29 | self._sectors = None 30 | self._set_sectors() 31 | 32 | @property 
33 | def group_information(self) -> pd.Series: 34 | """ 35 | gets the gic _sectors for the give assets 36 | :return: DataFrame of GIC _sectors for the given assets 37 | """ 38 | if self._sectors is not None: 39 | return self._sectors 40 | 41 | self._set_sectors() 42 | return self._sectors 43 | 44 | @property 45 | def group_mapping(self): 46 | """ 47 | :return: dict mapping for the group 48 | """ 49 | return self.group_information.to_dict() 50 | 51 | def _set_sectors(self) -> None: 52 | """ 53 | Sets the _sectors in the class 54 | :return: None 55 | """ 56 | reader = ReadDB(MongoAPI(db=self._db, collection=self._collection)) 57 | query = reader.get_asset_data(self._passed_assets, search_by=self._asset_id, fields=['gsector']) 58 | self._sectors = query.df['gsector'] 59 | self._sectors.index = self._sectors.index.astype(str) 60 | 61 | @property 62 | def assets(self) -> List[int]: 63 | return self._sectors.reset_index().lpermno.astype(int).tolist() 64 | -------------------------------------------------------------------------------- /ntiles/__init__.py: -------------------------------------------------------------------------------- 1 | from ntiles import toolbox 2 | from ntiles import backtest 3 | -------------------------------------------------------------------------------- /ntiles/backtest/__init__.py: -------------------------------------------------------------------------------- 1 | from .ntile_kicker import Ntile 2 | from .portals.pricing_portal import PricingPortal 3 | from .portals.sector_portal import SectorPortal 4 | 5 | __all__ = [ 6 | 'Ntile', 7 | 'PricingPortal', 8 | 'SectorPortal', 9 | ] 10 | -------------------------------------------------------------------------------- /ntiles/backtest/ntile_kicker.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Dict, Iterable, Optional 3 | 4 | import pandas as pd 5 | import duckdb 6 | 7 | from .portals.base_portal import BaseGrouperPortalConstant 8 | from .portals.pricing_portal import PricingPortal 9 | from .tears.base_tear import BaseTear 10 | from .tears.ic_tear import ICHorizonTear, ICTear 11 | from .tears.inspection_tear import InspectionTear 12 | from .tears.tilts_backtest_tear import TiltsBacktestTear 13 | from .tears.turnover_tear import TurnoverTear 14 | 15 | 16 | class Ntile: 17 | def __init__(self, pricing_portal: PricingPortal, group_portal: Optional[BaseGrouperPortalConstant] = None): 18 | """ 19 | :param pricing_portal: the pricing portal which holds pricing data for all assets with factor values 20 | :param group_portal: group portal which holds grouping information for all assets with factor values 21 | if this is None then no group statistics will be calculated 22 | """ 23 | self._pricing_portal: PricingPortal = pricing_portal 24 | self._group_portal = group_portal 25 | 26 | self._factor_data = None 27 | self._ntile_matrix = None 28 | self._formatted_returns = None 29 | 30 | def _input_checks(self, factor_series) -> None: 31 | """ 32 | checks the factor series to ensure it meet requirements to run a tearsheet 33 | 34 | Requirements: 35 | 1) series must have MultiIndex with 2 levels 36 | 2) First level must be of type pd.Period 37 | 3) PricingPortal must have data for all Period dates in the series 38 | 4) There can only be one observations for a single asset on a single day 39 | 4) The factor and pricing have to have the same freq 40 | 41 | :param factor_series: the series we are checking 42 | :return: None 43 | :raise ValueError: if one of the 
requirements are not met 44 | """ 45 | 46 | # checking for series with multi index, possibly also check types for multi index 47 | if not isinstance(factor_series.index, pd.MultiIndex) or factor_series.index.nlevels != 2: 48 | raise ValueError('Factor input must have MultiIndex of period, id') 49 | 50 | # ensure the index level zero is date 51 | if not isinstance(factor_series.index.get_level_values(0), pd.PeriodIndex): 52 | raise ValueError('Factor input must have MultiIndex with the first level being a period ' 53 | f'current factor dtype is {type(factor_series.index.get_level_values(0))}') 54 | 55 | # we will check id when looking for overlapping portal names 56 | no_pricing_for = set(factor_series.index.get_level_values(1)).difference( 57 | self._pricing_portal.assets) 58 | if len(no_pricing_for) != 0: 59 | # raise ValueError(f'PricingPortal does not have data for: {no_pricing_for}') 60 | warnings.warn(f'PricingPortal does not have data for: {no_pricing_for}') 61 | 62 | # make sure pricing portal dates match up with factor 63 | overlapping_periods = set(factor_series.index.get_level_values(0).drop_duplicates()).intersection( 64 | self._pricing_portal.periods) 65 | if len(overlapping_periods) == 0: 66 | raise ValueError('No overlap between PricingPortal dates and factor dates') 67 | if len(overlapping_periods) < 100: 68 | warnings.warn(f'Only {len(overlapping_periods)} common periods between PricingPortal and factor') 69 | 70 | # check for multiple observations on a single day for a single asset 71 | if factor_series.index.duplicated().any(): 72 | raise ValueError('Multiple factor observations on single day for a single asset') 73 | 74 | # check the pricing and factor freq are the same 75 | if factor_series.index.get_level_values('date').freq != self._pricing_portal.delta_data.index.freq: 76 | raise ValueError('Factor and pricing dont have the same freq!') 77 | 78 | def _set_ntiles_and_returns(self, factor_data: pd.Series, ntiles: int): 79 | """ 80 | Sets self._formatted_returns and self._formatted_ntile 81 | :param factor_data: the factor data 82 | :param ntiles: amount of ntiles 83 | :return: None 84 | """ 85 | self._ntile_factor_sql(factor_data, ntiles) 86 | self._align_ntiles_pricing() 87 | 88 | # can see what % of the dataframe is null here 89 | self._make_null_summary(factor_data) 90 | 91 | def _align_ntiles_pricing(self) -> None: 92 | """ 93 | ensures ntiled matrix and daily returns matrix have the same column and row order 94 | sets self._formatted_returns and self._ntile_matrix 95 | :return: None 96 | """ 97 | ntile_factor = self._factor_data['ntile'].unstack() 98 | daily_returns = self._pricing_portal.delta_data 99 | 100 | factor_date = ntile_factor.index.get_level_values('date') 101 | self._formatted_returns = daily_returns[(daily_returns.index >= factor_date.min()) & 102 | (daily_returns.index <= factor_date.max())] 103 | 104 | # reindexing the ntiles data so that you have pricing and ntiles matching up 105 | self._ntile_matrix = ntile_factor.reindex_like(self._formatted_returns) 106 | 107 | def _make_null_summary(self, raw_factor_data) -> None: 108 | """ 109 | making a summary of how much factor data we matched to pricing data 110 | :param raw_factor_data: the raw unstacked factor data 111 | """ 112 | length_og_factor_data = len(raw_factor_data) 113 | # seeing what % of factor data is missing 114 | num_na_data_points = raw_factor_data.isnull().sum() 115 | pct_na_data_points = num_na_data_points / length_og_factor_data 116 | 117 | # amount of data droped because of non 
aligned factor and returns dates: 118 | # above should be non null length of ntiles before reindexing 119 | # non null length of ntiles after indexing 120 | number_of_finite_ntiles = length_og_factor_data - num_na_data_points 121 | binary_if_ntile_data = self._ntile_matrix.notnull() 122 | number_of_finite_ntiles_no_overlap_returns = number_of_finite_ntiles - binary_if_ntile_data.sum().sum() 123 | pct_missing_ntile_no_overlap = number_of_finite_ntiles_no_overlap_returns / number_of_finite_ntiles 124 | 125 | # amount of data we dont have returns for given we have overlapping pricing and factor 126 | # should ffill ntile by holdign period since we need return data holding_period days out 127 | binary_if_return_data = self._formatted_returns.notnull() 128 | # should forward fill by holding period to make sure we have pricing for when we will be holding the stock 129 | missing_from_no_returns_given_overlap = (number_of_finite_ntiles 130 | - (binary_if_ntile_data * binary_if_return_data).sum().sum()) 131 | pct_missing_data_no_returns_given_overlap = missing_from_no_returns_given_overlap / number_of_finite_ntiles 132 | 133 | # total number of unusable factor data points due to null or no maped returns 134 | num_bad = (num_na_data_points 135 | + number_of_finite_ntiles_no_overlap_returns 136 | + missing_from_no_returns_given_overlap 137 | ) 138 | 139 | pct_bad = num_bad / length_og_factor_data 140 | 141 | print(f"Unusable Factor Data: {(round(pct_bad, 4)) * 100}%") 142 | print(f"NA Factor Values: {(round(pct_na_data_points, 4)) * 100}%") 143 | print(f"No Overlapping Returns: {(round(pct_missing_ntile_no_overlap, 4)) * 100}%") 144 | print(f"Missing Returns Given Overlap: {(round(pct_missing_data_no_returns_given_overlap, 4)) * 100}%") 145 | 146 | def _ntile_factor(self, factor: pd.Series, ntiles: int) -> None: 147 | """ 148 | This is slow replaced by 149 | Universe relative Quantiles of a factor by day _ntile_factor_sql 150 | 151 | pd.DataFrame of ntiled factor 152 | index: (pd.Period, _asset_id) 153 | Columns: (factor, ntile) 154 | Values: (factor value, Ntile corresponding to factor value) 155 | 156 | :param factor: same var as ntile_return_tearsheet 157 | :param ntiles: same var as ntile_return_tearsheet 158 | """ 159 | # add a filter for if a day has less than 20% factor data then just put bin as -1 for all assets 160 | # unstack the frame, percentile rank each row, divide whole matrix buy 1/ntiles, take the floor of every number 161 | factor = factor[~factor.isnull()].to_frame('factor') 162 | 163 | try: 164 | factor['ntile'] = factor.groupby('date').transform( 165 | lambda date_data: ntiles - pd.qcut(date_data, ntiles, labels=False) 166 | ).sort_index() 167 | except Exception as e: 168 | print('Hit error while binning data. Need to push the histogram') 169 | print('Your data is mighty sus we can\'t Ntile it. 
This is normally due to bad data') 170 | 171 | # forcing a histogram out 172 | import matplotlib.pyplot as plt 173 | factor.groupby('date').count().plot() 174 | plt.show() 175 | 176 | raise e 177 | 178 | self._factor_data = factor 179 | 180 | def _ntile_factor_sql(self, factor: pd.Series, ntiles: int) -> None: 181 | """ 182 | Universe relative Quantiles of a factor by day 183 | Around 100X faster than pandas groupby qcut 184 | 185 | pd.DataFrame of ntiled factor 186 | index: (pd.Period, _asset_id) 187 | Columns: (factor, ntile) 188 | Values: (factor value, Ntile corresponding to factor value) 189 | 190 | :param factor: same var as ntile_return_tearsheet 191 | :param ntiles: same var as ntile_return_tearsheet 192 | """ 193 | factor_freq = factor.index.get_level_values('date').freq 194 | factor = factor.to_frame('factor').reset_index() 195 | factor['date'] = factor['date'].dt.to_timestamp() 196 | 197 | sql_quantile = f"""SELECT *, NTILE({ntiles}) OVER(PARTITION BY date ORDER BY factor.factor DESC) as ntile 198 | FROM factor 199 | WHERE factor.factor IS NOT NULL""" 200 | con = duckdb.connect(':memory:') 201 | factor = con.execute(sql_quantile).df() 202 | factor['date'] = factor['date'].dt.to_period(freq=factor_freq) 203 | factor = factor.set_index(['date', 'id']) 204 | 205 | self._factor_data = factor 206 | 207 | # 208 | # Start up methods 209 | # 210 | def _prep_for_run(self, factor: pd.Series, ntiles: int) -> None: 211 | """ 212 | prepares the ntiles class to run a tear sheet 213 | :param factor: factor for tear sheet 214 | :param ntiles: num ntiles for sheet 215 | :return: None 216 | """ 217 | # checking to see if we have series or data frame 218 | if isinstance(factor, pd.DataFrame): 219 | if factor.shape[1] > 1: # there is a df passed with multible columns 220 | raise ValueError('There are multiple columns in the passed DataFrame') 221 | 222 | factor_series = factor.iloc[:, 0] 223 | else: 224 | factor_series = factor.copy() 225 | 226 | self._input_checks(factor_series) 227 | 228 | factor_series.index.names = ['date', 'id'] 229 | self.kick_tears(factor_series, ntiles) 230 | 231 | self._print_start_end_dates() 232 | 233 | def _print_start_end_dates(self): 234 | """ 235 | prints the start and end date of the backtest 236 | """ 237 | date = self._factor_data.index.get_level_values(0) 238 | print(f'\nStart Date: {date.min()}') 239 | print(f'End Date: {date.max()}\n') 240 | 241 | def kick_tears(self, factor_series: pd.Series, ntiles: int) -> None: 242 | """ 243 | Clears the object of all factor and tear data. 
244 | Reruns Ntiling of factor 245 | :param factor_series: the user passed factor 246 | :param ntiles: the number of ntiles 247 | :return: None 248 | """ 249 | self._clear() 250 | self._set_ntiles_and_returns(factor_series, ntiles) 251 | 252 | def _clear(self) -> None: 253 | """ 254 | clears all data points in the object except the pricing portal 255 | :return: None 256 | """ 257 | self._factor_data = None 258 | self._ntile_matrix = None 259 | self._formatted_returns = None 260 | 261 | @staticmethod 262 | def _run(tears: Dict[str, BaseTear]) -> None: 263 | """ 264 | Runs all tear sheets that are set in the class 265 | :return: None 266 | """ 267 | for tear in tears.values(): 268 | tear.compute_plot() 269 | 270 | # 271 | # Tear Sheets Below 272 | # 273 | def full_tear(self, factor: pd.Series, ntiles: int, holding_period: int, long_short: bool = True, 274 | market_neutral=True, show_uni=False, show_ntile_tilts=False) -> Dict[str, BaseTear]: 275 | """ 276 | Creates basic visualizations of the factor data distribution by ntile and how complete the data is 277 | Creates a fan chart of cumulative returns for the given factor values. 278 | Creates a IC time series for the factor value and the forward returns 279 | Createa a turnover sheet showing how often the factor data will turn over 280 | 281 | The in the cumulative return plot, each value represents the cumulative return up to that days close. 282 | Returns are not shifted each value represents portfolios value on the close of that day. 283 | 284 | A set of weights is generated for each day based off factor quantile. 285 | The portfolio is rebalanced daily, each days 1/holding_period of the portfolio is rebalanced. 286 | All positions are equally weighted. 287 | 288 | :param factor: The factor values being tested. 289 | index: (pd.Period, _asset_id) 290 | values: (factor_value) 291 | :param holding_period: How long we want to hold positions for, represents days 292 | :param ntiles: amount of bins we are testing (1 is high factor value n is low value) 293 | :param long_short: show we compute the spread between ntiles: (1 - n) 294 | :param market_neutral: subtract out the universe returns from the ntile returns? 295 | :return: plots showing the return profile of the factor 296 | :param show_uni: Should universe return be shown in the spread plot? 297 | :param show_ntile_tilts: should we show each ntiles tilts? 298 | """ 299 | self._prep_for_run(factor, ntiles) 300 | tears = {'inspection_tear': InspectionTear(factor_data=self._factor_data), 301 | 'backtest_tear': TiltsBacktestTear(ntile_matrix=self._ntile_matrix, 302 | daily_returns=self._formatted_returns, ntiles=ntiles, 303 | holding_period=holding_period, long_short=long_short, 304 | market_neutral=market_neutral, 305 | show_uni=show_uni, factor_data=self._factor_data, 306 | group_portal=self._group_portal, 307 | show_ntile_tilts=show_ntile_tilts), 308 | 'ic_tear': ICTear(factor_data=self._factor_data, daily_returns=self._formatted_returns, 309 | holding_period=holding_period), 310 | 'turnover_tear': TurnoverTear(factor_data=self._factor_data, holding_period=holding_period)} 311 | self._run(tears) 312 | return tears 313 | 314 | def ntile_backtest_tear(self, factor: pd.Series, ntiles: int, holding_period: int, long_short: bool = True, 315 | market_neutral=True, show_uni=False, show_ntile_tilts=False) -> Dict[str, BaseTear]: 316 | """ 317 | Creates a fan chart of cumulative returns for the given factor values. 
318 | The factor values are ntile'd into ntiles number of bins 319 | 320 | The in the cumulative return plot, each value represents the cumulative return up to that days close. 321 | Returns are not shifted each value represents portfolios value on the close of that day. 322 | 323 | A set of weights is generated for each day based off factor quantile. 324 | The portfolio is rebalanced daily, each days 1/holding_period of the portfolio is rebalanced. 325 | All positions are equally weighted. 326 | 327 | :param factor: The factor values being tested. 328 | index: (pd.Period, _asset_id) 329 | values: (factor_value) 330 | :param holding_period: How long we want to hold positions for, represents days 331 | :param ntiles: amount of bins we are testing (1 is high factor value n is low value) 332 | :param long_short: show we compute the spread between ntiles: (1 - n) 333 | :param market_neutral: subtract out the universe returns from the ntile returns? 334 | :return: plots showing the return profile of the factor 335 | :param show_uni: Should universe return be shown in the spread plot? 336 | :param show_ntile_tilts: should we show each ntiles tilts? 337 | """ 338 | self._prep_for_run(factor, ntiles) 339 | tears = {'backtest_tear': 340 | TiltsBacktestTear(ntile_matrix=self._ntile_matrix, daily_returns=self._formatted_returns, 341 | ntiles=ntiles, holding_period=holding_period, long_short=long_short, 342 | market_neutral=market_neutral, show_uni=show_uni, factor_data=self._factor_data, 343 | group_portal=self._group_portal, show_ntile_tilts=show_ntile_tilts) 344 | } 345 | self._run(tears) 346 | return tears 347 | 348 | def ntile_inspection_tear(self, factor: pd.Series, ntiles: int) -> Dict[str, BaseTear]: 349 | """ 350 | creates visuals showing the factor data over time 351 | only calculates IC for when the asset is in the universe 352 | :param factor: The factor values being tested. 353 | index: (pd.Period, _asset_id) 354 | values: (factor_value) 355 | :param ntiles: the number of ntiles 356 | :return: Dict of InspectionTear 357 | """ 358 | self._prep_for_run(factor, ntiles) 359 | tears = {'inspection_tear': InspectionTear(factor_data=self._factor_data)} 360 | self._run(tears) 361 | return tears 362 | 363 | def ntile_ic_tear(self, factor: pd.Series, holding_period: int) -> Dict[str, BaseTear]: 364 | """ 365 | creates visuals showing the ic over time 366 | :param factor: The factor values being tested. 367 | index: (pd.Period, _asset_id) 368 | values: (factor_value) 369 | :param holding_period: How long we want to hold positions for, represents days 370 | :return: Dict of ICTear 371 | """ 372 | self._prep_for_run(factor, 1) 373 | tears = {'ic_tear': ICTear(factor_data=self._factor_data, daily_returns=self._formatted_returns, 374 | holding_period=holding_period)} 375 | self._run(tears) 376 | return tears 377 | 378 | def ntile_turnover_tear(self, factor: pd.Series, ntiles: int, holding_period: int) -> Dict[str, BaseTear]: 379 | """ 380 | Creates visuals showing the turnover over time 381 | :param factor: The factor values being tested. 
382 | index: (pd.Period, _asset_id) 383 | values: (factor_value) 384 | :param ntiles: the number of ntiles 385 | :param holding_period: How long we want to hold positions for, represents days 386 | :return: Dict of TurnoverTear 387 | """ 388 | self._prep_for_run(factor, ntiles) 389 | tears = {'turnover_tear': TurnoverTear(factor_data=self._factor_data, holding_period=holding_period)} 390 | self._run(tears) 391 | return tears 392 | 393 | def ntile_ic_horizon(self, factor: pd.Series, intervals: Iterable[int], show_individual: bool = False) -> \ 394 | Dict[str, BaseTear]: 395 | """ 396 | Shows the curve of the information coefficient over various holding periods 397 | 398 | :param factor: The factor values being tested. 399 | index: (pd.Period, _asset_id) 400 | values: (factor_value) 401 | :param intervals: an iterable that contains the holding periods we would like to make the IC frontier for 402 | :param show_individual: should each individual IC time series be show for every interval 403 | :return: Dict of ICHorizonTear 404 | """ 405 | self._prep_for_run(factor, 1) 406 | tears = { 407 | 'ic_horizon_tear': ICHorizonTear(factor_data=self._factor_data, daily_returns=self._formatted_returns, 408 | intervals=intervals, show_individual=show_individual)} 409 | self._run(tears) 410 | return tears 411 | -------------------------------------------------------------------------------- /ntiles/backtest/periods.py: -------------------------------------------------------------------------------- 1 | # 2 | # Taken from https://github.com/empyrical/blob/master/empyrical/periods.py 3 | # 4 | from typing import Union 5 | 6 | import pandas as pd 7 | 8 | APPROX_BDAYS_PER_MONTH = 21 9 | APPROX_BDAYS_PER_YEAR = 252 10 | 11 | MONTHS_PER_YEAR = 12 12 | WEEKS_PER_YEAR = 52 13 | QTRS_PER_YEAR = 4 14 | 15 | DAILY = 'daily' 16 | WEEKLY = 'weekly' 17 | MONTHLY = 'monthly' 18 | QUARTERLY = 'quarterly' 19 | YEARLY = 'yearly' 20 | 21 | PANDAS_PERIOD_TO_PERIOD_STRING = { 22 | 'D': DAILY, 23 | 'W': WEEKLY, 24 | 'M': MONTHLY, 25 | 'Q': QUARTERLY, 26 | 'Y': YEARLY 27 | } 28 | 29 | ANNUALIZATION_FACTORS = { 30 | DAILY: APPROX_BDAYS_PER_YEAR, 31 | WEEKLY: WEEKS_PER_YEAR, 32 | MONTHLY: MONTHS_PER_YEAR, 33 | QUARTERLY: QTRS_PER_YEAR, 34 | YEARLY: 1 35 | } 36 | 37 | 38 | def get_period_string(dates: Union[pd.PeriodIndex, pd.Series]) -> str: 39 | """ 40 | Gets the string definition of a period from a pandas.PeriodIndex or pandas Series 41 | :param dates: Pandas period index or columns of period we are getting the frequency for 42 | :return: a period string defined above 43 | """ 44 | if isinstance(dates, pd.Series): 45 | dates = dates.dt 46 | 47 | freq = dates.freq.name 48 | if freq not in PANDAS_PERIOD_TO_PERIOD_STRING: 49 | raise ValueError(f'Unknown frequency: {freq}') 50 | 51 | return PANDAS_PERIOD_TO_PERIOD_STRING[freq] 52 | 53 | 54 | def get_period_annualization(dates: Union[pd.PeriodIndex, pd.Series]) -> int: 55 | """ 56 | Gets the annualization factor that corresponds to the frequency of the given pandas.PeriodIndex or pandas Series 57 | :param dates: Pandas period index or columns of period we are getting the frequency for 58 | :return: The number of observations of the given date frequency in a year 59 | """ 60 | return ANNUALIZATION_FACTORS[get_period_string(dates)] 61 | -------------------------------------------------------------------------------- /ntiles/backtest/plotter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import 
matplotlib as mpl 5 | import matplotlib.pyplot as plt 6 | from IPython.core.display import display 7 | 8 | RETURN_COLOR_MAP = mpl.cm.get_cmap('jet') 9 | TILTS_COLOR_MAP = mpl.cm.get_cmap('tab20') 10 | IC_COLOR_MAP = mpl.cm.get_cmap('tab10') 11 | 12 | LARGE_FIGSIZE = 20, 10 13 | MEDIUM_FIGSIZE = 15, 8 14 | 15 | 16 | def ntile_return_plot(cum_ntile_returns: pd.DataFrame, title: str): 17 | """ 18 | generates cumulative return plot for a ntiles returns series 19 | if cols are empty list returns None 20 | :param cum_ntile_returns: cumulative returns we want to plot 21 | :param title: title of the plot 22 | :return: matplotlib axis with the return plot on it 23 | """ 24 | 25 | fig, ax = plt.subplots(1, 1, figsize=LARGE_FIGSIZE) 26 | 27 | cum_ntile_returns.plot(lw=2, ax=ax, cmap=RETURN_COLOR_MAP) 28 | ax.set(ylabel='Log Cumulative Returns', title=title, xlabel='', 29 | yscale='symlog') 30 | 31 | ax.legend(loc="center left", bbox_to_anchor=(1, .5)) 32 | ax.set_yscale('log', base=2) 33 | ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter('%.2f')) 34 | ax.axhline(1, linestyle='-', color='black', lw=1) 35 | fig.autofmt_xdate() 36 | 37 | plt.show() 38 | return ax 39 | 40 | 41 | def ntile_annual_return_bars(avg_annual_ret: pd.Series, period: int, freq: str): 42 | """ 43 | generates a box plot of the yearly CAGR for each ntile 44 | :return: matplotlib axis 45 | """ 46 | num_ntiles = len(avg_annual_ret) 47 | 48 | _, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 49 | ax.set(ylabel='% Return', 50 | title=f'Annual Return, {period}{freq} Holding period', 51 | xlabel='') 52 | 53 | colors = [RETURN_COLOR_MAP(i) for i in np.linspace(0, 1, num_ntiles)] 54 | ax.bar(avg_annual_ret.index, avg_annual_ret.to_numpy(), color=colors) 55 | ax.axhline(0, linestyle='-', color='black', lw=1) 56 | 57 | plt.show() 58 | return ax 59 | 60 | 61 | def plot_inspection_data(table: pd.DataFrame, title: str, ylabel: str, decimals: int = 0) -> None: 62 | """ 63 | plots the inspection data for inspection tear sheets 64 | :param table: the table to plot 65 | :param title: the title for the plot 66 | :param ylabel: y label for plot 67 | :param decimals: amount of decimals to display on the Y axis 68 | :return: None 69 | """ 70 | 71 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 72 | ax.set(title=title, ylabel=ylabel) 73 | table.plot(lw=2, ax=ax, cmap=RETURN_COLOR_MAP) 74 | ax.legend(loc="center left", bbox_to_anchor=(1, .5)) 75 | # ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%m-%Y')) 76 | ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter(f'%.{decimals}f')) 77 | fig.autofmt_xdate() 78 | 79 | if isinstance(table, pd.Series): 80 | ax.get_legend().remove() 81 | 82 | plt.show() 83 | 84 | 85 | def plot_tilts(frame: pd.DataFrame, ntile: str, group_name: str, ax=None): 86 | """ 87 | Plots the timeseries group tilts for a single ntile 88 | :param frame: frame containing the tilts per day, columns: group, index: pd.Period 89 | :param ntile: the Ntile we are plotting for 90 | :param group_name: the name of the group 91 | :param ax: axis to plot on 92 | :return: None 93 | """ 94 | if ax is None: 95 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 96 | 97 | ax.set(title=f'{ntile}, {group_name}'.title(), ylabel='Weight In Ntile') 98 | frame.plot(lw=2, ax=ax, cmap=TILTS_COLOR_MAP, legend=None) 99 | ax.axhline(0, linestyle='-', color='black', lw=1) 100 | # ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%m-%Y')) 101 | ax.yaxis.set_major_formatter(mpl.ticker.FormatStrFormatter(f'%.2f')) 102 | plt.show() 
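# --- Hypothetical usage sketch (not part of the original module) ---------------
# plot_tilts expects one row per period and one column per group, i.e. the
# per-day weight of each group inside a single ntile. The group names and
# numbers below are invented purely for illustration:
#
#   import pandas as pd
#   idx = pd.period_range('2021-01-04', periods=3, freq='D')
#   tilts = pd.DataFrame({'Energy': [0.02, 0.01, 0.03],
#                         'Technology': [-0.02, -0.01, -0.03]}, index=idx)
#   plot_tilts(tilts, ntile='Ntile: 1', group_name='GIC Sector')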
103 | 104 | 105 | def plot_tilt_hist(series, ntile: str, group_name: str, extra_space: bool = True): 106 | """ 107 | Plots the histogram group tilts for a single ntile 108 | :param series: frame containing the avg tilts, columns: group, index: pd.Period 109 | :param ntile: the Ntile we are plotting for 110 | :param group_name: the name of the group 111 | :return: None 112 | """ 113 | if extra_space: 114 | fig, ax = plt.subplots(1, 2, figsize=LARGE_FIGSIZE) 115 | else: 116 | _, ax = plt.subplots(1, 1, figsize=(4.5, 4.5)) 117 | 118 | title = 'Weight Relative to Universe' if 'Ntile' in group_name else 'Group Exposure' 119 | plotter_frame = series.to_frame('weight') 120 | plotter_frame['colors'] = [TILTS_COLOR_MAP(i) for i in np.linspace(0, 1, len(series))] 121 | plotter_frame = plotter_frame.sort_values('weight') 122 | 123 | ax[0].barh(plotter_frame.index.tolist(), plotter_frame['weight'].tolist(), align='center', 124 | color=plotter_frame['colors'].tolist()) 125 | ax[0].set(title=f'{ntile}, {group_name}'.title(), ylabel='Group', xlabel=title) 126 | ax[0].axvline(0, linestyle='-', color='black', lw=1) 127 | 128 | if extra_space: 129 | return ax[1] 130 | 131 | plt.show() 132 | 133 | 134 | def plot_timeseries_ic(ic_frame: pd.DataFrame, holding_period: int): 135 | """ 136 | plots the daily time series IC 137 | :param ic_frame: frame of IC to plot index: pd.Period 138 | :param holding_period: how long the holding period is for the IC 139 | :return: None 140 | """ 141 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 142 | ic_frame.plot(ax=ax, title=f'IC {holding_period} {ic_frame.index.freq.name} Holding Period') 143 | ax.get_lines()[1].set_linewidth(3) 144 | ax.axhline(0, linestyle='-', color='black', lw=1) 145 | fig.autofmt_xdate() 146 | plt.show() 147 | 148 | 149 | def plot_auto_corr(ac_series: pd.Series, holding_period: int) -> None: 150 | """ 151 | plots the daily time series IC 152 | :param ac_series: series of auto corr to plot index: pd.Period 153 | :param holding_period: how long the holding period is for the IC 154 | :return: None 155 | """ 156 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 157 | ac_series.plot(ax=ax, title=f'Autocorrelation {holding_period}{ac_series.index.freq.name} Holding Period') 158 | ax.axhline(ac_series.median(), linestyle=(0, (5, 10)), color='black', lw=1) 159 | fig.autofmt_xdate() 160 | plt.show() 161 | 162 | 163 | def plot_turnover(turn_frame: pd.Series, holding_period: int) -> None: 164 | """ 165 | plots the daily time series IC 166 | :param turn_frame: dataframe of turnover to plot index: pd.Period 167 | :param holding_period: how long the holding period is for the IC 168 | :return: None 169 | """ 170 | fig, ax = plt.subplots(1, 1, figsize=MEDIUM_FIGSIZE) 171 | colors = [RETURN_COLOR_MAP(i) for i in np.linspace(0, 1, turn_frame.columns.max())] 172 | 173 | for col in turn_frame.columns: 174 | ax.plot(turn_frame.index.to_timestamp(), turn_frame[col], color=colors[col - 1], label=f'Ntile: {col}') 175 | ax.axhline(turn_frame[col].median(), linestyle=(0, (5, 10)), color=colors[col - 1], lw=5) 176 | 177 | ax.set(ylabel='% Turnover', title=f'Turnover {holding_period}{turn_frame.index.freq.name} Holding Period', 178 | xlabel='') 179 | ax.legend(loc="center left", bbox_to_anchor=(1, .5)) 180 | fig.autofmt_xdate() 181 | plt.show() 182 | 183 | 184 | def plot_ic_horizon(horizon_frame: pd.DataFrame): 185 | ax_tuple = plt.subplots(2, 2, figsize=LARGE_FIGSIZE)[1].flatten() 186 | colors = [IC_COLOR_MAP(i) for i in np.linspace(0, 1, 4)] 187 | 188 | for i in 
range(horizon_frame.shape[1]): 189 | plot_me = horizon_frame.iloc[:, i] 190 | plot_me.plot(ax=ax_tuple[i], color=colors[i], title=plot_me.name) 191 | plt.show() 192 | 193 | 194 | def render_heat_table(frame: pd.DataFrame) -> None: 195 | """ 196 | renders a dataframe as a heatmap 197 | :param frame: the frame to render 198 | :return: None 199 | """ 200 | cm = mpl.cm.get_cmap('RdYlGn') 201 | styled = frame.style.background_gradient(cmap=cm, axis=0).format('{:.2f}').set_properties( 202 | **{'text-align': 'center'}) 203 | render_table(styled) 204 | 205 | 206 | def render_table(table: pd.DataFrame, output: str = None) -> None: 207 | """ 208 | displays a table to the user 209 | :param table: the table to display 210 | :param output: the output we should render 211 | :return: None 212 | """ 213 | if output: 214 | print(output) 215 | display(table) 216 | -------------------------------------------------------------------------------- /ntiles/backtest/portals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/backtest/portals/__init__.py -------------------------------------------------------------------------------- /ntiles/backtest/portals/base_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Union 3 | 4 | import pandas as pd 5 | 6 | 7 | class BasePortal(ABC): 8 | """ 9 | Base class for the data portal object 10 | """ 11 | 12 | def __init__(self, assets: List[Union[str, int]]): 13 | """ 14 | :param assets: the assets we are querying for 15 | """ 16 | self._assets = assets 17 | 18 | @property 19 | @abstractmethod 20 | def assets(self) -> List[Union[str, int]]: 21 | """ 22 | returns the assets property 23 | """ 24 | return self._assets 25 | 26 | 27 | class BaseTimeSeriesPortal(BasePortal): 28 | """portal for time series data""" 29 | 30 | def __init__(self, assets: List[Union[str, int]], start: pd.Period, end: pd.Period, freq: str): 31 | """ 32 | :param assets: the assets we are querying for 33 | :param start: start date for the query 34 | :param end: end date for the query 35 | :param freq: frequency of the data 36 | """ 37 | super().__init__(assets) 38 | self._start = start 39 | self._end = end 40 | self._freq = freq 41 | 42 | @property 43 | @abstractmethod 44 | def periods(self) -> List[pd.Period]: 45 | """ 46 | :return: the unique periods for which we have data 47 | """ 48 | pass 49 | 50 | 51 | class BaseRawPortal(BaseTimeSeriesPortal, ABC): 52 | def __init__(self, assets: List[Union[str, int]], start: pd.Period, end: pd.Period): 53 | """ 54 | :param assets: the assets we are querying for 55 | :param start: start date for the query 56 | :param end: end date for the query 57 | """ 58 | super().__init__(assets, start, end) 59 | 60 | @property 61 | @abstractmethod 62 | def raw_data(self) -> pd.DataFrame: 63 | """ 64 | returns the raw data held by the portal 65 | :return: Index: Id, pd.Period; Columns: 'data'; Values: data 66 | """ 67 | pass 68 | 69 | 70 | class BaseDeltaPortal(BaseTimeSeriesPortal, ABC): 71 | """ 72 | a portal which fetches and calculates the raw data long with delta or percent delta of a variable. 
73 | Useful for fetching and calculating returns 74 | """ 75 | 76 | @property 77 | @abstractmethod 78 | def delta_data(self): 79 | """ 80 | returns the delta of the data held by the portal 81 | :return: Index: Id, pd.Period; Columns: 'delta'; Values: data 82 | """ 83 | pass 84 | 85 | 86 | class BaseGrouperPortalConstant(BasePortal, ABC): 87 | """ 88 | A portal which fetches grouping data 89 | """ 90 | 91 | def __init__(self, assets: List[Union[str, int]], group_name: str): 92 | """ 93 | :param assets: the assets we are querying for 94 | :param group_name: the name of the grouping 95 | """ 96 | super().__init__(assets) 97 | self.group_name = group_name 98 | 99 | @property 100 | def name(self): 101 | """ 102 | :return: Name of group 103 | """ 104 | return self.group_name 105 | 106 | @property 107 | @abstractmethod 108 | def group_information(self) -> pd.Series: 109 | """ 110 | Holds group information from the portal 111 | :return: Index: Id; Columns: 'group'; Values: group 112 | """ 113 | pass 114 | 115 | @property 116 | @abstractmethod 117 | def group_mapping(self): 118 | """ 119 | :return: dict mapping for the group 120 | """ 121 | pass 122 | 123 | 124 | class BaseGrouperPortalTimeSeries(BaseTimeSeriesPortal, ABC): 125 | """ 126 | a portal which returns grouping information over a time period 127 | """ 128 | 129 | def __init__(self, assets: List[Union[str, int]], start: pd.Period, end: pd.Period, group_name: str): 130 | """ 131 | :param assets: the assets we are querying for 132 | :param start: start date for the query 133 | :param end: end date for the query 134 | :param group_name: the name of the grouping 135 | """ 136 | super().__init__(assets, start, end) 137 | self.group_name = group_name 138 | 139 | @property 140 | def name(self): 141 | """ 142 | :return: Name of group 143 | """ 144 | return self.group_name 145 | 146 | @property 147 | @abstractmethod 148 | def periods(self) -> List[pd.Period]: 149 | """ 150 | :return: the unique periods for which we have data 151 | """ 152 | pass 153 | 154 | @property 155 | @abstractmethod 156 | def group_information(self) -> pd.DataFrame: 157 | """ 158 | Holds a timeseries of group information from the portal 159 | :return: Index: Id, pd.Period; Columns: 'group'; Values: group 160 | """ 161 | pass 162 | -------------------------------------------------------------------------------- /ntiles/backtest/portals/pricing_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Iterable, List, Union 3 | 4 | import pandas as pd 5 | import numpy as np 6 | from .base_portal import BaseDeltaPortal 7 | 8 | from ntiles.toolbox import QueryConstructor, SQLConnection 9 | 10 | 11 | class PricingPortal(BaseDeltaPortal, ABC): 12 | """ 13 | Pulls pricing from database 14 | """ 15 | 16 | def __init__(self, 17 | assets: Union[Iterable, str], 18 | search_by: str, 19 | start_date: str, 20 | end_date: str, 21 | field: str = 'prc', 22 | table: str = 'CRSP.sd', 23 | con: SQLConnection = None, 24 | freq: str = 'D', 25 | ): 26 | """ 27 | :param assets: The assets we want ti search for. Can be list of ids or a code eg "ETF_SPY". 28 | :param search_by: The name of the asset ids we are searching the database by. 29 | :param start_date: The date to start getting pricing. Format: %Y-%m-%d 30 | :param end_date: The date to stop getting pricing. Format: %Y-%m-%d 31 | :param field: The pricing field to get from the database. Default: 'prc' 32 | :param table: The table to get the pricing from. 
Default: 'CRSP.sd' 33 | :param con: A SQLConnection object to use to connect to the database. Default: None 34 | :param freq: The frequency of the pricing. Default: 'D' 35 | """ 36 | super().__init__(assets=assets, 37 | start=pd.Period(start_date), 38 | end=min(pd.Timestamp(end_date), pd.Timestamp('today')).to_period('D'), 39 | freq=freq) 40 | self._search_by = search_by 41 | self._field = field 42 | self._table = table 43 | self._con = con 44 | self._freq = freq 45 | 46 | self._pricing = None 47 | self._get_pricing() 48 | 49 | @property 50 | def assets(self) -> List[any]: 51 | return self._pricing.columns.tolist() 52 | 53 | @property 54 | def delta_data(self) -> pd.DataFrame: 55 | """ 56 | returns the delta of the data held by the portal 57 | :return: Index: Id, pd.Period; Columns: 'delta'; Values: data 58 | """ 59 | return self._pricing 60 | 61 | @property 62 | def periods(self) -> List[pd.Period]: 63 | return self._pricing.index.drop_duplicates().to_list() 64 | 65 | def _get_pricing(self): 66 | df = (QueryConstructor(sql_con=self._con, freq=self._freq) 67 | .query_timeseries_table(self._table, assets=self._assets, 68 | start_date=str(self._start), end_date=str(self._end), 69 | search_by=self._search_by, fields=[self._field]) 70 | .distinct() 71 | .set_calendar('NYSE') 72 | .order_by('date') 73 | .dropna(self._field) 74 | .df) 75 | 76 | self._pricing = df[self._field].unstack().pct_change(1).iloc[1:]. \ 77 | fillna(0).replace([np.inf, -np.inf], 0).clip(-.75, 1.5) 78 | -------------------------------------------------------------------------------- /ntiles/backtest/portals/sector_portal.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Iterable, List, Union 3 | 4 | import pandas as pd 5 | 6 | from ...toolbox import QueryConstructor 7 | from .base_portal import BaseGrouperPortalConstant 8 | 9 | 10 | class SectorPortal(BaseGrouperPortalConstant, ABC): 11 | def __init__(self, assets: Union[Iterable, str], search_by: str = 'permno', field='gsector', con=None, 12 | start_date=None, end_date=None, ): 13 | """ 14 | :param assets: the assets or universe to get the sector data for 15 | :param search_by: what is the id of the asset 16 | :param field: name of field we want to get 17 | """ 18 | super().__init__(assets, 'GIC Sector') 19 | self._search_by = search_by 20 | self._field = field 21 | self._con = con 22 | self._start_date = start_date 23 | self._end_date = end_date 24 | 25 | self._group = None 26 | self._set_sectors() 27 | 28 | @property 29 | def group_information(self) -> pd.Series: 30 | """ 31 | gets the gic _sectors for the give assets 32 | :return: DataFrame of GIC _sectors for the given assets 33 | """ 34 | return self._group 35 | 36 | @property 37 | def group_mapping(self): 38 | """ 39 | :return: dict mapping for the group 40 | """ 41 | return self.group_information.to_dict() 42 | 43 | def _set_sectors(self) -> None: 44 | """ 45 | Sets the _sectors in the class 46 | :return: None 47 | """ 48 | self._group = (QueryConstructor(self._con) 49 | .query_no_date_table(table='link.crsp_cstat_link', fields=[self._field, 'lpermno as permno'], 50 | assets=self._assets, search_by=self._search_by, start_date=self._start_date, 51 | end_date=self._end_date) 52 | .df)[self._field].fillna(-1) 53 | 54 | @property 55 | def assets(self) -> List[int]: 56 | return self._group.index.tolist() 57 | -------------------------------------------------------------------------------- /ntiles/backtest/stats.py: 
--------------------------------------------------------------------------------
1 | import empyrical
2 | import pandas as pd
3 | import numpy as np
4 | 
5 | from . import plotter
6 | from .periods import get_period_annualization, get_period_string
7 | 
8 | 
9 | def generate_return_stats(period_returns, flip_mdd) -> None:
10 |     """
11 |     generates the following return statistics for each Ntile:
12 |     - Sharpe
13 |     - Annual Return
14 |     - Annual Vol
15 |     - % Periods Up
16 |     - Max Drawdown (flips for top and bottom bins, excluding middle bin)
17 | 
18 |     If long_short = True:
19 |     - All above calculated on spread and universe
20 |     - Annual Tracking Error
21 |     - Information Ratio
22 |     :param period_returns: the returns we are calculating stats for
23 |     :param flip_mdd: should max draw down be flipped around the center?
24 |     """
25 |     ntile_funcs = {
26 |         'sharpe': sharpe_ratio,
27 |         'CAGR': simple_returns_CGAR,
28 |         'Vol': annual_volatility,
29 |         'Max Drawdown': lambda x: max_drawdown(x, flip_mdd),
30 |         '% Periods Up': percent_periods_up,
31 |     }
32 | 
33 |     calculated_stats = [func(period_returns) for func in ntile_funcs.values()]
34 |     render_me = pd.DataFrame(calculated_stats).transpose()
35 |     plotter.render_heat_table(render_me)
36 | 
37 | 
38 | def compute_ntile_stats(name, func, ntile_returns) -> pd.Series:
39 |     """
40 |     apply a function to each column of ntile_returns
41 |     :param name: name of the function
42 |     :param func: the function to apply
43 |     :param ntile_returns: the returns we are applying the function to
44 |     :return: pd.Series, index: Ntile; Name: name;
45 |     """
46 |     return ntile_returns.apply(func, axis=0).rename(name)
47 | 
48 | 
49 | def max_drawdown(period_returns, flip_bottom) -> pd.Series:
50 |     """
51 |     computes the max drawdown for each column
52 |     flips the drawdown from downside to upside for ntiles that should be negative
53 |     :param period_returns: the returns we are getting the drawdown for
54 |     :param flip_bottom: should the drawdown sign be flipped for the bottom half of the ntiles?
55 |     :return: pd.Series, index: Ntile; Values: drawdown
56 |     """
57 |     adj_ret = period_returns.copy()  # gets rid of setting on copy warning
58 |     num_cols = period_returns.shape[1]
59 | 
60 |     if flip_bottom:
61 |         mid_pos = int(round(num_cols / 2 + .5)) - 1
62 |         adj_ret.iloc[:, mid_pos:] = adj_ret.iloc[:, mid_pos:] * -1
63 |         out = compute_ntile_stats('Max Drawdown', empyrical.max_drawdown, adj_ret)
64 | 
65 |         if num_cols % 2 == 1:  # if the number of columns is odd, blank out the middle ntile
66 |             out.iloc[mid_pos] = None
67 | 
68 |         return out * 100
69 | 
70 |     return compute_ntile_stats('Max Drawdown', empyrical.max_drawdown, adj_ret) * 100
71 | 
72 | 
73 | def percent_periods_up(period_returns) -> pd.Series:
74 |     """
75 |     computes the percent of periods where return is > 0
76 |     :param period_returns: the returns we are getting the % of periods up for
77 |     :return: pd.Series, index: Ntile; Values: % periods up
78 |     """
79 |     periods_up = period_returns.copy()
80 |     periods_up.iloc[:] = np.where(period_returns.values > 0, 1, 0)
81 |     return (periods_up.sum(axis=0) / periods_up.shape[0]).rename('% Periods Up')
82 | 
83 | 
84 | def annual_volatility(period_returns) -> pd.Series:
85 |     """
86 |     computes the annual volatility of each column
87 |     :param period_returns: the returns we are getting the vol for
88 |     :return: pd.Series, index: Ntile; Values: annual vol
89 |     """
90 |     vol_func = wrap_emprical_period(empyrical.annual_volatility)
91 |     return compute_ntile_stats('Annual Vol', vol_func, period_returns) * 100
92 | 
93 | 
94 | def sharpe_ratio(period_returns) -> pd.Series:
95 |     """
96 |     computes the sharpe ratio for each column
97 |     :param period_returns: the returns we are getting the sharpe of
98 |     :return: pd.Series, index: Ntile; Values: sharpe
99 |     """
100 |     sharpe_func = wrap_emprical_period(empyrical.sharpe_ratio)
101 |     return compute_ntile_stats('Sharpe', sharpe_func, period_returns)
102 | 
103 | 
104 | def simple_returns_CGAR(period_returns) -> pd.Series:
105 |     """
106 |     computes the CAGR from simple returns
107 |     :param period_returns: the simple returns to compute the CAGR from
108 |     :return: series with index: cum_returns.columns; values: corresponding average return in percent
109 |     """
110 |     return CAGR(cum_returns(period_returns))
111 | 
112 | 
113 | def CAGR(cum_returns_df: pd.DataFrame) -> pd.Series:
114 |     """
115 |     calculates the geometric average yearly ntile returns from the given cumulative returns
116 |     Assumed the data is in daily format
117 |     :param cum_returns_df: can be the full cumulative returns or just a subset of its columns
118 |     :return: series with index: cum_returns_df.columns; values: corresponding average return in percent
119 |     """
120 |     ann_factor = get_period_annualization(cum_returns_df.index)
121 |     return ((cum_returns_df.iloc[-1] ** (1 / (cum_returns_df.shape[0] / ann_factor)) - 1) * 100).rename('CAGR')
122 | 
123 | 
124 | def cum_returns(simple_returns: pd.DataFrame) -> pd.DataFrame:
125 |     """
126 |     Calculates the cumulative returns from the simple returns.
127 |     wraps empyrical.cum_returns
128 |     :param simple_returns: returns used to calculate the cumulative returns
129 |     :return: cumulative returns
130 |     """
131 |     return empyrical.cum_returns(simple_returns, starting_value=1)
132 | 
133 | 
134 | def wrap_emprical_period(func):
135 |     """
136 |     Wraps an empyrical function that takes in returns and a period
137 |     :param func: the empyrical function to wrap
138 |     :return: function that takes in returns and infers the frequency of the data
139 |         and passes a period to the empyrical function
140 |     """
141 | 
142 |     def inner_wrapper(period_returns):
143 |         return func(returns=period_returns, period=get_period_string(period_returns.index))
144 | 
145 |     return inner_wrapper
146 | 
--------------------------------------------------------------------------------
/ntiles/backtest/tears/backtest_tear.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from abc import ABC
3 | 
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | from ntiles.backtest.tears.base_tear import BaseTear
8 | from ntiles.backtest import plotter, stats, utils
9 | 
10 | 
11 | class BacktestTear(BaseTear, ABC):
12 |     """
13 |     Computes returns and stats from the given factor and pricing data
14 | 
15 |     Upgrades:
16 |         Have a cash account for when a security gets delisted and we own it
17 |         One day holding period
18 | 
19 |     """
20 | 
21 |     def __init__(self, ntile_matrix: pd.DataFrame, daily_returns: pd.DataFrame, ntiles, holding_period: int,
22 |                  long_short: bool, market_neutral: bool, show_uni: bool):
23 |         """
24 |         :param ntile_matrix: unstacked and formatted ntiles prepared by Ntiles
25 |         :param daily_returns: unstacked and formatted daily returns from Ntiles
26 |         :param holding_period: How long we want to hold positions for, represents days
27 |         :param ntiles: amount of bins we are testing (1 is high factor value n is
low value) 28 | :param long_short: show we compute the spread between ntiles: (1 - n) 29 | :param market_neutral: subtract out the universe returns from the ntile returns? 30 | :param show_uni: suhould universe return be shown in the spread plot? 31 | """ 32 | 33 | super().__init__() 34 | 35 | self.ntile_matrix = ntile_matrix 36 | self.daily_returns = daily_returns 37 | self.ntiles = ntiles 38 | self.holding_period = holding_period 39 | self.long_short = long_short 40 | self.market_neutral = market_neutral 41 | self.show_uni = show_uni 42 | 43 | self.daily_weights = {} 44 | self.weighted_returns = {} 45 | self._daily_tile_returns = None 46 | 47 | def compute(self) -> None: 48 | """ 49 | method to run the backtest 50 | """ 51 | self.kick_backtest() 52 | 53 | def plot(self): 54 | """ 55 | method to plot the data for the backtest 56 | """ 57 | self.kick_visuals() 58 | 59 | # 60 | # Vectorized Ntile Backtest 61 | # 62 | def kick_backtest(self): 63 | """ 64 | Calculates the daily returns of each ntile 65 | Saves the daily returns in self._daily_tile_returns 66 | index: pd.Period 67 | columns: Ntile: {ntile} 68 | Values: Daily close ntile returns on corresponding day 69 | :return: None 70 | """ 71 | 72 | daily_ntile_returns = self._get_ntile_returns_helper() 73 | 74 | if self.long_short: 75 | daily_ntile_returns[f'1 vs {self.ntiles}'] = (daily_ntile_returns.iloc[:, 0] - 76 | daily_ntile_returns.loc[:, f'Ntile: {self.ntiles}']) / 2 77 | 78 | if self.ntiles > 3: 79 | daily_ntile_returns[f'2 vs {self.ntiles - 1}'] = (daily_ntile_returns.iloc[:, 1] - 80 | daily_ntile_returns.loc[:, 81 | f'Ntile: {self.ntiles - 1}']) / 2 82 | 83 | self._daily_tile_returns = daily_ntile_returns 84 | 85 | def _get_ntile_returns_helper(self) -> pd.DataFrame: 86 | """ 87 | Helper to get the returns for each ntile on each day 88 | :return: data frame index: pd.period; columns: Ntile; values: daily returns 89 | """ 90 | np_ntile_matrix = self.ntile_matrix.to_numpy() 91 | np_asset_returns_matrix = self.daily_returns.to_numpy() 92 | 93 | out = {} 94 | for ntile in range(1, self.ntiles + 1): 95 | out[f'Ntile: {ntile}'] = self._compute_daily_ntile_returns(np_ntile_matrix, np_asset_returns_matrix, ntile, 96 | self.holding_period) 97 | 98 | universe_ntile_matrix = np.where(np.isfinite(np_ntile_matrix), 1, np.nan)[self.holding_period - 1:] 99 | universe_returns_matrix = np_asset_returns_matrix[self.holding_period - 1:] 100 | 101 | out['universe'] = self._compute_daily_ntile_returns(universe_ntile_matrix, universe_returns_matrix, 1, 1) 102 | 103 | if self.holding_period != 1: 104 | index_values = self.ntile_matrix.index[self.holding_period - 2:] 105 | else: 106 | second_date = self.ntile_matrix.index[0] 107 | index_values = ([second_date - 1] + self.ntile_matrix.index.tolist()) 108 | 109 | out = pd.DataFrame(out, index=index_values) 110 | 111 | if self.market_neutral: 112 | # subtracting out universe returns 113 | ntile_cols = utils.get_ntile_cols(out) 114 | out.loc[:, ntile_cols] = out.loc[:, ntile_cols].subtract(out['universe'], axis=0) 115 | 116 | if not self.show_uni: 117 | out.drop('universe', axis=1, inplace=True) 118 | 119 | return out 120 | 121 | def _compute_daily_ntile_returns(self, ntile_matrix: np.array, asset_returns_matrix: np.array, ntile: int, 122 | holding_period: int) -> np.array: 123 | """ 124 | Computes the daily returns for a ntile 125 | :param ntile_matrix: the matrix of ntiles 126 | :param asset_returns_matrix: the matrix for returns 127 | :param ntile: the amount of ntiles we have computed 128 | :param 
holding_period: how long we are holding the assets for 129 | :return: 1d np.array of the daily return for the ntile 130 | """ 131 | 132 | # 133 | # Calculating the asset weight per day 134 | # 135 | weight_per_day = 1 / np.count_nonzero(ntile_matrix == ntile, axis=1) / holding_period 136 | if (weight_per_day > .05).any(): 137 | warnings.warn(f'We have {(weight_per_day > .05).sum()} assets in ntile {ntile} ' 138 | f'with daily weight over 5%.' 139 | f'Max weight is {round(weight_per_day.max(), 3)}') 140 | # weight_per_day = np.minimum(weight_per_day, np.full(weight_per_day.shape, .05)) 141 | 142 | raw_daily_weights = np.where(ntile_matrix == ntile, np.expand_dims(weight_per_day, axis=1), 0) 143 | daily_weights = utils.rolling_sum(raw_daily_weights, holding_period) 144 | 145 | weighted_asset_returns = daily_weights * asset_returns_matrix[holding_period - 1:, :] 146 | daily_returns = np.insert(np.sum(weighted_asset_returns, axis=1), 0, 0) 147 | 148 | self.record_backtest_components(ntile, daily_weights, weighted_asset_returns) 149 | 150 | return daily_returns 151 | 152 | def record_backtest_components(self, ntile, daily_weights, weighted_asset_returns): 153 | """ 154 | records the components to compute the backtest for a specific ntile 155 | :param ntile: the ntile the data is for 156 | :param daily_weights: the weights of each asset on the corresponding day 157 | :param weighted_asset_returns: the weighted returns of each asset 158 | :return: None 159 | """ 160 | self.daily_weights[f'Ntile: {ntile}'] = \ 161 | pd.DataFrame(daily_weights, index=self.daily_returns.index[self.holding_period - 1:], 162 | columns=self.daily_returns.columns) 163 | 164 | self.weighted_returns[f'Ntile: {ntile}'] = \ 165 | pd.DataFrame(weighted_asset_returns, index=self.daily_returns.index[self.holding_period - 1:], 166 | columns=self.daily_returns.columns) 167 | 168 | # 169 | # Visuals 170 | # 171 | def kick_visuals(self) -> None: 172 | """ 173 | controls displaying visuals to the user 174 | :return: None 175 | """ 176 | print('Ntile Backtest') 177 | cum_ret = stats.cum_returns(self._daily_tile_returns) 178 | 179 | # ntile stats 180 | ntile_cols = utils.get_ntile_cols(self._daily_tile_returns) 181 | ntile_daily_ret = self._daily_tile_returns[ntile_cols] 182 | ntile_cum_ret = cum_ret[ntile_cols] 183 | avg_annual_ret = stats.CAGR(ntile_cum_ret) 184 | # ntile plotting 185 | stats.generate_return_stats(ntile_daily_ret, self.market_neutral) 186 | freq = ntile_cum_ret.index.freq.name 187 | plotter.ntile_return_plot(ntile_cum_ret, f'Ntile Returns {self.holding_period}{freq} Holding Period') 188 | plotter.ntile_annual_return_bars(avg_annual_ret, self.holding_period, freq) 189 | 190 | if self.long_short: 191 | # spread stats 192 | spread_cols = utils.get_non_ntile_cols(self._daily_tile_returns) 193 | long_short_frame = self._daily_tile_returns[spread_cols] 194 | # spread plotting 195 | stats.generate_return_stats(long_short_frame, False) 196 | plotter.ntile_return_plot(cum_ret[spread_cols], 197 | f'Long Short Returns {self.holding_period}{freq} Holding Period') 198 | 199 | # 200 | # Data methods 201 | # 202 | def cum_ret_to_clipboard(self) -> None: 203 | """ 204 | write cumulative returns to clipboard 205 | :return: None 206 | """ 207 | stats.cum_returns(self._daily_tile_returns).to_clipboard() 208 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/base_tear.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, 
abstractmethod
2 | 
3 | 
4 | class BaseTear(ABC):
5 |     """
6 |     The base class for all tearsheets
7 |     """
8 | 
9 |     def __init__(self):
10 |         """
11 |         empty constructor
12 |         """
13 | 
14 |     def compute_plot(self) -> None:
15 |         """
16 |         method which calculates stats and plots the data for the tearsheet
17 |         :return: None
18 |         """
19 |         self.compute()
20 |         self.plot()
21 | 
22 |     @abstractmethod
23 |     def compute(self) -> None:
24 |         """
25 |         method which calculates stats for the tearsheet
26 |         :return: None
27 |         """
28 |         pass
29 | 
30 |     @abstractmethod
31 |     def plot(self) -> None:
32 |         """
33 |         method which plots data for the tearsheet
34 |         :return: None
35 |         """
36 |         pass
37 | 
--------------------------------------------------------------------------------
/ntiles/backtest/tears/ic_tear.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 | from typing import Iterable
3 | 
4 | import pandas as pd
5 | 
6 | from ntiles.backtest import plotter
7 | from ntiles.backtest.tears.base_tear import BaseTear
8 | from ntiles.backtest import utils
9 | 
10 | 
11 | class ICTear(BaseTear, ABC):
12 |     """
13 |     Computes IC from the given factor and returns
14 | 
15 |     Currently will only measure IC for days a company is in the universe
16 |     Example: AAPL is in the universe on 1/10 but not in the universe on 11/10; if we have a greater than 10 day holding period
17 |     that asset won't count in the IC calculation
18 |     """
19 | 
20 |     def __init__(self, factor_data: pd.DataFrame, daily_returns: pd.DataFrame, holding_period: int):
21 |         """
22 |         :param factor_data: factor data to look at, must be from Ntiles
23 |         :param daily_returns: daily returns we are calculating the IC on, must be from Ntiles
24 |         :param holding_period: Holding period we are calculating IC for
25 |         """
26 |         super().__init__()
27 |         self.factor_data = factor_data
28 |         self.daily_returns = daily_returns
29 |         self.holding_period = holding_period
30 | 
31 |         self.daily_ic = None
32 |         self.ic_stats = None
33 | 
34 |     #
35 |     # Calculation
36 |     #
37 |     def compute(self) -> None:
38 |         """
39 |         master function for computing the IC
40 |         :return: None
41 |         """
42 |         self.compute_daily_ic()
43 |         self.calculate_ic_table()
44 | 
45 |     def compute_daily_ic(self) -> None:
46 |         """
47 |         calculates and sets the daily IC for the holding period
48 |         :return: None
49 |         """
50 |         self.factor_data.index.names = ['date', 'id']
51 | 
52 |         # slicing off factor values we don't have forward return data for
53 |         factor_unstacked = self.factor_data['factor'].unstack()#.iloc[:-self.holding_period]
54 |         forward_returns = self.compute_forward_returns().reindex_like(factor_unstacked)
55 | 
56 |         ic_array = utils.correlation_2d(factor_unstacked.to_numpy(), forward_returns.to_numpy())
57 |         self.daily_ic = pd.Series(ic_array, index=forward_returns.index).to_frame('IC')
58 |         if self.daily_ic.index.freq.name == 'D':
59 |             self.daily_ic['1 Month Avg IC'] = self.daily_ic.rolling(21).mean()
60 |         else:
61 |             self.daily_ic['1 Year Avg IC'] = self.daily_ic.rolling(12).mean()
62 | 
63 |     def compute_forward_returns(self) -> pd.DataFrame:
64 |         """
65 |         Calculates self.holding_period forward returns from daily returns
66 |         :return: index: date; columns: asset; values: self.holding_period forward returns
67 |         """
68 |         # must pad an extra day due to cumprod making the first date nan
69 |         daily_ret = self.daily_returns  # utils.pad_extra_day(self.daily_returns, 0)
70 |         return daily_ret.add(1).cumprod().pct_change(self.holding_period).shift(-self.holding_period)
71 | 
72 |     def
calculate_ic_table(self) -> None: 73 | """ 74 | calculates summary stats for the IC data 75 | :return: None, sets self.ic_stats 76 | """ 77 | mean_ic = self.daily_ic['IC'].mean() 78 | std_ic = self.daily_ic['IC'].std() 79 | stats = { 80 | 'IC Mean': mean_ic, 81 | 'IC Median': self.daily_ic['IC'].median(), 82 | 'IC Std': std_ic, 83 | 'Risk Adjusted IC': mean_ic / std_ic, 84 | 'IC Skew': self.daily_ic['IC'].skew() 85 | } 86 | 87 | self.ic_stats = pd.Series(stats).round(3).to_frame(f'{self.holding_period}D').transpose() 88 | 89 | # 90 | # Plotting 91 | # 92 | def plot(self) -> None: 93 | """ 94 | plots the IC data in self.daily_ic 95 | :return: None 96 | """ 97 | print('Information Coefficient') 98 | plotter.render_table(self.ic_stats) 99 | plotter.plot_timeseries_ic(self.daily_ic, self.holding_period) 100 | # plotter.plot_ic_qq(self.daily_ic) 101 | # plotter.plot_ic_hist(self.daily_ic) 102 | 103 | # 104 | # To clipboard functions 105 | # 106 | def ic_to_clipboard(self) -> None: 107 | """ 108 | writes ic to the clipboard 109 | :return: None 110 | """ 111 | self.daily_ic.to_clipboard() 112 | 113 | 114 | class ICHorizonTear(BaseTear, ABC): 115 | """ 116 | Computes the IC horizon tear 117 | Will give insight into optimal holding periods for the factor 118 | """ 119 | 120 | def __init__(self, factor_data: pd.DataFrame, daily_returns: pd.DataFrame, intervals: Iterable[int], 121 | show_individual): 122 | """ 123 | :param factor_data: The factor values being tested, must be from Ntiles 124 | :param daily_returns: matrix of returns from Ntiles 125 | :param intervals: an iterable that contains the holding periods we would like to make the IC frontier for 126 | """ 127 | super().__init__() 128 | self._factor_data = factor_data 129 | self._daily_returns = daily_returns 130 | self._intervals = sorted(list(intervals)) 131 | self._show_individual = show_individual 132 | 133 | self.tears = {} 134 | self._ic_horizon = None 135 | 136 | def compute(self) -> None: 137 | """ 138 | runs a IC tear for all the periods we want to test over 139 | """ 140 | for interval in self._intervals: 141 | self.tears[interval] = ICTear(self._factor_data, self._daily_returns, interval) 142 | self.tears[interval].compute() 143 | 144 | self._ic_horizon = pd.concat([tear.ic_stats for tear in self.tears.values()]) 145 | 146 | def plot(self) -> None: 147 | """ 148 | plots the IC frontier and the Time series IC 149 | """ 150 | plotter.plot_ic_horizon(self._ic_horizon.drop(['IC Skew'], axis=1)) 151 | plotter.render_table(self._ic_horizon) 152 | if self._show_individual: 153 | for ic_tear in self.tears.values(): 154 | ic_tear.plot() 155 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/inspection_tear.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | 3 | from ntiles.backtest.tears.base_tear import BaseTear 4 | from ntiles.backtest import plotter 5 | 6 | 7 | class InspectionTear(BaseTear, ABC): 8 | """ 9 | creates a data inspection sheet 10 | """ 11 | 12 | def __init__(self, factor_data): 13 | """ 14 | :param factor_data: factor_data from Ntiles 15 | """ 16 | super().__init__() 17 | self._factor_data = factor_data 18 | 19 | def compute(self) -> None: 20 | """ 21 | kicks off the tearsheet 22 | :return: None 23 | """ 24 | self.make_summary() 25 | 26 | def plot(self) -> None: 27 | """ 28 | plots the tearsheet 29 | """ 30 | self.summary_plots() 31 | 32 | def make_summary(self) -> None: 33 | """ 34 | calculates the summary 
statics for the factor by Ntile 35 | """ 36 | quantile_stats = self._factor_data.groupby('ntile').agg(['median', 'std', 'min', 'max', 'count']).factor 37 | quantile_stats['count %'] = quantile_stats['count'] / quantile_stats['count'].sum() * 100 38 | 39 | # aesthetics 40 | quantile_stats = quantile_stats.round(2) 41 | quantile_stats.columns = [col.title() for col in quantile_stats.columns] 42 | quantile_stats.index.name = 'Ntile:' 43 | 44 | plotter.render_table(quantile_stats, 'Quantiles Statistics') 45 | 46 | def summary_plots(self) -> None: 47 | """ 48 | plots the the summary of the factor 49 | """ 50 | no_index_factor_data = self._factor_data.reset_index().dropna() 51 | date_agg = no_index_factor_data.groupby('date') 52 | date_ntile_agg = no_index_factor_data.groupby(['date', 'ntile']) 53 | 54 | plotter.plot_inspection_data(date_agg.factor.count(), 'Universe Count Of Factor Per Period', 'Count') 55 | plotter.plot_inspection_data(date_ntile_agg.factor.count().unstack(), 'Ntile Count of Factor Per Period', 56 | 'Count') 57 | plotter.plot_inspection_data(date_ntile_agg.factor.median().unstack(), 'Median Factor Value by Ntile', 'Median', 58 | 2) 59 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/tilts_backtest_tear.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Optional 3 | 4 | import pandas as pd 5 | 6 | from .backtest_tear import BacktestTear 7 | from .. import plotter 8 | from .. import utils 9 | 10 | from ..portals.base_portal import BaseGrouperPortalConstant 11 | 12 | 13 | class TiltsBacktestTear(BacktestTear, ABC): 14 | """ 15 | generates a tear sheet which shows the sector exposures of a strategy 16 | Must be run after the backtest tear 17 | """ 18 | 19 | def __init__(self, ntile_matrix: pd.DataFrame, daily_returns: pd.DataFrame, ntiles, holding_period: int, 20 | long_short: bool, market_neutral: bool, show_uni: bool, factor_data: pd.DataFrame, 21 | group_portal: Optional[BaseGrouperPortalConstant], show_ntile_tilts: bool): 22 | """ 23 | :param ntile_matrix: unstacked and formatted ntiles prepared by Ntiles 24 | :param daily_returns: unstacked and formatted daily returns from Ntiles 25 | :param holding_period: How long we want to hold positions for, represents days 26 | :param ntiles: amount of bins we are testing (1 is high factor value n is low value) 27 | :param long_short: show we compute the spread between ntiles: (1 - n) 28 | :param market_neutral: subtract out the universe returns from the ntile returns? 29 | :param show_uni: should universe return be shown in the spread plot? 30 | :param factor_data: the factor data from Ntiles 31 | :param group_portal: the group portal holding the groups. If this is None then the exposures will not be shown 32 | :param show_ntile_tilts: Should we show the exposures for each individual ntile? 
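(Illustrative worked example: tilts are computed as an ntile's weight in a group minus that group's weight in the whole universe, so if Tech makes up 20% of the universe but 35% of Ntile 1, the reported Ntile 1 tilt for Tech is +0.15.)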
33 | """ 34 | 35 | super().__init__(ntile_matrix, daily_returns, ntiles, holding_period, long_short, market_neutral, show_uni) 36 | self._factor_data = factor_data 37 | self._group_portal = group_portal 38 | self._show_ntile_tilts = show_ntile_tilts 39 | 40 | self._daily_group_weights = {} 41 | self._full_group_tilt_avg = {} 42 | 43 | def compute(self) -> None: 44 | """ 45 | master function for the tear sheet 46 | :return: None 47 | """ 48 | super().compute() 49 | 50 | if (self._group_portal is not None) and (self._show_ntile_tilts or self.long_short): 51 | self.compute_tilts() 52 | 53 | def plot(self) -> None: 54 | """ 55 | plots the tear sheet 56 | """ 57 | super().plot() 58 | if (self._group_portal is not None) and (self._show_ntile_tilts or self.long_short): 59 | self.make_plots() 60 | 61 | def compute_tilts(self): 62 | """ 63 | computes the daily tilt data for each group 64 | :return: None 65 | """ 66 | self.compute_group_weights() 67 | if self.long_short: 68 | self.calculate_long_short_tilts() 69 | 70 | def compute_group_weights(self): 71 | """ 72 | computes the weights by group for each ntile 73 | currently computes data but work because need a time series data adjusted for index constitutes 74 | have to use self.factor_data 75 | :return: None 76 | """ 77 | group_info = self._group_portal.group_information 78 | center_weight = group_info.groupby(group_info).count() / group_info.shape[0] 79 | center_weight = utils.remove_cat_index(center_weight) 80 | 81 | if self._show_ntile_tilts: 82 | ntile_keys = self.daily_weights.keys() 83 | else: 84 | ntile_keys = [min(self.daily_weights.keys()), max(self.daily_weights.keys())] 85 | 86 | new_col = self.daily_weights[ntile_keys[0]].columns.astype(str).map(self._group_portal.group_mapping) 87 | 88 | for ntile in ntile_keys: 89 | frame = self.daily_weights[ntile] 90 | frame.columns = new_col 91 | frame = self.daily_weights[ntile].stack().to_frame('weight') 92 | frame.index.names = ['date', 'group'] 93 | 94 | weights_unstacked = frame.groupby(['date', 'group']).sum().sub(center_weight, level=1, axis=0).unstack() 95 | weights_unstacked.columns = weights_unstacked.columns.droplevel(0) 96 | 97 | self._daily_group_weights[ntile] = weights_unstacked 98 | self._full_group_tilt_avg[ntile] = (frame.groupby('group').sum().weight 99 | / frame.index.levels[0].unique().shape[0] 100 | - center_weight) 101 | 102 | def calculate_long_short_tilts(self): 103 | """ 104 | calculates the time series tilts for the long short portfolio 105 | :return: None 106 | """ 107 | ntile_n = max(self._daily_group_weights.keys()) 108 | self._daily_group_weights['Long Short'] = (self._daily_group_weights['Ntile: 1'] 109 | - self._daily_group_weights[ntile_n]) 110 | self._full_group_tilt_avg['Long Short'] = self._daily_group_weights['Long Short'].stack().groupby( 111 | 'group').mean() 112 | 113 | def make_plots(self): 114 | print('Weights By Group') 115 | for ntile in self._daily_group_weights.keys(): 116 | if 'Long Short' == ntile and not self.long_short: 117 | continue 118 | if 'Ntile' in ntile and not self._show_ntile_tilts: 119 | continue 120 | ax = plotter.plot_tilt_hist(self._full_group_tilt_avg[ntile], ntile, self._group_portal.name) 121 | plotter.plot_tilts(self._daily_group_weights[ntile], ntile, self._group_portal.name, ax) 122 | -------------------------------------------------------------------------------- /ntiles/backtest/tears/turnover_tear.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 
from abc import ABC 4 | 5 | import duckdb 6 | import pandas as pd 7 | import numpy as np 8 | 9 | from .base_tear import BaseTear 10 | from .. import plotter, utils 11 | 12 | 13 | class TurnoverTear(BaseTear, ABC): 14 | """ 15 | Shows the turnover for a factor 16 | """ 17 | 18 | def __init__(self, factor_data: pd.DataFrame, holding_period: Union[int, List[int]]): 19 | super().__init__() 20 | self._factor_data = factor_data 21 | self._holding_period = holding_period 22 | 23 | self._auto_corr = None 24 | self._turnover = None 25 | self._summary_stats = dict() 26 | 27 | def compute(self) -> None: 28 | """ 29 | calculates the data for the tear 30 | """ 31 | 32 | self.calculate_autocorrelation() 33 | self.calculate_turnover() 34 | 35 | self.calculate_summary_stats() 36 | 37 | def plot(self) -> None: 38 | """ 39 | plots the tear 40 | """ 41 | self.plot_turnover() 42 | 43 | def calculate_autocorrelation(self) -> None: 44 | """ 45 | Calculates the auto correlation of the factor with a lag of self._holding_period 46 | 47 | calculates the autocorrelation of n and n - holding period 48 | """ 49 | factor_unstacked = self._factor_data['factor'].unstack() 50 | auto_corr_arr = utils.correlation_2d(factor_unstacked.to_numpy(), 51 | factor_unstacked.shift(self._holding_period).to_numpy()) 52 | 53 | self._auto_corr = pd.Series(auto_corr_arr, index=factor_unstacked.index) 54 | 55 | def calculate_turnover(self): 56 | """ 57 | Calculates the turnover of the top and bottom bin with a lag of self._holding_period 58 | 59 | calculates the turnover of n and n - holding period 60 | """ 61 | # getting frame of only the top and bottom bin 62 | max_ntile = self._factor_data['ntile'].max() 63 | turnover_frame = self._factor_data[['ntile']][self._factor_data['ntile'].isin([1, max_ntile])] 64 | turnover_frame['ntile_shifted'] = turnover_frame['ntile'].unstack().shift(self._holding_period).stack() 65 | turnover_frame['changed'] = turnover_frame['ntile'] != turnover_frame['ntile_shifted'] 66 | 67 | # fd = self._factor_data[['ntile']].reset_index() 68 | # fd['date'] = fd['date'].dt.to_timestamp() 69 | # max_ntile = fd['ntile'].max() 70 | # turnover_sql = f"""SELECT "date", "ntile", 71 | # "ntile" != lag("ntile", {self._holding_period}) OVER (PARTITION BY id ORDER BY "date") as "changed" 72 | # FROM fd 73 | # WHERE "ntile" in (1, {max_ntile})""" 74 | # 75 | # con = duckdb.connect(':memory:') 76 | # turnover_frame = con.execute(turnover_sql).df() 77 | # con.close() 78 | 79 | final_turnover = turnover_frame.groupby(['date', 'ntile']).changed.agg(sum=sum, count=len) 80 | 81 | self._turnover = (final_turnover['sum'] / final_turnover['count']).unstack() 82 | 83 | def calculate_summary_stats(self) -> None: 84 | """ 85 | sets the summary stats for the autocorelation and the turnover 86 | """ 87 | self._summary_stats['auto'] = self._auto_corr.agg( 88 | {'Mean AC': np.mean, 'Median AC': np.median, 'Std AC': np.std}).round(3).to_frame( 89 | f'{self._holding_period}D').transpose() 90 | 91 | self._summary_stats['turnover'] = self._turnover.stack().groupby('ntile').agg( 92 | **{'Mean Turnover': np.mean, 'Median Turnover': np.median, 'Std Turnover': np.std}).round(3) 93 | 94 | def plot_turnover(self) -> None: 95 | """ 96 | plots the time series data in self.auto_corr 97 | """ 98 | print('Autocorrelation') 99 | plotter.render_table(self._summary_stats['auto']) 100 | plotter.plot_auto_corr(self._auto_corr, self._holding_period) 101 | 102 | print('Turnover') 103 | plotter.render_table(self._summary_stats['turnover']) 104 | 
plotter.plot_turnover(self._turnover, self._holding_period) 105 | -------------------------------------------------------------------------------- /ntiles/backtest/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import numba as nb 4 | import duckdb 5 | from typing import Optional, List, Union 6 | 7 | 8 | def subset_frame(frame: pd.DataFrame, columns: Optional[List[str]]): 9 | """ 10 | subsets the given data frame by the given columns 11 | if the columns are none then the whole frame is returned 12 | :param frame: the dataframe to subset 13 | :param columns: the columns we are going to subset by, if none then nothing is done 14 | :return: Given frame subset by the given columns 15 | """ 16 | if columns: 17 | return frame[columns] 18 | return frame 19 | 20 | 21 | def get_ntile_cols(frame: pd.DataFrame) -> List[str]: 22 | """ 23 | :param frame: data frame to get columns of 24 | :return: all columns in the frame that contain 'Ntile' 25 | """ 26 | return [col for col in frame.columns if 'Ntile' in col] 27 | 28 | 29 | def get_non_ntile_cols(frame: pd.DataFrame) -> List[str]: 30 | """ 31 | :param frame: data frame to get columns of 32 | :return: all columns in the frame that dont contain 'Ntile' 33 | """ 34 | return [col for col in frame.columns if 'Ntile' not in col] 35 | 36 | 37 | def make_nan_inf_summary(df: Union[pd.DataFrame, pd.Series], max_loss: float, print_good: bool = True) -> pd.DataFrame: 38 | """ 39 | makes a summary fot the the amount of nan and infinity values in the given data frame 40 | will throw a ValueError if the percent of nan and inf is greater than the given threshold 41 | prints a summary of the nan's and inf of there are any 42 | :param df: the data frame we are checking 43 | :param max_loss: max decimal percent of nan and inf we are allowing the df to contain 44 | :param print_good: should we print the output if we dropped less then the threshold? 45 | :return: pandas data frame with the nan and inf dropped 46 | """ 47 | df_numpy = df.to_numpy() 48 | nan_array = np.isnan(df_numpy) 49 | finite_array = np.logical_or(np.isinf(df_numpy), np.isneginf(df_numpy)) 50 | 51 | if nan_array.any() or (not finite_array.all()): 52 | factor_length = len(df) 53 | amount_nan = nan_array.sum() 54 | amount_inf = finite_array.sum() 55 | total_percent_dropped = (amount_nan + amount_inf) / factor_length 56 | 57 | outString = f'Dropped {round(total_percent_dropped * 100, 2)}% of data. ' \ 58 | f'{round((amount_nan / factor_length) * 100, 2)}% due to nan, ' \ 59 | f'{round((amount_inf / factor_length) * 100, 2)}% of inf values. Threshold: {max_loss * 100}%\n' 60 | 61 | if total_percent_dropped > max_loss: 62 | raise ValueError('Exceeded Nan Infinity Threshold. 
' + outString)
63 | 
64 |         # print out string as a summary
65 |         if print_good:
66 |             print(outString)
67 | 
68 |         # dropping the nans and the infinity values
69 |         df = df.replace([np.inf, -np.inf], np.nan).dropna()
70 | 
71 |     elif print_good:
72 |         print('Dropped 0% of data')
73 | 
74 |     return df
75 | 
76 | 
77 | def rolling_sum(a, n):
78 |     """
79 |     rolling sum, column wise
80 |     :param a: array to roll and sum
81 |     :param n: length of rolling window
82 |     :return: a[n:, :] of rolling sum
83 |     """
84 |     if n == 1:
85 |         return a
86 | 
87 |     cum_sum = np.cumsum(a, axis=0)
88 |     cum_sum[n:, :] = cum_sum[n:, :] - cum_sum[:-n, :]
89 |     return cum_sum[n - 1:, :]
90 | 
91 | 
92 | @nb.njit(parallel=True)
93 | def correlation_2d(factor: np.array, returns: np.array) -> np.array:
94 |     """
95 |     calculates a timeseries of correlation for the given factor and forward returns
96 |     factor and returns must have EXACTLY the same structure and order of assets/days
97 |     think of each row as a group and we calculate the correlation by groups
98 | 
99 |     :param factor: 2d np.array, each row represents factor values for different assets on same day
100 |     :param returns: 2d np.array, each row represents forward returns for different assets on same day
101 |     :return: 1d np.array representing the time series of correlations between factor and forward returns
102 |     """
103 |     if factor.shape != returns.shape:
104 |         raise ValueError('Factor and returns dont represent same information')
105 | 
106 |     num_rows = factor.shape[0]
107 |     out = np.empty(shape=num_rows)
108 | 
109 |     for i in nb.prange(num_rows):
110 |         finite_mask = np.isfinite(factor[i]) & np.isfinite(returns[i])
111 |         out[i] = np.corrcoef(factor[i][finite_mask], returns[i][finite_mask])[0][1]
112 | 
113 |     return out
114 | 
115 | 
116 | def pad_extra_day(matrix_df: pd.DataFrame, pad_value: any) -> pd.DataFrame:
117 |     """
118 |     pads an unstacked frame with a single extra row at the start of the data frame
119 |     :param matrix_df: df to pad, index: pd.Period, columns: any, values: any
120 |     :param pad_value: constant value to insert into a row
121 |     :return: matrix_df with a padded value
122 |     """
123 |     out = matrix_df.copy()
124 |     new_period = (out.index.min().to_timestamp() - pd.DateOffset(1)).to_period('D')
125 |     out.loc[new_period, :] = np.full(shape=out.shape[1], fill_value=pad_value)
126 |     return out.sort_index()  # can make this function better without a sort
127 | 
128 | 
129 | def remove_cat_index(frame: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]:
130 |     """
131 |     if the frame has a categorical index it will remove it
132 |     :return: frame with the categorical index removed
133 |     """
134 |     if frame.index.is_categorical():
135 |         frame.index = frame.index.astype(str)
136 | 
137 |     return frame
138 | 
139 | 
140 | def convert_date_to_period(frame: Union[pd.DataFrame, pd.Series], freq: str = 'D', **kwargs) -> Union[
141 |     pd.DataFrame, pd.Series]:
142 |     """
143 |     converts the date column to a period if the date column is of type timestamp
144 |     if the 'date' column is a period then nothing will be changed
145 |     date can be in the index or columns
146 | 
147 |     :param frame: the frame containing the date column
148 |     :param freq: the freq for the period
149 |     :return: the same frame that was passed but with 'date' as a period.
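Example (illustrative, assuming a frame whose 'date' index level holds pd.Timestamps): convert_date_to_period(my_frame, freq='M') returns the same frame with 'date' converted to a monthly pd.Period.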
150 |     """
151 |     index_names = list(frame.index.names)
152 |     frame = frame.reset_index()
153 | 
154 |     if 'date' in frame.columns:
155 |         frame['date'] = frame['date'].dt.to_period(freq)
156 |         frame = frame.set_index(index_names)
157 |         return frame
158 | 
159 |     raise ValueError('"date" not found in data frame')
160 | 
161 | 
162 | def ntile(factor: pd.Series, ntiles: int) -> pd.DataFrame:
163 |     """
164 |     Universe relative Quantiles of a factor by day
165 |     Around 100X faster than pandas groupby qcut
166 | 
167 |     pd.DataFrame of ntiled factor
168 |     index: (pd.Period, _asset_id)
169 |     Columns: (factor, ntile)
170 |     Values: (factor value, Ntile corresponding to factor value)
171 | 
172 |     :param factor: same var as ntile_return_tearsheet
173 |     :param ntiles: same var as ntile_return_tearsheet
174 |     """
175 |     factor = factor.to_frame('factor').reset_index()
176 |     index_names = factor.columns.tolist()
177 |     index_names.remove('factor')
178 | 
179 |     date_is_period = isinstance(factor.date.dtype, pd.core.dtypes.dtypes.PeriodDtype)
180 |     if date_is_period:
181 |         factor['date'] = factor['date'].dt.to_timestamp()
182 | 
183 |     sql_quantile = f"""SELECT *, NTILE({ntiles}) OVER(PARTITION BY date ORDER BY factor.factor DESC) as ntile
184 |                        FROM factor
185 |                        WHERE factor.factor IS NOT NULL"""
186 |     con = duckdb.connect(':memory:')
187 |     factor_ntile = con.execute(sql_quantile).df()
188 |     con.close()
189 | 
190 |     if date_is_period:
191 |         factor_ntile['date'] = factor_ntile['date'].dt.to_period(freq='D')
192 | 
193 |     factor_ntile = factor_ntile.set_index(index_names)
194 |     return factor_ntile
195 | 
--------------------------------------------------------------------------------
/ntiles/examples/ic_ac.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/ic_ac.png
--------------------------------------------------------------------------------
/ntiles/examples/inspection_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/inspection_1.png
--------------------------------------------------------------------------------
/ntiles/examples/inspection_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/inspection_2.png
--------------------------------------------------------------------------------
/ntiles/examples/return_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/return_1.png
--------------------------------------------------------------------------------
/ntiles/examples/return_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/examples/return_2.png
--------------------------------------------------------------------------------
/ntiles/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/ntiles/tests/constitute_adjustment_test.py:
-------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pandas import ( 4 | Timestamp, 5 | DataFrame, 6 | concat, 7 | MultiIndex 8 | ) 9 | 10 | from ntiles.toolbox.constitutes.constitute_adjustment import ConstituteAdjustment 11 | from ntiles.toolbox.utils.date_config import DateConfig 12 | 13 | 14 | class ConstituteAdjustmentTest(unittest.TestCase): 15 | 16 | def examples(self): 17 | self.foo_constitutes = DataFrame(data=[ 18 | # symbol entered exited 19 | ['BOB', '2009-01-01', '2012-01-01'], # whole thing 20 | ['LARY', '2010-01-05', '2010-01-07'], # added and then exited 21 | ['JEFF', '2011-03-02', '2020-03-02']], # added too late 22 | columns=['symbol', 'from', 'thru'] 23 | ) 24 | self.date_config = DateConfig(freq='D', date_format='%Y-%m-%d', target_data_type='timestamp') 25 | self.ca = ConstituteAdjustment(id_col='symbol', date_config=self.date_config) 26 | self.ca.add_universe_info(universe=self.foo_constitutes, start_date='2010-01-04', end_date='2010-01-12', ) 27 | 28 | self.foo_data = DataFrame( 29 | data=[['BOB', '2010-01-04', 50], 30 | ['BOB', '2010-01-05', 51], 31 | ['BOB', '2010-01-06', 52], 32 | ['BOB', '2010-01-07', 53], 33 | # ['BOB', '2010-01-08', 54], this will be missing data 34 | ['BOB', '2010-01-11', 55], 35 | ['BOB', '2010-01-12', 56], 36 | ['LARY', '2010-01-04', 20], # should not be included 37 | ['LARY', '2010-01-05', 21], 38 | ['LARY', '2010-01-06', 22], 39 | ['LARY', '2010-01-07', 23], 40 | ['LARY', '2010-01-08', 24], # should not be included 41 | ['LARY', '2010-01-11', 25], # should not be included 42 | ['LARY', '2010-01-12', 26], # should not be included 43 | ['LARY', '2010-01-13', 27], # should not be included 44 | ['FOO', '2010-01-08', 0]], # should be ignored 45 | columns=['symbol', 'date', 'factor']) 46 | 47 | self.adjusted_foo = DataFrame( 48 | data=[['BOB', Timestamp('2010-01-04'), 50], 49 | ['BOB', Timestamp('2010-01-05'), 51], 50 | ['BOB', Timestamp('2010-01-06'), 52], 51 | ['BOB', Timestamp('2010-01-07'), 53], 52 | ['BOB', Timestamp('2010-01-08'), None], 53 | ['BOB', Timestamp('2010-01-11'), 55], 54 | ['BOB', Timestamp('2010-01-12'), 56], 55 | ['LARY', Timestamp('2010-01-05'), 21], 56 | ['LARY', Timestamp('2010-01-06'), 22], 57 | ['LARY', Timestamp('2010-01-07'), 23]], 58 | columns=['symbol', 'date', 'factor']).set_index(['date', 'symbol']) 59 | 60 | pricing_data = DataFrame( 61 | data=[['LARY', Timestamp('2010-01-08'), 24], 62 | ['LARY', Timestamp('2010-01-11'), 25], 63 | ['LARY', Timestamp('2010-01-12'), 26]], 64 | columns=['symbol', 'date', 'factor']).set_index(['date', 'symbol']) 65 | 66 | self.adjusted_pricing = concat([pricing_data, self.adjusted_foo]).sort_values(['symbol', 'date']) 67 | 68 | # 69 | # ************************************ add_universe_info ************************************ 70 | # 71 | 72 | def test_factor_add_universe_info(self): 73 | """ 74 | testing the index generation in add_universe_info 75 | has missing data (None), data that should not be included (yet to be added, has been removed) and 76 | irrelevant symbols 77 | """ 78 | self.examples() 79 | 80 | # for factors 81 | factor_components = [(Timestamp('2010-01-04'), 'BOB'), 82 | (Timestamp('2010-01-05'), 'BOB'), 83 | (Timestamp('2010-01-06'), 'BOB'), 84 | (Timestamp('2010-01-07'), 'BOB'), 85 | (Timestamp('2010-01-08'), 'BOB'), 86 | (Timestamp('2010-01-11'), 'BOB'), 87 | (Timestamp('2010-01-12'), 'BOB'), 88 | (Timestamp('2010-01-05'), 'LARY'), 89 | (Timestamp('2010-01-06'), 'LARY'), 90 | (Timestamp('2010-01-07'), 
'LARY')] 91 | 92 | self.assertTrue(MultiIndex.from_tuples(factor_components).equals(self.ca.factor_components)) 93 | 94 | def test_throw_column_error(self): 95 | """ 96 | ensuring a error will be thrown when the correct columns are not supplied 97 | """ 98 | self.examples() 99 | 100 | with self.assertRaises(ValueError) as em: 101 | self.ca.add_universe_info(start_date='2010-01-04', 102 | end_date='2010-01-12', 103 | universe=DataFrame(columns=['foo', 'foo1', 'foo2'])) 104 | self.assertEqual('Required column "symbol" is not present', str(em.exception)) 105 | 106 | def test_duplicate_symbols(self): 107 | """ 108 | Ensuring that passing a df with duplicate symbols will raise a ValueError 109 | """ 110 | self.examples() 111 | 112 | self.foo_constitutes.iat[1, 0] = 'BOB' 113 | 114 | with self.assertRaises(ValueError) as em: 115 | self.ca.add_universe_info(start_date='2010-01-04', 116 | end_date='2010-01-12', 117 | universe=self.foo_constitutes) 118 | self.assertEqual('The column symbol is 0.333 duplicates, 1 rows\n', str(em.exception)) 119 | 120 | # 121 | # ************************************ adjust_data_for_membership ************************************ 122 | # 123 | 124 | def test_adjust_data_for_membership(self): 125 | """ 126 | ensuring adjust_data_for_membership return the correct data frame 127 | data given has good data to index, not seen bad tickers, and tickers with dates out of bounds 128 | """ 129 | self.examples() 130 | filtered = self.ca.adjust_data_for_membership(data=self.foo_data) 131 | self.assertTrue(self.adjusted_foo['factor'].sort_index().equals(filtered.sort_index())) 132 | 133 | def test_throw_error_adjust_data_for_membership(self): 134 | """ 135 | ensuring adjust_data_for_membership throws error when not given symbols or date 136 | """ 137 | self.examples() 138 | 139 | with self.assertRaises(ValueError) as em: 140 | self.ca.adjust_data_for_membership(data=DataFrame(columns=['foo', 'notSymbol', 'factor'])) 141 | self.assertEqual('Required column "date" is not present', str(em.exception)) 142 | 143 | def test_no_index_set_adjust_data_for_membership(self): 144 | """ 145 | ensuring adjust_data_for_membership throws error when there is no index set 146 | AKA add_universe_info was never called 147 | """ 148 | self.examples() 149 | 150 | with self.assertRaises(ValueError) as em: 151 | ConstituteAdjustment().adjust_data_for_membership(data=self.foo_data) 152 | self.assertEqual('Universe is not set', str(em.exception)) 153 | 154 | 155 | if __name__ == '__main__': 156 | unittest.main() 157 | -------------------------------------------------------------------------------- /ntiles/tests/ml_factor_calculation_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from abc import ABC 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from toolbox.utils.ml_factor_calculation import ModelWrapper, calc_ml_factor, generate_indexes 8 | from toolbox.utils.ml_factor_calculation import SliceHolder 9 | 10 | 11 | class MyTestCase(unittest.TestCase): 12 | 13 | def examples(self): 14 | # index includes non trading days 15 | # exactly 60 occurrences of each ticker 16 | first = pd.Timestamp(year=2010, month=1, day=1) 17 | self.date_index = pd.MultiIndex.from_product( 18 | [pd.date_range(start=first, end=pd.Timestamp(year=2010, month=3, day=1)), 19 | ['BOB', 'JEFF', 'CARL']], names=['date', 'symbol']) 20 | 21 | self.expected_index_e5_10_30 = [ 22 | (SliceHolder(first, first + pd.Timedelta(days=29)), 23 | SliceHolder(first + 
pd.Timedelta(days=40), first + pd.Timedelta(days=44))), 24 | 25 | (SliceHolder(first, first + pd.Timedelta(days=34)), 26 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=49))), 27 | 28 | (SliceHolder(first, first + pd.Timedelta(days=39)), 29 | SliceHolder(first + pd.Timedelta(days=50), first + pd.Timedelta(days=54))), 30 | 31 | (SliceHolder(first, first + pd.Timedelta(days=44)), 32 | SliceHolder(first + pd.Timedelta(days=55), first + pd.Timedelta(days=59))) 33 | ] 34 | 35 | self.expected_index_e7_8_30 = [ 36 | (SliceHolder(first, first + pd.Timedelta(days=29)), 37 | SliceHolder(first + pd.Timedelta(days=37), first + pd.Timedelta(days=44))), 38 | 39 | (SliceHolder(first, first + pd.Timedelta(days=37)), 40 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=52))), 41 | 42 | (SliceHolder(first, first + pd.Timedelta(days=45)), 43 | SliceHolder(first + pd.Timedelta(days=53), first + pd.Timedelta(days=59))), 44 | ] 45 | 46 | self.expected_index_e5_10_30 = self.turn_to_datetime64(self.expected_index_e5_10_30) 47 | self.expected_index_e7_8_30 = self.turn_to_datetime64(self.expected_index_e7_8_30) 48 | 49 | self.expected_index_r5_10_30 = [ 50 | (SliceHolder(first, first + pd.Timedelta(days=29)), 51 | SliceHolder(first + pd.Timedelta(days=40), first + pd.Timedelta(days=44))), 52 | 53 | (SliceHolder(first + pd.Timedelta(days=5), first + pd.Timedelta(days=34)), 54 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=49))), 55 | 56 | (SliceHolder(first + pd.Timedelta(days=10), first + pd.Timedelta(days=39)), 57 | SliceHolder(first + pd.Timedelta(days=50), first + pd.Timedelta(days=54))), 58 | 59 | (SliceHolder(first + pd.Timedelta(days=15), first + pd.Timedelta(days=44)), 60 | SliceHolder(first + pd.Timedelta(days=55), first + pd.Timedelta(days=59))) 61 | ] 62 | 63 | self.expected_index_r7_8_30 = [ 64 | (SliceHolder(first, first + pd.Timedelta(days=29)), 65 | SliceHolder(first + pd.Timedelta(days=37), first + pd.Timedelta(days=44))), 66 | 67 | (SliceHolder(first + pd.Timedelta(days=8), first + pd.Timedelta(days=37)), 68 | SliceHolder(first + pd.Timedelta(days=45), first + pd.Timedelta(days=52))), 69 | 70 | (SliceHolder(first + pd.Timedelta(days=16), first + pd.Timedelta(days=45)), 71 | SliceHolder(first + pd.Timedelta(days=53), first + pd.Timedelta(days=59))), 72 | ] 73 | 74 | self.expected_index_r5_10_30 = self.turn_to_datetime64(self.expected_index_r5_10_30) 75 | self.expected_index_r7_8_30 = self.turn_to_datetime64(self.expected_index_r7_8_30) 76 | 77 | class FooModel(ModelWrapper, ABC): 78 | def fit_model(self, tf: pd.DataFrame, tt: pd.Series): 79 | pass 80 | 81 | self.fooModel = FooModel() 82 | 83 | self.foo_target = pd.Series(index=self.date_index, dtype='float64') 84 | self.foo_target.loc[:] = 0 85 | self.fooFeatures = pd.DataFrame(index=self.date_index) 86 | self.fooFeatures.loc[:] = 0 87 | 88 | # 89 | # ************************************ generate_indexes ************************************ 90 | # 91 | 92 | def test_expanding_generateIndexes(self): 93 | """ 94 | testing generate indexes using the expanding param 95 | Turning slice lists to string. 
Comparing equality of np.datetime64 is annoying 96 | """ 97 | self.examples() 98 | 99 | # no left over days all even slices 100 | returnedIndexesE10_5_30 = list( 101 | generate_indexes(data_index=self.date_index, eval_days=10, refit_every=5, expanding=30)) 102 | self.assertEqual(str(self.expected_index_e5_10_30), str(returnedIndexesE10_5_30)) 103 | 104 | # left over days last slice will be of size 1 105 | returnedIndexesE7_8_30 = list( 106 | generate_indexes(data_index=self.date_index, eval_days=7, refit_every=8, expanding=30)) 107 | self.assertEqual(str(self.expected_index_e7_8_30), str(returnedIndexesE7_8_30)) 108 | 109 | def test_rolling_generateIndexes(self): 110 | """ 111 | testing generate indexes using the rolling param 112 | Turning slice lists to string. Comparing equality of np.datetime64 is annoying 113 | """ 114 | self.examples() 115 | # no left over days all even slices 116 | returnedIndexesR10_5_30 = list( 117 | generate_indexes(data_index=self.date_index, eval_days=10, refit_every=5, rolling=30)) 118 | self.assertEqual(str(self.expected_index_r5_10_30), str(returnedIndexesR10_5_30)) 119 | 120 | # left over days last slice will be of size 1 121 | returnedIndexesR7_8_30 = list( 122 | generate_indexes(data_index=self.date_index, eval_days=7, refit_every=8, rolling=30)) 123 | 124 | self.assertEqual(str(self.expected_index_r7_8_30), str(returnedIndexesR7_8_30)) 125 | 126 | # 127 | # ************************************ calcMlFactor ************************************ 128 | # 129 | 130 | def test_negative_calcMlFactor(self): 131 | """ 132 | testing for error when eval_days, refit_every, expanding, rolling is less than one 133 | this also tests generate_indexes 134 | """ 135 | self.examples() 136 | 137 | # eval_days 138 | with self.assertRaises(ValueError) as em: 139 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=0, 140 | refit_every=1, expanding=1) 141 | self.assertEqual('eval_days and/or refit_every must be greater than zero', str(em.exception)) 142 | 143 | # refit_every 144 | with self.assertRaises(ValueError) as em: 145 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 146 | refit_every=0, expanding=1) 147 | self.assertEqual('eval_days and/or refit_every must be greater than zero', str(em.exception)) 148 | 149 | # expanding 150 | with self.assertRaises(ValueError) as em: 151 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 152 | refit_every=1, expanding=0) 153 | self.assertEqual('expanding must be greater than zero', str(em.exception)) 154 | 155 | # rolling 156 | with self.assertRaises(ValueError) as em: 157 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 158 | refit_every=1, rolling=0) 159 | self.assertEqual('rolling must be greater than zero', str(em.exception)) 160 | 161 | def test_rollingAndExpanding_calcMlFactor(self): 162 | """ 163 | testing for error when rolling days and expanding are both defined and not defined 164 | """ 165 | self.examples() 166 | 167 | with self.assertRaises(ValueError) as em: 168 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 169 | refit_every=1, rolling=1, expanding=1) 170 | self.assertEqual('minTrainDays and rollingDays can not both be defined', str(em.exception)) 171 | 172 | with self.assertRaises(ValueError) as em: 173 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, 
target=self.foo_target, eval_days=1, 174 | refit_every=1) 175 | self.assertEqual('minTrainDays or rollingDays must be defined', str(em.exception)) 176 | 177 | def test_contain_bad_val_calc_ml_factor(self): 178 | """ 179 | testing for when the given features and target have nan values 180 | """ 181 | self.examples() 182 | # features has a nan 183 | with self.assertRaises(ValueError) as em: 184 | self.fooFeatures[0] = 0.0 185 | self.fooFeatures.iat[1, 0] = np.nan 186 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 187 | refit_every=1) 188 | self.assertEqual('There are nan or inf values in the features', str(em.exception)) 189 | 190 | # features has a inf 191 | self.examples() 192 | with self.assertRaises(ValueError) as em: 193 | self.fooFeatures[0] = 0.0 194 | self.fooFeatures.iat[1, 0] = np.inf 195 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 196 | refit_every=1) 197 | self.assertEqual('There are nan or inf values in the features', str(em.exception)) 198 | 199 | # target has a nan 200 | self.examples() 201 | with self.assertRaises(ValueError) as em: 202 | self.foo_target.iat[1] = np.nan 203 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 204 | refit_every=1) 205 | self.assertEqual('There are nan or inf values in the target', str(em.exception)) 206 | 207 | # target has a inf 208 | self.examples() 209 | with self.assertRaises(ValueError) as em: 210 | self.foo_target.iat[1] = np.inf 211 | calc_ml_factor(model=self.fooModel, features=self.fooFeatures, target=self.foo_target, eval_days=1, 212 | refit_every=1) 213 | self.assertEqual('There are nan or inf values in the target', str(em.exception)) 214 | 215 | @staticmethod 216 | def turn_to_datetime64(convert): 217 | """ 218 | helper converts SliceHolder of pd.Timestamp to SliceHolder of np.datetime64 219 | """ 220 | return [(SliceHolder(s[0].start.to_datetime64(), s[0].end.to_datetime64()), 221 | SliceHolder(s[1].start.to_datetime64(), s[1].end.to_datetime64())) 222 | for s in convert] 223 | 224 | 225 | if __name__ == '__main__': 226 | unittest.main() 227 | -------------------------------------------------------------------------------- /ntiles/toolbox/__init__.py: -------------------------------------------------------------------------------- 1 | # constitutes 2 | from .constitutes.constitute_adjustment import ConstituteAdjustment 3 | 4 | # utils 5 | from .utils.format_data_alphalens import price_format_for_alphalens, factor_format_for_alphalens 6 | from .utils.ml_factor_calculation import calc_ml_factor 7 | from .utils.ml_factor_calculation import ModelWrapper 8 | from .utils.utils import factorize, rank, ntile 9 | from .utils.date_config import DateConfig 10 | 11 | # db functions 12 | from .db.read.query_constructor import QueryConstructor 13 | from .db.api.sql_connection import SQLConnection 14 | from .db.read.db_functions import table_info, db_tables 15 | from .db.write.create_tables import IngestDataBase 16 | from .db.read.universe import (ETFUniverse, 17 | clear_etf_universes, 18 | clear_built_universes, 19 | BuiltUniverse, 20 | dispatch_universe_path) 21 | 22 | __all__ = [ 23 | 'ConstituteAdjustment', 24 | 'price_format_for_alphalens', 25 | 'factor_format_for_alphalens', 26 | 'calc_ml_factor', 27 | 'ModelWrapper', 28 | 'factorize', 29 | 'rank', 30 | 'ntile', 31 | 'QueryConstructor', 32 | 'SQLConnection', 33 | 'table_info', 34 | 'IngestDataBase', 35 | 'ETFUniverse', 36 | 
'clear_etf_universes', 37 | 'clear_built_universes', 38 | 'BuiltUniverse', 39 | 'dispatch_universe_path', 40 | 'db_tables', 41 | 'DateConfig', 42 | ] 43 | -------------------------------------------------------------------------------- /ntiles/toolbox/constitutes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/constitutes/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/constitutes/constitute_adjustment.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Union 2 | 3 | import duckdb 4 | import pandas as pd 5 | 6 | from ntiles.toolbox.utils.date_config import DateConfig 7 | from ..db.read.query_constructor import QueryConstructor 8 | from ..db.api.sql_connection import SQLConnection 9 | from ..utils.handle_data import handle_duplicates 10 | 11 | # this allows compatibility with python 3.6 12 | try: 13 | import pandas_market_calendars as mcal 14 | except ImportError as e: 15 | pass 16 | 17 | 18 | class ConstituteAdjustment: 19 | """ 20 | Provides the functionality of indexing a data to match a universe 21 | Correctly identifying on what day which asset should be in/not in the data set based on given universe data 22 | """ 23 | 24 | def __init__(self, 25 | id_col: str = 'permno', 26 | date_config: DateConfig = None 27 | ): 28 | """ 29 | constructor for ConstituteAdjustment 30 | :param id_col: the asset identifier column for the data that will be passed 31 | :param date_type: should the date be outputted as a pd.Period or a pd.Timestamp? 32 | self._universe_factor: holds the index constitutes for the factor in a MultiIndex of date, 33 | self._id_col 34 | self._universe_pricing: holds the index constitutes for the pricing in a MultiIndex of date, 35 | self._id_col 36 | """ 37 | self._id_col = id_col 38 | self._date_config = date_config 39 | 40 | self._universe_factor: Optional[pd.MultiIndex] = None 41 | 42 | def add_universe_info(self, 43 | universe: pd.DataFrame, 44 | start_date: str, 45 | end_date: str, 46 | calender: str = 'NYSE' 47 | ) -> None: 48 | """ 49 | Adds universe data to the ConstituteAdjustment object 50 | Creates a factors index which is simply the range of "from" to "thru" 51 | 52 | :param universe: a pandas data frame containing index component information. 53 | MUST HAVE COLUMNS: self._id_col representing the asset identifier, 54 | 'from' start trading date on the index, 55 | 'thru' end trading date on the index, 56 | If 'from', 'thru' are not pd.TimeStamps than a date_format MUST BE PASSED. 
57 | if no date_format is passed its assumed that they are in a pd.TimeStamp object 58 | :param start_date: The first date we want to get data for 59 | :param end_date: The last first date we want to get data for 60 | :param calender: The trading calender we want to use to get the dates 61 | :return: None 62 | """ 63 | # making sure date and self._id_col are in the columns 64 | universe = _check_columns([self._id_col, 'from', 'thru'], universe) 65 | 66 | # will throw an error if there are duplicate self._id_col 67 | handle_duplicates(df=universe, out_type='ValueError', name=f'The column {self._id_col}', 68 | drop=False, subset=[self._id_col]) 69 | 70 | # making sure the dates are in the correct format 71 | universe = (self._date_config 72 | .copy(target_data_type='timestamp', resample=False, grouper_keys=[]) 73 | .configure_dates(universe, ['from', 'thru'])) 74 | 75 | relevant_cal = (mcal.get_calendar(calender) 76 | .valid_days(start_date=start_date, end_date=end_date) 77 | .to_frame(name='date')) 78 | relevant_cal = (self._date_config 79 | .copy(target_data_type='timestamp', resample=True, grouper_keys=[]) 80 | .configure_dates(relevant_cal, 'date') 81 | .set_index('date') 82 | .rename({'index': 'date'}, axis=1)['date']) 83 | 84 | # making a list of series to eventually concat 85 | indexes_factor: List[pd.Series] = [] 86 | 87 | for row in universe.iterrows(): 88 | symbol = row[1][self._id_col] 89 | 90 | # getting the relevant dates for the factor 91 | date_range_factors: pd.Series = relevant_cal.loc[row[1]['from']: row[1]['thru']] 92 | 93 | # converting to frame and then stacking gives us a df with the index we are making, also speed improvement 94 | indexes_factor.append( 95 | date_range_factors.to_frame(symbol).stack() 96 | ) 97 | 98 | # getting the index of the concatenated Series 99 | self._universe_factor = pd.concat(indexes_factor).index.set_names(['date', self._id_col]) 100 | 101 | def add_universe_info_from_db(self, 102 | assets: str, 103 | start_date: str, 104 | end_date: str, 105 | sql_con=None 106 | ) -> None: 107 | """ 108 | Same as add_universe_info but takes in universe info from the database, 109 | :param assets: The assets we want to get data for, Ex 'ETF_SPY' 110 | :param start_date: The first date we want to get data for string in %Y-%m-%d 111 | :param end_date: The last first date we want to get data for string in %Y-%m-%d 112 | :param sql_con: A connection to the sql database if not provided then will use default connection 113 | :return: None 114 | """ 115 | over_con = sql_con is None 116 | if sql_con is None: 117 | sql_con = SQLConnection(':memory:', close_key=self.__class__.__name__) 118 | raw_uni = (QueryConstructor(sql_con=sql_con, cache=False, freq=None) 119 | .query_universe_table(assets, fields=[self._id_col], start_date=start_date, 120 | end_date=end_date, override_sql_con=over_con) 121 | .order_by('date') 122 | .df) 123 | sql_con.close_with_key(self.__class__.__name__) 124 | self.add_universe_info_long(raw_uni, start_date, end_date) 125 | 126 | # raw_uni = (self._date_config 127 | # .copy(target_data_type='timestamp') 128 | # .configure_dates(raw_uni, 'date') 129 | # .set_index(['date', self._id_col])) 130 | # 131 | # missing_id_for = raw_uni.index.to_frame()[self._id_col].isnull().sum() / len(raw_uni) 132 | # print(f"Universe missing \"{self._id_col}\" for {round(missing_id_for * 100, 2)}% of data points") 133 | # 134 | # self._universe_factor = raw_uni.index.dropna() 135 | 136 | def add_universe_info_long(self, 137 | universe: pd.DataFrame, 138 | 
start_date: Union[pd.Timestamp, str] = None, 139 | end_date: Union[pd.Timestamp, str] = None 140 | ) -> None: 141 | """ 142 | Adds universe data to the ConstituteAdjustment object from a table with long format. 143 | :param universe: a pandas data frame containing universe component information. 144 | :param start_date: The first date we want to get data for 145 | :param end_date: The last first date we want to get data for 146 | :return: None 147 | """ 148 | universe = _check_columns([self._id_col, 'date'], universe)[['date', self._id_col]] 149 | universe = (self._date_config 150 | .copy(target_data_type='timestamp') 151 | .configure_dates(universe, 'date')) 152 | universe = universe[(universe['date'] > start_date) 153 | & (universe['date'] < end_date)] 154 | self._universe_factor = universe.set_index(['date', self._id_col]).index 155 | 156 | def adjust_data_for_membership(self, 157 | data: pd.DataFrame, 158 | ) -> pd.DataFrame: 159 | """ 160 | adjusts the data set accounting for when assets are a member of the index defined in add_universe_info. 161 | 162 | factor: 163 | Ex: AAPl joined S&P500 on 2012-01-01 and leaves 2015-01-01. GOOGL joined S&P500 on 2014-01-01 and is still 164 | in the index at the time of end_date passed in add_index_info. When passing data to the 165 | adjust_data_for_membership method it will only return AAPL factor data in range 166 | 2012-01-01 to 2015-01-01 and google data in the range of 2014-01-01 to the end_date. 167 | 168 | :param data: A pandas dataframe to be filtered. 169 | Must contain columns named self._id_col, 'date' otherwise can have as may columns as desired 170 | :param adjust_dates: If True then will adjust dates as depicted in date_config but will force timestamp output 171 | :return: An indexed data frame adjusted for when assets are in the universe 172 | """ 173 | # if the add_index_info is not defined then throw error 174 | if self._universe_factor is None: 175 | raise ValueError('Universe is not set') 176 | 177 | # making sure date and self._id_col are in the columns 178 | data = _check_columns(['date', self._id_col], data, False) 179 | 180 | # if adjust_dates: 181 | data = (self._date_config 182 | .copy(resample=False, target_data_type='timestamp') 183 | .configure_dates(data, 'date')) 184 | 185 | # dropping duplicates and throwing a warning if there are any 186 | data = handle_duplicates(df=data, out_type='Warning', name='Data', drop=True, subset=['date', self._id_col]) 187 | 188 | reindex_frame = self._fast_reindex(self._universe_factor, data) 189 | 190 | # if we have dataframe with 1 column then return series 191 | if reindex_frame.shape[1] == 1: 192 | return reindex_frame.iloc[:, 0] 193 | 194 | return reindex_frame 195 | 196 | def _fast_reindex(self, 197 | reindex_by: pd.MultiIndex, 198 | frame_to_reindex: pd.DataFrame 199 | ) -> pd.DataFrame: 200 | """ 201 | Quickly reindex a pandas dataframe using a join in duckdb 202 | :param reindex_by:Desired pandas Multiindex 203 | :param frame_to_reindex: Frame we are reindexing data from 204 | :return: Reindexed Dataframe 205 | """ 206 | reindex_by = reindex_by.to_frame() 207 | 208 | id_cols = f'reindex_by.date, reindex_by.{self._id_col}' 209 | factor_cols = ', '.join([col for col in frame_to_reindex.columns if col not in ['date', self._id_col]]) 210 | query = duckdb.query(f""" 211 | SELECT {id_cols}, {factor_cols} 212 | FROM reindex_by 213 | left join frame_to_reindex on (reindex_by.date = frame_to_reindex.date) 214 | and (reindex_by.{self._id_col} = frame_to_reindex.{self._id_col}); 215 | """) 
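        # duckdb.query() resolves `reindex_by` and `frame_to_reindex` by scanning the
        # calling frame for pandas DataFrames with those names (duckdb's replacement
        # scan), so the frames do not need to be registered explicitly. The LEFT JOIN
        # keeps every (date, id) pair in the universe index and attaches factor values
        # where they exist, leaving NULLs elsewhere, which is the fast "reindex" this
        # method provides.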
216 | 217 | return self._set_dates(query.to_df()).set_index(['date', self._id_col]) 218 | 219 | def _set_dates(self, 220 | df: pd.DataFrame 221 | ) -> pd.DataFrame: 222 | """ 223 | adjusts the date column according to the self._date_type 224 | :param df: the Dataframe which we are adjusting the 'date column' for 225 | :return: df with date columns adjusted 226 | """ 227 | return self._date_config.copy(resample=False).configure_dates(df, 'date') 228 | 229 | @property 230 | def factor_components(self) -> Optional[pd.MultiIndex]: 231 | """ 232 | :return: Mutable list of tuples which represent the factor index constitutes 233 | """ 234 | return self._universe_factor 235 | 236 | 237 | def _check_columns(needed: List[str], 238 | df: pd.DataFrame, 239 | index_columns: bool = True 240 | ) -> pd.DataFrame: 241 | """ 242 | helper to check if the required columns are present 243 | raises value error if a col in needed is not in givenCols 244 | :param needed: list of needed columns 245 | :param df: df of the factor data for the given data 246 | :param index_columns: should we index the columns specified in needed when returning the df 247 | :return: Given dataframe with the correct columns and range index 248 | """ 249 | if not isinstance(df.index, pd.core.indexes.range.RangeIndex): 250 | df = df.reset_index() 251 | 252 | for col in needed: 253 | if col not in df.columns: 254 | raise ValueError(f'Required column \"{col}\" is not present') 255 | 256 | if index_columns: 257 | return df[needed] 258 | 259 | return df 260 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/__init__.py: -------------------------------------------------------------------------------- 1 | from .api.sql_connection import SQLConnection 2 | from .read.query_constructor import QueryConstructor 3 | from .write.create_tables import IngestDataBase 4 | from .write.make_universes import compustat_us_universe, crsp_us_universe 5 | from .read.db_functions import table_info 6 | from .read.universe import clear_built_universes, clear_etf_universes 7 | from .read.cached_query import clear_cache 8 | 9 | __all__ = [ 10 | 'SQLConnection', 11 | 'QueryConstructor', 12 | 'IngestDataBase', 13 | 'compustat_us_universe', 14 | 'crsp_us_universe', 15 | 'table_info', 16 | 'clear_built_universes', 17 | 'clear_etf_universes', 18 | 'clear_cache' 19 | ] 20 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/db/api/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/db/api/sql_connection.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import duckdb 4 | 5 | from ntiles.toolbox.db.settings import DB_CONNECTION_STRING 6 | 7 | 8 | class SQLConnection: 9 | """ 10 | Provides a lazy connection to a duckdb database 11 | """ 12 | 13 | def __init__(self, connection_string: Optional[str] = None, read_only: bool = True, close_key=None) -> None: 14 | """ 15 | if the connection is a memory connection then read_only will be False 16 | :param connection_string: the path to the duck db database 17 | If not passed then will look in settings.py for the string 18 | :param close_key: the key to be passed in order to close the connection in 
self.close_with_key() 19 | :return: None 20 | """ 21 | self._read_only: bool = False if connection_string == ':memory:' else read_only 22 | self._close_key = close_key 23 | 24 | self._connection_string: str = self._get_connection_string(connection_string) 25 | self._db_connection: Optional[duckdb.DuckDBPyConnection] = None 26 | 27 | @staticmethod 28 | def _get_connection_string(connection_string: Optional[str]) -> str: 29 | """ 30 | Gets the connection string for the duckdb database 31 | defaults to the passed connection_string; if that is not given it falls back to settings.py 32 | :param connection_string: the path to the duckdb database 33 | :return: connection string to the duckdb database 34 | :raise ValueError: if the param connection_string and DB_CONNECTION_STRING are both None 35 | """ 36 | if connection_string is None: 37 | if DB_CONNECTION_STRING is None: 38 | raise ValueError('Must pass a connection string or set a connection string in settings.py') 39 | return DB_CONNECTION_STRING 40 | 41 | return connection_string 42 | 43 | def _get_db_connection(self) -> None: 44 | """ 45 | sets the connection to the duckdb database; if a connection is currently open it will be closed first 46 | :return: None 47 | """ 48 | if self._db_connection: 49 | self._db_connection.close() 50 | 51 | self._db_connection = duckdb.connect(database=self._connection_string, read_only=self._read_only) 52 | 53 | @property 54 | def con(self) -> duckdb.DuckDBPyConnection: 55 | """ 56 | :return: connection to duckdb database 57 | """ 58 | if self._db_connection is None: 59 | self._get_db_connection() 60 | 61 | return self._db_connection 62 | 63 | @property 64 | def read_only(self) -> bool: 65 | """ 66 | :return: Is the connection read only? 67 | """ 68 | return self._read_only 69 | 70 | def connection_string(self) -> str: 71 | """ 72 | returns the connection string 73 | """ 74 | return self._connection_string 75 | 76 | def set_read_only(self, read_only: bool) -> None: 77 | """ 78 | setter for read only 79 | will cause the old connection to be closed and a new connection to be created 80 | if the passed read_only != self._read_only 81 | :param read_only: should the database be read only?
82 | :return: None 83 | """ 84 | if read_only != self.read_only: 85 | self._read_only = read_only 86 | 87 | if self._db_connection: 88 | self._db_connection.close() 89 | self._db_connection = None 90 | 91 | def close(self) -> None: 92 | """ 93 | will close the sql connection regardless of self._close_key 94 | """ 95 | if self._db_connection: 96 | self._db_connection.close() 97 | self._db_connection = None 98 | 99 | def close_with_key(self, close_key: str): 100 | """ 101 | will close the sql connection if the passed close_key equals self._close_key 102 | """ 103 | if close_key == self._close_key and close_key is not None: 104 | self.close() 105 | 106 | def execute(self, sql: str, **kwargs) -> duckdb.DuckDBPyConnection: 107 | """ 108 | wrapper for self.con.execute(sql) 109 | :param sql: query to run 110 | :return: raw duckdb object containing the results of the query 111 | """ 112 | return self.con.execute(sql, **kwargs) 113 | 114 | def set_threads(self, num_threads: int) -> None: 115 | """ 116 | sets the number of threads duckdb should use 117 | :return: None 118 | """ 119 | self.con.execute(f'PRAGMA threads={num_threads};') 120 | 121 | def return_other_if_open(self, other, connection_string=None, read_only=None, close_key=None): 122 | """ 123 | returns other if other is not None and matches all given conditions, else returns self 124 | if a condition arg is None then that condition is not checked 125 | can remove the current connection from the program's scope if other is not None 126 | """ 127 | if other is None: 128 | return self 129 | if connection_string and other.connection_string() != connection_string: 130 | return self 131 | if read_only and other.read_only != read_only: 132 | return self 133 | if close_key and other._close_key != close_key: 134 | return self 135 | return other 136 | 137 | 138 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/db/read/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/cached_query.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import hashlib 3 | import os 4 | 5 | import pandas as pd 6 | 7 | from datetime import datetime 8 | 9 | from ntiles.toolbox.db.settings import CACHE_DIRECTORY 10 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 11 | 12 | 13 | class CachedQuery: 14 | """ 15 | Functionality to cache results of a QueryConstructor 16 | """ 17 | 18 | def __init__(self, query: str): 19 | """ 20 | :param query: the query we are looking at 21 | """ 22 | self._query = query 23 | self._query_hash = hashlib.sha224(query.encode()).hexdigest() 24 | # what the path should be to the cache file 25 | self._path = f'{CACHE_DIRECTORY}/{self._query_hash.upper()}.parquet' 26 | 27 | def is_query_cached(self) -> bool: 28 | """ 29 | checks to see if the query is cached 30 | """ 31 | return os.path.isfile(self._path) 32 | 33 | def cache_query(self, results: pd.DataFrame): 34 | """ 35 | caches the given results 36 | If the index is not a range index then the index will be written as a column, not an index 37 | """ 38 | if not isinstance(results.index, pd.RangeIndex): 39 | results = results.reset_index() 40 | 41 | # if any columns are a period type change them to timestamp 42 | con =
SQLConnection(':memory:') 43 | con.execute(f"COPY results TO '{self._path}' (FORMAT 'parquet')") 44 | con.close() 45 | print(f'Cached Query') 46 | 47 | def get_cached_query_path(self) -> str: 48 | """ 49 | gets the path to the cached query will rase ValueError if the query is not cached 50 | """ 51 | if self.is_query_cached(): 52 | return self._path 53 | raise ValueError('Query is not cached!') 54 | 55 | def get_cached_query_df(self) -> pd.DataFrame: 56 | """ 57 | gets the DataFrame contents of the cached query will rase ValueError if the query is not cached 58 | The index will be a default range index 59 | """ 60 | path = f"'{self.get_cached_query_path()}'" 61 | 62 | con = SQLConnection(':memory:') 63 | cached_results = con.execute(f"SELECT * FROM {path}").df() 64 | con.close() 65 | 66 | file_creation = datetime.fromtimestamp(os.stat(self._path).st_birthtime) 67 | file_age = (datetime.now() - file_creation).days 68 | 69 | print(f'Using {file_age} Day Old Cache') 70 | return cached_results 71 | 72 | 73 | def clear_cache(): 74 | files = glob.glob(f'{CACHE_DIRECTORY}/*.parquet') 75 | for f in files: 76 | os.remove(f) 77 | print('Cleared Cache') 78 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/db_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 4 | 5 | 6 | def table_info(table_name: str, con=None) -> pd.DataFrame: 7 | """ 8 | runs the table info PRAGMA query 9 | """ 10 | con = con if con else SQLConnection(close_key='table_info') 11 | info_df = con.execute(f"PRAGMA table_info('{table_name}');").fetchdf() 12 | con.close_with_key('table_info') 13 | return info_df 14 | 15 | 16 | def db_tables(con=None) -> pd.DataFrame: 17 | """ 18 | runs PRAGMA query to get all table names 19 | """ 20 | con = con if con else SQLConnection(close_key='db_tables') 21 | info_df = con.execute(f"PRAGMA show_tables;").fetchdf() 22 | con.close_with_key('db_tables') 23 | return info_df 24 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/read/universe.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os.path 3 | 4 | import pandas as pd 5 | 6 | from typing import Union 7 | 8 | from ntiles.toolbox.db.settings import ADD_ALL_LINKS_TO_PERMNO, ETF_UNI_DIRECTORY, BUILT_UNI_DIRECTORY 9 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 10 | 11 | # this allows compatibility with python 3.6 12 | try: 13 | import pandas_market_calendars as mcal 14 | except ImportError as e: 15 | pass 16 | 17 | MAP_ETF_SYMBOL_ID = {'SPY': 1021980, 18 | 'IWM': 1025818, 19 | 'IWV': 1025817} 20 | 21 | 22 | def dispatch_universe_path(uni_name, add_quotes=False, sql_con=None) -> str: 23 | """ 24 | gets the path to the given universe. 25 | Can pass any universe and will figure out the correct path to the universe. 26 | Assumes that the universe name is valid. 27 | 28 | :param uni_name: the name of the universe 29 | :param sql_con: a connection to the database 30 | :param add_quotes: should we add single quotes around the path? 
31 | :return: path to the universe 32 | """ 33 | # user passes a etf to use as universe 34 | if 'ETF' in uni_name: 35 | out = ETFUniverse(con=sql_con).get_universe_path_parse(uni_name) 36 | 37 | # user passes a built universe 38 | else: 39 | out = BuiltUniverse().get_universe_path(uni_name) 40 | 41 | if add_quotes: 42 | return f"'{out}'" 43 | 44 | return out 45 | 46 | 47 | class ETFUniverse: 48 | """ 49 | CLass to build universes from etf holdings. 50 | Will cache the universes in parquet files to be read into duckdb instances 51 | """ 52 | 53 | def __init__(self, con: SQLConnection = None): 54 | """ 55 | If the user would like to class this class mutable times then they must pass a connection to con 56 | :param con: A connection to the database, if not passed then will open a new connection. 57 | """ 58 | self._con = con if con else SQLConnection(close_key=self.__class__.__name__) 59 | 60 | def get_universe_df(self, ticker: str = None, crsp_portno: int = None, start_date: str = '2000', 61 | end_date: str = '2023') -> Union[pd.DataFrame, str]: 62 | """ 63 | gets the universe constitutes for an etf 64 | either ticker or crsp_portno must be passed but not both 65 | :param ticker: the ticker of the etf we are getting holdings for 66 | :param crsp_portno: the crsp_portno of the etf we are getting holdings for 67 | :param start_date: the date to start getting holdings for YYYY-MM-DD, no effect when caching 68 | :param end_date: the date to stop getting holdings YYYY-MM-DD, no effect when caching 69 | :return: pd.Dataframe index: int_range; columns: date, permno; 70 | """ 71 | 72 | self._input_checks(ticker=ticker, crsp_portno=crsp_portno) 73 | 74 | asset_id = self._get_crsp_portno(ticker=ticker, crsp_portno=crsp_portno) 75 | 76 | if not self._is_cached_etf(crsp_portno=asset_id): 77 | etf_uni = self._cache_etf(crsp_portno=asset_id) 78 | 79 | else: 80 | etf_uni = self._get_cached_etf(crsp_portno=asset_id) 81 | 82 | return etf_uni[(etf_uni['date'] > start_date) & (etf_uni['date'] < end_date)] 83 | 84 | def get_universe_path(self, ticker: str = None, crsp_portno: int = None): 85 | """ 86 | gets the SQL code to read cached universe constitutes for an etf 87 | either ticker or crsp_portno must be passed but not both 88 | if etf isn't cached then will cache the etf 89 | :param ticker: the ticker of the etf we are getting holdings for 90 | :param crsp_portno: the crsp_portno of the etf we are getting holdings for 91 | :return: SQL code to read the cached parquet 92 | """ 93 | self._input_checks(ticker=ticker, crsp_portno=crsp_portno) 94 | 95 | asset_id = self._get_crsp_portno(ticker=ticker, crsp_portno=crsp_portno) 96 | 97 | if not self._is_cached_etf(crsp_portno=asset_id): 98 | self._cache_etf(crsp_portno=asset_id) 99 | 100 | return self._get_cached_path(asset_id) 101 | 102 | def get_universe_path_parse(self, to_parse): 103 | """ 104 | wrapper that parses a string for get_universe_path, can tell if user passed a symbol or crsp_portno 105 | format: 106 | ticker: 107 | 'ETF_SPY' 108 | crsp_portno: 109 | 'ETF_5648362' 110 | """ 111 | to_parse = to_parse.upper() 112 | param_dict = self._parse_etf_uni_string(to_parse, param_dict={}) 113 | return self.get_universe_path(**param_dict) 114 | 115 | def get_universe_df_parse(self, to_parse, start_date: str = '2000', end_date: str = '2023'): 116 | """ 117 | wrapper that parses a string for get_universe_path, can tell if user passed a symbol or crsp_portno 118 | format: 119 | ticker: 120 | 'ETF_SPY' 121 | crsp_portno: 122 | 'ETF_5648362' 123 | """ 124 | param_dict 
= {'start_date': start_date, 'end_date': end_date} 125 | param_dict = self._parse_etf_uni_string(to_parse, param_dict=param_dict) 126 | return self.get_universe_df(**param_dict) 127 | 128 | def _cache_etf(self, crsp_portno) -> pd.DataFrame: 129 | """ 130 | gets and caches an etf holdings query 131 | will cache etf in temp directory of the computer 132 | :return: pd.Dataframe index: int_range; columns: date, permno; 133 | """ 134 | print('Caching ETF Holdings') 135 | 136 | sql_for_holdings = f""" 137 | SELECT DISTINCT date, permno 138 | FROM crsp.portfolio_holdings 139 | WHERE crsp_portno = {crsp_portno} AND 140 | permno IS NOT NULL 141 | """ 142 | raw_etf_holdings = self._con.execute(sql_for_holdings).fetchdf() 143 | self._con.close_with_key(close_key=self.__class__.__name__) 144 | 145 | df_of_holdings = raw_etf_holdings.set_index('date').groupby('date')['permno'].apply( 146 | lambda grp: list(grp.value_counts().index)) 147 | 148 | end_date = pd.Timestamp.now().date().strftime('%Y-%m-%d') 149 | start_date = df_of_holdings.index.min() 150 | 151 | full_cal = pd.date_range(start=start_date, end=end_date, freq='D').tz_localize(None) 152 | 153 | trading_cal = mcal.get_calendar( 154 | 'NYSE').valid_days(start_date=start_date, end_date=end_date).tz_localize( 155 | None) 156 | 157 | universe = df_of_holdings.reindex(full_cal.tolist()).ffill().reindex(trading_cal.tolist()).reset_index() 158 | 159 | relational_format = [(row[0], permno) for row in universe.values for permno in row[1]] 160 | uni_df = pd.DataFrame(relational_format, columns=['date', 'permno']) 161 | uni_df = self._link_to_ids(uni_df) 162 | 163 | self._cache_helper(uni_df=uni_df, crsp_portno=crsp_portno) 164 | 165 | return uni_df 166 | 167 | def _link_to_ids(self, uni_df: pd.DataFrame) -> pd.DataFrame: 168 | """ 169 | join cstat and ibes links to current universe df 170 | """ 171 | columns = ', '.join(['date', 'uni.permno', 'lpermco as permco', 'gvkey', 'liid as iid', 'ticker', 'cusip', 172 | "CASE WHEN gvkey NOT NULL THEN CONCAT(gvkey, '_', liid) ELSE NULL END as id"]) 173 | from_start = " uni_df as uni " 174 | 175 | sql_code = ADD_ALL_LINKS_TO_PERMNO.replace('--columns', columns).replace('--from', from_start) 176 | 177 | return self._con.con.execute(sql_code).fetchdf() 178 | 179 | def _get_crsp_portno(self, ticker, crsp_portno) -> int: 180 | """ 181 | if ticker is not none then will map the ticker to a crsp_portno 182 | :return: crsp_portno passed or the crsp_portno mapped to the symbol 183 | """ 184 | if crsp_portno: 185 | return crsp_portno 186 | 187 | mapped_id = self._con.execute(f"""SELECT distinct crsp_portno 188 | FROM crsp.fund_summary 189 | WHERE ticker = '{ticker}' AND 190 | crsp_portno IS NOT NULL""").fetchall() 191 | 192 | if len(mapped_id) == 0: 193 | self._con.close_with_key(close_key=self.__class__.__name__) 194 | raise ValueError(f"Ticker '{ticker}' is not valid cant map to crsp_portno") 195 | 196 | if len({x[0] for x in mapped_id}) > 1: 197 | # getting metadata of the portno's that mtched 198 | mapped_funds = self._con.execute(f"""SELECT DISTINCT crsp_portno, fund_name, m_fund, et_flag 199 | FROM crsp.fund_summary 200 | WHERE ticker = '{ticker}' AND 201 | crsp_portno IS NOT NULL""").fetchdf() 202 | self._con.close_with_key(close_key=self.__class__.__name__) 203 | 204 | raise ValueError(f"Ticker '{ticker}' mapped to {len(mapped_id)} crsp_portno's {mapped_id}. 
" 205 | f"Please specify the crsp_portno to build this etf's history\n" + 206 | mapped_funds.to_string(index=False)) 207 | 208 | return int(mapped_id[0][0]) 209 | 210 | @staticmethod 211 | def _input_checks(ticker, crsp_portno) -> None: 212 | """ 213 | input check for ticker and crsp_portno 214 | """ 215 | if ticker is None and crsp_portno is None: 216 | raise ValueError('Must pass a ticker or crsp_portno') 217 | 218 | if ticker is not None and crsp_portno is not None: 219 | raise ValueError('Must pass a ticker or crsp_portno, not both!') 220 | 221 | def _is_cached_etf(self, crsp_portno) -> bool: 222 | """ 223 | is the etf cached? 224 | """ 225 | return os.path.isfile(self._get_cached_path(crsp_portno)) 226 | 227 | def _cache_helper(self, uni_df, crsp_portno) -> None: 228 | """ 229 | Writes a parquet file to the user specified temp directory on a computer 230 | """ 231 | path = self._get_cached_path(crsp_portno) 232 | uni_df.to_parquet(path) 233 | print(f'Cached {crsp_portno} in {path}') 234 | 235 | def _get_cached_etf(self, crsp_portno) -> pd.DataFrame: 236 | """ 237 | returns a dataframe of the cached universe 238 | """ 239 | return pd.read_parquet(self._get_cached_path(crsp_portno)) 240 | 241 | @staticmethod 242 | def _get_cached_path(crsp_portno): 243 | """ 244 | :return: path to the cached file 245 | """ 246 | return f'{ETF_UNI_DIRECTORY}/etf_uni_{crsp_portno}.parquet' 247 | 248 | @staticmethod 249 | def _parse_etf_uni_string(to_parse: str, param_dict: dict) -> dict: 250 | """ 251 | adds 'ticker' or 'crsp_portno' a parameter dict 252 | :params to_parse: 253 | :params param_dict: dict which we will add 'ticker' or 'crsp_portno' to 254 | :return: param_dict with 'ticker' or 'crsp_portno' added 255 | """ 256 | to_parse = to_parse.upper() 257 | if 'ETF_' in to_parse: 258 | id_etf = to_parse.split('_')[-1] 259 | if id_etf.isdigit(): 260 | param_dict['crsp_portno'] = to_parse.split('_')[-1] 261 | else: 262 | param_dict['ticker'] = to_parse.split('_')[-1] 263 | else: 264 | raise ValueError(f"Can't parse {to_parse}") 265 | 266 | return param_dict 267 | 268 | 269 | class BuiltUniverse: 270 | """ 271 | Gets and validates path to a built universe 272 | """ 273 | 274 | def get_universe_path(self, uni_name) -> str: 275 | """ 276 | gets the path to the parquet file of the given universe 277 | :param uni_name: the name of the universe ex: CRSP_US_1000 278 | :return: the full path to the given universe 279 | :raises: ValueError if given uni_name is invalid 280 | """ 281 | self._ensure_universe_exists(uni_name) 282 | return self._get_path(uni_name) 283 | 284 | def _ensure_universe_exists(self, uni_name): 285 | """ 286 | checks to see if the universe exisis 287 | """ 288 | if not os.path.isfile(self._get_path(uni_name)): 289 | raise ValueError(f'Universe {uni_name} does not exist!') 290 | 291 | @staticmethod 292 | def _get_path(uni_name): 293 | """ 294 | creates what the path should be to the universe file 295 | """ 296 | return f'{BUILT_UNI_DIRECTORY}/{uni_name.upper()}.parquet' 297 | 298 | 299 | def clear_etf_universes(): 300 | """ 301 | Clears all parquet files in the ETF_UNI_DIRECTORY path 302 | """ 303 | files = glob.glob(f'{ETF_UNI_DIRECTORY}/*.parquet') 304 | for f in files: 305 | os.remove(f) 306 | print('Cleared ETF Universes') 307 | 308 | 309 | def clear_built_universes(): 310 | """ 311 | Clears all parquet files in the BUILT_UNI_DIRECTORY path 312 | """ 313 | files = glob.glob(f'{BUILT_UNI_DIRECTORY}/*.parquet') 314 | for f in files: 315 | os.remove(f) 316 | print('Cleared Built Universes') 
317 | 318 | 319 | if __name__ == '__main__': 320 | print(ETFUniverse().get_universe_df_parse(to_parse='ETF_1021980', start_date='2017')) 321 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Define Global Settings 3 | """ 4 | 5 | DB_CONNECTION_STRING = '/Users/alex/Desktop/DB/wrds.duckdb' # the directory to the sql database 6 | CACHE_DIRECTORY = '/tmp' # the directory to cache files, QueryConstructor gets cached here 7 | ETF_UNI_DIRECTORY = '/tmp' # '/Users/alex/Desktop/DB/universes/etf' # the directory to save ETF Universes 8 | BUILT_UNI_DIRECTORY = '/Users/alex/Desktop/DB/universes/built' # directory to save custom-built universes 9 | 10 | DB_ADJUSTOR_FIELDS = { 11 | 'cstat.sd': [ 12 | { 13 | 'adjustor': 'ajexdi', 14 | 'fields_to_adjust': ['prccd', 'prcod', 'prchd', 'prcld', 'eps'], 15 | 'operation': '/' 16 | }, 17 | { 18 | 'adjustor': 'ajexdi', 19 | 'fields_to_adjust': ['cshoc', 'cshtrd'], 20 | 'operation': '*' 21 | } 22 | ], 23 | 'crsp.sd': [ 24 | { 25 | 'adjustor': 'cfacpr', 26 | 'fields_to_adjust': ['prc', 'openprc', 'askhi', 'bidlo', 'bid', 'ask'], 27 | 'operation': '/', 28 | 'function': 'ABS' 29 | }, 30 | { 31 | 'adjustor': 'cfacshr', 32 | 'fields_to_adjust': ['vol', 'shrout'], 33 | 'operation': '*' 34 | } 35 | 36 | ], 37 | 'crsp.sm': [ 38 | { 39 | 'adjustor': 'cfacpr', 40 | 'fields_to_adjust': ['prc', 'openprc', 'askhi', 'bidlo', 'bid', 'ask', 'altprc'], 41 | 'operation': '/', 42 | 'function': 'ABS' 43 | }, 44 | { 45 | 'adjustor': 'cfacshr', 46 | 'fields_to_adjust': ['vol', 'shrout'], 47 | 'operation': '*' 48 | } 49 | 50 | ], 51 | 'cstat.sm': [ 52 | { 53 | 'adjustor': 'ajexm', 54 | 'fields_to_adjust': ['prccm', 'prchm', 'prclm'], 55 | 'operation': '/' 56 | }, 57 | { 58 | 'adjustor': 'ajexm', 59 | 'fields_to_adjust': ['cshom', 'cshtrm'], 60 | 'operation': '*' 61 | } 62 | ], 63 | 'cstat.funda': [ 64 | {'fields_to_adjust': []} 65 | ], 66 | 67 | 'wrds.firm_ratios': [ 68 | {'fields_to_adjust': []} 69 | ], 70 | 'ibes.summary_price_target': [ 71 | {'fields_to_adjust': []} 72 | ] 73 | } 74 | 75 | DB_ADJUSTOR_FIELDS['sd'] = DB_ADJUSTOR_FIELDS['cstat.sd'] 76 | DB_ADJUSTOR_FIELDS['main.sd'] = DB_ADJUSTOR_FIELDS['cstat.sd'] 77 | 78 | # sql code to link permno to cstat and ibes 79 | ADD_ALL_LINKS_TO_PERMNO = """ 80 | ( 81 | SELECT --columns 82 | FROM 83 | --from LEFT JOIN link.crsp_cstat_link AS ccm ON (uni.permno = ccm.lpermno AND uni.date >= ccm.linkdt 84 | AND uni.date <= ccm.linkenddt AND (ccm.linktype = 'LU' OR ccm.linktype = 'LC')) 85 | LEFT JOIN link.crsp_ibes_link AS crib ON (uni.permno = crib.permno AND uni.date >= crib.sdate 86 | AND uni.date <= crib.edate) 87 | ) 88 | """ 89 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/write/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/db/write/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/db/write/create_tables.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Dict, List 3 | 4 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 5 | 6 | logging.basicConfig(format='%(message)s ::: %(asctime)s', datefmt='%I:%M:%S %p', level=logging.INFO) 7 
| 8 | 9 | class IngestDataBase: 10 | def __init__(self, connection_string: str = None): 11 | """ 12 | :param connection_string: optional string connection to the database 13 | if none is given then will fall back onto path in settings.py 14 | """ 15 | self._sql_api = SQLConnection(connection_string=connection_string, read_only=False) 16 | 17 | def ingest(self, to_insert: List[Dict[str, str]], overwrite: bool = False, rows_to_interpret: int = 5_000, 18 | close: bool = True) -> None: 19 | """ 20 | will ingest the files specified by to_insert 21 | :param to_insert: A dictionary containing the schema, tablename and file path for a 22 | table that should be inserted into the db 23 | [{ 24 | 'schema': 'sch1', 25 | 'table': 'tbl1', 26 | 'file_path': 'full/path/to/file', 27 | 'custom': "UPDATE sch1.tbl1 SET LINKENDDT=99991231 WHERE LINKENDDT = 'E';", 28 | 'rename': {'datadate': 'date'}, 29 | 'alter_type': {'gsector': 'VARCHAR', 'date': ['timestamp', '%Y%m%d']}, 30 | 'index': [{'name': 'ixd2', 'column': 'col1'}, {'name': 'idx2', 'column': 'col2'}] 31 | 'where': "date > '2000'" 32 | 'from': "AS data JOIN crsp.crsp_cstat_link as link on data.permno = link.lpermno" 33 | 'rows_to_interpret': 500_000 34 | }] 35 | :param overwrite: should the tables be overwritten if they exist? 36 | :param rows_to_interpret: how many rows should we read to determine the types 37 | :param close: should we close the sql connection after everything is inserted? 38 | :return: None 39 | """ 40 | try: 41 | for tbl_to_create in to_insert: 42 | logging.info(f'Inserting {tbl_to_create["schema"]}.{tbl_to_create["table"]}') 43 | self._create_schema(tbl_to_create) # creates schema 44 | self._drop(tbl_to_create, overwrite) # drops tbl if user wants to 45 | self._create_tbl(tbl_to_create, rows_to_interpret) # writing table 46 | self._custom_sql(tbl_to_create) # letting the user run any sql code 47 | self._rename_columns(tbl_to_create) # renaming columns 48 | self._alter_types(tbl_to_create) # changing types of data 49 | self._to_lowercase(tbl_to_create) # making all column names lowercase 50 | self._create_index(tbl_to_create) # making indexes 51 | 52 | except Exception as e: 53 | self._sql_api.close() 54 | raise e 55 | 56 | if close: 57 | self._sql_api.close() 58 | logging.info('Closed SQL Connection') 59 | 60 | def _create_schema(self, tbl_to_create) -> None: 61 | """ 62 | :param tbl_to_create: dict defining the table we want to create 63 | :return: None 64 | """ 65 | sql_query = f"""CREATE SCHEMA IF NOT EXISTS {tbl_to_create['schema']};""" 66 | self._sql_api.execute(sql_query) 67 | 68 | def _drop(self, tbl_to_create, overwrite) -> None: 69 | """ 70 | Drpos a table if it exists and the user wants to drop the table 71 | :param tbl_to_create: dict defining the table we want to drop 72 | :param overwrite: should we drop the table? 
73 | :return: None 74 | """ 75 | if overwrite: 76 | tbl_name = self._get_table_name(tbl_to_create) 77 | sql_query = f"""DROP TABLE IF EXISTS {tbl_name};""" 78 | self._sql_api.execute(sql_query) 79 | 80 | def _create_tbl(self, tbl_to_create, rows_to_interpret) -> None: 81 | """ 82 | inserts a table into the specified schema and table name 83 | no adjustments are done to the table or types declared 84 | :param tbl_to_create: dict defining the table we want to create 85 | :return: None 86 | """ 87 | tbl_name = self._get_table_name(tbl_to_create) 88 | 89 | rows_to_interpret = tbl_to_create[ 90 | 'rows_to_interpret'] if 'rows_to_interpret' in tbl_to_create else rows_to_interpret 91 | 92 | where_clause = f"WHERE {tbl_to_create.get('where')}" if tbl_to_create.get('where') else '' 93 | from_clause = tbl_to_create.get('from') if tbl_to_create.get('from') else '' 94 | 95 | sql_query = f""" 96 | CREATE TABLE {tbl_name} AS 97 | SELECT * 98 | FROM read_csv_auto('{tbl_to_create['file_path']}', SAMPLE_SIZE={rows_to_interpret}) {from_clause} 99 | {where_clause}""" 100 | 101 | self._sql_api.execute(sql_query) 102 | 103 | logging.info(f'\tCreated table {tbl_to_create["schema"]}.{tbl_to_create["table"]}') 104 | 105 | def _custom_sql(self, tbl_to_create): 106 | """ 107 | lets the user run any sql code they want 108 | :param tbl_to_create: dict defining the table we want to create 109 | :return: None 110 | """ 111 | 112 | if 'custom' not in tbl_to_create: 113 | return 114 | 115 | self._sql_api.execute(tbl_to_create['custom']) 116 | 117 | logging.info('\tRan custom sql code') 118 | 119 | def _rename_columns(self, tbl_to_create) -> None: 120 | """ 121 | renames the columns specified by the user 122 | :param tbl_to_create: dict defining the table we want to create 123 | :return: None 124 | """ 125 | # if there are no columns to rename then return 126 | if 'rename' not in tbl_to_create: 127 | return 128 | 129 | tbl_name = self._get_table_name(tbl_to_create) 130 | 131 | for col_to_rename in tbl_to_create['rename']: 132 | sql_query = f"""ALTER TABLE {tbl_name} RENAME COLUMN 133 | {col_to_rename} TO {tbl_to_create['rename'][col_to_rename]};""" 134 | self._sql_api.execute(sql_query) 135 | logging.info(f'\tRenamed {col_to_rename} -> {tbl_to_create["rename"][col_to_rename]}') 136 | 137 | def _alter_types(self, tbl_to_create) -> None: 138 | """ 139 | alters the types of columns according to the user 140 | :param tbl_to_create: dict defining the table we want to create 141 | :return: None 142 | """ 143 | # if there are no columns to alter types then return 144 | if 'alter_type' not in tbl_to_create: 145 | return 146 | 147 | tbl_name = self._get_table_name(tbl_to_create) 148 | 149 | for col_to_alter in tbl_to_create['alter_type']: 150 | # should we do a timestamp parse? 
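            # as shown in the ingest() docstring, an alter_type entry like
            # {'date': ['timestamp', '%Y%m%d']} is converted to a TIMESTAMP via strptime,
            # while a plain string entry like {'gsector': 'VARCHAR'} falls through to the
            # generic ALTER ... TYPE branch below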
151 | if tbl_to_create['alter_type'][col_to_alter][0] == 'timestamp': 152 | date_format = tbl_to_create['alter_type'][col_to_alter][1] 153 | sql_query = f"""ALTER TABLE {tbl_name} ALTER {col_to_alter} TYPE varchar; 154 | ALTER TABLE {tbl_name} ALTER {col_to_alter} SET DATA TYPE 155 | TIMESTAMP USING strptime({col_to_alter}, '{date_format}')""" 156 | 157 | else: 158 | sql_query = f"""ALTER TABLE {tbl_name} ALTER {col_to_alter} TYPE 159 | {tbl_to_create['alter_type'][col_to_alter]};""" 160 | 161 | self._sql_api.execute(sql_query) 162 | logging.info(f'\tAltered column {col_to_alter}') 163 | 164 | def _create_index(self, tbl_to_create) -> None: 165 | """ 166 | creates indexes for a table 167 | :param tbl_to_create: dict defining the table we want to create 168 | :return: None 169 | """ 170 | 171 | # if there are no columns to index then return 172 | if 'index' not in tbl_to_create: 173 | return 174 | 175 | tbl_name = self._get_table_name(tbl_to_create) 176 | 177 | for idx in tbl_to_create['index']: 178 | sql_query = f"""CREATE INDEX {idx['name']} ON {tbl_name} ({idx['column']});""" 179 | self._sql_api.execute(sql_query) 180 | 181 | logging.info(f'\tCreated index {idx["name"]} using {idx["column"]}') 182 | 183 | def _to_lowercase(self, tbl_to_create) -> None: 184 | """ 185 | turns all columns in a table to lowercase 186 | :param tbl_to_create: dict defining the table we want to create 187 | :return: None 188 | """ 189 | tbl_name = self._get_table_name(tbl_to_create) 190 | 191 | cols = self._sql_api.execute(f'PRAGMA table_info({tbl_name})').fetchdf().name 192 | 193 | for col in cols: 194 | self._sql_api.execute(f"""ALTER TABLE {tbl_name} RENAME COLUMN "{col}" TO "{col.lower()}";""") 195 | 196 | logging.info('\tSuccessfully made all columns lowercase') 197 | 198 | @staticmethod 199 | def _get_table_name(tbl_to_create) -> str: 200 | """ 201 | gets the table name for a tbl_to_create 202 | :param tbl_to_create: dict defining the table we want to create 203 | :return: table name 204 | """ 205 | return f"{tbl_to_create['schema']}.{tbl_to_create['table']}" 206 | -------------------------------------------------------------------------------- /ntiles/toolbox/db/write/make_universes.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import pandas as pd 4 | 5 | from ntiles.toolbox.db.api.sql_connection import SQLConnection 6 | from ntiles.toolbox.db.settings import ADD_ALL_LINKS_TO_PERMNO, BUILT_UNI_DIRECTORY 7 | 8 | # this allows compatibility with python 3.6 9 | try: 10 | import pandas_market_calendars as mcal 11 | except ImportError as e: 12 | pass 13 | 14 | logging.basicConfig(format='%(message)s ::: %(asctime)s', datefmt='%I:%M:%S %p', level=logging.INFO) 15 | 16 | 17 | def compustat_us_universe(max_rank: int, min_rank: int = 1, start_date: str = '2000', 18 | rebuild_mc_ranking: bool = False) -> None: 19 | """ 20 | generates US daily indexes for compustat daily security file 21 | only will use the primary share for a company 22 | will generate a table called universe.US_min_rank_max_rank, ex US_0_3000 23 | :param max_rank: the max market cap rank for a company to be in the universe 24 | :param min_rank: the min market cap rank for a company in the universe 25 | :param start_date: the minimum date for creating the universe 26 | :param set_indexes: Should we index the universe by 27 | :return: None 28 | """ 29 | 30 | table_name = f'CSTAT_US{"" if min_rank == 1 else "_" + str(min_rank)}_{max_rank}' 31 | write_path = 
f'{BUILT_UNI_DIRECTORY}/{table_name}.parquet' 32 | 33 | if rebuild_mc_ranking: 34 | _make_cstat_us_universe_base_table() 35 | else: 36 | logging.info(f'Using Prior Build of universe.cstat_mc_rank') 37 | 38 | logging.info(f'Creating table {table_name}') 39 | 40 | sql_make_universe_table = f""" 41 | COPY 42 | ( 43 | SELECT date, gvkey, iid, id, ttm_min_prccd, ttm_mc, ttm_mc_rank 44 | FROM universe.temp_rank_cstat_mc 45 | WHERE ttm_mc_rank >= {min_rank} AND 46 | ttm_mc_rank <= {max_rank} AND 47 | date > '{start_date}' 48 | ) 49 | TO '{BUILT_UNI_DIRECTORY}/{table_name}.parquet' (FORMAT 'parquet') 50 | """ 51 | # making the db connection 52 | con = SQLConnection(read_only=False).con 53 | 54 | con.execute(sql_make_universe_table) 55 | con.close() 56 | 57 | logging.info(f'Wrote Table {table_name} To {write_path}') 58 | 59 | 60 | def crsp_us_universe(max_rank: int, min_rank: int = 1, start_date: str = '1980', 61 | rebuild_mc_ranking: bool = False, link: bool = True) -> None: 62 | """ 63 | Generates a universe of the top N stocks domiciled in the US by market cap 64 | Will only use companies primary share 65 | :param max_rank: the max market cap rank for a company to be in the universe 66 | :param min_rank: the min market cap rank for a company in the universe 67 | :param start_date: the minimum date for creating the universe 68 | :param set_indexes: Should we index the universe by 69 | :param rebuild_mc_ranking: should we rebuild the ranking table universe.crsp_mc_rank? 70 | :param link: should we link to cstat and ibes 71 | :return: None 72 | """ 73 | # getting the trading calendar so we dont have bad dates 74 | trading_cal = mcal.get_calendar( 75 | 'NYSE').valid_days(start_date=start_date, end_date=pd.to_datetime('today')).to_series().to_frame('trading_days') 76 | 77 | table_name = f'CRSP_US{"" if min_rank == 1 else "_" + str(min_rank)}_{max_rank}' 78 | write_path = f'{BUILT_UNI_DIRECTORY}/{table_name}.parquet' 79 | 80 | if rebuild_mc_ranking: 81 | _make_crsp_us_universe_base_table() 82 | else: 83 | logging.info(f'Using Prior Build of universe.crsp_mc_rank') 84 | 85 | logging.info(f'Creating table {table_name}') 86 | 87 | sql_make_universe_table = f""" 88 | ( 89 | SELECT date, permno, permco, ttm_min_prc, ttm_mc, ttm_mc_rank 90 | FROM universe.temp_rank_crsp_mc 91 | WHERE ttm_mc_rank >= {min_rank} AND 92 | ttm_mc_rank <= {max_rank} AND 93 | date > '{start_date}' 94 | ) as uni 95 | """ 96 | 97 | # will add linking tables 98 | if link: 99 | columns = ', '.join(['uni.*', 'gvkey', 'liid as iid, ''ticker', 'cusip', 100 | "CASE WHEN gvkey NOT NULL THEN CONCAT(gvkey, '_', liid) ELSE NULL END as id"]) 101 | sql_make_universe_table = '(' + (ADD_ALL_LINKS_TO_PERMNO 102 | .replace('--columns', columns) 103 | .replace('--from', sql_make_universe_table)) + ')' 104 | 105 | sql_make_universe_table = f"""COPY 106 | {sql_make_universe_table} 107 | TO '{write_path}' (FORMAT 'parquet')""" 108 | 109 | # making the db connection 110 | con = SQLConnection(read_only=False).con 111 | con.execute(sql_make_universe_table) 112 | con.close() 113 | 114 | logging.info(f'Wrote Table {table_name} To {write_path}') 115 | 116 | 117 | def _make_cstat_us_universe_base_table(): 118 | """ 119 | Makes the base table with market cap ranks for each asset. 
Should be deleted after its done being used 120 | """ 121 | table_name = 'universe.temp_rank_cstat_mc' 122 | logging.info(f'Creating Ranking Table {table_name}') 123 | 124 | # getting the trading calendar so we dont have bad dates 125 | trading_cal = mcal.get_calendar( 126 | 'NYSE').valid_days(start_date='1980', end_date=pd.to_datetime('today')).to_series().to_frame('trading_days') 127 | 128 | sql_ensure_schema_open = f'CREATE SCHEMA IF NOT EXISTS universe;' 129 | sql_ensure_table_open = f'DROP TABLE IF EXISTS {table_name};' 130 | sql_make_rank_universe_table = f""" 131 | CREATE TABLE {table_name} 132 | AS 133 | ( 134 | SELECT date, gvkey, iid, id, ttm_min_prccd, ttm_mc, 135 | row_number() OVER (PARTITION BY (date) ORDER BY ttm_mc desc) AS ttm_mc_rank 136 | FROM 137 | ( 138 | SELECT * 139 | FROM 140 | ( 141 | SELECT date, gvkey, iid, id, 142 | AVG(ABS(prccd) * cshoc) OVER ( 143 | PARTITION BY id ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_mc, 144 | MIN(ABS(prccd)) OVER ( 145 | PARTITION BY id ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_min_prccd 146 | FROM 147 | ( 148 | SELECT date, gvkey, iid, id, priusa, fic, tpci, curcdd, 149 | lag(prccd, 1, NULL) OVER lagDays AS prccd, 150 | lag(cshoc, 1, NULL) OVER lagDays AS cshoc 151 | FROM main.sd AS sd RIGHT JOIN trading_cal cal ON sd.date = cal.trading_days 152 | WINDOW lagDays AS (PARTITION BY id ORDER BY date) 153 | ) 154 | WHERE fic = 'USA' AND 155 | tpci = '0' AND 156 | curcdd = 'USD' AND 157 | priusa = (CASE WHEN regexp_full_match(iid, '^[0-9]*$') THEN CAST(iid AS INTEGER) end) 158 | ) 159 | WHERE ttm_mc > 0 AND 160 | ttm_min_prccd > 3 161 | ) 162 | ) 163 | ORDER BY date 164 | """ 165 | 166 | # making the db connection 167 | con = SQLConnection(read_only=False).con 168 | con.execute(sql_ensure_schema_open) 169 | con.execute(sql_ensure_table_open) 170 | con.execute(sql_make_rank_universe_table) 171 | con.close() 172 | 173 | logging.info(f'Finished Ranking Table {table_name}') 174 | 175 | 176 | def _make_crsp_us_universe_base_table(): 177 | """ 178 | Makes the base table with market cap ranks for each asset. 
Should be deleted after its done being used 179 | """ 180 | table_name = 'universe.temp_rank_crsp_mc' 181 | logging.info(f'Creating Ranking Table {table_name}') 182 | 183 | trading_cal = mcal.get_calendar( 184 | 'NYSE').valid_days(start_date='1925', end_date=pd.to_datetime('today')).to_series().to_frame('trading_days') 185 | 186 | sql_ensure_schema_open = f'CREATE SCHEMA IF NOT EXISTS universe;' 187 | sql_ensure_table_open = f'DROP TABLE IF EXISTS {table_name};' 188 | 189 | sql_make_rank_universe_table = f""" 190 | CREATE TABLE {table_name} 191 | AS 192 | SELECT date, permno, permco, ttm_min_prc, ttm_mc, 193 | row_number() OVER (PARTITION BY (date) ORDER BY ttm_mc desc) AS ttm_mc_rank 194 | FROM 195 | ( 196 | SELECT date, permno, permco, ttm_min_prc, ttm_mc 197 | FROM 198 | ( 199 | SELECT date, permno, permco, shrcd, 200 | AVG(ABS(prc) * shrout) OVER ( 201 | PARTITION BY permno ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_mc, 202 | MIN(ABS(prc)) OVER ( 203 | PARTITION BY permno ORDER BY date ROWS BETWEEN 252 PRECEDING AND CURRENT ROW) AS ttm_min_prc 204 | FROM 205 | ( 206 | SELECT date, permno, permco, shrcd, 207 | lag(prc, 1, NULL) OVER lagDays AS prc, 208 | lag(shrout, 1, NULL) OVER lagDays AS shrout 209 | FROM 210 | ( 211 | SELECT distinct date, permno, permco, shrcd, prc, shrout 212 | FROM crsp.sd as sd RIGHT JOIN trading_cal cal on sd.date = cal.trading_days 213 | ) 214 | WINDOW lagDays AS ( 215 | PARTITION BY permno 216 | ORDER BY date 217 | ) 218 | ) 219 | WHERE shrcd = 11 220 | ) 221 | WHERE ttm_mc IS NOT NULL AND 222 | ttm_min_prc > 3 223 | ) 224 | ORDER BY date 225 | """ 226 | 227 | # making the db connection 228 | con = SQLConnection(read_only=False).con 229 | con.execute(sql_ensure_schema_open) 230 | con.execute(sql_ensure_table_open) 231 | con.execute(sql_make_rank_universe_table) 232 | con.close() 233 | 234 | logging.info(f'Finished Ranking Table {table_name}') 235 | 236 | 237 | def clear_master_ranking_table(): 238 | """ 239 | Wipes the ranking tables made by _make_crsp_us_universe_base_table and _make_cstat_us_universe_base_table 240 | """ 241 | logging.info('Deleting Ranking Tables') 242 | 243 | con = SQLConnection(read_only=False) 244 | con.execute("DROP SCHEMA universe CASCADE;") 245 | con.close() 246 | 247 | logging.info('Finished Deleting Ranking Tables') 248 | 249 | 250 | if __name__ == '__main__': 251 | # crsp_us_universe(max_rank=500, rebuild_mc_ranking=True, link=True) 252 | # crsp_us_universe(max_rank=1000, link=True) 253 | # crsp_us_universe(max_rank=3000, link=True) 254 | # crsp_us_universe(min_rank=1000, max_rank=3000, link=True) 255 | # 256 | # # building compustat universes 257 | # compustat_us_universe(max_rank=500, rebuild_mc_ranking=True) 258 | # compustat_us_universe(max_rank=1000) 259 | # compustat_us_universe(max_rank=3000) 260 | # compustat_us_universe(min_rank=1000, max_rank=3000) 261 | 262 | # clear_master_ranking_table() 263 | pass 264 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Alexd14/ntiles/d22c75aac2a553ccca17fb71a3650071e31808c2/ntiles/toolbox/utils/__init__.py -------------------------------------------------------------------------------- /ntiles/toolbox/utils/date_config.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import pandas as pd 4 | 5 | 6 | class 
DateConfig: 7 | """ 8 | Configures the dates for a dataframe 9 | 10 | This class can be used to align dates across pricing and factor data 11 | Once the class is configured it can be used an unlimited number of times to align dates of dataframes 12 | """ 13 | 14 | def __init__(self, 15 | freq: str, 16 | target_data_type: str = 'period', 17 | resample: bool = False, 18 | resample_key: str = None, 19 | grouper_keys: List[str] = None, 20 | date_format: str = None 21 | ) -> None: 22 | """ 23 | :param freq: the frequency we want to align to 24 | :param target_data_type: the type of date we want as output (timestamp, period) 25 | :param resample: whether to resample the data when changing frequencies 26 | :param resample_key: The column we are using to down sample the data 27 | Will keep the last value of the period 28 | :param grouper_keys: The columns we are using to group the data 29 | will be used in conjunction with resample_key 30 | can be none if we are not grouping 31 | This would be where asset_ids go 32 | :param date_format: the date format to use when converting from a string to period 33 | """ 34 | 35 | self._target_freq = freq 36 | self._target_data_type = target_data_type 37 | self._date_format = date_format 38 | self._resample = resample 39 | self._resample_key = resample_key 40 | self._grouper_keys = [] if grouper_keys is None else grouper_keys 41 | self._resample_master = f'old_{self._resample_key}_{self.__class__.__name__}' 42 | self._validate_inputs() 43 | 44 | def _validate_inputs(self) -> None: 45 | """ 46 | Ensures the inputs are valid 47 | :throws: ValueError if inputs are invalid 48 | """ 49 | if self._target_freq not in ['D', 'B', 'W', 'M', 'Q', 'A']: 50 | raise ValueError(f'Invalid target_freq: {self._target_freq}') 51 | 52 | # if not self._resample and len(self._grouper_keys) != 0: 53 | # raise ValueError(f'Cannot use grouper_keys without resampling') 54 | 55 | if self._target_data_type not in ['timestamp', 'period']: 56 | raise ValueError(f'Invalid target_data_type: {self._target_data_type}') 57 | 58 | def configure_dates(self, 59 | df: pd.DataFrame, 60 | date_columns: Union[List[str], str] 61 | ) -> pd.DataFrame: 62 | """ 63 | Adjusts the dates of the dataframe according to the configuration passed at initiation 64 | Can adjust columns as well as the index 65 | :param df: the dataframe to adjust 66 | :param date_columns: the date columns to adjust 67 | :return: the dataframe with the configured dates 68 | """ 69 | df = df.copy() 70 | 71 | if isinstance(date_columns, str): 72 | date_columns = [date_columns] 73 | 74 | index = None 75 | if not isinstance(df.index, pd.RangeIndex): 76 | index = df.index.name 77 | df = df.reset_index() 78 | 79 | self._validate_df(df, date_columns) 80 | df = self._prep_df(df, date_columns) 81 | for date_column in date_columns: 82 | df[date_column] = self._configure_dates(df[date_column]) 83 | df = self._resample_data(df, date_columns) 84 | df = self._clean_df(df) 85 | df = self._alter_types(df, date_columns) 86 | 87 | if index: 88 | df = df.set_index(index) 89 | return df 90 | 91 | def _alter_types(self, 92 | df: pd.DataFrame, 93 | date_columns: Union[List[str], str] 94 | ) -> pd.DataFrame: 95 | """ 96 | Alters the types of the dates to the target_data_type 97 | """ 98 | if self._target_data_type == 'timestamp': 99 | for date_column in date_columns: 100 | df[date_column] = df[date_column].dt.to_timestamp() 101 | return df 102 | 103 | def _clean_df(self, 104 | df: pd.DataFrame 105 | ) -> pd.DataFrame: 106 | """ 107 | Cleans the df after the
dates have been adjusted 108 | """ 109 | return df.drop(self._resample_master, axis=1, errors='ignore') 110 | 111 | def _prep_df(self, 112 | df: pd.DataFrame, 113 | date_columns 114 | ) -> pd.DataFrame: 115 | """ 116 | Preps the df for the dates to be adjusted 117 | Currently preps for a frequency conversion and subsequent down-sample or up-sample 118 | :throws: ValueError if the correct parameters are not passed at construction to do the resample 119 | """ 120 | 121 | if self._resample: 122 | if self._resample_key is None: 123 | df[self._resample_master] = df[date_columns[0]] 124 | else: 125 | df[self._resample_master] = df[self._resample_key] 126 | return df 127 | 128 | def _resample_data(self, 129 | df: pd.DataFrame, 130 | date_columns 131 | ) -> pd.DataFrame: 132 | """ 133 | Upsamples the data if resample is True 134 | """ 135 | if self._resample and len(self._grouper_keys) == 0 and len(df) > 10_000: 136 | print('Warning you are resampling a large dataframe without grouping.') 137 | 138 | if self._resample: 139 | date_key = date_columns[0] if self._resample_key is None else self._resample_key 140 | groupby_keys = self._grouper_keys.copy() + [date_key] 141 | df = df.sort_values(self._resample_master).groupby(groupby_keys).last().reset_index() 142 | 143 | return df 144 | 145 | def _validate_df(self, 146 | df, 147 | date_columns 148 | ) -> None: 149 | """ 150 | Ensures the inputs are valid for down sampling 151 | :throws: ValueError if inputs are invalid 152 | """ 153 | if self._resample and len(date_columns) != 1 and self._resample_key is None: 154 | raise ValueError(f'Cannot down sample multiple date columns: {date_columns}. Must pass resample_key.') 155 | 156 | if self._resample and self._resample_key is not None and self._resample_key not in date_columns: 157 | raise ValueError(f'resample_key: {self._resample_key} not in date_columns: {date_columns}') 158 | 159 | def _configure_dates(self, 160 | dates: pd.Series 161 | ) -> pd.Series: 162 | """ 163 | Adjusts the dates according to the configuration passed at initiation 164 | """ 165 | if not (pd.api.types.is_period_dtype(dates) or pd.api.types.is_datetime64_any_dtype(dates)): 166 | dates = self._to_datetime(dates) 167 | 168 | dates = self._configure_freq(dates) 169 | return dates 170 | 171 | def _configure_freq(self, 172 | dates: pd.Series 173 | ) -> pd.Series: 174 | """ 175 | Configures the frequency of the dates 176 | """ 177 | if pd.api.types.is_datetime64_any_dtype(dates): 178 | if dates.dt.tz: 179 | dates = dates.dt.tz_localize(None) 180 | return dates.dt.to_period(self._target_freq) 181 | if pd.api.types.is_period_dtype(dates): 182 | if dates.dt.freq != self._target_freq: 183 | return dates.dt.asfreq(self._target_freq) 184 | else: 185 | return dates 186 | else: 187 | raise ValueError(f'Invalid date date type: {dates.dtype}') 188 | 189 | def _to_datetime(self, 190 | dates 191 | ) -> pd.Series: 192 | """ 193 | Takes in a series of strings and parses them to dates 194 | :throws: ValueError if date_format is not passed at initiation 195 | """ 196 | if self._date_format is None: 197 | raise ValueError('date_format must be passed at initiation to parse dates from strings') 198 | return pd.to_datetime(dates, format=self._date_format) 199 | 200 | def copy(self, 201 | **kwargs 202 | ) -> 'DateConfig': 203 | """ 204 | Creates a copy of the object 205 | :param kwargs: the parameters to override when doing the copy 206 | """ 207 | base_kwargs = {'freq': self._target_freq, 208 | 'date_format': self._date_format, 209 | 'target_data_type': 
self._target_data_type, 210 | 'resample': self._resample, 211 | 'resample_key': self._resample_key, 212 | 'grouper_keys': self._grouper_keys} 213 | base_kwargs.update(kwargs) 214 | return self.__class__(**base_kwargs) 215 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/format_data_alphalens.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from ntiles.toolbox.utils.handle_data import handle_duplicates, make_nan_inf_summary 4 | 5 | 6 | def price_format_for_alphalens(data: pd.DataFrame, factor: str, date_format: str = '', 7 | id_col: str = 'symbol') -> pd.DataFrame: 8 | """ 9 | formats the price data into the expected format by get_clean_factor_and_forward_returns 10 | out format of the data frame: index: 'date', columns: id_col 11 | data must contain 'date', id_col, can take in a dataframe with unlimited columns 12 | given df the 2 required columns names: 'date', id_col 13 | 14 | does not mutate the given dataframe 15 | 16 | :param data: the data to be turned into the format expected by prices field in get_clean_factor_and_forward_returns 17 | :param factor: the name of the factor column in the passed data 18 | :param date_format: the format to parse the date column in pd.datetime 19 | ` dont pass anything if no date conversion is wanted 20 | :param id_col: the asset identifier column for the data 21 | :return: data frame with data in format required by factor field in get_clean_factor_and_forward_returns 22 | """ 23 | data: pd.DataFrame = data.copy() 24 | 25 | _check_columns(data, id_col) 26 | _convert_to_date_time(data, date_format) 27 | 28 | pivot_table: pd.DataFrame = data.pivot_table(index='date', columns=id_col, values=factor) 29 | 30 | return pivot_table 31 | 32 | 33 | def factor_format_for_alphalens(data: pd.DataFrame, factor: str, date_format: str = '', max_loss: float = .1, 34 | id_col: str = 'symbol') -> pd.DataFrame: 35 | """ 36 | formats the alpha factor data into the expected format by get_clean_factor_and_forward_returns 37 | data must contain 'date', id_col, can take in a dataframe with unlimited columns 38 | out format of the data frame: index: ('date', id_col), columns: 'factor' 39 | given df the 1 required columns names: 'date' 40 | 41 | does not mutate the given data frame 42 | 43 | :param data: the data to be turned into the format expected by factor field in get_clean_factor_and_forward_returns 44 | :param factor: the name of the factor column in the passed data 45 | :param date_format: the format to parse the date column in pd.datetime 46 | ` pass nothing if no date conversion is wanted 47 | :param max_loss: the decimal percent of the factor that can be nan or infinity before we throw an error 48 | :param id_col: the asset identifier column for the data 49 | :return: data frame with data in required format by factor field in get_clean_factor_and_forward_returns 50 | """ 51 | data: pd.DataFrame = data.copy() 52 | 53 | _check_columns(data, id_col) 54 | _convert_to_date_time(data, date_format) 55 | 56 | # setting the index 57 | alpha_factor = data[['date', id_col, factor]].set_index(['date', id_col]) 58 | # dropping duplicates and printing a warning 59 | alpha_factor = handle_duplicates(df=alpha_factor, out_type='Warning', name='Given Factor', drop=True) 60 | # making a nan and inf summary along with dropping nan's 61 | alpha_factor = make_nan_inf_summary(df=alpha_factor, max_loss=max_loss) 62 | 63 | return alpha_factor 64 | 65 | 66 | def 
_check_columns(data: pd.DataFrame, id_col: str) -> None: 67 | """ 68 | checking to make sure the columns contain 'date' & id_col 69 | :param data: the data frame to check 70 | :param id_col: the identifier column we are checking for 71 | :return: Void, throws ValueError if the columns are bad 72 | """ 73 | # checking for the columns 'date' & id_col 74 | for needed in ['date', id_col]: 75 | if needed not in data.columns: 76 | raise ValueError(f'given df must have required columns \'date\' \'{id_col}\'') 77 | 78 | 79 | def _convert_to_date_time(data: pd.DataFrame, date_format: str) -> None: 80 | """ 81 | MUTATES the given dataframe 82 | converts the date column to a pandas datetime object. 83 | If the date_format is an empty string then nothing is changed 84 | :param data: the data frame to have the date changed 85 | :param date_format: the format of the date time string 86 | :return: Void 87 | """ 88 | 89 | if date_format != '': 90 | data['date'] = pd.to_datetime(data['date'].to_numpy(), format=date_format, utc=True) 91 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/handle_data.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import List 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def handle_duplicates(df: pd.DataFrame, out_type: str, name: str, drop: bool = False, 8 | subset: List[any] = None) -> pd.DataFrame: 9 | """ 10 | Checking to see if there are duplicates in the given data frame 11 | if there are duplicates, out_type will be used 12 | Ex: give a Warning or raise ValueError 13 | :param df: The data we are checking 14 | :param name: the name of the data to give as output 15 | :param out_type: what to do if there are duplicates.
Currently supports "Warning", "ValueError" 16 | :param drop: boolean to drop the duplicates or not 17 | if False no data frame will be returned and vice verse 18 | this param will not matter if outType is a ValueError 19 | :param subset: subset of df columns we should check duplicates for 20 | :return: the given df with duplicates dropped according to drop 21 | """ 22 | # seeing if there are duplicates in the factor 23 | dups = df.duplicated(subset=subset) 24 | 25 | if dups.any(): 26 | amount_of_dups = dups.sum() 27 | out_string = f'{name} is {round(amount_of_dups / len(df), 3)} duplicates, {amount_of_dups} rows\n' 28 | if out_type == 'Warning': 29 | Warning(out_string) 30 | elif out_type == 'ValueError': 31 | raise ValueError(out_string) 32 | else: 33 | raise ValueError(f'out_type {out_type} not recognised') 34 | 35 | # dropping the duplicates 36 | if drop: 37 | return df.drop_duplicates(subset=subset, keep='first') 38 | 39 | if drop: 40 | return df 41 | 42 | 43 | def make_nan_inf_summary(df: pd.DataFrame, max_loss: float) -> pd.DataFrame: 44 | """ 45 | makes a summary fot the the amount of nan and infinity values in the given data frame 46 | will throw a ValueError if the percent of nan and inf is greater than the given threshold 47 | prints a summary of the nan's and inf of there are any 48 | :param df: the data frame we are checking 49 | :param max_loss: max decimal percent of nan and inf we are allowing the df to contain 50 | :return: pandas data frame with the nan and inf dropped 51 | """ 52 | df_numpy = df.to_numpy() 53 | nan_array = np.isnan(df_numpy) 54 | finite_array = np.logical_or(np.isinf(df_numpy), np.isneginf(df_numpy)) 55 | 56 | if nan_array.any() or (not finite_array.all()): 57 | factor_length = len(df) 58 | amount_nan = nan_array.sum() 59 | amount_inf = finite_array.sum() 60 | total_percent_dropped = (amount_nan + amount_inf) / factor_length 61 | 62 | outString = f'Dropped {round(total_percent_dropped * 100, 2)}% of data. ' \ 63 | f'{round((amount_nan / factor_length) * 100, 2)}% due to nan, ' \ 64 | f'{round((amount_inf / factor_length) * 100, 2)}% of inf values. Threshold: {max_loss * 100}%\n' 65 | 66 | if total_percent_dropped > max_loss: 67 | raise ValueError('Exceeded Nan Infinity Threshold. ' + outString) 68 | 69 | # print out string as a summary 70 | print(outString) 71 | 72 | # dropping the nans and the infinity values 73 | df = df.replace([np.inf, -np.inf], np.nan).dropna() 74 | 75 | else: 76 | print('Dropped 0% of data') 77 | 78 | return df 79 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/ml_factor_calculation.py: -------------------------------------------------------------------------------- 1 | import gc 2 | from abc import ABC, abstractmethod 3 | 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from typing import Generator, Tuple, List 8 | 9 | from tqdm import tqdm 10 | 11 | 12 | class ModelWrapper(ABC): 13 | """ 14 | Wraps a model for calc_ml_factor. 15 | """ 16 | @abstractmethod 17 | def fit_model(self, train_features: pd.DataFrame, train_target: pd.Series) -> any: 18 | """ 19 | Wraps a model for use by the calcMlFactor function. 20 | Fits a model to the given features. then returns the fit model. 21 | If the fit model does not contain a "predict" method then predict mut be overwritten. 22 | 23 | :param train_features: the features to train the model on 24 | Must have the same index as train_target 25 | :param train_target: the target for the train_features. 
26 | Must have the same index as train_features 27 | :return: a model fit to the given features and targets 28 | """ 29 | pass 30 | 31 | @staticmethod 32 | def transform_data(train_features: pd.DataFrame, train_target: pd.Series, predict: pd.DataFrame) -> \ 33 | Tuple[pd.DataFrame, pd.DataFrame]: 34 | """ 35 | *** Do not fit any transformations on the predict data. That WILL result in lookahead bias. *** 36 | Only manipulate the predict data with transformations fit with the train_features 37 | 38 | This method is used to preprocess the training and predicting data before they are passed to the model 39 | 40 | The indexes must not be changed. However columns can be dropped and altered. 41 | Any change to the train_target must also be done to the predict data. 42 | 43 | Example use: fit a PCA to the train_features then transform the train_features and predict data using said PCA, 44 | or use RFE to reduce dimensionality 45 | 46 | :param train_features: the features to train the model on 47 | :param train_target: the target for the train_features 48 | :param predict: The data to make predictions on 49 | :return: the transformed (train_features, predict) with no index changes. 50 | """ 51 | return train_features, predict 52 | 53 | def predict(self, train_features: pd.DataFrame, train_target: pd.Series, predict: pd.DataFrame) -> pd.Series: 54 | """ 55 | fits a model to the given training data and then makes predictions with the fitted model 56 | fits a model by calling "fit_model". 57 | assumes the "fit_model" returns a model with a "predict" method. 58 | 59 | :param train_features: the features to train the model on 60 | Must have the same index as train_target 61 | :param train_target: the target for the train_features. 62 | Must have the same index as train_features 63 | :param predict: The data to make predictions on 64 | :return: a pandas Series of predictions indexed the same as the predict data 65 | """ 66 | # checks the index but is very slow 67 | # if not train_features.index.equals(train_target.index): 68 | # raise ValueError('The index for the features and target is different') 69 | 70 | # allowing the user to adjust the data before fitting, assuming that the user does not mess up the indexes 71 | transformed_features, transformed_predict = self.transform_data(train_features, train_target, predict) 72 | 73 | # fitting and making predictions with user defined model 74 | model: any = self.fit_model(transformed_features, train_target) 75 | predicted: pd.Series = pd.Series(data=model.predict(transformed_predict), index=predict.index) 76 | 77 | del model, train_features, train_target, predict, transformed_features, transformed_predict 78 | gc.collect() 79 | 80 | return predicted 81 | 82 | 83 | class SliceHolder: 84 | """ 85 | holds information on the start and end indexes for a slice. 86 | assumes start and end are immutable references 87 | """ 88 | 89 | def __init__(self, start, end): 90 | self.__start = start 91 | self.__end = end 92 | 93 | @property 94 | def start(self): 95 | return self.__start 96 | 97 | @property 98 | def end(self): 99 | return self.__end 100 | 101 | def __str__(self): 102 | return str(self.__start) + ', ' + str(self.__end) 103 | 104 | def __repr__(self): 105 | return self.__str__() 106 | 107 | 108 | def calc_ml_factor(model: ModelWrapper, features: pd.DataFrame, target: pd.Series, eval_days: int, refit_every: int, 109 | expanding: int = None, rolling: int = None) -> pd.Series: 110 | """ 111 | Calculates an alpha factor using an ML factor combination method.
112 | The model is fit and predictions are made in a ModelWrapper 113 | This function organizes the data so the model can make unbiased predictions 114 | on what would have been point in time data. 115 | 116 | This function assumes that the data passed has all trading days in it (first level of index). 117 | Ex: if the data is missing a trading day then the day-based train and predict windows will be misaligned. 118 | 119 | :param model: the ModelWrapper that will be used to make predictions. 120 | :param features: the features to train the model on 121 | there cannot be null values 122 | must have a multi index of (pd.Timestamp, symbol) 123 | :param target: the target we are going to fit the model to 124 | there cannot be null values 125 | must have a multi index of (pd.Timestamp, symbol) 126 | :param eval_days: IF INCORRECT THERE WILL BE LOOK AHEAD BIAS 127 | the amount of days it takes to know the prediction's outcome 128 | this number should simply be the length of return we are trying to predict 129 | :param refit_every: the amount of consecutive days to predict using a single model 130 | this is essentially saying refit the model every x days 131 | :param expanding: the minimum amount of days of data to train on 132 | if rolling is passed then this should not be passed 133 | if this value is passed then the model will be trained with an expanding window of data 134 | :param rolling: the amount of rolling days to fit a model to 135 | if expanding is passed then this should not be passed 136 | :return: pandas series of predictions. The index will be the same as "features" 137 | """ 138 | 139 | features_copy: pd.DataFrame = features.copy().sort_index() 140 | target_copy: pd.Series = target.copy().sort_index() 141 | 142 | if not np.isfinite(features_copy.values).all(): 143 | raise ValueError('There are nan or inf values in the features') 144 | if not np.isfinite(target_copy.values).all(): 145 | raise ValueError('There are nan or inf values in the target') 146 | if not isinstance(features_copy.index, pd.MultiIndex): 147 | raise ValueError('Features and target must have a pd.MultiIndex of (pd.Timestamp, str)') 148 | if not isinstance(features_copy.index.get_level_values(0), pd.DatetimeIndex): 149 | raise ValueError('Features and target must have index level 0 of pd.DatetimeIndex') 150 | if not features_copy.index.equals(target_copy.index): 151 | raise ValueError('The index for the features and target is different') 152 | 153 | train_predict_slices: Generator[Tuple[SliceHolder, SliceHolder], None, None] = \ 154 | generate_indexes(features_copy.index, eval_days, refit_every, expanding, rolling) 155 | 156 | ml_alpha: List[pd.Series] = [] 157 | for train_slice, predict_slice in tqdm(train_predict_slices): 158 | features_train = features_copy.loc[train_slice.start:train_slice.end] 159 | target_train = target_copy.loc[train_slice.start:train_slice.end] 160 | predict = features_copy.loc[predict_slice.start:predict_slice.end] 161 | ml_alpha.append(model.predict(features_train, target_train, predict)) 162 | 163 | del features_copy, target_copy 164 | gc.collect() 165 | 166 | return pd.concat(ml_alpha) 167 | 168 | 169 | def generate_indexes(data_index: pd.MultiIndex, eval_days: int, refit_every: int, expanding: int = None, 170 | rolling: int = None) -> Generator[Tuple[SliceHolder, SliceHolder], None, None]: 171 | """ 172 | Generates the slice indexes for the training and predicting periods.
173 | The function is designed to work with dates in level 0, however this is not enforced anywhere 174 | 175 | :param data_index: MultiIndex of the data we are generating integer indexes for 176 | :param eval_days: IF INCORRECT THERE WILL BE LOOK AHEAD BIAS 177 | the amount of days it takes to know the prediction's outcome 178 | this number should simply be the length of return we are trying to predict 179 | :param refit_every: the amount of consecutive days to predict using a single model 180 | this is essentially saying refit the model every x days 181 | :param expanding: the minimum amount of days of data to train on 182 | if rolling is passed then this should not be passed 183 | if this value is passed then the model will be trained with an expanding window of data 184 | :param rolling: the amount of rolling days to fit a model to 185 | if expanding is passed then this should not be passed 186 | :return: a generator with each iteration containing a tuple of two SliceHolders of dates. 187 | Slice One: training indexes 188 | Slice Two: predicting indexes 189 | """ 190 | 191 | if (eval_days < 1) or (refit_every < 1): 192 | raise ValueError('eval_days and/or refit_every must be greater than zero') 193 | if rolling is not None and (rolling < 1): 194 | raise ValueError('rolling must be greater than zero') 195 | if expanding is not None and (expanding < 1): 196 | raise ValueError('expanding must be greater than zero') 197 | if (not bool(expanding)) and (not bool(rolling)): 198 | raise ValueError('expanding or rolling must be defined') 199 | if bool(expanding) & bool(rolling): 200 | raise ValueError('expanding and rolling cannot both be defined') 201 | 202 | dates: np.array = data_index.get_level_values(0).drop_duplicates().to_numpy() 203 | 204 | start_place = expanding if expanding else rolling 205 | # don't have to ceil this because it won't matter with a < operator 206 | amount_of_loops: float = (len(dates) - start_place - eval_days) / refit_every 207 | 208 | i: int = 0 209 | while i < amount_of_loops: 210 | # .loc[] is inclusive in a slice, so everything here is inclusive 211 | train_end_index: int = (i * refit_every) + (start_place - 1) 212 | train_start_index: int = train_end_index - rolling + 1 if rolling else 0 213 | train_slice: SliceHolder = SliceHolder(dates[train_start_index], dates[train_end_index]) 214 | 215 | predict_start_index: int = train_end_index + eval_days + 1 216 | predict_end_index: int = predict_start_index + refit_every - 1 217 | # accounting for when the ending predicted index is out of bounds on the last loop 218 | if predict_end_index >= len(dates) - 1: 219 | predict_end_index: int = len(dates) - 1 220 | 221 | predict_slice: SliceHolder = SliceHolder(dates[predict_start_index], dates[predict_end_index]) 222 | 223 | i += 1 224 | yield train_slice, predict_slice 225 | -------------------------------------------------------------------------------- /ntiles/toolbox/utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import duckdb 4 | import numpy as np 5 | import pandas as pd 6 | 7 | 8 | def calculate_ic(y_true: np.array, y_pred: np.array) -> float: 9 | """ 10 | computes the information coefficient for the predicted and true variables. 11 | This function can be given to a sklearn.model_selection hyper-parameter optimizer.
12 | 13 | Example use in sklearn: 14 | scoring = make_scorer(calculate_ic, greater_is_better=True) 15 | 16 | :param y_true: the true value of the target 17 | :param y_pred: the predicted value of the target 18 | :return: the information coefficient of the y_pred 19 | """ 20 | return np.corrcoef(y_true, y_pred)[0][1] 21 | 22 | 23 | def factorize(df: pd.DataFrame, partition_by: List[str], exclude=None): 24 | """ 25 | Factorizes each column of the given dataframe except for the partition_by columns and the exclude columns 26 | Will preserve indexes and period data types 27 | 28 | Calculates the centered zscore 29 | 30 | In the future we would like to winsorize at the 2.5% and 97.5% percentiles, but that is hard to do in SQL 31 | 32 | Won't rename the columns; will overwrite them 33 | 34 | :param df: the dataframe we are factorizing 35 | :param partition_by: What to partition by for calculating median and std; will normally be date and sector 36 | :param exclude: columns to exclude in the factorization process 37 | """ 38 | if exclude is None: 39 | exclude = [] 40 | 41 | return _duck_db_edits(df, _factorize(df, partition_by, exclude)) 42 | 43 | 44 | def _factorize(df: pd.DataFrame, partition_by: List[str], exclude: List[str]): 45 | select = partition_by + exclude 46 | for col in set(df.columns) - set(partition_by) - set(exclude): 47 | select.append( 48 | f'({col} - median({col}) OVER factorize_partition) / stddev({col}) OVER factorize_partition AS {col}') 49 | sql = f"""SELECT {', '.join(select)} 50 | FROM df 51 | WINDOW factorize_partition AS (PARTITION BY {', '.join(partition_by)}) 52 | ORDER BY {', '.join(partition_by)} 53 | """ 54 | return sql 55 | 56 | 57 | def rank(df: pd.DataFrame, partition_by: List[str], exclude=None, rank_type: str = 'percent_rank'): 58 | """ 59 | Ranks each column of the given dataframe except for the partition_by columns and the exclude columns 60 | Will preserve indexes and period data types 61 | Won't rename the columns; will overwrite them 62 | 63 | :param df: the dataframe we are ranking 64 | :param partition_by: What to partition by for calculating rank; will normally be date and sector 65 | :param exclude: columns to exclude in the ranking process 66 | :param rank_type: the type of rank we are performing 67 | """ 68 | if exclude is None: 69 | exclude = [] 70 | 71 | return _duck_db_edits(df, _rank(df, partition_by, exclude, rank_type)) 72 | 73 | 74 | def _rank(df: pd.DataFrame, partition_by: List[str], exclude: List[str], rank_type: str): 75 | select = partition_by + exclude 76 | for col in set(df.columns) - set(partition_by) - set(exclude): 77 | select.append( 78 | f"CASE WHEN {col} is NULL THEN NULL ELSE {rank_type}() OVER (PARTITION BY {', '.join(partition_by)} " 79 | f"ORDER BY {col}) END AS {col}") 80 | sql = f"""SELECT {', '.join(select)} 81 | FROM df 82 | ORDER BY {', '.join(partition_by)} 83 | """ 84 | return sql 85 | 86 | 87 | def ntile(df: pd.DataFrame, ntiles: int, partition_by: List[str], exclude=None): 88 | """ 89 | Ntiles each column of the given dataframe except for the partition_by columns and the exclude columns 90 | Will preserve indexes and period data types 91 | Won't rename the columns; will overwrite them 92 | :param df: the dataframe we are ntiling 93 | :param ntiles: the number of buckets to split each column into 94 | :param partition_by: What to partition by for calculating ntiles; will normally be date and sector 95 | :param exclude: columns to exclude in the ntiling process 96 | """ 97 | if exclude is None: 98 | exclude = [] 99 | 100 | return _duck_db_edits(df, _ntile(df, ntiles, partition_by, exclude)) 101 | 102 |
103 | def _ntile(df, ntiles, partition_by, exclude): 104 | select = partition_by + exclude 105 | for col in set(df.columns) - set(partition_by) - set(exclude): 106 | select.append( 107 | f" NTILE({ntiles}) OVER(PARTITION BY {', '.join(partition_by)} ORDER BY {col} DESC) as {col} ") 108 | sql = f"""SELECT {', '.join(select)} 109 | FROM df 110 | ORDER BY {', '.join(partition_by)} 111 | """ 112 | return sql 113 | 114 | 115 | def _duck_db_edits(df, sql): 116 | index_cols = None 117 | if not isinstance(df.index, pd.RangeIndex): 118 | index_cols = df.index.names 119 | df = df.reset_index() 120 | 121 | convert_to_period = [] 122 | for col in df.columns: 123 | if isinstance(df[col].dtype, pd.PeriodDtype): 124 | df[col] = df[col].dt.to_timestamp() 125 | convert_to_period.append(col) 126 | 127 | df = duckdb.query(sql).df() 128 | for col in convert_to_period: 129 | df[col] = df[col].dt.to_period('D') 130 | df = df.set_index(index_cols) if index_cols else df 131 | return df 132 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | from setuptools import find_packages 4 | 5 | setup( 6 | name='ntiles', 7 | version='0.1.5.1', 8 | packages=find_packages(), 9 | license='Apache License 2.0', 10 | description='Vectorized quantile backtester.', 11 | url='https://github.com/Alexd14/ntiles-backtester', 12 | download_url='https://github.com/Alexd14/ntiles/archive/refs/tags/v1.5.1.tar.gz', 13 | keywords=['factor', 'backtesting', 'alphalens', 'vectorized backtesting', 'equity trading'], 14 | install_requires=[ 15 | 'numba', 16 | 'pandas', 17 | 'numpy', 18 | 'matplotlib', 19 | 'empyrical', 20 | 'factor_toolbox', 21 | # 'equity-db' 22 | ], 23 | classifiers=[ 24 | 'License :: OSI Approved :: Apache Software License', 25 | 'Programming Language :: Python :: 3.7', 26 | 'Programming Language :: Python :: 3.8', 27 | 'Programming Language :: Python :: 3.9', 28 | 'Programming Language :: Python :: 3.10', 29 | ], 30 | ) 31 | --------------------------------------------------------------------------------
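Illustrative usage: a minimal sketch of how the toolbox utilities listed above might be combined. The column names ('date', 'symbol', 'my_factor') and the toy data are assumptions for illustration only, not anything defined in the repository.

import pandas as pd

from ntiles.toolbox.utils.date_config import DateConfig
from ntiles.toolbox.utils.utils import factorize, ntile

# toy long-format factor data; 'date', 'symbol' and 'my_factor' are hypothetical names
raw = pd.DataFrame({
    'date': ['2021-01-04', '2021-01-04', '2021-01-05', '2021-01-05'],
    'symbol': ['AAA', 'BBB', 'AAA', 'BBB'],
    'my_factor': [1.2, -0.4, 0.9, 0.3],
})

# parse the string dates and align them to daily periods, as described in DateConfig's docstring
config = DateConfig(freq='D', target_data_type='period', date_format='%Y-%m-%d')
aligned = config.configure_dates(raw, date_columns='date')

# cross-sectionally z-score the factor within each date, then bucket it into 2 ntiles
zscored = factorize(aligned, partition_by=['date'], exclude=['symbol'])
bucketed = ntile(zscored, ntiles=2, partition_by=['date'], exclude=['symbol'])
print(bucketed)

The same long-format frame could then be reshaped with factor_format_for_alphalens(aligned, factor='my_factor', id_col='symbol') for alphalens-style analysis.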