├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── SECURITY.md ├── SUPPORT.md ├── caspr ├── __init__.py ├── data │ ├── __init__.py │ ├── common_dataset.py │ └── load.py ├── models │ ├── README.md │ ├── __init__.py │ ├── attention_mechanisms.py │ ├── convolutional_aggregation.py │ ├── dec.py │ ├── dense_bn_dropout.py │ ├── embedding_layer.py │ ├── factory.py │ ├── lstm_autoencoder_sequence.py │ ├── lstm_decoder.py │ ├── lstm_timeseries_tpa_attention.py │ ├── mlp.py │ ├── model_wrapper.py │ ├── multi_layer_lstm.py │ ├── transformer.py │ ├── unified_encoder.py │ └── unified_transformer_encoder.py └── utils │ ├── __init__.py │ ├── early_stopping.py │ ├── estimate_parameters.py │ ├── explain │ ├── CASPRExplainer.py │ ├── __init__.py │ └── utils.py │ ├── horovod │ ├── __init__.py │ └── train.py │ ├── metrics.py │ ├── noise.py │ ├── onnx.py │ ├── preprocess.py │ ├── score.py │ ├── segmentation │ ├── __init__.py │ ├── dec_utils.py │ └── pandas.py │ ├── spark │ ├── __init__.py │ ├── large │ │ ├── __init__.py │ │ ├── score.py │ │ └── train.py │ ├── preprocess.py │ └── score.py │ └── train.py ├── docs ├── PR_Guidelines.md └── images │ ├── caspr-logo.png │ └── caspr-poster.png ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pickle 3 | *.pth 4 | *.p 5 | *.ipynb 6 | *.csv 7 | .amlignore 8 | aml_config 9 | .git 10 | .vscodeignore 11 | azureml-logs 12 | .azureml 13 | outputs 14 | azureml-setup 15 | *:Zone.Identifier* 16 | 17 | # VSCode stuff 18 | **/*.prefs 19 | **/*.project 20 | **/*.classpath 21 | .[Vv]scode 22 | .idea/ 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Distribution / packaging 33 | .Python 34 | build/ 35 | !.azure-pipelines/build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | db.sqlite3 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | target/ 95 | 96 | # Jupyter Notebook 97 | .ipynb_checkpoints 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .vs/* 130 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: CASPR 3 | message: "Please use this information to cite CASPR in 4 | research or other publications." 5 | authors: 6 | - given-names: Pin-Jung 7 | family-names: Chen 8 | email: pinjung.chen@microsoft.com 9 | affiliation: Microsoft Corporation 10 | - given-names: Sahil 11 | family-names: Bhatnagar 12 | email: sahil.bhatnagar@microsoft.com 13 | affiliation: Microsoft Corporation 14 | - given-names: Damian Konrad 15 | family-names: Kowalczyk 16 | email: damian.kowalczyk@microsoft.com 17 | affiliation: Microsoft Corporation 18 | - given-names: Mayank 19 | family-names: Shrivastava 20 | email: mayank.shrivastava@microsoft.com 21 | affiliation: Microsoft Corporation 22 | - given-names: Sagar 23 | family-names: Goyal 24 | email: goyalsagar@outlook.com 25 | 26 | date-released: 2022-11-16 27 | repository-code: "https://github.com/microsoft/CASPR" 28 | license: "MIT" 29 | keywords: 30 | - deep learning 31 | - machine learning 32 | - tabular data 33 | 34 | version: 0.2.6 35 | doi: 10.48550/arXiv.2211.09174 36 | references: 37 | - type: article 38 | authors: 39 | - given-names: Pin-Jung 40 | family-names: Chen 41 | email: pinjung.chen@microsoft.com 42 | affiliation: Microsoft Corporation 43 | - given-names: Sahil 44 | family-names: Bhatnagar 45 | email: sahil.bhatnagar@microsoft.com 46 | affiliation: Microsoft Corporation 47 | - given-names: Damian Konrad 48 | family-names: Kowalczyk 49 | email: damian.kowalczyk@microsoft.com 50 | affiliation: Microsoft Corporation 51 | - given-names: Mayank 52 | family-names: Shrivastava 53 | email: mayank.shrivastava@microsoft.com 54 | affiliation: Microsoft Corporation 55 | - given-names: Sagar 56 | family-names: Goyal 57 | email: goyalsagar@outlook.com 58 | title: "CASPR: Customer Activity Sequence-based Prediction and Representation" 59 | year: 2022 60 | journal: ArXiv 61 | doi: 10.48550/arXiv.2211.09174 62 | url: https://arxiv.org/abs/2211.09174 63 | 64 | abstract: >- 65 | Tasks critical to enterprise profitability, such as customer churn prediction, fraudulent account detection or customer lifetime value estimation, are often tackled by models trained on features engineered from customer data in 
tabular format. Application-specific feature engineering adds development, operationalization and maintenance costs over time. Recent advances in representation learning present an opportunity to simplify and generalize feature engineering across applications. When applying these advancements to tabular data researchers deal with data heterogeneity, variations in customer engagement history or the sheer volume of enterprise datasets. In this paper, we propose a novel approach to encode tabular data containing customer transactions, purchase history and other interactions into a generic representation of a customer's association with the business. We then evaluate these embeddings as features to train multiple models spanning a variety of applications. CASPR, Customer Activity Sequence-based Prediction and Representation, applies Transformer architecture to encode activity sequences to improve model performance and avoid bespoke feature engineering across applications. Our experiments at scale validate CASPR for both small and large enterprise applications. 66 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We're looking for your help to improve CASPR (bug fixes, new features, documentation, etc). 4 | 5 | ## Contribute a code change 6 | * Start by reading the [CASPR Paper](https://arxiv.org/abs/2211.09174) 7 | * If your change is non-trivial or introduces new public facing APIs (discussed in more detail below) please use the [feature request issue template](https://github.com/microsoft/CASPR/issues/new?template=feature_request.md) to discuss it with the team and get consensus on the basic design and direction first. For all other changes, you can directly create a pull request (PR) and we'll be happy to take a look. 8 | * Make sure your PR adheres to the [PR Guidelines](./docs/PR_Guidelines.md) established by the team. 9 | * If you're unsure about any of the above and want to contribute, you're welcome to start a discussion with the team. 10 | 11 | ## Process details 12 | 13 | Please search the [issue tracker](https://github.com/microsoft/CASPR/issues) for a similar idea first: there may already be an issue you can contribute to. 14 | 15 | 1. **Create Issue** 16 | To propose a new feature or API please start by filing a new issue in the [issue tracker](https://github.com/microsoft/CASPR/issues). 17 | Include as much detail as you have. It's fine if it's not a complete design: just a summary and rationale is a good starting point. 18 | 19 | 2. **Discussion** 20 | We'll keep the issue open for community discussion until it has been resolved or is deemed no longer relevant. 
21 | Note that if an issue isn't a high priority or has many open questions then it might stay open for a long time. 22 | 23 | 3. **Owner Review** 24 | The CASPR team will review the proposal and either approve or close the issue based on whether it broadly aligns with the CASPR Roadmap and contribution guidelines. 25 | 26 | 4. **Implementation** 27 | * A feature can be implemented by you, the CASPR team, or other community members. Code contributions are greatly appreciated: feel free to work on any reviewed feature you proposed, or choose one in the backlog and send us a PR. If you are new to the project and want to work on an existing issue, we recommend starting with issues that are tagged with “good first issue”. Please let us know in the issue comments if you are actively working on implementing a feature so we can ensure it's assigned to you. 28 | * Unit tests: New code *must* be accompanied by unit tests. 29 | * Documentation and sample updates: If the PR affects any of the documentation or samples then include those updates in the same PR. 30 | 31 | * Once a feature is complete and tested according to the contribution guidelines follow these steps: 32 | * Follow the [standard GitHub process to open a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests) 33 | * Add reviewers who have context from the earlier discussion. If you can't find a reviewer, add 'microsoft/CASPR'. 34 | * Note: After creating a pull request, you might not see a build getting triggered right away. One of the 35 | CASPR team members can trigger the build for you. 36 | 37 | ## Licensing guidelines 38 | 39 | This project welcomes contributions and suggestions. Most contributions require you to 40 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 41 | and actually do, grant us the rights to use your contribution. For details, visit 42 | https://cla.microsoft.com. 43 | 44 | When you submit a pull request, a CLA-bot should automatically determine whether you need 45 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 46 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 47 | 48 | ## Code of conduct 49 | 50 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 51 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 52 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 53 | 54 | ## Report a security issue 55 | 56 | Security issues and bugs should be reported privately, via email, to the Microsoft Security 57 | Response Center (MSRC) at [secure@microsoft.com](mailto:secure@microsoft.com). You should 58 | receive a response within 24 hours. If for some reason you do not, please follow up via 59 | email to ensure we received your original message. Further information, including the 60 | [MSRC PGP](https://technet.microsoft.com/en-us/security/dn606155) key, can be found in 61 | the [Security TechCenter](https://technet.microsoft.com/en-us/security/default). 62 | 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | **CASPR is a transformer-based framework for deep learning from sequential data in tabular format, most common in business applications.** 5 | 6 |

7 | Tasks critical to enterprise profitability, such as customer churn prediction, fraudulent account detection or customer lifetime value estimation, are often tackled by models trained on features engineered from customer data in tabular format. Application-specific feature engineering, however, adds development, operationalization and maintenance costs over time. Recent advances in representation learning present an opportunity to simplify and generalize feature engineering across applications.
8 |
9 | With **CASPR**, we propose a novel approach to encode sequential data in tabular format (e.g., customer transactions, purchase history and other interactions) into a generic representation of a subject's (e.g., customer's) association with the business. We evaluate these embeddings as features to train multiple models spanning a variety of applications (see: [paper](https://arxiv.org/abs/2211.09174)). CASPR, Customer Activity Sequence-based Prediction and Representation, applies the transformer architecture to encode activity sequences, improving model performance and avoiding bespoke feature engineering across applications. Our experiments at scale validate CASPR for both small and large enterprise applications.
10 |
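For a quick sense of the workflow, the sketch below builds a CASPR model with the `CASPRFactory` helper from `caspr/models/factory.py`. The column names and category counts are hypothetical, and the default hyperparameters are assumed:

```python
from caspr.models.factory import CASPRFactory

# Hypothetical schema: one categorical and one continuous activity column,
# observed over 10 time steps per customer, with no static (non-sequential) columns.
cat_cols = ["activity_type"]
cont_cols = ["amount"]
seq_cols = ["activity_type", "amount"]
non_seq_cols = []
num_activities = {"activity_type": 50}  # distinct categories per categorical column

factory = CASPRFactory(cat_cols, num_activities, cont_cols, seq_cols, non_seq_cols, seq_len=10)

# Either supported architecture can be requested by name.
model = factory.create("TransformerAutoEncoder", device="cpu")
```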

11 |
12 |
17 |
18 | ## Getting Started & Resources
19 |
20 | * **CASPR: Customer Activity Sequence-based Prediction and Representation** (NeurIPS 2022, New Orleans: Tabular Representation Learning)
21 | - [paper](https://arxiv.org/abs/2211.09174)
22 | - [poster](docs/images/caspr-poster.png)
23 |
24 | * **Build**
25 |
26 | - prerequisites: ```python==3.9, setuptools```
27 | - building the wheel: ```python setup.py build bdist_wheel```
28 |
29 | * **Installation**
30 |
31 | ```
32 | (now)
33 | pip install .\dist\AI.Models.CASPR-<version>.whl[<modifiers>]
34 |
35 | (future)
36 | pip install AI.Models.CASPR[<modifiers>]
37 | ```
38 |
39 | use any of the modifiers below to customize the installation for the target system / use case:
40 | ```
41 | horovod - for distributed training and inference on Horovod
42 | databricks - for distributed training and inference on Databricks
43 | aml - for (distributed) training and inference on Azure ML
44 | hdi - for execution on Azure HD Insights
45 | xai - to enable explainability
46 | test - for extended test execution
47 | dev - for development purposes only
48 | ```
49 | * **Examples**
50 |
51 | (TODO: can we point to a well commented one of our examples w/ or w/o data?)
52 |
53 | ## Contributions and Feedback
54 |
55 | We welcome contributions! Please see the [contribution guidelines](CONTRIBUTING.md).
56 |
57 | For feature requests or bug reports, please file a [GitHub Issue](https://github.com/Microsoft/CASPR/issues).
58 |
59 | ## Code of Conduct
60 |
61 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
62 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
63 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
64 |
65 | ## License
66 |
67 | This project is licensed under the [MIT License](LICENSE).
68 |
69 | ---
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | AI.Models.CASPR
2 | ==================
3 | This is the deep learning CASPR model.
4 |
5 | This package has been tested with Python 3.7
6 |
7 | Usage
8 | -----
9 | You need access to the Business360 artifact feed on Azure DevOps.
10 |
11 | | pip install twine keyring artifacts-keyring
12 | | pip install AI.Models.CASPR --index-url=https://powerbi.pkgs.visualstudio.com/_packaging/Business360/pypi/simple
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 
22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /caspr/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # -------------------------------------------------------------------------- 4 | 5 | """A series of modules for the CASPR deep learning AI model. 6 | 7 | Provide a longer general description of the modules in this folder here. 8 | 9 | Modules: 10 | :module1_name: A description of this specific module. 11 | """ 12 | 13 | __VERSION = "0.9.dev3" # arbitrary low dev version for local build 14 | -------------------------------------------------------------------------------- /caspr/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | 3 | # 4 | 5 | # Unless required by applicable law or agreed to in writing, software 6 | 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | 11 | # See the License for the specific language governing permissions and 12 | 13 | # limitations under the License. 14 | 15 | # 16 | 17 | # ============================================================================== 18 | -------------------------------------------------------------------------------- /caspr/data/common_dataset.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import pandas as pd 4 | import torch 5 | from torch.utils.data.dataloader import default_collate 6 | 7 | 8 | class CommonDataset(torch.utils.data.Dataset): 9 | def __init__(self, df, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, tgt_id=[]): 10 | self.len = df.shape[0] 11 | self.seq_cols = seq_cols if seq_cols else [] 12 | 13 | self.non_seq_cols = non_seq_cols 14 | self.output_col = output_col 15 | 16 | self.seq_contX = torch.tensor(df[[item for item in seq_cols if item in cont_cols]].values, dtype=torch.float32) 17 | self.seq_catX = torch.tensor(df[[item for item in seq_cols if item in cat_cols]].values, dtype=torch.long) 18 | 19 | self.seq_contX = self.seq_contX.reshape( 20 | (self.seq_contX.shape[0], int(self.seq_contX.shape[1]/time_steps), time_steps)) 21 | self.seq_contX = self.seq_contX.permute(0, 2, 1) 22 | 23 | self.seq_catX = self.seq_catX.reshape( 24 | (self.seq_catX.shape[0], int(self.seq_catX.shape[1]/time_steps), time_steps)) 25 | self.seq_catX = self.seq_catX.permute(0, 2, 1) 26 | 27 | self.non_seq_catX = torch.tensor( 28 | df[[item for item in non_seq_cols if item in cat_cols]].values, dtype=torch.long) 29 | self.non_seq_contX = torch.tensor( 30 | df[[item for item in non_seq_cols if item in cont_cols]].values, dtype=torch.float32) 31 | 32 | self.y = torch.tensor(df[output_col].values, dtype=torch.float32) 33 | 34 | self.tgt_id = df[tgt_id].values 35 | 36 | @classmethod 37 | def for_inference(cls, continuous: pd.Series, categorical: pd.Series, seq_cols, non_seq_cols, cat_cols, cont_cols, time_steps): 38 | cont_df = pd.DataFrame(continuous.values.tolist(), columns=cont_cols) 39 | cat_df = pd.DataFrame(categorical.values.tolist(), columns=cat_cols) 40 | 41 | df 
= pd.concat([cont_df, cat_df], axis=1) 42 | return cls(df, seq_cols, non_seq_cols, [], cat_cols, cont_cols, time_steps, tgt_id=[]) 43 | 44 | def __getitem__(self, index): 45 | return [self.tgt_id[index], self.y[index], self.seq_catX[index], self.seq_contX[index], self.non_seq_catX[index], self.non_seq_contX[index]] 46 | 47 | def __len__(self): 48 | return self.len 49 | 50 | 51 | def id_collate(batch): 52 | ids = [] 53 | new_batch = [] 54 | for _batch in batch: 55 | ids.append(_batch[0]) 56 | new_batch.append(_batch[1:]) 57 | ids = np.stack(ids, axis=0) 58 | return tuple([ids] + default_collate(new_batch)) 59 | -------------------------------------------------------------------------------- /caspr/data/load.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.model_selection import train_test_split 3 | from torch.utils.data import DataLoader 4 | from torch.utils.data.distributed import DistributedSampler 5 | 6 | from caspr.data.common_dataset import CommonDataset, id_collate 7 | 8 | 9 | def transform_and_load(batch, device, tgt_id_cols, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 10 | """ 11 | Transforms a batch of feature tensors from Petastorm, into input tensors for CASPR, then loads onto chosen device. 12 | """ 13 | if not batch: 14 | raise ValueError("non-empty batch of tensors required") 15 | 16 | if not int(time_steps) > 0: 17 | raise ValueError("time_steps should be a positive integer") 18 | 19 | batch_size = batch[list(batch.keys())[0]].shape[0] 20 | 21 | seq_contX_cols = [item for item in seq_cols if item in cont_cols] 22 | if seq_contX_cols: 23 | seq_contX = torch.cat([batch[c] for c in seq_contX_cols], 0).float().to(device) 24 | seq_contX = seq_contX.reshape(-1, time_steps, batch_size).T 25 | else: 26 | seq_contX = torch.zeros((batch_size, time_steps, 0), device=device).float() 27 | 28 | seq_catX_cols = [item for item in seq_cols if item in cat_cols] 29 | if seq_catX_cols: 30 | seq_catX = torch.cat([batch[c] for c in seq_catX_cols], 0).long().to(device) 31 | seq_catX = seq_catX.reshape(-1, time_steps, batch_size).T 32 | else: 33 | seq_catX = torch.zeros((batch_size, time_steps, 0), device=device).long() 34 | 35 | non_seq_catX_cols = [item for item in non_seq_cols if item in cat_cols] 36 | if non_seq_catX_cols: 37 | non_seq_catX = torch.cat([batch[c] for c in non_seq_catX_cols], 0).long().to(device) 38 | non_seq_catX = non_seq_catX.reshape(len(non_seq_catX_cols), batch_size).T 39 | else: 40 | non_seq_catX = torch.zeros(batch_size, 0, device=device).long() 41 | 42 | non_seq_contX_cols = [item for item in non_seq_cols if item in cont_cols] 43 | if non_seq_contX_cols: 44 | non_seq_contX = torch.cat([batch[c] for c in non_seq_contX_cols], 0).float().to(device) 45 | non_seq_contX = non_seq_contX.reshape(len(non_seq_contX_cols), batch_size).T 46 | else: 47 | non_seq_contX = torch.zeros(batch_size, 0, device=device).float() 48 | 49 | if output_col: 50 | y = torch.cat([batch[c] for c in output_col], 0).to(device) 51 | y = y.reshape((len(output_col), -1)).T 52 | else: 53 | y = torch.zeros(batch_size, 0, device=device).float() 54 | 55 | if tgt_id_cols: 56 | tgt_id = torch.cat([batch[c] for c in tgt_id_cols], 0).long().cpu() 57 | tgt_id = tgt_id.reshape(len(tgt_id_cols), batch_size).T.numpy() 58 | else: 59 | tgt_id = torch.zeros(batch_size, 0).long().cpu().numpy() 60 | 61 | return tgt_id, y, seq_catX, seq_contX, non_seq_catX, non_seq_contX 62 | 63 | 64 | def init_datasets(df, seq_cols, non_seq_cols, 
                  output_col, cat_cols, cont_cols, seq_len, test_ratio=0.2, seed=None):
65 |     """
66 |     Splits an incoming columnar dataframe into CASPR train and validation datasets
67 |     """
68 |
69 |     train_pd, val_pd = train_test_split(df, test_size=test_ratio, random_state=seed)
70 |
71 |     print(f"train: {len(train_pd)}, val: {len(val_pd)}")
72 |
73 |     dataset_train = CommonDataset(
74 |         train_pd, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, seq_len)
75 |
76 |     dataset_val = CommonDataset(
77 |         val_pd, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, seq_len)
78 |
79 |     return dataset_train, dataset_val
80 |
81 |
82 | def init_loaders(ds_train, ds_val, batch_size, num_workers=0, shuffle=False, pin_memory=True, world_size=1, rank=0):
83 |     """
84 |     Initializes train and validation data loaders. The loaders support distributed sampling when world_size > 1.
85 |     """
86 |
87 |     print("Initializing dataloaders... Replica: %d of %d" % (rank + 1, world_size))
88 |
89 |     val_sampler = DistributedSampler(ds_val,
90 |                                      num_replicas=world_size, rank=rank, shuffle=shuffle) if world_size > 1 else None
91 |
92 |     val_loader = DataLoader(ds_val, pin_memory=pin_memory,
93 |                             batch_size=batch_size, num_workers=num_workers, sampler=val_sampler, collate_fn=id_collate)
94 |
95 |     train_sampler = DistributedSampler(ds_train,
96 |                                        num_replicas=world_size, rank=rank, shuffle=shuffle) if world_size > 1 else None
97 |
98 |     train_loader = DataLoader(ds_train, pin_memory=pin_memory,
99 |                               batch_size=batch_size, num_workers=num_workers, sampler=train_sampler, collate_fn=id_collate)
100 |
101 |     return train_loader, val_loader
102 |
--------------------------------------------------------------------------------
/caspr/models/README.md:
--------------------------------------------------------------------------------
1 | Model architectures should follow these guidelines to support explainability.
2 |
3 | Basic changes made:
4 | 1. Every model class should have the flags - explain, interpretable_emb_non_seq and interpretable_emb_seq
5 |
6 | 2. The nn.Embedding layers and the dropout after that need to be modularised
7 | out of the model; the Seq_Cat_Embedding and Non_Seq_Cat_Embedding classes present in the Embedding_Layers.py file should be used for them
8 |
9 | 3. The input to every forward function should be a single concatenated vector
10 |
11 | 4. The activate_explainer_mode and deactivate_explainer_mode functions should be a part of every model class (also every model wrapper class)
12 |
13 |
14 | """
15 | Some notes regarding the explainer:
16 | 1. When we join multiple models to form a new model -
17 | use the activate_explainer_mode functions to call the
18 | respective functions for all constituent sub_model classes
19 |
20 | 2. Right now the architecture supports only model wrappers which join the model in a vertical fashion (the case for all our models for now)
21 |
22 | 3. The explainer modes are activated by the DLExplainer module and
23 | also deactivated by it
24 |
25 | 4. The indices-to-embedding conversion happens in the DLExplainer module
26 |
27 |
28 | """
29 |
30 | Please refer to the mlp_autoencoder.py file for an example.
--------------------------------------------------------------------------------
/caspr/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | 3 | # 4 | 5 | # Unless required by applicable law or agreed to in writing, software 6 | 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | 11 | # See the License for the specific language governing permissions and 12 | 13 | # limitations under the License. 14 | 15 | # 16 | 17 | # ============================================================================== 18 | -------------------------------------------------------------------------------- /caspr/models/attention_mechanisms.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Attention mechanisms base class.""" 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class MultiHeadAttentionLayer(nn.Module): # noqa: W0223 9 | def __init__(self, hid_dim, n_heads, dropout): 10 | """Initialize model with params.""" 11 | super().__init__() 12 | 13 | assert hid_dim % n_heads == 0 14 | 15 | self.hid_dim = hid_dim 16 | self.n_heads = n_heads 17 | self.head_dim = hid_dim // n_heads 18 | 19 | self.fc_q = nn.Linear(hid_dim, hid_dim) 20 | self.fc_k = nn.Linear(hid_dim, hid_dim) 21 | self.fc_v = nn.Linear(hid_dim, hid_dim) 22 | 23 | self.fc_o = nn.Linear(hid_dim, hid_dim) 24 | 25 | self.dropout = nn.Dropout(dropout) 26 | 27 | self.register_buffer('scale', torch.sqrt(torch.FloatTensor([self.head_dim]))) 28 | 29 | def forward(self, query, key, value, mask=None): 30 | """Run a forward pass of model over the data.""" 31 | batch_size = query.shape[0] 32 | 33 | # query = [batch size, query len, hid dim] 34 | # key = [batch size, key len, hid dim] 35 | # value = [batch size, value len, hid dim] 36 | 37 | Q = self.fc_q(query) 38 | K = self.fc_k(key) 39 | V = self.fc_v(value) 40 | 41 | # Q = [batch size, query len, hid dim] 42 | # K = [batch size, key len, hid dim] 43 | # V = [batch size, value len, hid dim] 44 | 45 | Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 46 | K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 47 | V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 48 | 49 | # Q = [batch size, n heads, query len, head dim] 50 | # K = [batch size, n heads, key len, head dim] 51 | # V = [batch size, n heads, value len, head dim] 52 | 53 | energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale 54 | 55 | # energy = [batch size, n heads, query len, key len] 56 | 57 | if mask is not None: 58 | energy = energy.masked_fill(mask == 0, -1e10) 59 | 60 | attention = torch.softmax(energy, dim=-1) 61 | 62 | # attention = [batch size, n heads, query len, key len] 63 | 64 | x = torch.matmul(self.dropout(attention), V) 65 | 66 | # x = [batch size, n heads, query len, head dim] 67 | 68 | x = x.permute(0, 2, 1, 3).contiguous() 69 | 70 | # x = [batch size, query len, n heads, head dim] 71 | 72 | x = x.view(batch_size, -1, self.hid_dim) 73 | 74 | # x = [batch size, query len, hid dim] 75 | 76 | x = self.fc_o(x) 77 | 78 | # x = [batch size, query len, hid dim] 79 | 80 | return x, attention 81 | 82 | 83 | class MultiHeadAttentionLSTMWrapper(nn.Module): # noqa: W0223 84 | def __init__(self, n_head, d_model, dropout=0.1): 85 | """Initialize model with params.""" 86 | super().__init__() 87 | 88 | self.self_attn_layer_norm = nn.LayerNorm(d_model) 89 | self.multi_head_attn = MultiHeadAttentionLayer(hid_dim=d_model, n_heads=n_head, dropout=dropout) 90 | self.dropout = nn.Dropout(dropout) 91 | 
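    # The forward pass below applies post-norm residual attention: the multi-head
    # attention output is dropout-regularized, added back onto the query, and
    # layer-normalized; the sequence dimension is then summed away, yielding one
    # fixed-size context vector of shape [batch size, d_model] per example.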
92 |     def forward(self, q, k, v, mask=None):
93 |         """Run a forward pass of model over the data."""
94 |         _q, _ = self.multi_head_attn(q, k, v, mask=mask)
95 |         # dropout, residual connection and layer norm
96 |         q = self.self_attn_layer_norm(q + self.dropout(_q))
97 |
98 |         context_vector = torch.sum(q, 1)
99 |         return context_vector
100 |
101 |
102 | class BahdanauAttention(nn.Module): # noqa: W0223
103 |     def __init__(self, hidden_size, num_directions=1):
104 |         """Initialize model with params."""
105 |
106 |         super().__init__()
107 |         self.num_directions = num_directions
108 |         self.hidden_size = hidden_size
109 |         self.fc_encoder = nn.Linear(self.num_directions*self.hidden_size, self.hidden_size, bias=False)
110 |         self.attnHidden = nn.Linear(self.hidden_size, 1)
111 |
112 |     def forward(self, enc_outputs):
113 |         """Run a forward pass of model over the data."""
114 |         tempX = torch.tanh(self.fc_encoder(enc_outputs))
115 |
116 |         alignment_scores = self.attnHidden(tempX)
117 |
118 |         attn_weights = F.softmax(alignment_scores, dim=1)
119 |         attn_weights = attn_weights.permute(0, 2, 1)
120 |
121 |         context_vector = torch.bmm(attn_weights, enc_outputs)
122 |
123 |         return context_vector
124 |
--------------------------------------------------------------------------------
/caspr/models/convolutional_aggregation.py:
--------------------------------------------------------------------------------
1 | """CNN based layer base class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | class ConvAggregation(nn.Module): # noqa: W0223
9 |     """A CNN based layer that reduces the size of our input.
10 |
11 |     It treats the sequential input like an image with a single channel and performs learned aggregation
12 |     """
13 |
14 |     def __init__(self, kernel_size=(3, 3), stride=(2, 2), max_pool_size=(2, 2), dropout_size=0.):
15 |         """Initialise the CNN layers.
16 |
17 |         Args:
18 |             kernel_size : Tuple which determines the size of the CNN kernel
19 |             stride : Tuple which determines the size of the strides in the x and y direction
20 |             max_pool_size : Tuple which determines the size of the max-pooling function
21 |             dropout_size : Value of the dropout applied after the entire processing
22 |         """
23 |         super().__init__()
24 |         self.in_channels = 1
25 |         self.out_channels = 1
26 |
27 |         self.conv_layer = nn.Conv2d(in_channels=self.in_channels,
28 |                                     out_channels=self.out_channels, kernel_size=kernel_size, stride=stride)
29 |         self.max_pool = nn.MaxPool2d(max_pool_size)
30 |         self.conv_dropout = nn.Dropout(dropout_size)
31 |
32 |     def forward(self, input_tensor):
33 |         """Run a forward pass of model over the data."""
34 |
35 |         # The CNN by default accepts the input as (batch_size, in_channels, height_img, width_img).
36 |         # We treat the sequential input as an image, but we need an additional dimension to correspond to in_channels.
37 |         # Therefore we unsqueeze an extra dimension here
38 |
39 |         input_tensor = torch.unsqueeze(input_tensor, 1)
40 |
41 |         input_tensor = torch.tanh(self.conv_layer(input_tensor))
42 |         input_tensor = self.max_pool(input_tensor)
43 |
44 |         # The CNN by default outputs as (batch_size, out_channels, height_img, width_img).
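        # With no padding, each spatial dimension shrinks as out = floor((in - kernel) / stride) + 1,
        # and the max-pool then divides it by the pool size. Illustrative example with the default
        # kernel (3, 3), stride (2, 2) and pool (2, 2): an input of (batch_size, 1, 15, 10) becomes
        # (batch_size, 1, 7, 4) after the convolution and (batch_size, 1, 3, 2) after pooling.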
45 | # We need to squeeze away the dimension we had added earlier to remain consistent 46 | 47 | input_tensor = input_tensor.squeeze(1) 48 | output_tensor = self.conv_dropout(input_tensor) 49 | 50 | return output_tensor 51 | -------------------------------------------------------------------------------- /caspr/models/dec.py: -------------------------------------------------------------------------------- 1 | """CASPR deep embedding clustering class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import Parameter 6 | 7 | from caspr.utils.preprocess import get_nonempty_tensors 8 | 9 | 10 | class ClusterAssignment(nn.Module): # noqa: W0223 11 | def __init__(self, 12 | cluster_number, 13 | embedding_dimension, 14 | alpha=1.0, 15 | cluster_centers=None): 16 | """Handle the soft assignment. 17 | 18 | For a description see in 3.1.1. in Xie/Girshick/Farhadi, where the Student's t-distribution 19 | is used to measure similarity between feature vector and each cluster centroid. 20 | 21 | Args: 22 | cluster_number (int): number of clusters 23 | embedding_dimension (int): embedding dimension of feature vectors 24 | alpha (float): parameter representing the degrees of freedom in the t-distribution, default 1.0 25 | cluster_centers (tensors): clusters centers to initialise, if None then use Xavier uniform 26 | """ 27 | super().__init__() 28 | self.embedding_dimension = embedding_dimension 29 | self.cluster_number = cluster_number 30 | self.alpha = alpha 31 | if cluster_centers is None: 32 | initial_cluster_centers = torch.zeros( 33 | self.cluster_number, 34 | self.embedding_dimension, 35 | dtype=torch.float 36 | ) 37 | nn.init.xavier_uniform_(initial_cluster_centers) 38 | else: 39 | initial_cluster_centers = cluster_centers 40 | self.cluster_centers = Parameter(initial_cluster_centers) 41 | 42 | def forward(self, batch): 43 | """Run a forward pass of model over the data. 44 | 45 | Compute the soft assignment for a batch of feature vectors, returning a batch of assignments for each cluster. 46 | 47 | Args: 48 | batch: FloatTensor of [batch size, embedding dimension] 49 | 50 | Return: 51 | FloatTensor of [batch size, number of clusters] 52 | """ 53 | norm_squared = torch.sum((batch.unsqueeze(1) - self.cluster_centers) ** 2, 2) 54 | numerator = 1.0 / (1.0 + (norm_squared / self.alpha)) 55 | power = float(self.alpha + 1) / 2 56 | numerator = numerator**power 57 | return numerator / torch.sum(numerator, dim=1, keepdim=True) 58 | 59 | 60 | class DEC(nn.Module): # noqa: W0223 61 | def __init__(self, 62 | cluster_number, 63 | hidden_dimension, 64 | enc, 65 | alpha=1): 66 | """Initialize the parts of DEC algorithm. 67 | 68 | as described in Xie/Girshick/Farhadi; this includes the AutoEncoder stage and the ClusterAssignment stage. 69 | 70 | Args: 71 | cluster_number (int): number of clusters 72 | hidden_dimension (int): hidden dimension, output of the encoder 73 | enc (nn.Module): # noqa: W0223 encoder to use 74 | alpha (float): parameter representing the degrees of freedom in the t-distribution, default 1.0 75 | """ 76 | super().__init__() 77 | self.enc = enc 78 | self.hidden_dimension = hidden_dimension 79 | self.cluster_number = cluster_number 80 | self.alpha = alpha 81 | self.assignment = ClusterAssignment(cluster_number, self.hidden_dimension, alpha) 82 | 83 | def forward(self, *args): 84 | """Compute the cluster assignment. 85 | 86 | Using the ClusterAssignment after running the batch 87 | through the encoder part of the associated AutoEncoder module. 
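        The soft assignment follows the Student's t-distribution kernel computed by
        ClusterAssignment: q_ij is proportional to (1 + ||z_i - mu_j||^2 / alpha)^(-(alpha + 1) / 2),
        normalized over the clusters j, where z_i is the encoder output and mu_j are the
        learned cluster centers.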
88 |
89 |         Args:
90 |             batch: FloatTensor of [batch size, embedding dimension]
91 |
92 |         Return:
93 |             FloatTensor of [batch size, number of clusters]
94 |         """
95 |         return self.assignment(self.enc(*args))
96 |
97 |     def run(self, # noqa : R0913
98 |             y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion): # noqa : W0613
99 |         data = (seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data)
100 |         nonempty_tensors, nonempty_idx = get_nonempty_tensors(data)
101 |         output = self(*nonempty_tensors, nonempty_idx)
102 |         target = _target_distribution(output).detach()
103 |         loss = criterion(output.log(), target) / output.shape[0]
104 |         return output, loss
105 |
106 |
107 | def _target_distribution(batch):
108 |     """Compute the target distribution p_ij, given the batch (q_ij).
109 |
110 |     3.1.3 Equation 3 of Xie/Girshick/Farhadi; this is used with the KL-divergence loss function.
111 |
112 |     Args:
113 |         batch: FloatTensor of [batch size, number of clusters]
114 |
115 |     Return:
116 |         FloatTensor of [batch size, number of clusters]
117 |     """
118 |     weight = (batch ** 2) / torch.sum(batch, 0)
119 |     return (weight.t() / torch.sum(weight, 1)).t()
120 |
--------------------------------------------------------------------------------
/caspr/models/dense_bn_dropout.py:
--------------------------------------------------------------------------------
1 | """CASPR base dense layer class."""
2 |
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | class DenseBnDropout(nn.Module): # noqa: W0223
8 |     """Dense Layers w/ dropout and batch-normalization.
9 |
10 |     A module comprising a repeated sequential structure of [Linear -> ReLU -> Batch Normalisation -> Dropout]
11 |     applied for multiple iterations through it.
12 |     When the input is a 3D tensor - batch_size x seq_len x features
13 |     When the input is a 2D tensor - batch_size x features
14 |     """
15 |
16 |     def __init__(self, lin_layer_sizes, lin_layer_dropouts, input_size):
17 |         """Initialise the layers.
18 |
19 |         Args:
20 |             lin_layer_sizes (list) : sizes of the linear layers being used across multiple iterations
21 |             lin_layer_dropouts (list) : values of the dropout layers across multiple iterations
22 |             input_size (integer) : size of the feature (last) dimension of the input tensor, i.e. batch_size x seq_len x 'input_size' for sequential input
23 |         """
24 |
25 |         super().__init__()
26 |         first_lin_layer = nn.Linear(input_size, lin_layer_sizes[0])
27 |         self.lin_layers = nn.ModuleList([first_lin_layer] +
28 |                                         [nn.Linear(lin_layer_sizes[i],
29 |                                                    lin_layer_sizes[i + 1])
30 |                                          for i in range(len(lin_layer_sizes) - 1)])
31 |         for lin_layer in self.lin_layers:
32 |             nn.init.kaiming_normal_(lin_layer.weight.data)
33 |
34 |         self.dropout_layers = nn.ModuleList([nn.Dropout(p) for p in lin_layer_dropouts])
35 |         self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])
36 |
37 |     def forward(self, input_tensor):
38 |         """Run a forward pass of model over the data."""
39 |         is_seq = input_tensor.ndim == 3
40 |
41 |         for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers, self.dropout_layers, self.bn_layers):
42 |             input_tensor = F.relu(lin_layer(input_tensor))
43 |             if is_seq:
44 |                 # permute to adjust for the BN internal structure
45 |                 input_tensor = input_tensor.permute(0, 2, 1)
46 |
47 |             input_tensor = bn_layer(input_tensor)
48 |
49 |             if is_seq:
50 |                 # permute back to maintain the original structure required for linear layer
51 |                 input_tensor = input_tensor.permute(0, 2, 1)
52 |
53 |             input_tensor = dropout_layer(input_tensor)
54 |
55 |         output_tensor = input_tensor
56 |         return output_tensor
57 |
--------------------------------------------------------------------------------
/caspr/models/embedding_layer.py:
--------------------------------------------------------------------------------
1 | """CASPR embedding layer base class."""
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 |
7 |
8 | class CategoricalEmbedding(nn.Module): # noqa: W0223
9 |     """Define embedding layers to convert categorical variable values to continuous embeddings.
10 |
11 |     Uses pytorch defined nn.Embedding layers.
12 |     The incoming data for this class has 3 dimensions - dim(1) is the number of time steps in the sequence
13 |     when used for a seq variable
14 |     When being used for a non-seq variable - data has 2 dimensions
15 |     """
16 |
17 |     def __init__(self, # noqa: R0913
18 |                  emb_dims, emb_dropout, is_seq=False, pretrained_vecs=None, freeze_pretrained=True):
19 |         """Initialise the emb layer class.
20 |
21 |         Args:
22 |             emb_dims : A list of tuples (x, y) which contains the input for the nn.Embedding layers
23 |             emb_dropout : The dropout value for the layers applied after concatenation of all the embeddings
24 |             is_seq : determines if this layer has been initialised for sequential or non-sequential data
25 |             pretrained_vecs : The tensors which contain the pretrained values.
For variables for which we don't have the
26 |                 vecs, we initialise the nn.Embedding layers and backpropagate through them
27 |             freeze_pretrained : This boolean flag determines if we freeze the pretrained embeddings and don't
28 |                 backpropagate through them
29 |         """
30 |
31 |         super().__init__()
32 |
33 |         self.emb_dims = emb_dims
34 |         self.is_seq = is_seq
35 |         self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
36 |         if pretrained_vecs is not None and len(emb_dims) > 0:
37 |             for i, v in enumerate(pretrained_vecs):
38 |                 if v is not None:
39 |                     self.emb_layers[i] = nn.Embedding.from_pretrained(v, freeze=freeze_pretrained)
40 |         self.num_classes = [x for x, _ in emb_dims]
41 |         self.emb_size = np.sum([y for _, y in emb_dims], dtype=np.int32)
42 |         self.emb_dropout_layer = nn.Dropout(emb_dropout)
43 |
44 |     def forward(self, cat_data):
45 |         """Run a forward pass of model over the data."""
46 |         cat_data = cat_data.long()
47 |         # across all rows and column i - useful for batches
48 |         cat_inp = [emb_layer(cat_data[..., i]) for i, emb_layer in enumerate(self.emb_layers)]
49 |         cat_inp = torch.cat(cat_inp, -1)
50 |         cat_inp = self.emb_dropout_layer(cat_inp)
51 |         return cat_inp
52 |
--------------------------------------------------------------------------------
/caspr/models/factory.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import numpy as np
4 | import torch.nn as nn
5 |
6 | from caspr.models.dense_bn_dropout import DenseBnDropout
7 | from caspr.models.lstm_decoder import LSTM_attention_embedding_decoder
8 | from caspr.models.model_wrapper import LSTMAutoencoder, OutputLayer, TransformerAutoEncoder
9 | from caspr.models.transformer import TransformerDecoder, TransformerEncoder
10 | from caspr.models.unified_encoder import UnifiedEncoder
11 | from caspr.models.unified_transformer_encoder import UnifiedTransformerEncoder
12 |
13 | TRANSFORMER = 'TransformerAutoEncoder'
14 | LSTM = 'LSTMAutoencoder'
15 | logger = logging.getLogger(__name__)
16 |
17 | class CASPRFactory:
18 |
19 |     def __init__(self, cat_cols_, num_activities, cont_cols_, seq_cols_, non_seq_cols_, date_cols_=[], seq_len=15, max_emb_size=25, emb_dims_non_seq=None, emb_dims_seq=None) -> None:
20 |         self.support = {
21 |             TRANSFORMER : self.__create_transformer_autoencoder__,
22 |             LSTM : self.__create_autoencoder__
23 |         }
24 |
25 |         if num_activities:
26 |             self.emb_dims_non_seq, self.emb_dims_seq = self.calculate_embedding_dimensions(num_activities, seq_cols=seq_cols_,
27 |                                                                                            non_seq_cols=non_seq_cols_,
28 |                                                                                            max_emb_size=max_emb_size)
29 |         else:
30 |             self.emb_dims_non_seq = emb_dims_non_seq
31 |             self.emb_dims_seq = emb_dims_seq
32 |
33 |         self.seq_len = seq_len
34 |
35 |         self.non_seq_cat_ = [x for x in cat_cols_ if x in non_seq_cols_]
36 |         self.seq_cat_ = [x for x in cat_cols_ if x in seq_cols_]
37 |         self.non_seq_cont_ = [x for x in cont_cols_ if x in non_seq_cols_]
38 |         self.seq_cont_ = [x for x in cont_cols_+date_cols_ if x in seq_cols_]
39 |
40 |         self.seq_cont_dim = len(set(seq_cols_) & set(cont_cols_)) + len(date_cols_)
41 |         self.non_seq_cont_dim = len(set(non_seq_cols_) & set(cont_cols_))
42 |         # Append non seq features to the end of the sequence if they exist
43 |         self.adjust_seq_len = seq_len + int(len(non_seq_cols_) > 0)
44 |
45 |     @staticmethod
46 |     def calculate_embedding_dimensions(num_activities, seq_cols=None, non_seq_cols=None, max_emb_size=25):
47 |         """Calculate the emb dims for the categorical embedding layer for each categorical variable.
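        For example (illustrative): with max_emb_size=25, a categorical column with 7 distinct
        values gets min(25, (7 + 1) // 2) = 4 embedding dimensions, while a column with 100
        distinct values is capped at 25.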
48 | 49 | Args: 50 | num_activities: number of unique activities for each categorical variable 51 | seq_cols (list): List of sequential vars 52 | non_seq_cols (list): List of non-sequential vars 53 | max_emb_size (Default = 25) : The max size of the embedding layer for a variable 54 | (needed when the possible values are very high) 55 | """ 56 | 57 | # Avoid using empty lists as default values 58 | seq_cols = [] if seq_cols is None else seq_cols 59 | non_seq_cols = [] if non_seq_cols is None else non_seq_cols 60 | 61 | cat_seq_dims = [num_activities[c] for c in num_activities.keys() if c in seq_cols] 62 | cat_non_seq_dims = [num_activities[c] for c in num_activities.keys() if c in non_seq_cols] 63 | emb_dims_non_seq = [(x, int(np.minimum(max_emb_size, (x + 1) // 2))) for x in cat_non_seq_dims] 64 | emb_dims_seq = [(x, int(np.minimum(max_emb_size, (x + 1) // 2))) for x in cat_seq_dims] 65 | 66 | return emb_dims_non_seq, emb_dims_seq 67 | 68 | def __create_transformer_autoencoder__(self, device="cuda", HIDDEN_SIZE=64, 69 | NUM_LAYERS_ENC=4, 70 | NUM_LAYERS_DEC=2, 71 | NUM_HEADS_ENC=2, 72 | NUM_HEADS_DEC=4, 73 | PF_DIM_ENC=32, 74 | PF_DIM_DEC=128, 75 | DROPOUT_ENC=0.1, 76 | DROPOUT_DEC=0.1, 77 | EMBEDDING_DROPOUT_SEQUENTIAL=0.1, 78 | EMBEDDING_DROPOUT_NON_SEQUENTIAL=0.1) -> TransformerAutoEncoder: 79 | 80 | enc = TransformerEncoder(hid_dim=HIDDEN_SIZE, n_layers=NUM_LAYERS_ENC, n_heads=NUM_HEADS_ENC, 81 | pf_dim=PF_DIM_ENC, dropout=DROPOUT_ENC, max_length=self.adjust_seq_len) 82 | 83 | dec = TransformerDecoder(hid_dim=HIDDEN_SIZE, n_layers=NUM_LAYERS_DEC, n_heads=NUM_HEADS_DEC, 84 | pf_dim=PF_DIM_DEC, dropout=DROPOUT_DEC, pos_embedding=enc.pos_embedding) 85 | 86 | emb_seq_num_classes = [x for x, _ in self.emb_dims_seq] 87 | emb_non_seq_num_classes = [x for x, _ in self.emb_dims_non_seq] 88 | 89 | output_layer = OutputLayer(HIDDEN_SIZE, self.seq_cont_dim, self.non_seq_cont_dim, 90 | emb_seq_num_classes, emb_non_seq_num_classes) 91 | 92 | unified_transformer_encoder = UnifiedTransformerEncoder(enc, 93 | self.emb_dims_non_seq, 94 | EMBEDDING_DROPOUT_NON_SEQUENTIAL, 95 | self.emb_dims_seq, 96 | EMBEDDING_DROPOUT_SEQUENTIAL, 97 | HIDDEN_SIZE, 98 | self.seq_cont_dim, 99 | self.non_seq_cont_dim, 100 | non_seq_pretrained_embs=None, 101 | freeze_non_seq_pretrained_embs=True, 102 | seq_pretrained_embs=None, 103 | freeze_seq_pretrained_embs=True) 104 | 105 | return TransformerAutoEncoder(unified_transformer_encoder, dec, output_layer).to(device) 106 | 107 | def __create_autoencoder__(self, device="cuda", HIDDEN_SIZE=64, 108 | NUM_LAYERS=1, 109 | LIN_LAYER_SIZES_NON_SEQUENTIAL=[50, 25], 110 | LIN_LAYER_SIZES_SEQUENTIAL=[50, 25], 111 | EMBEDDING_DROPOUT_NON_SEQUENTIAL=0.04, 112 | LIN_LAYER_DROPOUTS_NON_SEQUENTIAL=[0.0001, 0.01], 113 | EMBEDDING_DROPOUT_SEQUENTIAL=0.04, 114 | LIN_LAYER_DROPOUTS_SEQUENTIAL=[0.001, 0.01]) -> LSTMAutoencoder: 115 | 116 | output_dim = len(self.seq_cont_) 117 | num_classes = [x for (x, _) in self.emb_dims_seq] 118 | 119 | # Model objects initialisation 120 | encoder = UnifiedEncoder(emb_dims_non_seq=self.emb_dims_non_seq, 121 | emb_dropout_non_seq=EMBEDDING_DROPOUT_NON_SEQUENTIAL, 122 | emb_dims_seq=self.emb_dims_seq, 123 | emb_dropout_seq=EMBEDDING_DROPOUT_SEQUENTIAL, 124 | emb_lin_layer_sizes_non_seq=LIN_LAYER_SIZES_NON_SEQUENTIAL, 125 | emb_lin_layer_dropouts_non_seq=LIN_LAYER_DROPOUTS_NON_SEQUENTIAL, 126 | emb_lin_layer_sizes_seq=LIN_LAYER_SIZES_SEQUENTIAL, 127 | emb_lin_layer_dropouts_seq=LIN_LAYER_DROPOUTS_SEQUENTIAL, 128 | lstm_hidden_size=HIDDEN_SIZE, 129 | 
output_size=output_dim, 130 | seq_len=self.seq_len, 131 | non_seq_cont_count=len(self.non_seq_cont_), 132 | seq_cat_count=len(self.seq_cat_), 133 | seq_cont_count=len(self.seq_cont_), 134 | non_seq_cat_count=len(self.non_seq_cat_)) 135 | 136 | input_dim = int(encoder.seq_cont_count + encoder.no_of_embs_seq) 137 | 138 | decoder = LSTM_attention_embedding_decoder(input_dim=input_dim, 139 | hidden_size=HIDDEN_SIZE, 140 | num_layers=NUM_LAYERS, 141 | output_dim=output_dim, 142 | num_classes=num_classes) 143 | 144 | mlp_non_seq_cat_list = [] 145 | 146 | for non_seq_cat, _ in self.emb_dims_non_seq: 147 | mlp_non_seq_cat_list.append(DenseBnDropout(LIN_LAYER_SIZES_NON_SEQUENTIAL+[ 148 | non_seq_cat], LIN_LAYER_DROPOUTS_NON_SEQUENTIAL+[0], HIDDEN_SIZE)) 149 | mlp_non_seq_cont = DenseBnDropout( 150 | LIN_LAYER_SIZES_NON_SEQUENTIAL, LIN_LAYER_DROPOUTS_NON_SEQUENTIAL, HIDDEN_SIZE) 151 | 152 | autoenc = LSTMAutoencoder(encoder, mlp_non_seq_cat_list, mlp_non_seq_cont, decoder).to(device) 153 | 154 | return autoenc 155 | 156 | def create(self, architecture: str, device="cuda", **hyperparams) -> nn.Module: 157 | if architecture not in self.support: 158 | raise ValueError("Unknown architecture specified. Model Factory currently supports: %s Requested: %s" % (str(self.support.keys()), architecture)) 159 | 160 | constructor_f = self.support[architecture] 161 | 162 | logger.info("Initializing CASPR with %s architecture. Hyperparams provided: %s" % (architecture, hyperparams)) 163 | 164 | return constructor_f(device, **hyperparams) 165 | -------------------------------------------------------------------------------- /caspr/models/lstm_autoencoder_sequence.py: -------------------------------------------------------------------------------- 1 | """Bahdanau attention based LSTM encoder.""" 2 | 3 | import warnings 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | import caspr.models 10 | 11 | warnings.simplefilter('ignore') 12 | 13 | 14 | class LSTM_attention_embedding_encoder_sequence(nn.Module): # noqa: W0223 15 | """Luong/Bahdanau attention based LSTM encoder.""" 16 | 17 | def __init__(self, # noqa: R0913, R0914 18 | emb_dims_non_seq, 19 | emb_dims_seq, 20 | lin_layer_sizes_non_sequential, 21 | lin_layer_sizes_sequential, 22 | hidden_size, 23 | num_layers, 24 | bidirectional, 25 | output_size, 26 | emb_dropout_non_seq, 27 | lin_layer_dropouts_non_sequential, 28 | emb_dropout_seq, 29 | lin_layer_dropouts_sequential, 30 | lin_layer_sizes_fin, 31 | lin_layer_dropouts_fin, # noqa: W0613 32 | seq_len, input_dim, 33 | non_seq_cont_count, seq_cat_count, seq_cont_count, non_seq_cat_count, 34 | device): 35 | """Initialise the pytorch LSTM layer. 36 | 37 | Args: 38 | emb_dims_non_seq, emb_dims_seq (list of int tuples): 39 | List of category dimension and corresponding embedding size. 
40 | lin_layer_sizes_non_sequential, lin_layer_sizes_sequential (list of int tuples): 41 | List of [m1*m2] tuples for embedding dimension reduction and non-linearity 42 | emb_dropout_non_seq, emb_dropout_seq (float): dropout values for embedding layers 43 | lin_layer_dropouts_non_seq, lin_layer_dropouts_seq (list of float): 44 | dropout values for linear layers corresponding to embedding layers 45 | hidden_size (int): Size of the hidden state 46 | num_layers (int): Number of stacked LSTM layers 47 | bidirectional (bool): Flag for bi/uni LSTM 48 | output_size (int): Size of the final output layer 49 | lin_layer_sizes_fin (list of int tuples): 50 | List of [m1*m2] tuples for non-linear combination of sequential and nonsequential inputs 51 | seq_len (int): Length of input Sequence 52 | """ 53 | super().__init__() 54 | 55 | self.device = device 56 | self.non_seq_emb_layers = nn.ModuleList( 57 | [nn.Embedding(x, y) for x, y in emb_dims_non_seq]) 58 | self.seq_emb_layers = nn.ModuleList( 59 | [nn.Embedding(x, y) for x, y in emb_dims_seq]) 60 | self.no_of_embs_non_seq = sum([y for x, y in emb_dims_non_seq]) 61 | self.no_of_embs_seq = sum([y for x, y in emb_dims_seq]) 62 | self.input_dim = input_dim 63 | self.seq_len = seq_len 64 | self.hidden_size = hidden_size 65 | self.non_seq_cont_count = non_seq_cont_count 66 | self.non_seq_cat_count = non_seq_cat_count 67 | self.context_vector_size = hidden_size 68 | self.output_dim = output_size 69 | self.num_layers = num_layers 70 | self.num_directions = 2 if bidirectional else 1 71 | 72 | self.seq_cat_count = seq_cat_count 73 | self.seq_cont_count = seq_cont_count 74 | self.non_seq_cat_count = non_seq_cat_count 75 | self.non_seq_cont_count = non_seq_cont_count 76 | 77 | # Linear Layers for non_seq_data parallel to LSTM 78 | if self.no_of_embs_non_seq != 0: 79 | first_lin_layer = nn.Linear(self.no_of_embs_non_seq, lin_layer_sizes_non_sequential[0]) 80 | self.lin_layersnon_sequential = nn.ModuleList([first_lin_layer] + 81 | [nn.Linear(lin_layer_sizes_non_sequential[i], 82 | lin_layer_sizes_non_sequential[i + 1]) 83 | for i in range(len(lin_layer_sizes_non_sequential) - 1)]) 84 | for lin_layer in self.lin_layersnon_sequential: 85 | nn.init.kaiming_normal_(lin_layer.weight.data) 86 | 87 | self.emb_dropout_layer_non_sequential = nn.Dropout(emb_dropout_non_seq) 88 | self.dropout_layersnon_sequential = nn.ModuleList( 89 | [nn.Dropout(size) for size in lin_layer_dropouts_non_sequential]) 90 | self.bn_layersnon_sequential = nn.ModuleList( 91 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_non_sequential]) 92 | 93 | # Linear Layers for seq_cat_data 94 | if self.no_of_embs_seq != 0: 95 | first_lin_layer_seq = nn.Linear(self.no_of_embs_seq, lin_layer_sizes_sequential[0]) 96 | self.lin_layers_seq = nn.ModuleList([first_lin_layer_seq] + 97 | [nn.Linear(lin_layer_sizes_sequential[i], 98 | lin_layer_sizes_sequential[i + 1]) 99 | for i in range(len(lin_layer_sizes_sequential) - 1)]) 100 | for lin_layer in self.lin_layers_seq: 101 | nn.init.kaiming_normal_(lin_layer.weight.data) 102 | 103 | self.emb_dropout_layer_seq = nn.Dropout(emb_dropout_seq) 104 | self.dropout_layers_seq = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts_sequential]) 105 | self.bn_layers_seq = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes_sequential]) 106 | 107 | # Output Layer 108 | self.output_layer = nn.Linear(lin_layer_sizes_fin[-1], output_size) 109 | nn.init.kaiming_normal_(self.output_layer.weight.data) 110 | 111 | # LSTM layer 112 | self.lstmLayer = nn.LSTM( 
113 | self.input_dim + lin_layer_sizes_sequential[-1], 114 | self.hidden_size, self.num_layers, batch_first=True, bidirectional=bidirectional) 115 | # self.lstmLayer = nn.LSTM( 116 | # self.input_dim+self.no_of_embs_seq, 117 | # self.hidden_size, self.num_layers, batch_first=True, bidirectional=bidirectional) 118 | 119 | # Linear Layers post LSTM 120 | self.lin_layer_lstm_to_dense = nn.Linear( 121 | self.num_directions*self.hidden_size, self.hidden_size) 122 | 123 | # Attention 124 | self.bahdanau_attention_layer = caspr.models.attention_mechanisms.BahdanauAttention( 125 | self.hidden_size, self.num_directions) 126 | 127 | # self.fc_encoder = nn.Linear( 128 | # self.num_directions*self.hidden_size, self.hidden_size, bias=False) 129 | 130 | # self.attnHidden = nn.Linear(self.hidden_size, 1) 131 | 132 | self.fin_layer = nn.Linear( 133 | self.num_directions*self.hidden_size + 134 | self.context_vector_size + self.no_of_embs_non_seq + self.non_seq_cont_count, hidden_size) 135 | # self.fin_layer = nn.Linear( 136 | # self.num_directions*self.hidden_size + self.context_vector_size , hidden_size) 137 | 138 | def forward(self, input_tensor): # noqa : R0914 139 | """Run a forward pass of model over the data.""" 140 | seq_cat_index = self.seq_len * self.seq_cat_count 141 | seq_cont_index = seq_cat_index + self.seq_len * self.seq_cont_count 142 | non_seq_cat_index = seq_cont_index + self.non_seq_cat_count 143 | non_seq_cont_index = non_seq_cat_index + self.non_seq_cont_count 144 | 145 | seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = input_tensor[:, :seq_cat_index], \ 146 | input_tensor[:, seq_cat_index: seq_cont_index], \ 147 | input_tensor[:, seq_cont_index: non_seq_cat_index], \ 148 | input_tensor[:, non_seq_cat_index: non_seq_cont_index] 149 | seq_cat_data = seq_cat_data.type(torch.LongTensor) 150 | seq_cat_data = seq_cat_data.reshape( 151 | seq_cat_data.shape[0], self.seq_len, int(seq_cat_data.shape[1]/self.seq_len)) 152 | seq_cont_data = seq_cont_data.reshape( 153 | seq_cont_data.shape[0], self.seq_len, int(seq_cont_data.shape[1]/self.seq_len)) 154 | 155 | if self.no_of_embs_non_seq != 0: 156 | non_seq_cat_data = non_seq_cat_data.type( 157 | torch.LongTensor).to(self.device) 158 | # across all rows and column i - useful for batches 159 | non_seq_cat_inp = [emb_layer(non_seq_cat_data[:, i]) 160 | for i, emb_layer in enumerate(self.non_seq_emb_layers)] 161 | non_seq_cat_inp = torch.cat(non_seq_cat_inp, 1) 162 | non_seq_cat_inp = self.emb_dropout_layer_non_sequential(non_seq_cat_inp) 163 | if self.non_seq_cont_count != 0: 164 | non_seq_inp = torch.cat((non_seq_cat_inp.type(torch.FloatTensor).to( 165 | self.device), non_seq_cont_data.type(torch.FloatTensor).to(self.device)), 1) 166 | else: 167 | non_seq_inp = non_seq_cat_inp.type(torch.FloatTensor).to(self.device) 168 | elif self.non_seq_cont_count != 0: 169 | non_seq_inp = non_seq_cont_data.type(torch.FloatTensor).to(self.device) 170 | 171 | if self.no_of_embs_seq != 0: 172 | seq_cat_data = seq_cat_data.type( 173 | torch.LongTensor).to(self.device) 174 | # across all rows and column i - useful for batches 175 | seq_cat_inp = [emb_layer(seq_cat_data[:, :, i]) 176 | for i, emb_layer in enumerate(self.seq_emb_layers)] 177 | # shape = batchsize * seq_len * 16(emb size) 178 | seq_cat_inp = torch.cat(seq_cat_inp, 2) 179 | seq_cat_inp = self.emb_dropout_layer_seq(seq_cat_inp) 180 | seq_cat_inp_emb = seq_cat_inp 181 | for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers_seq, 182 | self.dropout_layers_seq, self.bn_layers_seq): 
183 | seq_cat_inp_emb = F.relu(lin_layer(seq_cat_inp_emb)) 184 | seq_cat_inp_emb = torch.cat([bn_layer(seq_cat_inp_emb[:, i, :]).unsqueeze(1) 185 | for i in range(self.seq_len)], 1) 186 | seq_cat_inp_emb = dropout_layer(seq_cat_inp_emb) 187 | 188 | seq_cat_inp_emb = seq_cat_inp_emb.to(self.device) 189 | # shape seq_cat = batchsize * seq_len * emb size/lin_layers_seq[-1].shape 190 | # shape seq_cont = batchsize * seq_len * data 191 | 192 | seq_data = torch.cat([seq_cat_inp_emb, seq_cont_data], 2) 193 | 194 | # now the sequential data 195 | inp_tens = seq_data 196 | 197 | temp_batch_size = inp_tens.size()[0] 198 | 199 | h0 = torch.zeros(self.num_directions*self.num_layers, temp_batch_size, self.hidden_size).to( 200 | self.device).requires_grad_() 201 | c0 = torch.zeros(self.num_directions*self.num_layers, temp_batch_size, self.hidden_size).to( 202 | self.device).requires_grad_() 203 | 204 | output, (hn, cn) = self.lstmLayer(inp_tens, (h0, c0)) 205 | # passes through the embedding layer to generate the required embeddings 206 | # attention weight calculation 207 | 208 | # tempX = torch.tanh(self.fc_encoder(output)) 209 | # alignment_scores = self.attnHidden(tempX) 210 | # attn_weights = F.softmax(alignment_scores, dim=1) 211 | # attn_weights = attn_weights.permute(0, 2, 1) 212 | # context_vector = torch.bmm(attn_weights, output) 213 | 214 | context_vector = self.bahdanau_attention_layer(output) 215 | 216 | hn = hn.view(self.num_layers, self.num_directions, - 217 | 1, self.hidden_size).to(self.device) 218 | cn_ = cn.view(self.num_layers, self.num_directions, - 219 | 1, self.hidden_size).to(self.device) 220 | if self.num_directions > 1: 221 | seq_inp = self.lin_layer_lstm_to_dense(torch.cat( 222 | [hn[self.num_layers-1, 0], hn[self.num_layers-1, -1]], 1).unsqueeze(0)) 223 | else: 224 | seq_inp = self.lin_layer_lstm_to_dense( 225 | hn[self.num_layers-1, 0]).unsqueeze(0) 226 | 227 | seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2]) 228 | 229 | context_vector = context_vector.reshape( 230 | context_vector.size()[0], context_vector.size()[2]) 231 | 232 | fin_input = torch.cat((non_seq_inp, seq_inp, context_vector), 1) 233 | # fin_input = torch.cat((seq_inp, context_vector), 1) 234 | 235 | hn_ = F.relu(self.fin_layer(fin_input)) 236 | 237 | return output, (hn_, cn_[self.num_layers-1, 0, :, :].unsqueeze(0)) 238 | -------------------------------------------------------------------------------- /caspr/models/lstm_decoder.py: -------------------------------------------------------------------------------- 1 | """CASPR LSTM decoder base class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class LSTM_attention_embedding_decoder(nn.Module): # noqa: W0223 9 | """Simple LSTM decoder.""" 10 | 11 | def __init__(self, # noqa: R0913 12 | input_dim, 13 | hidden_size, 14 | output_dim, 15 | num_classes, 16 | num_layers=1): 17 | """Initialize model with params.""" 18 | super().__init__() 19 | 20 | self.input_dim = input_dim 21 | self.hidden_size = hidden_size 22 | self.num_layers = num_layers 23 | self.num_classes = num_classes 24 | self.output_dim = output_dim 25 | 26 | # LSTM layer 27 | self.lstm_layer = nn.LSTM( 28 | input_dim, hidden_size, num_layers, batch_first=True) 29 | 30 | self.linear = nn.Linear(self.hidden_size, output_dim) 31 | 32 | self.output = nn.ModuleList([nn.Linear(self.hidden_size, num_class) for num_class in self.num_classes]) 33 | self.hidden = None 34 | 35 | def forward(self, inp, hidden): 36 | """Forward pass through LSTM 
layer. 37 | 38 | shape of lstm_out: [batch_size, 1, hidden_size], since the layer is batch_first 39 | shape of self.hidden: (a, b), where a and b both 40 | have shape (num_layers, batch_size, hidden_size). 41 | """ 42 | inp = inp.view(inp.shape[0], 1, -1) 43 | self.hidden = hidden 44 | 45 | lstm_out, self.hidden = self.lstm_layer(inp, self.hidden) 46 | decoder_out = (torch.tanh(lstm_out[:, -1, :])) 47 | 48 | y_pred = self.linear(decoder_out) 49 | out_cont = F.relu(y_pred) 50 | # out_cat = self.output(decoder_out) 51 | out_cat = [ # one output head per sequential categorical variable 52 | output_layer(decoder_out) for output_layer in self.output 53 | ] 54 | # out_cat = torch.cat(out_cat, -1) 55 | 56 | 57 | return out_cont, self.hidden, out_cont, out_cat 58 | -------------------------------------------------------------------------------- /caspr/models/lstm_timeseries_tpa_attention.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """TPA attention based LSTM encoder.""" 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class LSTM_TPA_attention_timeseries(nn.Module): # noqa: W0223 10 | """TPA attention based LSTM encoder.""" 11 | 12 | def __init__(self, # noqa: R0913 13 | emb_dims_non_seq, 14 | emb_dims_seq, 15 | lin_layer_sizes_non_sequential, 16 | lin_layer_sizes_sequential, 17 | non_seq_cont_count, 18 | hidden_size, 19 | output_size, 20 | emb_dropout_non_seq, 21 | lin_layer_dropouts_non_sequential, 22 | emb_dropout_seq, 23 | lin_layer_dropouts_sequential, 24 | lin_layer_sizes_fin, 25 | lin_layer_dropouts_fin, 26 | seq_len, input_dim, device): 27 | """Initialise the pytorch LSTM layer. 28 | 29 | Args: 30 | emb_dims_non_seq, emb_dims_seq (list of int tuples): 31 | List of category dimension and corresponding embedding size. 
32 | lin_layer_sizes_non_sequential, lin_layer_sizes_sequential (list of int tuples): 33 | List of [m1*m2] tuples for embedding dimension reduction and non-linearity 34 | emb_dropout_non_seq, emb_dropout_seq (float): dropout values for embedding layers 35 | lin_layer_dropouts_non_sequential, lin_layer_dropouts_sequential (list of float): 36 | dropout values for linear layers corresponding to embedding layers 37 | hidden_size (int): Size of the hidden state 38 | output_size (int): Size of the final output layer 39 | lin_layer_sizes_fin (list of int tuples): 40 | List of [m1*m2] tuples for non-linear combination of sequential and nonsequential inputs 41 | seq_len (int): Length of input Sequence 42 | """ 43 | 44 | super().__init__() 45 | 46 | self.device = device 47 | self.non_seq_emb_layers = nn.ModuleList( 48 | [nn.Embedding(x, y) for x, y in emb_dims_non_seq]) 49 | self.seq_emb_layers = nn.ModuleList( 50 | [nn.Embedding(x, y) for x, y in emb_dims_seq]) 51 | self.no_of_embs_non_seq = sum([y for x, y in emb_dims_non_seq]) 52 | self.no_of_embs_seq = sum([y for x, y in emb_dims_seq]) 53 | self.input_dim = input_dim 54 | self.seq_len = seq_len 55 | self.hidden_size = hidden_size 56 | self.non_seq_cont_count = non_seq_cont_count 57 | self.context_vector_size = hidden_size 58 | self.output_dim = output_size 59 | 60 | if self.no_of_embs_non_seq != 0: 61 | first_lin_layer = nn.Linear(self.no_of_embs_non_seq, lin_layer_sizes_non_sequential[0]) 62 | self.lin_layersnon_sequential = nn.ModuleList([first_lin_layer] + 63 | [nn.Linear(lin_layer_sizes_non_sequential[i], 64 | lin_layer_sizes_non_sequential[i + 1]) 65 | for i in range(len(lin_layer_sizes_non_sequential) - 1)]) 66 | for lin_layer in self.lin_layersnon_sequential: 67 | nn.init.kaiming_normal_(lin_layer.weight.data) 68 | 69 | self.emb_dropout_layer_non_sequential = nn.Dropout(emb_dropout_non_seq) 70 | self.dropout_layersnon_sequential = nn.ModuleList( 71 | [nn.Dropout(size) for size in lin_layer_dropouts_non_sequential]) 72 | self.bn_layersnon_sequential = nn.ModuleList( 73 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_non_sequential]) 74 | 75 | # Linear Layers for seq_cat_data 76 | if self.no_of_embs_seq != 0: 77 | first_lin_layer_seq = nn.Linear(self.no_of_embs_seq, lin_layer_sizes_sequential[0]) 78 | self.lin_layers_seq = nn.ModuleList([first_lin_layer_seq] + 79 | [nn.Linear(lin_layer_sizes_sequential[i], 80 | lin_layer_sizes_sequential[i + 1]) 81 | for i in range(len(lin_layer_sizes_sequential) - 1)]) 82 | for lin_layer in self.lin_layers_seq: 83 | nn.init.kaiming_normal_(lin_layer.weight.data) 84 | 85 | self.emb_dropout_layer_seq = nn.Dropout(emb_dropout_seq) 86 | self.dropout_layers_seq = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts_sequential]) 87 | self.bn_layers_seq = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes_sequential]) 88 | 89 | # Output Layer 90 | self.output_layer = nn.Linear(lin_layer_sizes_fin[-1], output_size) 91 | 92 | 93 | 94 | 95 | nn.init.kaiming_normal_(self.output_layer.weight.data) 96 | 97 | # definitions for data parsing 98 | # primarily required to make sure embeddings are used for categorical data 99 | 100 | # LSTM layer 101 | self.lstm_layer = nn.LSTM( 102 | self.input_dim + lin_layer_sizes_non_sequential[-1], self.hidden_size, batch_first=True) 103 | 104 | # Linear Layers post LSTM 105 | self.lin_layer_lstm_to_dense = nn.Linear( 106 | self.hidden_size, self.hidden_size) 107 | 108 | # TPA attention
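# A sketch of the idea implemented below (all names taken from this file's forward pass):
# each of the hidden_size Conv1d filters slides over the length-seq_len history of one
# hidden-state channel, yielding a matrix hc of shape [batch, hidden_size, hidden_size];
# each row of hc is then scored against the final hidden state hn through the learned
# tpa_linear map, the sigmoid-weighted rows are summed into a context vector vt, and
# tpa_context_linear mixes vt back with tpa_hiddent_linear(hn) to form the attended
# representation htprime.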
109 | self.convolution_filters = nn.ModuleList([nn.Conv1d( 110 | in_channels=1, out_channels=1, kernel_size=self.seq_len) for _ in range(hidden_size)]) 111 | self.tpa_linear = nn.Linear( 112 | self.hidden_size, self.hidden_size, bias=False) 113 | 114 | self.tpa_hiddent_linear = nn.Linear( 115 | self.hidden_size, self.hidden_size, bias=False) 116 | self.tpa_context_linear = nn.Linear( 117 | self.hidden_size, self.hidden_size, bias=False) 118 | # Final MLP 119 | first_fin_layer = nn.Linear(self.hidden_size + self.context_vector_size + 120 | self.no_of_embs_non_seq + self.non_seq_cont_count, lin_layer_sizes_fin[0]) 121 | 122 | self.lin_layers_final = nn.ModuleList([first_fin_layer] + 123 | [nn.Linear(lin_layer_sizes_fin[i], 124 | lin_layer_sizes_fin[i + 1]) 125 | for i in range(len(lin_layer_sizes_fin) - 1)]) 126 | for lin_layer in self.lin_layers_final: 127 | nn.init.kaiming_normal_(lin_layer.weight.data) 128 | 129 | # final dropout and batch norm layers for final prediction 130 | self.dropout_layers_final = nn.ModuleList( 131 | [nn.Dropout(size) for size in lin_layer_dropouts_fin]) 132 | self.bn_layers_final = nn.ModuleList( 133 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_fin]) 134 | 135 | 136 | 137 | 138 | def forward(self, seq_cont_data, seq_cat_data, non_seq_cat_data, non_seq_cont_data): # noqa : R0914 139 | """Run a forward pass of model over the data.""" 140 | 141 | if self.no_of_embs_non_seq != 0: 142 | non_seq_cat_data = non_seq_cat_data.type( 143 | torch.LongTensor).to(self.device) 144 | # across all rows and column i - useful for batches 145 | non_seq_cat_inp = [emb_layer(non_seq_cat_data[:, i]) 146 | for i, emb_layer in enumerate(self.non_seq_emb_layers)] 147 | non_seq_cat_inp = torch.cat(non_seq_cat_inp, 1) 148 | non_seq_cat_inp = self.emb_dropout_layer_non_sequential(non_seq_cat_inp) 149 | if self.non_seq_cont_count != 0: 150 | non_seq_inp = torch.cat((non_seq_cat_inp.type(torch.FloatTensor).to( 151 | self.device), non_seq_cont_data.type(torch.FloatTensor).to(self.device)), 1) 152 | else: 153 | non_seq_inp = non_seq_cat_inp.type(torch.FloatTensor).to(self.device) 154 | elif self.non_seq_cont_count != 0: 155 | non_seq_inp = non_seq_cont_data.type(torch.FloatTensor).to(self.device) 156 | 157 | if self.no_of_embs_seq != 0: 158 | seq_cat_data = seq_cat_data.type( 159 | torch.LongTensor).to(self.device) 160 | # across all rows and column i - useful for batches 161 | seq_cat_inp = [emb_layer(seq_cat_data[:, :, i]) 162 | for i, emb_layer in enumerate(self.seq_emb_layers)] 163 | seq_cat_inp = torch.cat(seq_cat_inp, 2) 164 | seq_cat_inp = self.emb_dropout_layer_seq(seq_cat_inp) 165 | 166 | seq_cat_inp_emb = seq_cat_inp 167 | 168 | seq_cat_inp_emb = seq_cat_inp_emb.to(self.device) 169 | 170 | seq_data = torch.cat([seq_cat_inp_emb, seq_cont_data], 2) 171 | 172 | # now the sequential data ------------------------------ 173 | inp_tens = seq_data 174 | 175 | temp_batch_size = inp_tens.size()[0] 176 | 177 | h0 = torch.zeros(1, temp_batch_size, self.hidden_size).to( 178 | self.device).requires_grad_() 179 | c0 = torch.zeros(1, temp_batch_size, self.hidden_size).to( 180 | self.device).requires_grad_() 181 | 182 | output, (hn, _) = self.lstm_layer(inp_tens, (h0, c0)) 183 | hn = hn.to(self.device) 184 | # the hidden states across time steps form the H matrix used by TPA below 185 | # output shape batch_size * seq_len * hidden_size 186 | # output[:,:,i] shape batch_size * seq_len - row i of the H matrix 187 | 
hc = torch.zeros(temp_batch_size, self.hidden_size, 188 | self.hidden_size).to(self.device) 189 | 190 | for i in range(self.hidden_size): 191 | for j in range(self.hidden_size): 192 | hc[:, i, j] = self.convolution_filters[j]( 193 | output[:, :, i].unsqueeze(1)).squeeze() 194 | 195 | alpha = torch.zeros(temp_batch_size, self.hidden_size).to(self.device) 196 | 197 | for i in range(self.hidden_size): 198 | temp1 = self.tpa_linear(hc[:, i]).unsqueeze(1) 199 | temp2 = hn.squeeze().unsqueeze(2) 200 | temp = torch.bmm(temp1, temp2) 201 | alpha[:, i] = F.sigmoid(temp).squeeze() 202 | 203 | vt = torch.zeros(temp_batch_size, self.hidden_size).to(self.device) 204 | for i in range(self.hidden_size): 205 | temp = torch.bmm(alpha[:, i].unsqueeze(1).unsqueeze( 206 | 2), hc[:, i].unsqueeze(1)).squeeze() 207 | vt += temp 208 | 209 | htprime = self.tpa_hiddent_linear(hn) + self.tpa_context_linear(vt) 210 | 211 | seq_inp = self.lin_layer_lstm_to_dense(hn) 212 | seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2]) 213 | htprime = htprime.squeeze() 214 | 215 | # Linear mlp for prediction 216 | # fin_input = torch.cat((seq_inp, htprime), 1) 217 | fin_input = torch.cat((non_seq_inp, seq_inp, htprime), 1) 218 | 219 | x = fin_input 220 | for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers_final, self.dropout_layers_final, 221 | self.bn_layers_final): 222 | x = F.relu(lin_layer(x)) 223 | x = bn_layer(x) 224 | x = dropout_layer(x) 225 | 226 | x = F.relu(self.output_layer(x)) 227 | 228 | return x, fin_input 229 | -------------------------------------------------------------------------------- /caspr/models/mlp.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """CASPR mlp base class.""" 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from caspr.models.dense_bn_dropout import DenseBnDropout 8 | 9 | 10 | class MLP(nn.Module): # noqa: W0223 11 | def __init__(self, # noqa: R0913 12 | input_size, 13 | lin_layer_sizes, 14 | lin_layer_dropouts, 15 | output_size, 16 | use_sigmoid=False): 17 | """Initialize model with params.""" 18 | 19 | super().__init__() 20 | 21 | self.output_size = output_size 22 | self.use_sigmoid = use_sigmoid 23 | 24 | # final linear layers just before prediction 25 | self.dense_bn_dropout = DenseBnDropout( 26 | lin_layer_sizes=lin_layer_sizes, lin_layer_dropouts=lin_layer_dropouts, input_size=input_size) 27 | 28 | # Output Layer 29 | self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size) 30 | nn.init.kaiming_normal_(self.output_layer.weight.data) 31 | 32 | def forward(self, inp): 33 | """Run a forward pass of model over the data.""" 34 | inp = self.dense_bn_dropout(inp) 35 | out = self.output_layer(inp) 36 | if self.use_sigmoid: 37 | out = torch.sigmoid(out) 38 | return out 39 | -------------------------------------------------------------------------------- /caspr/models/multi_layer_lstm.py: -------------------------------------------------------------------------------- 1 | """CASPR LSTM base class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class MultiLayerLSTM(nn.Module): # noqa: W0223 8 | """Encapsulates the Pytorch LSTM. 
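A minimal usage sketch (all sizes here are illustrative, not prescriptive):

    >>> lstm = MultiLayerLSTM(input_size=16, hidden_size=32, num_layers=2, bidirectional=True)
    >>> out, (hn, cn), agg = lstm(torch.randn(8, 10, 16))
    >>> out.shape, agg.shape
    (torch.Size([8, 10, 64]), torch.Size([8, 32]))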
9 | 10 | Added functionality of aggregation / concatenation in cases of 11 | bidirectional and multi-layered LSTM's 12 | 13 | It outputs the original outputs of the lstm along with an aggregated output vector 14 | """ 15 | 16 | def __init__(self, input_size, hidden_size, dropout=0., num_layers=1, bidirectional=False): # noqa: R0913 17 | """Initialise the pytorch LSTM layer. 18 | 19 | Args: 20 | input_size = The size of the input in the lstm. This represents the number of input features 21 | hidden_size = the hidden size of the lstm 22 | dropout = the dropout layers between the multiple layers of the lstm (works only when we use a 23 | multi-layered lstm) 24 | num_layers = num_layers of the lstm 25 | bidirectional = represents the type of the lstm 26 | """ 27 | super().__init__() 28 | self.lstm_layer = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, 29 | bidirectional=bidirectional, dropout=dropout) 30 | self.num_directions = 2 if bidirectional else 1 31 | self.num_layers = num_layers 32 | self.hidden_size = hidden_size 33 | # Linear Layers post LSTM 34 | self.lin_layer_lstm_to_dense = nn.Linear( 35 | self.num_directions*self.hidden_size, self.hidden_size) 36 | 37 | def forward(self, input_tensor, hidden_state=None): 38 | """Run a forward pass of model over the data.""" 39 | batch_size = input_tensor.size()[0] 40 | device = input_tensor.device 41 | 42 | if hidden_state is not None: 43 | h0 = hidden_state 44 | c0 = torch.zeros(self.num_directions*self.num_layers, batch_size, self.hidden_size).to(device) 45 | output, (hn, cn) = self.lstm_layer(input_tensor, (h0, c0)) 46 | else: 47 | output, (hn, cn) = self.lstm_layer(input_tensor) 48 | 49 | hn = hn.view(self.num_layers, self.num_directions, - 50 | 1, self.hidden_size) 51 | cn = cn.view(self.num_layers, self.num_directions, - 52 | 1, self.hidden_size) 53 | 54 | if self.num_directions > 1: 55 | seq_inp = self.lin_layer_lstm_to_dense(torch.cat( 56 | [hn[self.num_layers-1, 0], hn[self.num_layers-1, -1]], 1).unsqueeze(0)) 57 | else: 58 | seq_inp = self.lin_layer_lstm_to_dense( 59 | hn[self.num_layers-1, 0]).unsqueeze(0) 60 | 61 | seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2]) 62 | 63 | return output, (hn[self.num_layers-1, 0, :, :], cn[self.num_layers-1, 0, :, :]), seq_inp 64 | -------------------------------------------------------------------------------- /caspr/models/transformer.py: -------------------------------------------------------------------------------- 1 | """CASPR transformer base class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from caspr.models.attention_mechanisms import MultiHeadAttentionLayer 7 | 8 | 9 | class TransformerEncoderLayer(nn.Module): # noqa: W0223 # noqa: W0223 10 | """TransformerEncoderLayer is made up of self-attn and feedforward network. 
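Illustrative shapes, assuming the mask layout produced by TransformerEncoder._make_src_mask below:

    >>> layer = TransformerEncoderLayer(hid_dim=64, n_heads=2, pf_dim=128, dropout=0.1)
    >>> src = torch.randn(4, 10, 64)
    >>> src_mask = torch.ones(4, 1, 1, 10).bool()
    >>> layer(src, src_mask).shape
    torch.Size([4, 10, 64])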
11 | 12 | Args: 13 | hid_dim: the hidden size of the encoder 14 | n_heads: the number of heads in the multi-head attention layers 15 | pf_dim: the dimension of the feedforward network model 16 | dropout: the dropout value 17 | device: the device on which the model is running 18 | """ 19 | 20 | def __init__(self, # noqa: R0913 21 | hid_dim, 22 | n_heads, 23 | pf_dim, 24 | dropout): 25 | """Initialize model with params.""" 26 | 27 | super().__init__() 28 | 29 | self.self_attn_layer_norm = nn.LayerNorm(hid_dim) 30 | self.ff_layer_norm = nn.LayerNorm(hid_dim) 31 | self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout) 32 | self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 33 | pf_dim, 34 | dropout) 35 | self.dropout = nn.Dropout(dropout) 36 | 37 | def forward(self, src, src_mask): 38 | """Run a forward pass of model over the data.""" 39 | 40 | # src = [batch size, src len, hid dim] 41 | # src_mask = [batch size, src len] 42 | 43 | # self attention 44 | _src, _ = self.self_attention(src, src, src, src_mask) 45 | 46 | # dropout, residual connection and layer norm 47 | src = self.self_attn_layer_norm(src + self.dropout(_src)) 48 | 49 | # src = [batch size, src len, hid dim] 50 | 51 | # positionwise feedforward 52 | _src = self.positionwise_feedforward(src) 53 | 54 | # dropout, residual and layer norm 55 | src = self.ff_layer_norm(src + self.dropout(_src)) 56 | 57 | # src = [batch size, src len, hid dim] 58 | 59 | return src 60 | 61 | 62 | class TransformerEncoder(nn.Module): # noqa: W0223 # noqa: W0223 63 | """TransformerEncoder is a stack of N encoder layers. 64 | 65 | Args: 66 | hid_dim: the hidden size of the encoder. 67 | n_layers: the number of sub-encoder-layers in the encoder 68 | n_heads: the number of heads in the multi-head attention layers 69 | pf_dim: the dimension of the feedforward network model 70 | dropout: the dropout value 71 | device: the device on which the model is running 72 | max_length: the maximum length of the input sequence 73 | """ 74 | 75 | def __init__(self, # noqa: R0913 76 | hid_dim, 77 | n_layers, 78 | n_heads, 79 | pf_dim, 80 | dropout, 81 | max_length=100): 82 | """Initialize model with params.""" 83 | super().__init__() 84 | 85 | self.pos_embedding = nn.Embedding(max_length, hid_dim) 86 | 87 | self.layers = nn.ModuleList([TransformerEncoderLayer(hid_dim, 88 | n_heads, 89 | pf_dim, 90 | dropout) 91 | for _ in range(n_layers)]) 92 | 93 | self.dropout = nn.Dropout(dropout) 94 | 95 | self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim]))) 96 | 97 | def _make_src_mask(self, batch_size, src_len, device): 98 | 99 | src_mask = torch.ones((batch_size, 1, 1, src_len), device=device).bool() 100 | 101 | # src_mask = [batch size, 1, 1, src len] 102 | 103 | return src_mask 104 | 105 | def forward(self, src): 106 | """Run a forward pass of model over the data.""" 107 | 108 | # src = [batch size, src len, hid_dim] 109 | 110 | batch_size = src.shape[0] 111 | src_len = src.shape[1] 112 | device = src.device 113 | 114 | src_mask = self._make_src_mask(batch_size, src_len, device) 115 | 116 | # src_mask = [batch size, src len] 117 | 118 | pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device) 119 | 120 | # pos = [batch size, src len] 121 | 122 | src = self.dropout(src * self.scale + self.pos_embedding(pos)) 123 | 124 | # src = [batch size, src len, hid dim] 125 | 126 | for layer in self.layers: 127 | src = layer(src, src_mask) 128 | 129 | # src = [batch size, src len, hid dim] 130 | # src_mask = [batch 
size, 1, 1, src len] 131 | 132 | return src, src_mask 133 | 134 | 135 | class PositionwiseFeedforwardLayer(nn.Module): # noqa: W0223 136 | """Fully connected feed-forward network consisting of two linear transformations with a ReLU activation in between. 137 | 138 | Args: 139 | hid_dim: the hidden size of the encoder 140 | pf_dim: the dimension of the feedforward network model 141 | dropout: the dropout value 142 | """ 143 | 144 | def __init__(self, hid_dim, pf_dim, dropout): 145 | """Initialize model with params.""" 146 | super().__init__() 147 | 148 | self.fc_1 = nn.Linear(hid_dim, pf_dim) 149 | self.fc_2 = nn.Linear(pf_dim, hid_dim) 150 | 151 | self.dropout = nn.Dropout(dropout) 152 | 153 | def forward(self, x): 154 | """Run a forward pass of model over the data.""" 155 | 156 | # x = [batch size, seq len, hid dim] 157 | 158 | x = self.dropout(torch.relu(self.fc_1(x))) 159 | 160 | # x = [batch size, seq len, pf dim] 161 | 162 | x = self.fc_2(x) 163 | 164 | # x = [batch size, seq len, hid dim] 165 | 166 | return x 167 | 168 | 169 | class TransformerDecoderLayer(nn.Module): # noqa: W0223 170 | """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. 171 | 172 | Args: 173 | hid_dim: the hidden size of the encoder 174 | n_heads: the number of heads in the multi-head attention layers 175 | pf_dim: the dimension of the feedforward network model 176 | dropout: the dropout value 177 | device: the device on which the model is running 178 | """ 179 | 180 | def __init__(self, # noqa: R0913 181 | hid_dim, 182 | n_heads, 183 | pf_dim, 184 | dropout): 185 | """Initialize model with params.""" 186 | super().__init__() 187 | 188 | self.self_attn_layer_norm = nn.LayerNorm(hid_dim) 189 | self.enc_attn_layer_norm = nn.LayerNorm(hid_dim) 190 | self.ff_layer_norm = nn.LayerNorm(hid_dim) 191 | self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout) 192 | self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout) 193 | self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 194 | pf_dim, 195 | dropout) 196 | self.dropout = nn.Dropout(dropout) 197 | 198 | def forward(self, trg, enc_src, trg_mask, src_mask): 199 | """Run a forward pass of model over the data.""" 200 | 201 | # trg = [batch size, trg len, hid dim] 202 | # enc_src = [batch size, src len, hid dim] 203 | # trg_mask = [batch size, trg len] 204 | # src_mask = [batch size, src len] 205 | 206 | # self attention 207 | _trg, _ = self.self_attention(trg, trg, trg, trg_mask) 208 | 209 | # dropout, residual connection and layer norm 210 | trg = self.self_attn_layer_norm(trg + self.dropout(_trg)) 211 | 212 | # trg = [batch size, trg len, hid dim] 213 | 214 | # encoder attention 215 | _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask) 216 | 217 | # dropout, residual connection and layer norm 218 | trg = self.enc_attn_layer_norm(trg + self.dropout(_trg)) 219 | 220 | # trg = [batch size, trg len, hid dim] 221 | 222 | # positionwise feedforward 223 | _trg = self.positionwise_feedforward(trg) 224 | 225 | # dropout, residual and layer norm 226 | trg = self.ff_layer_norm(trg + self.dropout(_trg)) 227 | 228 | # trg = [batch size, trg len, hid dim] 229 | # attention = [batch size, n heads, trg len, src len] 230 | 231 | return trg, attention 232 | 233 | 234 | class TransformerDecoder(nn.Module): # noqa: W0223 235 | """TransformerDecoder is a stack of N decoder layers. 
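A typical pairing with the encoder above, sharing one positional-embedding table as the model factory does (all sizes illustrative):

    >>> enc = TransformerEncoder(hid_dim=64, n_layers=2, n_heads=2, pf_dim=128, dropout=0.1, max_length=20)
    >>> dec = TransformerDecoder(hid_dim=64, n_layers=2, n_heads=2, pf_dim=128, dropout=0.1, pos_embedding=enc.pos_embedding)
    >>> enc_src, src_mask = enc(torch.randn(4, 10, 64))
    >>> trg, attention = dec(torch.randn(4, 10, 64), enc_src, src_mask)
    >>> trg.shape, attention.shape
    (torch.Size([4, 10, 64]), torch.Size([4, 2, 10, 10]))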
236 | 237 | Args: 238 | hid_dim: the hidden size of the decoder 239 | n_layers: the number of sub-decoder-layers in the decoder 240 | n_heads: the number of heads in the multi-head attention layers 241 | pf_dim: the dimension of the feedforward network model 242 | dropout: the dropout value 243 | pos_embedding: learned positional encoding added to the input embedding 244 | device: the device on which the model is running 245 | """ 246 | 247 | def __init__(self, # noqa: R0913 248 | hid_dim, 249 | n_layers, 250 | n_heads, 251 | pf_dim, 252 | dropout, 253 | pos_embedding): 254 | """Initialize model with params.""" 255 | super().__init__() 256 | 257 | self.pos_embedding = pos_embedding 258 | 259 | self.layers = nn.ModuleList([TransformerDecoderLayer(hid_dim, 260 | n_heads, 261 | pf_dim, 262 | dropout) 263 | for _ in range(n_layers)]) 264 | 265 | self.dropout = nn.Dropout(dropout) 266 | 267 | self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim]))) 268 | 269 | def _make_trg_mask(self, batch_size, trg_len, device): 270 | 271 | trg_mask = torch.tril(torch.ones((batch_size, 1, trg_len, trg_len), device=device)).bool() 272 | 273 | # trg_mask = [batch size, 1, trg len, trg len] 274 | 275 | return trg_mask 276 | 277 | def forward(self, trg, enc_src, src_mask): 278 | """Run a forward pass of model over the data.""" 279 | 280 | # trg = [batch size, trg len, hid_dim] 281 | # enc_src = [batch size, src len, hid dim] 282 | # src_mask = [batch size, 1, 1, src len] 283 | 284 | batch_size = trg.shape[0] 285 | trg_len = trg.shape[1] 286 | device = trg.device 287 | 288 | trg_mask = self._make_trg_mask(batch_size, trg_len, device) 289 | 290 | # trg_mask = [batch size, trg len] 291 | 292 | pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(device) 293 | 294 | # pos = [batch size, trg len] 295 | 296 | trg = self.dropout(trg * self.scale + self.pos_embedding(pos)) 297 | 298 | # trg = [batch size, trg len, hid dim] 299 | 300 | for layer in self.layers: 301 | trg, attention = layer(trg, enc_src, trg_mask, src_mask) 302 | 303 | # trg = [batch size, trg len, hid dim] 304 | # attention = [batch size, n heads, trg len, src len] 305 | 306 | return trg, attention 307 | -------------------------------------------------------------------------------- /caspr/models/unified_encoder.py: -------------------------------------------------------------------------------- 1 | """CASPR LSTM model.""" 2 | 3 | import warnings 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | from caspr.models.attention_mechanisms import BahdanauAttention, MultiHeadAttentionLSTMWrapper 11 | from caspr.models.convolutional_aggregation import ConvAggregation 12 | from caspr.models.dense_bn_dropout import DenseBnDropout 13 | from caspr.models.embedding_layer import CategoricalEmbedding 14 | from caspr.models.multi_layer_lstm import MultiLayerLSTM 15 | 16 | warnings.simplefilter('ignore') 17 | 18 | 19 | SEQ_CAT_INDEX = 0 20 | SEQ_CONT_INDEX = 1 21 | NON_SEQ_CAT_INDEX = 2 22 | NON_SEQ_CONT_INDEX = 3 23 | 24 | 25 | class UnifiedEncoder(nn.Module): # noqa: R0902, W0223 26 | """Encapsulates the basic structure to run most of our models. 27 | 28 | It checks the various conditions for the presence 29 | or the absence of data and is compatible with functionalities like 30 | 1. Usage of pretrained embedding vectors 31 | 2. Multi-Layered LSTM use 32 | 3. Convolutional Aggregation for data 33 | 4. 
Self-Multi-Head and Bahdanau Attention (when number of heads = 1, Bahdanau is used by default) 34 | 35 | In this new edition, it is compatible with the DLExplainer module and should be used if 36 | explainability is a requirement 37 | """ 38 | 39 | def __init__(self, # noqa: R0912, R0913, R0914, R0915 40 | emb_dims_non_seq, 41 | emb_dropout_non_seq, 42 | emb_dims_seq, 43 | emb_dropout_seq, 44 | emb_lin_layer_sizes_non_seq, 45 | emb_lin_layer_dropouts_non_seq, 46 | emb_lin_layer_sizes_seq, 47 | emb_lin_layer_dropouts_seq, 48 | lstm_hidden_size, 49 | output_size, 50 | seq_len, 51 | non_seq_cont_count, 52 | seq_cat_count, 53 | seq_cont_count, 54 | non_seq_cat_count, 55 | attention_heads=1, 56 | non_seq_pretrained_embs=None, 57 | freeze_non_seq_pretrained_embs=True, 58 | seq_pretrained_embs=None, 59 | freeze_seq_pretrained_embs=True, 60 | lstm_num_layers=1, 61 | lstm_bidirectional=False, 62 | use_conv_agg=False, 63 | kernel_size=(3, 3), 64 | max_pool_size=(2, 2), 65 | stride=(2, 2)): 66 | """Initialize of all the variables and the layers depending on the arguments supplied. 67 | 68 | Args: 69 | emb_dims_non_seq = (List of tuples (x, y)) where x is the vocab size and y is the number of dimensions 70 | for the respective embedding layer for every non_sequential categorical variable 71 | emb_dropout_non_seq = (Float) Dropout value of a layer used after the embedding layer - non_sequential 72 | emb_dims_seq = (List of tuples (x, y)) where x is the vocab size and y is the number of dimensions for the 73 | respective embedding layer for every sequential categorical variable 74 | emb_dropout_seq = (Float) Dropout value of a layer used after the embedding layer - sequential 75 | emb_lin_layer_sizes_non_seq = (List of integers) determining the sizes of the stacked linear layers 76 | used just after the embedding layers to learn better representations for non_sequential 77 | categorical variables 78 | emb_lin_layer_dropouts_non_seq = (List of float) values determining the p values in the dropout 79 | layers between linear layers 80 | emb_lin_layer_sizes_seq = (List of integers) determining the sizes of the stacked linear layers 81 | used just after the embedding layers to learn better representations for sequential 82 | categorical variables 83 | emb_lin_layer_dropouts_seq = (List of float) values determining the p values in the dropout 84 | layers between linear layers 85 | lstm_hidden_size = (Integer) determining the Hidden size of the LSTM layer used to train the sequence model 86 | output_size = (Integer) Size of the final embedded output by the encoder. 87 | seq_len = (Integer) determining the length of the sequence in input 88 | non_seq_cont_count = (Integer) Number of non_sequential continuous variables 89 | seq_cat_count = (Integer) Number of sequential categorical variables 90 | seq_cont_count = (Integer) Number of sequential continuous variables 91 | non_seq_cat_count = (Integer) Number of non_sequential categorical variables 92 | attention_heads = (Integer: Default = 1) Describes the number of attention heads being used after the LSTM. 93 | When 0 means that attention is not being used. 
94 | When = 1 uses Bahdanau attention by default and 95 | When > 1 uses Multi-Head self-attention 96 | non_seq_pretrained_embs = (List of Tensors: Default = None) To be used as pretrained embeddings 97 | in the embedding layers 98 | freeeze_non_seq_pretrained_embs = (Boolean: Default = True) Determines if the pretrained embeddings 99 | are to be left untouched during backprop 100 | seq_pretrained_embs = (List of Tensors: Default = None) To be used as pretrained embeddings in the 101 | embedding layers, 102 | freeeze_seq_pretrained_embs = (Boolean: Default = True) Determines if the pretrained embeddings 103 | are to be left untouched during backprop 104 | lstm_num_layers = (Integer: Default = 1) The number of stacked LSTM layers used 105 | lstm_bidirectional = (Boolean: Default = False) Determines if the LSTM used is bidirectional 106 | use_conv_agg = (Boolean: Default = False) Determines if Convolutional aggregation is to be used in 107 | the model or not 108 | kernel_size = (Tuple of Integers : Default = (3,3)) Determines the kernel size of the cnn aggregator 109 | max_pool_size = (Tuple of Integers : Default = (2, 2)) Determines the max_pool size of the cnn aggregator 110 | stride = (Tuple of Integers : Default = (2, 2)) Determines the stride of the cnn aggregator 111 | """ 112 | super().__init__() 113 | 114 | self._explain = False 115 | self.non_seq_emb_layers = CategoricalEmbedding(emb_dims=emb_dims_non_seq, emb_dropout=emb_dropout_non_seq, 116 | pretrained_vecs=non_seq_pretrained_embs, 117 | freeze_pretrained=freeze_non_seq_pretrained_embs) 118 | self.seq_emb_layers = CategoricalEmbedding(emb_dims=emb_dims_seq, emb_dropout=emb_dropout_seq, is_seq=True, 119 | pretrained_vecs=seq_pretrained_embs, 120 | freeze_pretrained=freeze_seq_pretrained_embs) 121 | 122 | self.no_of_embs_non_seq = np.sum([y for x, y in emb_dims_non_seq]) 123 | self.no_of_embs_seq = np.sum([y for x, y in emb_dims_seq]) 124 | 125 | self.non_seq_cat_final_size = 0 126 | self.seq_len = seq_len 127 | self.hidden_size = lstm_hidden_size 128 | self.context_vector_size = lstm_hidden_size 129 | self.output_dim = output_size 130 | self.num_layers = lstm_num_layers 131 | self.num_directions = 2 if lstm_bidirectional else 1 132 | 133 | self.seq_cat_count = seq_cat_count 134 | self.seq_cont_count = seq_cont_count 135 | self.non_seq_cat_count = non_seq_cat_count 136 | self.non_seq_cont_count = non_seq_cont_count 137 | self.attention_heads = attention_heads 138 | 139 | self.use_conv_agg = use_conv_agg 140 | 141 | # Linear Layers for non_seq_data parallel to LSTM 142 | if self.no_of_embs_non_seq != 0: 143 | self.emb_lin_layer_non_seq = DenseBnDropout( 144 | lin_layer_sizes=emb_lin_layer_sizes_non_seq, 145 | lin_layer_dropouts=emb_lin_layer_dropouts_non_seq, input_size=self.no_of_embs_non_seq) 146 | self.non_seq_cat_final_size = emb_lin_layer_sizes_non_seq[-1] 147 | 148 | # LSTM layer 149 | if self.no_of_embs_seq != 0: 150 | self.emb_lin_layer_seq = DenseBnDropout( 151 | lin_layer_sizes=emb_lin_layer_sizes_seq, 152 | lin_layer_dropouts=emb_lin_layer_dropouts_seq, input_size=self.no_of_embs_seq) 153 | 154 | # LSTM layer 155 | if self.no_of_embs_seq != 0: 156 | self.emb_lin_layer_seq = DenseBnDropout( 157 | lin_layer_sizes=emb_lin_layer_sizes_seq, 158 | lin_layer_dropouts=emb_lin_layer_dropouts_seq, input_size=self.no_of_embs_seq) 159 | self.lstm_inp_size = emb_lin_layer_sizes_seq[-1] + seq_cont_count 160 | else: 161 | self.lstm_inp_size = seq_cont_count 162 | 163 | if use_conv_agg and seq_len >= kernel_size[0] and \ 164 | (min(1, 
seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count) >= kernel_size[1] and \ 165 | int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count - 166 | (kernel_size[1] - 1))/stride[1]) >= max_pool_size[1] and \ 167 | int((seq_len - (kernel_size[0] - 1))/stride[0]) >= max_pool_size[0]: 168 | # kernel_size[0] -> size of kernel along sequence dimension, hence must be <= seq_len 169 | # kernel_size[1] -> size of kernel along features dimension, hence must be <= net size of input features 170 | # int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] 171 | # + seq_cont_count - (kernel_size[i] - 1))/stride[i]) 172 | # is the formula to calculate the final size of dimension i after the CNN filter is applied 173 | # the above size should be >= max_pool[i] for pooling 174 | self.conv_agg = ConvAggregation( 175 | kernel_size=kernel_size, stride=stride, max_pool_size=max_pool_size, dropout_size=0.4) 176 | self.lstm_inp_size = int((int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count - ( 177 | kernel_size[1] - 1) - 1)/stride[1] + 1)) / max_pool_size[1]) 178 | else: 179 | self.use_conv_agg = False 180 | 181 | if self.lstm_inp_size > 0: 182 | self.lstm_layer = MultiLayerLSTM(input_size=self.lstm_inp_size, hidden_size=self.hidden_size, 183 | num_layers=self.num_layers, bidirectional=lstm_bidirectional, dropout=0.4) 184 | 185 | # Attention 186 | if self.attention_heads > 0: 187 | if self.attention_heads == 1: 188 | self.bahdanau_attention_layer = BahdanauAttention(self.hidden_size, self.num_directions) 189 | else: 190 | n_head = self.attention_heads 191 | d_model = self.hidden_size 192 | self.multi_head_attention_layer = MultiHeadAttentionLSTMWrapper(n_head, d_model, dropout=0.1) 193 | 194 | if self.attention_heads > 0: 195 | self.output_layer = nn.Linear(int(self.num_directions*self.hidden_size + self.context_vector_size + 196 | self.non_seq_cat_final_size + self.non_seq_cont_count), 197 | int(self.hidden_size)) 198 | else: 199 | self.output_layer = nn.Linear(int(self.num_directions*self.hidden_size + 200 | self.non_seq_cat_final_size + self.non_seq_cont_count), 201 | int(self.hidden_size)) 202 | nn.init.kaiming_normal_(self.output_layer.weight.data) 203 | 204 | def forward(self, *args): # noqa: R0912, R0914 205 | """Forward function accepts multiple arguments. 
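For example (tensor names hypothetical; the index convention is spelled out below), a batch carrying sequential data and non-sequential continuous data, but no non-sequential categoricals, would be passed as:

    >>> nonempty_idx = [0, 1, -1, 2]  # seq_cat, seq_cont, (absent), non_seq_cont
    >>> output, (encoded, cn) = encoder(seq_cat, seq_cont, non_seq_cont, nonempty_idx)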
206 | 207 | The last argument is always a list of indices representing the index (if data present) 208 | with -1 in places for the absence of data The indices are used to partition the data into 4 types 209 | - seq_cat, seq_cont, non_seq_cat, non_seq_cont 210 | """ 211 | nonempty_idx = args[-1] 212 | data_exists = list(map(lambda x: x != -1, nonempty_idx)) 213 | device = args[0].device 214 | batch_size = args[0].shape[0] 215 | 216 | seq_cat_data = args[nonempty_idx[SEQ_CAT_INDEX]] if data_exists[SEQ_CAT_INDEX] else torch.empty(batch_size, 0, 0, device=device) 217 | seq_cont_data = args[nonempty_idx[SEQ_CONT_INDEX]] if data_exists[SEQ_CONT_INDEX] else torch.empty(batch_size, 0, 0, device=device) 218 | non_seq_cat_data = args[nonempty_idx[NON_SEQ_CAT_INDEX]] if data_exists[NON_SEQ_CAT_INDEX] else torch.empty(batch_size, 0, device=device) 219 | non_seq_cont_data = args[nonempty_idx[NON_SEQ_CONT_INDEX]] if data_exists[NON_SEQ_CONT_INDEX] else torch.empty(batch_size, 0, device=device) 220 | 221 | if self.no_of_embs_non_seq != 0: 222 | non_seq_cat_inp = self.non_seq_emb_layers(non_seq_cat_data) 223 | non_seq_inp = self.emb_lin_layer_non_seq(non_seq_cat_inp) 224 | 225 | if self.non_seq_cont_count != 0: 226 | non_seq_inp = torch.cat((non_seq_inp.type(torch.FloatTensor).to(device), 227 | non_seq_cont_data.type(torch.FloatTensor).to(device)), 1) 228 | else: 229 | if self.non_seq_cont_count != 0: 230 | non_seq_inp = non_seq_cont_data.to(device) 231 | else: 232 | non_seq_inp = torch.Tensor().to(device) 233 | 234 | if self.no_of_embs_seq != 0: 235 | seq_cat_inp = self.seq_emb_layers(seq_cat_data) 236 | seq_inp = self.emb_lin_layer_seq(seq_cat_inp) 237 | if self.seq_cont_count != 0: 238 | seq_inp = torch.cat((seq_inp.type(torch.FloatTensor).to(device), 239 | seq_cont_data.type(torch.FloatTensor).to(device)), 2) 240 | 241 | elif self.seq_cont_count != 0: 242 | seq_inp = seq_cont_data.type(torch.FloatTensor).to(device) 243 | 244 | if self.no_of_embs_seq + self.seq_cont_count > 0: 245 | 246 | if self.use_conv_agg: 247 | seq_inp = self.conv_agg(seq_inp) 248 | 249 | output, (_, cn), seq_inp = self.lstm_layer(seq_inp) 250 | 251 | if self.attention_heads > 0: 252 | if self.attention_heads == 1: 253 | context_vector = self.bahdanau_attention_layer(output) 254 | context_vector = context_vector.reshape(context_vector.size()[0], context_vector.size()[2]) 255 | else: 256 | context_vector = self.multi_head_attention_layer(output, output, output) 257 | 258 | fin_input = torch.cat((seq_inp, context_vector), 1) 259 | else: 260 | fin_input = seq_inp 261 | 262 | if self.no_of_embs_non_seq + self.non_seq_cont_count > 0: 263 | fin_input = torch.cat((non_seq_inp.type(torch.FloatTensor).to(device), fin_input), 1) 264 | else: 265 | fin_input = non_seq_inp 266 | 267 | fin_output = F.relu(self.output_layer(fin_input)) 268 | 269 | if self._explain: 270 | return fin_output 271 | return output, (fin_output, cn) 272 | 273 | @property 274 | def explain(self): 275 | """Getter for explain.""" 276 | 277 | return self._explain 278 | 279 | def set_explain(self, value): 280 | """Setter for explain.""" 281 | 282 | self._explain = value 283 | -------------------------------------------------------------------------------- /caspr/models/unified_transformer_encoder.py: -------------------------------------------------------------------------------- 1 | """CASPR Transfomer model.""" 2 | 3 | import warnings 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from caspr.models.embedding_layer import CategoricalEmbedding 9 | 10 | 
warnings.simplefilter('ignore') 11 | 12 | SEQ_CAT_INDEX = 0 13 | SEQ_CONT_INDEX = 1 14 | NON_SEQ_CAT_INDEX = 2 15 | NON_SEQ_CONT_INDEX = 3 16 | 17 | 18 | class UnifiedTransformerEncoder(nn.Module): # noqa: R0902, W0223 19 | """Encapsulates the basic structure to run most of our models. 20 | 21 | It checks the various conditions for the presence or the absence of data 22 | and is compatible with functionalities like 23 | 1. Usage of pretrained embedding vectors 24 | 2. Multi-Layered transformer use 25 | 3. Convolutional Aggregation for data 26 | 4. Self-Multi-Head and Bahdanau Attention (when number of heads = 1, Bahdanau is used by default) 27 | 28 | In this new edition, it is compatible with the DLExplainer module and should be used if explainability 29 | is a requirement 30 | """ 31 | 32 | def __init__(self, # noqa: R0913, R0914 33 | transformer_encoder, 34 | emb_dims_non_seq, 35 | emb_dropout_non_seq, 36 | emb_dims_seq, 37 | emb_dropout_seq, 38 | hidden_size, 39 | seq_cont_count, 40 | non_seq_cont_count, 41 | non_seq_pretrained_embs=None, 42 | freeze_non_seq_pretrained_embs=True, 43 | seq_pretrained_embs=None, 44 | freeze_seq_pretrained_embs=True): 45 | """Initialize model with params.""" 46 | 47 | super().__init__() 48 | 49 | self._explain = False 50 | 51 | self.emb_non_seq = CategoricalEmbedding(emb_dims=emb_dims_non_seq, emb_dropout=emb_dropout_non_seq, 52 | is_seq=False, pretrained_vecs=non_seq_pretrained_embs, 53 | freeze_pretrained=freeze_non_seq_pretrained_embs) 54 | self.emb_seq = CategoricalEmbedding(emb_dims=emb_dims_seq, emb_dropout=emb_dropout_seq, 55 | is_seq=True, pretrained_vecs=seq_pretrained_embs, 56 | freeze_pretrained=freeze_seq_pretrained_embs) 57 | 58 | self.hid_dim = hidden_size 59 | self.seq_cont_dim = seq_cont_count 60 | self.non_seq_cont_dim = non_seq_cont_count 61 | 62 | # Linear layers for seq_data 63 | seq_inp_size = self.emb_seq.emb_size + self.seq_cont_dim 64 | self.linear_seq = nn.Linear(seq_inp_size, self.hid_dim) 65 | 66 | # Linear layers for non_seq_data 67 | non_seq_inp_size = self.emb_non_seq.emb_size + self.non_seq_cont_dim 68 | self.linear_non_seq = nn.Linear(non_seq_inp_size, self.hid_dim) if non_seq_inp_size else None 69 | 70 | self.transformer_encoder = transformer_encoder 71 | 72 | def forward(self, *args): 73 | """Run a forward pass of model over the data.""" 74 | 75 | nonempty_idx = args[-1] 76 | data_exists = list(map(lambda x: x != -1, nonempty_idx)) 77 | device = args[0].device 78 | batch_size, seq_len = args[0].shape[:2] 79 | 80 | seq_cat_data = args[nonempty_idx[SEQ_CAT_INDEX]] if data_exists[SEQ_CAT_INDEX] else torch.empty(batch_size, seq_len, 0, device=device) 81 | seq_cont_data = args[nonempty_idx[SEQ_CONT_INDEX]] if data_exists[SEQ_CONT_INDEX] else torch.empty(batch_size, seq_len, 0, device=device) 82 | non_seq_cat_data = args[nonempty_idx[NON_SEQ_CAT_INDEX]] if data_exists[NON_SEQ_CAT_INDEX] else torch.empty(batch_size, 0, device=device) 83 | non_seq_cont_data = args[nonempty_idx[NON_SEQ_CONT_INDEX]] if data_exists[NON_SEQ_CONT_INDEX] else torch.empty(batch_size, 0, device=device) 84 | 85 | if self.emb_seq and data_exists[SEQ_CAT_INDEX]: 86 | seq_cat_data = self.emb_seq(seq_cat_data) 87 | seq_inp = torch.cat((seq_cat_data, seq_cont_data), -1) 88 | seq_inp = self.linear_seq(seq_inp) 89 | 90 | if self.emb_non_seq and data_exists[NON_SEQ_CAT_INDEX]: 91 | non_seq_cat_data = self.emb_non_seq(non_seq_cat_data) 92 | non_seq_inp = torch.cat((non_seq_cat_data, non_seq_cont_data), -1) 93 | if self.linear_non_seq: 94 | non_seq_inp = 
self.linear_non_seq(non_seq_inp).unsqueeze(1) 95 | 96 | src_inp = torch.cat((seq_inp, non_seq_inp), 1) if non_seq_inp.nelement() > 0 else seq_inp 97 | # src_inp = [batch_size, src len, hid dim] 98 | 99 | enc_src, src_mask = self.transformer_encoder(src_inp) 100 | 101 | if self._explain: 102 | return enc_src.reshape(enc_src.shape[0], -1) 103 | return enc_src, src_mask, src_inp 104 | 105 | @property 106 | def explain(self): 107 | """Getter for explain.""" 108 | 109 | return self._explain 110 | 111 | def set_explain(self, value): 112 | """Setter for explain.""" 113 | 114 | self._explain = value 115 | -------------------------------------------------------------------------------- /caspr/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/__init__.py -------------------------------------------------------------------------------- /caspr/utils/early_stopping.py: -------------------------------------------------------------------------------- 1 | """Early stopping class for nn models.""" 2 | 3 | import logging 4 | 5 | import torch 6 | from torch.nn.parallel import DistributedDataParallel as DDP 7 | 8 | from caspr.utils.onnx import export_onnx, register_custom_op 9 | 10 | 11 | class EarlyStopping: 12 | """Stop the training early and save a PyTorch or ONNX model after specified iterations (patience)""" 13 | 14 | def __init__(self, logger, should_decrease,patience=3, verbose=True, delta=0, save_onnx=False): 15 | """Initialize the early stopping module. 16 | 17 | Args: 18 | logger: For logging 19 | should_decrease (bool): True if metrics improve by decreasing. 20 | patience (int): How long to wait after last time validation score improved. 21 | Default: 3 22 | verbose (bool): If True, prints a message for each validation score improvement. 23 | Default: True 24 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 25 | Default: 0 26 | save_onnx (bool): If True, export the model as onnx format. 27 | Default: False 28 | """ 29 | self.logger = logger 30 | self.patience = patience 31 | self.verbose = verbose 32 | self.counter = 0 33 | self.best_score = None 34 | self.early_stop = False 35 | self.should_decrease = should_decrease 36 | self.delta = delta 37 | self.save_onnx = save_onnx 38 | if self.save_onnx: 39 | register_custom_op() 40 | 41 | def __call__(self, val_score, model, path): 42 | """Define __call__ method. 43 | 44 | Args: 45 | val_score (float): Validation score to determine whether to early stop. 46 | model (nn.Module): Model being trained. 47 | path (str): Model save path. 48 | """ 49 | 50 | if self.should_decrease: 51 | val_score = -val_score 52 | 53 | if self.best_score is None: 54 | self.best_score = val_score 55 | self.save_checkpoint(model, path) 56 | elif val_score <= self.best_score + self.delta: 57 | self.counter += 1 58 | self.logger.info('EarlyStopping counter: {} out of {}\n'.format(self.counter, self.patience)) 59 | if self.counter >= self.patience: 60 | self.early_stop = True 61 | else: 62 | self.best_score = val_score 63 | self.save_checkpoint(model, path) 64 | self.counter = 0 65 | 66 | def save(self, model, path): 67 | if self.save_onnx: 68 | export_onnx(model, path) 69 | else: 70 | torch.save(model.state_dict(), path) 71 | 72 | def save_checkpoint(self, model, path): 73 | """Save model when validation score improves. 
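For instance (model and path names illustrative; see the note below on passing lists), an encoder/decoder pair can be checkpointed together through __call__:

    >>> early_stopping(val_loss, [encoder, decoder], ['encoder.pth', 'decoder.pth'])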
74 | 75 | The model parameter can be a list that allows multiple models to be saved. 76 | """ 77 | 78 | if self.verbose: 79 | self.logger.info('Validation score improved. Saving model ...\n') 80 | if not isinstance(model, list): 81 | self.save(model, path) 82 | else: 83 | for m, p in zip(model, path): 84 | self.save(m, p) 85 | 86 | 87 | class DistributedEarlyStopping(EarlyStopping): 88 | def __init__(self, logger, should_decrease=True, patience=3, verbose=True, delta=0, rank=None, save_onnx=False): 89 | super().__init__(logger, should_decrease, patience=patience, verbose=verbose, delta=delta, save_onnx=save_onnx) 90 | self.rank = rank 91 | 92 | def __call__(self, val_score, model, path, rank=None): 93 | if rank is None: 94 | rank = self.rank 95 | 96 | if rank and rank > 0: 97 | return 98 | 99 | if isinstance(model, DDP): 100 | model = model.module 101 | 102 | return super().__call__(val_score, model, path) 103 | 104 | 105 | if __name__ == '__main__': 106 | 107 | class TwoLayerNet(torch.nn.Module): 108 | """Simple two-layer neural network for demonstration purposes.""" 109 | 110 | def __init__(self, D_in, H, D_out): 111 | """Instantiate two nn.Linear modules and assign them as member variables.""" 112 | 113 | super().__init__() 114 | 115 | self.linear1 = torch.nn.Linear(D_in, H) 116 | self.linear2 = torch.nn.Linear(H, D_out) 117 | 118 | def forward(self, x): 119 | """In the forward function we accept a Tensor of input data and we must return a Tensor of output data. 120 | 121 | We can use Modules defined in the constructor as well as arbitrary operators on Tensors. 122 | """ 123 | 124 | h_relu = self.linear1(x).clamp(min=0) 125 | y = self.linear2(h_relu) 126 | return y 127 | 128 | # batch_size is the batch size; input_dim is the input dimension; 129 | # hidden_dim is the hidden dimension; output_dim is the output dimension. 130 | batch_size, input_dim, hidden_dim, output_dim = 1000, 1000, 100, 10 131 | 132 | # Create random Tensors to hold inputs and outputs 133 | X = torch.randn(batch_size, input_dim) 134 | y_true = torch.randn(batch_size, output_dim) 135 | 136 | # Construct our model by instantiating the class defined above 137 | mlp = TwoLayerNet(input_dim, hidden_dim, output_dim) 138 | 139 | # Construct our loss function and an Optimizer. The call to model.parameters() 140 | # in the SGD constructor will contain the learnable parameters of the two 141 | # nn.Linear modules which are members of the model. 142 | criterion = torch.nn.MSELoss(reduction='sum') 143 | optimizer = torch.optim.SGD(mlp.parameters(), lr=1e-4) 144 | logger = logging.getLogger(__name__) 145 | early_stopping = EarlyStopping(logger, should_decrease=True, patience=3, verbose=True, delta=1e-5) 146 | 147 | for t in range(10000): 148 | # Forward pass: Compute predicted y by passing x to the model 149 | y_pred = mlp(X) 150 | 151 | # Compute and log the loss 152 | loss = criterion(y_pred, y_true) 153 | if t % 100 == 99: 154 | logger.info('iteration %d: loss %f', t, loss.item()) 155 | early_stopping(loss.item(), mlp, 'early_stopping_test_model.pth') 156 | if early_stopping.early_stop: 157 | break 158 | 159 | # Zero gradients, perform a backward pass, and update the weights. 
160 | optimizer.zero_grad() 161 | loss.backward() 162 | optimizer.step() 163 | 164 | mlp.load_state_dict(torch.load('early_stopping_test_model.pth')) 165 | y_pred = mlp(X) 166 | loss = criterion(y_pred, y_true) 167 | logger.info('Best loss: {}'.format(loss.item())) 168 | -------------------------------------------------------------------------------- /caspr/utils/estimate_parameters.py: -------------------------------------------------------------------------------- 1 | def estimate_linear_parameters(input_dim, output_dim, bias=True): 2 | if input_dim > 0 and bias: 3 | input_dim += 1 4 | return input_dim * output_dim 5 | 6 | 7 | def estimate_embedding_parameters(df, cat_cols_, max_emb_dim): 8 | emb_num_classes = [df.select(c).distinct().count() for c in cat_cols_] 9 | emb_dims = [(x, int(min(max_emb_dim, (x + 1) // 2))) for x in emb_num_classes] 10 | emb_size = sum([d for _, d in emb_dims]) 11 | emb_num_param = sum([estimate_linear_parameters(v, d, bias=False) for v, d in emb_dims]) 12 | return emb_num_param, emb_size, emb_num_classes 13 | 14 | 15 | def estimate_transformer_parameters(hidden_dim, seq_len, pf_dim, num_layers, is_encoder=True): 16 | pos_emb_num_param = estimate_linear_parameters(seq_len, hidden_dim, bias=False) if is_encoder else 0 17 | layer_norm_num_param = hidden_dim * 2 18 | attn_num_param = estimate_linear_parameters(hidden_dim, hidden_dim) * 4 19 | layer_norm_count = 2 if is_encoder else 3 20 | attn_count = 1 if is_encoder else 2 21 | pf_num_param = estimate_linear_parameters(hidden_dim, pf_dim) + estimate_linear_parameters(pf_dim, hidden_dim) 22 | transformer_num_param = pos_emb_num_param + \ 23 | (layer_norm_num_param * layer_norm_count + attn_num_param * attn_count + pf_num_param) * num_layers 24 | return transformer_num_param 25 | 26 | 27 | def estimate_output_parameters(hidden_dim, emb_num_classes, cont_dim): 28 | output_num_param_cat = sum([estimate_linear_parameters(hidden_dim, v) for v in emb_num_classes]) 29 | output_num_param_cont = estimate_linear_parameters(hidden_dim, cont_dim) 30 | output_num_param = output_num_param_cat + output_num_param_cont 31 | return output_num_param 32 | 33 | 34 | def estimate_transformer_autoencoder_parameters(df, seq_cat_, seq_cont_, non_seq_cat_, non_seq_cont_, 35 | hidden_dim, pf_dim_enc, pf_dim_dec, num_layers_enc, 36 | num_layers_dec, seq_len, max_emb_dim=30): 37 | emb_num_param_seq, emb_size_seq, emb_num_classes_seq = estimate_embedding_parameters(df, seq_cat_, max_emb_dim) 38 | emb_num_param_non_seq, emb_size_non_seq, emb_num_classes_non_seq = estimate_embedding_parameters(df, non_seq_cat_, max_emb_dim) 39 | emb_num_param = emb_num_param_seq + emb_num_param_non_seq 40 | 41 | seq_cont_dim = len(seq_cont_) 42 | non_seq_cont_dim = len(non_seq_cont_) 43 | non_seq_dim = emb_size_non_seq + non_seq_cont_dim 44 | linear_num_param_seq = estimate_linear_parameters(seq_cont_dim + emb_size_seq, hidden_dim) 45 | linear_num_param_non_seq = estimate_linear_parameters(non_seq_dim, hidden_dim) 46 | linear_num_param = linear_num_param_seq + linear_num_param_non_seq 47 | 48 | adjust_seq_len = seq_len + int(non_seq_dim > 0) 49 | enc_num_param = estimate_transformer_parameters(hidden_dim, adjust_seq_len, pf_dim_enc, num_layers_enc) 50 | dec_num_param = estimate_transformer_parameters(hidden_dim, adjust_seq_len, pf_dim_dec, num_layers_dec, is_encoder=False) 51 | transformer_num_param = enc_num_param + dec_num_param 52 | 53 | output_num_param_seq = estimate_output_parameters(hidden_dim, emb_num_classes_seq, seq_cont_dim) 54 | 
output_num_param_non_seq = estimate_output_parameters(hidden_dim, emb_num_classes_non_seq, non_seq_cont_dim) 55 | output_num_param = output_num_param_seq + output_num_param_non_seq 56 | 57 | num_param = emb_num_param + linear_num_param + transformer_num_param + output_num_param 58 | return num_param 59 | -------------------------------------------------------------------------------- /caspr/utils/explain/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # -------------------------------------------------------------------------- 4 | 5 | """The explainer module for the CASPR library. 6 | 7 | Modules: 8 | :module1_name: A description of this specific module. 9 | """ 10 | -------------------------------------------------------------------------------- /caspr/utils/explain/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def separate_pos_neg(attribution): 7 | """Separate out positive and negative attributes in the dataframe with attributes. 8 | 9 | Outputs two separated dataframes 10 | """ 11 | attribution_pos_val = attribution*(attribution >= 0) 12 | attribution_neg_val = attribution*~(attribution >= 0) 13 | return attribution_pos_val, attribution_neg_val 14 | 15 | 16 | def visualize(explanations: pd.DataFrame, separate_pos_neg_imp: bool = False, 17 | title="Average Feature Importances", axis_title="Features", save_fig: str = None): 18 | """Visualize explanations. 19 | 20 | Utility function used to create bar graph visualisations at a model level 21 | 22 | Args: 23 | explanations (pandas dataframe): Dataframe with feature attributions 24 | separate_pos_neg_imp (Boolean: Default = False): Determines if the positive and negative attributions are to be 25 | aggregated and plotted separately (two reverse sided bars) in the same plot 26 | title (String : Default = "Average Feature Importances") : Represents the title of the graph 27 | axis_title (String: Default = "Features") : Represents the title of the Y axis 28 | save_fig (String) : Contains the path where to save the image plot. 
If None: the module doesn't save the image. 29 | 30 | """ 31 | feature_names = explanations.columns 32 | imp_pos_df, imp_neg_df = separate_pos_neg(explanations) 33 | combine_importances = not separate_pos_neg_imp 34 | 35 | importances_pos = imp_pos_df.values 36 | importances_neg = imp_neg_df.values 37 | 38 | if importances_pos.ndim == 2: 39 | importances_pos = np.mean(importances_pos, axis=0) 40 | importances_neg = np.mean(importances_neg, axis=0) 41 | 42 | xlim_pos = np.max(importances_pos)*1.25 43 | xlim_neg = np.max(np.abs(importances_neg))*1.25 44 | 45 | if combine_importances: 46 | xlim_pos += xlim_neg 47 | xlim_neg = 0 48 | importances_pos += np.abs(importances_neg) 49 | 50 | else: 51 | xlim_pos = np.max([xlim_pos, xlim_neg]) 52 | xlim_neg = -1 * xlim_pos 53 | 54 | x_pos = (np.arange(len(feature_names))) 55 | 56 | # Plotting begins 57 | plt.figure(figsize=(10, 10)) 58 | width = 0.3 59 | 60 | if combine_importances: 61 | plt.barh(x_pos, importances_pos, width, align='center') 62 | else: 63 | plt.barh(x_pos, importances_pos, width, align='center') 64 | plt.barh(x_pos + width, importances_neg, width, align='center') 65 | 66 | plt.yticks(x_pos + width/2, feature_names, wrap=True) 67 | plt.ylabel(axis_title) 68 | plt.title(title) 69 | axes = plt.gca() 70 | axes.set_xlim([xlim_neg, xlim_pos]) 71 | 72 | if save_fig is not None: 73 | plt.savefig(save_fig) 74 | -------------------------------------------------------------------------------- /caspr/utils/horovod/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | 3 | # 4 | 5 | # Unless required by applicable law or agreed to in writing, software 6 | 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | 11 | # See the License for the specific language governing permissions and 12 | 13 | # limitations under the License.
14 | 15 | # 16 | 17 | # ============================================================================== -------------------------------------------------------------------------------- /caspr/utils/horovod/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import horovod.torch as hvd 4 | import torch 5 | import torch.nn as nn 6 | from torch import optim 7 | from torch.utils.data.distributed import DistributedSampler 8 | 9 | from caspr.data.common_dataset import id_collate 10 | from caspr.utils.early_stopping import DistributedEarlyStopping 11 | from caspr.utils.train import init_lr_schedulers, run_autoencoder, run_autoencoder_val 12 | 13 | BATCH_SIZE = 1024 * 32 14 | NUM_EPOCHS = 100 15 | EARLY_STOPPING_PATIENCE = 8 16 | EARLY_STOPPING_DELTA = 1e-5 17 | ROOT_RANK = 0 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def save_checkpoint(model, optimizer, epoch, name): 22 | filepath = '/checkpoint-{epoch}-{model}.pth'.format(epoch=epoch, model=name) 23 | state = { 24 | 'model': model.state_dict(), 25 | 'optimizer': optimizer.state_dict(), 26 | } 27 | torch.save(state, filepath) 28 | 29 | 30 | def metric_average(metric, name): 31 | avg_tensor = hvd.allreduce(metric, name=name) 32 | return avg_tensor.item() 33 | 34 | 35 | def determine_early_stop(early_stopper: DistributedEarlyStopping, loss_averaged, model, path, epoch, num_epochs): 36 | # Call the distributed early stopper while passing rank info 37 | # Only rank 0 is allowed to checkpoint 38 | early_stopper(loss_averaged, model, 39 | path, hvd.rank()) 40 | if early_stopper.early_stop: 41 | epoch = num_epochs 42 | # The decision on whether to stop is made by the root rank 43 | # and is then broadcast to the other nodes 44 | epoch = hvd.broadcast_object(epoch, root_rank=ROOT_RANK) 45 | 46 | # The root rank loads the latest model checkpoint and broadcasts parameters 47 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 48 | model.load_state_dict(torch.load(path)) 49 | hvd.broadcast_parameters(model.state_dict(), root_rank=ROOT_RANK) 50 | return epoch 51 | 52 | 53 | def train_hvd(dataset_train, autoenc, device, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stopping_test_model.pth'): 54 | autoenc.train() 55 | hvd.init() 56 | logger.info("Number of workers:" + str(hvd.size())) 57 | 58 | if device.type == 'cuda': 59 | # Horovod: pin GPU to local rank. 60 | torch.cuda.set_device(hvd.local_rank()) 61 | 62 | # Configure the sampler such that each worker obtains a distinct sample of input dataset. 63 | train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank()) 64 | train_loader = torch.utils.data.DataLoader( 65 | dataset_train, batch_size=batch_size, sampler=train_sampler, collate_fn=id_collate) 66 | 67 | num_epochs = epochs 68 | 69 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 70 | # An increase in learning rate compensates for the increased batch size. 71 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 72 | # Wrap the optimizer with Horovod's DistributedOptimizer.
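# Horovod's wrapper intercepts step(): each worker's gradients are averaged
# across all workers with allreduce before the underlying Adam update is
# applied, keeping every replica's parameters in sync after each batch.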
73 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 74 | 75 | scheduler_wu, scheduler_re = init_lr_schedulers( 76 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 77 | 78 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 79 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 80 | 81 | # Broadcast initial parameters so all workers start with the same parameters. 82 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 83 | 84 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 85 | 86 | losses = [] 87 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 88 | 89 | epoch = 1 90 | while epoch < num_epochs + 1: 91 | losses, _ = run_autoencoder(autoenc, optimizer, train_loader, criterion, device) 92 | loss_averaged = metric_average(torch.tensor(losses), 'avg_loss') 93 | logger.info("Average overall training loss in epoch {0} is {1}".format( 94 | epoch, loss_averaged)) 95 | 96 | epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs) 97 | 98 | if epoch <= warmup_epochs: 99 | scheduler_wu.step() 100 | scheduler_re.step(loss_averaged) 101 | 102 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 103 | if save_model: 104 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 105 | return autoenc, loss_averaged 106 | epoch = epoch+1 107 | 108 | 109 | def train_val_hvd(dataset_train, dataset_val, autoenc, device, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stopping_test_model.pth'): 110 | autoenc.train() 111 | hvd.init() 112 | logger.info("Number of workers:" + str(hvd.size())) 113 | 114 | if device.type == 'cuda': 115 | # Horovod: pin GPU to local rank. 116 | torch.cuda.set_device(hvd.local_rank()) 117 | 118 | # Configure the sampler such that each worker obtains a distinct sample of input dataset. 119 | train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank()) 120 | train_loader = torch.utils.data.DataLoader( 121 | dataset_train, batch_size=batch_size, sampler=train_sampler, collate_fn=id_collate) 122 | 123 | val_sampler = DistributedSampler(dataset_val, num_replicas=hvd.size(), rank=hvd.rank()) 124 | val_loader = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, 125 | sampler=val_sampler, collate_fn=id_collate) 126 | 127 | num_epochs = epochs 128 | 129 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 130 | # An increase in learning rate compensates for the increased batch size. 131 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 132 | # Wrap the optimizer with Horovod's DistributedOptimizer. 133 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 134 | 135 | scheduler_wu, scheduler_re = init_lr_schedulers( 136 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 137 | 138 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 139 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 140 | 141 | # Broadcast initial parameters so all workers start with the same parameters. 
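# Without this broadcast every worker would start from its own random
# initialization, and averaging gradients across workers would no longer
# correspond to training a single consistent model.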
142 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 143 | 144 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 145 | 146 | losses = [] 147 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 148 | 149 | epoch = 1 150 | while epoch < num_epochs + 1: 151 | autoenc.train() 152 | losses, _ = run_autoencoder(autoenc, optimizer, train_loader, criterion, device) 153 | autoenc.eval() 154 | losses_val = run_autoencoder_val(autoenc, val_loader, criterion, device) 155 | loss_train_averaged = metric_average(torch.tensor(losses), 'avg_train_loss') 156 | loss_val_averaged = metric_average(torch.tensor(losses_val), 'avg_val_loss') 157 | 158 | logger.info("Average training loss in epoch {0} is {1}".format(epoch, loss_train_averaged)) 159 | logger.info("Average validation loss in epoch {0} is {1}".format(epoch, loss_val_averaged)) 160 | 161 | if epoch <= warmup_epochs: 162 | scheduler_wu.step() 163 | scheduler_re.step(loss_val_averaged) 164 | 165 | epoch = determine_early_stop(early_stopper, loss_val_averaged, autoenc, path, epoch, num_epochs) 166 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 167 | if save_model: 168 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 169 | return autoenc, loss_val_averaged 170 | epoch = epoch+1 171 | -------------------------------------------------------------------------------- /caspr/utils/metrics.py: -------------------------------------------------------------------------------- 1 | """Get classification metrics for CASPR models.""" 2 | 3 | # coding: utf-8 4 | import logging 5 | 6 | from sklearn.metrics import auc, classification_report, confusion_matrix, precision_recall_curve, roc_auc_score 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_topk_values_if_churn(k, preds, y): 11 | """Check how many of top k churn predictions actually churned.""" 12 | 13 | pred_arr = preds.cpu() 14 | pred_arr = pred_arr.detach().numpy() 15 | topk = pred_arr.argsort()[-k:][::-1] 16 | count = 0 17 | for ind in topk: 18 | if y[ind] == 1: 19 | count += 1 20 | return count 21 | 22 | 23 | def pr_auc_score(y_true, y_score): 24 | """Get pr_auc score.""" 25 | 26 | precision, recall, _ = precision_recall_curve(y_true, y_score) 27 | pr_auc = auc(recall, precision) 28 | return pr_auc 29 | 30 | 31 | def get_metrics(y_true, y_score, threshold=0.5, digits=3): 32 | """Get classification report, confusion matrix, roc_auc score, and pr_auc score.""" 33 | 34 | y_pred = y_score > threshold 35 | 36 | report = classification_report(y_true, y_pred, digits=digits) 37 | report_dict = convert_classification_report_to_dict(report) 38 | logger.info(report) 39 | 40 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 41 | report_dict.update({'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn}) 42 | logger.info("tp: {}, fp: {}, tn: {}, fn: {}".format(tp, fp, tn, fn)) 43 | 44 | roc_auc = roc_auc_score(y_true, y_score) 45 | pr_auc = pr_auc_score(y_true, y_score) 46 | report_dict.update({'roc_auc_score': roc_auc, 'pr_auc_score': pr_auc}) 47 | logger.info("roc_auc_score = {:.4f}, pr_auc_score = {:.4f}".format(roc_auc, pr_auc)) 48 | 49 | return report_dict 50 | 51 | 52 | def convert_classification_report_to_dict(report): 53 | """Convert classification report to Dict format.""" 54 | 55 | rows = [row.split() for row in report.split('\n') if row] 56 | headers = rows[0] 57 | report_dict = {} 58 | for row in rows[1:]: 59 | if row[1] == 'avg': 60 | label, scores = ' '.join(row[:2]), row[2:] 61 | else: 62 | label, scores 
= row[0], row[1:] 63 | 64 | if label == 'accuracy': 65 | report_dict[label] = float(scores[-2]) 66 | else: 67 | report_dict[label] = dict(zip(headers, [float(score) for score in scores[:-1]] + [int(scores[-1])])) 68 | return report_dict 69 | -------------------------------------------------------------------------------- /caspr/utils/noise.py: -------------------------------------------------------------------------------- 1 | """Noise class for generating noisy data.""" 2 | 3 | import torch 4 | 5 | 6 | class Noise(torch.nn.Module): 7 | """Add different types of noise to the sequential inputs for denoising autoencoder. 8 | 9 | Usage: 10 | noise = Noise(emb_dims, gau_prob, sub_prob, shuffle_dist) 11 | seq_cat_noisy, seq_cont_noisy = noise(seq_cat, seq_cont) 12 | """ 13 | 14 | def __init__(self, emb_dims, gau_prob=0.1, sub_prob=0.1, shuffle_dist=1): 15 | """Initialize Noise objects with probabilities for different noise types. 16 | 17 | Args: 18 | emb_dims (List of tuples (x, y)): Embedding dimensions where x is the vocab size and 19 | y is the embedding size for every categorical variable. 20 | gau_prob (float): Probability of adding gaussian noise to the continuous variables. 21 | sub_prob (float): Probability of substituting a categorical value with another randomly selected one. 22 | shuffle_dist (int): The max distance that each element will be away from its original position 23 | after shuffling. 24 | """ 25 | 26 | super().__init__() 27 | 28 | self.gau_prob = gau_prob 29 | self.sub_prob = sub_prob 30 | self.shuffle_dist = shuffle_dist 31 | self.vocab_sizes = [dim[0] for dim in emb_dims] 32 | 33 | def forward(self, seq_cat_data, seq_cont_data): 34 | """Run a forward pass of the module over the data to add noise.""" 35 | 36 | return self.add_noise(seq_cat_data, seq_cont_data) 37 | 38 | def add_noise(self, seq_cat_data, seq_cont_data): 39 | """Add noise to the sequential data based on the specified probabilities. 40 | 41 | Args: 42 | seq_cat_data (Tensors): Sequential categorical data. 43 | seq_cont_data (Tensors): Sequential continuous data. 
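Returns:
    Tuple of (seq_cat_data, seq_cont_data) with substitution, gaussian and shuffle noise applied according to the configured probabilities.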
44 | """ 45 | 46 | if self.sub_prob > 0: 47 | seq_cat_data = self._word_substitute(seq_cat_data) 48 | 49 | if self.gau_prob > 0: 50 | seq_cont_data = self._word_gaussian(seq_cont_data) 51 | 52 | if self.shuffle_dist > 0: 53 | seq_cat_data, seq_cont_data = self._word_shuffle(seq_cat_data, seq_cont_data) 54 | 55 | return seq_cat_data, seq_cont_data 56 | 57 | def _word_shuffle(self, seq_cat_data, seq_cont_data): 58 | batch_size, seq_len, _ = seq_cat_data.size() 59 | base = torch.arange(seq_len, dtype=torch.float).repeat(batch_size, 1) 60 | inc = (self.shuffle_dist+1) * torch.rand((batch_size, seq_len)) 61 | _, sigma = (base + inc).sort(dim=1) 62 | return (seq_cat_data[torch.arange(batch_size).unsqueeze(1), sigma], 63 | seq_cont_data[torch.arange(batch_size).unsqueeze(1), sigma]) 64 | 65 | def _word_substitute(self, x): 66 | keep = (torch.rand(x.size(), device=x.device) > self.sub_prob) 67 | x_ = x.clone() 68 | for i in range(len(self.vocab_sizes)): 69 | x_[:, :, i].random_(0, self.vocab_sizes[i]) 70 | x_[keep] = x[keep] 71 | return x_ 72 | 73 | def _word_gaussian(self, x): 74 | gaussian = (torch.rand(x.size(), device=x.device) < self.gau_prob) 75 | x_ = x.clone() 76 | x_ += torch.randn(x.size(), device=x.device) * gaussian 77 | return x_ 78 | -------------------------------------------------------------------------------- /caspr/utils/onnx.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import torch 3 | from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_available_providers 4 | from torch.onnx import register_custom_op_symbolic 5 | 6 | from caspr.models.factory import LSTM, TRANSFORMER 7 | from caspr.utils.preprocess import get_nonempty_tensors 8 | from caspr.utils.score import get_architecture 9 | 10 | OPSET_VERSION = 12 11 | SEQ_CAT_INDEX = 0 12 | SEQ_CONT_INDEX = 1 13 | NON_SEQ_CAT_INDEX = 2 14 | NON_SEQ_CONT_INDEX = 3 15 | 16 | _onnx_opset_version = 1 17 | 18 | def register_custom_op(): 19 | """ 20 | This function registers symbolic functions for 21 | custom ops that are implemented as part of ONNX Runtime 22 | """ 23 | 24 | # Symbolic definition 25 | def inverse(g, self): 26 | return g.op("com.microsoft::Inverse", self) 27 | 28 | def gelu(g, self): 29 | return g.op("com.microsoft::Gelu", self) 30 | 31 | def triu(g, self, diagonal): 32 | return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1) 33 | 34 | def tril(g, self, diagonal): 35 | return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0) 36 | 37 | # Op Registration 38 | register_custom_op_symbolic('::inverse', inverse, _onnx_opset_version) 39 | register_custom_op_symbolic('::gelu', gelu, _onnx_opset_version) 40 | register_custom_op_symbolic('::triu', triu, _onnx_opset_version) 41 | register_custom_op_symbolic('::tril', tril, _onnx_opset_version) 42 | 43 | 44 | def unregister_custom_op(): 45 | """ 46 | This function unregisters symbolic functions for 47 | custom ops that are implemented as part of ONNX Runtime 48 | """ 49 | 50 | import torch.onnx.symbolic_registry as sym_registry 51 | 52 | # TODO: replace this once PyTorch supports unregister natively. 
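# The helper below walks the stable ONNX opsets at or above the requested
# version and removes the symbolic function from torch's internal registry,
# undoing what register_custom_op_symbolic registered above.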
53 | def unregister(name, opset_version): 54 | ns, kind = name.split("::") 55 | from torch.onnx.symbolic_helper import _onnx_stable_opsets 56 | 57 | for version in _onnx_stable_opsets: 58 | if version >= opset_version and sym_registry.is_registered_op(kind, ns, version): 59 | del sym_registry._registry[(ns, version)][kind] 60 | 61 | unregister('::inverse', _onnx_opset_version) 62 | unregister('::gelu', _onnx_opset_version) 63 | unregister('::triu', _onnx_opset_version) 64 | unregister('::tril', _onnx_opset_version) 65 | 66 | 67 | def get_input_names(nonempty_idx): 68 | mapping = {SEQ_CAT_INDEX: 'seq_cat', SEQ_CONT_INDEX: 'seq_cont', 69 | NON_SEQ_CAT_INDEX: 'non_seq_cat', NON_SEQ_CONT_INDEX: 'non_seq_cont'} 70 | input_names = [mapping[idx] for idx in nonempty_idx if idx in mapping] + ['nonempty_idx'] 71 | return input_names 72 | 73 | 74 | def get_dummy_inputs(model): 75 | if get_architecture(model) == TRANSFORMER: 76 | seq_cat_dim = len(model.unified_encoder.emb_seq.emb_layers) 77 | seq_cont_dim = model.unified_encoder.seq_cont_dim 78 | non_seq_cat_dim = len(model.unified_encoder.emb_non_seq.emb_layers) 79 | non_seq_cont_dim = model.unified_encoder.non_seq_cont_dim 80 | adjust_seq_len = model.unified_encoder.transformer_encoder.pos_embedding.num_embeddings 81 | seq_len = adjust_seq_len - int((non_seq_cat_dim + non_seq_cont_dim) > 0) 82 | elif get_architecture(model) == LSTM: 83 | seq_cat_dim = model.unified_encoder.seq_cat_count 84 | seq_cont_dim = model.unified_encoder.seq_cont_count 85 | non_seq_cat_dim = model.unified_encoder.non_seq_cat_count 86 | non_seq_cont_dim = model.unified_encoder.non_seq_cont_count 87 | seq_len = model.unified_encoder.seq_len 88 | 89 | device = next(model.parameters()).device 90 | seq_cat_dummy = torch.zeros((1, seq_len, seq_cat_dim), dtype=torch.long, device=device) 91 | seq_cont_dummy = torch.zeros((1, seq_len, seq_cont_dim), dtype=torch.float32, device=device) 92 | non_seq_cat_dummy = torch.zeros((1, non_seq_cat_dim), dtype=torch.long, device=device) 93 | non_seq_cont_dummy = torch.zeros((1, non_seq_cont_dim), dtype=torch.float32, device=device) 94 | 95 | dummy = (seq_cat_dummy, seq_cont_dummy, non_seq_cat_dummy, non_seq_cont_dummy) 96 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(dummy) 97 | dummy_inputs = (*nonempty_tensors, torch.tensor(nonempty_idx)) 98 | 99 | input_names = get_input_names(nonempty_idx) 100 | 101 | return dummy_inputs, input_names 102 | 103 | 104 | def export_onnx(model, model_path): 105 | model.eval() 106 | 107 | dummy_inputs, input_names = get_dummy_inputs(model) 108 | 109 | with torch.no_grad(): 110 | dummy_outputs = model.unified_encoder(*dummy_inputs) 111 | output_names = [f"output_{i}" for i in range(len(dummy_outputs))] 112 | 113 | dynamic_axes = dict.fromkeys(input_names + output_names, {0: 'batch_size'}) 114 | torch.onnx.export(model=model.unified_encoder, 115 | args=dummy_inputs, 116 | f=model_path, 117 | input_names=input_names, 118 | output_names=output_names, 119 | dynamic_axes=dynamic_axes, 120 | opset_version=OPSET_VERSION, 121 | custom_opsets={'com.microsoft': 1}, 122 | do_constant_folding=True) 123 | 124 | 125 | def to_numpy(tensor): 126 | if tensor.requires_grad: 127 | return tensor.detach().cpu().numpy() 128 | else: 129 | return tensor.cpu().numpy() 130 | 131 | 132 | class ONNXWrapper: 133 | 134 | def __init__(self, model_path_or_proto, model_type=TRANSFORMER): 135 | if isinstance(model_path_or_proto, str): 136 | with open(model_path_or_proto, 'rb') as model_file: 137 | self.model_bytes = model_file.read() 138 
| else: 139 | self.model_bytes = onnx._serialize(model_path_or_proto) 140 | self.session = self.load() 141 | self.model_type = model_type 142 | 143 | def __getstate__(self): 144 | state = self.__dict__.copy() 145 | del state['session'] 146 | return state 147 | 148 | def __setstate__(self, state): 149 | self.__dict__.update(state) 150 | self.session = self.load() 151 | 152 | def unified_encoder(self, *args): 153 | nonempty_tensors = args[:-1] 154 | inputs = list(map(to_numpy, nonempty_tensors)) 155 | ort_inputs = dict((self.session.get_inputs()[i].name, inp) for i, inp in enumerate(inputs)) 156 | return (torch.from_numpy(out) for out in self.session.run(None, ort_inputs)) 157 | 158 | def load(self, device=torch.device('cpu'), enable_all_optimization=True): 159 | sess_options = SessionOptions() 160 | sess_options.graph_optimization_level = ( 161 | GraphOptimizationLevel.ORT_ENABLE_ALL 162 | if enable_all_optimization 163 | else GraphOptimizationLevel.ORT_ENABLE_BASIC 164 | ) 165 | 166 | use_gpu = 'cuda' in device.type and 'CUDAExecutionProvider' in get_available_providers() 167 | execution_providers = ( 168 | ["CPUExecutionProvider"] if not use_gpu else ["CUDAExecutionProvider", "CPUExecutionProvider"] 169 | ) 170 | 171 | session = InferenceSession(self.model_bytes, sess_options, providers=execution_providers) 172 | return session 173 | 174 | def to(self, device): 175 | self.session = self.load(device) 176 | 177 | def cpu(self): 178 | self.to(torch.device('cpu')) 179 | 180 | def eval(self): 181 | pass 182 | -------------------------------------------------------------------------------- /caspr/utils/score.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from caspr.data.common_dataset import id_collate 7 | from caspr.models.factory import LSTM, TRANSFORMER 8 | from caspr.utils.preprocess import get_nonempty_tensors 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def run_autoencoder_score(autoenc, dataloader_test, device): 13 | 14 | embeddings = [] 15 | tgt_ids = [] 16 | 17 | for tgt_id, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_test: 18 | 19 | data = [seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data] 20 | if isinstance(autoenc, torch.nn.Module): 21 | data = [d.to(device) for d in data] 22 | 23 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 24 | 25 | tgt_ids.append(tgt_id) 26 | 27 | if get_architecture(autoenc) == TRANSFORMER: 28 | emb, _, _ = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 29 | # Concatenate across timesteps 30 | emb = emb.reshape(emb.shape[0], -1) 31 | embeddings.append(emb.detach().cpu() if isinstance(emb, torch.Tensor) else emb) 32 | 33 | elif get_architecture(autoenc) == LSTM: 34 | _, (hn, _) = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 35 | embeddings.append(hn.detach().cpu() if isinstance(hn, torch.Tensor) else hn) 36 | 37 | tgt_ids = np.concatenate(tgt_ids, axis=0) 38 | embeddings = np.concatenate(embeddings, axis=0) 39 | embeddings_with_id = np.hstack((tgt_ids, embeddings)) 40 | 41 | return embeddings_with_id 42 | 43 | def score(dataset_test, autoenc, device, batch_size=1024): 44 | autoenc.eval() 45 | test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, collate_fn=id_collate) 46 | 47 | logger.info("Performing inference on given dataset") 48 | embeddings = run_autoencoder_score(autoenc, test_loader, device) 49 | return embeddings 50 | 51 
| def get_architecture(model): 52 | return model.__class__.__name__ if isinstance(model, torch.nn.Module) else model.model_type 53 | -------------------------------------------------------------------------------- /caspr/utils/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # -------------------------------------------------------------------------- 4 | 5 | """The segmentation module for the CASPR library. 6 | 7 | Modules: 8 | :module1_name: A description of this specific module. 9 | """ 10 | -------------------------------------------------------------------------------- /caspr/utils/segmentation/dec_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from scipy.optimize import linear_sum_assignment 4 | from sklearn.cluster import KMeans 5 | 6 | from caspr.utils.preprocess import get_nonempty_tensors 7 | 8 | 9 | def cluster_initialize(model, dataloader, device): 10 | """Initialize cluster. 11 | 12 | Args: 13 | model (nn.Module): # noqa: W0223 Pretrained encoder-decoder model 14 | dataloader (DataLoader): Data loader that provides an iterable over the given dataset 15 | device ('cpu' or 'cuda'): Describes the machine on which the code is running 16 | """ 17 | kmeans = KMeans(model.cluster_number, n_init=20) 18 | model.train() 19 | encoder_embs = [] 20 | labels = [] 21 | # form initial cluster centres 22 | for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader: 23 | seq_cat_x = seq_cat_x.to(device) 24 | seq_cont_x = seq_cont_x.to(device) 25 | non_seq_cat_x = non_seq_cat_x.to(device) 26 | non_seq_cont_x = non_seq_cont_x.to(device) 27 | 28 | data = (seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x) 29 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 30 | encoder_embs.append(model.enc(*nonempty_tensors, nonempty_idx).detach().cpu()) 31 | 32 | labels.append(y) 33 | 34 | labels = torch.cat(labels).long() 35 | 36 | predicted = kmeans.fit_predict(torch.cat(encoder_embs).numpy()) 37 | predicted_tensor = torch.tensor(np.copy(predicted), dtype=torch.long) 38 | _, accuracy = cluster_accuracy(predicted, labels.cpu().numpy()) 39 | print('Initial Cluster Acc: ', accuracy) 40 | cluster_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float, requires_grad=True).to(device) 41 | with torch.no_grad(): 42 | # initialise the cluster centers 43 | model.state_dict()['assignment.cluster_centers'].copy_(cluster_centers) 44 | return predicted_tensor 45 | 46 | 47 | def cluster_accuracy(y_true, y_predicted, cluster_number=None): 48 | """Calculate clustering accuracy after using the linear_sum_assignment function in SciPy to determine reassignments. 
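The Hungarian algorithm chooses the cluster-to-label mapping that maximizes agreements between predictions and labels. Illustrative example: y_true = [1, 1, 0] with y_predicted = [0, 0, 1] becomes a perfect match after relabeling (cluster 0 -> label 1, cluster 1 -> label 0), so the accuracy is 1.0.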
49 | 50 | Args: 51 | y_true (List of int): list of true cluster numbers, an integer array 0-indexed 52 | y_predicted (List of int): list of predicted cluster numbers, an integer array 0-indexed 53 | cluster_number (int): number of clusters, if None then calculated from input 54 | Return: 55 | reassignment dictionary, clustering accuracy 56 | """ 57 | if cluster_number is None: 58 | cluster_number = max(y_predicted.max(), y_true.max()) + 1 # assume labels are 0-indexed 59 | count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64) 60 | for i in range(y_predicted.size): 61 | count_matrix[y_predicted[i], y_true[i]] += 1 62 | 63 | row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix) 64 | reassignment = dict(zip(row_ind, col_ind)) 65 | accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size 66 | return reassignment, accuracy 67 | 68 | 69 | def cluster_predict(model, dataloader, device): 70 | """Predict the cluster centers for the given input data. 71 | 72 | Args: 73 | model (nn.Module): # noqa: W0223 Pretrained encoder-decoder model 74 | dataloader (DataLoader): Data loader that provides an iterable over the given dataset 75 | device ('cpu' or 'cuda'): Describes the machine on which the code is running 76 | """ 77 | features = [] 78 | labels = [] 79 | for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader: 80 | seq_cat_x = seq_cat_x.to(device) 81 | seq_cont_x = seq_cont_x.to(device) 82 | non_seq_cat_x = non_seq_cat_x.to(device) 83 | non_seq_cont_x = non_seq_cont_x.to(device) 84 | 85 | data = (seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x) 86 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 87 | features.append(model(*nonempty_tensors, nonempty_idx).detach().cpu()) 88 | 89 | labels.append(y) 90 | 91 | return torch.cat(features).max(1)[1], torch.cat(labels).long() 92 | -------------------------------------------------------------------------------- /caspr/utils/segmentation/pandas.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.cluster import KMeans 7 | from sklearn.decomposition import PCA 8 | from sklearn.manifold import TSNE 9 | from sklearn.metrics import silhouette_score 10 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 11 | from tqdm import tqdm 12 | 13 | 14 | def check_sparsity(data): 15 | """Check sparsity in data.""" 16 | for c in data.columns: 17 | try: 18 | temp = pd.qcut(data[c], q=10, labels=False, duplicates='drop').value_counts()/data.shape[0] 19 | 20 | # top quantile%, unique value% 21 | print(c, np.round(temp.values[0], 2), np.round(len(data[c].unique())/data.shape[0], 2)) 22 | except Exception: 23 | print(c, np.nan, np.round(len(data[c].unique()), 2)) 24 | 25 | 26 | def quantile(df, q=5, col_features=None): 27 | """Score customers from 0 to 5 based on Engagement metrics. 
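Each feature in col_features is cut into q+1 quantile bins; features whose name contains 'R_' (recency-style metrics, where lower is better) are scored in reverse. The per-feature scores are averaged into AvgScore, which is then mapped to integer segments 1..q. Illustrative usage, assuming an RFM-style frame with hypothetical columns 'R_days', 'F_orders' and 'M_revenue':

    df = quantile(df, q=5, col_features=['R_days', 'F_orders', 'M_revenue'])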
28 | 29 | Input: dataframe containing the columns listed in col_features. 30 | Output: the same dataframe with per-feature quantile scores, AvgScore and Segment columns added. 31 | """ 32 | 33 | # create quantile scores [0, q] with q interval 34 | for c in col_features: 35 | if 'R_' in c: 36 | df[c+'_q'] = pd.qcut(df[c], q=q+1, labels=range(q, -1, -1), duplicates='drop') 37 | else: 38 | df[c+'_q'] = pd.qcut(df[c], q=q+1, labels=range(0, q+1), duplicates='drop') 39 | 40 | df['AvgScore'] = df[[c + '_q' for c in col_features]].mean(axis=1) 41 | df['AvgScore'].hist(bins=q) 42 | plt.title('AvgScore') 43 | plt.show() 44 | 45 | # generate segments 46 | df['Segment'] = np.nan 47 | for i in range(1, q+1): 48 | df.loc[(df.AvgScore <= i) & (df.AvgScore > (i-1)), 'Segment'] = i 49 | 50 | df['Segment'].hist(bins=q) 51 | plt.title('Segment') 52 | plt.show() 53 | 54 | return df 55 | 56 | 57 | def clustering(df, col_features=None, cluster_range=range(2, 10), scaling_option="minmax", 58 | pca=True, pca_param={'threshold': 0.8, 'show_plot': False}, 59 | default_cluster_size=None, default_cluster_threshold=0.1, 60 | tsne_plt=True, tsne_sample=1000, removed_outlier=False): 61 | """Perform Clustering. 62 | 63 | Options to do transformation and PCA before performing clustering 64 | - featurization 65 | - find # of clusters 66 | - fit final model 67 | """ 68 | inertias = [] 69 | sil_scores = [] 70 | 71 | # featurization 72 | df = apply_scaling(df, col_features, scaling_option, removed_outlier) 73 | 74 | if pca: 75 | df_features, n_pca, pca = apply_pca(df, col_features=col_features, pca_param=pca_param) # noqa W0612 76 | else: 77 | df_features = df[col_features].values 78 | 79 | # find # of clusters 80 | for k in tqdm(cluster_range): 81 | # kc = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=1) 82 | kc = KMeans(n_clusters=k, random_state=1, n_jobs=4) 83 | 84 | kc.fit(df_features) 85 | sil_scores.append(silhouette_score(df_features, kc.labels_)) 86 | inertias.append(kc.inertia_) 87 | 88 | results = pd.DataFrame(np.array([cluster_range, inertias, sil_scores]).T) 89 | results.columns = ['cluster_size', 'inertias', 'sil_scores'] 90 | n_final = cluster_range[np.where(sil_scores == np.max(sil_scores))[0][0]] 91 | print('optimal cluster size:', n_final, np.round(np.max(sil_scores), 2)) 92 | 93 | # fit final model 94 | if default_cluster_size is not None: 95 | # kc = KMeans(n_clusters=default_cluster_size, random_state=1) 96 | sil_score_default = results.loc[results.index == default_cluster_size, 'sil_scores'].values[0] 97 | 98 | if (np.max(sil_scores)/sil_score_default - 1) <= default_cluster_threshold: 99 | print('default is a good cluster size', sil_score_default, np.max(sil_scores)) 100 | kc = KMeans(n_clusters=default_cluster_size, random_state=1) 101 | else: 102 | print('optimal is a better cluster size', sil_score_default, np.max(sil_scores)) 103 | kc = KMeans(n_clusters=n_final, random_state=1) 104 | else: 105 | kc = KMeans(n_clusters=n_final, random_state=1) 106 | 107 | kc.fit(df_features) 108 | 109 | df['label'] = kc.labels_ 110 | 111 | # score visualization 112 | if len(cluster_range) > 1: 113 | _, axes = plt.subplots(1, 2, figsize=(10, 5)) 114 | results.plot(ax=axes[0], x='cluster_size', y='inertias') 115 | results.plot.bar(ax=axes[1], x='cluster_size', y='sil_scores') 116 | plt.show() 117 | 118 | # clustering size distribution 119 | print(df.label.value_counts().to_frame()/df.shape[0]) 120 | 121 | # tsne visualization 122 | if tsne_plt: 123 | if (tsne_sample > 0) & (tsne_sample < len(kc.labels_)): 124 | df_tsne = pd.DataFrame(df_features) 125 | df_tsne['label'] = kc.labels_ 126 | 127 |
plt_tsne(x=df_tsne.drop(columns=['label']).sample(n=tsne_sample, random_state=1).values, 128 | label=df_tsne.sample(n=tsne_sample, random_state=1).label.values) 129 | else: 130 | plt_tsne(x=df_features, label=kc.labels_) 131 | 132 | return results, df, kc 133 | 134 | 135 | def apply_scaling(df, col_features=None, scaling_option=None, removed_outlier=False): 136 | """Apply Scaling to dataframe.""" 137 | 138 | if scaling_option == 'minmax': 139 | scaler = MinMaxScaler() 140 | df[col_features] = scaler.fit_transform(df[col_features]) 141 | elif scaling_option == 'qcut': 142 | for c in col_features: 143 | df[c] = pd.qcut(df[c], q=100, labels=False, duplicates='drop') 144 | else: 145 | pass 146 | 147 | if removed_outlier: 148 | n_std = 3 149 | for c in col_features: 150 | if df[c].dtype != 'object': 151 | tic_cnt = df.shape[0] 152 | temp_mean = df[c].mean() 153 | temp_std = df[c].std() 154 | df = df[(df[c] <= (temp_mean + n_std*temp_std)) & (df[c] >= (temp_mean - n_std*temp_std))].copy() 155 | print('remove outlier', c, ':', df.shape[0] - tic_cnt) 156 | 157 | df.reset_index(drop=True, inplace=True) 158 | 159 | return df 160 | 161 | 162 | def apply_pca(df, col_features=None, pca_param={'threshold': 0.8, 'show_plot': False}): 163 | """Apply PCA transformation and return # of eigen-vectors based on threshold (20/80 rules).""" 164 | 165 | # normalize the input matrix 166 | matrix = df[col_features].values 167 | scaler = StandardScaler() 168 | scaler.fit(matrix) 169 | scaled_matrix = scaler.transform(matrix) 170 | 171 | # perform PCA 172 | pca = PCA() 173 | pca.fit(scaled_matrix) 174 | pca_samples = pca.transform(scaled_matrix) 175 | 176 | # # visualize explained variance 177 | # if pca_param['show_plot']: 178 | # fig, ax = plt.subplots(figsize=(10, 5)) 179 | # sns.set(font_scale=1) 180 | # plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid', 181 | # label='cumulative explained variance') 182 | # sns.barplot(np.arange(1,matrix.shape[1]+1), pca.explained_variance_ratio_, alpha=0.5, color = 'g', 183 | # label='individual explained variance') 184 | # plt.xlim(0, len(col_features)) 185 | # ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()]) 186 | # plt.ylabel('Explained variance', fontsize = 14) 187 | # plt.xlabel('Principal components', fontsize = 14) 188 | # plt.legend(loc='best', fontsize = 13) 189 | 190 | # define n_pca based on the threshold 191 | n_pca = np.where(pca.explained_variance_ratio_.cumsum() > pca_param['threshold'])[0][0] + 1 192 | print('# of pca components:', n_pca, '/', scaled_matrix.shape[1]) 193 | print('# of variance explained:', pca.explained_variance_ratio_.cumsum()[n_pca-1]) 194 | 195 | # see loadings of the main components 196 | df_pca_components = pd.DataFrame(pca.components_, columns=col_features) 197 | plt_bar(df_pca_components.head(n_pca).copy(), ncols=3, figsize=(10, 10), title='PCA ') 198 | 199 | # extract the main transformed features 200 | df_pca = pca_samples[:, 0:n_pca] 201 | 202 | return df_pca, n_pca, pca 203 | 204 | 205 | def profiling(df, label, col_features, col_dropped=[]): 206 | """Profile Dataframe using heatmap. 
207 | 208 | Heat-map around KPIs: absolute & relative 209 | - Useful technique to identify relative importance of each segment's attribute 210 | - Calculate average values of each cluster 211 | - Calculate average values of population 212 | - Calculate importance score by dividing them and subtracting 1 213 | (ensures 0 is returned when cluster average equals population average; e.g. a segment averaging 6 against a population average of 4 scores 6/4 - 1 = 0.5) 214 | 215 | col_dropped: automatic or apply min-max scaling to features [TO-DO] 216 | """ 217 | df['Segment'] = label 218 | 219 | # classifying cat vs. cont features 220 | cat_features = [] 221 | cont_features = [] 222 | for x in col_features: 223 | if df[x].dtypes == 'object': 224 | cat_features.append(x) 225 | else: 226 | cont_features.append(x) 227 | 228 | # customer counts 229 | df_count = df.groupby('Segment')[cont_features[0]].count() 230 | df_count.loc['All'] = df_count.sum() 231 | df_count = df_count.to_frame() 232 | df_count.columns = ['Customers'] 233 | df_count['Customers%'] = df_count.Customers/df_count.Customers.values[-1]*100 234 | 235 | # numerical features 236 | df_cont = df.groupby('Segment')[cont_features].mean() 237 | df_cont.loc['All'] = df_cont.mean() 238 | 239 | # categorical features 240 | df_cat = pd.DataFrame() 241 | for c in cat_features: 242 | df_pivot = df.pivot_table(index='Segment', 243 | columns=c, values=cont_features[0], aggfunc='count') 244 | df_pivot.loc['All'] = df_pivot.sum() 245 | df_pivot[df_pivot.columns] = df_pivot.values / df_pivot.sum(axis=1).values.reshape(-1, 1)*100 246 | df_cat = pd.concat([df_cat, df_pivot], axis=1) 247 | 248 | # combine results and calculate relative importance 249 | result_profile = pd.concat([df_count, df_cont, df_cat], axis=1) 250 | temp_all = result_profile.loc['All'] 251 | result_profile.drop('All', inplace=True) 252 | result_profile.sort_index(ascending=False, inplace=True) 253 | result_profile.loc['All'] = temp_all 254 | 255 | relative_imp = result_profile/result_profile.loc['All'] - 1 256 | relative_imp.drop('All', inplace=True) 257 | 258 | # visualization - heatmap 259 | temp = relative_imp.drop(columns=['Customers', 'Customers%'] + col_dropped).copy() 260 | plt_heatmap(temp, x_labels=temp.columns, y_labels=temp.index) 261 | 262 | # visualization - barchart by clusters 263 | plt_bar(temp) 264 | 265 | return relative_imp, result_profile 266 | 267 | 268 | def plt_tsne(x, label): 269 | """Visualize TSNE.""" 270 | tic = time.time() 271 | x_embedded = TSNE(n_components=2).fit_transform(x) 272 | print('tsne takes time: ', time.time() - tic) 273 | 274 | vis_x = x_embedded[:, 0] 275 | vis_y = x_embedded[:, 1] 276 | 277 | fig = plt.figure(figsize=(12, 8)) # noqa W0612 278 | plt.scatter(vis_x, vis_y, c=label, cmap=plt.cm.get_cmap("jet", 256)) 279 | plt.colorbar(ticks=range(256)) 280 | plt.clim(-0.5, 9.5) 281 | plt.show() 282 | 283 | 284 | def plt_heatmap(data, x_labels, y_labels): 285 | """Plot Heatmap.""" 286 | fig = plt.figure(figsize=(10, 5)) 287 | ax = fig.add_axes([1, 1, 1.1, 1.1]) 288 | 289 | plt.imshow(data, cmap='Blues', interpolation='nearest') 290 | ax.set_yticks(range(len(y_labels))) 291 | ax.set_yticklabels(y_labels) 292 | ax.set_xticks(range(len(x_labels))) 293 | ax.set_xticklabels(x_labels, rotation=60) 294 | plt.colorbar() 295 | plt.show() 296 | 297 | 298 | def plt_bar(data, ncols=3, figsize=(10, 10), title='Segment'): 299 | """Plot bars.""" 300 | 301 | data.dropna(axis=1, inplace=True) 302 | nrows = int(np.ceil(data.shape[0]/ncols)) 303 | xlim_min = data.min().min() 304 | xlim_max = data.max().max() 305 | 306 | fig =
plt.figure(figsize=figsize) 307 | for i in range(data.shape[0]): 308 | temp = data.iloc[i] 309 | ax = plt.subplot(nrows, ncols, i+1) 310 | ax.barh(range(len(temp)), temp.values, align='center') 311 | if title is not None: 312 | ax.set_title(title + str(data.index[i])) 313 | 314 | plt.xticks(rotation=45) 315 | plt.xlim(xlim_min, xlim_max) 316 | 317 | if i % ncols == 0: 318 | ax.set_yticks(range(len(temp))) 319 | ax.set_yticklabels(temp.index) 320 | 321 | fig.tight_layout() 322 | plt.show() 323 | 324 | 325 | def generate_segmentation_graphs(combined_df, profile_features, 326 | emb_features, use_profile=False, use_embedding=False): 327 | """Generate segmentation graphs. 328 | 329 | combined_df - the dataframe containing embeddings and profile features 330 | with feature names as the column names 331 | profile_features - the names of all the profile features in the data 332 | 333 | emb_features - name of the embedding features, by default they should be 'dim_0', 'dim_1'... 334 | 335 | use_profile - boolean flag to determine if we use the profile features or not 336 | 337 | use_embedding - boolean flag to determine if we use the embedding values 338 | """ 339 | 340 | # importlib.reload(segmentation_utils) 341 | 342 | df_emb = combined_df 343 | # Need to remove the dimensions of the embedding which have only one value if we use scaling 344 | col_one = [] 345 | for col in emb_features: 346 | if df_emb[col].nunique() == 1: 347 | col_one.append(col) 348 | df_emb = df_emb.drop(columns=col_one, axis=1) 349 | 350 | emb_featuresN = [] # noqa: C0103 351 | for item in emb_features: 352 | if item not in col_one: 353 | emb_featuresN.append(item) 354 | 355 | emb_features = emb_featuresN 356 | 357 | plt_heatmap(df_emb[emb_features].corr(), emb_features, emb_features) 358 | df_emb[emb_features].describe() 359 | 360 | features_to_use = [] 361 | if use_profile: 362 | features_to_use = profile_features 363 | if use_embedding: 364 | features_to_use = emb_features 365 | 366 | if use_embedding and use_profile: 367 | features_to_use = profile_features + emb_features 368 | 369 | n = 5000 370 | data_c = df_emb.sample(n=n, random_state=1).copy() 371 | 372 | results, df, kc = clustering(df=data_c.copy(), 373 | col_features=features_to_use, cluster_range=range(2, 9), scaling_option='qcut', 374 | pca=True, pca_param={'threshold': 0.8, 'show_plot': False}, 375 | default_cluster_size=None, default_cluster_threshold=0.1, 376 | tsne_plt=True, tsne_sample=1000, removed_outlier=False) 377 | 378 | col_features = emb_features + profile_features 379 | relative_imp, result_profile = profiling(data_c.copy(), kc.labels_, col_features, col_dropped=[]) 380 | 381 | 382 | def generate_combined_df(embedding_data=None, profile_data: pd.DataFrame = None): 383 | """Generate Combined DF.
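Embedding columns are named 'dim_0'..'dim_{M-1}' and concatenated column-wise with the profile features; either argument may be None, in which case the other is returned on its own. Illustrative usage (profile_df stands for any dataframe with one row per embedding row):

    emb = np.random.rand(100, 8)  # 100 customers, 8-dim embeddings
    combined = generate_combined_df(emb, profile_df)  # adds dim_0..dim_7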
384 | 385 | embedding data - The numpy array containing the embeddings in the NxM format where 386 | N = number of data entries, M = embedding dimension 387 | 388 | profile data - The dataframe containing all the profile features, with column names 389 | equal to the feature names 390 | 391 | """ 392 | if embedding_data is None: 393 | profile_data.reset_index(drop=True, inplace=True) 394 | return profile_data 395 | 396 | emb_dim = embedding_data.shape[1] 397 | column_list = [] 398 | for i in range(emb_dim): 399 | column_list.append('dim_' + str(i)) 400 | emb_df = pd.DataFrame(embedding_data, columns=column_list) 401 | emb_df.reset_index(drop=True, inplace=True) 402 | 403 | if profile_data is None: 404 | return emb_df 405 | 406 | profile_data.reset_index(drop=True, inplace=True) 407 | final_df = pd.concat([profile_data, emb_df], axis=1) 408 | return final_df 409 | -------------------------------------------------------------------------------- /caspr/utils/spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/spark/__init__.py -------------------------------------------------------------------------------- /caspr/utils/spark/large/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/spark/large/__init__.py -------------------------------------------------------------------------------- /caspr/utils/spark/large/score.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import torch.nn as nn 7 | from petastorm.pytorch import BatchedDataLoader 8 | 9 | from caspr.data.load import transform_and_load 10 | from caspr.models.model_wrapper import LSTMAutoencoder, TransformerAutoEncoder 11 | from caspr.utils.preprocess import get_nonempty_tensors 12 | from caspr.utils.spark.preprocess import remove_underscore_in_seq_col_name_list 13 | 14 | PS_HDFS_DRIVER = 'libhdfs3' 15 | # lower overhead, alternative is 'process' 16 | PS_WORKER_TYPE = 'thread' 17 | # assuming the training relies on SSD backed dbfs:/ml, Petastorm's caching can be disabled 18 | PS_CACHE_TYPE = None 19 | 20 | def get_default_parallelism(): 21 | try: 22 | return sc.defaultParallelism 23 | except NameError as _: 24 | # Spark Context not initialized (sc) 25 | return os.cpu_count() 26 | 27 | 28 | def run_autoencoder_score_peta(autoenc, steps_per_epoch, train_dataloader_iter, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 29 | 30 | embeddings = [] 31 | tgt_ids = [] 32 | 33 | for _ in range(steps_per_epoch): 34 | pd_batch = next(train_dataloader_iter) 35 | tgt_id, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load( 36 | pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps) 37 | 38 | data = (seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data) 39 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 40 | 41 | tgt_ids.append(tgt_id) 42 | 43 | if isinstance(autoenc, TransformerAutoEncoder): 44 | emb,
_, _ = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 45 | # Concatenate across timesteps 46 | emb = emb.view(emb.shape[0], -1) 47 | embeddings.append(emb.detach().cpu()) 48 | 49 | elif isinstance(autoenc, LSTMAutoencoder): 50 | _, (hn, _) = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 51 | embeddings.append(hn.detach().cpu()) 52 | 53 | tgt_ids = pd.DataFrame(np.concatenate(tgt_ids, axis=0)) 54 | tgt_ids.columns = tgt_id_col 55 | embeddings = pd.DataFrame(np.concatenate(embeddings, axis=0)) 56 | # embeddings_with_id = np.hstack((tgt_ids, embeddings)) 57 | embeddings_with_id = pd.concat([tgt_ids, embeddings], axis=1) 58 | return embeddings_with_id 59 | 60 | 61 | def score_peta(converter_test, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024): 62 | autoenc.eval() 63 | if torch.cuda.is_available(): 64 | device = torch.cuda.current_device() 65 | else: 66 | device = torch.device("cpu") 67 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 68 | 69 | with converter_test.make_torch_dataloader(batch_size=batch_size, data_loader_fn=BatchedDataLoader, 70 | num_epochs=None, cache_type=PS_CACHE_TYPE, 71 | workers_count=get_default_parallelism(), 72 | reader_pool_type=PS_WORKER_TYPE, 73 | hdfs_driver=PS_HDFS_DRIVER) as test_dataloader: 74 | test_dataloader_iter = iter(test_dataloader) 75 | steps_per_epoch = max(1, len(converter_test) // (batch_size)) 76 | embeddings = run_autoencoder_score_peta(autoenc, steps_per_epoch, test_dataloader_iter, 77 | device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) # noqa: E1121 78 | return embeddings 79 | -------------------------------------------------------------------------------- /caspr/utils/spark/large/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import horovod.torch as hvd 5 | import torch 6 | import torch.nn as nn 7 | from petastorm.pytorch import BatchedDataLoader 8 | from torch import optim 9 | from torch.utils.data.distributed import DistributedSampler 10 | 11 | from caspr.data.load import transform_and_load 12 | from caspr.utils.early_stopping import DistributedEarlyStopping 13 | from caspr.utils.horovod.train import determine_early_stop 14 | from caspr.utils.spark.large.score import get_default_parallelism 15 | from caspr.utils.spark.preprocess import remove_underscore_in_seq_col_name_list 16 | from caspr.utils.train import init_lr_schedulers 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def save_checkpoint(model, optimizer, epoch, name): 22 | filepath = '/checkpoint-{epoch}-{model}.pth'.format(epoch=epoch, model=name) 23 | state = { 24 | 'model': model.state_dict(), 25 | 'optimizer': optimizer.state_dict(), 26 | } 27 | torch.save(state, filepath) 28 | 29 | 30 | def metric_average(metric, name): 31 | avg_tensor = hvd.allreduce(metric, name=name) 32 | return avg_tensor.item() 33 | 34 | 35 | BATCH_SIZE = 1024 * 32 36 | NUM_EPOCHS = 100 37 | NUM_WORKERS = 4 # assume cluster consists of two workers 2x K80 each 38 | # default loader parallism is low or None, this widens the IO bottleneck when feeding each GPU 39 | PS_WORKERS_PER_CPU = 2 40 | # this version is implemented in C, vs Java (slower) default 41 | PS_HDFS_DRIVER = 'libhdfs3' 42 | # lower overhead, alternative is 'process' 43 | PS_WORKER_TYPE = 'thread' 44 | # assuming the training relies on SSD backed dbfs:/ml, Petastorm's caching can be disabled 45 | PS_CACHE_TYPE = None 46 | 
EARLY_STOPPING_PATIENCE = 8 47 | EARLY_STOPPING_DELTA = 1e-5 48 | ROOT_RANK = 0 49 | 50 | 51 | def run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, criterion, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 52 | count = 0 53 | val_start_time = time.time() 54 | running_loss = 0.0 55 | for _ in range(steps_per_epoch): 56 | pd_batch = next(train_dataloader_iter) 57 | _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load( 58 | pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps) 59 | 60 | # Track history in training 61 | torch.set_grad_enabled(True) 62 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 63 | optimizer.zero_grad() 64 | loss.backward() 65 | optimizer.step() 66 | 67 | running_loss = (running_loss * count + loss.item()) / (count + 1) 68 | count = count + 1 69 | if count % 64 == 0: 70 | logger.info("Running Loss so far: " + str(running_loss)) 71 | logger.info("Records processed so far: " + str(count*seq_cat_data.shape[0])) 72 | time_so_far = time.time() - val_start_time 73 | logger.info("Time taken since start:" + str(time_so_far)) 74 | 75 | val_end_time = time.time() 76 | 77 | logger.info("Total time taken:" + str(val_end_time - val_start_time)) 78 | logger.info("Running loss at the end of training epoch:" + str(running_loss)) 79 | return running_loss, val_end_time - val_start_time 80 | 81 | 82 | def run_autoencoder_val_peta(autoenc, steps_per_epoch, val_dataloader_iter, criterion, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 83 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 84 | count = 0 85 | val_start_time = time.time() 86 | running_loss = 0.0 87 | for _ in range(steps_per_epoch): 88 | pd_batch = next(val_dataloader_iter) 89 | _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load( 90 | pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps) 91 | 92 | # No gradient tracking during validation 93 | torch.set_grad_enabled(False) 94 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 95 | 96 | running_loss = (running_loss * count + loss.item()) / (count + 1) 97 | 98 | count = count + 1 99 | if count % 64 == 0: 100 | logger.info("Running Loss so far: " + str(running_loss)) 101 | logger.info("Records processed so far: " + str(count*seq_cat_data.shape[0])) 102 | time_so_far = time.time() - val_start_time 103 | logger.info("Time taken since start:" + str(time_so_far)) 104 | 105 | val_end_time = time.time() 106 | 107 | logger.info("Total time taken:" + str(val_end_time - val_start_time)) 108 | logger.info("Running loss at the end of validation epoch:" + str(running_loss)) 109 | return running_loss, val_end_time - val_start_time 110 | 111 | 112 | def train_peta_hvd(converter_train, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024, epochs=10, learning_rate=0.01,
warmup_epochs=5, save_model=False, path='./early_stop_model.pth'): 113 | autoenc.train() 114 | hvd.init() # Initialize Horovod. 115 | logger.info("Number of workers: " + str(hvd.size())) 116 | # Horovod: pin GPU to local rank. 117 | if torch.cuda.is_available(): 118 | torch.cuda.set_device(hvd.local_rank()) 119 | device = torch.cuda.current_device() 120 | else: 121 | device = torch.device("cpu") 122 | 123 | # Dataset sharding across workers is handled by Petastorm below (cur_shard/shard_count), 124 | # so no DistributedSampler is needed here. 125 | 126 | 127 | 128 | 129 | autoenc = autoenc.to(device) 130 | num_epochs = epochs 131 | 132 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 133 | # An increase in learning rate compensates for the increased batch size. 134 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 135 | 136 | # Broadcast initial parameters so all workers start with the same parameters. 137 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 138 | hvd.broadcast_optimizer_state(optimizer, root_rank=ROOT_RANK) 139 | 140 | # Wrap the optimizer with Horovod's DistributedOptimizer. 141 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 142 | 143 | scheduler_wu, scheduler_re = init_lr_schedulers( 144 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 145 | 146 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 147 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 148 | 149 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 150 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 151 | 152 | with converter_train.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(), 153 | batch_size=batch_size, data_loader_fn=BatchedDataLoader, 154 | num_epochs=None, cache_type=PS_CACHE_TYPE, 155 | workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(), 156 | reader_pool_type=PS_WORKER_TYPE, 157 | hdfs_driver=PS_HDFS_DRIVER) as train_dataloader: 158 | train_dataloader_iter = iter(train_dataloader) 159 | steps_per_epoch = max(1, len(converter_train) // (batch_size * hvd.size())) 160 | total_time = 0 161 | 162 | epoch = 1 163 | while epoch < num_epochs + 1: 164 | loss, epoch_time = run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, 165 | criterion, device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) 166 | # Checkpoints are saved only on the root worker; see the rank check below.
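# metric_average (defined near the top of this file) wraps hvd.allreduce, whose default
# reduction is an average, so every rank sees the same loss value when stepping the
# schedulers and the early stopper below.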
167 | total_time = total_time + epoch_time 168 | loss_averaged = metric_average(torch.tensor(loss), 'avg_loss') 169 | logger.info("Average overall training loss in epoch {0} is {1}".format( 170 | epoch, loss_averaged)) 171 | 172 | if epoch <= warmup_epochs: 173 | scheduler_wu.step() 174 | scheduler_re.step(loss_averaged) 175 | 176 | epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs) 177 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 178 | if save_model: 179 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 180 | return autoenc, loss_averaged, total_time 181 | epoch = epoch+1 182 | 183 | 184 | def train_val_peta_hvd(converter_train, converter_val, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stop_model.pth'): 185 | autoenc.train() 186 | hvd.init() # Initialize Horovod. 187 | logger.info("Number of workers:" + str(hvd.size())) 188 | # Horovod: pin GPU to local rank. 189 | if torch.cuda.is_available(): 190 | torch.cuda.set_device(hvd.local_rank()) 191 | device = torch.cuda.current_device() 192 | else: 193 | device = torch.device("cpu") 194 | 195 | autoenc = autoenc.to(device) 196 | num_epochs = epochs 197 | 198 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 199 | # An increase in learning rate compensates for the increased batch size. 200 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 201 | 202 | # Broadcast initial parameters so all workers start with the same parameters. 203 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 204 | hvd.broadcast_optimizer_state(optimizer, root_rank=ROOT_RANK) 205 | 206 | # Wrap the optimizer with Horovod's DistributedOptimizer. 
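# (DistributedOptimizer hooks the backward pass to allreduce-average the gradients
# across all ranks before each optimizer step, keeping the model replicas in sync.)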
207 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 208 | 209 | scheduler_wu, scheduler_re = init_lr_schedulers( 210 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 211 | 212 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 213 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 214 | 215 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 216 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 217 | 218 | with converter_val.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(), 219 | batch_size=batch_size, data_loader_fn=BatchedDataLoader, 220 | num_epochs=None, cache_type=PS_CACHE_TYPE, 221 | workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(), 222 | reader_pool_type=PS_WORKER_TYPE, 223 | hdfs_driver=PS_HDFS_DRIVER) as val_dataloader, \ 224 | converter_train.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(), 225 | batch_size=batch_size, data_loader_fn=BatchedDataLoader, 226 | num_epochs=None, cache_type=PS_CACHE_TYPE, 227 | workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(), 228 | reader_pool_type=PS_WORKER_TYPE, 229 | hdfs_driver=PS_HDFS_DRIVER) as train_dataloader: 230 | 231 | val_dataloader_iter = iter(val_dataloader) 232 | steps_val = max(1, len(converter_val) // (batch_size * hvd.size())) 233 | 234 | train_dataloader_iter = iter(train_dataloader) 235 | steps_per_epoch = max(1, len(converter_train) // (batch_size * hvd.size())) 236 | total_time = 0 237 | 238 | epoch = 1 239 | while epoch < num_epochs + 1: 240 | autoenc.train() 241 | _, epoch_time = run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, 242 | criterion, device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) 243 | autoenc.eval() 244 | val_loss, val_epoch_time = run_autoencoder_val_peta(autoenc, steps_val, val_dataloader_iter, criterion, 245 | device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) 246 | total_time = total_time + epoch_time + val_epoch_time 247 | 248 | loss_averaged = metric_average(torch.tensor(val_loss), 'avg_loss') 249 | logger.info("Average overall validation loss in epoch {0} is {1}".format( 250 | epoch, loss_averaged)) 251 | 252 | if epoch <= warmup_epochs: 253 | scheduler_wu.step() 254 | scheduler_re.step(loss_averaged) 255 | 256 | epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs) 257 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 258 | if save_model: 259 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 260 | return autoenc, loss_averaged, total_time 261 | epoch = epoch+1 262 | -------------------------------------------------------------------------------- /caspr/utils/spark/score.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from pyspark.sql.functions import array, col, pandas_udf 8 | from pyspark.sql.types import ArrayType, FloatType 9 | 10 | from caspr.data.common_dataset import CommonDataset, id_collate 11 | from caspr.models.factory import LSTM, TRANSFORMER 12 | from caspr.utils.preprocess import get_nonempty_tensors 13 | from caspr.utils.score import get_architecture 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def score(df, model, seq_cols, non_seq_cols, cat_cols,
cont_cols, time_steps, batch_size=16*2048): 19 | model.eval() 20 | 21 | # vectorizing continuous and discrete features separately 22 | output = df.withColumn('cont_features', array([col(f) for f in cont_cols])).drop(*cont_cols) 23 | output = output.withColumn('cat_features', array([col(f) for f in cat_cols])).drop(*cat_cols) 24 | 25 | if torch.cuda.is_available(): 26 | device = torch.device("cuda") 27 | else: 28 | device = torch.device("cpu") 29 | 30 | logger.info("Scoring on: %s" % device) 31 | 32 | # making sure the model is on CPU before the UDF is defined 33 | model.cpu() 34 | 35 | def calculate_embeddings(continuous, categorical): 36 | try: 37 | model.to(device) 38 | embeddings = [] 39 | batch_ds = CommonDataset.for_inference(continuous, categorical, seq_cols, 40 | non_seq_cols, cat_cols, cont_cols, time_steps) 41 | 42 | # nested batching to ensure Spark does not trigger CUDA OOM with larger datasets 43 | data_loader = torch.utils.data.DataLoader(batch_ds, batch_size=batch_size, collate_fn=id_collate) 44 | 45 | for _, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in data_loader: 46 | 47 | data = [seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data] 48 | if isinstance(model, torch.nn.Module): 49 | data = [d.to(device) for d in data] 50 | 51 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 52 | 53 | if get_architecture(model) == TRANSFORMER: 54 | emb, _, _ = model.unified_encoder(*nonempty_tensors, nonempty_idx) 55 | # Concatenate across timesteps 56 | emb = emb.reshape(emb.shape[0], -1) 57 | embeddings.append(emb.detach().cpu() if isinstance(emb, torch.Tensor) else emb) 58 | 59 | elif get_architecture(model) == LSTM: 60 | _, (hn, _) = model.unified_encoder(*nonempty_tensors, nonempty_idx) 61 | embeddings.append(hn.detach().cpu() if isinstance(hn, torch.Tensor) else hn) 62 | 63 | embeddings = pd.DataFrame(np.concatenate(embeddings, axis=0)) 64 | 65 | return pd.Series(embeddings.values.tolist()) 66 | 67 | finally: 68 | # can release resources here, if needed 69 | pass 70 | 71 | # Pandas UDF declaration with float[] return type 72 | score_udf = pandas_udf(calculate_embeddings, ArrayType(FloatType())) 73 | 74 | # Calculating the embeddings as an additional column and dropping the temporary vectors 75 | output = output.withColumn('embeddings', score_udf('cont_features', 'cat_features') 76 | ).drop('cont_features', 'cat_features') 77 | 78 | return output 79 | -------------------------------------------------------------------------------- /caspr/utils/train.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import logging 4 | import os 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | import torch.distributed as dist 10 | import torch.multiprocessing as mp 11 | from torch import optim 12 | from torch.nn.parallel import DistributedDataParallel as DDP 13 | 14 | from caspr.data.load import init_loaders 15 | from caspr.models.factory import CASPRFactory 16 | from caspr.models.model_wrapper import AutoencoderTeacherTraining, LSTMAutoencoder, TransformerAutoEncoder 17 | from caspr.utils.early_stopping import DistributedEarlyStopping, EarlyStopping 18 | from caspr.utils.metrics import get_metrics 19 | from caspr.utils.onnx import ONNXWrapper 20 | from caspr.utils.score import get_architecture 21 | 22 | DDP_BACKEND = "nccl" 23 | DDP_MASTER_ADDR = "localhost" 24 | DDP_MASTER_PORT = "12355" 25 | DDP_LOAD_WORKERS = 1 26 | STD_LOAD_WORKERS = 0 27 | logger = 
logging.getLogger(__name__) 28 | 29 | 30 | def run_autoencoder(autoenc, optimizer, dataloader_train, criterion, device): 31 | count = 0 32 | epoch_start_time = time.time() 33 | running_loss = 0.0 34 | 35 | for _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_train: 36 | y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = y.to(device), seq_cat_data.to( 37 | device), seq_cont_data.to(device), non_seq_cat_data.to(device), non_seq_cont_data.to(device) 38 | 39 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | running_loss = (running_loss * count + loss.item()) / (count + 1) 45 | 46 | count = count + 1 47 | 48 | if count % 64 == 0: 49 | logger.info("Loss: " + str(loss.item()) + ", records processed so far: " + str(count * seq_cat_data.shape[0])) 50 | time_so_far = time.time() - epoch_start_time 51 | logger.info("Time taken since start: " + str(time_so_far)) 52 | 53 | epoch_end_time = time.time() 54 | logger.info("Epoch training time: " + str(epoch_end_time - epoch_start_time)) 55 | 56 | return running_loss, epoch_end_time - epoch_start_time 57 | 58 | 59 | def run_autoencoder_val(autoenc, dataloader_val, criterion, device): 60 | count = 0 61 | running_loss = 0.0 62 | 63 | for _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_val: 64 | y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = y.to(device), seq_cat_data.to( 65 | device), seq_cont_data.to(device), non_seq_cat_data.to(device), non_seq_cont_data.to(device) 66 | 67 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 68 | 69 | running_loss = (running_loss * count + loss.item()) / (count + 1) 70 | count = count + 1 71 | 72 | if count % 64 == 0: 73 | logger.info("Loss: " + str(loss.item()) + ", records processed so far: " + str(count * seq_cat_data.shape[0])) 74 | 75 | return running_loss 76 | 77 | 78 | def run_epoch(model, epoch, dataloader, criterion, device, optimizer=None, is_train=True, get_outputs=False): 79 | model.to(device) 80 | losses = [] 81 | y_labels = [] 82 | y_preds = [] 83 | 84 | if isinstance(model, DDP): 85 | model = model.module 86 | 87 | for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader: 88 | if is_train: 89 | optimizer.zero_grad() 90 | 91 | seq_cat_x = seq_cat_x.to(device) 92 | seq_cont_x = seq_cont_x.to(device) 93 | non_seq_cat_x = non_seq_cat_x.to(device) 94 | non_seq_cont_x = non_seq_cont_x.to(device) 95 | y = y.to(device) 96 | 97 | # Forward Pass 98 | y_pred, loss = model.run(y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x, criterion=criterion) 99 | losses.append(loss.detach().cpu().numpy()) 100 | 101 | if get_outputs: 102 | y_labels.append(y) 103 | y_preds.append(y_pred) 104 | 105 | # Backward Pass and Optimization 106 | if is_train: 107 | loss.backward() 108 | optimizer.step() 109 | 110 | if get_outputs: 111 | y_labels = torch.cat(y_labels, 0).detach().cpu().numpy() 112 | y_preds = torch.cat(y_preds, 0).detach().cpu().numpy() 113 | 114 | mean_loss = np.mean(np.asarray(losses)) 115 | mode = 'training' if is_train else 'validation' 116 | logger.info("Average {} loss in epoch {} is {}".format(mode, epoch, mean_loss)) 117 | return y_labels, y_preds, mean_loss 118 | 119 | 120 | def init_lr_schedulers(optimizer, warmup_epochs, reduce_mode='min', reduce_factor=0.1, reduce_patience=4, verbose=True): 121 | """ 122 | In synchronous distributed training the effective batch size grows with the number of workers, mandating a proportionally larger learning rate; too aggressive a rate, however, makes good optima harder to find.
123 | This function therefore initializes two schedulers for the given optimizer: a LambdaLR warm-up that ramps the rate up linearly over warmup_epochs, and a ReduceLROnPlateau that reduces it once the monitored loss stops improving. 124 | """ 125 | 126 | warm_up = lambda epoch: epoch / warmup_epochs if warmup_epochs > 0 and epoch <= warmup_epochs else 1 127 | scheduler_wu = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=warm_up) 128 | scheduler_re = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode=reduce_mode, factor=reduce_factor, patience=reduce_patience, verbose=verbose) 129 | 130 | return scheduler_wu, scheduler_re 131 | 132 | 133 | def train_model(model, criterion, num_epochs, dataloader_train, dataloader_val, device, save_path, lr=1e-3, fix_module_names=None, 134 | should_decrease=True, patience=8, verbose=True, evaluate_downstream=False, rank=0, world_size=1, warmup_epochs=5, save_onnx=False): 135 | 136 | if isinstance(model, (LSTMAutoencoder, AutoencoderTeacherTraining, TransformerAutoEncoder)) and evaluate_downstream: 137 | raise ValueError('evaluate_downstream should be set to False when training an autoencoder') 138 | 139 | if fix_module_names: 140 | fix_modules = [module for name, module in model.named_modules() if name in fix_module_names] 141 | for module in fix_modules: 142 | for param in module.parameters(): 143 | param.requires_grad = False 144 | module.eval() 145 | 146 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr) 147 | 148 | scheduler_wu, scheduler_re = init_lr_schedulers(optimizer, warmup_epochs, reduce_patience=int(patience/2), verbose=verbose) 149 | 150 | if world_size > 1: 151 | early_stopping = DistributedEarlyStopping(logger, should_decrease, patience, verbose, rank=rank, save_onnx=save_onnx) 152 | else: 153 | early_stopping = EarlyStopping(logger, should_decrease, patience, verbose, save_onnx=save_onnx) 154 | 155 | for epoch in range(num_epochs): 156 | start = time.time() 157 | 158 | model.train() 159 | if fix_module_names: 160 | for module in fix_modules: 161 | module.eval() 162 | 163 | run_epoch(model, epoch, dataloader_train, criterion, device, optimizer) 164 | 165 | model.eval() 166 | with torch.no_grad(): 167 | y_labels, y_preds, mean_val_loss = run_epoch(model, epoch, dataloader_val, criterion, device, 168 | is_train=False, get_outputs=evaluate_downstream) 169 | if evaluate_downstream: 170 | get_metrics(y_labels, y_preds) 171 | 172 | end = time.time() 173 | logger.info("Time for epoch {0} is {1}\n".format(epoch, (end - start))) 174 | logger.info("Mean validation loss for epoch {0} is {1}\n".format(epoch, mean_val_loss)) 175 | 176 | if epoch <= warmup_epochs: 177 | scheduler_wu.step() 178 | scheduler_re.step(mean_val_loss) 179 | 180 | early_stopping(mean_val_loss, model, save_path) 181 | if early_stopping.early_stop: 182 | logger.info('early stopping at epoch {}'.format(epoch)) 183 | break 184 | 185 | if rank == 0: 186 | if save_onnx: 187 | model_type = get_architecture(model) 188 | model = ONNXWrapper(save_path, model_type) 189 | elif isinstance(model, DDP): 190 | model.module.load_state_dict(torch.load(save_path)) 191 | else: 192 | model.load_state_dict(torch.load(save_path)) 193 | return model 194 | 195 | 196 | def __setup_ddp(rank, world_size): 197 | 198 | os.environ['MASTER_ADDR'] = DDP_MASTER_ADDR 199 | os.environ['MASTER_PORT'] = DDP_MASTER_PORT 200 | 201 | # initialize the process group 202 | dist.init_process_group(DDP_BACKEND, rank=rank, world_size=world_size) 203 | torch.cuda.set_device(rank) 204 | 205 | 206 | def 
__do_train_ddp(rank, args): 207 | 208 | __setup_ddp(rank, args['world_size']) 209 | 210 | caspr_factory = args['caspr_factory'] 211 | 212 | model = caspr_factory.create(args['caspr_arch'], **args['hyper_params']) 213 | 214 | model = DDP(model.cuda(), device_ids=[rank]) 215 | 216 | train_loader, val_loader = init_loaders(args['ds_train'], args['ds_val'], args['batch_size'], 217 | num_workers=DDP_LOAD_WORKERS, world_size=args['world_size'], rank=rank) 218 | 219 | train_model(model, args['criterion'], args['num_epochs'], train_loader, val_loader, rank, args['save_path'], 220 | lr=args['lr'] * args['world_size'], rank=rank, world_size=args['world_size'], **args['kwargs']) 221 | 222 | dist.destroy_process_group() 223 | 224 | 225 | def train_model_ddp(caspr_factory: CASPRFactory, caspr_arch: str, hyper_params: dict, ds_train, ds_val, criterion, num_epochs, batch_size, save_path, lr=1e-3, **kwargs): 226 | """ 227 | Distributed Data Parallel implementation of CASPR training. Will use all GPUs available on the current machine. 228 | 229 | Arguments: 230 | ---------- 231 | 232 | caspr_factory: CASPR model factory for the specified dataset 233 | 234 | caspr_arch: CASPR architecture, e.g. TransformerAutoEncoder 235 | 236 | hyper_params: parameters for instantiating a new CASPR model via the factory's create() method 237 | 238 | ds_train: CommonDataset for training 239 | 240 | ds_val: CommonDataset for validation 241 | 242 | criterion, num_epochs, batch_size, save_path, lr: loss criterion (or list of criteria), number of epochs, per-process batch size, checkpoint save path and base learning rate 243 | 244 | **kwargs: any other parameters to be passed to the train_model function by the DDP worker (e.g. evaluate_downstream, verbose or patience) 245 | 246 | Returns: Trained model 247 | 248 | """ 249 | logger.info("Setting up model training using torch DDP") 250 | 251 | for arg in [caspr_factory, caspr_arch, ds_train, ds_val, criterion, num_epochs, batch_size, save_path, lr]: 252 | if not arg: 253 | raise ValueError("Illegal null argument. Check for None values and try again.") 254 | 255 | world_size = torch.cuda.device_count() 256 | 257 | if not torch.cuda.is_available() or world_size < 2: 258 | device = "cuda" if torch.cuda.is_available() else "cpu" 259 | logger.warning("DDP mode disabled. Training on %s...", device) 260 | model = caspr_factory.create(caspr_arch, device=device, **hyper_params) 261 | train_loader, val_loader = init_loaders(ds_train, ds_val, batch_size, num_workers=STD_LOAD_WORKERS) 262 | return train_model(model, criterion, num_epochs, train_loader, val_loader, device, save_path, lr, **kwargs) 263 | 264 | logger.info("DDP mode enabled, will train on %d GPUs", world_size) 265 | 266 | arguments = locals() # snapshot of all arguments, forwarded to each spawned DDP worker 267 | 268 | mp.spawn(__do_train_ddp, 269 | args=(arguments,), 270 | nprocs=world_size, 271 | join=True) 272 | 273 | model = caspr_factory.create(caspr_arch, **hyper_params) 274 | model.load_state_dict(torch.load(save_path)) 275 | return model 276 | 277 | 278 | def test_model(model, dataloader_test, criterion, device): 279 | model.eval() 280 | with torch.no_grad(): 281 | y_labels, y_preds, _ = run_epoch( 282 | model, 0, dataloader_test, criterion, device, is_train=False, get_outputs=True) 283 | return y_labels, y_preds 284 | 285 | 286 | def count_parameters(model): 287 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 288 | -------------------------------------------------------------------------------- /docs/PR_Guidelines.md: -------------------------------------------------------------------------------- 1 | # Guidelines for creating a good pull request 2 | 3 | 1. 
A PR should describe the change clearly and, most importantly, explain the motivation behind it. Filling out the PR template should satisfy this guideline. 4 | 2. If the PR is fixing a performance issue, mention the improvement and how the measurement was done (for educational purposes). 5 | 3. Do not leave comments unresolved. If PR comments have been addressed without making the requested code changes, explicitly mark them resolved with a comment explaining why you're resolving them. If you intend to address a comment in a follow-up PR, create a task and note why it cannot be fixed in this PR. Leaving comments unresolved sets the wrong precedent for other contributors: that it's acceptable to ignore comments. 6 | 4. In the interest of time, discuss the PR or its comments in person or over the phone if they're difficult to explain in writing. Document the resolution in the PR for the educational benefit of others; don't just mark the comment resolved saying 'based on offline discussion'. 7 | 5. Add comments in the PR, where things aren't obvious, to help the reviewer navigate your PR faster. If this is a big change, include a short design doc (docs/ folder). 8 | 6. Unit tests are mandatory for all PRs (except when the proposed changes are already covered by existing unit tests). 9 | 7. Do not use PRs as scratch pads for development, as they consume valuable build/CI cycles for every commit. Build and test your changes in at least one environment (Windows/Linux/macOS) before creating a PR. 10 | 8. Keep it small. If the feature is big, split it into multiple PRs. Modulo cosmetic changes, a PR with more than 10 files is notoriously hard to review. Be kind to the reviewers. 11 | 9. Separate cosmetic changes from functional changes by making them separate PRs. 12 | 10. The PR author is responsible for merging the changes once they're approved. 13 | 11. If you co-author a PR, seek review from someone else. Do not self-approve PRs. -------------------------------------------------------------------------------- /docs/images/caspr-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/docs/images/caspr-logo.png -------------------------------------------------------------------------------- /docs/images/caspr-poster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/docs/images/caspr-poster.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
3 | # ----------------------------------------------------------------------------- 4 | 5 | # ----------------------------------------------------------------------------- 6 | # Setup() configuration 7 | # ----------------------------------------------------------------------------- 8 | 9 | [metadata] 10 | name = AI.Models.CASPR 11 | version = attr: caspr.__VERSION 12 | description = CASPR 13 | long_description = file: README.rst, LICENSE 14 | keywords = CASPR, Machine Learning, Deep Learning 15 | license = MIT 16 | classifiers = 17 | Programming Language :: Python :: 3.7 18 | Intended Audience :: Developers 19 | License :: OSI Approved :: MIT License 20 | Natural Language :: English 21 | Operating System :: OS Independent 22 | Topic :: Scientific/Engineering :: Artificial Intelligence 23 | url = https://powerbi.visualstudio.com/Business360%%20AI/_git/AI.Models.CASPR 24 | 25 | [options] 26 | zip_safe = False 27 | include_package_data = True 28 | packages = find: 29 | 30 | install_requires = ## base (common) requirements 31 | pandas>1.0 32 | imbalanced-learn>=0.8 33 | scikit-learn>=0.7 34 | scipy>=1.5 35 | matplotlib>=3.3 36 | torch~=1.11.0 37 | protobuf<4.0 38 | onnx~=1.10.1 39 | onnxruntime~=1.7.0 40 | 41 | [options.packages.find] 42 | include=caspr.* 43 | exclude=tests 44 | 45 | [options.extras_require] 46 | 47 | horovod = ## install for horovod + petastorm execution (spark.large module) 48 | pyspark~=3.1 49 | torchvision 50 | petastorm~=0.11 51 | horovod[pytorch,spark]>=0.22 52 | b360sparkdl>=1.0 53 | 54 | xai = ## install for explainability 55 | AI.Models.Explainer~=6.0 56 | captum>=0.2 57 | 58 | databricks = ## install on Databricks 59 | mlflow>=1.19 60 | petastorm~=0.11 61 | 62 | aml = ## install on Azure ML 63 | azureml-core>=1.32 64 | mlflow>=1.19 65 | azureml-mlflow>=1.32 66 | 67 | hdi = ## install on HDInsights 68 | pyspark~=2.4.5 69 | numpy<1.20.0 70 | pyarrow~=0.17.1 71 | 72 | test = ## install before test runs 73 | pytest 74 | pytest-cov 75 | pylint 76 | pylint-junit 77 | 78 | dev = ## install for PPE, latest 79 | AI.Models.Explainer 80 | captum 81 | imbalanced-learn 82 | matplotlib 83 | scikit-learn 84 | pandas 85 | numpy 86 | torch 87 | 88 | 89 | # ----------------------------------------------------------------------------- 90 | # Pylama Configurations 91 | # ----------------------------------------------------------------------------- 92 | # Documentation: https://pylama.readthedocs.io/en/latest/#command-line-options 93 | [pylama] 94 | format = pylint 95 | skip = */.tox/*,*/.env/* 96 | linters = isort,mccabe,pycodestyle,pydocstyle,pyflakes,pylint 97 | ignore = D202,D203,D213,D406,D407,D413,D415,D417 98 | 99 | 100 | # ----------------------------------------------------------------------------- 101 | # Linter-Specific Configurations 102 | # ----------------------------------------------------------------------------- 103 | # Possible settings: https://github.com/timothycrosley/isort/wiki/isort-Settings 104 | [pylama:isort] 105 | line_length = 120 106 | multi_line_output = 0 107 | combine_star = True 108 | use_parentheses = True 109 | combine_as_imports = True 110 | 111 | # Used by isort command 112 | [isort] 113 | line_length = 120 114 | multi_line_output = 0 115 | combine_star = True 116 | use_parentheses = True 117 | combine_as_imports = True 118 | 119 | # Source code: https://github.com/pycqa/mccabe 120 | [pylama:mccabe] 121 | 122 | # Codes: https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes 123 | [pylama:pycodestyle] 124 | max_line_length 
= 120 125 | 126 | # Used by auto-formatters 127 | [pycodestyle] 128 | max_line_length = 120 129 | 130 | # Codes: http://www.pydocstyle.org/en/5.0.1/error_codes.html 131 | [pylama:pydocstyle] 132 | 133 | # Source code: https://github.com/PyCQA/pyflakes 134 | [pylama:pyflakes] 135 | max_line_length = 120 136 | statistics = True 137 | doctests = False 138 | builtins = _ 139 | 140 | # Codes: https://docs.pylint.org/en/1.6.0/features.html 141 | # Default settings: https://github.com/PyCQA/pylint/blob/master/pylintrc 142 | [pylama:pylint] 143 | max_line_length = 120 144 | logging_format_style = new 145 | attr_rgx = [a-z_][a-z0-9_]{,30}$ 146 | variable_rgx = [a-z_][a-z0-9_]{,30}$ 147 | argument_rgx = [a-z_][a-z0-9_]{,30}$ 148 | class_attribute_rgx = ([A-Za-z_][A-Za-z0-9_]{,30}|(__.*__))$ 149 | # Modules whose attributes are generated at runtime and thus attributes cannot be found using static analysis: 150 | ignored_modules = 151 | pyspark.sql.functions, torch, numpy 152 | 153 | 154 | # ----------------------------------------------------------------------------- 155 | # File-Specific Configurations 156 | # ----------------------------------------------------------------------------- 157 | [pylama:*tests/*.py] 158 | ignore = C0114,C0115,C0116,C0302,C0321,D,R0902,R0903,R0904,W0612,W0613,C0103,R0914 159 | 160 | [pylama:*caspr/models/lstm_autoencoder_sequence.py] 161 | ignore = C0103 162 | 163 | [pylama:*caspr/models/attention_mechanisms.py] 164 | ignore = C0103 165 | 166 | [pylama:*caspr/utils/train.py] 167 | ignore = W0613 168 | 169 | [pylama:*caspr/utils/spark/large/train.py] 170 | ignore = E1102, E1121 171 | 172 | [pylama:*caspr/utils/spark/large/score.py] 173 | ignore = E1121 174 | 175 | [pylama:*caspr/utils/preprocess.py] 176 | ignore = R0913, R0914 177 | 178 | [pylama:*caspr/utils/spark/preprocess.py] 179 | ignore = R0913, R0914, W0640 180 | 181 | [pylama:*caspr/utils/explain/CASPRExplainer.py] 182 | ignore = C0103, R0902, R0913, W0221 183 | 184 | [pylama:*caspr/utils/explain/utils.py] 185 | ignore = R0914 186 | 187 | [pylama:*caspr/utils/segmentation/pandas.py] 188 | ignore = W0703, W0102, R0913, R0914, W0612 189 | 190 | [pylama:*caspr/utils/segmentation/dec_utils.py] 191 | ignore = E1102, R0914 192 | 193 | [pylama:*setup.py] 194 | ignore = A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z 195 | # skip = 1 # Not currently enforced 196 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # replaced by AI.Common build template 4 | auto_replaced = "__version__" 5 | 6 | # minor trick to circumvent version warning when building manually 7 | version = None if 'version' in auto_replaced else auto_replaced 8 | 9 | setup(version=version) 10 | --------------------------------------------------------------------------------
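A minimal end-to-end sketch of the DDP training entry point defined in caspr/utils/train.py above. It assumes ds_train and ds_val are already-prepared CommonDataset instances and that factory is a CASPRFactory configured for the same schema; those construction details are dataset-specific and not shown in this listing, so treat the names and hyperparameters below as placeholders rather than a definitive recipe.

import torch.nn as nn
from caspr.models.factory import CASPRFactory
from caspr.utils.train import count_parameters, train_model_ddp

# factory, ds_train and ds_val are assumed to exist (see caspr.models.factory
# and caspr.data.common_dataset); hyper_params is left empty for illustration.
criterion = [nn.MSELoss(), nn.CrossEntropyLoss()]  # the loss pair used throughout the training utils
model = train_model_ddp(factory, 'TransformerAutoEncoder', hyper_params={},
                        ds_train=ds_train, ds_val=ds_val, criterion=criterion,
                        num_epochs=10, batch_size=1024, save_path='./caspr_model.pth')
print('trainable parameters:', count_parameters(model))  # falls back to single-GPU/CPU when fewer than 2 GPUs are present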