├── .gitignore
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── README.rst
├── SECURITY.md
├── SUPPORT.md
├── caspr
├── __init__.py
├── data
│ ├── __init__.py
│ ├── common_dataset.py
│ └── load.py
├── models
│ ├── README.md
│ ├── __init__.py
│ ├── attention_mechanisms.py
│ ├── convolutional_aggregation.py
│ ├── dec.py
│ ├── dense_bn_dropout.py
│ ├── embedding_layer.py
│ ├── factory.py
│ ├── lstm_autoencoder_sequence.py
│ ├── lstm_decoder.py
│ ├── lstm_timeseries_tpa_attention.py
│ ├── mlp.py
│ ├── model_wrapper.py
│ ├── multi_layer_lstm.py
│ ├── transformer.py
│ ├── unified_encoder.py
│ └── unified_transformer_encoder.py
└── utils
│ ├── __init__.py
│ ├── early_stopping.py
│ ├── estimate_parameters.py
│ ├── explain
│ ├── CASPRExplainer.py
│ ├── __init__.py
│ └── utils.py
│ ├── horovod
│ ├── __init__.py
│ └── train.py
│ ├── metrics.py
│ ├── noise.py
│ ├── onnx.py
│ ├── preprocess.py
│ ├── score.py
│ ├── segmentation
│ ├── __init__.py
│ ├── dec_utils.py
│ └── pandas.py
│ ├── spark
│ ├── __init__.py
│ ├── large
│ │ ├── __init__.py
│ │ ├── score.py
│ │ └── train.py
│ ├── preprocess.py
│ └── score.py
│ └── train.py
├── docs
├── PR_Guidelines.md
└── images
│ ├── caspr-logo.png
│ └── caspr-poster.png
├── setup.cfg
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.pickle
3 | *.pth
4 | *.p
5 | *.ipynb
6 | *.csv
7 | .amlignore
8 | aml_config
9 | .git
10 | .vscodeignore
11 | azureml-logs
12 | .azureml
13 | outputs
14 | azureml-setup
15 | *:Zone.Identifier*
16 |
17 | # VSCode stuff
18 | **/*.prefs
19 | **/*.project
20 | **/*.classpath
21 | .[Vv]scode
22 | .idea/
23 |
24 | # Byte-compiled / optimized / DLL files
25 | __pycache__/
26 | *.py[cod]
27 | *$py.class
28 |
29 | # C extensions
30 | *.so
31 |
32 | # Distribution / packaging
33 | .Python
34 | build/
35 | !.azure-pipelines/build/
36 | develop-eggs/
37 | dist/
38 | downloads/
39 | eggs/
40 | .eggs/
41 | lib/
42 | lib64/
43 | parts/
44 | sdist/
45 | var/
46 | wheels/
47 | *.egg-info/
48 | .installed.cfg
49 | *.egg
50 | MANIFEST
51 |
52 | # PyInstaller
53 | # Usually these files are written by a python script from a template
54 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
55 | *.manifest
56 | *.spec
57 |
58 | # Installer logs
59 | pip-log.txt
60 | pip-delete-this-directory.txt
61 |
62 | # Unit test / coverage reports
63 | htmlcov/
64 | .tox/
65 | .coverage
66 | .coverage.*
67 | .cache
68 | nosetests.xml
69 | coverage.xml
70 | *.cover
71 | .hypothesis/
72 | .pytest_cache/
73 |
74 | # Translations
75 | *.mo
76 | *.pot
77 |
78 | # Django stuff:
79 | *.log
80 | local_settings.py
81 | db.sqlite3
82 |
83 | # Flask stuff:
84 | instance/
85 | .webassets-cache
86 |
87 | # Scrapy stuff:
88 | .scrapy
89 |
90 | # Sphinx documentation
91 | docs/_build/
92 |
93 | # PyBuilder
94 | target/
95 |
96 | # Jupyter Notebook
97 | .ipynb_checkpoints
98 |
99 | # pyenv
100 | .python-version
101 |
102 | # celery beat schedule file
103 | celerybeat-schedule
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .vs/*
130 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | title: CASPR
3 | message: "Please use this information to cite CASPR in
4 | research or other publications."
5 | authors:
6 | - given-names: Pin-Jung
7 | family-names: Chen
8 | email: pinjung.chen@microsoft.com
9 | affiliation: Microsoft Corporation
10 | - given-names: Sahil
11 | family-names: Bhatnagar
12 | email: sahil.bhatnagar@microsoft.com
13 | affiliation: Microsoft Corporation
14 | - given-names: Damian Konrad
15 | family-names: Kowalczyk
16 | email: damian.kowalczyk@microsoft.com
17 | affiliation: Microsoft Corporation
18 | - given-names: Mayank
19 | family-names: Shrivastava
20 | email: mayank.shrivastava@microsoft.com
21 | affiliation: Microsoft Corporation
22 | - given-names: Sagar
23 | family-names: Goyal
24 | email: goyalsagar@outlook.com
25 |
26 | date-released: 2022-11-16
27 | repository-code: "https://github.com/microsoft/CASPR"
28 | license: "MIT"
29 | keywords:
30 | - deep learning
31 | - machine learning
32 | - tabular data
33 |
34 | version: 0.2.6
35 | doi: 10.48550/arXiv.2211.09174
36 | references:
37 | - type: article
38 | authors:
39 | - given-names: Pin-Jung
40 | family-names: Chen
41 | email: pinjung.chen@microsoft.com
42 | affiliation: Microsoft Corporation
43 | - given-names: Sahil
44 | family-names: Bhatnagar
45 | email: sahil.bhatnagar@microsoft.com
46 | affiliation: Microsoft Corporation
47 | - given-names: Damian Konrad
48 | family-names: Kowalczyk
49 | email: damian.kowalczyk@microsoft.com
50 | affiliation: Microsoft Corporation
51 | - given-names: Mayank
52 | family-names: Shrivastava
53 | email: mayank.shrivastava@microsoft.com
54 | affiliation: Microsoft Corporation
55 | - given-names: Sagar
56 | family-names: Goyal
57 | email: goyalsagar@outlook.com
58 | title: "CASPR: Customer Activity Sequence-based Prediction and Representation"
59 | year: 2022
60 | journal: ArXiv
61 | doi: 10.48550/arXiv.2211.09174
62 | url: https://arxiv.org/abs/2211.09174
63 |
64 | abstract: >-
65 | Tasks critical to enterprise profitability, such as customer churn prediction, fraudulent account detection or customer lifetime value estimation, are often tackled by models trained on features engineered from customer data in tabular format. Application-specific feature engineering adds development, operationalization and maintenance costs over time. Recent advances in representation learning present an opportunity to simplify and generalize feature engineering across applications. When applying these advancements to tabular data researchers deal with data heterogeneity, variations in customer engagement history or the sheer volume of enterprise datasets. In this paper, we propose a novel approach to encode tabular data containing customer transactions, purchase history and other interactions into a generic representation of a customer's association with the business. We then evaluate these embeddings as features to train multiple models spanning a variety of applications. CASPR, Customer Activity Sequence-based Prediction and Representation, applies Transformer architecture to encode activity sequences to improve model performance and avoid bespoke feature engineering across applications. Our experiments at scale validate CASPR for both small and large enterprise applications.
66 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | We're looking for your help to improve CASPR (bug fixes, new features, documentation, etc).
4 |
5 | ## Contribute a code change
6 | * Start by reading the [CASPR Paper](https://arxiv.org/abs/2211.09174)
7 | * If your change is non-trivial or introduces new public facing APIs (discussed in more detail below) please use the [feature request issue template](https://github.com/microsoft/CASPR/issues/new?template=feature_request.md) to discuss it with the team and get consensus on the basic design and direction first. For all other changes, you can directly create a pull request (PR) and we'll be happy to take a look.
8 | * Make sure your PR adheres to the [PR Guidelines](./docs/PR_Guidelines.md) established by the team.
9 | * If you're unsure about any of the above and want to contribute, you're welcome to start a discussion with the team.
10 |
11 | ## Process details
12 |
13 | Please search the [issue tracker](https://github.com/microsoft/CASPR/issues) for a similar idea first: there may already be an issue you can contribute to.
14 |
15 | 1. **Create Issue**
16 | To propose a new feature or API please start by filing a new issue in the [issue tracker](https://github.com/microsoft/CASPR/issues).
17 | Include as much detail as you have. It's fine if it's not a complete design: just a summary and rationale is a good starting point.
18 |
19 | 2. **Discussion**
20 | We'll keep the issue open for community discussion until it has been resolved or is deemed no longer relevant.
21 | Note that if an issue isn't a high priority or has many open questions then it might stay open for a long time.
22 |
23 | 3. **Owner Review**
24 | The CASPR team will review the proposal and either approve or close the issue based on whether it broadly aligns with the CASPR Roadmap and contribution guidelines.
25 |
26 | 4. **Implementation**
27 | * A feature can be implemented by you, the CASPR team, or other community members. Code contributions are greatly appreciated: feel free to work on any reviewed feature you proposed, or choose one in the backlog and send us a PR. If you are new to the project and want to work on an existing issue, we recommend starting with issues that are tagged with “good first issue”. Please let us know in the issue comments if you are actively working on implementing a feature so we can ensure it's assigned to you.
28 | * Unit tests: New code *must* be accompanied by unit tests.
29 | * Documentation and sample updates: If the PR affects any of the documentation or samples then include those updates in the same PR.
30 |
31 | * Once a feature is complete and tested according to the contribution guidelines follow these steps:
32 | * Follow the [standard GitHub process to open a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests)
33 | * Add reviewers who have context from the earlier discussion. If you can't find a reviewer, add 'microsoft/CASPR'.
34 | * Note: After creating a pull request, you might not see a build getting triggered right away. One of the
35 | CASPR team members can trigger the build for you.
36 |
37 | ## Licensing guidelines
38 |
39 | This project welcomes contributions and suggestions. Most contributions require you to
40 | agree to a Contributor License Agreement (CLA) declaring that you have the right to,
41 | and actually do, grant us the rights to use your contribution. For details, visit
42 | https://cla.microsoft.com.
43 |
44 | When you submit a pull request, a CLA-bot should automatically determine whether you need
45 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the
46 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA.
47 |
48 | ## Code of conduct
49 |
50 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
51 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
52 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
53 |
54 | ## Report a security issue
55 |
56 | Security issues and bugs should be reported privately, via email, to the Microsoft Security
57 | Response Center (MSRC) at [secure@microsoft.com](mailto:secure@microsoft.com). You should
58 | receive a response within 24 hours. If for some reason you do not, please follow up via
59 | email to ensure we received your original message. Further information, including the
60 | [MSRC PGP](https://technet.microsoft.com/en-us/security/dn606155) key, can be found in
61 | the [Security TechCenter](https://technet.microsoft.com/en-us/security/default).
62 |
63 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation. All rights reserved.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |

2 |
3 |
4 | **CASPR is a transformer-based framework for deep learning from sequential data in tabular format, most common in business applications.**
5 |
6 |
7 | Tasks critical to enterprise profitability, such as customer churn prediction, fraudulent account detection or customer lifetime value estimation, are often tackled by models trained on features engineered from customer data in tabular format. Application-specific feature engineering however adds development, operationalization and maintenance costs over time. Recent advances in representation learning present an opportunity to simplify and generalize feature engineering across applications.
8 |
9 | With **CASPR** we propose a novel approach to encode sequential data in tabular format (e.g., customer transactions, purchase history and other interactions) into a generic representation of a subject's (e.g., customer's) association with the business. We evaluate these embeddings as features to train multiple models spanning a variety of applications (see: [paper](https://arxiv.org/abs/2211.09174)). CASPR, Customer Activity Sequence-based Prediction and Representation, applies transformer architecture to encode activity sequences to improve model performance and avoid bespoke feature engineering across applications. Our experiments at scale validate CASPR for both small and large enterprise applications.
10 |
11 |
12 |
17 |
18 | ## Getting Started & Resources
19 |
20 | * **CASPR: Customer Activity Sequence-based Prediction and Representation** (NeurIPS 2022, New Orleans: Tabular Representation Learning)
21 | - [paper](https://arxiv.org/abs/2211.09174)
- [poster](https://github.com/microsoft/CASPR/blob/main/docs/images/caspr-poster.png)
23 |
24 | * **Build**
25 |
26 | - pre-requisites: ```python==3.9, setuptools```
27 | - building the wheel: ```python setup.py build bdist_wheel```
28 |
29 | * **Installation**
30 |
31 | ```
32 | (now)
33 | pip install .\dist\AI.Models.CASPR-.whl[]
34 |
35 | (future)
36 | pip install AI.Models.CASPR[]
37 | ```
38 |
39 | use any of below modifiers, to customize the installation for target system / usecase:
40 | ```
41 | horovod - for distributed training and inference on Horovod
42 | databricks - for distributed training and inference on Databricks
43 | aml - for (distributed) training and inference on Azure ML
44 | hdi - for execution on Azure HD Insights
45 | xai - to enable explainability
46 | test - for extended test execution
47 | dev - for development purposes only
48 | ```
49 | * **Examples**
50 |
51 | (TODO: can we point to a well commented one of our examples w/ or w/o data?)
52 |
53 | ## Contributions and Feedback
54 |
55 | We welcome contributions! Please see the [contribution guidelines](CONTRIBUTING.md).
56 |
57 | For feature requests or bug reports please file a [GitHub Issue](https://github.com/Microsoft/CASPR/issues).
58 |
59 | ## Code of Conduct
60 |
61 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
62 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
63 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
64 |
65 | ## License
66 |
67 | This project is licensed under the [MIT License](LICENSE).
68 |
69 | ---
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | AI.Models.CASPR
2 | ==================
3 | This is the deep learning CASPR model.
4 |
5 | This package has been tested with Python 3.7
6 |
7 | Usage
8 | -----
9 | You need to have access to Business360 artifact feed on Azure Devops
10 |
11 | | pip install twine keyring artifacts-keyring
12 | | pip install AI.Models.CASPR --index-url=https://powerbi.pkgs.visualstudio.com/_packaging/Business360/pypi/simple
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 |
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 |
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8 |
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 |
11 | # Support
12 |
13 | ## How to file issues and get help
14 |
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 |
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 |
23 | ## Microsoft Support Policy
24 |
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 |
--------------------------------------------------------------------------------
/caspr/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # --------------------------------------------------------------------------
4 |
5 | """A series of modules for the CASPR deep learning AI model.
6 |
7 | Provide a longer general description of the modules in this folder here.
8 |
9 | Modules:
10 | :module1_name: A description of this specific module.
11 | """
12 |
13 | __VERSION = "0.9.dev3" # arbitrary low dev version for local build
14 |
--------------------------------------------------------------------------------
/caspr/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 |
3 | #
4 |
5 | # Unless required by applicable law or agreed to in writing, software
6 |
7 | # distributed under the License is distributed on an "AS IS" BASIS,
8 |
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 |
11 | # See the License for the specific language governing permissions and
12 |
13 | # limitations under the License.
14 |
15 | #
16 |
17 | # ==============================================================================
18 |
--------------------------------------------------------------------------------
/caspr/data/common_dataset.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | import pandas as pd
4 | import torch
5 | from torch.utils.data.dataloader import default_collate
6 |
7 |
class CommonDataset(torch.utils.data.Dataset):
    """Torch dataset that splits a flat dataframe into CASPR input tensors.

    Columns are partitioned four ways: sequential vs. non-sequential crossed
    with categorical (long tensors) vs. continuous (float tensors). Sequential
    columns are expected flat as ``n_cols * time_steps`` values per row and are
    reshaped to ``(batch, time_steps, n_cols)``.
    """

    def __init__(self, df, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, tgt_id=None):
        """Build tensors from *df*.

        Args:
            df: source dataframe, one row per example.
            seq_cols: flattened sequential column names (n_cols * time_steps entries).
            non_seq_cols: static per-example column names.
            output_col: label column name(s).
            cat_cols: names treated as categorical.
            cont_cols: names treated as continuous.
            time_steps: sequence length the sequential columns unroll over.
            tgt_id: optional id column(s) carried through untouched; None/[] means none.
        """
        # None default avoids the shared mutable-default pitfall; [] keeps the
        # original "no id columns" behavior (and explicit tgt_id=[] still works).
        tgt_id = [] if tgt_id is None else tgt_id

        self.len = df.shape[0]
        # Normalize falsy inputs once and use the normalized lists everywhere:
        # the original normalized only the stored attribute while iterating the
        # raw argument, which crashed when seq_cols was None.
        self.seq_cols = seq_cols if seq_cols else []
        self.non_seq_cols = non_seq_cols if non_seq_cols else []
        self.output_col = output_col

        seq_cont = [c for c in self.seq_cols if c in cont_cols]
        seq_cat = [c for c in self.seq_cols if c in cat_cols]

        self.seq_contX = torch.tensor(df[seq_cont].values, dtype=torch.float32)
        self.seq_catX = torch.tensor(df[seq_cat].values, dtype=torch.long)

        # (batch, n_cols * time_steps) -> (batch, n_cols, time_steps) -> (batch, time_steps, n_cols)
        self.seq_contX = self.seq_contX.reshape(
            (self.seq_contX.shape[0], self.seq_contX.shape[1] // time_steps, time_steps)).permute(0, 2, 1)
        self.seq_catX = self.seq_catX.reshape(
            (self.seq_catX.shape[0], self.seq_catX.shape[1] // time_steps, time_steps)).permute(0, 2, 1)

        self.non_seq_catX = torch.tensor(
            df[[c for c in self.non_seq_cols if c in cat_cols]].values, dtype=torch.long)
        self.non_seq_contX = torch.tensor(
            df[[c for c in self.non_seq_cols if c in cont_cols]].values, dtype=torch.float32)

        self.y = torch.tensor(df[output_col].values, dtype=torch.float32)

        # Ids stay a numpy array (shape (len, 0) when no id columns are given).
        self.tgt_id = df[tgt_id].values

    @classmethod
    def for_inference(cls, continuous: pd.Series, categorical: pd.Series, seq_cols, non_seq_cols, cat_cols, cont_cols, time_steps):
        """Build a dataset from two Series of per-row value lists (no labels, no ids)."""
        cont_df = pd.DataFrame(continuous.values.tolist(), columns=cont_cols)
        cat_df = pd.DataFrame(categorical.values.tolist(), columns=cat_cols)

        df = pd.concat([cont_df, cat_df], axis=1)
        return cls(df, seq_cols, non_seq_cols, [], cat_cols, cont_cols, time_steps, tgt_id=[])

    def __getitem__(self, index):
        """Return [id, label, seq_cat, seq_cont, non_seq_cat, non_seq_cont] for one example."""
        return [self.tgt_id[index], self.y[index], self.seq_catX[index], self.seq_contX[index], self.non_seq_catX[index], self.non_seq_contX[index]]

    def __len__(self):
        return self.len
49 |
50 |
def id_collate(batch):
    """Collate a batch whose first per-sample field is an id.

    Ids are stacked into a numpy array (not a tensor) while the remaining
    fields go through torch's default_collate; the result is a single tuple
    of (ids, *collated_fields).
    """
    ids = np.stack([sample[0] for sample in batch], axis=0)
    collated = default_collate([sample[1:] for sample in batch])
    return tuple([ids] + collated)
59 |
--------------------------------------------------------------------------------
/caspr/data/load.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from sklearn.model_selection import train_test_split
3 | from torch.utils.data import DataLoader
4 | from torch.utils.data.distributed import DistributedSampler
5 |
6 | from caspr.data.common_dataset import CommonDataset, id_collate
7 |
8 |
def transform_and_load(batch, device, tgt_id_cols, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps):
    """
    Transforms a batch of feature tensors from Petastorm, into input tensors for CASPR, then loads onto chosen device.

    Args:
        batch: mapping of column name -> tensor; every column shares the same leading batch dimension.
        device: target device for model inputs (ids are returned on CPU as numpy).
        tgt_id_cols: id column names.
        seq_cols: sequential feature column names.
        non_seq_cols: static feature column names.
        output_col: label column name(s).
        cat_cols: names treated as categorical (long).
        cont_cols: names treated as continuous (float).
        time_steps: sequence length of the sequential columns.

    Returns:
        (tgt_id, y, seq_catX, seq_contX, non_seq_catX, non_seq_contX)

    Raises:
        ValueError: if batch is empty or time_steps is not positive.
    """
    if not batch:
        raise ValueError("non-empty batch of tensors required")

    if not int(time_steps) > 0:
        raise ValueError("time_steps should be a positive integer")

    # Batch size is read off the first column; all columns are assumed to share it.
    batch_size = batch[next(iter(batch))].shape[0]

    seq_contX_cols = [item for item in seq_cols if item in cont_cols]
    if seq_contX_cols:
        seq_contX = torch.cat([batch[c] for c in seq_contX_cols], 0).float().to(device)
        # Tensor.T on a 3-D tensor (reverse all dims) is deprecated in PyTorch;
        # permute(2, 1, 0) performs the identical (cols, steps, batch) -> (batch, steps, cols) flip.
        seq_contX = seq_contX.reshape(-1, time_steps, batch_size).permute(2, 1, 0)
    else:
        seq_contX = torch.zeros((batch_size, time_steps, 0), device=device).float()

    seq_catX_cols = [item for item in seq_cols if item in cat_cols]
    if seq_catX_cols:
        seq_catX = torch.cat([batch[c] for c in seq_catX_cols], 0).long().to(device)
        seq_catX = seq_catX.reshape(-1, time_steps, batch_size).permute(2, 1, 0)
    else:
        seq_catX = torch.zeros((batch_size, time_steps, 0), device=device).long()

    non_seq_catX_cols = [item for item in non_seq_cols if item in cat_cols]
    if non_seq_catX_cols:
        non_seq_catX = torch.cat([batch[c] for c in non_seq_catX_cols], 0).long().to(device)
        non_seq_catX = non_seq_catX.reshape(len(non_seq_catX_cols), batch_size).T
    else:
        non_seq_catX = torch.zeros(batch_size, 0, device=device).long()

    non_seq_contX_cols = [item for item in non_seq_cols if item in cont_cols]
    if non_seq_contX_cols:
        non_seq_contX = torch.cat([batch[c] for c in non_seq_contX_cols], 0).float().to(device)
        non_seq_contX = non_seq_contX.reshape(len(non_seq_contX_cols), batch_size).T
    else:
        non_seq_contX = torch.zeros(batch_size, 0, device=device).float()

    if output_col:
        y = torch.cat([batch[c] for c in output_col], 0).to(device)
        y = y.reshape((len(output_col), -1)).T
    else:
        y = torch.zeros(batch_size, 0, device=device).float()

    if tgt_id_cols:
        # Ids are detached from the device and returned as a numpy array.
        tgt_id = torch.cat([batch[c] for c in tgt_id_cols], 0).long().cpu()
        tgt_id = tgt_id.reshape(len(tgt_id_cols), batch_size).T.numpy()
    else:
        tgt_id = torch.zeros(batch_size, 0).long().cpu().numpy()

    return tgt_id, y, seq_catX, seq_contX, non_seq_catX, non_seq_contX
62 |
63 |
def init_datasets(df, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, seq_len, test_ratio=0.2, seed=None):
    """
    Splits an incoming columnar dataframe into CASPR train and validation datasets

    Args:
        df: full dataframe to split row-wise.
        test_ratio: fraction of rows assigned to the validation split.
        seed: optional random_state forwarded to the splitter for reproducibility.

    Returns:
        (dataset_train, dataset_val) as CommonDataset instances.
    """
    train_pd, val_pd = train_test_split(df, test_size=test_ratio, random_state=seed)

    print(f"train: {len(train_pd)}, val: {len(val_pd)}")

    def _as_dataset(frame):
        # Both splits share the exact same column configuration.
        return CommonDataset(frame, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, seq_len)

    return _as_dataset(train_pd), _as_dataset(val_pd)
80 |
81 |
def init_loaders(ds_train, ds_val, batch_size, num_workers=0, shuffle=False, pin_memory=True, world_size=1, rank=0):
    """
    Initializes train and validation data loaders. The loaders support distributed sampling when world_size > 1.
    """

    print("Initializing dataloaders... Replica: %d of %d" % (rank + 1, world_size))

    def _make_loader(dataset):
        # One independent DistributedSampler per dataset when running with multiple replicas.
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank,
                                     shuffle=shuffle) if world_size > 1 else None
        return DataLoader(dataset, pin_memory=pin_memory, batch_size=batch_size,
                          num_workers=num_workers, sampler=sampler, collate_fn=id_collate)

    return _make_loader(ds_train), _make_loader(ds_val)
102 |
--------------------------------------------------------------------------------
/caspr/models/README.md:
--------------------------------------------------------------------------------
1 | The model architecture should follow the following guidelines to support explainability
2 |
3 | Basic changes made:
4 | 1. Every model class should have the flags - explain, interpretable_emb_non_seq and interpretable_emb_seq
5 |
6 | 2. The nn.Embedding layers and the dropout after that need to be modularised
7 | out of the model and the Seq_Cat_Embedding and Non_Seq_Cat_Embedding classes present in the Embedding_Layers.py file should be used for them
8 |
9 | 3. The input to every forward function should be a single concatenated vector
10 |
11 | 4. The activate_explainer_mode and deactivate_explainer_mode functions should be a part of every model class (also every model wrapper class)
12 |
13 |
14 | """
15 | Some notes regarding the explainer:
16 | 1. When we join multiple models to form a new model -
17 | use the activate_explainer_mode functions to call the
18 | respective functions for all constituent sub_model classes
19 |
20 | 2. Right now the architecture supports only model wrappers which join the model in a vertical fashion (the case for all our models for now)
21 |
22 | 3. The explainer modes are activated by the DLExplainer module and
23 | also deactivated by it
24 |
25 | 4. The indices to embedding conversion happens in the DLExplainer module
26 |
27 |
28 | """
29 |
30 | Please refer to the mlp_autoencoder.py file for an example
--------------------------------------------------------------------------------
/caspr/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 |
3 | #
4 |
5 | # Unless required by applicable law or agreed to in writing, software
6 |
7 | # distributed under the License is distributed on an "AS IS" BASIS,
8 |
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 |
11 | # See the License for the specific language governing permissions and
12 |
13 | # limitations under the License.
14 |
15 | #
16 |
17 | # ==============================================================================
18 |
--------------------------------------------------------------------------------
/caspr/models/attention_mechanisms.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """Attention mechanisms base class."""
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
class MultiHeadAttentionLayer(nn.Module):  # noqa: W0223
    """Scaled dot-product attention computed in parallel over several heads."""

    def __init__(self, hid_dim, n_heads, dropout):
        """Initialize model with params."""
        super().__init__()

        # The hidden dimension is split evenly across heads.
        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        # Registered as a buffer so it follows the module across devices.
        self.register_buffer('scale', torch.sqrt(torch.FloatTensor([self.head_dim])))

    def forward(self, query, key, value, mask=None):
        """Run a forward pass of model over the data.

        query/key/value: [batch size, len, hid dim]
        Returns (output [batch size, query len, hid dim], attention weights).
        """
        batch_size = query.shape[0]

        def split_heads(projected):
            # [batch, len, hid dim] -> [batch, n heads, len, head dim]
            return projected.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        q = split_heads(self.fc_q(query))
        k = split_heads(self.fc_k(key))
        v = split_heads(self.fc_v(value))

        # energy = [batch, n heads, query len, key len]
        energy = torch.matmul(q, k.permute(0, 1, 3, 2)) / self.scale

        if mask is not None:
            # Masked-out positions get a large negative score so that
            # softmax drives their attention weight to ~0.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)

        # Weighted sum of values: [batch, n heads, query len, head dim]
        x = torch.matmul(self.dropout(attention), v)

        # Merge the heads back into a single hidden dimension.
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)

        # Final output projection: [batch, query len, hid dim]
        x = self.fc_o(x)

        return x, attention
81 |
82 |
class MultiHeadAttentionLSTMWrapper(nn.Module):  # noqa: W0223
    """Post-norm multi-head self-attention block that pools its output over time."""

    def __init__(self, n_head, d_model, dropout=0.1):
        """Initialize model with params."""
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        self.multi_head_attn = MultiHeadAttentionLayer(hid_dim=d_model, n_heads=n_head, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        """Run a forward pass of model over the data."""
        attended, _ = self.multi_head_attn(q, k, v, mask=mask)

        # Residual connection with dropout, followed by layer normalisation.
        normed = self.self_attn_layer_norm(q + self.dropout(attended))

        # Collapse the time dimension into a single context vector.
        return torch.sum(normed, 1)
100 |
101 |
class BahdanauAttention(nn.Module):  # noqa: W0223
    """Additive (Bahdanau-style) attention over a sequence of encoder outputs."""

    def __init__(self, hidden_size, num_directions=1):
        """Initialize model with params."""

        super().__init__()
        self.num_directions = num_directions
        self.hidden_size = hidden_size
        self.fc_encoder = nn.Linear(self.num_directions*self.hidden_size, self.hidden_size, bias=False)
        self.attnHidden = nn.Linear(self.hidden_size, 1)

    def forward(self, enc_outputs):
        """Run a forward pass of model over the data."""
        # Score every time step: Linear -> tanh -> Linear down to one scalar.
        scores = self.attnHidden(torch.tanh(self.fc_encoder(enc_outputs)))

        # Normalise over the time dimension, then move it last for the bmm.
        weights = F.softmax(scores, dim=1).permute(0, 2, 1)

        # Attention-weighted sum of the encoder outputs:
        # [batch, 1, num_directions * hidden_size]
        return torch.bmm(weights, enc_outputs)
124 |
--------------------------------------------------------------------------------
/caspr/models/convolutional_aggregation.py:
--------------------------------------------------------------------------------
1 | """CNN based layer base class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
class ConvAggregation(nn.Module):  # noqa: W0223
    """Initialise a CNN based layer that reduces the size of our input.

    It treats the sequential input like an image with a single channel and performs learned aggregation
    """

    def __init__(self, kernel_size=(3, 3), stride=(2, 2), max_pool_size=(2, 2), dropout_size=0.):
        """Initialise the cnn layers.

        Args:
            kernel_size : Tuple which determines the size of the cnn kernel
            stride : Tuple which determines the size of the strides in the x and y direction
            max_pool_size : Tuple which determines the size of the max_pooling function
            dropout_size : Value of dropout added after entire processing
        """
        super().__init__()
        # Single channel in and out: the sequence is treated as a 1-channel image.
        self.in_channels = 1
        self.out_channels = 1

        self.conv_layer = nn.Conv2d(in_channels=self.in_channels,
                                    out_channels=self.out_channels, kernel_size=kernel_size, stride=stride)
        self.max_pool = nn.MaxPool2d(max_pool_size)
        self.conv_dropout = nn.Dropout(dropout_size)

    def forward(self, input_tensor):
        """Run a forward pass of model over the data.

        Args:
            input_tensor: tensor of shape [batch_size, height, width]

        Return:
            Aggregated tensor of shape [batch_size, out_height, out_width]
        """

        # Conv2d expects (batch_size, in_channels, H, W); add the channel dim.
        input_tensor = torch.unsqueeze(input_tensor, 1)

        # torch.tanh replaces the deprecated F.tanh (removed in recent PyTorch).
        input_tensor = torch.tanh(self.conv_layer(input_tensor))
        input_tensor = self.max_pool(input_tensor)

        # Squeeze away the channel dimension added above so the output layout
        # matches the input layout (batch_size, H', W').
        input_tensor = input_tensor.squeeze(1)
        output_tensor = self.conv_dropout(input_tensor)

        return output_tensor
51 |
--------------------------------------------------------------------------------
/caspr/models/dec.py:
--------------------------------------------------------------------------------
1 | """CASPR deep embedding clustering class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 | from torch.nn import Parameter
6 |
7 | from caspr.utils.preprocess import get_nonempty_tensors
8 |
9 |
class ClusterAssignment(nn.Module):  # noqa: W0223
    """Soft cluster assignment via a Student's t-distribution kernel."""

    def __init__(self,
                 cluster_number,
                 embedding_dimension,
                 alpha=1.0,
                 cluster_centers=None):
        """Handle the soft assignment.

        For a description see 3.1.1. in Xie/Girshick/Farhadi, where the Student's t-distribution
        is used to measure similarity between feature vector and each cluster centroid.

        Args:
            cluster_number (int): number of clusters
            embedding_dimension (int): embedding dimension of feature vectors
            alpha (float): parameter representing the degrees of freedom in the t-distribution, default 1.0
            cluster_centers (tensors): clusters centers to initialise, if None then use Xavier uniform
        """
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.cluster_number = cluster_number
        self.alpha = alpha
        if cluster_centers is not None:
            centers = cluster_centers
        else:
            # No centers supplied: start from a Xavier-uniform draw.
            centers = torch.zeros(
                self.cluster_number,
                self.embedding_dimension,
                dtype=torch.float
            )
            nn.init.xavier_uniform_(centers)
        self.cluster_centers = Parameter(centers)

    def forward(self, batch):
        """Run a forward pass of model over the data.

        Compute the soft assignment for a batch of feature vectors, returning a batch of assignments for each cluster.

        Args:
            batch: FloatTensor of [batch size, embedding dimension]

        Return:
            FloatTensor of [batch size, number of clusters]
        """
        # Squared Euclidean distance of every sample to every centroid.
        sq_dist = torch.sum((batch.unsqueeze(1) - self.cluster_centers) ** 2, 2)

        # Student's t kernel: (1 + d^2/alpha)^(-(alpha+1)/2), row-normalised.
        kernel = (1.0 / (1.0 + (sq_dist / self.alpha))) ** (float(self.alpha + 1) / 2)
        return kernel / torch.sum(kernel, dim=1, keepdim=True)
58 |
59 |
class DEC(nn.Module):  # noqa: W0223
    """Deep Embedded Clustering: an encoder followed by a ClusterAssignment stage."""

    def __init__(self,
                 cluster_number,
                 hidden_dimension,
                 enc,
                 alpha=1):
        """Initialize the parts of DEC algorithm.

        as described in Xie/Girshick/Farhadi; this includes the AutoEncoder stage and the ClusterAssignment stage.

        Args:
            cluster_number (int): number of clusters
            hidden_dimension (int): hidden dimension, output of the encoder
            enc (nn.Module): encoder to use
            alpha (float): parameter representing the degrees of freedom in the t-distribution, default 1.0
        """
        super().__init__()
        self.enc = enc
        self.hidden_dimension = hidden_dimension
        self.cluster_number = cluster_number
        self.alpha = alpha
        self.assignment = ClusterAssignment(cluster_number, self.hidden_dimension, alpha)

    def forward(self, *args):
        """Compute the cluster assignment.

        Encode the inputs, then map the encodings to soft cluster assignments.

        Args:
            batch: FloatTensor of [batch size, embedding dimension]

        Return:
            FloatTensor of [batch size, number of clusters]
        """
        encoded = self.enc(*args)
        return self.assignment(encoded)

    def run(self,  # noqa : R0913
            y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion):  # noqa : W0613
        """Forward the batch and compute the clustering loss against the target distribution."""
        data = (seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data)
        nonempty_tensors, nonempty_idx = get_nonempty_tensors(data)
        output = self(*nonempty_tensors, nonempty_idx)
        # The target distribution is treated as a constant for this step.
        target = _target_distribution(output).detach()
        # KL-style criterion normalised by the batch size.
        loss = criterion(output.log(), target) / output.shape[0]
        return output, loss
105 |
106 |
107 | def _target_distribution(batch):
108 | """Compute the target distribution p_ij, given the batch (q_ij).
109 |
110 | 3.1.3 Equation 3 of Xie/Girshick/Farhadi; this used the KL-divergence loss function.
111 |
112 | Args:
113 | batch: FloatTensor of [batch size, number of clusters]
114 |
115 | Return:
116 | FloatTensor of [batch size, number of clusters]
117 | """
118 | weight = (batch ** 2) / torch.sum(batch, 0)
119 | return (weight.t() / torch.sum(weight, 1)).t()
120 |
--------------------------------------------------------------------------------
/caspr/models/dense_bn_dropout.py:
--------------------------------------------------------------------------------
1 | """CASPR base dense layer class."""
2 |
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
class DenseBnDropout(nn.Module):  # noqa: W0223
    """Dense Layers w/ dropout and batch-normalization.

    A module comprising of a sequential structure of [Linear -> Batch Normalisation -> Dropout]
    used for multiple iterations through it
    When the input is a 3D tensor - batch_size x seq_len x features
    When the input is a 2D tensor - batch_size x features
    """

    def __init__(self, lin_layer_sizes, lin_layer_dropouts, input_size):
        """Initialise the layers.

        Args:
            lin_layer_sizes (list): sizes of the linear layers being used across multiple iterations
            lin_layer_dropouts (list): values of the dropout layers across multiple iterations
            input_size (integer): size of the input tensor - batch_size x 'input_size' x seq_len
        """

        super().__init__()
        # Chain the layer widths: input_size -> sizes[0] -> sizes[1] -> ...
        widths = [input_size] + list(lin_layer_sizes)
        self.lin_layers = nn.ModuleList(
            [nn.Linear(w_in, w_out) for w_in, w_out in zip(widths, widths[1:])])
        for lin_layer in self.lin_layers:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        self.dropout_layers = nn.ModuleList([nn.Dropout(p) for p in lin_layer_dropouts])
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])

    def forward(self, input_tensor):
        """Run a forward pass of model over the data."""
        # BatchNorm1d normalises over dim 1, so 3D (sequential) inputs need the
        # feature dimension moved to position 1 around every BN call.
        is_seq = input_tensor.ndim == 3

        for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers, self.dropout_layers, self.bn_layers):
            hidden = F.relu(lin_layer(input_tensor))
            if is_seq:
                hidden = bn_layer(hidden.permute(0, 2, 1)).permute(0, 2, 1)
            else:
                hidden = bn_layer(hidden)
            input_tensor = dropout_layer(hidden)

        return input_tensor
57 |
--------------------------------------------------------------------------------
/caspr/models/embedding_layer.py:
--------------------------------------------------------------------------------
1 | """CASPR embedding layer base class."""
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 |
7 |
class CategoricalEmbedding(nn.Module):  # noqa: W0223
    """Define embedding layers to convert categorical variable values to continuous embeddings.

    Uses pytorch defined nn.Embedding layers
    The incoming data for this class has 3 dimensions - dim(1) is the number of time steps in the sequence
    when used for a seq variable
    When being used for non-seq variable - data has 2 dimensions
    """

    def __init__(self,  # noqa: R0913
                 emb_dims, emb_dropout, is_seq=False, pretrained_vecs=None, freeze_pretrained=True):
        """Initialise the emb layer class.

        Args:
            emb_dims: A list of tuple (x, y) which contains the input for the nn.Embedding layer
            emb_dropout : The dropout value for the layers applied after concatenation of all the embeddings
            is_seq : determines if this layer has been initialised for sequential or non-sequential data
            pretrained_vecs : The tensor which contains the pretrained values. For variables for which we dont have
                the vecs we initialise the nn.Embedding layer and backpropagate through them
            freeze_pretrained : This boolean label determines if we freeze the pretrained embeddings and dont
                backpropagate through them
        """

        super().__init__()

        self.emb_dims = emb_dims
        self.is_seq = is_seq
        self.emb_layers = nn.ModuleList([nn.Embedding(cardinality, dim) for cardinality, dim in emb_dims])
        if pretrained_vecs is not None and len(emb_dims) > 0:
            # Swap in pretrained tables where given; None entries keep the fresh layer.
            for idx, vecs in enumerate(pretrained_vecs):
                if vecs is not None:
                    self.emb_layers[idx] = nn.Embedding.from_pretrained(vecs, freeze=freeze_pretrained)
        self.num_classes = [cardinality for cardinality, _ in emb_dims]
        self.emb_size = np.sum([dim for _, dim in emb_dims], dtype=np.int32)
        self.emb_dropout_layer = nn.Dropout(emb_dropout)

    def forward(self, cat_data):
        """Run a forward pass of model over the data."""
        indices = cat_data.long()
        # Look up column i of the (batched) input in embedding table i.
        embedded = [layer(indices[..., i]) for i, layer in enumerate(self.emb_layers)]
        return self.emb_dropout_layer(torch.cat(embedded, -1))
52 |
--------------------------------------------------------------------------------
/caspr/models/factory.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import numpy as np
4 | import torch.nn as nn
5 |
6 | from caspr.models.dense_bn_dropout import DenseBnDropout
7 | from caspr.models.lstm_decoder import LSTM_attention_embedding_decoder
8 | from caspr.models.model_wrapper import LSTMAutoencoder, OutputLayer, TransformerAutoEncoder
9 | from caspr.models.transformer import TransformerDecoder, TransformerEncoder
10 | from caspr.models.unified_encoder import UnifiedEncoder
11 | from caspr.models.unified_transformer_encoder import UnifiedTransformerEncoder
12 |
13 | TRANSFORMER = 'TransformerAutoEncoder'
14 | LSTM = 'LSTMAutoencoder'
15 | logger = logging.getLogger(__name__)
16 |
class CASPRFactory:
    """Factory for constructing CASPR autoencoder architectures.

    Supports the Transformer based autoencoder (TRANSFORMER) and the LSTM
    based autoencoder (LSTM); see ``create``.
    """

    def __init__(self, cat_cols_, num_activities, cont_cols_, seq_cols_, non_seq_cols_, date_cols_=None, seq_len=15, max_emb_size=25, emb_dims_non_seq=None, emb_dims_seq=None) -> None:
        """Precompute column partitions and embedding dimensions.

        Args:
            cat_cols_ (list): categorical column names
            num_activities (dict): unique-value counts per categorical column;
                when falsy, emb_dims_non_seq/emb_dims_seq are used as given
            cont_cols_ (list): continuous column names
            seq_cols_ (list): sequential column names
            non_seq_cols_ (list): non-sequential column names
            date_cols_ (list): date-derived columns counted with the sequential
                continuous features (default: empty list)
            seq_len (int): sequence length of the input
            max_emb_size (int): cap on any single embedding dimension
            emb_dims_non_seq, emb_dims_seq: precomputed (cardinality, dim)
                tuples, used only when num_activities is falsy
        """
        # Fix mutable-default-argument bug: a literal [] default is shared
        # across all calls; resolve the None sentinel here instead.
        date_cols_ = [] if date_cols_ is None else date_cols_

        self.support = {
            TRANSFORMER: self.__create_transformer_autoencoder__,
            LSTM: self.__create_autoencoder__
        }

        if num_activities:
            self.emb_dims_non_seq, self.emb_dims_seq = self.calculate_embedding_dimensions(num_activities, seq_cols=seq_cols_,
                                                                                           non_seq_cols=non_seq_cols_,
                                                                                           max_emb_size=max_emb_size)
        else:
            self.emb_dims_non_seq = emb_dims_non_seq
            self.emb_dims_seq = emb_dims_seq

        self.seq_len = seq_len

        # Partition the categorical/continuous columns into seq vs non-seq.
        self.non_seq_cat_ = [x for x in cat_cols_ if x in non_seq_cols_]
        self.seq_cat_ = [x for x in cat_cols_ if x in seq_cols_]
        self.non_seq_cont_ = [x for x in cont_cols_ if x in non_seq_cols_]
        self.seq_cont_ = [x for x in cont_cols_+date_cols_ if x in seq_cols_]

        self.seq_cont_dim = len(set(seq_cols_) & set(cont_cols_)) + len(date_cols_)
        self.non_seq_cont_dim = len(set(non_seq_cols_) & set(cont_cols_))
        # Append non seq features to the end of the sequence if exist
        self.adjust_seq_len = seq_len + int(len(non_seq_cols_) > 0)

    @staticmethod
    def calculate_embedding_dimensions(num_activities, seq_cols=None, non_seq_cols=None, max_emb_size=25):
        """Calculate the emb dims for the categorical embedding layer for each categorical variable.

        Args:
            num_activities: number of unique activities for each categorical variable
            seq_cols (list): List of sequential vars
            non_seq_cols (list): List of non-sequential vars
            max_emb_size (Default = 25) : The max size of the embedding layer for a variable
                (needed when the possible values are very high)

        Return:
            Tuple of (emb_dims_non_seq, emb_dims_seq) lists of (cardinality, dim) tuples.
        """

        # Avoid using empty lists as default values
        seq_cols = [] if seq_cols is None else seq_cols
        non_seq_cols = [] if non_seq_cols is None else non_seq_cols

        cat_seq_dims = [count for col, count in num_activities.items() if col in seq_cols]
        cat_non_seq_dims = [count for col, count in num_activities.items() if col in non_seq_cols]
        # Heuristic: half the cardinality (rounded up), capped at max_emb_size.
        emb_dims_non_seq = [(x, int(np.minimum(max_emb_size, (x + 1) // 2))) for x in cat_non_seq_dims]
        emb_dims_seq = [(x, int(np.minimum(max_emb_size, (x + 1) // 2))) for x in cat_seq_dims]

        return emb_dims_non_seq, emb_dims_seq

    def __create_transformer_autoencoder__(self, device="cuda", HIDDEN_SIZE=64,
                                           NUM_LAYERS_ENC=4,
                                           NUM_LAYERS_DEC=2,
                                           NUM_HEADS_ENC=2,
                                           NUM_HEADS_DEC=4,
                                           PF_DIM_ENC=32,
                                           PF_DIM_DEC=128,
                                           DROPOUT_ENC=0.1,
                                           DROPOUT_DEC=0.1,
                                           EMBEDDING_DROPOUT_SEQUENTIAL=0.1,
                                           EMBEDDING_DROPOUT_NON_SEQUENTIAL=0.1) -> TransformerAutoEncoder:
        """Assemble the Transformer-based autoencoder and move it to `device`."""
        enc = TransformerEncoder(hid_dim=HIDDEN_SIZE, n_layers=NUM_LAYERS_ENC, n_heads=NUM_HEADS_ENC,
                                 pf_dim=PF_DIM_ENC, dropout=DROPOUT_ENC, max_length=self.adjust_seq_len)

        # Decoder shares the encoder's positional embedding table.
        dec = TransformerDecoder(hid_dim=HIDDEN_SIZE, n_layers=NUM_LAYERS_DEC, n_heads=NUM_HEADS_DEC,
                                 pf_dim=PF_DIM_DEC, dropout=DROPOUT_DEC, pos_embedding=enc.pos_embedding)

        emb_seq_num_classes = [x for x, _ in self.emb_dims_seq]
        emb_non_seq_num_classes = [x for x, _ in self.emb_dims_non_seq]

        output_layer = OutputLayer(HIDDEN_SIZE, self.seq_cont_dim, self.non_seq_cont_dim,
                                   emb_seq_num_classes, emb_non_seq_num_classes)

        unified_transformer_encoder = UnifiedTransformerEncoder(enc,
                                                                self.emb_dims_non_seq,
                                                                EMBEDDING_DROPOUT_NON_SEQUENTIAL,
                                                                self.emb_dims_seq,
                                                                EMBEDDING_DROPOUT_SEQUENTIAL,
                                                                HIDDEN_SIZE,
                                                                self.seq_cont_dim,
                                                                self.non_seq_cont_dim,
                                                                non_seq_pretrained_embs=None,
                                                                freeze_non_seq_pretrained_embs=True,
                                                                seq_pretrained_embs=None,
                                                                freeze_seq_pretrained_embs=True)

        return TransformerAutoEncoder(unified_transformer_encoder, dec, output_layer).to(device)

    def __create_autoencoder__(self, device="cuda", HIDDEN_SIZE=64,
                               NUM_LAYERS=1,
                               LIN_LAYER_SIZES_NON_SEQUENTIAL=None,
                               LIN_LAYER_SIZES_SEQUENTIAL=None,
                               EMBEDDING_DROPOUT_NON_SEQUENTIAL=0.04,
                               LIN_LAYER_DROPOUTS_NON_SEQUENTIAL=None,
                               EMBEDDING_DROPOUT_SEQUENTIAL=0.04,
                               LIN_LAYER_DROPOUTS_SEQUENTIAL=None) -> LSTMAutoencoder:
        """Assemble the LSTM-based autoencoder and move it to `device`."""
        # Resolve list defaults in the body rather than the signature so the
        # same list object is not shared (and mutable) across calls.
        if LIN_LAYER_SIZES_NON_SEQUENTIAL is None:
            LIN_LAYER_SIZES_NON_SEQUENTIAL = [50, 25]
        if LIN_LAYER_SIZES_SEQUENTIAL is None:
            LIN_LAYER_SIZES_SEQUENTIAL = [50, 25]
        if LIN_LAYER_DROPOUTS_NON_SEQUENTIAL is None:
            LIN_LAYER_DROPOUTS_NON_SEQUENTIAL = [0.0001, 0.01]
        if LIN_LAYER_DROPOUTS_SEQUENTIAL is None:
            LIN_LAYER_DROPOUTS_SEQUENTIAL = [0.001, 0.01]

        output_dim = len(self.seq_cont_)
        num_classes = [x for (x, _) in self.emb_dims_seq]

        # Model objects initialisation
        encoder = UnifiedEncoder(emb_dims_non_seq=self.emb_dims_non_seq,
                                 emb_dropout_non_seq=EMBEDDING_DROPOUT_NON_SEQUENTIAL,
                                 emb_dims_seq=self.emb_dims_seq,
                                 emb_dropout_seq=EMBEDDING_DROPOUT_SEQUENTIAL,
                                 emb_lin_layer_sizes_non_seq=LIN_LAYER_SIZES_NON_SEQUENTIAL,
                                 emb_lin_layer_dropouts_non_seq=LIN_LAYER_DROPOUTS_NON_SEQUENTIAL,
                                 emb_lin_layer_sizes_seq=LIN_LAYER_SIZES_SEQUENTIAL,
                                 emb_lin_layer_dropouts_seq=LIN_LAYER_DROPOUTS_SEQUENTIAL,
                                 lstm_hidden_size=HIDDEN_SIZE,
                                 output_size=output_dim,
                                 seq_len=self.seq_len,
                                 non_seq_cont_count=len(self.non_seq_cont_),
                                 seq_cat_count=len(self.seq_cat_),
                                 seq_cont_count=len(self.seq_cont_),
                                 non_seq_cat_count=len(self.non_seq_cat_))

        input_dim = int(encoder.seq_cont_count + encoder.no_of_embs_seq)

        decoder = LSTM_attention_embedding_decoder(input_dim=input_dim,
                                                   hidden_size=HIDDEN_SIZE,
                                                   num_layers=NUM_LAYERS,
                                                   output_dim=output_dim,
                                                   num_classes=num_classes)

        # One small MLP head per non-sequential categorical variable.
        mlp_non_seq_cat_list = []
        for non_seq_cat, _ in self.emb_dims_non_seq:
            mlp_non_seq_cat_list.append(DenseBnDropout(LIN_LAYER_SIZES_NON_SEQUENTIAL+[
                non_seq_cat], LIN_LAYER_DROPOUTS_NON_SEQUENTIAL+[0], HIDDEN_SIZE))
        mlp_non_seq_cont = DenseBnDropout(
            LIN_LAYER_SIZES_NON_SEQUENTIAL, LIN_LAYER_DROPOUTS_NON_SEQUENTIAL, HIDDEN_SIZE)

        autoenc = LSTMAutoencoder(encoder, mlp_non_seq_cat_list, mlp_non_seq_cont, decoder).to(device)

        return autoenc

    def create(self, architecture: str, device="cuda", **hyperparams) -> nn.Module:
        """Instantiate the requested architecture with the given hyperparams.

        Raises:
            ValueError: if `architecture` is not one of the supported keys.
        """
        if architecture not in self.support:
            raise ValueError("Unknown architecture specified. Model Factory currently supports: %s Requested: %s" % (str(self.support.keys()), architecture))

        constructor_f = self.support[architecture]

        # Lazy %-style args: the message is only built when INFO is enabled.
        logger.info("Initializing CASPR with %s architecture. Hyperparams provided: %s", architecture, hyperparams)

        return constructor_f(device, **hyperparams)
165 |
--------------------------------------------------------------------------------
/caspr/models/lstm_autoencoder_sequence.py:
--------------------------------------------------------------------------------
1 | """Bahdanau attention based LSTM encoder."""
2 |
3 | import warnings
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 |
9 | import caspr.models
10 |
11 | warnings.simplefilter('ignore')
12 |
13 |
14 | class LSTM_attention_embedding_encoder_sequence(nn.Module): # noqa: W0223
15 | """Luong/Bahdanau attention based LSTM encoder."""
16 |
17 | def __init__(self, # noqa: R0913, R0914
18 | emb_dims_non_seq,
19 | emb_dims_seq,
20 | lin_layer_sizes_non_sequential,
21 | lin_layer_sizes_sequential,
22 | hidden_size,
23 | num_layers,
24 | bidirectional,
25 | output_size,
26 | emb_dropout_non_seq,
27 | lin_layer_dropouts_non_sequential,
28 | emb_dropout_seq,
29 | lin_layer_dropouts_sequential,
30 | lin_layer_sizes_fin,
31 | lin_layer_dropouts_fin, # noqa: W0613
32 | seq_len, input_dim,
33 | non_seq_cont_count, seq_cat_count, seq_cont_count, non_seq_cat_count,
34 | device):
35 | """Initialise the pytorch LSTM layer.
36 |
37 | Args:
38 | emb_dims_non_seq, emb_dims_seq (list of int tuples):
39 | List of category dimension and corresponding embedding size.
40 | lin_layer_sizes_non_sequential, lin_layer_sizes_sequential (list of int tuples):
41 | List of [m1*m2] tuples for embedding dimension reduction and non-linearity
42 | emb_dropout_non_seq, emb_dropout_seq (float): dropout values for embedding layers
43 | lin_layer_dropouts_non_seq, lin_layer_dropouts_seq (list of float):
44 | dropout values for linear layers corresponding to embedding layers
45 | hidden_size (int): Size of the hidden state
46 | num_layers (int): Number of stacked LSTM layers
47 | bidirectional (bool): Flag for bi/uni LSTM
48 | output_size (int): Size of the final output layer
49 | lin_layer_sizes_fin (list of int tuples):
50 | List of [m1*m2] tuples for non-linear combination of sequential and nonsequential inputs
51 | seq_len (int): Length of input Sequence
52 | """
53 | super().__init__()
54 |
55 | self.device = device
56 | self.non_seq_emb_layers = nn.ModuleList(
57 | [nn.Embedding(x, y) for x, y in emb_dims_non_seq])
58 | self.seq_emb_layers = nn.ModuleList(
59 | [nn.Embedding(x, y) for x, y in emb_dims_seq])
60 | self.no_of_embs_non_seq = sum([y for x, y in emb_dims_non_seq])
61 | self.no_of_embs_seq = sum([y for x, y in emb_dims_seq])
62 | self.input_dim = input_dim
63 | self.seq_len = seq_len
64 | self.hidden_size = hidden_size
65 | self.non_seq_cont_count = non_seq_cont_count
66 | self.non_seq_cat_count = non_seq_cat_count
67 | self.context_vector_size = hidden_size
68 | self.output_dim = output_size
69 | self.num_layers = num_layers
70 | self.num_directions = 2 if bidirectional else 1
71 |
72 | self.seq_cat_count = seq_cat_count
73 | self.seq_cont_count = seq_cont_count
74 | self.non_seq_cat_count = non_seq_cat_count
75 | self.non_seq_cont_count = non_seq_cont_count
76 |
77 | # Linear Layers for non_seq_data parallel to LSTM
78 | if self.no_of_embs_non_seq != 0:
79 | first_lin_layer = nn.Linear(self.no_of_embs_non_seq, lin_layer_sizes_non_sequential[0])
80 | self.lin_layersnon_sequential = nn.ModuleList([first_lin_layer] +
81 | [nn.Linear(lin_layer_sizes_non_sequential[i],
82 | lin_layer_sizes_non_sequential[i + 1])
83 | for i in range(len(lin_layer_sizes_non_sequential) - 1)])
84 | for lin_layer in self.lin_layersnon_sequential:
85 | nn.init.kaiming_normal_(lin_layer.weight.data)
86 |
87 | self.emb_dropout_layer_non_sequential = nn.Dropout(emb_dropout_non_seq)
88 | self.dropout_layersnon_sequential = nn.ModuleList(
89 | [nn.Dropout(size) for size in lin_layer_dropouts_non_sequential])
90 | self.bn_layersnon_sequential = nn.ModuleList(
91 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_non_sequential])
92 |
93 | # Linear Layers for seq_cat_data
94 | if self.no_of_embs_seq != 0:
95 | first_lin_layer_seq = nn.Linear(self.no_of_embs_seq, lin_layer_sizes_sequential[0])
96 | self.lin_layers_seq = nn.ModuleList([first_lin_layer_seq] +
97 | [nn.Linear(lin_layer_sizes_sequential[i],
98 | lin_layer_sizes_sequential[i + 1])
99 | for i in range(len(lin_layer_sizes_sequential) - 1)])
100 | for lin_layer in self.lin_layers_seq:
101 | nn.init.kaiming_normal_(lin_layer.weight.data)
102 |
103 | self.emb_dropout_layer_seq = nn.Dropout(emb_dropout_seq)
104 | self.dropout_layers_seq = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts_sequential])
105 | self.bn_layers_seq = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes_sequential])
106 |
107 | # Output Layer
108 | self.output_layer = nn.Linear(lin_layer_sizes_fin[-1], output_size)
109 | nn.init.kaiming_normal_(self.output_layer.weight.data)
110 |
111 | # LSTM layer
112 | self.lstmLayer = nn.LSTM(
113 | self.input_dim + lin_layer_sizes_sequential[-1],
114 | self.hidden_size, self.num_layers, batch_first=True, bidirectional=bidirectional)
115 | # self.lstmLayer = nn.LSTM(
116 | # self.input_dim+self.no_of_embs_seq,
117 | # self.hidden_size, self.num_layers, batch_first=True, bidirectional=bidirectional)
118 |
119 | # Linear Layers post LSTM
120 | self.lin_layer_lstm_to_dense = nn.Linear(
121 | self.num_directions*self.hidden_size, self.hidden_size)
122 |
123 | # Attention
124 | self.bahdanau_attention_layer = caspr.models.attention_mechanisms.BahdanauAttention(
125 | self.hidden_size, self.num_directions)
126 |
127 | # self.fc_encoder = nn.Linear(
128 | # self.num_directions*self.hidden_size, self.hidden_size, bias=False)
129 |
130 | # self.attnHidden = nn.Linear(self.hidden_size, 1)
131 |
132 | self.fin_layer = nn.Linear(
133 | self.num_directions*self.hidden_size +
134 | self.context_vector_size + self.no_of_embs_non_seq + self.non_seq_cont_count, hidden_size)
135 | # self.fin_layer = nn.Linear(
136 | # self.num_directions*self.hidden_size + self.context_vector_size , hidden_size)
137 |
    def forward(self, input_tensor):  # noqa : R0914
        """Run a forward pass of model over the data.

        Args:
            input_tensor: 2-D tensor of shape [batch, features] holding the
                four feature groups flattened side by side, in the order
                sequential-categorical, sequential-continuous,
                non-sequential-categorical, non-sequential-continuous.

        Returns:
            Tuple (output, (hn_, cn_last)): ``output`` is the per-timestep LSTM
            output, ``hn_`` the fused encoded vector from the final linear
            layer, ``cn_last`` the top-layer forward-direction cell state.
        """
        # Split points of each feature group inside the flat input tensor.
        seq_cat_index = self.seq_len * self.seq_cat_count
        seq_cont_index = seq_cat_index + self.seq_len * self.seq_cont_count
        non_seq_cat_index = seq_cont_index + self.non_seq_cat_count
        non_seq_cont_index = non_seq_cat_index + self.non_seq_cont_count

        # Slice the flat tensor back into its four groups.
        seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = input_tensor[:, :seq_cat_index], \
            input_tensor[:, seq_cat_index: seq_cont_index], \
            input_tensor[:, seq_cont_index: non_seq_cat_index], \
            input_tensor[:, non_seq_cat_index: non_seq_cont_index]
        # Categorical codes must be integer indices for the embedding layers.
        seq_cat_data = seq_cat_data.type(torch.LongTensor)
        # Unflatten the sequential groups to (batch, seq_len, features_per_step).
        seq_cat_data = seq_cat_data.reshape(
            seq_cat_data.shape[0], self.seq_len, int(seq_cat_data.shape[1]/self.seq_len))
        seq_cont_data = seq_cont_data.reshape(
            seq_cont_data.shape[0], self.seq_len, int(seq_cont_data.shape[1]/self.seq_len))

        if self.no_of_embs_non_seq != 0:
            non_seq_cat_data = non_seq_cat_data.type(
                torch.LongTensor).to(self.device)
            # across all rows and column i - useful for batches
            non_seq_cat_inp = [emb_layer(non_seq_cat_data[:, i])
                               for i, emb_layer in enumerate(self.non_seq_emb_layers)]
            non_seq_cat_inp = torch.cat(non_seq_cat_inp, 1)
            non_seq_cat_inp = self.emb_dropout_layer_non_sequential(non_seq_cat_inp)
            if self.non_seq_cont_count != 0:
                non_seq_inp = torch.cat((non_seq_cat_inp.type(torch.FloatTensor).to(
                    self.device), non_seq_cont_data.type(torch.FloatTensor).to(self.device)), 1)
            else:
                non_seq_inp = non_seq_cat_inp.type(torch.FloatTensor).to(self.device)
        elif self.non_seq_cont_count != 0:
            non_seq_inp = non_seq_cont_data.type(torch.FloatTensor).to(self.device)
        # NOTE(review): if both non-sequential counts are zero, non_seq_inp is
        # never bound and the torch.cat near the end raises NameError - confirm
        # callers always provide at least one non-sequential feature.

        if self.no_of_embs_seq != 0:
            seq_cat_data = seq_cat_data.type(
                torch.LongTensor).to(self.device)
            # across all rows and column i - useful for batches
            seq_cat_inp = [emb_layer(seq_cat_data[:, :, i])
                           for i, emb_layer in enumerate(self.seq_emb_layers)]
            # shape = batchsize * seq_len * 16(emb size)
            seq_cat_inp = torch.cat(seq_cat_inp, 2)
            seq_cat_inp = self.emb_dropout_layer_seq(seq_cat_inp)
            seq_cat_inp_emb = seq_cat_inp
            # Refine the embeddings through the per-timestep linear stack;
            # batch norm is applied one timestep at a time because BatchNorm1d
            # expects 2-D (batch, features) input.
            for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers_seq,
                                                          self.dropout_layers_seq, self.bn_layers_seq):
                seq_cat_inp_emb = F.relu(lin_layer(seq_cat_inp_emb))
                seq_cat_inp_emb = torch.cat([bn_layer(seq_cat_inp_emb[:, i, :]).unsqueeze(1)
                                             for i in range(self.seq_len)], 1)
                seq_cat_inp_emb = dropout_layer(seq_cat_inp_emb)

            seq_cat_inp_emb = seq_cat_inp_emb.to(self.device)
            # shape seq_cat = batchsize * seq_len * emb size/lin_layers_seq[-1].shape
            # shape seq_cont = batchsize * seq_len * data

            seq_data = torch.cat([seq_cat_inp_emb, seq_cont_data], 2)

        # now the sequential data
        inp_tens = seq_data

        temp_batch_size = inp_tens.size()[0]

        # Fresh zero-initialised hidden and cell states for every forward pass.
        h0 = torch.zeros(self.num_directions*self.num_layers, temp_batch_size, self.hidden_size).to(
            self.device).requires_grad_()
        c0 = torch.zeros(self.num_directions*self.num_layers, temp_batch_size, self.hidden_size).to(
            self.device).requires_grad_()

        output, (hn, cn) = self.lstmLayer(inp_tens, (h0, c0))

        # Attention context over all timestep outputs.
        context_vector = self.bahdanau_attention_layer(output)

        # Separate the flat (layers*directions) axis into layer/direction axes.
        hn = hn.view(self.num_layers, self.num_directions, -
                     1, self.hidden_size).to(self.device)
        cn_ = cn.view(self.num_layers, self.num_directions, -
                      1, self.hidden_size).to(self.device)
        if self.num_directions > 1:
            # Bidirectional: concatenate forward and backward top-layer states.
            seq_inp = self.lin_layer_lstm_to_dense(torch.cat(
                [hn[self.num_layers-1, 0], hn[self.num_layers-1, -1]], 1).unsqueeze(0))
        else:
            seq_inp = self.lin_layer_lstm_to_dense(
                hn[self.num_layers-1, 0]).unsqueeze(0)

        seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2])

        context_vector = context_vector.reshape(
            context_vector.size()[0], context_vector.size()[2])

        # Fuse non-sequential features, LSTM summary and attention context.
        fin_input = torch.cat((non_seq_inp, seq_inp, context_vector), 1)

        hn_ = F.relu(self.fin_layer(fin_input))

        return output, (hn_, cn_[self.num_layers-1, 0, :, :].unsqueeze(0))
238 |
--------------------------------------------------------------------------------
/caspr/models/lstm_decoder.py:
--------------------------------------------------------------------------------
1 | """CASPR LSTM decoder base class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
class LSTM_attention_embedding_decoder(nn.Module):  # noqa: W0223
    """Simple LSTM decoder.

    Decodes one timestep per call: each invocation consumes a single input
    step plus the previous hidden state, and emits continuous reconstructions
    together with per-variable categorical logits.
    """

    def __init__(self,  # noqa: R0913
                 input_dim,
                 hidden_size,
                 output_dim,
                 num_classes,
                 num_layers=1):
        """Initialize model with params.

        Args:
            input_dim: number of input features per decoding step
            hidden_size: hidden size of the LSTM
            output_dim: number of continuous outputs produced per step
            num_classes: iterable with the class count of every categorical output
            num_layers: number of stacked LSTM layers
        """
        super().__init__()

        self.input_dim = input_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.output_dim = output_dim

        # Single-step LSTM; batch dimension comes first.
        self.lstm_layer = nn.LSTM(
            input_dim, hidden_size, num_layers, batch_first=True)

        # Head for the continuous outputs.
        self.linear = nn.Linear(self.hidden_size, output_dim)

        # One logit head per categorical variable.
        self.output = nn.ModuleList(
            [nn.Linear(self.hidden_size, num_class) for num_class in self.num_classes])
        self.hidden = None

    def forward(self, inp, hidden):
        """Forward pass through LSTM layer.

        shape of lstm_out: [input_size, batch_size, hidden_dim]
        shape of self.hidden: (a, b), where a and b both
        have shape (num_layers, batch_size, hidden_dim).
        """
        # Treat each call as a sequence of length one.
        step = inp.view(inp.shape[0], 1, -1)
        self.hidden = hidden

        lstm_out, self.hidden = self.lstm_layer(step, self.hidden)
        decoder_out = torch.tanh(lstm_out[:, -1, :])

        # Continuous head, clamped non-negative via ReLU.
        out_cont = F.relu(self.linear(decoder_out))
        # One set of logits per categorical head.
        out_cat = [output_layer(decoder_out) for output_layer in self.output]

        return out_cont, self.hidden, out_cont, out_cat
58 |
--------------------------------------------------------------------------------
/caspr/models/lstm_timeseries_tpa_attention.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """TPA attention based LSTM encoder."""
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 |
class LSTM_TPA_attention_timeseries(nn.Module):  # noqa: W0223
    """TPA attention based LSTM encoder.

    Embeds categorical inputs, encodes the sequential features with an LSTM,
    scores the hidden states with TPA-style (temporal pattern attention)
    convolutional filters, and fuses the attention context with the
    non-sequential features through a final MLP head.
    """

    def __init__(self,  # noqa: R0913
                 emb_dims_non_seq,
                 emb_dims_seq,
                 lin_layer_sizes_non_sequential,
                 lin_layer_sizes_sequential,
                 non_seq_cont_count,
                 hidden_size,
                 output_size,
                 emb_dropout_non_seq,
                 lin_layer_dropouts_non_sequential,
                 emb_dropout_seq,
                 lin_layer_dropouts_sequential,
                 lin_layer_sizes_fin,
                 lin_layer_dropouts_fin,
                 seq_len, input_dim, device):
        """Initialise the pytorch LSTM layer.

        Args:
            emb_dims_non_seq, emb_dims_seq (list of int tuples):
                List of category dimension and corresponding embedding size.
            lin_layer_sizes_non_sequential, lin_layer_sizes_sequential (list of int):
                Sizes of the stacked linear layers used for embedding dimension
                reduction and non-linearity. lin_layer_sizes_non_sequential must be
                non-empty: its last entry contributes to the LSTM input size.
            non_seq_cont_count (int): number of non-sequential continuous features
            hidden_size (int): Size of the hidden state
            output_size (int): Size of the final output layer
            emb_dropout_non_seq, emb_dropout_seq (float): dropout values for embedding layers
            lin_layer_dropouts_non_sequential, lin_layer_dropouts_sequential (list of float):
                dropout values for linear layers corresponding to embedding layers
            lin_layer_sizes_fin (list of int):
                sizes of the final MLP combining sequential and non-sequential inputs
            lin_layer_dropouts_fin (list of float): dropouts for the final MLP
            seq_len (int): Length of input Sequence
            input_dim (int): extra sequential input width; the LSTM consumes
                input_dim + lin_layer_sizes_non_sequential[-1] features per step
            device: torch device the model runs on
        """
        super().__init__()

        self.device = device
        # Embedding layers for the categorical variables.
        # NOTE: some attribute names below (e.g. lin_layersnon_sequential) keep
        # their historical spelling so existing checkpoints keep loading.
        self.non_seq_emb_layers = nn.ModuleList(
            [nn.Embedding(x, y) for x, y in emb_dims_non_seq])
        self.seq_emb_layers = nn.ModuleList(
            [nn.Embedding(x, y) for x, y in emb_dims_seq])
        self.no_of_embs_non_seq = sum(y for _, y in emb_dims_non_seq)
        self.no_of_embs_seq = sum(y for _, y in emb_dims_seq)
        self.input_dim = input_dim
        self.seq_len = seq_len
        self.hidden_size = hidden_size
        self.non_seq_cont_count = non_seq_cont_count
        self.context_vector_size = hidden_size
        self.output_dim = output_size

        # Linear stack refining the non-sequential embeddings.
        if self.no_of_embs_non_seq != 0:
            first_lin_layer = nn.Linear(self.no_of_embs_non_seq, lin_layer_sizes_non_sequential[0])
            self.lin_layersnon_sequential = nn.ModuleList([first_lin_layer] +
                                                          [nn.Linear(lin_layer_sizes_non_sequential[i],
                                                                     lin_layer_sizes_non_sequential[i + 1])
                                                           for i in range(len(lin_layer_sizes_non_sequential) - 1)])
            for lin_layer in self.lin_layersnon_sequential:
                nn.init.kaiming_normal_(lin_layer.weight.data)

            self.emb_dropout_layernon_sequential = nn.Dropout(emb_dropout_non_seq)
            self.dropout_layersnon_sequential = nn.ModuleList(
                [nn.Dropout(size) for size in lin_layer_dropouts_non_sequential])
            self.bn_layersnon_sequential = nn.ModuleList(
                [nn.BatchNorm1d(size) for size in lin_layer_sizes_non_sequential])

        # Linear Layers for seq_cat_data
        if self.no_of_embs_seq != 0:
            first_lin_layer_seq = nn.Linear(self.no_of_embs_seq, lin_layer_sizes_sequential[0])
            self.lin_layers_seq = nn.ModuleList([first_lin_layer_seq] +
                                                [nn.Linear(lin_layer_sizes_sequential[i],
                                                           lin_layer_sizes_sequential[i + 1])
                                                 for i in range(len(lin_layer_sizes_sequential) - 1)])
            for lin_layer in self.lin_layers_seq:
                nn.init.kaiming_normal_(lin_layer.weight.data)

            self.emb_dropout_layer_seq = nn.Dropout(emb_dropout_seq)
            self.dropout_layers_seq = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts_sequential])
            self.bn_layers_seq = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes_sequential])

        # BUG FIX: the original code created a throw-away output layer here and
        # then iterated over a non-existent attribute (self.lin_layer_non_sequential),
        # raising AttributeError on every construction. The duplicate layer and
        # the broken loop are removed; the single output layer is defined (and
        # Kaiming-initialised, as originally intended) at the end of __init__.

        # LSTM layer
        self.lstm_layer = nn.LSTM(
            self.input_dim + lin_layer_sizes_non_sequential[-1], self.hidden_size, batch_first=True)

        # Linear Layers post LSTM
        self.lin_layer_lstm_to_dense = nn.Linear(
            self.hidden_size, self.hidden_size)

        # TPA attention: one length-seq_len convolution per hidden dimension.
        self.convolution_filters = nn.ModuleList([nn.Conv1d(
            in_channels=1, out_channels=1, kernel_size=self.seq_len) for i in range(hidden_size)])
        self.tpa_linear = nn.Linear(
            self.hidden_size, self.hidden_size, bias=False)

        self.tpa_hiddent_linear = nn.Linear(
            self.hidden_size, self.hidden_size, bias=False)
        self.tpa_context_linear = nn.Linear(
            self.hidden_size, self.hidden_size, bias=False)

        # Final MLP over [LSTM summary | attention context | non-sequential features].
        first_fin_layer = nn.Linear(self.hidden_size + self.context_vector_size +
                                    self.no_of_embs_non_seq + self.non_seq_cont_count, lin_layer_sizes_fin[0])

        self.lin_layers_final = nn.ModuleList([first_fin_layer] +
                                              [nn.Linear(lin_layer_sizes_fin[i],
                                                         lin_layer_sizes_fin[i + 1])
                                               for i in range(len(lin_layer_sizes_fin) - 1)])
        for lin_layer in self.lin_layers_final:
            nn.init.kaiming_normal_(lin_layer.weight.data)

        # final dropout and batch norm layers for final prediction
        self.dropout_layers_final = nn.ModuleList(
            [nn.Dropout(size) for size in lin_layer_dropouts_fin])
        self.bn_layers_final = nn.ModuleList(
            [nn.BatchNorm1d(size) for size in lin_layer_sizes_fin])

        # Output Layer
        self.output_layer = nn.Linear(lin_layer_sizes_fin[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

    def forward(self, seq_cont_data, seq_cat_data, non_seq_cat_data, non_seq_cont_data):  # noqa : R0914
        """Run a forward pass of model over the data.

        Args:
            seq_cont_data: float tensor [batch, seq_len, seq_cont_count]
            seq_cat_data: integer codes [batch, seq_len, n_seq_cat]
            non_seq_cat_data: integer codes [batch, n_non_seq_cat]
            non_seq_cont_data: float tensor [batch, non_seq_cont_count]

        Returns:
            Tuple (prediction, fin_input) where prediction is the output of the
            final MLP head and fin_input is the fused feature vector fed to it.
        """
        # --- non-sequential branch: embed categoricals and join continuous ---
        if self.no_of_embs_non_seq != 0:
            non_seq_cat_data = non_seq_cat_data.type(
                torch.LongTensor).to(self.device)
            # across all rows and column i - useful for batches
            # BUG FIX: use the layers actually defined in __init__
            # (non_seq_emb_layers / emb_dropout_layernon_sequential); the
            # previous attribute names did not exist and raised AttributeError.
            non_seq_cat_inp = [emb_layer(non_seq_cat_data[:, i])
                               for i, emb_layer in enumerate(self.non_seq_emb_layers)]
            non_seq_cat_inp = torch.cat(non_seq_cat_inp, 1)
            non_seq_cat_inp = self.emb_dropout_layernon_sequential(non_seq_cat_inp)
            if self.non_seq_cont_count != 0:
                non_seq_inp = torch.cat((non_seq_cat_inp.type(torch.FloatTensor).to(
                    self.device), non_seq_cont_data.type(torch.FloatTensor).to(self.device)), 1)
            else:
                non_seq_inp = non_seq_cat_inp.type(torch.FloatTensor).to(self.device)
        elif self.non_seq_cont_count != 0:
            non_seq_inp = non_seq_cont_data.type(torch.FloatTensor).to(self.device)
        # NOTE(review): with no non-sequential features at all, non_seq_inp
        # stays unbound and the final torch.cat raises - confirm callers
        # always pass at least one.

        # --- sequential branch: embed categoricals and join continuous ---
        if self.no_of_embs_seq != 0:
            seq_cat_data = seq_cat_data.type(
                torch.LongTensor).to(self.device)
            # across all rows and column i - useful for batches
            seq_cat_inp = [emb_layer(seq_cat_data[:, :, i])
                           for i, emb_layer in enumerate(self.seq_emb_layers)]
            seq_cat_inp = torch.cat(seq_cat_inp, 2)
            seq_cat_inp = self.emb_dropout_layer_seq(seq_cat_inp)

            seq_cat_inp_emb = seq_cat_inp

            seq_cat_inp_emb = seq_cat_inp_emb.to(self.device)

            seq_data = torch.cat([seq_cat_inp_emb, seq_cont_data], 2)

        # now the sequential data ------------------------------
        inp_tens = seq_data

        temp_batch_size = inp_tens.size()[0]

        # Zero-initialised hidden and cell states for the single-layer LSTM.
        h0 = torch.zeros(1, temp_batch_size, self.hidden_size).to(
            self.device).requires_grad_()
        c0 = torch.zeros(1, temp_batch_size, self.hidden_size).to(
            self.device).requires_grad_()

        output, (hn, _) = self.lstm_layer(inp_tens, (h0, c0))
        hn = hn.to(self.device)
        # output shape batch_size * seq_len * hidden_size
        # output[:, :, i] shape batch_size * seq_len - i-th row of the H matrix
        # Convolve each hidden dimension over time to build the TPA H^C matrix.
        hc = torch.zeros(temp_batch_size, self.hidden_size,
                         self.hidden_size).to(self.device)

        for i in range(self.hidden_size):
            for j in range(self.hidden_size):
                hc[:, i, j] = self.convolution_filters[j](
                    output[:, :, i].unsqueeze(1)).squeeze()

        # Attention score of every H^C row against the final hidden state.
        alpha = torch.zeros(temp_batch_size, self.hidden_size).to(self.device)

        for i in range(self.hidden_size):
            temp1 = self.tpa_linear(hc[:, i]).unsqueeze(1)
            temp2 = hn.squeeze().unsqueeze(2)
            temp = torch.bmm(temp1, temp2)
            # torch.sigmoid replaces the deprecated F.sigmoid.
            alpha[:, i] = torch.sigmoid(temp).squeeze()

        # Context vector: attention-weighted sum of the H^C rows.
        vt = torch.zeros(temp_batch_size, self.hidden_size).to(self.device)
        for i in range(self.hidden_size):
            temp = torch.bmm(alpha[:, i].unsqueeze(1).unsqueeze(
                2), hc[:, i].unsqueeze(1)).squeeze()
            vt += temp

        # Blend the final hidden state with the attention context.
        htprime = self.tpa_hiddent_linear(hn) + self.tpa_context_linear(vt)

        seq_inp = self.lin_layer_lstm_to_dense(hn)
        seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2])
        htprime = htprime.squeeze()

        # Linear mlp for prediction
        fin_input = torch.cat((non_seq_inp, seq_inp, htprime), 1)

        x = fin_input
        for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers_final, self.dropout_layers_final,
                                                      self.bn_layers_final):
            x = F.relu(lin_layer(x))
            x = bn_layer(x)
            x = dropout_layer(x)

        x = F.relu(self.output_layer(x))

        return x, fin_input
229 |
--------------------------------------------------------------------------------
/caspr/models/mlp.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | """CASPR mlp base class."""
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from caspr.models.dense_bn_dropout import DenseBnDropout
8 |
9 |
class MLP(nn.Module):  # noqa: W0223
    """Simple feed-forward head: a dense/batch-norm/dropout stack followed by an output layer."""

    def __init__(self,  # noqa: R0913
                 input_size,
                 lin_layer_sizes,
                 lin_layer_dropouts,
                 output_size,
                 use_sigmoid=False):
        """Initialize model with params."""
        super().__init__()

        self.output_size = output_size
        self.use_sigmoid = use_sigmoid

        # Hidden stack applied just before the prediction layer.
        self.dense_bn_dropout = DenseBnDropout(
            input_size=input_size, lin_layer_sizes=lin_layer_sizes,
            lin_layer_dropouts=lin_layer_dropouts)

        # Output layer with Kaiming-initialised weights.
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

    def forward(self, inp):
        """Run a forward pass of model over the data."""
        hidden = self.dense_bn_dropout(inp)
        logits = self.output_layer(hidden)
        return torch.sigmoid(logits) if self.use_sigmoid else logits
39 |
--------------------------------------------------------------------------------
/caspr/models/multi_layer_lstm.py:
--------------------------------------------------------------------------------
1 | """CASPR LSTM base class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 |
class MultiLayerLSTM(nn.Module):  # noqa: W0223
    """Encapsulates the Pytorch LSTM.

    Adds aggregation / concatenation on top of the raw LSTM so that
    bidirectional and multi-layered configurations all expose a single
    fixed-size summary vector alongside the usual LSTM outputs.
    """

    def __init__(self, input_size, hidden_size, dropout=0., num_layers=1, bidirectional=False):  # noqa: R0913
        """Initialise the pytorch LSTM layer.

        Args:
            input_size = number of input features per timestep
            hidden_size = the hidden size of the lstm
            dropout = dropout applied between stacked lstm layers (only takes
                effect when num_layers > 1)
            num_layers = number of stacked lstm layers
            bidirectional = whether the lstm runs in both directions
        """
        super().__init__()
        self.lstm_layer = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True,
                                  bidirectional=bidirectional, dropout=dropout)
        self.num_directions = 2 if bidirectional else 1
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # Projects the (possibly direction-concatenated) final hidden state
        # back down to hidden_size.
        self.lin_layer_lstm_to_dense = nn.Linear(
            self.num_directions * self.hidden_size, self.hidden_size)

    def forward(self, input_tensor, hidden_state=None):
        """Run a forward pass of model over the data."""
        batch_size = input_tensor.size()[0]
        device = input_tensor.device

        if hidden_state is None:
            output, (hn, cn) = self.lstm_layer(input_tensor)
        else:
            # Caller supplies h0; the cell state always starts at zero.
            c0 = torch.zeros(self.num_directions * self.num_layers,
                             batch_size, self.hidden_size).to(device)
            output, (hn, cn) = self.lstm_layer(input_tensor, (hidden_state, c0))

        # Separate the flat (layers*directions) axis into layer/direction axes.
        hn = hn.view(self.num_layers, self.num_directions, -1, self.hidden_size)
        cn = cn.view(self.num_layers, self.num_directions, -1, self.hidden_size)

        top = self.num_layers - 1
        if self.num_directions == 1:
            seq_inp = self.lin_layer_lstm_to_dense(hn[top, 0]).unsqueeze(0)
        else:
            # Concatenate the forward and backward states of the top layer.
            both_dirs = torch.cat([hn[top, 0], hn[top, -1]], 1)
            seq_inp = self.lin_layer_lstm_to_dense(both_dirs.unsqueeze(0))

        seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2])

        return output, (hn[top, 0, :, :], cn[top, 0, :, :]), seq_inp
64 |
--------------------------------------------------------------------------------
/caspr/models/transformer.py:
--------------------------------------------------------------------------------
1 | """CASPR transformer base class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 |
6 | from caspr.models.attention_mechanisms import MultiHeadAttentionLayer
7 |
8 |
class TransformerEncoderLayer(nn.Module):  # noqa: W0223
    """TransformerEncoderLayer is made up of self-attn and feedforward network.

    Args:
        hid_dim: the hidden size of the encoder
        n_heads: the number of heads in the multi-head attention layers
        pf_dim: the dimension of the feedforward network model
        dropout: the dropout value
    """

    def __init__(self,  # noqa: R0913
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout):
        """Initialize model with params."""
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Run a forward pass of model over the data.

        src: [batch size, src len, hid dim]
        src_mask: [batch size, 1, 1, src len]
        Returns src with the same shape.
        """
        # Self-attention, then dropout + residual connection + layer norm.
        attended, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(attended))

        # Position-wise feedforward, again with dropout + residual + layer norm.
        transformed = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(transformed))

        return src
60 |
61 |
class TransformerEncoder(nn.Module):  # noqa: W0223
    """TransformerEncoder is a stack of N encoder layers.

    Args:
        hid_dim: the hidden size of the encoder.
        n_layers: the number of sub-encoder-layers in the encoder
        n_heads: the number of heads in the multi-head attention layers
        pf_dim: the dimension of the feedforward network model
        dropout: the dropout value
        max_length: the maximum length of the input sequence
    """

    def __init__(self,  # noqa: R0913
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 max_length=100):
        """Initialize model with params."""
        super().__init__()

        # Learned positional embedding shared by all layers of the stack.
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(hid_dim, n_heads, pf_dim, dropout)
             for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

        # sqrt(hid_dim) scaling factor, registered as a buffer so it follows
        # the module between devices.
        self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim])))

    def _make_src_mask(self, batch_size, src_len, device):
        # Every position is attendable: [batch size, 1, 1, src len]
        return torch.ones((batch_size, 1, 1, src_len), device=device).bool()

    def forward(self, src):
        """Run a forward pass of model over the data.

        src: [batch size, src len, hid dim]
        Returns the encoded sequence plus the (all-ones) source mask
        of shape [batch size, 1, 1, src len].
        """
        batch_size, src_len = src.shape[0], src.shape[1]
        device = src.device

        src_mask = self._make_src_mask(batch_size, src_len, device)

        # Position indices, one row per batch element: [batch size, src len]
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device)

        # Scale the input and add positional information before the stack.
        src = self.dropout(src * self.scale + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)

        return src, src_mask
133 |
134 |
class PositionwiseFeedforwardLayer(nn.Module):  # noqa: W0223
    """Fully connected feed-forward network consisting of two linear transformations with a ReLU activation in between.

    Args:
        hid_dim: the hidden size of the encoder
        pf_dim: the dimension of the feedforward network model
        dropout: the dropout value
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        """Initialize model with params."""
        super().__init__()

        # Expand to pf_dim, then project back down to hid_dim.
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run a forward pass of model over the data.

        x: [batch size, seq len, hid dim] -> output of the same shape.
        """
        expanded = self.dropout(torch.relu(self.fc_1(x)))  # [batch size, seq len, pf dim]
        return self.fc_2(expanded)                         # [batch size, seq len, hid dim]
167 |
168 |
class TransformerDecoderLayer(nn.Module):  # noqa: W0223
    """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.

    Args:
        hid_dim: the hidden size of the decoder
        n_heads: the number of heads in the multi-head attention layers
        pf_dim: the dimension of the feedforward network model
        dropout: the dropout value
    """

    def __init__(self,  # noqa: R0913
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout):
        """Initialize model with params."""
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(
            hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Run a forward pass of model over the data.

        trg: [batch size, trg len, hid dim]
        enc_src: [batch size, src len, hid dim]
        trg_mask: [batch size, 1, trg len, trg len]
        src_mask: [batch size, 1, 1, src len]
        Returns the transformed target and the encoder attention weights
        ([batch size, n heads, trg len, src len]).
        """
        # Masked self-attention over the target; dropout + residual + layer norm.
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(self_attended))

        # Cross-attention over the encoder output; dropout + residual + layer norm.
        cross_attended, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(cross_attended))

        # Position-wise feedforward; dropout + residual + layer norm.
        transformed = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(transformed))

        return trg, attention
232 |
233 |
class TransformerDecoder(nn.Module):  # noqa: W0223
    """TransformerDecoder is a stack of N decoder layers.

    Args:
        hid_dim: the hidden size of the decoder
        n_layers: the number of sub-decoder-layers in the decoder
        n_heads: the number of heads in the multi-head attention layers
        pf_dim: the dimension of the feedforward network model
        dropout: the dropout value
        pos_embedding: learned positional encoding added to the input embedding
    """

    def __init__(self,  # noqa: R0913
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 pos_embedding):
        """Initialize model with params."""
        super().__init__()

        # Positional embedding is passed in so it can be shared with the encoder.
        self.pos_embedding = pos_embedding

        self.layers = nn.ModuleList(
            [TransformerDecoderLayer(hid_dim, n_heads, pf_dim, dropout)
             for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

        # sqrt(hid_dim) scaling factor, registered as a buffer so it follows
        # the module between devices.
        self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim])))

    def _make_trg_mask(self, batch_size, trg_len, device):
        # Lower-triangular (causal) mask: [batch size, 1, trg len, trg len]
        return torch.tril(torch.ones((batch_size, 1, trg_len, trg_len), device=device)).bool()

    def forward(self, trg, enc_src, src_mask):
        """Run a forward pass of model over the data.

        trg: [batch size, trg len, hid dim]
        enc_src: [batch size, src len, hid dim]
        src_mask: [batch size, 1, 1, src len]
        Returns the decoded sequence and the last layer's encoder attention
        ([batch size, n heads, trg len, src len]).
        """
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        device = trg.device

        trg_mask = self._make_trg_mask(batch_size, trg_len, device)

        # Position indices, one row per batch element: [batch size, trg len]
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(device)

        # Scale the input and add positional information before the stack.
        trg = self.dropout(trg * self.scale + self.pos_embedding(pos))

        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        return trg, attention
307 |
--------------------------------------------------------------------------------
/caspr/models/unified_encoder.py:
--------------------------------------------------------------------------------
1 | """CASPR LSTM model."""
2 |
3 | import warnings
4 |
5 | import numpy as np
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | from caspr.models.attention_mechanisms import BahdanauAttention, MultiHeadAttentionLSTMWrapper
11 | from caspr.models.convolutional_aggregation import ConvAggregation
12 | from caspr.models.dense_bn_dropout import DenseBnDropout
13 | from caspr.models.embedding_layer import CategoricalEmbedding
14 | from caspr.models.multi_layer_lstm import MultiLayerLSTM
15 |
# Module-level side effect: silences all warnings process-wide.
warnings.simplefilter('ignore')


# Order of the four feature groups (sequential/non-sequential x
# categorical/continuous); presumably used to index a grouped input
# tuple - confirm against UnifiedEncoder usage.
SEQ_CAT_INDEX = 0
SEQ_CONT_INDEX = 1
NON_SEQ_CAT_INDEX = 2
NON_SEQ_CONT_INDEX = 3
23 |
24 |
25 | class UnifiedEncoder(nn.Module): # noqa: R0902, W0223
26 | """Encapsulates the basic structure to run most of our models.
27 |
28 | It checks the various conditions for the presence
29 | or the absence of data and is compatible with functionalities like
30 | 1. Usage of pretrained embedding vectors
31 | 2. Multi-Layered LSTM use
32 | 3. Convolutional Aggregation for data
33 | 4. Self-Multi-Head and Bahdanau Attention (when number of heads = 1, Bahdanau is used by default)
34 |
35 | In this new edition, it is compatible with the DLExplainer module and should be used if
36 | explainability is a requirement
37 | """
38 |
39 | def __init__(self, # noqa: R0912, R0913, R0914, R0915
40 | emb_dims_non_seq,
41 | emb_dropout_non_seq,
42 | emb_dims_seq,
43 | emb_dropout_seq,
44 | emb_lin_layer_sizes_non_seq,
45 | emb_lin_layer_dropouts_non_seq,
46 | emb_lin_layer_sizes_seq,
47 | emb_lin_layer_dropouts_seq,
48 | lstm_hidden_size,
49 | output_size,
50 | seq_len,
51 | non_seq_cont_count,
52 | seq_cat_count,
53 | seq_cont_count,
54 | non_seq_cat_count,
55 | attention_heads=1,
56 | non_seq_pretrained_embs=None,
57 | freeze_non_seq_pretrained_embs=True,
58 | seq_pretrained_embs=None,
59 | freeze_seq_pretrained_embs=True,
60 | lstm_num_layers=1,
61 | lstm_bidirectional=False,
62 | use_conv_agg=False,
63 | kernel_size=(3, 3),
64 | max_pool_size=(2, 2),
65 | stride=(2, 2)):
66 | """Initialize of all the variables and the layers depending on the arguments supplied.
67 |
68 | Args:
69 | emb_dims_non_seq = (List of tuples (x, y)) where x is the vocab size and y is the number of dimensions
70 | for the respective embedding layer for every non_sequential categorical variable
71 | emb_dropout_non_seq = (Float) Dropout value of a layer used after the embedding layer - non_sequential
72 | emb_dims_seq = (List of tuples (x, y)) where x is the vocab size and y is the number of dimensions for the
73 | respective embedding layer for every sequential categorical variable
74 | emb_dropout_seq = (Float) Dropout value of a layer used after the embedding layer - sequential
75 | emb_lin_layer_sizes_non_seq = (List of integers) determining the sizes of the stacked linear layers
76 | used just after the embedding layers to learn better representations for non_sequential
77 | categorical variables
78 | emb_lin_layer_dropouts_non_seq = (List of float) values determining the p values in the dropout
79 | layers between linear layers
80 | emb_lin_layer_sizes_seq = (List of integers) determining the sizes of the stacked linear layers
81 | used just after the embedding layers to learn better representations for sequential
82 | categorical variables
83 | emb_lin_layer_dropouts_seq = (List of float) values determining the p values in the dropout
84 | layers between linear layers
85 | lstm_hidden_size = (Integer) determining the Hidden size of the LSTM layer used to train the sequence model
86 | output_size = (Integer) Size of the final embedded output by the encoder.
87 | seq_len = (Integer) determining the length of the sequence in input
88 | non_seq_cont_count = (Integer) Number of non_sequential continuous variables
89 | seq_cat_count = (Integer) Number of sequential categorical variables
90 | seq_cont_count = (Integer) Number of sequential continuous variables
91 | non_seq_cat_count = (Integer) Number of non_sequential categorical variables
92 | attention_heads = (Integer: Default = 1) Describes the number of attention heads being used after the LSTM.
93 | When 0 means that attention is not being used.
94 | When = 1 uses Bahdanau attention by default and
95 | When > 1 uses Multi-Head self-attention
96 | non_seq_pretrained_embs = (List of Tensors: Default = None) To be used as pretrained embeddings
97 | in the embedding layers
98 | freeeze_non_seq_pretrained_embs = (Boolean: Default = True) Determines if the pretrained embeddings
99 | are to be left untouched during backprop
100 | seq_pretrained_embs = (List of Tensors: Default = None) To be used as pretrained embeddings in the
101 | embedding layers,
102 | freeeze_seq_pretrained_embs = (Boolean: Default = True) Determines if the pretrained embeddings
103 | are to be left untouched during backprop
104 | lstm_num_layers = (Integer: Default = 1) The number of stacked LSTM layers used
105 | lstm_bidirectional = (Boolean: Default = False) Determines if the LSTM used is bidirectional
106 | use_conv_agg = (Boolean: Default = False) Determines if Convolutional aggregation is to be used in
107 | the model or not
108 | kernel_size = (Tuple of Integers : Default = (3,3)) Determines the kernel size of the cnn aggregator
109 | max_pool_size = (Tuple of Integers : Default = (2, 2)) Determines the max_pool size of the cnn aggregator
110 | stride = (Tuple of Integers : Default = (2, 2)) Determines the stride of the cnn aggregator
111 | """
112 | super().__init__()
113 |
114 | self._explain = False
115 | self.non_seq_emb_layers = CategoricalEmbedding(emb_dims=emb_dims_non_seq, emb_dropout=emb_dropout_non_seq,
116 | pretrained_vecs=non_seq_pretrained_embs,
117 | freeze_pretrained=freeze_non_seq_pretrained_embs)
118 | self.seq_emb_layers = CategoricalEmbedding(emb_dims=emb_dims_seq, emb_dropout=emb_dropout_seq, is_seq=True,
119 | pretrained_vecs=seq_pretrained_embs,
120 | freeze_pretrained=freeze_seq_pretrained_embs)
121 |
122 | self.no_of_embs_non_seq = np.sum([y for x, y in emb_dims_non_seq])
123 | self.no_of_embs_seq = np.sum([y for x, y in emb_dims_seq])
124 |
125 | self.non_seq_cat_final_size = 0
126 | self.seq_len = seq_len
127 | self.hidden_size = lstm_hidden_size
128 | self.context_vector_size = lstm_hidden_size
129 | self.output_dim = output_size
130 | self.num_layers = lstm_num_layers
131 | self.num_directions = 2 if lstm_bidirectional else 1
132 |
133 | self.seq_cat_count = seq_cat_count
134 | self.seq_cont_count = seq_cont_count
135 | self.non_seq_cat_count = non_seq_cat_count
136 | self.non_seq_cont_count = non_seq_cont_count
137 | self.attention_heads = attention_heads
138 |
139 | self.use_conv_agg = use_conv_agg
140 |
141 | # Linear Layers for non_seq_data parallel to LSTM
142 | if self.no_of_embs_non_seq != 0:
143 | self.emb_lin_layer_non_seq = DenseBnDropout(
144 | lin_layer_sizes=emb_lin_layer_sizes_non_seq,
145 | lin_layer_dropouts=emb_lin_layer_dropouts_non_seq, input_size=self.no_of_embs_non_seq)
146 | self.non_seq_cat_final_size = emb_lin_layer_sizes_non_seq[-1]
147 |
148 | # LSTM layer
149 | if self.no_of_embs_seq != 0:
150 | self.emb_lin_layer_seq = DenseBnDropout(
151 | lin_layer_sizes=emb_lin_layer_sizes_seq,
152 | lin_layer_dropouts=emb_lin_layer_dropouts_seq, input_size=self.no_of_embs_seq)
153 |
154 | # LSTM layer
155 | if self.no_of_embs_seq != 0:
156 | self.emb_lin_layer_seq = DenseBnDropout(
157 | lin_layer_sizes=emb_lin_layer_sizes_seq,
158 | lin_layer_dropouts=emb_lin_layer_dropouts_seq, input_size=self.no_of_embs_seq)
159 | self.lstm_inp_size = emb_lin_layer_sizes_seq[-1] + seq_cont_count
160 | else:
161 | self.lstm_inp_size = seq_cont_count
162 |
163 | if use_conv_agg and seq_len >= kernel_size[0] and \
164 | (min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count) >= kernel_size[1] and \
165 | int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count -
166 | (kernel_size[1] - 1))/stride[1]) >= max_pool_size[1] and \
167 | int((seq_len - (kernel_size[0] - 1))/stride[0]) >= max_pool_size[0]:
168 | # kernel_size[0] -> size of kernel along sequence dimension, hence must be <= seq_len
169 | # kernel_size[1] -> size of kernel along features dimension, hence must be <= net size of input features
170 | # int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1]
171 | # + seq_cont_count - (kernel_size[i] - 1))/stride[i])
172 | # is the formula to calculate the final size of dimension i after the CNN filter is applied
173 | # the above size should be >= max_pool[i] for pooling
174 | self.conv_agg = ConvAggregation(
175 | kernel_size=kernel_size, stride=stride, max_pool_size=max_pool_size, dropout_size=0.4)
176 | self.lstm_inp_size = int((int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count - (
177 | kernel_size[1] - 1) - 1)/stride[1] + 1)) / max_pool_size[1])
178 | else:
179 | self.use_conv_agg = False
180 |
181 | if self.lstm_inp_size > 0:
182 | self.lstm_layer = MultiLayerLSTM(input_size=self.lstm_inp_size, hidden_size=self.hidden_size,
183 | num_layers=self.num_layers, bidirectional=lstm_bidirectional, dropout=0.4)
184 |
185 | # Attention
186 | if self.attention_heads > 0:
187 | if self.attention_heads == 1:
188 | self.bahdanau_attention_layer = BahdanauAttention(self.hidden_size, self.num_directions)
189 | else:
190 | n_head = self.attention_heads
191 | d_model = self.hidden_size
192 | self.multi_head_attention_layer = MultiHeadAttentionLSTMWrapper(n_head, d_model, dropout=0.1)
193 |
194 | if self.attention_heads > 0:
195 | self.output_layer = nn.Linear(int(self.num_directions*self.hidden_size + self.context_vector_size +
196 | self.non_seq_cat_final_size + self.non_seq_cont_count),
197 | int(self.hidden_size))
198 | else:
199 | self.output_layer = nn.Linear(int(self.num_directions*self.hidden_size +
200 | self.non_seq_cat_final_size + self.non_seq_cont_count),
201 | int(self.hidden_size))
202 | nn.init.kaiming_normal_(self.output_layer.weight.data)
203 |
    def forward(self, *args):  # noqa: R0912, R0914
        """Encode a batch given a variable set of input tensors.

        The last positional argument is a list of four indices -- one per data
        type, ordered (seq_cat, seq_cont, non_seq_cat, non_seq_cont) -- that
        point into ``args``; an entry of -1 marks that data type as absent.

        Returns:
            ``fin_output`` alone when ``self._explain`` is True, otherwise
            ``(output, (fin_output, cn))`` where ``output`` holds the LSTM
            per-step outputs and ``cn`` the last LSTM cell state.
        """
        nonempty_idx = args[-1]
        # True per data type when the corresponding tensor was actually supplied.
        data_exists = list(map(lambda x: x != -1, nonempty_idx))
        device = args[0].device
        batch_size = args[0].shape[0]

        # Substitute empty tensors for absent inputs so downstream shapes stay well-defined.
        seq_cat_data = args[nonempty_idx[SEQ_CAT_INDEX]] if data_exists[SEQ_CAT_INDEX] else torch.empty(batch_size, 0, 0, device=device)
        seq_cont_data = args[nonempty_idx[SEQ_CONT_INDEX]] if data_exists[SEQ_CONT_INDEX] else torch.empty(batch_size, 0, 0, device=device)
        non_seq_cat_data = args[nonempty_idx[NON_SEQ_CAT_INDEX]] if data_exists[NON_SEQ_CAT_INDEX] else torch.empty(batch_size, 0, device=device)
        non_seq_cont_data = args[nonempty_idx[NON_SEQ_CONT_INDEX]] if data_exists[NON_SEQ_CONT_INDEX] else torch.empty(batch_size, 0, device=device)

        # Non-sequential branch: embed categoricals, then append continuous features.
        if self.no_of_embs_non_seq != 0:
            non_seq_cat_inp = self.non_seq_emb_layers(non_seq_cat_data)
            non_seq_inp = self.emb_lin_layer_non_seq(non_seq_cat_inp)

            if self.non_seq_cont_count != 0:
                non_seq_inp = torch.cat((non_seq_inp.type(torch.FloatTensor).to(device),
                                         non_seq_cont_data.type(torch.FloatTensor).to(device)), 1)
        else:
            if self.non_seq_cont_count != 0:
                non_seq_inp = non_seq_cont_data.to(device)
            else:
                # No non-sequential data at all: placeholder empty tensor.
                non_seq_inp = torch.Tensor().to(device)

        # Sequential branch: embed categoricals, then append continuous features along dim 2.
        if self.no_of_embs_seq != 0:
            seq_cat_inp = self.seq_emb_layers(seq_cat_data)
            seq_inp = self.emb_lin_layer_seq(seq_cat_inp)
            if self.seq_cont_count != 0:
                seq_inp = torch.cat((seq_inp.type(torch.FloatTensor).to(device),
                                     seq_cont_data.type(torch.FloatTensor).to(device)), 2)

        elif self.seq_cont_count != 0:
            seq_inp = seq_cont_data.type(torch.FloatTensor).to(device)

        if self.no_of_embs_seq + self.seq_cont_count > 0:

            if self.use_conv_agg:
                seq_inp = self.conv_agg(seq_inp)

            # NOTE(review): `output` and `cn` are only assigned on this path; if no
            # sequential data is configured, the final return below would raise
            # NameError -- confirm callers always supply sequential inputs.
            output, (_, cn), seq_inp = self.lstm_layer(seq_inp)

            if self.attention_heads > 0:
                if self.attention_heads == 1:
                    # Bahdanau returns (batch, 1, ctx); flatten the middle dim.
                    context_vector = self.bahdanau_attention_layer(output)
                    context_vector = context_vector.reshape(context_vector.size()[0], context_vector.size()[2])
                else:
                    # Self-attention: query, key and value are all the LSTM outputs.
                    context_vector = self.multi_head_attention_layer(output, output, output)

                fin_input = torch.cat((seq_inp, context_vector), 1)
            else:
                fin_input = seq_inp

        # NOTE(review): when only sequential data exists, this else overwrites
        # fin_input with the empty non_seq_inp; when only non-sequential data
        # exists, the cat below references an unassigned fin_input. Both mixed
        # cases look broken -- confirm the intended input contract with callers.
        if self.no_of_embs_non_seq + self.non_seq_cont_count > 0:
            fin_input = torch.cat((non_seq_inp.type(torch.FloatTensor).to(device), fin_input), 1)
        else:
            fin_input = non_seq_inp

        fin_output = F.relu(self.output_layer(fin_input))

        # In explain mode only the final encoding is exposed (DLExplainer contract).
        if self._explain:
            return fin_output
        return output, (fin_output, cn)
272 |
273 | @property
274 | def explain(self):
275 | """Getter for explain."""
276 |
277 | return self._explain
278 |
279 | def set_explain(self, value):
280 | """Setter for explain."""
281 |
282 | self._explain = value
283 |
--------------------------------------------------------------------------------
/caspr/models/unified_transformer_encoder.py:
--------------------------------------------------------------------------------
"""CASPR Transformer model."""

import warnings

import torch
import torch.nn as nn

from caspr.models.embedding_layer import CategoricalEmbedding

# NOTE(review): this silences every warning for any process that imports this
# module -- consider scoping the filter if library-wide suppression is not intended.
warnings.simplefilter('ignore')

# Positions inside the `nonempty_idx` list (forward()'s last argument) for each
# of the four input partitions; -1 at a position means that partition is absent.
SEQ_CAT_INDEX = 0
SEQ_CONT_INDEX = 1
NON_SEQ_CAT_INDEX = 2
NON_SEQ_CONT_INDEX = 3
16 |
17 |
class UnifiedTransformerEncoder(nn.Module):  # noqa: R0902, W0223
    """Transformer encoder over mixed sequential and non-sequential inputs.

    Embeds categorical variables (optionally from pretrained, freezable
    vectors), projects the sequential and non-sequential feature groups into a
    shared hidden dimension, and runs the supplied transformer encoder over
    the concatenated result.

    Compatible with the DLExplainer module and should be used if explainability
    is a requirement.
    """

    def __init__(self,  # noqa: R0913, R0914
                 transformer_encoder,
                 emb_dims_non_seq,
                 emb_dropout_non_seq,
                 emb_dims_seq,
                 emb_dropout_seq,
                 hidden_size,
                 seq_cont_count,
                 non_seq_cont_count,
                 non_seq_pretrained_embs=None,
                 freeze_non_seq_pretrained_embs=True,
                 seq_pretrained_embs=None,
                 freeze_seq_pretrained_embs=True):
        """Initialize model with params.

        Args:
            transformer_encoder: Module applied to the projected inputs; must
                return an ``(encoded, mask)`` pair (see ``forward``).
            emb_dims_non_seq: List of (vocab_size, emb_dim) tuples, one per
                non-sequential categorical variable.
            emb_dropout_non_seq (float): Dropout after non-sequential embeddings.
            emb_dims_seq: List of (vocab_size, emb_dim) tuples, one per
                sequential categorical variable.
            emb_dropout_seq (float): Dropout after sequential embeddings.
            hidden_size (int): Shared hidden dimension both branches project to.
            seq_cont_count (int): Number of sequential continuous variables.
            non_seq_cont_count (int): Number of non-sequential continuous variables.
            non_seq_pretrained_embs: Optional pretrained embedding tensors.
            freeze_non_seq_pretrained_embs (bool): Freeze those embeddings during backprop.
            seq_pretrained_embs: Optional pretrained embedding tensors.
            freeze_seq_pretrained_embs (bool): Freeze those embeddings during backprop.
        """

        super().__init__()

        self._explain = False

        self.emb_non_seq = CategoricalEmbedding(emb_dims=emb_dims_non_seq, emb_dropout=emb_dropout_non_seq,
                                                is_seq=False, pretrained_vecs=non_seq_pretrained_embs,
                                                freeze_pretrained=freeze_non_seq_pretrained_embs)
        self.emb_seq = CategoricalEmbedding(emb_dims=emb_dims_seq, emb_dropout=emb_dropout_seq,
                                            is_seq=True, pretrained_vecs=seq_pretrained_embs,
                                            freeze_pretrained=freeze_seq_pretrained_embs)

        self.hid_dim = hidden_size
        self.seq_cont_dim = seq_cont_count
        self.non_seq_cont_dim = non_seq_cont_count

        # Linear layers for seq_data
        seq_inp_size = self.emb_seq.emb_size + self.seq_cont_dim
        self.linear_seq = nn.Linear(seq_inp_size, self.hid_dim)

        # Linear layers for non_seq_data (skipped entirely when there are no
        # non-sequential features at all).
        non_seq_inp_size = self.emb_non_seq.emb_size + self.non_seq_cont_dim
        self.linear_non_seq = nn.Linear(non_seq_inp_size, self.hid_dim) if non_seq_inp_size else None

        self.transformer_encoder = transformer_encoder

    def forward(self, *args):
        """Run a forward pass of model over the data.

        The last positional argument is a list of four indices (ordered
        seq_cat, seq_cont, non_seq_cat, non_seq_cont) pointing into ``args``;
        -1 marks an absent data type.

        Returns:
            A flattened per-sample encoding when ``self._explain`` is True,
            otherwise ``(enc_src, src_mask, src_inp)``.
        """

        nonempty_idx = args[-1]
        data_exists = list(map(lambda x: x != -1, nonempty_idx))
        device = args[0].device
        batch_size, seq_len = args[0].shape[:2]

        # Absent inputs are replaced by zero-width tensors so the cats below are no-ops.
        seq_cat_data = args[nonempty_idx[SEQ_CAT_INDEX]] if data_exists[SEQ_CAT_INDEX] else torch.empty(batch_size, seq_len, 0, device=device)
        seq_cont_data = args[nonempty_idx[SEQ_CONT_INDEX]] if data_exists[SEQ_CONT_INDEX] else torch.empty(batch_size, seq_len, 0, device=device)
        non_seq_cat_data = args[nonempty_idx[NON_SEQ_CAT_INDEX]] if data_exists[NON_SEQ_CAT_INDEX] else torch.empty(batch_size, 0, device=device)
        non_seq_cont_data = args[nonempty_idx[NON_SEQ_CONT_INDEX]] if data_exists[NON_SEQ_CONT_INDEX] else torch.empty(batch_size, 0, device=device)

        # NOTE(review): nn.Module instances are always truthy, so `self.emb_seq`
        # / `self.emb_non_seq` never gate these branches on their own --
        # `data_exists[...]` is the effective condition.
        if self.emb_seq and data_exists[SEQ_CAT_INDEX]:
            seq_cat_data = self.emb_seq(seq_cat_data)
        seq_inp = torch.cat((seq_cat_data, seq_cont_data), -1)
        seq_inp = self.linear_seq(seq_inp)

        if self.emb_non_seq and data_exists[NON_SEQ_CAT_INDEX]:
            non_seq_cat_data = self.emb_non_seq(non_seq_cat_data)
        non_seq_inp = torch.cat((non_seq_cat_data, non_seq_cont_data), -1)
        if self.linear_non_seq:
            # Non-sequential features become one extra "timestep" of the source sequence.
            non_seq_inp = self.linear_non_seq(non_seq_inp).unsqueeze(1)

        src_inp = torch.cat((seq_inp, non_seq_inp), 1) if non_seq_inp.nelement() > 0 else seq_inp
        # src_inp = [batch_size, src len, hid dim]

        enc_src, src_mask = self.transformer_encoder(src_inp)

        # Explain mode flattens the encoding to one vector per sample (DLExplainer contract).
        if self._explain:
            return enc_src.reshape(enc_src.shape[0], -1)
        return enc_src, src_mask, src_inp

    @property
    def explain(self):
        """Getter for explain."""

        return self._explain

    def set_explain(self, value):
        """Setter for explain."""

        self._explain = value
115 |
--------------------------------------------------------------------------------
/caspr/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/__init__.py
--------------------------------------------------------------------------------
/caspr/utils/early_stopping.py:
--------------------------------------------------------------------------------
1 | """Early stopping class for nn models."""
2 |
3 | import logging
4 |
5 | import torch
6 | from torch.nn.parallel import DistributedDataParallel as DDP
7 |
8 | from caspr.utils.onnx import export_onnx, register_custom_op
9 |
10 |
class EarlyStopping:
    """Stop the training early and save a PyTorch or ONNX model after specified iterations (patience)."""

    def __init__(self, logger, should_decrease, patience=3, verbose=True, delta=0, save_onnx=False):
        """Initialize the early stopping module.

        Args:
            logger: For logging
            should_decrease (bool): True if metrics improve by decreasing.
            patience (int): How long to wait after last time validation score improved.
                Default: 3
            verbose (bool): If True, prints a message for each validation score improvement.
                Default: True
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                Default: 0
            save_onnx (bool): If True, export the model as onnx format.
                Default: False
        """
        self.logger = logger
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.should_decrease = should_decrease
        self.delta = delta
        self.save_onnx = save_onnx
        if self.save_onnx:
            # Custom ops must be registered once before any ONNX export call.
            register_custom_op()

    def __call__(self, val_score, model, path):
        """Record a new validation score, checkpointing on improvement.

        Args:
            val_score (float): Validation score to determine whether to early stop.
            model (nn.Module or list): Model(s) being trained.
            path (str or list): Model save path(s).
        """

        # Negate so that "higher is better" holds internally for both metric directions.
        if self.should_decrease:
            val_score = -val_score

        if self.best_score is None:
            self.best_score = val_score
            self.save_checkpoint(model, path)
        elif val_score <= self.best_score + self.delta:
            self.counter += 1
            self.logger.info('EarlyStopping counter: {} out of {}\n'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = val_score
            self.save_checkpoint(model, path)
            self.counter = 0

    def save(self, model, path):
        """Serialize a single model to ``path``, as ONNX or a state-dict checkpoint."""
        if self.save_onnx:
            export_onnx(model, path)
        else:
            torch.save(model.state_dict(), path)

    def save_checkpoint(self, model, path):
        """Save model when validation score improves.

        The model parameter can be a list that allows multiple models to be saved.
        """

        if self.verbose:
            self.logger.info('Validation score improved. Saving model ...\n')
        if not isinstance(model, list):
            self.save(model, path)
        else:
            # BUG FIX: previously the whole `path` list was passed to save() for
            # every model; each model is now saved to its paired path.
            for m, p in zip(model, path):
                self.save(m, p)
85 |
86 |
class DistributedEarlyStopping(EarlyStopping):
    """EarlyStopping variant for multi-worker training: only the root rank evaluates and checkpoints."""

    def __init__(self, logger, should_decrease=True, patience=3, verbose=True, delta=0, rank=None, save_onnx=False):
        """Initialize with the same knobs as EarlyStopping plus the worker ``rank``."""
        super().__init__(logger, should_decrease, patience=patience, verbose=verbose, delta=delta, save_onnx=save_onnx)
        self.rank = rank

    def __call__(self, val_score, model, path, rank=None):
        """Apply early stopping on the root rank only.

        Args:
            val_score (float): Validation score.
            model: Model being trained; DDP-wrapped models are unwrapped first.
            path (str): Model save path.
            rank (int, optional): Caller's worker rank; falls back to ``self.rank``.
        """
        # BUG FIX: an explicit None check so that a caller-supplied rank of 0
        # (the root rank is falsy) is not silently replaced by self.rank.
        if rank is None:
            rank = self.rank

        # Non-root workers neither evaluate nor checkpoint.
        if rank is not None and rank > 0:
            return None

        if isinstance(model, DDP):
            model = model.module

        return super().__call__(val_score, model, path)
103 |
104 |
if __name__ == '__main__':

    class TwoLayerNet(torch.nn.Module):
        """Simple two-layer neural network for demonstration purposes."""

        def __init__(self, D_in, H, D_out):
            """Instantiate two nn.Linear modules and assign them as member variables."""

            super().__init__()

            self.linear1 = torch.nn.Linear(D_in, H)
            self.linear2 = torch.nn.Linear(H, D_out)

        def forward(self, x):
            """In the forward function we accept a Tensor of input data and we must return a Tensor of output data.

            We can use Modules defined in the constructor as well as arbitrary operators on Tensors.
            """

            h_relu = self.linear1(x).clamp(min=0)
            y = self.linear2(h_relu)
            return y

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    batch_size, input_dim, hidden_dim, output_dim = 1000, 1000, 100, 10

    # Create random Tensors to hold inputs and outputs
    X = torch.randn(batch_size, input_dim)
    y_true = torch.randn(batch_size, output_dim)

    # Construct our model by instantiating the class defined above
    mlp = TwoLayerNet(input_dim, hidden_dim, output_dim)

    # Construct our loss function and an Optimizer. The call to model.parameters()
    # in the SGD constructor will contain the learnable parameters of the two
    # nn.Linear modules which are members of the model.
    criterion = torch.nn.MSELoss(reduction='sum')
    optimizer = torch.optim.SGD(mlp.parameters(), lr=1e-4)
    logger = logging.getLogger(__name__)
    early_stopping = EarlyStopping(logger, should_decrease=True, patience=3, verbose=True, delta=1e-5)

    for t in range(10000):
        # Forward pass: Compute predicted y by passing x to the model
        y_pred = mlp(X)

        # Compute and print loss
        loss = criterion(y_pred, y_true)
        if t % 100 == 99:
            # BUG FIX: Logger.info expects a format string as its first argument;
            # the previous call logger.info(t, loss.item()) raised a logging
            # formatting error at emit time.
            logger.info('iteration %d, loss %f', t, loss.item())
        early_stopping(loss.item(), mlp, 'early_stopping_test_model.pth')
        if early_stopping.early_stop:
            break

        # Zero gradients, perform a backward pass, and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    mlp.load_state_dict(torch.load('early_stopping_test_model.pth'))
    y_pred = mlp(X)
    loss = criterion(y_pred, y_true)
    logger.info('Best loss: %s', loss.item())
168 |
--------------------------------------------------------------------------------
/caspr/utils/estimate_parameters.py:
--------------------------------------------------------------------------------
def estimate_linear_parameters(input_dim, output_dim, bias=True):
    """Return the parameter count of a linear layer: weights plus an optional bias row."""
    effective_inputs = input_dim + 1 if bias and input_dim > 0 else input_dim
    return effective_inputs * output_dim
5 |
6 |
def estimate_embedding_parameters(df, cat_cols_, max_emb_dim):
    """Estimate embedding parameter count, total embedding width and per-column cardinalities.

    Each categorical column gets an embedding of dimension min(max_emb_dim,
    (cardinality + 1) // 2). Returns (total_params, total_width, cardinalities).
    """
    emb_num_classes = []
    emb_size = 0
    emb_num_param = 0
    for col in cat_cols_:
        cardinality = df.select(col).distinct().count()
        dim = int(min(max_emb_dim, (cardinality + 1) // 2))
        emb_num_classes.append(cardinality)
        emb_size += dim
        # Embedding table == bias-free linear layer: rows * cols parameters.
        emb_num_param += cardinality * dim
    return emb_num_param, emb_size, emb_num_classes
13 |
14 |
def estimate_transformer_parameters(hidden_dim, seq_len, pf_dim, num_layers, is_encoder=True):
    """Estimate the parameter count of a transformer encoder or decoder stack."""

    def _lin(n_in, n_out, bias=True):
        # Mirrors estimate_linear_parameters: bias adds one input row when n_in > 0.
        return (n_in + (1 if bias and n_in > 0 else 0)) * n_out

    # Positional embedding exists only on the encoder side.
    pos_emb_num_param = _lin(seq_len, hidden_dim, bias=False) if is_encoder else 0
    layer_norm_num_param = 2 * hidden_dim  # scale + shift per feature
    attn_num_param = 4 * _lin(hidden_dim, hidden_dim)  # Q, K, V and output projections
    layer_norm_count = 2 if is_encoder else 3
    attn_count = 1 if is_encoder else 2  # the decoder adds cross-attention
    pf_num_param = _lin(hidden_dim, pf_dim) + _lin(pf_dim, hidden_dim)
    per_layer = layer_norm_num_param * layer_norm_count + attn_num_param * attn_count + pf_num_param
    return pos_emb_num_param + per_layer * num_layers
25 |
26 |
def estimate_output_parameters(hidden_dim, emb_num_classes, cont_dim):
    """Estimate parameters of the per-feature output heads (one biased linear layer each)."""

    def _lin(n_in, n_out):
        # Mirrors estimate_linear_parameters with bias=True.
        return (n_in + (1 if n_in > 0 else 0)) * n_out

    cat_params = sum(_lin(hidden_dim, n_classes) for n_classes in emb_num_classes)
    return cat_params + _lin(hidden_dim, cont_dim)
32 |
33 |
def estimate_transformer_autoencoder_parameters(df, seq_cat_, seq_cont_, non_seq_cat_, non_seq_cont_,
                                                hidden_dim, pf_dim_enc, pf_dim_dec, num_layers_enc,
                                                num_layers_dec, seq_len, max_emb_dim=30):
    """Estimate the total trainable-parameter count of the CASPR transformer autoencoder.

    Sums the contributions of the embedding tables, the input projections,
    the encoder/decoder transformer stacks and the reconstruction heads.
    """
    # Embedding tables for both categorical groups.
    emb_params_seq, emb_width_seq, classes_seq = estimate_embedding_parameters(df, seq_cat_, max_emb_dim)
    emb_params_non_seq, emb_width_non_seq, classes_non_seq = estimate_embedding_parameters(df, non_seq_cat_, max_emb_dim)

    seq_cont_dim = len(seq_cont_)
    non_seq_cont_dim = len(non_seq_cont_)
    non_seq_dim = emb_width_non_seq + non_seq_cont_dim

    # Input projections into the shared hidden dimension.
    proj_params = (estimate_linear_parameters(seq_cont_dim + emb_width_seq, hidden_dim)
                   + estimate_linear_parameters(non_seq_dim, hidden_dim))

    # Non-sequential features occupy one extra "timestep" when present.
    adjusted_seq_len = seq_len + int(non_seq_dim > 0)
    stack_params = (estimate_transformer_parameters(hidden_dim, adjusted_seq_len, pf_dim_enc, num_layers_enc)
                    + estimate_transformer_parameters(hidden_dim, adjusted_seq_len, pf_dim_dec,
                                                      num_layers_dec, is_encoder=False))

    # Per-feature reconstruction heads.
    head_params = (estimate_output_parameters(hidden_dim, classes_seq, seq_cont_dim)
                   + estimate_output_parameters(hidden_dim, classes_non_seq, non_seq_cont_dim))

    return emb_params_seq + emb_params_non_seq + proj_params + stack_params + head_params
59 |
--------------------------------------------------------------------------------
/caspr/utils/explain/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # --------------------------------------------------------------------------
4 |
5 | """The explainer module for the CASPR library.
6 |
7 | Modules:
    :CASPRExplainer: Explainer wrapper exposing attributions for CASPR models.
    :utils: Helpers for separating and visualizing feature attributions.
9 | """
10 |
--------------------------------------------------------------------------------
/caspr/utils/explain/utils.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
def separate_pos_neg(attribution):
    """Split an attribution dataframe into its positive and negative parts.

    Returns a pair of dataframes with the same shape as ``attribution``:
    the first keeps the values >= 0 (others zeroed out), the second keeps
    the strictly negative values (others zeroed out).
    """
    non_negative_mask = attribution >= 0
    positive_part = attribution * non_negative_mask
    negative_part = attribution * ~non_negative_mask
    return positive_part, negative_part
14 |
15 |
def visualize(explanations: pd.DataFrame, separate_pos_neg_imp: bool = False,
              title="Average Feature Importances", axis_title="Features", save_fig: str = None):
    """Visualize explanations.

    Utility function used to create bar graph visualisations at a model level.

    Args:
        explanations (pandas dataframe): Dataframe with feature attributions
        separate_pos_neg_imp (Boolean: Default = False): Determines if the positive and negative attributions are to be
            aggregated and plotted separately (two reverse sided bars) in the same plot
        title (String : Default = "Average Feature Importances") : Represents the title of the graph
        axis_title (String: Default = "Features") : Represents the title of the Y axis
        save_fig (String) : Contains the path where to save the image plot. If None : the module doesn't save the image

    """
    feature_names = explanations.columns
    imp_pos_df, imp_neg_df = separate_pos_neg(explanations)
    combine_importances = not separate_pos_neg_imp

    importances_pos = imp_pos_df.values
    importances_neg = imp_neg_df.values

    # Collapse per-sample attributions to a per-feature mean when 2-D.
    if importances_pos.ndim == 2:
        importances_pos = np.mean(importances_pos, axis=0)
        importances_neg = np.mean(importances_neg, axis=0)

    # Axis limits padded by 25% beyond the largest magnitudes.
    xlim_pos = np.max(importances_pos)*1.25
    xlim_neg = np.max(np.abs(importances_neg))*1.25

    if combine_importances:
        # Single-bar mode: fold negative magnitudes into the positive totals.
        xlim_pos += xlim_neg
        xlim_neg = 0
        importances_pos += np.abs(importances_neg)

    else:
        # Two-sided mode: symmetric limits around zero.
        xlim_pos = np.max([xlim_pos, xlim_neg])
        xlim_neg = -1 * xlim_pos

    x_pos = (np.arange(len(feature_names)))

    # Plotting begins
    plt.figure(figsize=(10, 10))
    width = 0.3

    if combine_importances:
        plt.barh(x_pos, importances_pos, width, align='center')
    else:
        # Positive and negative bars offset by one bar width.
        plt.barh(x_pos, importances_pos, width, align='center')
        plt.barh(x_pos + width, importances_neg, width, align='center')

    plt.yticks(x_pos + width/2, feature_names, wrap=True)
    plt.ylabel(axis_title)
    plt.title(title)
    axes = plt.gca()
    axes.set_xlim([xlim_neg, xlim_pos])

    if save_fig is not None:
        plt.savefig(save_fig)
74 |
--------------------------------------------------------------------------------
/caspr/utils/horovod/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 |
3 | #
4 |
5 | # Unless required by applicable law or agreed to in writing, software
6 |
7 | # distributed under the License is distributed on an "AS IS" BASIS,
8 |
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 |
11 | # See the License for the specific language governing permissions and
12 |
13 | # limitations under the License.
14 |
15 | #
16 |
17 | # ==============================================================================
18 |
--------------------------------------------------------------------------------
/caspr/utils/horovod/train.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import horovod.torch as hvd
4 | import torch
5 | import torch.nn as nn
6 | from torch import optim
7 | from torch.utils.data.distributed import DistributedSampler
8 |
9 | from caspr.data.common_dataset import id_collate
10 | from caspr.utils.early_stopping import DistributedEarlyStopping
11 | from caspr.utils.train import init_lr_schedulers, run_autoencoder, run_autoencoder_val
12 |
13 | BATCH_SIZE = 1024 * 32
14 | NUM_EPOCHS = 100
15 | EARLY_STOPPING_PATIENCE = 8
16 | EARLY_STOPPING_DELTA = 1e-5
17 | ROOT_RANK = 0
18 | logger = logging.getLogger(__name__)
19 |
20 |
def save_checkpoint(model, optimizer, epoch, name):
    """Write the model and optimizer state dicts to a per-epoch checkpoint file."""
    filepath = '/checkpoint-{epoch}-{model}.pth'.format(epoch=epoch, model=name)
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(checkpoint, filepath)
28 |
29 |
def metric_average(metric, name):
    """All-reduce ``metric`` across Horovod workers and return the averaged Python scalar."""
    return hvd.allreduce(metric, name=name).item()
33 |
34 |
def determine_early_stop(early_stopper: DistributedEarlyStopping, loss_averaged, model, path, epoch, num_epochs):
    """Advance or terminate the distributed training loop via early stopping.

    Returns the (possibly fast-forwarded) epoch number: when the root rank
    decides to stop, ``epoch`` is set to ``num_epochs`` and broadcast so that
    every worker leaves its loop on the same iteration.
    """
    # Call the distributed early stopper while passing rank info
    # Only rank 0 is allowed to checkpoint
    early_stopper(loss_averaged, model,
                  path, hvd.rank())
    if early_stopper.early_stop:
        epoch = num_epochs
    # The answer to whether to stop or not is decided by the root rank
    # The answer is then broadcast to other nodes
    epoch = hvd.broadcast_object(epoch, root_rank=ROOT_RANK)

    # The root rank loads the latest model checkpoint and broadcasts parameters
    if hvd.rank() == ROOT_RANK and epoch == num_epochs:
        model.load_state_dict(torch.load(path))
    hvd.broadcast_parameters(model.state_dict(), root_rank=ROOT_RANK)
    return epoch
51 |
52 |
def train_hvd(dataset_train, autoenc, device, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stopping_test_model.pth'):
    """Train an autoencoder across Horovod workers with early stopping and LR scheduling.

    NOTE(review): only the root rank at the final epoch hits the explicit
    return of ``(autoenc, loss_averaged)``; every other worker falls out of
    the while-loop and implicitly returns None -- confirm callers handle this.
    """
    autoenc.train()
    hvd.init()
    logger.info("Number of workers:" + str(hvd.size()))

    if device.type == 'cuda':
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())

    # Configure the sampler such that each worker obtains a distinct sample of input dataset.
    train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=batch_size, sampler=train_sampler, collate_fn=id_collate)

    num_epochs = epochs

    # Effective batch size in synchronous distributed training is scaled by the number of workers.
    # An increase in learning rate compensates for the increased batch size.
    optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size())
    # Wrap the optimizer with Horovod's DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters())

    scheduler_wu, scheduler_re = init_lr_schedulers(
        optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True)

    # Keep the LR schedulers in sync with the root rank's copies.
    hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK)
    hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK)

    # Broadcast initial parameters so all workers start with the same parameters.
    hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK)

    # Reconstruction losses: MSE for continuous outputs, cross-entropy for categorical ones.
    criterion = [nn.MSELoss(), nn.CrossEntropyLoss()]

    # NOTE(review): this initial value is immediately overwritten inside the loop.
    losses = []
    early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA)

    epoch = 1
    while epoch < num_epochs + 1:
        losses, _ = run_autoencoder(autoenc, optimizer, train_loader, criterion, device)
        loss_averaged = metric_average(torch.tensor(losses), 'avg_loss')
        logger.info("Average overall training loss in epoch {0} is {1}".format(
            epoch, loss_averaged))

        # May fast-forward `epoch` to num_epochs when the root rank decides to stop.
        epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs)

        # Warmup schedule for the first epochs, plateau-reduction afterwards.
        if epoch <= warmup_epochs:
            scheduler_wu.step()
        scheduler_re.step(loss_averaged)

        if hvd.rank() == ROOT_RANK and epoch == num_epochs:
            if save_model:
                save_checkpoint(autoenc, optimizer, epoch, 'encoder')
            return autoenc, loss_averaged
        epoch = epoch+1
107 |
108 |
def train_val_hvd(dataset_train, dataset_val, autoenc, device, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stopping_test_model.pth'):
    """Train the autoencoder across Horovod workers with validation-driven early stopping.

    Same flow as train_hvd, but each epoch additionally evaluates on
    `dataset_val`, and the all-reduced validation loss drives both the
    plateau scheduler and the early stopper. Only the root rank returns a
    value; non-root ranks finish the loop and implicitly return None.

    Args:
        dataset_train: training dataset, sharded per worker via DistributedSampler.
        dataset_val: validation dataset, sharded the same way.
        autoenc: CASPR autoencoder module to train.
        device: torch device; on CUDA each worker is pinned to its local-rank GPU.
        batch_size (int): per-worker batch size.
        epochs (int): maximum number of training epochs.
        learning_rate (float): base learning rate; scaled by hvd.size().
        warmup_epochs (int): number of epochs the warmup scheduler steps.
        save_model (bool): if True, root rank saves an encoder checkpoint at the end.
        path (str): checkpoint path used by the early stopper.

    Returns:
        (autoenc, final averaged validation loss) on the root rank at the final
        epoch; None on other ranks.
    """
    autoenc.train()
    hvd.init()
    logger.info("Number of workers:" + str(hvd.size()))

    if device.type == 'cuda':
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())

    # Configure the sampler such that each worker obtains a distinct sample of input dataset.
    train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=batch_size, sampler=train_sampler, collate_fn=id_collate)

    val_sampler = DistributedSampler(dataset_val, num_replicas=hvd.size(), rank=hvd.rank())
    val_loader = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size,
                                             sampler=val_sampler, collate_fn=id_collate)

    num_epochs = epochs

    # Effective batch size in synchronous distributed training is scaled by the number of workers.
    # An increase in learning rate compensates for the increased batch size.
    optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size())
    # Wrap the optimizer with Horovod's DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters())

    scheduler_wu, scheduler_re = init_lr_schedulers(
        optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True)

    # Keep scheduler state identical across workers before training starts.
    hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK)
    hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK)

    # Broadcast initial parameters so all workers start with the same parameters.
    hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK)

    # MSE for continuous reconstructions, cross-entropy for categorical ones.
    criterion = [nn.MSELoss(), nn.CrossEntropyLoss()]

    losses = []
    early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA)

    epoch = 1
    while epoch < num_epochs + 1:
        autoenc.train()
        losses, _ = run_autoencoder(autoenc, optimizer, train_loader, criterion, device)
        autoenc.eval()
        losses_val = run_autoencoder_val(autoenc, val_loader, criterion, device)
        # All-reduce the per-worker losses so every rank sees the same averages.
        loss_train_averaged = metric_average(torch.tensor(losses), 'avg_train_loss')
        loss_val_averaged = metric_average(torch.tensor(losses_val), 'avg_val_loss')

        logger.info("Average training loss in epoch {0} is {1}".format(epoch, loss_train_averaged))
        logger.info("Average validation loss in epoch {0} is {1}".format(epoch, loss_val_averaged))

        if epoch <= warmup_epochs:
            scheduler_wu.step()
        scheduler_re.step(loss_val_averaged)

        # May fast-forward `epoch` to num_epochs when early stopping triggers.
        epoch = determine_early_stop(early_stopper, loss_val_averaged, autoenc, path, epoch, num_epochs)
        if hvd.rank() == ROOT_RANK and epoch == num_epochs:
            if save_model:
                save_checkpoint(autoenc, optimizer, epoch, 'encoder')
            return autoenc, loss_val_averaged
        epoch = epoch+1
171 |
--------------------------------------------------------------------------------
/caspr/utils/metrics.py:
--------------------------------------------------------------------------------
1 | """Get classification metrics for CASPR models."""
2 |
3 | # coding: utf-8
4 | import logging
5 |
6 | from sklearn.metrics import auc, classification_report, confusion_matrix, precision_recall_curve, roc_auc_score
7 |
8 | logger = logging.getLogger(__name__)
9 |
def check_topk_values_if_churn(k, preds, y):
    """Return how many of the top-k highest churn predictions actually churned (y == 1)."""
    scores = preds.cpu().detach().numpy()
    # Indices of the k largest scores, highest first.
    top_indices = scores.argsort()[-k:][::-1]
    return sum(1 for idx in top_indices if y[idx] == 1)
21 |
22 |
def pr_auc_score(y_true, y_score):
    """Compute the area under the precision-recall curve."""
    precision_vals, recall_vals, _ = precision_recall_curve(y_true, y_score)
    return auc(recall_vals, precision_vals)
29 |
30 |
def get_metrics(y_true, y_score, threshold=0.5, digits=3):
    """Get classification report, confusion matrix, roc_auc score, and pr_auc score.

    Scores above `threshold` are treated as positive predictions. Everything
    is logged and also returned as a single flat dictionary.
    """
    y_pred = y_score > threshold

    report = classification_report(y_true, y_pred, digits=digits)
    logger.info(report)
    metrics = convert_classification_report_to_dict(report)

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    logger.info("tp: {}, fp: {}, tn: {}, fn: {}".format(tp, fp, tn, fn))
    metrics.update({'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn})

    roc_auc = roc_auc_score(y_true, y_score)
    pr_auc = pr_auc_score(y_true, y_score)
    logger.info("roc_auc_score = {:.4f}, pr_auc_score = {:.4f}".format(roc_auc, pr_auc))
    metrics.update({'roc_auc_score': roc_auc, 'pr_auc_score': pr_auc})

    return metrics
50 |
51 |
def convert_classification_report_to_dict(report):
    """Parse sklearn's text classification report into a nested dictionary.

    Per-class and averaged rows map to {metric-name: value} dicts keyed by
    the report headers; the 'accuracy' row maps directly to its score.
    """
    rows = [line.split() for line in report.split('\n') if line]
    headers = rows[0]
    parsed = {}
    for row in rows[1:]:
        # Averaged rows have two-word labels like 'macro avg' / 'weighted avg'.
        if row[1] == 'avg':
            label = ' '.join(row[:2])
            scores = row[2:]
        else:
            label, scores = row[0], row[1:]

        if label == 'accuracy':
            # Accuracy row carries only the score and the support count.
            parsed[label] = float(scores[-2])
        else:
            values = [float(s) for s in scores[:-1]] + [int(scores[-1])]
            parsed[label] = dict(zip(headers, values))
    return parsed
69 |
--------------------------------------------------------------------------------
/caspr/utils/noise.py:
--------------------------------------------------------------------------------
1 | """Noise class for generating noisy data."""
2 |
3 | import torch
4 |
5 |
class Noise(torch.nn.Module):
    """Add different types of noise to the sequential inputs for a denoising autoencoder.

    Usage:
        noise = Noise(emb_dims, gau_prob, sub_prob, shuffle_dist)
        seq_cat_noisy, seq_cont_noisy = noise(seq_cat, seq_cont)
    """

    def __init__(self, emb_dims, gau_prob=0.1, sub_prob=0.1, shuffle_dist=1):
        """Initialize the noise module with probabilities for each noise type.

        Args:
            emb_dims (List of tuples (x, y)): Embedding dimensions where x is the
                vocab size and y the embedding size for every categorical variable.
            gau_prob (float): Probability of adding gaussian noise to continuous values.
            sub_prob (float): Probability of substituting a categorical value with a
                randomly selected one.
            shuffle_dist (int): Max distance each element may end up from its
                original position after shuffling.
        """
        super().__init__()
        self.gau_prob = gau_prob
        self.sub_prob = sub_prob
        self.shuffle_dist = shuffle_dist
        # Only the vocabulary sizes are needed to draw random substitutes.
        self.vocab_sizes = [vocab for vocab, _ in emb_dims]

    def forward(self, seq_cat_data, seq_cont_data):
        """Run a forward pass of the module over the data to add noise."""
        return self.add_noise(seq_cat_data, seq_cont_data)

    def add_noise(self, seq_cat_data, seq_cont_data):
        """Apply substitution, gaussian, and shuffle noise per the configured probabilities.

        Args:
            seq_cat_data (Tensors): Sequential categorical data.
            seq_cont_data (Tensors): Sequential continuous data.
        """
        if self.sub_prob > 0:
            seq_cat_data = self._word_substitute(seq_cat_data)
        if self.gau_prob > 0:
            seq_cont_data = self._word_gaussian(seq_cont_data)
        if self.shuffle_dist > 0:
            seq_cat_data, seq_cont_data = self._word_shuffle(seq_cat_data, seq_cont_data)
        return seq_cat_data, seq_cont_data

    def _word_shuffle(self, seq_cat_data, seq_cont_data):
        # Jitter each position index by a bounded random amount and sort; every
        # element then moves at most `shuffle_dist` from its original position.
        batch_size, seq_len, _ = seq_cat_data.size()
        positions = torch.arange(seq_len, dtype=torch.float).repeat(batch_size, 1)
        jitter = (self.shuffle_dist + 1) * torch.rand((batch_size, seq_len))
        _, permutation = (positions + jitter).sort(dim=1)
        rows = torch.arange(batch_size).unsqueeze(1)
        return seq_cat_data[rows, permutation], seq_cont_data[rows, permutation]

    def _word_substitute(self, x):
        # Draw a fully random replacement tensor, then restore the original
        # values wherever the keep-mask fires (prob. 1 - sub_prob per entry).
        keep = (torch.rand(x.size(), device=x.device) > self.sub_prob)
        substituted = x.clone()
        for i, vocab_size in enumerate(self.vocab_sizes):
            substituted[:, :, i].random_(0, vocab_size)
        substituted[keep] = x[keep]
        return substituted

    def _word_gaussian(self, x):
        # Boolean mask selects which entries receive unit-gaussian noise.
        mask = (torch.rand(x.size(), device=x.device) < self.gau_prob)
        noisy = x.clone()
        noisy += torch.randn(x.size(), device=x.device) * mask
        return noisy
78 |
--------------------------------------------------------------------------------
/caspr/utils/onnx.py:
--------------------------------------------------------------------------------
1 | import onnx
2 | import torch
3 | from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_available_providers
4 | from torch.onnx import register_custom_op_symbolic
5 |
6 | from caspr.models.factory import LSTM, TRANSFORMER
7 | from caspr.utils.preprocess import get_nonempty_tensors
8 | from caspr.utils.score import get_architecture
9 |
# ONNX opset used when exporting the full encoder graph.
OPSET_VERSION = 12
# Positions of each tensor in the (seq_cat, seq_cont, non_seq_cat, non_seq_cont) input tuple.
SEQ_CAT_INDEX = 0
SEQ_CONT_INDEX = 1
NON_SEQ_CAT_INDEX = 2
NON_SEQ_CONT_INDEX = 3

# Opset version used when (un)registering the custom com.microsoft symbolic ops.
_onnx_opset_version = 1
17 |
def register_custom_op():
    """Register symbolic functions for custom ops implemented by ONNX Runtime.

    Maps the torch ops inverse/gelu/triu/tril onto their com.microsoft
    ONNX Runtime counterparts so they can be exported.
    """

    # Symbolic definitions
    def inverse(g, self):
        return g.op("com.microsoft::Inverse", self)

    def gelu(g, self):
        return g.op("com.microsoft::Gelu", self)

    def triu(g, self, diagonal):
        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1)

    def tril(g, self, diagonal):
        return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0)

    # Op registration, in the same order as the definitions above.
    for op_name, symbolic_fn in (('::inverse', inverse), ('::gelu', gelu),
                                 ('::triu', triu), ('::tril', tril)):
        register_custom_op_symbolic(op_name, symbolic_fn, _onnx_opset_version)
42 |
43 |
def unregister_custom_op():
    """Unregister the symbolic functions installed by register_custom_op.

    PyTorch exposes no public unregister API, so this reaches into the
    symbolic registry directly.
    """

    import torch.onnx.symbolic_registry as sym_registry

    # TODO: replace this once PyTorch supports unregister natively.
    def unregister(name, opset_version):
        from torch.onnx.symbolic_helper import _onnx_stable_opsets

        ns, kind = name.split("::")
        # Remove the op from every stable opset at or above the given version.
        for version in _onnx_stable_opsets:
            if version >= opset_version and sym_registry.is_registered_op(kind, ns, version):
                del sym_registry._registry[(ns, version)][kind]

    for op_name in ('::inverse', '::gelu', '::triu', '::tril'):
        unregister(op_name, _onnx_opset_version)
65 |
66 |
def get_input_names(nonempty_idx):
    """Build the ONNX graph input names for the non-empty tensor slots plus the index tensor."""
    slot_names = {SEQ_CAT_INDEX: 'seq_cat', SEQ_CONT_INDEX: 'seq_cont',
                  NON_SEQ_CAT_INDEX: 'non_seq_cat', NON_SEQ_CONT_INDEX: 'non_seq_cont'}
    names = [slot_names[idx] for idx in nonempty_idx if idx in slot_names]
    names.append('nonempty_idx')
    return names
72 |
73 |
def get_dummy_inputs(model):
    """Construct zero-filled dummy inputs (and their ONNX input names) for export.

    Infers the per-tensor feature dimensions and sequence length from the
    model's unified encoder, builds a single-sample batch of zero tensors on
    the model's device, and drops the empty ones via get_nonempty_tensors.

    Args:
        model: CASPR model wrapper exposing a ``unified_encoder``.

    Returns:
        Tuple (dummy_inputs, input_names) suitable for torch.onnx.export.

    Raises:
        ValueError: If the model architecture is neither transformer nor LSTM.
    """
    architecture = get_architecture(model)  # hoisted: computed once instead of per branch
    if architecture == TRANSFORMER:
        seq_cat_dim = len(model.unified_encoder.emb_seq.emb_layers)
        seq_cont_dim = model.unified_encoder.seq_cont_dim
        non_seq_cat_dim = len(model.unified_encoder.emb_non_seq.emb_layers)
        non_seq_cont_dim = model.unified_encoder.non_seq_cont_dim
        adjust_seq_len = model.unified_encoder.transformer_encoder.pos_embedding.num_embeddings
        # One position is reserved for non-sequential features when any are present.
        seq_len = adjust_seq_len - int((non_seq_cat_dim + non_seq_cont_dim) > 0)
    elif architecture == LSTM:
        seq_cat_dim = model.unified_encoder.seq_cat_count
        seq_cont_dim = model.unified_encoder.seq_cont_count
        non_seq_cat_dim = model.unified_encoder.non_seq_cat_count
        non_seq_cont_dim = model.unified_encoder.non_seq_cont_count
        seq_len = model.unified_encoder.seq_len
    else:
        # Previously an unknown architecture fell through to a NameError on the
        # undefined dims below; fail loudly and clearly instead.
        raise ValueError("Unsupported model architecture: {}".format(architecture))

    device = next(model.parameters()).device
    seq_cat_dummy = torch.zeros((1, seq_len, seq_cat_dim), dtype=torch.long, device=device)
    seq_cont_dummy = torch.zeros((1, seq_len, seq_cont_dim), dtype=torch.float32, device=device)
    non_seq_cat_dummy = torch.zeros((1, non_seq_cat_dim), dtype=torch.long, device=device)
    non_seq_cont_dummy = torch.zeros((1, non_seq_cont_dim), dtype=torch.float32, device=device)

    dummy = (seq_cat_dummy, seq_cont_dummy, non_seq_cat_dummy, non_seq_cont_dummy)
    nonempty_tensors, nonempty_idx = get_nonempty_tensors(dummy)
    dummy_inputs = (*nonempty_tensors, torch.tensor(nonempty_idx))

    input_names = get_input_names(nonempty_idx)

    return dummy_inputs, input_names
102 |
103 |
def export_onnx(model, model_path):
    """Export the model's unified encoder to an ONNX file at model_path."""
    model.eval()

    dummy_inputs, input_names = get_dummy_inputs(model)

    with torch.no_grad():
        # Trace once to learn how many outputs the encoder produces.
        dummy_outputs = model.unified_encoder(*dummy_inputs)
        output_names = [f"output_{i}" for i in range(len(dummy_outputs))]

        # Batch dimension is dynamic for every input and output.
        dynamic_axes = dict.fromkeys(input_names + output_names, {0: 'batch_size'})
        torch.onnx.export(model=model.unified_encoder,
                          args=dummy_inputs,
                          f=model_path,
                          input_names=input_names,
                          output_names=output_names,
                          dynamic_axes=dynamic_axes,
                          opset_version=OPSET_VERSION,
                          custom_opsets={'com.microsoft': 1},
                          do_constant_folding=True)
123 |
124 |
def to_numpy(tensor):
    """Convert a torch tensor to a numpy array, detaching from autograd when needed."""
    source = tensor.detach() if tensor.requires_grad else tensor
    return source.cpu().numpy()
130 |
131 |
class ONNXWrapper:
    """Pickle-friendly wrapper around an ONNX Runtime inference session.

    Mimics the small subset of the torch-model interface used at scoring time
    (``unified_encoder``, ``to``, ``cpu``, ``eval``) so ONNX models can be
    scored through the same code paths.
    """

    def __init__(self, model_path_or_proto, model_type=TRANSFORMER):
        # Accept either a file path or an in-memory ONNX model proto.
        if isinstance(model_path_or_proto, str):
            with open(model_path_or_proto, 'rb') as model_file:
                self.model_bytes = model_file.read()
        else:
            self.model_bytes = onnx._serialize(model_path_or_proto)
        self.session = self.load()
        self.model_type = model_type

    def __getstate__(self):
        # Inference sessions are not picklable; drop and rebuild on unpickle.
        state = dict(self.__dict__)
        state.pop('session')
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.session = self.load()

    def unified_encoder(self, *args):
        """Run inference, mirroring the torch unified_encoder call signature."""
        # The trailing arg is the nonempty-index list, which ORT does not consume.
        tensor_args = args[:-1]
        session_inputs = self.session.get_inputs()
        feed = {session_inputs[i].name: arr
                for i, arr in enumerate(map(to_numpy, tensor_args))}
        return (torch.from_numpy(out) for out in self.session.run(None, feed))

    def load(self, device=torch.device('cpu'), enable_all_optimization=True):
        """Create an InferenceSession for the given device from the stored model bytes."""
        options = SessionOptions()
        if enable_all_optimization:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
        else:
            options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC

        # Fall back to CPU unless CUDA is both requested and available.
        use_gpu = 'cuda' in device.type and 'CUDAExecutionProvider' in get_available_providers()
        if use_gpu:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]

        return InferenceSession(self.model_bytes, options, providers=providers)

    def to(self, device):
        self.session = self.load(device)

    def cpu(self):
        self.to(torch.device('cpu'))

    def eval(self):
        # No-op: ONNX sessions have no train/eval mode, kept for interface parity.
        pass
182 |
--------------------------------------------------------------------------------
/caspr/utils/score.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import numpy as np
4 | import torch
5 |
6 | from caspr.data.common_dataset import id_collate
7 | from caspr.models.factory import LSTM, TRANSFORMER
8 | from caspr.utils.preprocess import get_nonempty_tensors
9 |
10 | logger = logging.getLogger(__name__)
11 |
def run_autoencoder_score(autoenc, dataloader_test, device):
    """Run the encoder over a test dataloader and return id-prefixed embeddings.

    Args:
        autoenc: Torch CASPR model or ONNX wrapper exposing ``unified_encoder``.
        dataloader_test: Loader yielding
            (id, label, seq_cat, seq_cont, non_seq_cat, non_seq_cont) batches.
        device: Target device for the input tensors (torch models only).

    Returns:
        numpy array with the target ids horizontally stacked before the embeddings.
    """
    embeddings = []
    tgt_ids = []

    # Hoisted out of the loop: neither the architecture nor the model type
    # changes between batches.
    architecture = get_architecture(autoenc)
    is_torch_model = isinstance(autoenc, torch.nn.Module)

    for tgt_id, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_test:

        data = [seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data]
        if is_torch_model:
            data = [d.to(device) for d in data]

        nonempty_tensors, nonempty_idx = get_nonempty_tensors(data)

        tgt_ids.append(tgt_id)

        if architecture == TRANSFORMER:
            emb, _, _ = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx)
            # Concatenate across timesteps
            emb = emb.reshape(emb.shape[0], -1)
            embeddings.append(emb.detach().cpu() if isinstance(emb, torch.Tensor) else emb)

        elif architecture == LSTM:
            _, (hn, _) = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx)
            embeddings.append(hn.detach().cpu() if isinstance(hn, torch.Tensor) else hn)

    tgt_ids = np.concatenate(tgt_ids, axis=0)
    embeddings = np.concatenate(embeddings, axis=0)
    return np.hstack((tgt_ids, embeddings))
42 |
def score(dataset_test, autoenc, device, batch_size=1024):
    """Embed a dataset with the trained autoencoder and return id-prefixed embeddings."""
    autoenc.eval()
    loader = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, collate_fn=id_collate)

    logger.info("Performing inference on given dataset")
    return run_autoencoder_score(autoenc, loader, device)
50 |
def get_architecture(model):
    """Return the architecture name: class name for torch modules, else the wrapper's model_type."""
    if isinstance(model, torch.nn.Module):
        return model.__class__.__name__
    return model.model_type
53 |
--------------------------------------------------------------------------------
/caspr/utils/segmentation/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # --------------------------------------------------------------------------
4 |
"""The segmentation module for the CASPR library.

Modules:
    :dec_utils: Deep-embedded-clustering helpers (cluster initialization, accuracy, prediction).
    :pandas: Pandas-based segmentation utilities (quantile scoring, clustering, PCA, profiling).
"""
10 |
--------------------------------------------------------------------------------
/caspr/utils/segmentation/dec_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from scipy.optimize import linear_sum_assignment
4 | from sklearn.cluster import KMeans
5 |
6 | from caspr.utils.preprocess import get_nonempty_tensors
7 |
8 |
def cluster_initialize(model, dataloader, device):
    """Initialize cluster centers from a KMeans fit over the encoder embeddings.

    Encodes the full dataset, fits KMeans on the embeddings, reports the
    initial clustering accuracy against the labels, and copies the fitted
    centers into the model's ``assignment.cluster_centers`` buffer.

    Args:
        model (nn.Module): # noqa: W0223 Pretrained encoder-decoder model
        dataloader (DataLoader): Data loader that provides an iterable over the given dataset
        device ('cpu' or 'cuda'): Describes the machine on which the code is running

    Returns:
        LongTensor with the initial cluster assignment of each sample.
    """
    kmeans = KMeans(model.cluster_number, n_init=20)
    model.train()
    embedding_batches = []
    label_batches = []
    # form initial cluster centres
    for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader:
        batch = (seq_cat_x.to(device), seq_cont_x.to(device),
                 non_seq_cat_x.to(device), non_seq_cont_x.to(device))
        nonempty_tensors, nonempty_idx = get_nonempty_tensors(batch)
        embedding_batches.append(model.enc(*nonempty_tensors, nonempty_idx).detach().cpu())
        label_batches.append(y)

    labels = torch.cat(label_batches).long()

    predicted = kmeans.fit_predict(torch.cat(embedding_batches).numpy())
    predicted_tensor = torch.tensor(np.copy(predicted), dtype=torch.long)
    _, accuracy = cluster_accuracy(predicted, labels.cpu().numpy())
    print('Initial Cluster Acc: ', accuracy)
    cluster_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float, requires_grad=True).to(device)
    with torch.no_grad():
        # initialise the cluster centers
        model.state_dict()['assignment.cluster_centers'].copy_(cluster_centers)
    return predicted_tensor
45 |
46 |
def cluster_accuracy(y_true, y_predicted, cluster_number=None):
    """Compute clustering accuracy under the optimal label reassignment.

    Uses SciPy's linear_sum_assignment (Hungarian algorithm) to find the
    best mapping from predicted cluster ids to true labels.

    Args:
        y_true (List of int): list of true cluster numbers, an integer array 0-indexed
        y_predicted (List of int): list of predicted cluster numbers, an integer array 0-indexed
        cluster_number (int): number of clusters, if None then calculated from input
    Return:
        reassignment dictionary, clustering accuracy
    """
    if cluster_number is None:
        # assume labels are 0-indexed
        cluster_number = max(y_predicted.max(), y_true.max()) + 1

    # Contingency table: rows are predicted clusters, columns are true labels.
    count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64)
    for pred, true in zip(y_predicted, y_true):
        count_matrix[pred, true] += 1

    # Maximize matched counts by minimizing (max - count).
    row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix)
    reassignment = dict(zip(row_ind, col_ind))
    accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size
    return reassignment, accuracy
67 |
68 |
def cluster_predict(model, dataloader, device):
    """Predict cluster assignments and collect true labels for the whole dataset.

    Args:
        model (nn.Module): # noqa: W0223 Pretrained encoder-decoder model
        dataloader (DataLoader): Data loader that provides an iterable over the given dataset
        device ('cpu' or 'cuda'): Describes the machine on which the code is running

    Returns:
        Tuple of (argmax cluster index per sample, true labels as LongTensor).
    """
    feature_batches = []
    label_batches = []
    for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader:
        batch = (seq_cat_x.to(device), seq_cont_x.to(device),
                 non_seq_cat_x.to(device), non_seq_cont_x.to(device))
        nonempty_tensors, nonempty_idx = get_nonempty_tensors(batch)
        feature_batches.append(model(*nonempty_tensors, nonempty_idx).detach().cpu())
        label_batches.append(y)

    # Highest soft-assignment score per sample is the predicted cluster.
    return torch.cat(feature_batches).max(1)[1], torch.cat(label_batches).long()
92 |
--------------------------------------------------------------------------------
/caspr/utils/segmentation/pandas.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | from sklearn.cluster import KMeans
7 | from sklearn.decomposition import PCA
8 | from sklearn.manifold import TSNE
9 | from sklearn.metrics import silhouette_score
10 | from sklearn.preprocessing import MinMaxScaler, StandardScaler
11 | from tqdm import tqdm
12 |
13 |
def check_sparsity(data):
    """Print a sparsity summary (top decile share and unique-value ratio) per column."""
    n_rows = data.shape[0]
    for col in data.columns:
        try:
            decile_share = pd.qcut(data[col], q=10, labels=False, duplicates='drop').value_counts() / n_rows
            # top quantile%, unique value%
            print(col, np.round(decile_share.values[0], 2), np.round(len(data[col].unique()) / n_rows, 2))
        except Exception:
            # qcut fails on non-numeric or degenerate columns; report unique count instead.
            print(col, np.nan, np.round(len(data[col].unique()), 2))
24 |
25 |
def quantile(df, q=5, col_features=None):
    """Score customers from 0 to q on engagement metrics and derive segments.

    Adds a '<col>_q' quantile score per feature (recency-style 'R_' columns
    score inversely), an 'AvgScore' column, and a 'Segment' label, plotting
    histograms of the latter two along the way.
    """

    # create quantile scores [0, q] with q interval
    for col in col_features:
        labels = range(q, -1, -1) if 'R_' in col else range(0, q + 1)
        df[col + '_q'] = pd.qcut(df[col], q=q + 1, labels=labels, duplicates='drop')

    quantile_cols = [col + '_q' for col in col_features]
    df['AvgScore'] = df[quantile_cols].mean(axis=1)
    df['AvgScore'].hist(bins=q)
    plt.title('AvgScore')
    plt.show()

    # generate segments: segment i covers scores in (i-1, i]
    df['Segment'] = np.nan
    for i in range(1, q + 1):
        in_band = (df.AvgScore <= i) & (df.AvgScore > (i - 1))
        df.loc[in_band, 'Segment'] = i

    df['Segment'].hist(bins=q)
    plt.title('Segment')
    plt.show()

    return df
55 |
56 |
def clustering(df, col_features=None, cluster_range=range(2, 10), scaling_option="minmax",
               pca=True, pca_param=None,
               default_cluster_size=None, default_cluster_threshold=0.1,
               tsne_plt=True, tsne_sample=1000, removed_outlier=False):
    """Perform Clustering.

    Options to do transformation and PCA before performing clustering
    - featurization
    - find # of clusters
    - fit final model

    Args:
        df: input dataframe.
        col_features: feature columns to cluster on.
        cluster_range: candidate cluster sizes to evaluate.
        scaling_option: 'minmax', 'qcut', or None (see apply_scaling).
        pca: whether to PCA-transform the features first.
        pca_param: dict with 'threshold' and 'show_plot'; defaults to
            {'threshold': 0.8, 'show_plot': False}.
        default_cluster_size: preferred cluster size, kept when its silhouette
            score is within default_cluster_threshold of the optimum.
        default_cluster_threshold: relative silhouette tolerance for the default size.
        tsne_plt / tsne_sample: t-SNE visualization toggle and sample size.
        removed_outlier: drop rows beyond 3 standard deviations before clustering.

    Returns:
        (results dataframe, labeled df, fitted KMeans model)
    """
    # FIX: mutable default argument replaced with a None sentinel (backward compatible).
    if pca_param is None:
        pca_param = {'threshold': 0.8, 'show_plot': False}

    inertias = []
    sil_scores = []

    # featurization
    df = apply_scaling(df, col_features, scaling_option, removed_outlier)

    if pca:
        df_features, n_pca, pca = apply_pca(df, col_features=col_features, pca_param=pca_param)  # noqa W0612
    else:
        df_features = df[col_features].values

    # find # of clusters
    for k in tqdm(cluster_range):
        # FIX: `n_jobs` was removed from KMeans in scikit-learn 1.0 and would
        # raise TypeError; parallelism is now controlled globally by sklearn.
        kc = KMeans(n_clusters=k, random_state=1)

        kc.fit(df_features)
        sil_scores.append(silhouette_score(df_features, kc.labels_))
        inertias.append(kc.inertia_)

    results = pd.DataFrame(np.array([cluster_range, inertias, sil_scores]).T)
    results.columns = ['cluster_size', 'inertias', 'sil_scores']
    n_final = cluster_range[np.where(sil_scores == np.max(sil_scores))[0][0]]
    print('optimal cluster size:', n_final, np.round(np.max(sil_scores), 2))

    # fit final model
    if default_cluster_size is not None:
        # FIX: look up the row whose cluster_size equals default_cluster_size;
        # the previous lookup compared the positional index against the cluster
        # size, selecting the wrong row whenever cluster_range did not start at 0.
        sil_score_default = results.loc[results.cluster_size == default_cluster_size, 'sil_scores'].values[0]

        if (np.max(sil_scores)/sil_score_default - 1) <= default_cluster_threshold:
            print('default is a good cluster size', sil_score_default, np.max(sil_scores))
            kc = KMeans(n_clusters=default_cluster_size, random_state=1)
        else:
            print('optimal is a better cluster size', sil_score_default, np.max(sil_scores))
            kc = KMeans(n_clusters=n_final, random_state=1)
    else:
        kc = KMeans(n_clusters=n_final, random_state=1)

    kc.fit(df_features)

    df['label'] = kc.labels_

    # score visualization
    if len(cluster_range) > 1:
        _, axes = plt.subplots(1, 2, figsize=(10, 5))
        results.plot(ax=axes[0], x='cluster_size', y='inertias')
        results.plot.bar(ax=axes[1], x='cluster_size', y='sil_scores')
        plt.show()

    # clustering size distribution
    print(df.label.value_counts().to_frame()/df.shape[0])

    # tsne visualization (sub-sample when the dataset is larger than tsne_sample)
    if tsne_plt:
        if (tsne_sample > 0) & (tsne_sample < len(kc.labels_)):
            df_tsne = pd.DataFrame(df_features)
            df_tsne['label'] = kc.labels_

            plt_tsne(x=df_tsne.drop(columns=['label']).sample(n=tsne_sample, random_state=1).values,
                     label=df_tsne.sample(n=tsne_sample, random_state=1).label.values)
        else:
            plt_tsne(x=df_features, label=kc.labels_)

    return results, df, kc
133 |
134 |
135 | def apply_scaling(df, col_features=None, scaling_option=None, removed_outlier=False):
136 | """Apply Scaling to dataframe."""
137 |
138 | if scaling_option == 'minmax':
139 | scaler = MinMaxScaler()
140 | df[col_features] = scaler.fit_transform(df[col_features])
141 | elif scaling_option == 'qcut':
142 | for c in col_features:
143 | df[c] = pd.qcut(df[c], q=100, labels=False, duplicates='drop')
144 | else:
145 | pass
146 |
147 | if removed_outlier:
148 | n_std = 3
149 | for c in col_features:
150 | if df[c].dtype != 'object':
151 | tic_cnt = df.shape[0]
152 | temp_mean = df[c].mean()
153 | temp_std = df[c].std()
154 | df = df[(df[c] <= (temp_mean + n_std*temp_std)) & (df[c] >= (temp_mean - n_std*temp_std))].copy()
155 | print('remove outlier', c, ':', df.shape[0] - tic_cnt)
156 |
157 | df.reset_index(drop=True, inplace=True)
158 |
159 | return df
160 |
161 |
def apply_pca(df, col_features=None, pca_param=None):
    """Apply PCA transformation and return # of eigen-vectors based on threshold (20/80 rules).

    Args:
        df: input dataframe.
        col_features: columns to transform.
        pca_param: dict with 'threshold' (cumulative explained-variance cut-off)
            and 'show_plot'; defaults to {'threshold': 0.8, 'show_plot': False}.

    Returns:
        (transformed features limited to n_pca components, n_pca, fitted PCA object)
    """
    # FIX: mutable default argument replaced with a None sentinel (backward compatible).
    if pca_param is None:
        pca_param = {'threshold': 0.8, 'show_plot': False}

    # normalize the input matrix
    matrix = df[col_features].values
    scaler = StandardScaler()
    scaler.fit(matrix)
    scaled_matrix = scaler.transform(matrix)

    # perform PCA
    pca = PCA()
    pca.fit(scaled_matrix)
    pca_samples = pca.transform(scaled_matrix)

    # define n_pca based on the threshold: smallest count of leading components
    # whose cumulative explained variance exceeds it
    n_pca = np.where(pca.explained_variance_ratio_.cumsum() > pca_param['threshold'])[0][0] + 1
    print('# of pca components:', n_pca, '/', scaled_matrix.shape[1])
    print('# of variance explained:', pca.explained_variance_ratio_.cumsum()[n_pca-1])

    # see loadings of the main components
    df_pca_components = pd.DataFrame(pca.components_, columns=col_features)
    plt_bar(df_pca_components.head(n_pca).copy(), ncols=3, figsize=(10, 10), title='PCA ')

    # extract the main transformed features
    df_pca = pca_samples[:, 0:n_pca]

    return df_pca, n_pca, pca
203 |
204 |
def profiling(df, label, col_features, col_dropped=None):
    """Profile Dataframe using heatmap.

    Heat-map around KPIs: absolute & relative
    - Useful technique to identify relative importance of each segment's attribute
    - Calculate average values of each cluster
    - Calculate average values of population
    - Calculate importance score by dividing them and subtracting 1
      (ensures 0 is returned when cluster average equals population average)

    df: input frame; NOTE a 'Segment' column is added to it in place
    label: per-row cluster label written into the 'Segment' column
    col_features: feature columns to profile (cat vs. cont split on dtype)
    col_dropped: extra columns excluded from the heatmap; defaults to none
        (automatic or apply min-max scaling to features [TO-DO])

    Returns (relative_imp, result_profile).
    """
    # Fixed: previously used a mutable default argument (col_dropped=[]).
    col_dropped = [] if col_dropped is None else col_dropped

    df['Segment'] = label

    # classifying cat vs. cont features
    cat_features = []
    cont_features = []
    for x in col_features:
        if df[x].dtypes == 'object':
            cat_features.append(x)
        else:
            cont_features.append(x)

    # customer counts
    df_count = df.groupby('Segment')[cont_features[0]].count()
    df_count.loc['All'] = df_count.sum()
    df_count = df_count.to_frame()
    df_count.columns = ['Customers']
    df_count['Customers%'] = df_count.Customers/df_count.Customers.values[-1]*100

    # numerical features
    df_cont = df.groupby('Segment')[cont_features].mean()
    df_cont.loc['All'] = df_cont.mean()

    # categorical features
    df_cat = pd.DataFrame()
    for c in cat_features:
        df_pivot = df.pivot_table(index='Segment',
                                  columns=c, values=cont_features[0], aggfunc='count')
        df_pivot.loc['All'] = df_pivot.sum()
        # row-normalize counts to percentages
        df_pivot[df_pivot.columns] = df_pivot.values / df_pivot.sum(axis=1).values.reshape(-1, 1)*100
        df_cat = pd.concat([df_cat, df_pivot], axis=1)

    # combine results and calculate relative importance
    result_profile = pd.concat([df_count, df_cont, df_cat], axis=1)
    # move the 'All' row to the bottom after sorting segments descending
    temp_all = result_profile.loc['All']
    result_profile.drop('All', inplace=True)
    result_profile.sort_index(ascending=False, inplace=True)
    result_profile.loc['All'] = temp_all

    # importance score: cluster average / population average - 1
    relative_imp = result_profile/result_profile.loc['All'] - 1
    relative_imp.drop('All', inplace=True)

    # visualization - heatmap
    temp = relative_imp.drop(columns=['Customers', 'Customers%'] + col_dropped).copy()
    plt_heatmap(temp, x_labels=temp.columns, y_labels=temp.index)

    # visualization - barchart by clusters
    plt_bar(temp)

    return relative_imp, result_profile
266 |
267 |
def plt_tsne(x, label):
    """Project *x* to 2-D with t-SNE and scatter-plot it, colored by *label*."""
    started = time.time()
    projection = TSNE(n_components=2).fit_transform(x)
    print('tsne takes time: ', time.time() - started)

    xs = projection[:, 0]
    ys = projection[:, 1]

    fig = plt.figure(figsize=(12, 8))  # noqa W0612
    plt.scatter(xs, ys, c=label, cmap=plt.cm.get_cmap("jet", 256))
    plt.colorbar(ticks=range(256))
    plt.clim(-0.5, 9.5)
    plt.show()
282 |
283 |
def plt_heatmap(data, x_labels, y_labels):
    """Render *data* as a blue heatmap with the given x/y tick labels."""
    figure = plt.figure(figsize=(10, 5))
    axes = figure.add_axes([1, 1, 1.1, 1.1])

    plt.imshow(data, cmap='Blues', interpolation='nearest')
    axes.set_yticks(range(len(y_labels)))
    axes.set_yticklabels(y_labels)
    axes.set_xticks(range(len(x_labels)))
    axes.set_xticklabels(x_labels, rotation=60)
    plt.colorbar()
    plt.show()
296 |
297 |
def plt_bar(data, ncols=3, figsize=(10, 10), title='Segment'):
    """Plot bars.

    Draws one horizontal bar chart per row of *data*, laid out on a grid of
    *ncols* columns; y tick labels are only drawn on the first chart of each
    grid row.

    data: frame whose rows become subplots (all-NaN columns are dropped first)
    ncols: number of subplot columns
    figsize: overall matplotlib figure size
    title: subplot title prefix, suffixed with the row's index label;
        pass None to suppress titles
    """
    # Fixed: dropna previously ran with inplace=True, silently mutating the
    # caller's DataFrame; work on a local copy instead.
    data = data.dropna(axis=1)
    nrows = int(np.ceil(data.shape[0]/ncols))
    # shared x-limits so bars are comparable across subplots
    xlim_min = data.min().min()
    xlim_max = data.max().max()

    fig = plt.figure(figsize=figsize)
    for i in range(data.shape[0]):
        temp = data.iloc[i]
        ax = plt.subplot(nrows, ncols, i+1)
        ax.barh(range(len(temp)), temp.values, align='center')
        if title is not None:
            ax.set_title(title + str(data.index[i]))

        plt.xticks(rotation=45)
        plt.xlim(xlim_min, xlim_max)

        # label the y-axis only on the leftmost column of the grid
        if i % ncols == 0:
            ax.set_yticks(range(len(temp)))
            ax.set_yticklabels(temp.index)

    fig.tight_layout()
    plt.show()
323 |
324 |
def generate_segmentation_graphs(combined_df, profile_features,
                                 emb_features, use_profile=False, use_embedding=False):
    """Generate segmentation graphs.

    combined_df - dataframe holding both embeddings and profile features,
    with feature names as column names

    profile_features - names of all the profile features in the data

    emb_features - names of the embedding features ('dim_0', 'dim_1', ... by default)

    use_profile - whether profile features take part in clustering

    use_embedding - whether embedding features take part in clustering
    """
    df_emb = combined_df

    # Constant embedding dimensions break scaling, so find and drop them.
    col_one = [c for c in emb_features if df_emb[c].nunique() == 1]
    df_emb = df_emb.drop(columns=col_one, axis=1)

    # keep only the embedding features that survived the drop
    emb_features = [f for f in emb_features if f not in col_one]

    plt_heatmap(df_emb[emb_features].corr(), emb_features, emb_features)
    df_emb[emb_features].describe()

    features_to_use = []
    if use_profile:
        features_to_use = profile_features
    if use_embedding:
        features_to_use = emb_features

    if use_embedding and use_profile:
        features_to_use = profile_features + emb_features

    n = 5000
    data_c = df_emb.sample(n=n, random_state=1).copy()

    results, df, kc = clustering(df=data_c.copy(),
                                 col_features=features_to_use, cluster_range=range(2, 9), scaling_option='qcut',
                                 pca=True, pca_param={'threshold': 0.8, 'show_plot': False},
                                 default_cluster_size=None, default_cluster_threshold=0.1,
                                 tsne_plt=True, tsne_sample=1000, removed_outlier=False)

    col_features = emb_features + profile_features
    relative_imp, result_profile = profiling(data_c.copy(), kc.labels_, col_features, col_dropped=[])
380 |
381 |
def generate_combined_df(embedding_data=None, profile_data: pd.DataFrame = None):
    """Generate Combined DF.

    Column-wise combination of an embedding matrix and a profile dataframe.
    Embedding columns are named 'dim_0', 'dim_1', ...

    embedding_data - numpy array of embeddings in NxM format where
    N = number of data entries, M = embedding dimension (or None)

    profile_data - dataframe of profile features, with column names
    equal to the feature names (or None)

    Returns the combined dataframe (or whichever input is not None, with a
    reset positional index). Fixed: previously used reset_index(inplace=True),
    which mutated the caller's profile_data; inputs are no longer modified.
    """
    if embedding_data is None:
        return profile_data.reset_index(drop=True)

    emb_dim = embedding_data.shape[1]
    emb_df = pd.DataFrame(embedding_data,
                          columns=['dim_' + str(i) for i in range(emb_dim)])
    emb_df.reset_index(drop=True, inplace=True)

    if profile_data is None:
        return emb_df

    # reset both indexes so the column-wise concat aligns by row position
    profile_data = profile_data.reset_index(drop=True)
    return pd.concat([profile_data, emb_df], axis=1)
409 |
--------------------------------------------------------------------------------
/caspr/utils/spark/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/spark/__init__.py
--------------------------------------------------------------------------------
/caspr/utils/spark/large/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/spark/large/__init__.py
--------------------------------------------------------------------------------
/caspr/utils/spark/large/score.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import torch
6 | import torch.nn as nn
7 | from petastorm.pytorch import BatchedDataLoader
8 |
9 | from caspr.data.load import transform_and_load
10 | from caspr.models.model_wrapper import LSTMAutoencoder, TransformerAutoEncoder
11 | from caspr.utils.preprocess import get_nonempty_tensors
12 | from caspr.utils.spark.preprocess import remove_underscore_in_seq_col_name_list
13 |
# Petastorm reader configuration (same settings as caspr.utils.spark.large.train).
# this driver is implemented in C, vs the (slower) Java default
PS_HDFS_DRIVER = 'libhdfs3'
# lower overhead, alternative is 'process'
PS_WORKER_TYPE = 'thread'
# assuming the training relies on SSD backed dbfs:/ml, Petastorm's caching can be disabled
PS_CACHE_TYPE = None
19 |
def get_default_parallelism():
    """Return Spark's default parallelism, falling back to the local CPU count
    when no global SparkContext (``sc``) exists in the interpreter."""
    try:
        parallelism = sc.defaultParallelism
    except NameError:
        # Spark Context not initialized (sc) -> use local cores instead
        parallelism = os.cpu_count()
    return parallelism
26 |
27 |
def run_autoencoder_score_peta(autoenc, steps_per_epoch, train_dataloader_iter, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps):
    """Compute embeddings for `steps_per_epoch` batches from a Petastorm dataloader iterator.

    autoenc: a TransformerAutoEncoder or LSTMAutoencoder; its `unified_encoder`
        produces the embedding (other model types are silently skipped).
    steps_per_epoch: number of batches pulled from `train_dataloader_iter`.
    tgt_id_col: id column name(s); ids are carried through each batch and
        prepended to the returned frame.

    Returns a pandas DataFrame: id column(s) followed by unnamed embedding columns.
    """

    embeddings = []
    tgt_ids = []

    for _ in range(steps_per_epoch):
        pd_batch = next(train_dataloader_iter)
        # Column names are passed through the underscore-renaming helper used on
        # the Spark side so they match the batch's column names.
        tgt_id, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load(
            pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps)

        # Filter out empty tensors so the encoder receives only populated inputs.
        data = (seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data)
        nonempty_tensors, nonempty_idx = get_nonempty_tensors(data)

        tgt_ids.append(tgt_id)

        if isinstance(autoenc, TransformerAutoEncoder):
            emb, _, _ = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx)
            # Concatenate across timesteps
            emb = emb.view(emb.shape[0], -1)
            embeddings.append(emb.detach().cpu())

        elif isinstance(autoenc, LSTMAutoencoder):
            # For the LSTM, the final hidden state is used as the embedding.
            _, (hn, _) = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx)
            embeddings.append(hn.detach().cpu())

    tgt_ids = pd.DataFrame(np.concatenate(tgt_ids, axis=0))
    tgt_ids.columns = tgt_id_col
    embeddings = pd.DataFrame(np.concatenate(embeddings, axis=0))
    # embeddings_with_id = np.hstack((tgt_ids, embeddings))
    embeddings_with_id = pd.concat([tgt_ids, embeddings], axis=1)
    return embeddings_with_id
59 |
60 |
def score_peta(converter_test, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024):
    """Score a Petastorm converter dataset with a trained autoencoder.

    Opens a Petastorm torch dataloader over *converter_test* and delegates to
    run_autoencoder_score_peta, returning a pandas DataFrame of embeddings
    keyed by *tgt_id*.

    batch_size: rows per inference batch.
    """
    # Fixed: removed a `criterion = [...]` local that was never used — scoring
    # does not compute a loss.
    autoenc.eval()
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
    else:
        device = torch.device("cpu")

    with converter_test.make_torch_dataloader(batch_size=batch_size, data_loader_fn=BatchedDataLoader,
                                              num_epochs=None, cache_type=PS_CACHE_TYPE,
                                              workers_count=get_default_parallelism(),
                                              reader_pool_type=PS_WORKER_TYPE,
                                              hdfs_driver=PS_HDFS_DRIVER) as test_dataloader:
        test_dataloader_iter = iter(test_dataloader)
        # at least one step, even when the dataset is smaller than a batch
        steps_per_epoch = max(1, len(converter_test) // (batch_size))
        embeddings = run_autoencoder_score_peta(autoenc, steps_per_epoch, test_dataloader_iter,
                                                device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps)  # noqa: E1121
        return embeddings
79 |
--------------------------------------------------------------------------------
/caspr/utils/spark/large/train.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | import horovod.torch as hvd
5 | import torch
6 | import torch.nn as nn
7 | from petastorm.pytorch import BatchedDataLoader
8 | from torch import optim
9 | from torch.utils.data.distributed import DistributedSampler
10 |
11 | from caspr.data.load import transform_and_load
12 | from caspr.utils.early_stopping import DistributedEarlyStopping
13 | from caspr.utils.horovod.train import determine_early_stop
14 | from caspr.utils.spark.large.score import get_default_parallelism
15 | from caspr.utils.spark.preprocess import remove_underscore_in_seq_col_name_list
16 | from caspr.utils.train import init_lr_schedulers
17 |
logger = logging.getLogger(__name__)  # module-level logger for this trainer
19 |
20 |
def save_checkpoint(model, optimizer, epoch, name):
    """Persist model and optimizer state dicts to '/checkpoint-{epoch}-{name}.pth'."""
    filepath = '/checkpoint-{epoch}-{model}.pth'.format(epoch=epoch, model=name)
    checkpoint = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    torch.save(checkpoint, filepath)
28 |
29 |
def metric_average(metric, name):
    """All-reduce *metric* across Horovod workers and return the averaged value as a Python number."""
    return hvd.allreduce(metric, name=name).item()
33 |
34 |
# Distributed-training defaults for the Horovod + Petastorm pipeline below.
BATCH_SIZE = 1024 * 32
NUM_EPOCHS = 100
NUM_WORKERS = 4  # assume cluster consists of two workers 2x K80 each
# default loader parallism is low or None, this widens the IO bottleneck when feeding each GPU
PS_WORKERS_PER_CPU = 2
# this version is implemented in C, vs Java (slower) default
PS_HDFS_DRIVER = 'libhdfs3'
# lower overhead, alternative is 'process'
PS_WORKER_TYPE = 'thread'
# assuming the training relies on SSD backed dbfs:/ml, Petastorm's caching can be disabled
PS_CACHE_TYPE = None
# early-stopping configuration (see DistributedEarlyStopping usage below)
EARLY_STOPPING_PATIENCE = 8
EARLY_STOPPING_DELTA = 1e-5
# Horovod rank that owns checkpointing and broadcasts
ROOT_RANK = 0
49 |
50 |
def run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, criterion, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps):
    """Train *autoenc* for one epoch over a Petastorm dataloader iterator.

    Pulls *steps_per_epoch* batches from *train_dataloader_iter*, runs a
    forward/backward pass through ``autoenc.run`` with *criterion*, and steps
    *optimizer* per batch.

    Returns (running mean loss, wall-clock duration in seconds).
    """
    count = 0
    val_start_time = time.time()
    running_loss = 0.0
    for _ in range(steps_per_epoch):
        pd_batch = next(train_dataloader_iter)
        # Column names go through the Spark-side underscore-renaming helper so
        # they match the batch's column names.
        _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load(
            pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps)

        # Track history in training
        torch.set_grad_enabled(True)
        _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # incremental running mean of the per-batch losses
        running_loss = (running_loss * count + loss.item()) / (count + 1)
        count = count + 1
        if count % 64 == 0:
            logger.info("Running Loss so far: " + str(running_loss))
            logger.info("Records processed so far: " + str(count*seq_cat_data.shape[0]))
            time_so_far = time.time() - val_start_time
            logger.info("Time taken since start:" + str(time_so_far))

    val_end_time = time.time()

    logger.info("Total time taken:" + str(val_end_time - val_start_time))
    logger.info("Running loss at the end of training epoch:" + str(running_loss))
    return running_loss, val_end_time - val_start_time
80 |
81 |
def run_autoencoder_val_peta(autoenc, steps_per_epoch, val_dataloader_iter, criterion, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps):
    """Run one validation epoch over a Petastorm dataloader iterator.

    Pulls *steps_per_epoch* batches from *val_dataloader_iter* and evaluates
    ``autoenc.run`` with *criterion*. Fixed: the *criterion* argument was
    previously overwritten with a fresh [MSELoss, CrossEntropyLoss] pair,
    silently ignoring the caller's losses.

    Returns (running mean loss, wall-clock duration in seconds). Fixed: the
    duration was previously computed as start - end (always negative).
    """
    count = 0
    val_start_time = time.time()
    running_loss = 0.0
    for _ in range(steps_per_epoch):
        pd_batch = next(val_dataloader_iter)
        _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load(
            pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps)

        # No history tracking during validation.
        # NOTE: this flips the global grad mode off; the training loop
        # re-enables it on every step (see run_autoencoder_peta).
        torch.set_grad_enabled(False)
        _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion)

        # incremental running mean of the per-batch losses
        running_loss = (running_loss * count + loss.item()) / (count + 1)

        count = count + 1
        if count % 64 == 0:
            logger.info("Running Loss so far: " + str(running_loss))
            logger.info("Records processed so far: " + str(count*seq_cat_data.shape[0]))
            time_so_far = time.time() - val_start_time
            logger.info("Time taken since start:" + str(time_so_far))

    val_end_time = time.time()

    logger.info("Total time taken:" + str(val_end_time - val_start_time))
    logger.info("Running loss at the end of validation epoch:" + str(running_loss))
    return running_loss, val_end_time - val_start_time
110 |
111 |
def train_peta_hvd(converter_train, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stop_model.pth'):
    """Train *autoenc* with Horovod data parallelism over a Petastorm converter.

    Initializes Horovod, shards *converter_train* across workers, scales the
    learning rate by the worker count, and trains with warm-up +
    reduce-on-plateau LR scheduling and distributed early stopping.

    Returns (autoenc, averaged loss, total training time) — but only on the
    root rank at the final epoch; other ranks fall out of the loop and
    implicitly return None. NOTE(review): confirm callers only consume the
    root rank's return value.
    """
    autoenc.train()
    hvd.init()  # Initialize Horovod.
    logger.info("Number of workers:" + str(hvd.size()))
    # Horovod: pin GPU to local rank.
    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())
        device = torch.cuda.current_device()
    else:
        device = torch.device("cpu")

    # from torch.utils.data.distributed import DistributedSampler
    # Configure the sampler such that each worker obtains a distinct sample of input dataset.
    # train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank())
    # Use trian_sampler to load a different sample of data on each worker.
    # train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, sampler=train_sampler)

    autoenc = autoenc.to(device)
    num_epochs = epochs

    # Effective batch size in synchronous distributed training is scaled by the number of workers.
    # An increase in learning rate compensates for the increased batch size.
    optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size())

    # Broadcast initial parameters so all workers start with the same parameters.
    hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK)
    hvd.broadcast_optimizer_state(optimizer, root_rank=ROOT_RANK)

    # Wrap the optimizer with Horovod's DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters())

    scheduler_wu, scheduler_re = init_lr_schedulers(
        optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True)

    # keep scheduler state consistent across ranks
    hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK)
    hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK)

    criterion = [nn.MSELoss(), nn.CrossEntropyLoss()]
    early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA)

    with converter_train.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(),
                                               batch_size=batch_size, data_loader_fn=BatchedDataLoader,
                                               num_epochs=None, cache_type=PS_CACHE_TYPE,
                                               workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(),
                                               reader_pool_type=PS_WORKER_TYPE,
                                               hdfs_driver=PS_HDFS_DRIVER) as train_dataloader:
        train_dataloader_iter = iter(train_dataloader)
        # per-worker steps: total rows divided across workers and batches
        steps_per_epoch = max(1, len(converter_train) // (batch_size * hvd.size()))
        total_time = 0

        epoch = 1
        while epoch < num_epochs + 1:
            loss, epoch_time = run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter,
                                                    criterion, device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps)
            # Only save checkpoints on the first worker.
            total_time = total_time + epoch_time
            loss_averaged = metric_average(torch.tensor(loss), 'avg_loss')
            logger.info("Average overall training loss in epoch {0} is {1}".format(
                epoch, loss_averaged))

            # linear LR warm-up first, then plateau-based reduction
            if epoch <= warmup_epochs:
                scheduler_wu.step()
            scheduler_re.step(loss_averaged)

            # determine_early_stop may jump `epoch` forward to num_epochs to stop training
            epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs)
            if hvd.rank() == ROOT_RANK and epoch == num_epochs:
                if save_model:
                    save_checkpoint(autoenc, optimizer, epoch, 'encoder')
                return autoenc, loss_averaged, total_time
            epoch = epoch+1
182 |
183 |
def train_val_peta_hvd(converter_train, converter_val, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stop_model.pth'):
    """Train *autoenc* with Horovod, validating each epoch on *converter_val*.

    Same setup as train_peta_hvd (Horovod init, LR scaling, warm-up +
    plateau schedulers, distributed early stopping), but LR scheduling and
    early stopping are driven by the averaged *validation* loss.

    Returns (autoenc, averaged validation loss, total time) — only on the
    root rank at the final epoch; other ranks implicitly return None.
    NOTE(review): confirm callers only consume the root rank's return value.
    """
    autoenc.train()
    hvd.init()  # Initialize Horovod.
    logger.info("Number of workers:" + str(hvd.size()))
    # Horovod: pin GPU to local rank.
    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())
        device = torch.cuda.current_device()
    else:
        device = torch.device("cpu")

    autoenc = autoenc.to(device)
    num_epochs = epochs

    # Effective batch size in synchronous distributed training is scaled by the number of workers.
    # An increase in learning rate compensates for the increased batch size.
    optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size())

    # Broadcast initial parameters so all workers start with the same parameters.
    hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK)
    hvd.broadcast_optimizer_state(optimizer, root_rank=ROOT_RANK)

    # Wrap the optimizer with Horovod's DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters())

    scheduler_wu, scheduler_re = init_lr_schedulers(
        optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True)

    # keep scheduler state consistent across ranks
    hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK)
    hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK)

    criterion = [nn.MSELoss(), nn.CrossEntropyLoss()]
    early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA)

    # open both sharded dataloaders for the lifetime of training
    with converter_val.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(),
                                             batch_size=batch_size, data_loader_fn=BatchedDataLoader,
                                             num_epochs=None, cache_type=PS_CACHE_TYPE,
                                             workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(),
                                             reader_pool_type=PS_WORKER_TYPE,
                                             hdfs_driver=PS_HDFS_DRIVER) as val_dataloader, \
        converter_train.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(),
                                              batch_size=batch_size, data_loader_fn=BatchedDataLoader,
                                              num_epochs=None, cache_type=PS_CACHE_TYPE,
                                              workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(),
                                              reader_pool_type=PS_WORKER_TYPE,
                                              hdfs_driver=PS_HDFS_DRIVER) as train_dataloader:

        val_dataloader_iter = iter(val_dataloader)
        steps_val = max(1, len(converter_val) // (batch_size * hvd.size()))

        train_dataloader_iter = iter(train_dataloader)
        steps_per_epoch = max(1, len(converter_train) // (batch_size * hvd.size()))
        total_time = 0

        epoch = 1
        while epoch < num_epochs + 1:
            autoenc.train()
            _, epoch_time = run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter,
                                                 criterion, device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps)
            autoenc.eval()
            val_loss, val_epoch_time = run_autoencoder_val_peta(autoenc, steps_val, val_dataloader_iter, criterion,
                                                                device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps)
            total_time = total_time + epoch_time + val_epoch_time

            # NOTE(review): despite the message text, this value is the averaged
            # *validation* loss (val_loss), not the training loss.
            loss_averaged = metric_average(torch.tensor(val_loss), 'avg_loss')
            logger.info("Average overall training loss in epoch {0} is {1}".format(
                epoch, loss_averaged))

            # linear LR warm-up first, then plateau-based reduction
            if epoch <= warmup_epochs:
                scheduler_wu.step()
            scheduler_re.step(loss_averaged)

            # determine_early_stop may jump `epoch` forward to num_epochs to stop training
            epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs)
            if hvd.rank() == ROOT_RANK and epoch == num_epochs:
                if save_model:
                    save_checkpoint(autoenc, optimizer, epoch, 'encoder')
                return autoenc, loss_averaged, total_time
            epoch = epoch+1
262 |
--------------------------------------------------------------------------------
/caspr/utils/spark/score.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from math import frexp
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import torch
7 | from pyspark.sql.functions import array, col, pandas_udf
8 | from pyspark.sql.types import ArrayType, FloatType
9 |
10 | from caspr.data.common_dataset import CommonDataset, id_collate
11 | from caspr.models.factory import LSTM, TRANSFORMER
12 | from caspr.utils.preprocess import get_nonempty_tensors
13 | from caspr.utils.score import get_architecture
14 |
logger = logging.getLogger(__name__)  # module-level logger for Spark scoring
16 |
17 |
def score(df, model, seq_cols, non_seq_cols, cat_cols, cont_cols, time_steps, batch_size=16*2048):
    """Score a Spark DataFrame with a CASPR model via a pandas UDF.

    Packs the continuous and categorical feature columns of *df* into two
    array columns, then computes embeddings per partition inside a pandas
    UDF and returns *df* with an added 'embeddings' (float[]) column.
    The original feature columns and the temporary array columns are dropped.

    batch_size: rows per inner torch DataLoader batch inside the UDF.
    """
    model.eval()

    # vectorizing continuous and discrete features separately
    output = df.withColumn('cont_features', array([col(f) for f in cont_cols])).drop(*cont_cols)
    output = output.withColumn('cat_features', array([col(f) for f in cat_cols])).drop(*cat_cols)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    logger.info("Scoring on: %s" % device)

    # making sure the model is on CPU before the UDF is defined
    # (the UDF closure is serialized to executors; it moves the model to
    # `device` again when it runs)
    model.cpu()

    def calculate_embeddings(continuous, categorical):
        # pandas UDF body: `continuous`/`categorical` are pandas Series of arrays
        try:
            model.to(device)
            embeddings = []
            batch_ds = CommonDataset.for_inference(continuous, categorical, seq_cols,
                                                   non_seq_cols, cat_cols, cont_cols, time_steps)

            # nested batching to ensure Spark does not trigger CUDA OOM with larger datasets
            data_loader = torch.utils.data.DataLoader(batch_ds, batch_size=batch_size, collate_fn=id_collate)

            for _, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in data_loader:

                data = [seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data]
                if isinstance(model, torch.nn.Module):
                    data = [d.to(device) for d in data]

                # drop empty tensors before feeding the encoder
                nonempty_tensors, nonempty_idx = get_nonempty_tensors(data)

                if get_architecture(model) == TRANSFORMER:
                    emb, _, _ = model.unified_encoder(*nonempty_tensors, nonempty_idx)
                    # Concatenate across timesteps
                    emb = emb.reshape(emb.shape[0], -1)
                    embeddings.append(emb.detach().cpu() if isinstance(emb, torch.Tensor) else emb)

                elif get_architecture(model) == LSTM:
                    # LSTM: final hidden state is the embedding
                    _, (hn, _) = model.unified_encoder(*nonempty_tensors, nonempty_idx)
                    embeddings.append(hn.detach().cpu() if isinstance(hn, torch.Tensor) else hn)

            embeddings = pd.DataFrame(np.concatenate(embeddings, axis=0))

            # one list of floats per input row, as required by the ArrayType UDF
            return pd.Series(embeddings.values.tolist())

        finally:
            # can release resources here, if needed
            pass

    # Pandas UDF declaration with float[] return type
    score_udf = pandas_udf(calculate_embeddings, ArrayType(FloatType()))

    # Calculating the embeddings as an additional column and dropping the temporary vectors
    output = output.withColumn('embeddings', score_udf('cont_features', 'cat_features')
                               ).drop('cont_features', 'cat_features')

    return output
79 |
--------------------------------------------------------------------------------
/caspr/utils/train.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import logging
4 | import os
5 | import time
6 |
7 | import numpy as np
8 | import torch
9 | import torch.distributed as dist
10 | import torch.multiprocessing as mp
11 | from torch import optim
12 | from torch.nn.parallel import DistributedDataParallel as DDP
13 |
14 | from caspr.data.load import init_loaders
15 | from caspr.models.factory import CASPRFactory
16 | from caspr.models.model_wrapper import AutoencoderTeacherTraining, LSTMAutoencoder, TransformerAutoEncoder
17 | from caspr.utils.early_stopping import DistributedEarlyStopping, EarlyStopping
18 | from caspr.utils.metrics import get_metrics
19 | from caspr.utils.onnx import ONNXWrapper
20 | from caspr.utils.score import get_architecture
21 |
# DistributedDataParallel process-group defaults (single-node rendezvous).
DDP_BACKEND = "nccl"
DDP_MASTER_ADDR = "localhost"
DDP_MASTER_PORT = "12355"
# dataloader worker counts: DDP runs vs. standard single-process runs
DDP_LOAD_WORKERS = 1
STD_LOAD_WORKERS = 0
logger = logging.getLogger(__name__)  # module-level logger
28 |
29 |
def run_autoencoder(autoenc, optimizer, dataloader_train, criterion, device):
    """Train *autoenc* for one epoch over *dataloader_train*.

    Moves each batch to *device*, runs a forward/backward pass through
    ``autoenc.run`` with *criterion*, and steps *optimizer*.

    Returns (running mean loss, epoch duration in seconds).
    """
    count = 0
    epoch_start_time = time.time()
    running_loss = 0.0

    for _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_train:
        y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = y.to(device), seq_cat_data.to(
            device), seq_cont_data.to(device), non_seq_cat_data.to(device), non_seq_cont_data.to(device)

        _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # incremental running mean of the per-batch losses
        running_loss = (running_loss * count + loss.item()) / (count + 1)

        count = count + 1

        if count % 64 == 0:
            # Fixed: previously called logger.info(loss, count*...), passing a
            # tensor as the log format string with args, which makes the
            # logging module raise internally when it formats the record.
            logger.info("loss: %s, records processed: %s", loss.item(), count * seq_cat_data.shape[0])
            time_so_far = time.time() - epoch_start_time
            logger.info("Time taken since start:" + str(time_so_far))

    epoch_end_time = time.time()
    logger.info("Epoch time: %s", epoch_end_time - epoch_start_time)

    return running_loss, epoch_end_time - epoch_start_time
57 |
58 |
def run_autoencoder_val(autoenc, dataloader_val, criterion, device):
    """Evaluate *autoenc* for one epoch over *dataloader_val*.

    Moves each batch to *device* and accumulates the loss from
    ``autoenc.run``; no optimizer step is taken. Returns the running mean loss.

    NOTE(review): gradients are not explicitly disabled here — callers should
    use ``torch.no_grad()`` / ``autoenc.eval()`` as appropriate; confirm.
    """
    count = 0
    running_loss = 0.0

    for _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_val:
        y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = y.to(device), seq_cat_data.to(
            device), seq_cont_data.to(device), non_seq_cat_data.to(device), non_seq_cont_data.to(device)

        _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion)

        # incremental running mean of the per-batch losses
        running_loss = (running_loss * count + loss.item()) / (count + 1)
        count = count + 1

        if count % 64 == 0:
            # Fixed: previously called logger.info(loss, count*...), passing a
            # tensor as the log format string with args, which makes the
            # logging module raise internally when it formats the record.
            logger.info("loss: %s, records processed: %s", loss.item(), count * seq_cat_data.shape[0])

    return running_loss
76 |
77 |
def run_epoch(model, epoch, dataloader, criterion, device, optimizer=None, is_train=True, get_outputs=False):
    """Run one training or validation epoch of *model* over *dataloader*.

    epoch: epoch number, used only for logging.
    optimizer: required when is_train is True (zero_grad/step per batch).
    is_train: when True, performs backward pass and optimizer step.
    get_outputs: when True, also collects labels and predictions.

    Returns (y_labels, y_preds, mean_loss); y_labels/y_preds are numpy arrays
    when get_outputs is True, otherwise empty lists.
    """
    model.to(device)
    losses = []
    y_labels = []
    y_preds = []

    # unwrap DistributedDataParallel so `.run(...)` resolves on the real model
    if isinstance(model, DDP):
        model = model.module

    for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader:
        if is_train:
            optimizer.zero_grad()

        seq_cat_x = seq_cat_x.to(device)
        seq_cont_x = seq_cont_x.to(device)
        non_seq_cat_x = non_seq_cat_x.to(device)
        non_seq_cont_x = non_seq_cont_x.to(device)
        y = y.to(device)

        # Forward Pass
        y_pred, loss = model.run(y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x, criterion=criterion)
        losses.append(loss.detach().cpu().numpy())

        if get_outputs:
            y_labels.append(y)
            y_preds.append(y_pred)

        # Backward Pass and Optimization
        if is_train:
            loss.backward()
            optimizer.step()

    if get_outputs:
        y_labels = torch.cat(y_labels, 0).detach().cpu().numpy()
        y_preds = torch.cat(y_preds, 0).detach().cpu().numpy()

    mean_loss = np.mean(np.asarray(losses))
    mode = 'training' if is_train else 'validation'
    logger.info("Average {} loss in epoch {} is {}".format(mode, epoch, mean_loss))
    return y_labels, y_preds, mean_loss
118 |
119 |
def init_lr_schedulers(optimizer, warmup_epochs, reduce_mode='min', reduce_factor=0.1, reduce_patience=4, verbose=True):
    """
    Training batch size grows proportionally with training distribution, mandating upscaling of the learning rate, which in turn reduces the probability of finding the global optimum.
    This function initializes learning rate schedulers for a given optimizer to facilitate dynamic adjustment (reduction) of learning rate during training.

    The LambdaLR scheduler ramps the learning rate linearly from 0 to its base
    value over *warmup_epochs* epochs; ReduceLROnPlateau then shrinks it by
    *reduce_factor* after *reduce_patience* epochs without improvement.

    Returns (warmup_scheduler, reduce_on_plateau_scheduler).
    """

    def warm_up(epoch):
        # Fixed operator-precedence bug: the original condition used `&`
        # (bitwise AND, which binds tighter than comparisons), so it parsed as
        # `warmup_epochs > (0 & epoch) <= warmup_epochs` and dropped the epoch
        # bound — scaling the LR past 1x after the warm-up window.
        if warmup_epochs > 0 and epoch <= warmup_epochs:
            return epoch / warmup_epochs
        return 1

    scheduler_wu = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=warm_up)
    try:
        scheduler_re = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode=reduce_mode, factor=reduce_factor, patience=reduce_patience, verbose=verbose)
    except TypeError:
        # torch >= 2.7 removed the deprecated `verbose` keyword
        scheduler_re = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode=reduce_mode, factor=reduce_factor, patience=reduce_patience)

    return scheduler_wu, scheduler_re
131 |
132 |
def train_model(model, criterion, num_epochs, dataloader_train, dataloader_val, device, save_path, lr=1e-3, fix_module_names=None,
                should_decrease=True, patience=8, verbose=True, evaluate_downstream=False, rank=0, world_size=1, warmup_epochs=5, save_onnx=False):
    """Train *model* with LR warm-up, plateau-based LR reduction and early stopping.

    Parameters
    ----------
    model: CASPR model (optionally wrapped in DistributedDataParallel)
    criterion: loss function forwarded to ``model.run`` via ``run_epoch``
    num_epochs: maximum number of epochs to train for
    dataloader_train, dataloader_val: training and validation loaders
    device: torch device (or GPU index when running under DDP)
    save_path: checkpoint path used by the early-stopping helper
    lr: initial learning rate for Adam
    fix_module_names: names of submodules to freeze (no gradients, eval mode)
    should_decrease, patience, verbose: early-stopping configuration
    evaluate_downstream: when True, compute downstream metrics on validation
        outputs (not valid for autoencoder training)
    rank, world_size: distributed-training coordinates (rank 0 is the leader)
    warmup_epochs: number of epochs of linear LR warm-up
    save_onnx: when True, wrap/load the best checkpoint as ONNX instead of a
        torch state dict

    Returns: the model loaded with the best checkpoint on rank 0; on other
    ranks the (possibly still DDP-wrapped) model is returned unchanged.

    Raises: ValueError if evaluate_downstream is requested for an autoencoder.
    """

    # Autoencoders emit reconstructions, not downstream predictions, so
    # downstream metric evaluation does not apply to them.
    if isinstance(model, (LSTMAutoencoder, AutoencoderTeacherTraining, TransformerAutoEncoder)) and evaluate_downstream:
        raise ValueError('evaluate_downstream should be set to False when training autoencoder')

    # Freeze the requested submodules: exclude their parameters from gradient
    # updates and keep the modules in eval mode.
    if fix_module_names:
        fix_modules = [module for name, module in model.named_modules() if name in fix_module_names]
        for module in fix_modules:
            for param in module.parameters():
                param.requires_grad = False
            module.eval()

    # Optimize only the parameters that remain trainable after freezing.
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)

    # The plateau scheduler uses half the early-stopping patience, so the
    # reduced LR gets a chance to help before training is stopped.
    scheduler_wu, scheduler_re = init_lr_schedulers(optimizer, warmup_epochs, reduce_patience=int(patience/2), verbose=verbose)

    # Coordinate early stopping across workers when training is distributed.
    if world_size > 1:
        early_stopping = DistributedEarlyStopping(logger, should_decrease, patience, verbose, rank=rank, save_onnx=save_onnx)
    else:
        early_stopping = EarlyStopping(logger, should_decrease, patience, verbose, save_onnx=save_onnx)

    for epoch in range(num_epochs):
        start = time.time()

        model.train()
        # model.train() flips the frozen modules back to train mode; re-freeze
        # them so e.g. their normalization/dropout behavior stays fixed.
        if fix_module_names:
            for module in fix_modules:
                module.eval()

        run_epoch(model, epoch, dataloader_train, criterion, device, optimizer)

        model.eval()
        with torch.no_grad():
            y_labels, y_preds, mean_val_loss = run_epoch(model, epoch, dataloader_val, criterion, device,
                                                         is_train=False, get_outputs=evaluate_downstream)
            if evaluate_downstream:
                get_metrics(y_labels, y_preds)

        end = time.time()
        logger.info("Time for epoch {0} is {1}\n".format(epoch, (end - start)))
        logger.info("Mean validation loss for epoch {0} is {1}\n".format(epoch, mean_val_loss))

        # Linear warm-up for the first warmup_epochs; the plateau scheduler
        # tracks the validation loss on every epoch.
        if epoch <= warmup_epochs:
            scheduler_wu.step()
        scheduler_re.step(mean_val_loss)

        # NOTE(review): the early-stopping helper is expected to checkpoint the
        # best model to save_path on improvement — confirm in caspr.utils.early_stopping.
        early_stopping(mean_val_loss, model, save_path)
        if early_stopping.early_stop:
            logger.info('early stopping at epoch {}'.format(epoch))
            break

    # Rank 0 reloads the best checkpoint (as ONNX wrapper or state dict);
    # other ranks return their model unchanged.
    if rank == 0:
        if save_onnx:
            model_type = get_architecture(model)
            model = ONNXWrapper(save_path, model_type)
        elif isinstance(model, DDP):
            model.module.load_state_dict(torch.load(save_path))
        else:
            model.load_state_dict(torch.load(save_path))
    return model
194 |
195 |
def __setup_ddp(rank, world_size):
    """Initialize torch.distributed for this worker process and pin its GPU."""

    # Rendezvous endpoint shared by all workers in the process group.
    os.environ.update({'MASTER_ADDR': DDP_MASTER_ADDR,
                       'MASTER_PORT': DDP_MASTER_PORT})

    # Join the process group, then bind this process to its own GPU:
    # one process per device, with the local rank doubling as device index.
    dist.init_process_group(DDP_BACKEND, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
204 |
205 |
def __do_train_ddp(rank, args):
    """DDP worker entry point: build a model replica and train it on one GPU."""

    world_size = args['world_size']
    __setup_ddp(rank, world_size)

    # Each worker instantiates its own replica and pins it to its GPU.
    factory = args['caspr_factory']
    replica = factory.create(args['caspr_arch'], **args['hyper_params'])
    replica = DDP(replica.cuda(), device_ids=[rank])

    train_loader, val_loader = init_loaders(args['ds_train'], args['ds_val'], args['batch_size'],
                                            num_workers=DDP_LOAD_WORKERS, world_size=world_size, rank=rank)

    # The learning rate is scaled linearly with the number of workers to
    # compensate for the larger effective batch size; the local rank serves
    # as the target device.
    train_model(replica, args['criterion'], args['num_epochs'], train_loader, val_loader, rank, args['save_path'],
                lr=args['lr'] * world_size, rank=rank, world_size=world_size, **args['kwargs'])

    dist.destroy_process_group()
223 |
224 |
def train_model_ddp(caspr_factory : CASPRFactory, caspr_arch : str, hyper_params : dict, ds_train, ds_val, criterion, num_epochs, batch_size, save_path, lr=1e-3, **kwargs):
    """
    Distributed Data Parallel implementation of CASPR training. Will use all GPUs available on the current machine.

    Arguments:
    ----------

    caspr_factory: CASPR model factory for the specified dataset

    caspr_arch: CASPR architecture e.g. TransformerAutoEncoder

    hyper_params: parameters for instantiating a new CASPR model with the above method

    ds_train: CommonDataset for training

    ds_val: CommonDataset for validation

    criterion, num_epochs, batch_size, save_path, lr: self explanatory

    **kwargs: any other parameters to be passed to the train_model function by the DDP worker (e.g. evaluate, verbose or patience)

    Returns: Trained model

    """
    logger.info("Setting up model training using torch DDP")

    # NOTE(review): this truthiness guard also rejects falsy-but-potentially-valid
    # values (e.g. 0); acceptable here since all listed arguments must be non-empty.
    for arg in [caspr_factory, caspr_arch, ds_train, ds_val, criterion, num_epochs, batch_size, save_path, lr]:
        if not arg:
            raise ValueError("Illegal null argument. Check for None values and try again.")

    world_size = torch.cuda.device_count()

    # Fall back to plain single-device training when DDP brings no benefit
    # (no CUDA at all, or fewer than two GPUs).
    if not torch.cuda.is_available() or world_size < 2:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.warn("DDP mode disabled. Training on %s..." % device)
        model = caspr_factory.create(caspr_arch, device=device, **hyper_params)
        train_loader, val_loader = init_loaders(ds_train, ds_val, batch_size, num_workers=STD_LOAD_WORKERS)
        return train_model(model, criterion, num_epochs, train_loader, val_loader, device, save_path, lr, **kwargs)

    logger.info("DDP mode enabled, will train on %d GPUs" % world_size)

    # Snapshot every local name (the parameters, kwargs, the loop variable and
    # world_size) into the dict each spawned worker unpacks. Position-sensitive:
    # it must run before any further locals are introduced.
    arguments = locals()

    # Launch one worker process per GPU; join=True blocks until all finish.
    mp.spawn(__do_train_ddp,
             args=(arguments,),
             nprocs=world_size,
             join=True)

    # The rank-0 worker checkpointed the best weights to save_path; rebuild the
    # model in this parent process and load those weights.
    model = caspr_factory.create(caspr_arch, **hyper_params)
    model.load_state_dict(torch.load(save_path))
    return model
276 |
277 |
def test_model(model, dataloader_test, criterion, device):
    """Evaluate *model* on the test loader and return labels with predictions.

    Runs a single inference-only epoch (no gradients, dummy epoch index 0)
    and returns the concatenated ground-truth labels and model predictions.
    """
    model.eval()
    with torch.no_grad():
        labels, preds, _ = run_epoch(model, 0, dataloader_test, criterion, device,
                                     is_train=False, get_outputs=True)
    return labels, preds
284 |
285 |
def count_parameters(model):
    """Return the total number of trainable (requires_grad) parameters in *model*."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
288 |
--------------------------------------------------------------------------------
/docs/PR_Guidelines.md:
--------------------------------------------------------------------------------
1 | # Guidelines for creating a good pull request
2 |
3 | 1. A PR should describe the change clearly and most importantly it should mention the motivation behind the change. Filling out the PR template should satisfy this guideline.
4 | 2. If the PR is fixing a performance issue, mention the improvement and how the measurement was done (for educational purposes).
5 | 3. Do not leave comments unresolved. If PR comments have been addressed without making the requested code changes, explicitly mark them resolved with an appropriate comment explaining why you're resolving it. If you intend to resolve it in a follow up PR, create a task and mention why this comment cannot be fixed in this PR. Leaving comments unresolved sets a wrong precedent for other contributors that it's ok to ignore comments.
6 | 4. In the interest of time, discuss the PR comments in person or over the phone if they are difficult to explain in writing. Document the resolution in the PR for the educational benefit of others. Don't just mark the comment resolved saying 'based on offline discussion'.
7 | 5. Add comments, if not obvious, in the PR to help the reviewer navigate your PR faster. If this is a big change, include a short design doc (docs/ folder).
8 | 6. Unit tests are mandatory for all PRs (except when the proposed changes are already covered by existing unit tests).
9 | 7. Do not use PRs as scratch pads for development as they consume valuable build/CI cycles for every commit. Build and test your changes for at least one environment (windows/linux/mac) before creating a PR.
10 | 8. Keep it small. If the feature is big, it's best to split into multiple PRs. Modulo cosmetic changes, a PR with more than 10 files is notoriously hard to review. Be kind to the reviewers.
11 | 9. Separate cosmetic changes from functional changes by making them separate PRs.
12 | 10. The PR author is responsible for merging the changes once they're approved.
13 | 11. If you co-author a PR, seek review from someone else. Do not self-approve PRs.
--------------------------------------------------------------------------------
/docs/images/caspr-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/docs/images/caspr-logo.png
--------------------------------------------------------------------------------
/docs/images/caspr-poster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/docs/images/caspr-poster.png
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # -----------------------------------------------------------------------------
4 |
5 | # -----------------------------------------------------------------------------
6 | # Setup() configuration
7 | # -----------------------------------------------------------------------------
8 |
9 | [metadata]
10 | name = AI.Models.CASPR
11 | version = attr: caspr.__VERSION
12 | description = CASPR
13 | long_description = file: README.rst, LICENSE
14 | keywords = "CASPR", "Machine Learning", "Deep Learning"
15 | license = "Microsoft"
16 | classifiers =
17 | Programming Language :: Python :: 3.7,
18 | Intended Audience :: Developers,
19 | License :: OSI Approved :: MIT License,
20 | Natural Language :: English,
21 | Operating System :: OS Independent,
22 | Topic :: Scientific/Engineering :: Artificial Intelligence
23 | url = https://powerbi.visualstudio.com/Business360%%20AI/_git/AI.Models.CASPR
24 |
25 | [options]
26 | zip_safe = False
27 | include_package_data = True
28 | packages = find:
29 |
30 | install_requires = ## base (common) requirements
31 | pandas>1.0
32 | imbalanced-learn>=0.8
33 | scikit-learn>=0.7
34 | scipy>=1.5
35 | matplotlib>=3.3
36 | torch~=1.11.0
37 | protobuf<4.0
38 | onnx~=1.10.1
39 | onnxruntime~=1.7.0
40 |
41 | [options.packages.find]
42 | include=caspr.*
43 | exclude=tests
44 |
45 | [options.extras_require]
46 |
47 | horovod = ## install for horovod + petastorm execution (spark.large module)
48 | pyspark~=3.1
49 | torchvision
50 | petastorm~=0.11
51 | horovod[pytorch,spark]>=0.22
52 | b360sparkdl>=1.0
53 |
54 | xai = ## install for explainability
55 | AI.Models.Explainer~=6.0
56 | captum>=0.2
57 |
58 | databricks = ## install on Databricks
59 | mlflow>=1.19
60 | petastorm~=0.11
61 |
62 | aml = ## install on Azure ML
63 | azureml-core>=1.32
64 | mlflow>=1.19
65 | azureml-mlflow>=1.32
66 |
67 | hdi = ## install on HDInsights
68 | pyspark~=2.4.5
69 | numpy<1.20.0
70 | pyarrow~=0.17.1
71 |
72 | test = ## install before test runs
73 | pytest
74 | pytest-cov
75 | pylint
76 | pylint-junit
77 |
78 | dev = ## install for PPE, latest
79 | AI.Models.Explainer
80 | captum
81 | imbalanced-learn
82 | matplotlib
83 | scikit-learn
84 | pandas
85 | numpy
86 | torch
87 |
88 |
89 | # -----------------------------------------------------------------------------
90 | # Pylama Configurations
91 | # -----------------------------------------------------------------------------
92 | # Documentation: https://pylama.readthedocs.io/en/latest/#command-line-options
93 | [pylama]
94 | format = pylint
95 | skip = */.tox/*,*/.env/*
96 | linters = isort,mccabe,pycodestyle,pydocstyle,pyflakes,pylint
97 | ignore = D202,D203,D213,D406,D407,D413,D415,D417
98 |
99 |
100 | # -----------------------------------------------------------------------------
101 | # Linter-Specific Configurations
102 | # -----------------------------------------------------------------------------
103 | # Possible settings: https://github.com/timothycrosley/isort/wiki/isort-Settings
104 | [pylama:isort]
105 | line_length = 120
106 | multi_line_output = 0
107 | combine_star = True
108 | use_parentheses = True
109 | combine_as_imports = True
110 |
111 | # Used by isort command
112 | [isort]
113 | line_length = 120
114 | multi_line_output = 0
115 | combine_star = True
116 | use_parentheses = True
117 | combine_as_imports = True
118 |
119 | # Source code: https://github.com/pycqa/mccabe
120 | [pylama:mccabe]
121 |
122 | # Codes: https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes
123 | [pylama:pycodestyle]
124 | max_line_length = 120
125 |
126 | # Used by auto-formatters
127 | [pycodestyle]
128 | max_line_length = 120
129 |
130 | # Codes: http://www.pydocstyle.org/en/5.0.1/error_codes.html
131 | [pylama:pydocstyle]
132 |
133 | # Source code: https://github.com/PyCQA/pyflakes
134 | [pylama:pyflakes]
135 | max_line_length = 120
136 | statistics = True
137 | doctests = False
138 | builtins = _
139 |
140 | # Codes: https://docs.pylint.org/en/1.6.0/features.html
141 | # Default settings: https://github.com/PyCQA/pylint/blob/master/pylintrc
142 | [pylama:pylint]
143 | max_line_length = 120
144 | logging_format_style = new
145 | attr_rgx = [a-z_][a-z0-9_]{,30}$
146 | variable_rgx = [a-z_][a-z0-9_]{,30}$
147 | argument_rgx = [a-z_][a-z0-9_]{,30}$
148 | class_attribute_rgx = ([A-Za-z_][A-Za-z0-9_]{,30}|(__.*__))$
149 | # Modules whose attributes are generated at runtime and thus attributes cannot be found using static analysis:
150 | ignored_modules =
151 | pyspark.sql.functions, torch, numpy
152 |
153 |
154 | # -----------------------------------------------------------------------------
155 | # File-Specific Configurations
156 | # -----------------------------------------------------------------------------
157 | [pylama:*tests/*.py]
158 | ignore = C0114,C0115,C0116,C0302,C0321,D,R0902,R0903,R0904,W0612,W0613,C0103,R0914
159 |
160 | [pylama:*caspr/models/lstm_autoencoder_sequence.py]
161 | ignore = C0103
162 |
163 | [pylama:*caspr/models/attention_mechanisms.py]
164 | ignore = C0103
165 |
166 | [pylama:*caspr/utils/train.py]
167 | ignore = W0613
168 |
169 | [pylama:*caspr/utils/spark/large/train.py]
170 | ignore = E1102, E1121
171 |
172 | [pylama:*caspr/utils/spark/large/score.py]
173 | ignore = E1121
174 |
175 | [pylama:*caspr/utils/preprocess.py]
176 | ignore = R0913, R0914
177 |
178 | [pylama:*caspr/utils/spark/preprocess.py]
179 | ignore = R0913, R0914, W0640
180 |
181 | [pylama:*caspr/utils/explain/CASPRExplainer.py]
182 | ignore = C0103, R0902, R0913, W0221
183 |
184 | [pylama:*caspr/utils/explain/utils.py]
185 | ignore = R0914
186 |
187 | [pylama:*caspr/utils/segmentation/pandas.py]
188 | ignore = W0703, W0102, R0913, R0914, W0612
189 |
190 | [pylama:*caspr/utils/segmentation/dec_utils.py]
191 | ignore = E1102, R0914
192 |
193 | [pylama:*setup.py]
194 | ignore = A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z
195 | # skip = 1 # Not currently enforced
196 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | # replaced by AI.Common build template
4 | auto_replaced = "__version__"
5 |
6 | # minor trick to circumvent version warning when building manually
7 | version = None if 'version' in auto_replaced else auto_replaced
8 |
9 | setup(version=version)
10 |
--------------------------------------------------------------------------------