├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── README.rst ├── SECURITY.md ├── SUPPORT.md ├── caspr ├── __init__.py ├── data │ ├── __init__.py │ ├── common_dataset.py │ └── load.py ├── models │ ├── README.md │ ├── __init__.py │ ├── attention_mechanisms.py │ ├── convolutional_aggregation.py │ ├── dec.py │ ├── dense_bn_dropout.py │ ├── embedding_layer.py │ ├── factory.py │ ├── lstm_autoencoder_sequence.py │ ├── lstm_decoder.py │ ├── lstm_timeseries_tpa_attention.py │ ├── mlp.py │ ├── model_wrapper.py │ ├── multi_layer_lstm.py │ ├── transformer.py │ ├── unified_encoder.py │ └── unified_transformer_encoder.py └── utils │ ├── __init__.py │ ├── early_stopping.py │ ├── estimate_parameters.py │ ├── explain │ ├── CASPRExplainer.py │ ├── __init__.py │ └── utils.py │ ├── horovod │ ├── __init__.py │ └── train.py │ ├── metrics.py │ ├── noise.py │ ├── onnx.py │ ├── preprocess.py │ ├── score.py │ ├── segmentation │ ├── __init__.py │ ├── dec_utils.py │ └── pandas.py │ ├── spark │ ├── __init__.py │ ├── large │ │ ├── __init__.py │ │ ├── score.py │ │ └── train.py │ ├── preprocess.py │ └── score.py │ └── train.py ├── docs ├── PR_Guidelines.md └── images │ ├── caspr-logo.png │ └── caspr-poster.png ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pickle 3 | *.pth 4 | *.p 5 | *.ipynb 6 | *.csv 7 | .amlignore 8 | aml_config 9 | .git 10 | .vscodeignore 11 | azureml-logs 12 | .azureml 13 | outputs 14 | azureml-setup 15 | *:Zone.Identifier* 16 | 17 | # VSCode stuff 18 | **/*.prefs 19 | **/*.project 20 | **/*.classpath 21 | .[Vv]scode 22 | .idea/ 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Distribution / packaging 33 | .Python 34 | build/ 35 | !.azure-pipelines/build/ 36 | develop-eggs/ 37 | dist/ 38 | downloads/ 39 | eggs/ 40 | .eggs/ 41 | lib/ 42 | lib64/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .coverage 66 | .coverage.* 67 | .cache 68 | nosetests.xml 69 | coverage.xml 70 | *.cover 71 | .hypothesis/ 72 | .pytest_cache/ 73 | 74 | # Translations 75 | *.mo 76 | *.pot 77 | 78 | # Django stuff: 79 | *.log 80 | local_settings.py 81 | db.sqlite3 82 | 83 | # Flask stuff: 84 | instance/ 85 | .webassets-cache 86 | 87 | # Scrapy stuff: 88 | .scrapy 89 | 90 | # Sphinx documentation 91 | docs/_build/ 92 | 93 | # PyBuilder 94 | target/ 95 | 96 | # Jupyter Notebook 97 | .ipynb_checkpoints 98 | 99 | # pyenv 100 | .python-version 101 | 102 | # celery beat schedule file 103 | celerybeat-schedule 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .vs/* 130 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | title: CASPR 3 | message: "Please use this information to cite CASPR in 4 | research or other publications." 5 | authors: 6 | - given-names: Pin-Jung 7 | family-names: Chen 8 | email: pinjung.chen@microsoft.com 9 | affiliation: Microsoft Corporation 10 | - given-names: Sahil 11 | family-names: Bhatnagar 12 | email: sahil.bhatnagar@microsoft.com 13 | affiliation: Microsoft Corporation 14 | - given-names: Damian Konrad 15 | family-names: Kowalczyk 16 | email: damian.kowalczyk@microsoft.com 17 | affiliation: Microsoft Corporation 18 | - given-names: Mayank 19 | family-names: Shrivastava 20 | email: mayank.shrivastava@microsoft.com 21 | affiliation: Microsoft Corporation 22 | - given-names: Sagar 23 | family-names: Goyal 24 | email: goyalsagar@outlook.com 25 | 26 | date-released: 2022-11-16 27 | repository-code: "https://github.com/microsoft/CASPR" 28 | license: "MIT" 29 | keywords: 30 | - deep learning 31 | - machine learning 32 | - tabular data 33 | 34 | version: 0.2.6 35 | doi: 10.48550/arXiv.2211.09174 36 | references: 37 | - type: article 38 | authors: 39 | - given-names: Pin-Jung 40 | family-names: Chen 41 | email: pinjung.chen@microsoft.com 42 | affiliation: Microsoft Corporation 43 | - given-names: Sahil 44 | family-names: Bhatnagar 45 | email: sahil.bhatnagar@microsoft.com 46 | affiliation: Microsoft Corporation 47 | - given-names: Damian Konrad 48 | family-names: Kowalczyk 49 | email: damian.kowalczyk@microsoft.com 50 | affiliation: Microsoft Corporation 51 | - given-names: Mayank 52 | family-names: Shrivastava 53 | email: mayank.shrivastava@microsoft.com 54 | affiliation: Microsoft Corporation 55 | - given-names: Sagar 56 | family-names: Goyal 57 | email: goyalsagar@outlook.com 58 | title: "CASPR: Customer Activity Sequence-based Prediction and Representation" 59 | year: 2022 60 | journal: ArXiv 61 | doi: 10.48550/arXiv.2211.09174 62 | url: https://arxiv.org/abs/2211.09174 63 | 64 | abstract: >- 65 | Tasks critical to enterprise profitability, such as customer churn prediction, fraudulent account detection or customer lifetime value estimation, are often tackled by models trained on features engineered from customer data in 
tabular format. Application-specific feature engineering adds development, operationalization and maintenance costs over time. Recent advances in representation learning present an opportunity to simplify and generalize feature engineering across applications. When applying these advancements to tabular data researchers deal with data heterogeneity, variations in customer engagement history or the sheer volume of enterprise datasets. In this paper, we propose a novel approach to encode tabular data containing customer transactions, purchase history and other interactions into a generic representation of a customer's association with the business. We then evaluate these embeddings as features to train multiple models spanning a variety of applications. CASPR, Customer Activity Sequence-based Prediction and Representation, applies Transformer architecture to encode activity sequences to improve model performance and avoid bespoke feature engineering across applications. Our experiments at scale validate CASPR for both small and large enterprise applications. 66 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | We're looking for your help to improve CASPR (bug fixes, new features, documentation, etc). 4 | 5 | ## Contribute a code change 6 | * Start by reading the [CASPR Paper](https://arxiv.org/abs/2211.09174) 7 | * If your change is non-trivial or introduces new public facing APIs (discussed in more detail below) please use the [feature request issue template](https://github.com/microsoft/CASPR/issues/new?template=feature_request.md) to discuss it with the team and get consensus on the basic design and direction first. For all other changes, you can directly create a pull request (PR) and we'll be happy to take a look. 8 | * Make sure your PR adheres to the [PR Guidelines](./docs/PR_Guidelines.md) established by the team. 9 | * If you're unsure about any of the above and want to contribute, you're welcome to start a discussion with the team. 10 | 11 | ## Process details 12 | 13 | Please search the [issue tracker](https://github.com/microsoft/CASPR/issues) for a similar idea first: there may already be an issue you can contribute to. 14 | 15 | 1. **Create Issue** 16 | To propose a new feature or API please start by filing a new issue in the [issue tracker](https://github.com/microsoft/CASPR/issues). 17 | Include as much detail as you have. It's fine if it's not a complete design: just a summary and rationale is a good starting point. 18 | 19 | 2. **Discussion** 20 | We'll keep the issue open for community discussion until it has been resolved or is deemed no longer relevant. 
21 | Note that if an issue isn't a high priority or has many open questions then it might stay open for a long time. 22 | 23 | 3. **Owner Review** 24 | The CASPR team will review the proposal and either approve or close the issue based on whether it broadly aligns with the CASPR Roadmap and contribution guidelines. 25 | 26 | 4. **Implementation** 27 | * A feature can be implemented by you, the CASPR team, or other community members. Code contributions are greatly appreciated: feel free to work on any reviewed feature you proposed, or choose one in the backlog and send us a PR. If you are new to the project and want to work on an existing issue, we recommend starting with issues that are tagged with “good first issue”. Please let us know in the issue comments if you are actively working on implementing a feature so we can ensure it's assigned to you. 28 | * Unit tests: New code *must* be accompanied by unit tests. 29 | * Documentation and sample updates: If the PR affects any of the documentation or samples then include those updates in the same PR. 30 | 31 | * Once a feature is complete and tested according to the contribution guidelines follow these steps: 32 | * Follow the [standard GitHub process to open a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests) 33 | * Add reviewers who have context from the earlier discussion. If you can't find a reviewer, add 'microsoft/CASPR'. 34 | * Note: After creating a pull request, you might not see a build getting triggered right away. One of the 35 | CASPR team members can trigger the build for you. 36 | 37 | ## Licensing guidelines 38 | 39 | This project welcomes contributions and suggestions. Most contributions require you to 40 | agree to a Contributor License Agreement (CLA) declaring that you have the right to, 41 | and actually do, grant us the rights to use your contribution. For details, visit 42 | https://cla.microsoft.com. 43 | 44 | When you submit a pull request, a CLA-bot should automatically determine whether you need 45 | to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the 46 | instructions provided by the bot. You will only need to do this once across all repositories using our CLA. 47 | 48 | ## Code of conduct 49 | 50 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 51 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 52 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 53 | 54 | ## Report a security issue 55 | 56 | Security issues and bugs should be reported privately, via email, to the Microsoft Security 57 | Response Center (MSRC) at [secure@microsoft.com](mailto:secure@microsoft.com). You should 58 | receive a response within 24 hours. If for some reason you do not, please follow up via 59 | email to ensure we received your original message. Further information, including the 60 | [MSRC PGP](https://technet.microsoft.com/en-us/security/dn606155) key, can be found in 61 | the [Security TechCenter](https://technet.microsoft.com/en-us/security/default). 62 | 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.rst 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | **CASPR is a transformer-based framework for deep learning from sequential data in tabular format, most common in business applications.** 5 | 6 |

7 | Tasks critical to enterprise profitability, such as customer churn prediction, fraudulent account detection or customer lifetime value estimation, are often tackled by models trained on features engineered from customer data in tabular format. Application-specific feature engineering, however, adds development, operationalization and maintenance costs over time. Recent advances in representation learning present an opportunity to simplify and generalize feature engineering across applications.
8 |
9 | With **CASPR**, we propose a novel approach to encode sequential data in tabular format (e.g., customer transactions, purchase history and other interactions) into a generic representation of a subject's (e.g., customer's) association with the business. We evaluate these embeddings as features to train multiple models spanning a variety of applications (see: [paper](https://arxiv.org/abs/2211.09174)). CASPR, Customer Activity Sequence-based Prediction and Representation, applies the transformer architecture to encode activity sequences, improving model performance and avoiding bespoke feature engineering across applications. Our experiments at scale validate CASPR for both small and large enterprise applications.
10 |
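For a quick sense of the workflow, the sketch below builds a CASPR model with the `CASPRFactory` helper from `caspr/models/factory.py`. The column names and category counts are hypothetical, and the default hyperparameters are assumed:

```python
from caspr.models.factory import CASPRFactory

# Hypothetical schema: one categorical and one continuous activity column,
# observed over 10 time steps per customer, with no static (non-sequential) columns.
cat_cols = ["activity_type"]
cont_cols = ["amount"]
seq_cols = ["activity_type", "amount"]
non_seq_cols = []
num_activities = {"activity_type": 50}  # distinct categories per categorical column

factory = CASPRFactory(cat_cols, num_activities, cont_cols, seq_cols, non_seq_cols, seq_len=10)

# Either supported architecture can be requested by name.
model = factory.create("TransformerAutoEncoder", device="cpu")
```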

11 |
12 |
17 |
18 | ## Getting Started & Resources
19 |
20 | * **CASPR: Customer Activity Sequence-based Prediction and Representation** (NeurIPS 2022, New Orleans: Tabular Representation Learning)
21 | - [paper](https://arxiv.org/abs/2211.09174)
22 | - [poster](docs/images/caspr-poster.png)
23 |
24 | * **Build**
25 |
26 | - prerequisites: ```python==3.9, setuptools```
27 | - building the wheel: ```python setup.py build bdist_wheel```
28 |
29 | * **Installation**
30 |
31 | ```
32 | (now)
33 | pip install .\dist\AI.Models.CASPR-<version>.whl[<modifiers>]
34 |
35 | (future)
36 | pip install AI.Models.CASPR[<modifiers>]
37 | ```
38 |
39 | use any of the modifiers below to customize the installation for the target system / use case:
40 | ```
41 | horovod - for distributed training and inference on Horovod
42 | databricks - for distributed training and inference on Databricks
43 | aml - for (distributed) training and inference on Azure ML
44 | hdi - for execution on Azure HD Insights
45 | xai - to enable explainability
46 | test - for extended test execution
47 | dev - for development purposes only
48 | ```
49 | * **Examples**
50 |
51 | (TODO: can we point to a well commented one of our examples w/ or w/o data?)
52 |
53 | ## Contributions and Feedback
54 |
55 | We welcome contributions! Please see the [contribution guidelines](CONTRIBUTING.md).
56 |
57 | For feature requests or bug reports, please file a [GitHub Issue](https://github.com/Microsoft/CASPR/issues).
58 |
59 | ## Code of Conduct
60 |
61 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
62 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
63 | or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
64 |
65 | ## License
66 |
67 | This project is licensed under the [MIT License](LICENSE).
68 |
69 | ---
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | AI.Models.CASPR
2 | ==================
3 | This is the deep learning CASPR model.
4 |
5 | This package has been tested with Python 3.7
6 |
7 | Usage
8 | -----
9 | You need access to the Business360 artifact feed on Azure DevOps.
10 |
11 | | pip install twine keyring artifacts-keyring
12 | | pip install AI.Models.CASPR --index-url=https://powerbi.pkgs.visualstudio.com/_packaging/Business360/pypi/simple
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 
22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /caspr/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # -------------------------------------------------------------------------- 4 | 5 | """A series of modules for the CASPR deep learning AI model. 6 | 7 | Provide a longer general description of the modules in this folder here. 8 | 9 | Modules: 10 | :module1_name: A description of this specific module. 11 | """ 12 | 13 | __VERSION = "0.9.dev3" # arbitrary low dev version for local build 14 | -------------------------------------------------------------------------------- /caspr/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | 3 | # 4 | 5 | # Unless required by applicable law or agreed to in writing, software 6 | 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | 11 | # See the License for the specific language governing permissions and 12 | 13 | # limitations under the License. 14 | 15 | # 16 | 17 | # ============================================================================== 18 | -------------------------------------------------------------------------------- /caspr/data/common_dataset.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import pandas as pd 4 | import torch 5 | from torch.utils.data.dataloader import default_collate 6 | 7 | 8 | class CommonDataset(torch.utils.data.Dataset): 9 | def __init__(self, df, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, tgt_id=[]): 10 | self.len = df.shape[0] 11 | self.seq_cols = seq_cols if seq_cols else [] 12 | 13 | self.non_seq_cols = non_seq_cols 14 | self.output_col = output_col 15 | 16 | self.seq_contX = torch.tensor(df[[item for item in seq_cols if item in cont_cols]].values, dtype=torch.float32) 17 | self.seq_catX = torch.tensor(df[[item for item in seq_cols if item in cat_cols]].values, dtype=torch.long) 18 | 19 | self.seq_contX = self.seq_contX.reshape( 20 | (self.seq_contX.shape[0], int(self.seq_contX.shape[1]/time_steps), time_steps)) 21 | self.seq_contX = self.seq_contX.permute(0, 2, 1) 22 | 23 | self.seq_catX = self.seq_catX.reshape( 24 | (self.seq_catX.shape[0], int(self.seq_catX.shape[1]/time_steps), time_steps)) 25 | self.seq_catX = self.seq_catX.permute(0, 2, 1) 26 | 27 | self.non_seq_catX = torch.tensor( 28 | df[[item for item in non_seq_cols if item in cat_cols]].values, dtype=torch.long) 29 | self.non_seq_contX = torch.tensor( 30 | df[[item for item in non_seq_cols if item in cont_cols]].values, dtype=torch.float32) 31 | 32 | self.y = torch.tensor(df[output_col].values, dtype=torch.float32) 33 | 34 | self.tgt_id = df[tgt_id].values 35 | 36 | @classmethod 37 | def for_inference(cls, continuous: pd.Series, categorical: pd.Series, seq_cols, non_seq_cols, cat_cols, cont_cols, time_steps): 38 | cont_df = pd.DataFrame(continuous.values.tolist(), columns=cont_cols) 39 | cat_df = pd.DataFrame(categorical.values.tolist(), columns=cat_cols) 40 | 41 | df 
= pd.concat([cont_df, cat_df], axis=1) 42 | return cls(df, seq_cols, non_seq_cols, [], cat_cols, cont_cols, time_steps, tgt_id=[]) 43 | 44 | def __getitem__(self, index): 45 | return [self.tgt_id[index], self.y[index], self.seq_catX[index], self.seq_contX[index], self.non_seq_catX[index], self.non_seq_contX[index]] 46 | 47 | def __len__(self): 48 | return self.len 49 | 50 | 51 | def id_collate(batch): 52 | ids = [] 53 | new_batch = [] 54 | for _batch in batch: 55 | ids.append(_batch[0]) 56 | new_batch.append(_batch[1:]) 57 | ids = np.stack(ids, axis=0) 58 | return tuple([ids] + default_collate(new_batch)) 59 | -------------------------------------------------------------------------------- /caspr/data/load.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from sklearn.model_selection import train_test_split 3 | from torch.utils.data import DataLoader 4 | from torch.utils.data.distributed import DistributedSampler 5 | 6 | from caspr.data.common_dataset import CommonDataset, id_collate 7 | 8 | 9 | def transform_and_load(batch, device, tgt_id_cols, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 10 | """ 11 | Transforms a batch of feature tensors from Petastorm, into input tensors for CASPR, then loads onto chosen device. 12 | """ 13 | if not batch: 14 | raise ValueError("non-empty batch of tensors required") 15 | 16 | if not int(time_steps) > 0: 17 | raise ValueError("time_steps should be a positive integer") 18 | 19 | batch_size = batch[list(batch.keys())[0]].shape[0] 20 | 21 | seq_contX_cols = [item for item in seq_cols if item in cont_cols] 22 | if seq_contX_cols: 23 | seq_contX = torch.cat([batch[c] for c in seq_contX_cols], 0).float().to(device) 24 | seq_contX = seq_contX.reshape(-1, time_steps, batch_size).T 25 | else: 26 | seq_contX = torch.zeros((batch_size, time_steps, 0), device=device).float() 27 | 28 | seq_catX_cols = [item for item in seq_cols if item in cat_cols] 29 | if seq_catX_cols: 30 | seq_catX = torch.cat([batch[c] for c in seq_catX_cols], 0).long().to(device) 31 | seq_catX = seq_catX.reshape(-1, time_steps, batch_size).T 32 | else: 33 | seq_catX = torch.zeros((batch_size, time_steps, 0), device=device).long() 34 | 35 | non_seq_catX_cols = [item for item in non_seq_cols if item in cat_cols] 36 | if non_seq_catX_cols: 37 | non_seq_catX = torch.cat([batch[c] for c in non_seq_catX_cols], 0).long().to(device) 38 | non_seq_catX = non_seq_catX.reshape(len(non_seq_catX_cols), batch_size).T 39 | else: 40 | non_seq_catX = torch.zeros(batch_size, 0, device=device).long() 41 | 42 | non_seq_contX_cols = [item for item in non_seq_cols if item in cont_cols] 43 | if non_seq_contX_cols: 44 | non_seq_contX = torch.cat([batch[c] for c in non_seq_contX_cols], 0).float().to(device) 45 | non_seq_contX = non_seq_contX.reshape(len(non_seq_contX_cols), batch_size).T 46 | else: 47 | non_seq_contX = torch.zeros(batch_size, 0, device=device).float() 48 | 49 | if output_col: 50 | y = torch.cat([batch[c] for c in output_col], 0).to(device) 51 | y = y.reshape((len(output_col), -1)).T 52 | else: 53 | y = torch.zeros(batch_size, 0, device=device).float() 54 | 55 | if tgt_id_cols: 56 | tgt_id = torch.cat([batch[c] for c in tgt_id_cols], 0).long().cpu() 57 | tgt_id = tgt_id.reshape(len(tgt_id_cols), batch_size).T.numpy() 58 | else: 59 | tgt_id = torch.zeros(batch_size, 0).long().cpu().numpy() 60 | 61 | return tgt_id, y, seq_catX, seq_contX, non_seq_catX, non_seq_contX 62 | 63 | 64 | def init_datasets(df, seq_cols, non_seq_cols, 
                  output_col, cat_cols, cont_cols, seq_len, test_ratio=0.2, seed=None):
65 |     """
66 |     Splits an incoming columnar dataframe into CASPR train and validation datasets
67 |     """
68 |
69 |     train_pd, val_pd = train_test_split(df, test_size=test_ratio, random_state=seed)
70 |
71 |     print(f"train: {len(train_pd)}, val: {len(val_pd)}")
72 |
73 |     dataset_train = CommonDataset(
74 |         train_pd, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, seq_len)
75 |
76 |     dataset_val = CommonDataset(
77 |         val_pd, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, seq_len)
78 |
79 |     return dataset_train, dataset_val
80 |
81 |
82 | def init_loaders(ds_train, ds_val, batch_size, num_workers=0, shuffle=False, pin_memory=True, world_size=1, rank=0):
83 |     """
84 |     Initializes train and validation data loaders. The loaders support distributed sampling when world_size > 1.
85 |     """
86 |
87 |     print("Initializing dataloaders... Replica: %d of %d" % (rank + 1, world_size))
88 |
89 |     val_sampler = DistributedSampler(ds_val,
90 |                                      num_replicas=world_size, rank=rank, shuffle=shuffle) if world_size > 1 else None
91 |
92 |     val_loader = DataLoader(ds_val, pin_memory=pin_memory,
93 |                             batch_size=batch_size, num_workers=num_workers, sampler=val_sampler, collate_fn=id_collate)
94 |
95 |     train_sampler = DistributedSampler(ds_train,
96 |                                        num_replicas=world_size, rank=rank, shuffle=shuffle) if world_size > 1 else None
97 |
98 |     train_loader = DataLoader(ds_train, pin_memory=pin_memory,
99 |                               batch_size=batch_size, num_workers=num_workers, sampler=train_sampler, collate_fn=id_collate)
100 |
101 |     return train_loader, val_loader
102 |
--------------------------------------------------------------------------------
/caspr/models/README.md:
--------------------------------------------------------------------------------
1 | Model architectures should follow these guidelines to support explainability.
2 |
3 | Basic changes made:
4 | 1. Every model class should have the flags - explain, interpretable_emb_non_seq and interpretable_emb_seq
5 |
6 | 2. The nn.Embedding layers and the dropout after that need to be modularised
7 | out of the model; the Seq_Cat_Embedding and Non_Seq_Cat_Embedding classes present in the Embedding_Layers.py file should be used for them
8 |
9 | 3. The input to every forward function should be a single concatenated vector
10 |
11 | 4. The activate_explainer_mode and deactivate_explainer_mode functions should be a part of every model class (also every model wrapper class)
12 |
13 |
14 | """
15 | Some notes regarding the explainer:
16 | 1. When we join multiple models to form a new model -
17 | use the activate_explainer_mode functions to call the
18 | respective functions for all constituent sub_model classes
19 |
20 | 2. Right now the architecture supports only model wrappers which join the model in a vertical fashion (the case for all our models for now)
21 |
22 | 3. The explainer modes are activated by the DLExplainer module and
23 | also deactivated by it
24 |
25 | 4. The indices-to-embedding conversion happens in the DLExplainer module
26 |
27 |
28 | """
29 |
30 | Please refer to the mlp_autoencoder.py file for an example.
--------------------------------------------------------------------------------
/caspr/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation. All rights reserved.
2 | 3 | # 4 | 5 | # Unless required by applicable law or agreed to in writing, software 6 | 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | 11 | # See the License for the specific language governing permissions and 12 | 13 | # limitations under the License. 14 | 15 | # 16 | 17 | # ============================================================================== 18 | -------------------------------------------------------------------------------- /caspr/models/attention_mechanisms.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """Attention mechanisms base class.""" 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class MultiHeadAttentionLayer(nn.Module): # noqa: W0223 9 | def __init__(self, hid_dim, n_heads, dropout): 10 | """Initialize model with params.""" 11 | super().__init__() 12 | 13 | assert hid_dim % n_heads == 0 14 | 15 | self.hid_dim = hid_dim 16 | self.n_heads = n_heads 17 | self.head_dim = hid_dim // n_heads 18 | 19 | self.fc_q = nn.Linear(hid_dim, hid_dim) 20 | self.fc_k = nn.Linear(hid_dim, hid_dim) 21 | self.fc_v = nn.Linear(hid_dim, hid_dim) 22 | 23 | self.fc_o = nn.Linear(hid_dim, hid_dim) 24 | 25 | self.dropout = nn.Dropout(dropout) 26 | 27 | self.register_buffer('scale', torch.sqrt(torch.FloatTensor([self.head_dim]))) 28 | 29 | def forward(self, query, key, value, mask=None): 30 | """Run a forward pass of model over the data.""" 31 | batch_size = query.shape[0] 32 | 33 | # query = [batch size, query len, hid dim] 34 | # key = [batch size, key len, hid dim] 35 | # value = [batch size, value len, hid dim] 36 | 37 | Q = self.fc_q(query) 38 | K = self.fc_k(key) 39 | V = self.fc_v(value) 40 | 41 | # Q = [batch size, query len, hid dim] 42 | # K = [batch size, key len, hid dim] 43 | # V = [batch size, value len, hid dim] 44 | 45 | Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 46 | K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 47 | V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) 48 | 49 | # Q = [batch size, n heads, query len, head dim] 50 | # K = [batch size, n heads, key len, head dim] 51 | # V = [batch size, n heads, value len, head dim] 52 | 53 | energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale 54 | 55 | # energy = [batch size, n heads, query len, key len] 56 | 57 | if mask is not None: 58 | energy = energy.masked_fill(mask == 0, -1e10) 59 | 60 | attention = torch.softmax(energy, dim=-1) 61 | 62 | # attention = [batch size, n heads, query len, key len] 63 | 64 | x = torch.matmul(self.dropout(attention), V) 65 | 66 | # x = [batch size, n heads, query len, head dim] 67 | 68 | x = x.permute(0, 2, 1, 3).contiguous() 69 | 70 | # x = [batch size, query len, n heads, head dim] 71 | 72 | x = x.view(batch_size, -1, self.hid_dim) 73 | 74 | # x = [batch size, query len, hid dim] 75 | 76 | x = self.fc_o(x) 77 | 78 | # x = [batch size, query len, hid dim] 79 | 80 | return x, attention 81 | 82 | 83 | class MultiHeadAttentionLSTMWrapper(nn.Module): # noqa: W0223 84 | def __init__(self, n_head, d_model, dropout=0.1): 85 | """Initialize model with params.""" 86 | super().__init__() 87 | 88 | self.self_attn_layer_norm = nn.LayerNorm(d_model) 89 | self.multi_head_attn = MultiHeadAttentionLayer(hid_dim=d_model, n_heads=n_head, dropout=dropout) 90 | self.dropout = nn.Dropout(dropout) 91 | 
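    # The forward pass below applies post-norm residual attention: the multi-head
    # attention output is dropout-regularized, added back onto the query, and
    # layer-normalized; the sequence dimension is then summed away, yielding one
    # fixed-size context vector of shape [batch size, d_model] per example.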
92 |     def forward(self, q, k, v, mask=None):
93 |         """Run a forward pass of model over the data."""
94 |         _q, _ = self.multi_head_attn(q, k, v, mask=mask)
95 |         # dropout, residual connection and layer norm
96 |         q = self.self_attn_layer_norm(q + self.dropout(_q))
97 |
98 |         context_vector = torch.sum(q, 1)
99 |         return context_vector
100 |
101 |
102 | class BahdanauAttention(nn.Module): # noqa: W0223
103 |     def __init__(self, hidden_size, num_directions=1):
104 |         """Initialize model with params."""
105 |
106 |         super().__init__()
107 |         self.num_directions = num_directions
108 |         self.hidden_size = hidden_size
109 |         self.fc_encoder = nn.Linear(self.num_directions*self.hidden_size, self.hidden_size, bias=False)
110 |         self.attnHidden = nn.Linear(self.hidden_size, 1)
111 |
112 |     def forward(self, enc_outputs):
113 |         """Run a forward pass of model over the data."""
114 |         tempX = torch.tanh(self.fc_encoder(enc_outputs))
115 |
116 |         alignment_scores = self.attnHidden(tempX)
117 |
118 |         attn_weights = F.softmax(alignment_scores, dim=1)
119 |         attn_weights = attn_weights.permute(0, 2, 1)
120 |
121 |         context_vector = torch.bmm(attn_weights, enc_outputs)
122 |
123 |         return context_vector
124 |
--------------------------------------------------------------------------------
/caspr/models/convolutional_aggregation.py:
--------------------------------------------------------------------------------
1 | """CNN based layer base class."""
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 |
8 | class ConvAggregation(nn.Module): # noqa: W0223
9 |     """A CNN based layer that reduces the size of our input.
10 |
11 |     It treats the sequential input like an image with a single channel and performs learned aggregation
12 |     """
13 |
14 |     def __init__(self, kernel_size=(3, 3), stride=(2, 2), max_pool_size=(2, 2), dropout_size=0.):
15 |         """Initialise the CNN layers.
16 |
17 |         Args:
18 |             kernel_size : Tuple which determines the size of the CNN kernel
19 |             stride : Tuple which determines the size of the strides in the x and y direction
20 |             max_pool_size : Tuple which determines the size of the max-pooling function
21 |             dropout_size : Value of the dropout applied after the entire processing
22 |         """
23 |         super().__init__()
24 |         self.in_channels = 1
25 |         self.out_channels = 1
26 |
27 |         self.conv_layer = nn.Conv2d(in_channels=self.in_channels,
28 |                                     out_channels=self.out_channels, kernel_size=kernel_size, stride=stride)
29 |         self.max_pool = nn.MaxPool2d(max_pool_size)
30 |         self.conv_dropout = nn.Dropout(dropout_size)
31 |
32 |     def forward(self, input_tensor):
33 |         """Run a forward pass of model over the data."""
34 |
35 |         # The CNN by default accepts the input as (batch_size, in_channels, height_img, width_img).
36 |         # We treat the sequential input as an image, but we need an additional dimension to correspond to in_channels.
37 |         # Therefore we unsqueeze an extra dimension here
38 |
39 |         input_tensor = torch.unsqueeze(input_tensor, 1)
40 |
41 |         input_tensor = torch.tanh(self.conv_layer(input_tensor))
42 |         input_tensor = self.max_pool(input_tensor)
43 |
44 |         # The CNN by default outputs as (batch_size, out_channels, height_img, width_img).
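        # With no padding, each spatial dimension shrinks as out = floor((in - kernel) / stride) + 1,
        # and the max-pool then divides it by the pool size. Illustrative example with the default
        # kernel (3, 3), stride (2, 2) and pool (2, 2): an input of (batch_size, 1, 15, 10) becomes
        # (batch_size, 1, 7, 4) after the convolution and (batch_size, 1, 3, 2) after pooling.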
45 | # We need to squeeze away the dimension we had added earlier to remain consistent 46 | 47 | input_tensor = input_tensor.squeeze(1) 48 | output_tensor = self.conv_dropout(input_tensor) 49 | 50 | return output_tensor 51 | -------------------------------------------------------------------------------- /caspr/models/dec.py: -------------------------------------------------------------------------------- 1 | """CASPR deep embedding clustering class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn import Parameter 6 | 7 | from caspr.utils.preprocess import get_nonempty_tensors 8 | 9 | 10 | class ClusterAssignment(nn.Module): # noqa: W0223 11 | def __init__(self, 12 | cluster_number, 13 | embedding_dimension, 14 | alpha=1.0, 15 | cluster_centers=None): 16 | """Handle the soft assignment. 17 | 18 | For a description see in 3.1.1. in Xie/Girshick/Farhadi, where the Student's t-distribution 19 | is used to measure similarity between feature vector and each cluster centroid. 20 | 21 | Args: 22 | cluster_number (int): number of clusters 23 | embedding_dimension (int): embedding dimension of feature vectors 24 | alpha (float): parameter representing the degrees of freedom in the t-distribution, default 1.0 25 | cluster_centers (tensors): clusters centers to initialise, if None then use Xavier uniform 26 | """ 27 | super().__init__() 28 | self.embedding_dimension = embedding_dimension 29 | self.cluster_number = cluster_number 30 | self.alpha = alpha 31 | if cluster_centers is None: 32 | initial_cluster_centers = torch.zeros( 33 | self.cluster_number, 34 | self.embedding_dimension, 35 | dtype=torch.float 36 | ) 37 | nn.init.xavier_uniform_(initial_cluster_centers) 38 | else: 39 | initial_cluster_centers = cluster_centers 40 | self.cluster_centers = Parameter(initial_cluster_centers) 41 | 42 | def forward(self, batch): 43 | """Run a forward pass of model over the data. 44 | 45 | Compute the soft assignment for a batch of feature vectors, returning a batch of assignments for each cluster. 46 | 47 | Args: 48 | batch: FloatTensor of [batch size, embedding dimension] 49 | 50 | Return: 51 | FloatTensor of [batch size, number of clusters] 52 | """ 53 | norm_squared = torch.sum((batch.unsqueeze(1) - self.cluster_centers) ** 2, 2) 54 | numerator = 1.0 / (1.0 + (norm_squared / self.alpha)) 55 | power = float(self.alpha + 1) / 2 56 | numerator = numerator**power 57 | return numerator / torch.sum(numerator, dim=1, keepdim=True) 58 | 59 | 60 | class DEC(nn.Module): # noqa: W0223 61 | def __init__(self, 62 | cluster_number, 63 | hidden_dimension, 64 | enc, 65 | alpha=1): 66 | """Initialize the parts of DEC algorithm. 67 | 68 | as described in Xie/Girshick/Farhadi; this includes the AutoEncoder stage and the ClusterAssignment stage. 69 | 70 | Args: 71 | cluster_number (int): number of clusters 72 | hidden_dimension (int): hidden dimension, output of the encoder 73 | enc (nn.Module): # noqa: W0223 encoder to use 74 | alpha (float): parameter representing the degrees of freedom in the t-distribution, default 1.0 75 | """ 76 | super().__init__() 77 | self.enc = enc 78 | self.hidden_dimension = hidden_dimension 79 | self.cluster_number = cluster_number 80 | self.alpha = alpha 81 | self.assignment = ClusterAssignment(cluster_number, self.hidden_dimension, alpha) 82 | 83 | def forward(self, *args): 84 | """Compute the cluster assignment. 85 | 86 | Using the ClusterAssignment after running the batch 87 | through the encoder part of the associated AutoEncoder module. 
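        The soft assignment follows the Student's t-distribution kernel computed by
        ClusterAssignment: q_ij is proportional to (1 + ||z_i - mu_j||^2 / alpha)^(-(alpha + 1) / 2),
        normalized over the clusters j, where z_i is the encoder output and mu_j are the
        learned cluster centers.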
88 |
89 |         Args:
90 |             batch: FloatTensor of [batch size, embedding dimension]
91 |
92 |         Return:
93 |             FloatTensor of [batch size, number of clusters]
94 |         """
95 |         return self.assignment(self.enc(*args))
96 |
97 |     def run(self, # noqa : R0913
98 |             y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion): # noqa : W0613
99 |         data = (seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data)
100 |         nonempty_tensors, nonempty_idx = get_nonempty_tensors(data)
101 |         output = self(*nonempty_tensors, nonempty_idx)
102 |         target = _target_distribution(output).detach()
103 |         loss = criterion(output.log(), target) / output.shape[0]
104 |         return output, loss
105 |
106 |
107 | def _target_distribution(batch):
108 |     """Compute the target distribution p_ij, given the batch (q_ij).
109 |
110 |     3.1.3 Equation 3 of Xie/Girshick/Farhadi; this is used with the KL-divergence loss function.
111 |
112 |     Args:
113 |         batch: FloatTensor of [batch size, number of clusters]
114 |
115 |     Return:
116 |         FloatTensor of [batch size, number of clusters]
117 |     """
118 |     weight = (batch ** 2) / torch.sum(batch, 0)
119 |     return (weight.t() / torch.sum(weight, 1)).t()
120 |
--------------------------------------------------------------------------------
/caspr/models/dense_bn_dropout.py:
--------------------------------------------------------------------------------
1 | """CASPR base dense layer class."""
2 |
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 |
6 |
7 | class DenseBnDropout(nn.Module): # noqa: W0223
8 |     """Dense Layers w/ dropout and batch-normalization.
9 |
10 |     A module comprising a repeated sequential structure of [Linear -> ReLU -> Batch Normalisation -> Dropout]
11 |     applied for multiple iterations through it.
12 |     When the input is a 3D tensor - batch_size x seq_len x features
13 |     When the input is a 2D tensor - batch_size x features
14 |     """
15 |
16 |     def __init__(self, lin_layer_sizes, lin_layer_dropouts, input_size):
17 |         """Initialise the layers.
18 |
19 |         Args:
20 |             lin_layer_sizes (list) : sizes of the linear layers being used across multiple iterations
21 |             lin_layer_dropouts (list) : values of the dropout layers across multiple iterations
22 |             input_size (integer) : size of the feature (last) dimension of the input tensor, i.e. batch_size x seq_len x 'input_size' for sequential input
23 |         """
24 |
25 |         super().__init__()
26 |         first_lin_layer = nn.Linear(input_size, lin_layer_sizes[0])
27 |         self.lin_layers = nn.ModuleList([first_lin_layer] +
28 |                                         [nn.Linear(lin_layer_sizes[i],
29 |                                                    lin_layer_sizes[i + 1])
30 |                                          for i in range(len(lin_layer_sizes) - 1)])
31 |         for lin_layer in self.lin_layers:
32 |             nn.init.kaiming_normal_(lin_layer.weight.data)
33 |
34 |         self.dropout_layers = nn.ModuleList([nn.Dropout(p) for p in lin_layer_dropouts])
35 |         self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes])
36 |
37 |     def forward(self, input_tensor):
38 |         """Run a forward pass of model over the data."""
39 |         is_seq = input_tensor.ndim == 3
40 |
41 |         for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers, self.dropout_layers, self.bn_layers):
42 |             input_tensor = F.relu(lin_layer(input_tensor))
43 |             if is_seq:
44 |                 # permute to adjust for the BN internal structure
45 |                 input_tensor = input_tensor.permute(0, 2, 1)
46 |
47 |             input_tensor = bn_layer(input_tensor)
48 |
49 |             if is_seq:
50 |                 # permute back to maintain the original structure required for linear layer
51 |                 input_tensor = input_tensor.permute(0, 2, 1)
52 |
53 |             input_tensor = dropout_layer(input_tensor)
54 |
55 |         output_tensor = input_tensor
56 |         return output_tensor
57 |
--------------------------------------------------------------------------------
/caspr/models/embedding_layer.py:
--------------------------------------------------------------------------------
1 | """CASPR embedding layer base class."""
2 |
3 | import numpy as np
4 | import torch
5 | import torch.nn as nn
6 |
7 |
8 | class CategoricalEmbedding(nn.Module): # noqa: W0223
9 |     """Define embedding layers to convert categorical variable values to continuous embeddings.
10 |
11 |     Uses pytorch defined nn.Embedding layers.
12 |     The incoming data for this class has 3 dimensions - dim(1) is the number of time steps in the sequence
13 |     when used for a seq variable
14 |     When being used for a non-seq variable - data has 2 dimensions
15 |     """
16 |
17 |     def __init__(self, # noqa: R0913
18 |                  emb_dims, emb_dropout, is_seq=False, pretrained_vecs=None, freeze_pretrained=True):
19 |         """Initialise the emb layer class.
20 |
21 |         Args:
22 |             emb_dims : A list of tuples (x, y) which contains the input for the nn.Embedding layers
23 |             emb_dropout : The dropout value for the layers applied after concatenation of all the embeddings
24 |             is_seq : determines if this layer has been initialised for sequential or non-sequential data
25 |             pretrained_vecs : The tensors which contain the pretrained values.
For variables for which we don't have the
26 |                 vecs, we initialise the nn.Embedding layers and backpropagate through them
27 |             freeze_pretrained : This boolean flag determines if we freeze the pretrained embeddings and don't
28 |                 backpropagate through them
29 |         """
30 |
31 |         super().__init__()
32 |
33 |         self.emb_dims = emb_dims
34 |         self.is_seq = is_seq
35 |         self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
36 |         if pretrained_vecs is not None and len(emb_dims) > 0:
37 |             for i, v in enumerate(pretrained_vecs):
38 |                 if v is not None:
39 |                     self.emb_layers[i] = nn.Embedding.from_pretrained(v, freeze=freeze_pretrained)
40 |         self.num_classes = [x for x, _ in emb_dims]
41 |         self.emb_size = np.sum([y for _, y in emb_dims], dtype=np.int32)
42 |         self.emb_dropout_layer = nn.Dropout(emb_dropout)
43 |
44 |     def forward(self, cat_data):
45 |         """Run a forward pass of model over the data."""
46 |         cat_data = cat_data.long()
47 |         # across all rows and column i - useful for batches
48 |         cat_inp = [emb_layer(cat_data[..., i]) for i, emb_layer in enumerate(self.emb_layers)]
49 |         cat_inp = torch.cat(cat_inp, -1)
50 |         cat_inp = self.emb_dropout_layer(cat_inp)
51 |         return cat_inp
52 |
--------------------------------------------------------------------------------
/caspr/models/factory.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import numpy as np
4 | import torch.nn as nn
5 |
6 | from caspr.models.dense_bn_dropout import DenseBnDropout
7 | from caspr.models.lstm_decoder import LSTM_attention_embedding_decoder
8 | from caspr.models.model_wrapper import LSTMAutoencoder, OutputLayer, TransformerAutoEncoder
9 | from caspr.models.transformer import TransformerDecoder, TransformerEncoder
10 | from caspr.models.unified_encoder import UnifiedEncoder
11 | from caspr.models.unified_transformer_encoder import UnifiedTransformerEncoder
12 |
13 | TRANSFORMER = 'TransformerAutoEncoder'
14 | LSTM = 'LSTMAutoencoder'
15 | logger = logging.getLogger(__name__)
16 |
17 | class CASPRFactory:
18 |
19 |     def __init__(self, cat_cols_, num_activities, cont_cols_, seq_cols_, non_seq_cols_, date_cols_=[], seq_len=15, max_emb_size=25, emb_dims_non_seq=None, emb_dims_seq=None) -> None:
20 |         self.support = {
21 |             TRANSFORMER : self.__create_transformer_autoencoder__,
22 |             LSTM : self.__create_autoencoder__
23 |         }
24 |
25 |         if num_activities:
26 |             self.emb_dims_non_seq, self.emb_dims_seq = self.calculate_embedding_dimensions(num_activities, seq_cols=seq_cols_,
27 |                                                                                            non_seq_cols=non_seq_cols_,
28 |                                                                                            max_emb_size=max_emb_size)
29 |         else:
30 |             self.emb_dims_non_seq = emb_dims_non_seq
31 |             self.emb_dims_seq = emb_dims_seq
32 |
33 |         self.seq_len = seq_len
34 |
35 |         self.non_seq_cat_ = [x for x in cat_cols_ if x in non_seq_cols_]
36 |         self.seq_cat_ = [x for x in cat_cols_ if x in seq_cols_]
37 |         self.non_seq_cont_ = [x for x in cont_cols_ if x in non_seq_cols_]
38 |         self.seq_cont_ = [x for x in cont_cols_+date_cols_ if x in seq_cols_]
39 |
40 |         self.seq_cont_dim = len(set(seq_cols_) & set(cont_cols_)) + len(date_cols_)
41 |         self.non_seq_cont_dim = len(set(non_seq_cols_) & set(cont_cols_))
42 |         # Append non seq features to the end of the sequence if they exist
43 |         self.adjust_seq_len = seq_len + int(len(non_seq_cols_) > 0)
44 |
45 |     @staticmethod
46 |     def calculate_embedding_dimensions(num_activities, seq_cols=None, non_seq_cols=None, max_emb_size=25):
47 |         """Calculate the emb dims for the categorical embedding layer for each categorical variable.
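        For example (illustrative): with max_emb_size=25, a categorical column with 7 distinct
        values gets min(25, (7 + 1) // 2) = 4 embedding dimensions, while a column with 100
        distinct values is capped at 25.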
48 | 49 | Args: 50 | num_activities: number of unique activities for each categorical variable 51 | seq_cols (list): List of sequential vars 52 | non_seq_cols (list): List of non-sequential vars 53 | max_emb_size (Default = 25) : The max size of the embedding layer for a variable 54 | (needed when the possible values are very high) 55 | """ 56 | 57 | # Avoid using empty lists as default values 58 | seq_cols = [] if seq_cols is None else seq_cols 59 | non_seq_cols = [] if non_seq_cols is None else non_seq_cols 60 | 61 | cat_seq_dims = [num_activities[c] for c in num_activities.keys() if c in seq_cols] 62 | cat_non_seq_dims = [num_activities[c] for c in num_activities.keys() if c in non_seq_cols] 63 | emb_dims_non_seq = [(x, int(np.minimum(max_emb_size, (x + 1) // 2))) for x in cat_non_seq_dims] 64 | emb_dims_seq = [(x, int(np.minimum(max_emb_size, (x + 1) // 2))) for x in cat_seq_dims] 65 | 66 | return emb_dims_non_seq, emb_dims_seq 67 | 68 | def __create_transformer_autoencoder__(self, device="cuda", HIDDEN_SIZE=64, 69 | NUM_LAYERS_ENC=4, 70 | NUM_LAYERS_DEC=2, 71 | NUM_HEADS_ENC=2, 72 | NUM_HEADS_DEC=4, 73 | PF_DIM_ENC=32, 74 | PF_DIM_DEC=128, 75 | DROPOUT_ENC=0.1, 76 | DROPOUT_DEC=0.1, 77 | EMBEDDING_DROPOUT_SEQUENTIAL=0.1, 78 | EMBEDDING_DROPOUT_NON_SEQUENTIAL=0.1) -> TransformerAutoEncoder: 79 | 80 | enc = TransformerEncoder(hid_dim=HIDDEN_SIZE, n_layers=NUM_LAYERS_ENC, n_heads=NUM_HEADS_ENC, 81 | pf_dim=PF_DIM_ENC, dropout=DROPOUT_ENC, max_length=self.adjust_seq_len) 82 | 83 | dec = TransformerDecoder(hid_dim=HIDDEN_SIZE, n_layers=NUM_LAYERS_DEC, n_heads=NUM_HEADS_DEC, 84 | pf_dim=PF_DIM_DEC, dropout=DROPOUT_DEC, pos_embedding=enc.pos_embedding) 85 | 86 | emb_seq_num_classes = [x for x, _ in self.emb_dims_seq] 87 | emb_non_seq_num_classes = [x for x, _ in self.emb_dims_non_seq] 88 | 89 | output_layer = OutputLayer(HIDDEN_SIZE, self.seq_cont_dim, self.non_seq_cont_dim, 90 | emb_seq_num_classes, emb_non_seq_num_classes) 91 | 92 | unified_transformer_encoder = UnifiedTransformerEncoder(enc, 93 | self.emb_dims_non_seq, 94 | EMBEDDING_DROPOUT_NON_SEQUENTIAL, 95 | self.emb_dims_seq, 96 | EMBEDDING_DROPOUT_SEQUENTIAL, 97 | HIDDEN_SIZE, 98 | self.seq_cont_dim, 99 | self.non_seq_cont_dim, 100 | non_seq_pretrained_embs=None, 101 | freeze_non_seq_pretrained_embs=True, 102 | seq_pretrained_embs=None, 103 | freeze_seq_pretrained_embs=True) 104 | 105 | return TransformerAutoEncoder(unified_transformer_encoder, dec, output_layer).to(device) 106 | 107 | def __create_autoencoder__(self, device="cuda", HIDDEN_SIZE=64, 108 | NUM_LAYERS=1, 109 | LIN_LAYER_SIZES_NON_SEQUENTIAL=[50, 25], 110 | LIN_LAYER_SIZES_SEQUENTIAL=[50, 25], 111 | EMBEDDING_DROPOUT_NON_SEQUENTIAL=0.04, 112 | LIN_LAYER_DROPOUTS_NON_SEQUENTIAL=[0.0001, 0.01], 113 | EMBEDDING_DROPOUT_SEQUENTIAL=0.04, 114 | LIN_LAYER_DROPOUTS_SEQUENTIAL=[0.001, 0.01]) -> LSTMAutoencoder: 115 | 116 | output_dim = len(self.seq_cont_) 117 | num_classes = [x for (x, _) in self.emb_dims_seq] 118 | 119 | # Model objects initialisation 120 | encoder = UnifiedEncoder(emb_dims_non_seq=self.emb_dims_non_seq, 121 | emb_dropout_non_seq=EMBEDDING_DROPOUT_NON_SEQUENTIAL, 122 | emb_dims_seq=self.emb_dims_seq, 123 | emb_dropout_seq=EMBEDDING_DROPOUT_SEQUENTIAL, 124 | emb_lin_layer_sizes_non_seq=LIN_LAYER_SIZES_NON_SEQUENTIAL, 125 | emb_lin_layer_dropouts_non_seq=LIN_LAYER_DROPOUTS_NON_SEQUENTIAL, 126 | emb_lin_layer_sizes_seq=LIN_LAYER_SIZES_SEQUENTIAL, 127 | emb_lin_layer_dropouts_seq=LIN_LAYER_DROPOUTS_SEQUENTIAL, 128 | lstm_hidden_size=HIDDEN_SIZE, 129 | 
output_size=output_dim, 130 | seq_len=self.seq_len, 131 | non_seq_cont_count=len(self.non_seq_cont_), 132 | seq_cat_count=len(self.seq_cat_), 133 | seq_cont_count=len(self.seq_cont_), 134 | non_seq_cat_count=len(self.non_seq_cat_)) 135 | 136 | input_dim = int(encoder.seq_cont_count + encoder.no_of_embs_seq) 137 | 138 | decoder = LSTM_attention_embedding_decoder(input_dim=input_dim, 139 | hidden_size=HIDDEN_SIZE, 140 | num_layers=NUM_LAYERS, 141 | output_dim=output_dim, 142 | num_classes=num_classes) 143 | 144 | mlp_non_seq_cat_list = [] 145 | 146 | for non_seq_cat, _ in self.emb_dims_non_seq: 147 | mlp_non_seq_cat_list.append(DenseBnDropout(LIN_LAYER_SIZES_NON_SEQUENTIAL+[ 148 | non_seq_cat], LIN_LAYER_DROPOUTS_NON_SEQUENTIAL+[0], HIDDEN_SIZE)) 149 | mlp_non_seq_cont = DenseBnDropout( 150 | LIN_LAYER_SIZES_NON_SEQUENTIAL, LIN_LAYER_DROPOUTS_NON_SEQUENTIAL, HIDDEN_SIZE) 151 | 152 | autoenc = LSTMAutoencoder(encoder, mlp_non_seq_cat_list, mlp_non_seq_cont, decoder).to(device) 153 | 154 | return autoenc 155 | 156 | def create(self, architecture: str, device="cuda", **hyperparams) -> nn.Module: 157 | if architecture not in self.support: 158 | raise ValueError("Unknown architecture specified. Model Factory currently supports: %s Requested: %s" % (str(self.support.keys()), architecture)) 159 | 160 | constructor_f = self.support[architecture] 161 | 162 | logger.info("Initializing CASPR with %s architecture. Hyperparams provided: %s" % (architecture, hyperparams)) 163 | 164 | return constructor_f(device, **hyperparams) 165 | -------------------------------------------------------------------------------- /caspr/models/lstm_autoencoder_sequence.py: -------------------------------------------------------------------------------- 1 | """Bahdanau attention based LSTM encoder.""" 2 | 3 | import warnings 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | import caspr.models 10 | 11 | warnings.simplefilter('ignore') 12 | 13 | 14 | class LSTM_attention_embedding_encoder_sequence(nn.Module): # noqa: W0223 15 | """Luong/Bahdanau attention based LSTM encoder.""" 16 | 17 | def __init__(self, # noqa: R0913, R0914 18 | emb_dims_non_seq, 19 | emb_dims_seq, 20 | lin_layer_sizes_non_sequential, 21 | lin_layer_sizes_sequential, 22 | hidden_size, 23 | num_layers, 24 | bidirectional, 25 | output_size, 26 | emb_dropout_non_seq, 27 | lin_layer_dropouts_non_sequential, 28 | emb_dropout_seq, 29 | lin_layer_dropouts_sequential, 30 | lin_layer_sizes_fin, 31 | lin_layer_dropouts_fin, # noqa: W0613 32 | seq_len, input_dim, 33 | non_seq_cont_count, seq_cat_count, seq_cont_count, non_seq_cat_count, 34 | device): 35 | """Initialise the pytorch LSTM layer. 36 | 37 | Args: 38 | emb_dims_non_seq, emb_dims_seq (list of int tuples): 39 | List of category dimension and corresponding embedding size. 
40 | lin_layer_sizes_non_sequential, lin_layer_sizes_sequential (list of int tuples): 41 | List of [m1*m2] tuples for embedding dimension reduction and non-linearity 42 | emb_dropout_non_seq, emb_dropout_seq (float): dropout values for embedding layers 43 | lin_layer_dropouts_non_seq, lin_layer_dropouts_seq (list of float): 44 | dropout values for linear layers corresponding to embedding layers 45 | hidden_size (int): Size of the hidden state 46 | num_layers (int): Number of stacked LSTM layers 47 | bidirectional (bool): Flag for bi/uni LSTM 48 | output_size (int): Size of the final output layer 49 | lin_layer_sizes_fin (list of int tuples): 50 | List of [m1*m2] tuples for non-linear combination of sequential and nonsequential inputs 51 | seq_len (int): Length of input Sequence 52 | """ 53 | super().__init__() 54 | 55 | self.device = device 56 | self.non_seq_emb_layers = nn.ModuleList( 57 | [nn.Embedding(x, y) for x, y in emb_dims_non_seq]) 58 | self.seq_emb_layers = nn.ModuleList( 59 | [nn.Embedding(x, y) for x, y in emb_dims_seq]) 60 | self.no_of_embs_non_seq = sum([y for x, y in emb_dims_non_seq]) 61 | self.no_of_embs_seq = sum([y for x, y in emb_dims_seq]) 62 | self.input_dim = input_dim 63 | self.seq_len = seq_len 64 | self.hidden_size = hidden_size 65 | self.non_seq_cont_count = non_seq_cont_count 66 | self.non_seq_cat_count = non_seq_cat_count 67 | self.context_vector_size = hidden_size 68 | self.output_dim = output_size 69 | self.num_layers = num_layers 70 | self.num_directions = 2 if bidirectional else 1 71 | 72 | self.seq_cat_count = seq_cat_count 73 | self.seq_cont_count = seq_cont_count 74 | self.non_seq_cat_count = non_seq_cat_count 75 | self.non_seq_cont_count = non_seq_cont_count 76 | 77 | # Linear Layers for non_seq_data parallel to LSTM 78 | if self.no_of_embs_non_seq != 0: 79 | first_lin_layer = nn.Linear(self.no_of_embs_non_seq, lin_layer_sizes_non_sequential[0]) 80 | self.lin_layersnon_sequential = nn.ModuleList([first_lin_layer] + 81 | [nn.Linear(lin_layer_sizes_non_sequential[i], 82 | lin_layer_sizes_non_sequential[i + 1]) 83 | for i in range(len(lin_layer_sizes_non_sequential) - 1)]) 84 | for lin_layer in self.lin_layersnon_sequential: 85 | nn.init.kaiming_normal_(lin_layer.weight.data) 86 | 87 | self.emb_dropout_layer_non_sequential = nn.Dropout(emb_dropout_non_seq) 88 | self.dropout_layersnon_sequential = nn.ModuleList( 89 | [nn.Dropout(size) for size in lin_layer_dropouts_non_sequential]) 90 | self.bn_layersnon_sequential = nn.ModuleList( 91 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_non_sequential]) 92 | 93 | # Linear Layers for seq_cat_data 94 | if self.no_of_embs_seq != 0: 95 | first_lin_layer_seq = nn.Linear(self.no_of_embs_seq, lin_layer_sizes_sequential[0]) 96 | self.lin_layers_seq = nn.ModuleList([first_lin_layer_seq] + 97 | [nn.Linear(lin_layer_sizes_sequential[i], 98 | lin_layer_sizes_sequential[i + 1]) 99 | for i in range(len(lin_layer_sizes_sequential) - 1)]) 100 | for lin_layer in self.lin_layers_seq: 101 | nn.init.kaiming_normal_(lin_layer.weight.data) 102 | 103 | self.emb_dropout_layer_seq = nn.Dropout(emb_dropout_seq) 104 | self.dropout_layers_seq = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts_sequential]) 105 | self.bn_layers_seq = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes_sequential]) 106 | 107 | # Output Layer 108 | self.output_layer = nn.Linear(lin_layer_sizes_fin[-1], output_size) 109 | nn.init.kaiming_normal_(self.output_layer.weight.data) 110 | 111 | # LSTM layer 112 | self.lstmLayer = nn.LSTM( 
113 | self.input_dim + lin_layer_sizes_sequential[-1], 114 | self.hidden_size, self.num_layers, batch_first=True, bidirectional=bidirectional) 115 | # self.lstmLayer = nn.LSTM( 116 | # self.input_dim+self.no_of_embs_seq, 117 | # self.hidden_size, self.num_layers, batch_first=True, bidirectional=bidirectional) 118 | 119 | # Linear Layers post LSTM 120 | self.lin_layer_lstm_to_dense = nn.Linear( 121 | self.num_directions*self.hidden_size, self.hidden_size) 122 | 123 | # Attention 124 | self.bahdanau_attention_layer = caspr.models.attention_mechanisms.BahdanauAttention( 125 | self.hidden_size, self.num_directions) 126 | 127 | # self.fc_encoder = nn.Linear( 128 | # self.num_directions*self.hidden_size, self.hidden_size, bias=False) 129 | 130 | # self.attnHidden = nn.Linear(self.hidden_size, 1) 131 | 132 | self.fin_layer = nn.Linear( 133 | self.num_directions*self.hidden_size + 134 | self.context_vector_size + self.no_of_embs_non_seq + self.non_seq_cont_count, hidden_size) 135 | # self.fin_layer = nn.Linear( 136 | # self.num_directions*self.hidden_size + self.context_vector_size , hidden_size) 137 | 138 | def forward(self, input_tensor): # noqa : R0914 139 | """Run a forward pass of model over the data.""" 140 | seq_cat_index = self.seq_len * self.seq_cat_count 141 | seq_cont_index = seq_cat_index + self.seq_len * self.seq_cont_count 142 | non_seq_cat_index = seq_cont_index + self.non_seq_cat_count 143 | non_seq_cont_index = non_seq_cat_index + self.non_seq_cont_count 144 | 145 | seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = input_tensor[:, :seq_cat_index], \ 146 | input_tensor[:, seq_cat_index: seq_cont_index], \ 147 | input_tensor[:, seq_cont_index: non_seq_cat_index], \ 148 | input_tensor[:, non_seq_cat_index: non_seq_cont_index] 149 | seq_cat_data = seq_cat_data.type(torch.LongTensor) 150 | seq_cat_data = seq_cat_data.reshape( 151 | seq_cat_data.shape[0], self.seq_len, int(seq_cat_data.shape[1]/self.seq_len)) 152 | seq_cont_data = seq_cont_data.reshape( 153 | seq_cont_data.shape[0], self.seq_len, int(seq_cont_data.shape[1]/self.seq_len)) 154 | 155 | if self.no_of_embs_non_seq != 0: 156 | non_seq_cat_data = non_seq_cat_data.type( 157 | torch.LongTensor).to(self.device) 158 | # across all rows and column i - useful for batches 159 | non_seq_cat_inp = [emb_layer(non_seq_cat_data[:, i]) 160 | for i, emb_layer in enumerate(self.non_seq_emb_layers)] 161 | non_seq_cat_inp = torch.cat(non_seq_cat_inp, 1) 162 | non_seq_cat_inp = self.emb_dropout_layer_non_sequential(non_seq_cat_inp) 163 | if self.non_seq_cont_count != 0: 164 | non_seq_inp = torch.cat((non_seq_cat_inp.type(torch.FloatTensor).to( 165 | self.device), non_seq_cont_data.type(torch.FloatTensor).to(self.device)), 1) 166 | else: 167 | non_seq_inp = non_seq_cat_inp.type(torch.FloatTensor).to(self.device) 168 | elif self.non_seq_cont_count != 0: 169 | non_seq_inp = non_seq_cont_data.type(torch.FloatTensor).to(self.device) 170 | 171 | if self.no_of_embs_seq != 0: 172 | seq_cat_data = seq_cat_data.type( 173 | torch.LongTensor).to(self.device) 174 | # across all rows and column i - useful for batches 175 | seq_cat_inp = [emb_layer(seq_cat_data[:, :, i]) 176 | for i, emb_layer in enumerate(self.seq_emb_layers)] 177 | # shape = batchsize * seq_len * 16(emb size) 178 | seq_cat_inp = torch.cat(seq_cat_inp, 2) 179 | seq_cat_inp = self.emb_dropout_layer_seq(seq_cat_inp) 180 | seq_cat_inp_emb = seq_cat_inp 181 | for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers_seq, 182 | self.dropout_layers_seq, self.bn_layers_seq): 
183 | seq_cat_inp_emb = F.relu(lin_layer(seq_cat_inp_emb)) 184 | seq_cat_inp_emb = torch.cat([bn_layer(seq_cat_inp_emb[:, i, :]).unsqueeze(1) 185 | for i in range(self.seq_len)], 1) 186 | seq_cat_inp_emb = dropout_layer(seq_cat_inp_emb) 187 | 188 | seq_cat_inp_emb = seq_cat_inp_emb.to(self.device) 189 | # shape seq_cat = batchsize * seq_len * emb size/lin_layers_seq[-1].shape 190 | # shape seq_cont = batchsize * seq_len * data 191 | 192 | seq_data = torch.cat([seq_cat_inp_emb, seq_cont_data], 2) 193 | 194 | # now the sequential data 195 | inp_tens = seq_data 196 | 197 | temp_batch_size = inp_tens.size()[0] 198 | 199 | h0 = torch.zeros(self.num_directions*self.num_layers, temp_batch_size, self.hidden_size).to( 200 | self.device).requires_grad_() 201 | c0 = torch.zeros(self.num_directions*self.num_layers, temp_batch_size, self.hidden_size).to( 202 | self.device).requires_grad_() 203 | 204 | output, (hn, cn) = self.lstmLayer(inp_tens, (h0, c0)) 205 | # passes through the embedding layer to generate the required embeddings 206 | # attention weight calculation 207 | 208 | # tempX = torch.tanh(self.fc_encoder(output)) 209 | # alignment_scores = self.attnHidden(tempX) 210 | # attn_weights = F.softmax(alignment_scores, dim=1) 211 | # attn_weights = attn_weights.permute(0, 2, 1) 212 | # context_vector = torch.bmm(attn_weights, output) 213 | 214 | context_vector = self.bahdanau_attention_layer(output) 215 | 216 | hn = hn.view(self.num_layers, self.num_directions, - 217 | 1, self.hidden_size).to(self.device) 218 | cn_ = cn.view(self.num_layers, self.num_directions, - 219 | 1, self.hidden_size).to(self.device) 220 | if self.num_directions > 1: 221 | seq_inp = self.lin_layer_lstm_to_dense(torch.cat( 222 | [hn[self.num_layers-1, 0], hn[self.num_layers-1, -1]], 1).unsqueeze(0)) 223 | else: 224 | seq_inp = self.lin_layer_lstm_to_dense( 225 | hn[self.num_layers-1, 0]).unsqueeze(0) 226 | 227 | seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2]) 228 | 229 | context_vector = context_vector.reshape( 230 | context_vector.size()[0], context_vector.size()[2]) 231 | 232 | fin_input = torch.cat((non_seq_inp, seq_inp, context_vector), 1) 233 | # fin_input = torch.cat((seq_inp, context_vector), 1) 234 | 235 | hn_ = F.relu(self.fin_layer(fin_input)) 236 | 237 | return output, (hn_, cn_[self.num_layers-1, 0, :, :].unsqueeze(0)) 238 | -------------------------------------------------------------------------------- /caspr/models/lstm_decoder.py: -------------------------------------------------------------------------------- 1 | """CASPR LSTM decoder base class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class LSTM_attention_embedding_decoder(nn.Module): # noqa: W0223 9 | """Simple LSTM decoder.""" 10 | 11 | def __init__(self, # noqa: R0913 12 | input_dim, 13 | hidden_size, 14 | output_dim, 15 | num_classes, 16 | num_layers=1): 17 | """Initialize model with params.""" 18 | super().__init__() 19 | 20 | self.input_dim = input_dim 21 | self.hidden_size = hidden_size 22 | self.num_layers = num_layers 23 | self.num_classes = num_classes 24 | self.output_dim = output_dim 25 | 26 | # LSTM layer 27 | self.lstm_layer = nn.LSTM( 28 | input_dim, hidden_size, num_layers, batch_first=True) 29 | 30 | self.linear = nn.Linear(self.hidden_size, output_dim) 31 | 32 | self.output = nn.ModuleList([nn.Linear(self.hidden_size, num_class) for num_class in self.num_classes]) 33 | self.hidden = None 34 | 35 | def forward(self, inp, hidden): 36 | """Forward pass through LSTM 
layer. 37 | 38 | shape of lstm_out: [batch_size, 1, hidden_size], since the layer is batch_first 39 | shape of self.hidden: (a, b), where a and b both 40 | have shape (num_layers, batch_size, hidden_size). 41 | """ 42 | inp = inp.view(inp.shape[0], 1, -1) 43 | self.hidden = hidden 44 | 45 | lstm_out, self.hidden = self.lstm_layer(inp, self.hidden) 46 | decoder_out = (torch.tanh(lstm_out[:, -1, :])) 47 | 48 | y_pred = self.linear(decoder_out) 49 | out_cont = F.relu(y_pred) 50 | # out_cat = self.output(decoder_out) 51 | out_cat = [ # one output head per sequential categorical variable 52 | output_layer(decoder_out) for output_layer in self.output 53 | ] 54 | # out_cat = torch.cat(out_cat, -1) 55 | 56 | 57 | return out_cont, self.hidden, out_cont, out_cat 58 | -------------------------------------------------------------------------------- /caspr/models/lstm_timeseries_tpa_attention.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """TPA attention based LSTM encoder.""" 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class LSTM_TPA_attention_timeseries(nn.Module): # noqa: W0223 10 | """TPA attention based LSTM encoder.""" 11 | 12 | def __init__(self, # noqa: R0913 13 | emb_dims_non_seq, 14 | emb_dims_seq, 15 | lin_layer_sizes_non_sequential, 16 | lin_layer_sizes_sequential, 17 | non_seq_cont_count, 18 | hidden_size, 19 | output_size, 20 | emb_dropout_non_seq, 21 | lin_layer_dropouts_non_sequential, 22 | emb_dropout_seq, 23 | lin_layer_dropouts_sequential, 24 | lin_layer_sizes_fin, 25 | lin_layer_dropouts_fin, 26 | seq_len, input_dim, device): 27 | """Initialise the pytorch LSTM layer. 28 | 29 | Args: 30 | emb_dims_non_seq, emb_dims_seq (list of int tuples): 31 | List of category dimension and corresponding embedding size. 
32 | lin_layer_sizes_non_sequential, lin_layer_sizes_sequential (list of int tuples): 33 | List of [m1*m2] tuples for embedding dimension reduction and non-linearity 34 | emb_dropout_non_seq, emb_dropout_seq (float): dropout values for embedding layers 35 | lin_layer_dropouts_non_sequential, lin_layer_dropouts_sequential (list of float): 36 | dropout values for linear layers corresponding to embedding layers 37 | hidden_size (int): Size of the hidden state 38 | output_size (int): Size of the final output layer 39 | lin_layer_sizes_fin (list of int tuples): 40 | List of [m1*m2] tuples for non-linear combination of sequential and nonsequential inputs 41 | seq_len (int): Length of input Sequence 42 | """ 43 | 44 | super().__init__() 45 | 46 | self.device = device 47 | self.non_seq_emb_layers = nn.ModuleList( 48 | [nn.Embedding(x, y) for x, y in emb_dims_non_seq]) 49 | self.seq_emb_layers = nn.ModuleList( 50 | [nn.Embedding(x, y) for x, y in emb_dims_seq]) 51 | self.no_of_embs_non_seq = sum([y for x, y in emb_dims_non_seq]) 52 | self.no_of_embs_seq = sum([y for x, y in emb_dims_seq]) 53 | self.input_dim = input_dim 54 | self.seq_len = seq_len 55 | self.hidden_size = hidden_size 56 | self.non_seq_cont_count = non_seq_cont_count 57 | self.context_vector_size = hidden_size 58 | self.output_dim = output_size 59 | 60 | if self.no_of_embs_non_seq != 0: 61 | first_lin_layer = nn.Linear(self.no_of_embs_non_seq, lin_layer_sizes_non_sequential[0]) 62 | self.lin_layersnon_sequential = nn.ModuleList([first_lin_layer] + 63 | [nn.Linear(lin_layer_sizes_non_sequential[i], 64 | lin_layer_sizes_non_sequential[i + 1]) 65 | for i in range(len(lin_layer_sizes_non_sequential) - 1)]) 66 | for lin_layer in self.lin_layersnon_sequential: 67 | nn.init.kaiming_normal_(lin_layer.weight.data) 68 | 69 | self.emb_dropout_layer_non_sequential = nn.Dropout(emb_dropout_non_seq) 70 | self.dropout_layersnon_sequential = nn.ModuleList( 71 | [nn.Dropout(size) for size in lin_layer_dropouts_non_sequential]) 72 | self.bn_layersnon_sequential = nn.ModuleList( 73 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_non_sequential]) 74 | 75 | # Linear Layers for seq_cat_data 76 | if self.no_of_embs_seq != 0: 77 | first_lin_layer_seq = nn.Linear(self.no_of_embs_seq, lin_layer_sizes_sequential[0]) 78 | self.lin_layers_seq = nn.ModuleList([first_lin_layer_seq] + 79 | [nn.Linear(lin_layer_sizes_sequential[i], 80 | lin_layer_sizes_sequential[i + 1]) 81 | for i in range(len(lin_layer_sizes_sequential) - 1)]) 82 | for lin_layer in self.lin_layers_seq: 83 | nn.init.kaiming_normal_(lin_layer.weight.data) 84 | 85 | self.emb_dropout_layer_seq = nn.Dropout(emb_dropout_seq) 86 | self.dropout_layers_seq = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts_sequential]) 87 | self.bn_layers_seq = nn.ModuleList([nn.BatchNorm1d(size) for size in lin_layer_sizes_sequential]) 88 | 89 | # Output Layer 90 | self.output_layer = nn.Linear(lin_layer_sizes_fin[-1], output_size) 91 | 92 | 93 | 94 | 95 | nn.init.kaiming_normal_(self.output_layer.weight.data) 96 | 97 | # definitions for data parsing 98 | # primarily required to make sure embeddings are used for categorical data 99 | 100 | # LSTM layer 101 | self.lstm_layer = nn.LSTM( 102 | self.input_dim + lin_layer_sizes_non_sequential[-1], self.hidden_size, batch_first=True) 103 | 104 | # Linear Layers post LSTM 105 | self.lin_layer_lstm_to_dense = nn.Linear( 106 | self.hidden_size, self.hidden_size) 107 | 108 | # TPA attention
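# A sketch of the idea implemented below (all names taken from this file's forward pass):
# each of the hidden_size Conv1d filters slides over the length-seq_len history of one
# hidden-state channel, yielding a matrix hc of shape [batch, hidden_size, hidden_size];
# each row of hc is then scored against the final hidden state hn through the learned
# tpa_linear map, the sigmoid-weighted rows are summed into a context vector vt, and
# tpa_context_linear mixes vt back with tpa_hiddent_linear(hn) to form the attended
# representation htprime.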
109 | self.convolution_filters = nn.ModuleList([nn.Conv1d( 110 | in_channels=1, out_channels=1, kernel_size=self.seq_len) for _ in range(hidden_size)]) 111 | self.tpa_linear = nn.Linear( 112 | self.hidden_size, self.hidden_size, bias=False) 113 | 114 | self.tpa_hiddent_linear = nn.Linear( 115 | self.hidden_size, self.hidden_size, bias=False) 116 | self.tpa_context_linear = nn.Linear( 117 | self.hidden_size, self.hidden_size, bias=False) 118 | # Final MLP 119 | first_fin_layer = nn.Linear(self.hidden_size + self.context_vector_size + 120 | self.no_of_embs_non_seq + self.non_seq_cont_count, lin_layer_sizes_fin[0]) 121 | 122 | self.lin_layers_final = nn.ModuleList([first_fin_layer] + 123 | [nn.Linear(lin_layer_sizes_fin[i], 124 | lin_layer_sizes_fin[i + 1]) 125 | for i in range(len(lin_layer_sizes_fin) - 1)]) 126 | for lin_layer in self.lin_layers_final: 127 | nn.init.kaiming_normal_(lin_layer.weight.data) 128 | 129 | # final dropout and batch norm layers for final prediction 130 | self.dropout_layers_final = nn.ModuleList( 131 | [nn.Dropout(size) for size in lin_layer_dropouts_fin]) 132 | self.bn_layers_final = nn.ModuleList( 133 | [nn.BatchNorm1d(size) for size in lin_layer_sizes_fin]) 134 | 135 | 136 | 137 | 138 | def forward(self, seq_cont_data, seq_cat_data, non_seq_cat_data, non_seq_cont_data): # noqa : R0914 139 | """Run a forward pass of model over the data.""" 140 | 141 | if self.no_of_embs_non_seq != 0: 142 | non_seq_cat_data = non_seq_cat_data.type( 143 | torch.LongTensor).to(self.device) 144 | # across all rows and column i - useful for batches 145 | non_seq_cat_inp = [emb_layer(non_seq_cat_data[:, i]) 146 | for i, emb_layer in enumerate(self.non_seq_emb_layers)] 147 | non_seq_cat_inp = torch.cat(non_seq_cat_inp, 1) 148 | non_seq_cat_inp = self.emb_dropout_layer_non_sequential(non_seq_cat_inp) 149 | if self.non_seq_cont_count != 0: 150 | non_seq_inp = torch.cat((non_seq_cat_inp.type(torch.FloatTensor).to( 151 | self.device), non_seq_cont_data.type(torch.FloatTensor).to(self.device)), 1) 152 | else: 153 | non_seq_inp = non_seq_cat_inp.type(torch.FloatTensor).to(self.device) 154 | elif self.non_seq_cont_count != 0: 155 | non_seq_inp = non_seq_cont_data.type(torch.FloatTensor).to(self.device) 156 | 157 | if self.no_of_embs_seq != 0: 158 | seq_cat_data = seq_cat_data.type( 159 | torch.LongTensor).to(self.device) 160 | # across all rows and column i - useful for batches 161 | seq_cat_inp = [emb_layer(seq_cat_data[:, :, i]) 162 | for i, emb_layer in enumerate(self.seq_emb_layers)] 163 | seq_cat_inp = torch.cat(seq_cat_inp, 2) 164 | seq_cat_inp = self.emb_dropout_layer_seq(seq_cat_inp) 165 | 166 | seq_cat_inp_emb = seq_cat_inp 167 | 168 | seq_cat_inp_emb = seq_cat_inp_emb.to(self.device) 169 | 170 | seq_data = torch.cat([seq_cat_inp_emb, seq_cont_data], 2) 171 | 172 | # now the sequential data ------------------------------ 173 | inp_tens = seq_data 174 | 175 | temp_batch_size = inp_tens.size()[0] 176 | 177 | h0 = torch.zeros(1, temp_batch_size, self.hidden_size).to( 178 | self.device).requires_grad_() 179 | c0 = torch.zeros(1, temp_batch_size, self.hidden_size).to( 180 | self.device).requires_grad_() 181 | 182 | output, (hn, _) = self.lstm_layer(inp_tens, (h0, c0)) 183 | hn = hn.to(self.device) 184 | # the hidden states across time steps form the H matrix used by TPA below 185 | # output shape batch_size * seq_len * hidden_size 186 | # output[:,:,i] shape batch_size * seq_len - row i of the H matrix 187 | 
hc = torch.zeros(temp_batch_size, self.hidden_size, 188 | self.hidden_size).to(self.device) 189 | 190 | for i in range(self.hidden_size): 191 | for j in range(self.hidden_size): 192 | hc[:, i, j] = self.convolution_filters[j]( 193 | output[:, :, i].unsqueeze(1)).squeeze() 194 | 195 | alpha = torch.zeros(temp_batch_size, self.hidden_size).to(self.device) 196 | 197 | for i in range(self.hidden_size): 198 | temp1 = self.tpa_linear(hc[:, i]).unsqueeze(1) 199 | temp2 = hn.squeeze().unsqueeze(2) 200 | temp = torch.bmm(temp1, temp2) 201 | alpha[:, i] = F.sigmoid(temp).squeeze() 202 | 203 | vt = torch.zeros(temp_batch_size, self.hidden_size).to(self.device) 204 | for i in range(self.hidden_size): 205 | temp = torch.bmm(alpha[:, i].unsqueeze(1).unsqueeze( 206 | 2), hc[:, i].unsqueeze(1)).squeeze() 207 | vt += temp 208 | 209 | htprime = self.tpa_hiddent_linear(hn) + self.tpa_context_linear(vt) 210 | 211 | seq_inp = self.lin_layer_lstm_to_dense(hn) 212 | seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2]) 213 | htprime = htprime.squeeze() 214 | 215 | # Linear mlp for prediction 216 | # fin_input = torch.cat((seq_inp, htprime), 1) 217 | fin_input = torch.cat((non_seq_inp, seq_inp, htprime), 1) 218 | 219 | x = fin_input 220 | for lin_layer, dropout_layer, bn_layer in zip(self.lin_layers_final, self.dropout_layers_final, 221 | self.bn_layers_final): 222 | x = F.relu(lin_layer(x)) 223 | x = bn_layer(x) 224 | x = dropout_layer(x) 225 | 226 | x = F.relu(self.output_layer(x)) 227 | 228 | return x, fin_input 229 | -------------------------------------------------------------------------------- /caspr/models/mlp.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """CASPR mlp base class.""" 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from caspr.models.dense_bn_dropout import DenseBnDropout 8 | 9 | 10 | class MLP(nn.Module): # noqa: W0223 11 | def __init__(self, # noqa: R0913 12 | input_size, 13 | lin_layer_sizes, 14 | lin_layer_dropouts, 15 | output_size, 16 | use_sigmoid=False): 17 | """Initialize model with params.""" 18 | 19 | super().__init__() 20 | 21 | self.output_size = output_size 22 | self.use_sigmoid = use_sigmoid 23 | 24 | # final linear layers just before prediction 25 | self.dense_bn_dropout = DenseBnDropout( 26 | lin_layer_sizes=lin_layer_sizes, lin_layer_dropouts=lin_layer_dropouts, input_size=input_size) 27 | 28 | # Output Layer 29 | self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size) 30 | nn.init.kaiming_normal_(self.output_layer.weight.data) 31 | 32 | def forward(self, inp): 33 | """Run a forward pass of model over the data.""" 34 | inp = self.dense_bn_dropout(inp) 35 | out = self.output_layer(inp) 36 | if self.use_sigmoid: 37 | out = torch.sigmoid(out) 38 | return out 39 | -------------------------------------------------------------------------------- /caspr/models/multi_layer_lstm.py: -------------------------------------------------------------------------------- 1 | """CASPR LSTM base class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | 7 | class MultiLayerLSTM(nn.Module): # noqa: W0223 8 | """Encapsulates the Pytorch LSTM. 
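A minimal usage sketch (all sizes here are illustrative, not prescriptive):

    >>> lstm = MultiLayerLSTM(input_size=16, hidden_size=32, num_layers=2, bidirectional=True)
    >>> out, (hn, cn), agg = lstm(torch.randn(8, 10, 16))
    >>> out.shape, agg.shape
    (torch.Size([8, 10, 64]), torch.Size([8, 32]))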
9 | 10 | Added functionality of aggregation / concatenation in cases of 11 | bidirectional and multi-layered LSTM's 12 | 13 | It outputs the original outputs of the lstm along with an aggregated output vector 14 | """ 15 | 16 | def __init__(self, input_size, hidden_size, dropout=0., num_layers=1, bidirectional=False): # noqa: R0913 17 | """Initialise the pytorch LSTM layer. 18 | 19 | Args: 20 | input_size = The size of the input in the lstm. This represents the number of input features 21 | hidden_size = the hidden size of the lstm 22 | dropout = the dropout layers between the multiple layers of the lstm (works only when we use a 23 | multi-layered lstm) 24 | num_layers = num_layers of the lstm 25 | bidirectional = represents the type of the lstm 26 | """ 27 | super().__init__() 28 | self.lstm_layer = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, 29 | bidirectional=bidirectional, dropout=dropout) 30 | self.num_directions = 2 if bidirectional else 1 31 | self.num_layers = num_layers 32 | self.hidden_size = hidden_size 33 | # Linear Layers post LSTM 34 | self.lin_layer_lstm_to_dense = nn.Linear( 35 | self.num_directions*self.hidden_size, self.hidden_size) 36 | 37 | def forward(self, input_tensor, hidden_state=None): 38 | """Run a forward pass of model over the data.""" 39 | batch_size = input_tensor.size()[0] 40 | device = input_tensor.device 41 | 42 | if hidden_state is not None: 43 | h0 = hidden_state 44 | c0 = torch.zeros(self.num_directions*self.num_layers, batch_size, self.hidden_size).to(device) 45 | output, (hn, cn) = self.lstm_layer(input_tensor, (h0, c0)) 46 | else: 47 | output, (hn, cn) = self.lstm_layer(input_tensor) 48 | 49 | hn = hn.view(self.num_layers, self.num_directions, - 50 | 1, self.hidden_size) 51 | cn = cn.view(self.num_layers, self.num_directions, - 52 | 1, self.hidden_size) 53 | 54 | if self.num_directions > 1: 55 | seq_inp = self.lin_layer_lstm_to_dense(torch.cat( 56 | [hn[self.num_layers-1, 0], hn[self.num_layers-1, -1]], 1).unsqueeze(0)) 57 | else: 58 | seq_inp = self.lin_layer_lstm_to_dense( 59 | hn[self.num_layers-1, 0]).unsqueeze(0) 60 | 61 | seq_inp = seq_inp.reshape(seq_inp.size()[1], seq_inp.size()[2]) 62 | 63 | return output, (hn[self.num_layers-1, 0, :, :], cn[self.num_layers-1, 0, :, :]), seq_inp 64 | -------------------------------------------------------------------------------- /caspr/models/transformer.py: -------------------------------------------------------------------------------- 1 | """CASPR transformer base class.""" 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from caspr.models.attention_mechanisms import MultiHeadAttentionLayer 7 | 8 | 9 | class TransformerEncoderLayer(nn.Module): # noqa: W0223 # noqa: W0223 10 | """TransformerEncoderLayer is made up of self-attn and feedforward network. 
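Illustrative shapes, assuming the mask layout produced by TransformerEncoder._make_src_mask below:

    >>> layer = TransformerEncoderLayer(hid_dim=64, n_heads=2, pf_dim=128, dropout=0.1)
    >>> src = torch.randn(4, 10, 64)
    >>> src_mask = torch.ones(4, 1, 1, 10).bool()
    >>> layer(src, src_mask).shape
    torch.Size([4, 10, 64])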
11 | 12 | Args: 13 | hid_dim: the hidden size of the encoder 14 | n_heads: the number of heads in the multi-head attention layers 15 | pf_dim: the dimension of the feedforward network model 16 | dropout: the dropout value 17 | device: the device on which the model is running 18 | """ 19 | 20 | def __init__(self, # noqa: R0913 21 | hid_dim, 22 | n_heads, 23 | pf_dim, 24 | dropout): 25 | """Initialize model with params.""" 26 | 27 | super().__init__() 28 | 29 | self.self_attn_layer_norm = nn.LayerNorm(hid_dim) 30 | self.ff_layer_norm = nn.LayerNorm(hid_dim) 31 | self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout) 32 | self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 33 | pf_dim, 34 | dropout) 35 | self.dropout = nn.Dropout(dropout) 36 | 37 | def forward(self, src, src_mask): 38 | """Run a forward pass of model over the data.""" 39 | 40 | # src = [batch size, src len, hid dim] 41 | # src_mask = [batch size, src len] 42 | 43 | # self attention 44 | _src, _ = self.self_attention(src, src, src, src_mask) 45 | 46 | # dropout, residual connection and layer norm 47 | src = self.self_attn_layer_norm(src + self.dropout(_src)) 48 | 49 | # src = [batch size, src len, hid dim] 50 | 51 | # positionwise feedforward 52 | _src = self.positionwise_feedforward(src) 53 | 54 | # dropout, residual and layer norm 55 | src = self.ff_layer_norm(src + self.dropout(_src)) 56 | 57 | # src = [batch size, src len, hid dim] 58 | 59 | return src 60 | 61 | 62 | class TransformerEncoder(nn.Module): # noqa: W0223 # noqa: W0223 63 | """TransformerEncoder is a stack of N encoder layers. 64 | 65 | Args: 66 | hid_dim: the hidden size of the encoder. 67 | n_layers: the number of sub-encoder-layers in the encoder 68 | n_heads: the number of heads in the multi-head attention layers 69 | pf_dim: the dimension of the feedforward network model 70 | dropout: the dropout value 71 | device: the device on which the model is running 72 | max_length: the maximum length of the input sequence 73 | """ 74 | 75 | def __init__(self, # noqa: R0913 76 | hid_dim, 77 | n_layers, 78 | n_heads, 79 | pf_dim, 80 | dropout, 81 | max_length=100): 82 | """Initialize model with params.""" 83 | super().__init__() 84 | 85 | self.pos_embedding = nn.Embedding(max_length, hid_dim) 86 | 87 | self.layers = nn.ModuleList([TransformerEncoderLayer(hid_dim, 88 | n_heads, 89 | pf_dim, 90 | dropout) 91 | for _ in range(n_layers)]) 92 | 93 | self.dropout = nn.Dropout(dropout) 94 | 95 | self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim]))) 96 | 97 | def _make_src_mask(self, batch_size, src_len, device): 98 | 99 | src_mask = torch.ones((batch_size, 1, 1, src_len), device=device).bool() 100 | 101 | # src_mask = [batch size, 1, 1, src len] 102 | 103 | return src_mask 104 | 105 | def forward(self, src): 106 | """Run a forward pass of model over the data.""" 107 | 108 | # src = [batch size, src len, hid_dim] 109 | 110 | batch_size = src.shape[0] 111 | src_len = src.shape[1] 112 | device = src.device 113 | 114 | src_mask = self._make_src_mask(batch_size, src_len, device) 115 | 116 | # src_mask = [batch size, src len] 117 | 118 | pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(device) 119 | 120 | # pos = [batch size, src len] 121 | 122 | src = self.dropout(src * self.scale + self.pos_embedding(pos)) 123 | 124 | # src = [batch size, src len, hid dim] 125 | 126 | for layer in self.layers: 127 | src = layer(src, src_mask) 128 | 129 | # src = [batch size, src len, hid dim] 130 | # src_mask = [batch 
size, 1, 1, src len] 131 | 132 | return src, src_mask 133 | 134 | 135 | class PositionwiseFeedforwardLayer(nn.Module): # noqa: W0223 136 | """Fully connected feed-forward network consisting of two linear transformations with a ReLU activation in between. 137 | 138 | Args: 139 | hid_dim: the hidden size of the encoder 140 | pf_dim: the dimension of the feedforward network model 141 | dropout: the dropout value 142 | """ 143 | 144 | def __init__(self, hid_dim, pf_dim, dropout): 145 | """Initialize model with params.""" 146 | super().__init__() 147 | 148 | self.fc_1 = nn.Linear(hid_dim, pf_dim) 149 | self.fc_2 = nn.Linear(pf_dim, hid_dim) 150 | 151 | self.dropout = nn.Dropout(dropout) 152 | 153 | def forward(self, x): 154 | """Run a forward pass of model over the data.""" 155 | 156 | # x = [batch size, seq len, hid dim] 157 | 158 | x = self.dropout(torch.relu(self.fc_1(x))) 159 | 160 | # x = [batch size, seq len, pf dim] 161 | 162 | x = self.fc_2(x) 163 | 164 | # x = [batch size, seq len, hid dim] 165 | 166 | return x 167 | 168 | 169 | class TransformerDecoderLayer(nn.Module): # noqa: W0223 170 | """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network. 171 | 172 | Args: 173 | hid_dim: the hidden size of the encoder 174 | n_heads: the number of heads in the multi-head attention layers 175 | pf_dim: the dimension of the feedforward network model 176 | dropout: the dropout value 177 | device: the device on which the model is running 178 | """ 179 | 180 | def __init__(self, # noqa: R0913 181 | hid_dim, 182 | n_heads, 183 | pf_dim, 184 | dropout): 185 | """Initialize model with params.""" 186 | super().__init__() 187 | 188 | self.self_attn_layer_norm = nn.LayerNorm(hid_dim) 189 | self.enc_attn_layer_norm = nn.LayerNorm(hid_dim) 190 | self.ff_layer_norm = nn.LayerNorm(hid_dim) 191 | self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout) 192 | self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout) 193 | self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, 194 | pf_dim, 195 | dropout) 196 | self.dropout = nn.Dropout(dropout) 197 | 198 | def forward(self, trg, enc_src, trg_mask, src_mask): 199 | """Run a forward pass of model over the data.""" 200 | 201 | # trg = [batch size, trg len, hid dim] 202 | # enc_src = [batch size, src len, hid dim] 203 | # trg_mask = [batch size, trg len] 204 | # src_mask = [batch size, src len] 205 | 206 | # self attention 207 | _trg, _ = self.self_attention(trg, trg, trg, trg_mask) 208 | 209 | # dropout, residual connection and layer norm 210 | trg = self.self_attn_layer_norm(trg + self.dropout(_trg)) 211 | 212 | # trg = [batch size, trg len, hid dim] 213 | 214 | # encoder attention 215 | _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask) 216 | 217 | # dropout, residual connection and layer norm 218 | trg = self.enc_attn_layer_norm(trg + self.dropout(_trg)) 219 | 220 | # trg = [batch size, trg len, hid dim] 221 | 222 | # positionwise feedforward 223 | _trg = self.positionwise_feedforward(trg) 224 | 225 | # dropout, residual and layer norm 226 | trg = self.ff_layer_norm(trg + self.dropout(_trg)) 227 | 228 | # trg = [batch size, trg len, hid dim] 229 | # attention = [batch size, n heads, trg len, src len] 230 | 231 | return trg, attention 232 | 233 | 234 | class TransformerDecoder(nn.Module): # noqa: W0223 235 | """TransformerDecoder is a stack of N decoder layers. 
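A typical pairing with the encoder above, sharing one positional-embedding table as the model factory does (all sizes illustrative):

    >>> enc = TransformerEncoder(hid_dim=64, n_layers=2, n_heads=2, pf_dim=128, dropout=0.1, max_length=20)
    >>> dec = TransformerDecoder(hid_dim=64, n_layers=2, n_heads=2, pf_dim=128, dropout=0.1, pos_embedding=enc.pos_embedding)
    >>> enc_src, src_mask = enc(torch.randn(4, 10, 64))
    >>> trg, attention = dec(torch.randn(4, 10, 64), enc_src, src_mask)
    >>> trg.shape, attention.shape
    (torch.Size([4, 10, 64]), torch.Size([4, 2, 10, 10]))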
236 | 237 | Args: 238 | hid_dim: the hidden size of the decoder 239 | n_layers: the number of sub-decoder-layers in the decoder 240 | n_heads: the number of heads in the multi-head attention layers 241 | pf_dim: the dimension of the feedforward network model 242 | dropout: the dropout value 243 | pos_embedding: learned positional encoding added to the input embedding 244 | device: the device on which the model is running 245 | """ 246 | 247 | def __init__(self, # noqa: R0913 248 | hid_dim, 249 | n_layers, 250 | n_heads, 251 | pf_dim, 252 | dropout, 253 | pos_embedding): 254 | """Initialize model with params.""" 255 | super().__init__() 256 | 257 | self.pos_embedding = pos_embedding 258 | 259 | self.layers = nn.ModuleList([TransformerDecoderLayer(hid_dim, 260 | n_heads, 261 | pf_dim, 262 | dropout) 263 | for _ in range(n_layers)]) 264 | 265 | self.dropout = nn.Dropout(dropout) 266 | 267 | self.register_buffer('scale', torch.sqrt(torch.FloatTensor([hid_dim]))) 268 | 269 | def _make_trg_mask(self, batch_size, trg_len, device): 270 | 271 | trg_mask = torch.tril(torch.ones((batch_size, 1, trg_len, trg_len), device=device)).bool() 272 | 273 | # trg_mask = [batch size, 1, trg len, trg len] 274 | 275 | return trg_mask 276 | 277 | def forward(self, trg, enc_src, src_mask): 278 | """Run a forward pass of model over the data.""" 279 | 280 | # trg = [batch size, trg len, hid_dim] 281 | # enc_src = [batch size, src len, hid dim] 282 | # src_mask = [batch size, 1, 1, src len] 283 | 284 | batch_size = trg.shape[0] 285 | trg_len = trg.shape[1] 286 | device = trg.device 287 | 288 | trg_mask = self._make_trg_mask(batch_size, trg_len, device) 289 | 290 | # trg_mask = [batch size, trg len] 291 | 292 | pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(device) 293 | 294 | # pos = [batch size, trg len] 295 | 296 | trg = self.dropout(trg * self.scale + self.pos_embedding(pos)) 297 | 298 | # trg = [batch size, trg len, hid dim] 299 | 300 | for layer in self.layers: 301 | trg, attention = layer(trg, enc_src, trg_mask, src_mask) 302 | 303 | # trg = [batch size, trg len, hid dim] 304 | # attention = [batch size, n heads, trg len, src len] 305 | 306 | return trg, attention 307 | -------------------------------------------------------------------------------- /caspr/models/unified_encoder.py: -------------------------------------------------------------------------------- 1 | """CASPR LSTM model.""" 2 | 3 | import warnings 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | from caspr.models.attention_mechanisms import BahdanauAttention, MultiHeadAttentionLSTMWrapper 11 | from caspr.models.convolutional_aggregation import ConvAggregation 12 | from caspr.models.dense_bn_dropout import DenseBnDropout 13 | from caspr.models.embedding_layer import CategoricalEmbedding 14 | from caspr.models.multi_layer_lstm import MultiLayerLSTM 15 | 16 | warnings.simplefilter('ignore') 17 | 18 | 19 | SEQ_CAT_INDEX = 0 20 | SEQ_CONT_INDEX = 1 21 | NON_SEQ_CAT_INDEX = 2 22 | NON_SEQ_CONT_INDEX = 3 23 | 24 | 25 | class UnifiedEncoder(nn.Module): # noqa: R0902, W0223 26 | """Encapsulates the basic structure to run most of our models. 27 | 28 | It checks the various conditions for the presence 29 | or the absence of data and is compatible with functionalities like 30 | 1. Usage of pretrained embedding vectors 31 | 2. Multi-Layered LSTM use 32 | 3. Convolutional Aggregation for data 33 | 4. 
Self-Multi-Head and Bahdanau Attention (when number of heads = 1, Bahdanau is used by default) 34 | 35 | In this new edition, it is compatible with the DLExplainer module and should be used if 36 | explainability is a requirement 37 | """ 38 | 39 | def __init__(self, # noqa: R0912, R0913, R0914, R0915 40 | emb_dims_non_seq, 41 | emb_dropout_non_seq, 42 | emb_dims_seq, 43 | emb_dropout_seq, 44 | emb_lin_layer_sizes_non_seq, 45 | emb_lin_layer_dropouts_non_seq, 46 | emb_lin_layer_sizes_seq, 47 | emb_lin_layer_dropouts_seq, 48 | lstm_hidden_size, 49 | output_size, 50 | seq_len, 51 | non_seq_cont_count, 52 | seq_cat_count, 53 | seq_cont_count, 54 | non_seq_cat_count, 55 | attention_heads=1, 56 | non_seq_pretrained_embs=None, 57 | freeze_non_seq_pretrained_embs=True, 58 | seq_pretrained_embs=None, 59 | freeze_seq_pretrained_embs=True, 60 | lstm_num_layers=1, 61 | lstm_bidirectional=False, 62 | use_conv_agg=False, 63 | kernel_size=(3, 3), 64 | max_pool_size=(2, 2), 65 | stride=(2, 2)): 66 | """Initialize of all the variables and the layers depending on the arguments supplied. 67 | 68 | Args: 69 | emb_dims_non_seq = (List of tuples (x, y)) where x is the vocab size and y is the number of dimensions 70 | for the respective embedding layer for every non_sequential categorical variable 71 | emb_dropout_non_seq = (Float) Dropout value of a layer used after the embedding layer - non_sequential 72 | emb_dims_seq = (List of tuples (x, y)) where x is the vocab size and y is the number of dimensions for the 73 | respective embedding layer for every sequential categorical variable 74 | emb_dropout_seq = (Float) Dropout value of a layer used after the embedding layer - sequential 75 | emb_lin_layer_sizes_non_seq = (List of integers) determining the sizes of the stacked linear layers 76 | used just after the embedding layers to learn better representations for non_sequential 77 | categorical variables 78 | emb_lin_layer_dropouts_non_seq = (List of float) values determining the p values in the dropout 79 | layers between linear layers 80 | emb_lin_layer_sizes_seq = (List of integers) determining the sizes of the stacked linear layers 81 | used just after the embedding layers to learn better representations for sequential 82 | categorical variables 83 | emb_lin_layer_dropouts_seq = (List of float) values determining the p values in the dropout 84 | layers between linear layers 85 | lstm_hidden_size = (Integer) determining the Hidden size of the LSTM layer used to train the sequence model 86 | output_size = (Integer) Size of the final embedded output by the encoder. 87 | seq_len = (Integer) determining the length of the sequence in input 88 | non_seq_cont_count = (Integer) Number of non_sequential continuous variables 89 | seq_cat_count = (Integer) Number of sequential categorical variables 90 | seq_cont_count = (Integer) Number of sequential continuous variables 91 | non_seq_cat_count = (Integer) Number of non_sequential categorical variables 92 | attention_heads = (Integer: Default = 1) Describes the number of attention heads being used after the LSTM. 93 | When 0 means that attention is not being used. 
94 | When = 1 uses Bahdanau attention by default and 95 | When > 1 uses Multi-Head self-attention 96 | non_seq_pretrained_embs = (List of Tensors: Default = None) To be used as pretrained embeddings 97 | in the embedding layers 98 | freeeze_non_seq_pretrained_embs = (Boolean: Default = True) Determines if the pretrained embeddings 99 | are to be left untouched during backprop 100 | seq_pretrained_embs = (List of Tensors: Default = None) To be used as pretrained embeddings in the 101 | embedding layers, 102 | freeeze_seq_pretrained_embs = (Boolean: Default = True) Determines if the pretrained embeddings 103 | are to be left untouched during backprop 104 | lstm_num_layers = (Integer: Default = 1) The number of stacked LSTM layers used 105 | lstm_bidirectional = (Boolean: Default = False) Determines if the LSTM used is bidirectional 106 | use_conv_agg = (Boolean: Default = False) Determines if Convolutional aggregation is to be used in 107 | the model or not 108 | kernel_size = (Tuple of Integers : Default = (3,3)) Determines the kernel size of the cnn aggregator 109 | max_pool_size = (Tuple of Integers : Default = (2, 2)) Determines the max_pool size of the cnn aggregator 110 | stride = (Tuple of Integers : Default = (2, 2)) Determines the stride of the cnn aggregator 111 | """ 112 | super().__init__() 113 | 114 | self._explain = False 115 | self.non_seq_emb_layers = CategoricalEmbedding(emb_dims=emb_dims_non_seq, emb_dropout=emb_dropout_non_seq, 116 | pretrained_vecs=non_seq_pretrained_embs, 117 | freeze_pretrained=freeze_non_seq_pretrained_embs) 118 | self.seq_emb_layers = CategoricalEmbedding(emb_dims=emb_dims_seq, emb_dropout=emb_dropout_seq, is_seq=True, 119 | pretrained_vecs=seq_pretrained_embs, 120 | freeze_pretrained=freeze_seq_pretrained_embs) 121 | 122 | self.no_of_embs_non_seq = np.sum([y for x, y in emb_dims_non_seq]) 123 | self.no_of_embs_seq = np.sum([y for x, y in emb_dims_seq]) 124 | 125 | self.non_seq_cat_final_size = 0 126 | self.seq_len = seq_len 127 | self.hidden_size = lstm_hidden_size 128 | self.context_vector_size = lstm_hidden_size 129 | self.output_dim = output_size 130 | self.num_layers = lstm_num_layers 131 | self.num_directions = 2 if lstm_bidirectional else 1 132 | 133 | self.seq_cat_count = seq_cat_count 134 | self.seq_cont_count = seq_cont_count 135 | self.non_seq_cat_count = non_seq_cat_count 136 | self.non_seq_cont_count = non_seq_cont_count 137 | self.attention_heads = attention_heads 138 | 139 | self.use_conv_agg = use_conv_agg 140 | 141 | # Linear Layers for non_seq_data parallel to LSTM 142 | if self.no_of_embs_non_seq != 0: 143 | self.emb_lin_layer_non_seq = DenseBnDropout( 144 | lin_layer_sizes=emb_lin_layer_sizes_non_seq, 145 | lin_layer_dropouts=emb_lin_layer_dropouts_non_seq, input_size=self.no_of_embs_non_seq) 146 | self.non_seq_cat_final_size = emb_lin_layer_sizes_non_seq[-1] 147 | 148 | # LSTM layer 149 | if self.no_of_embs_seq != 0: 150 | self.emb_lin_layer_seq = DenseBnDropout( 151 | lin_layer_sizes=emb_lin_layer_sizes_seq, 152 | lin_layer_dropouts=emb_lin_layer_dropouts_seq, input_size=self.no_of_embs_seq) 153 | 154 | # LSTM layer 155 | if self.no_of_embs_seq != 0: 156 | self.emb_lin_layer_seq = DenseBnDropout( 157 | lin_layer_sizes=emb_lin_layer_sizes_seq, 158 | lin_layer_dropouts=emb_lin_layer_dropouts_seq, input_size=self.no_of_embs_seq) 159 | self.lstm_inp_size = emb_lin_layer_sizes_seq[-1] + seq_cont_count 160 | else: 161 | self.lstm_inp_size = seq_cont_count 162 | 163 | if use_conv_agg and seq_len >= kernel_size[0] and \ 164 | (min(1, 
seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count) >= kernel_size[1] and \ 165 | int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count - 166 | (kernel_size[1] - 1))/stride[1]) >= max_pool_size[1] and \ 167 | int((seq_len - (kernel_size[0] - 1))/stride[0]) >= max_pool_size[0]: 168 | # kernel_size[0] -> size of kernel along sequence dimension, hence must be <= seq_len 169 | # kernel_size[1] -> size of kernel along features dimension, hence must be <= net size of input features 170 | # int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] 171 | # + seq_cont_count - (kernel_size[i] - 1))/stride[i]) 172 | # is the formula to calculate the final size of dimension i after the CNN filter is applied 173 | # the above size should be >= max_pool[i] for pooling 174 | self.conv_agg = ConvAggregation( 175 | kernel_size=kernel_size, stride=stride, max_pool_size=max_pool_size, dropout_size=0.4) 176 | self.lstm_inp_size = int((int((min(1, seq_cat_count)*emb_lin_layer_sizes_seq[-1] + seq_cont_count - ( 177 | kernel_size[1] - 1) - 1)/stride[1] + 1)) / max_pool_size[1]) 178 | else: 179 | self.use_conv_agg = False 180 | 181 | if self.lstm_inp_size > 0: 182 | self.lstm_layer = MultiLayerLSTM(input_size=self.lstm_inp_size, hidden_size=self.hidden_size, 183 | num_layers=self.num_layers, bidirectional=lstm_bidirectional, dropout=0.4) 184 | 185 | # Attention 186 | if self.attention_heads > 0: 187 | if self.attention_heads == 1: 188 | self.bahdanau_attention_layer = BahdanauAttention(self.hidden_size, self.num_directions) 189 | else: 190 | n_head = self.attention_heads 191 | d_model = self.hidden_size 192 | self.multi_head_attention_layer = MultiHeadAttentionLSTMWrapper(n_head, d_model, dropout=0.1) 193 | 194 | if self.attention_heads > 0: 195 | self.output_layer = nn.Linear(int(self.num_directions*self.hidden_size + self.context_vector_size + 196 | self.non_seq_cat_final_size + self.non_seq_cont_count), 197 | int(self.hidden_size)) 198 | else: 199 | self.output_layer = nn.Linear(int(self.num_directions*self.hidden_size + 200 | self.non_seq_cat_final_size + self.non_seq_cont_count), 201 | int(self.hidden_size)) 202 | nn.init.kaiming_normal_(self.output_layer.weight.data) 203 | 204 | def forward(self, *args): # noqa: R0912, R0914 205 | """Forward function accepts multiple arguments. 
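For example (tensor names hypothetical; the index convention is spelled out below), a batch carrying sequential data and non-sequential continuous data, but no non-sequential categoricals, would be passed as:

    >>> nonempty_idx = [0, 1, -1, 2]  # seq_cat, seq_cont, (absent), non_seq_cont
    >>> output, (encoded, cn) = encoder(seq_cat, seq_cont, non_seq_cont, nonempty_idx)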
206 | 207 | The last argument is always a list of indices representing the index (if data present) 208 | with -1 in places for the absence of data The indices are used to partition the data into 4 types 209 | - seq_cat, seq_cont, non_seq_cat, non_seq_cont 210 | """ 211 | nonempty_idx = args[-1] 212 | data_exists = list(map(lambda x: x != -1, nonempty_idx)) 213 | device = args[0].device 214 | batch_size = args[0].shape[0] 215 | 216 | seq_cat_data = args[nonempty_idx[SEQ_CAT_INDEX]] if data_exists[SEQ_CAT_INDEX] else torch.empty(batch_size, 0, 0, device=device) 217 | seq_cont_data = args[nonempty_idx[SEQ_CONT_INDEX]] if data_exists[SEQ_CONT_INDEX] else torch.empty(batch_size, 0, 0, device=device) 218 | non_seq_cat_data = args[nonempty_idx[NON_SEQ_CAT_INDEX]] if data_exists[NON_SEQ_CAT_INDEX] else torch.empty(batch_size, 0, device=device) 219 | non_seq_cont_data = args[nonempty_idx[NON_SEQ_CONT_INDEX]] if data_exists[NON_SEQ_CONT_INDEX] else torch.empty(batch_size, 0, device=device) 220 | 221 | if self.no_of_embs_non_seq != 0: 222 | non_seq_cat_inp = self.non_seq_emb_layers(non_seq_cat_data) 223 | non_seq_inp = self.emb_lin_layer_non_seq(non_seq_cat_inp) 224 | 225 | if self.non_seq_cont_count != 0: 226 | non_seq_inp = torch.cat((non_seq_inp.type(torch.FloatTensor).to(device), 227 | non_seq_cont_data.type(torch.FloatTensor).to(device)), 1) 228 | else: 229 | if self.non_seq_cont_count != 0: 230 | non_seq_inp = non_seq_cont_data.to(device) 231 | else: 232 | non_seq_inp = torch.Tensor().to(device) 233 | 234 | if self.no_of_embs_seq != 0: 235 | seq_cat_inp = self.seq_emb_layers(seq_cat_data) 236 | seq_inp = self.emb_lin_layer_seq(seq_cat_inp) 237 | if self.seq_cont_count != 0: 238 | seq_inp = torch.cat((seq_inp.type(torch.FloatTensor).to(device), 239 | seq_cont_data.type(torch.FloatTensor).to(device)), 2) 240 | 241 | elif self.seq_cont_count != 0: 242 | seq_inp = seq_cont_data.type(torch.FloatTensor).to(device) 243 | 244 | if self.no_of_embs_seq + self.seq_cont_count > 0: 245 | 246 | if self.use_conv_agg: 247 | seq_inp = self.conv_agg(seq_inp) 248 | 249 | output, (_, cn), seq_inp = self.lstm_layer(seq_inp) 250 | 251 | if self.attention_heads > 0: 252 | if self.attention_heads == 1: 253 | context_vector = self.bahdanau_attention_layer(output) 254 | context_vector = context_vector.reshape(context_vector.size()[0], context_vector.size()[2]) 255 | else: 256 | context_vector = self.multi_head_attention_layer(output, output, output) 257 | 258 | fin_input = torch.cat((seq_inp, context_vector), 1) 259 | else: 260 | fin_input = seq_inp 261 | 262 | if self.no_of_embs_non_seq + self.non_seq_cont_count > 0: 263 | fin_input = torch.cat((non_seq_inp.type(torch.FloatTensor).to(device), fin_input), 1) 264 | else: 265 | fin_input = non_seq_inp 266 | 267 | fin_output = F.relu(self.output_layer(fin_input)) 268 | 269 | if self._explain: 270 | return fin_output 271 | return output, (fin_output, cn) 272 | 273 | @property 274 | def explain(self): 275 | """Getter for explain.""" 276 | 277 | return self._explain 278 | 279 | def set_explain(self, value): 280 | """Setter for explain.""" 281 | 282 | self._explain = value 283 | -------------------------------------------------------------------------------- /caspr/models/unified_transformer_encoder.py: -------------------------------------------------------------------------------- 1 | """CASPR Transfomer model.""" 2 | 3 | import warnings 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | from caspr.models.embedding_layer import CategoricalEmbedding 9 | 10 | 
warnings.simplefilter('ignore') 11 | 12 | SEQ_CAT_INDEX = 0 13 | SEQ_CONT_INDEX = 1 14 | NON_SEQ_CAT_INDEX = 2 15 | NON_SEQ_CONT_INDEX = 3 16 | 17 | 18 | class UnifiedTransformerEncoder(nn.Module): # noqa: R0902, W0223 19 | """Encapsulates the basic structure to run most of our models. 20 | 21 | It checks the various conditions for the presence or the absence of data 22 | and is compatible with functionalities like 23 | 1. Usage of pretrained embedding vectors 24 | 2. Multi-Layered transformer use 25 | 3. Convolutional Aggregation for data 26 | 4. Self-Multi-Head and Bahdanau Attention (when number of heads = 1, Bahdanau is used by default) 27 | 28 | In this new edition, it is compatible with the DLExplainer module and should be used if explainability 29 | is a requirement 30 | """ 31 | 32 | def __init__(self, # noqa: R0913, R0914 33 | transformer_encoder, 34 | emb_dims_non_seq, 35 | emb_dropout_non_seq, 36 | emb_dims_seq, 37 | emb_dropout_seq, 38 | hidden_size, 39 | seq_cont_count, 40 | non_seq_cont_count, 41 | non_seq_pretrained_embs=None, 42 | freeze_non_seq_pretrained_embs=True, 43 | seq_pretrained_embs=None, 44 | freeze_seq_pretrained_embs=True): 45 | """Initialize model with params.""" 46 | 47 | super().__init__() 48 | 49 | self._explain = False 50 | 51 | self.emb_non_seq = CategoricalEmbedding(emb_dims=emb_dims_non_seq, emb_dropout=emb_dropout_non_seq, 52 | is_seq=False, pretrained_vecs=non_seq_pretrained_embs, 53 | freeze_pretrained=freeze_non_seq_pretrained_embs) 54 | self.emb_seq = CategoricalEmbedding(emb_dims=emb_dims_seq, emb_dropout=emb_dropout_seq, 55 | is_seq=True, pretrained_vecs=seq_pretrained_embs, 56 | freeze_pretrained=freeze_seq_pretrained_embs) 57 | 58 | self.hid_dim = hidden_size 59 | self.seq_cont_dim = seq_cont_count 60 | self.non_seq_cont_dim = non_seq_cont_count 61 | 62 | # Linear layers for seq_data 63 | seq_inp_size = self.emb_seq.emb_size + self.seq_cont_dim 64 | self.linear_seq = nn.Linear(seq_inp_size, self.hid_dim) 65 | 66 | # Linear layers for non_seq_data 67 | non_seq_inp_size = self.emb_non_seq.emb_size + self.non_seq_cont_dim 68 | self.linear_non_seq = nn.Linear(non_seq_inp_size, self.hid_dim) if non_seq_inp_size else None 69 | 70 | self.transformer_encoder = transformer_encoder 71 | 72 | def forward(self, *args): 73 | """Run a forward pass of model over the data.""" 74 | 75 | nonempty_idx = args[-1] 76 | data_exists = list(map(lambda x: x != -1, nonempty_idx)) 77 | device = args[0].device 78 | batch_size, seq_len = args[0].shape[:2] 79 | 80 | seq_cat_data = args[nonempty_idx[SEQ_CAT_INDEX]] if data_exists[SEQ_CAT_INDEX] else torch.empty(batch_size, seq_len, 0, device=device) 81 | seq_cont_data = args[nonempty_idx[SEQ_CONT_INDEX]] if data_exists[SEQ_CONT_INDEX] else torch.empty(batch_size, seq_len, 0, device=device) 82 | non_seq_cat_data = args[nonempty_idx[NON_SEQ_CAT_INDEX]] if data_exists[NON_SEQ_CAT_INDEX] else torch.empty(batch_size, 0, device=device) 83 | non_seq_cont_data = args[nonempty_idx[NON_SEQ_CONT_INDEX]] if data_exists[NON_SEQ_CONT_INDEX] else torch.empty(batch_size, 0, device=device) 84 | 85 | if self.emb_seq and data_exists[SEQ_CAT_INDEX]: 86 | seq_cat_data = self.emb_seq(seq_cat_data) 87 | seq_inp = torch.cat((seq_cat_data, seq_cont_data), -1) 88 | seq_inp = self.linear_seq(seq_inp) 89 | 90 | if self.emb_non_seq and data_exists[NON_SEQ_CAT_INDEX]: 91 | non_seq_cat_data = self.emb_non_seq(non_seq_cat_data) 92 | non_seq_inp = torch.cat((non_seq_cat_data, non_seq_cont_data), -1) 93 | if self.linear_non_seq: 94 | non_seq_inp = 
self.linear_non_seq(non_seq_inp).unsqueeze(1) 95 | 96 | src_inp = torch.cat((seq_inp, non_seq_inp), 1) if non_seq_inp.nelement() > 0 else seq_inp 97 | # src_inp = [batch_size, src len, hid dim] 98 | 99 | enc_src, src_mask = self.transformer_encoder(src_inp) 100 | 101 | if self._explain: 102 | return enc_src.reshape(enc_src.shape[0], -1) 103 | return enc_src, src_mask, src_inp 104 | 105 | @property 106 | def explain(self): 107 | """Getter for explain.""" 108 | 109 | return self._explain 110 | 111 | def set_explain(self, value): 112 | """Setter for explain.""" 113 | 114 | self._explain = value 115 | -------------------------------------------------------------------------------- /caspr/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/__init__.py -------------------------------------------------------------------------------- /caspr/utils/early_stopping.py: -------------------------------------------------------------------------------- 1 | """Early stopping class for nn models.""" 2 | 3 | import logging 4 | 5 | import torch 6 | from torch.nn.parallel import DistributedDataParallel as DDP 7 | 8 | from caspr.utils.onnx import export_onnx, register_custom_op 9 | 10 | 11 | class EarlyStopping: 12 | """Stop the training early and save a PyTorch or ONNX model after specified iterations (patience)""" 13 | 14 | def __init__(self, logger, should_decrease,patience=3, verbose=True, delta=0, save_onnx=False): 15 | """Initialize the early stopping module. 16 | 17 | Args: 18 | logger: For logging 19 | should_decrease (bool): True if metrics improve by decreasing. 20 | patience (int): How long to wait after last time validation score improved. 21 | Default: 3 22 | verbose (bool): If True, prints a message for each validation score improvement. 23 | Default: True 24 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 25 | Default: 0 26 | save_onnx (bool): If True, export the model as onnx format. 27 | Default: False 28 | """ 29 | self.logger = logger 30 | self.patience = patience 31 | self.verbose = verbose 32 | self.counter = 0 33 | self.best_score = None 34 | self.early_stop = False 35 | self.should_decrease = should_decrease 36 | self.delta = delta 37 | self.save_onnx = save_onnx 38 | if self.save_onnx: 39 | register_custom_op() 40 | 41 | def __call__(self, val_score, model, path): 42 | """Define __call__ method. 43 | 44 | Args: 45 | val_score (float): Validation score to determine whether to early stop. 46 | model (nn.Module): Model being trained. 47 | path (str): Model save path. 48 | """ 49 | 50 | if self.should_decrease: 51 | val_score = -val_score 52 | 53 | if self.best_score is None: 54 | self.best_score = val_score 55 | self.save_checkpoint(model, path) 56 | elif val_score <= self.best_score + self.delta: 57 | self.counter += 1 58 | self.logger.info('EarlyStopping counter: {} out of {}\n'.format(self.counter, self.patience)) 59 | if self.counter >= self.patience: 60 | self.early_stop = True 61 | else: 62 | self.best_score = val_score 63 | self.save_checkpoint(model, path) 64 | self.counter = 0 65 | 66 | def save(self, model, path): 67 | if self.save_onnx: 68 | export_onnx(model, path) 69 | else: 70 | torch.save(model.state_dict(), path) 71 | 72 | def save_checkpoint(self, model, path): 73 | """Save model when validation score improves. 
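For instance (model and path names illustrative; see the note below on passing lists), an encoder/decoder pair can be checkpointed together through __call__:

    >>> early_stopping(val_loss, [encoder, decoder], ['encoder.pth', 'decoder.pth'])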
74 | 75 | The model parameter can be a list that allows multiple models to be saved. 76 | """ 77 | 78 | if self.verbose: 79 | self.logger.info('Validation score improved. Saving model ...\n') 80 | if not isinstance(model, list): 81 | self.save(model, path) 82 | else: 83 | for m, p in zip(model, path): 84 | self.save(m, p) 85 | 86 | 87 | class DistributedEarlyStopping(EarlyStopping): 88 | def __init__(self, logger, should_decrease=True, patience=3, verbose=True, delta=0, rank=None, save_onnx=False): 89 | super().__init__(logger, should_decrease, patience=patience, verbose=verbose, delta=delta, save_onnx=save_onnx) 90 | self.rank = rank 91 | 92 | def __call__(self, val_score, model, path, rank=None): 93 | if rank is None: 94 | rank = self.rank 95 | 96 | if rank and rank > 0: 97 | return 98 | 99 | if isinstance(model, DDP): 100 | model = model.module 101 | 102 | return super().__call__(val_score, model, path) 103 | 104 | 105 | if __name__ == '__main__': 106 | 107 | class TwoLayerNet(torch.nn.Module): 108 | """Simple two-layer neural network for demonstration purposes.""" 109 | 110 | def __init__(self, D_in, H, D_out): 111 | """Instantiate two nn.Linear modules and assign them as member variables.""" 112 | 113 | super().__init__() 114 | 115 | self.linear1 = torch.nn.Linear(D_in, H) 116 | self.linear2 = torch.nn.Linear(H, D_out) 117 | 118 | def forward(self, x): 119 | """In the forward function we accept a Tensor of input data and we must return a Tensor of output data. 120 | 121 | We can use Modules defined in the constructor as well as arbitrary operators on Tensors. 122 | """ 123 | 124 | h_relu = self.linear1(x).clamp(min=0) 125 | y = self.linear2(h_relu) 126 | return y 127 | 128 | # batch_size is the batch size; input_dim is the input dimension; 129 | # hidden_dim is the hidden dimension; output_dim is the output dimension. 130 | batch_size, input_dim, hidden_dim, output_dim = 1000, 1000, 100, 10 131 | 132 | # Create random Tensors to hold inputs and outputs 133 | X = torch.randn(batch_size, input_dim) 134 | y_true = torch.randn(batch_size, output_dim) 135 | 136 | # Construct our model by instantiating the class defined above 137 | mlp = TwoLayerNet(input_dim, hidden_dim, output_dim) 138 | 139 | # Construct our loss function and an Optimizer. The call to model.parameters() 140 | # in the SGD constructor will contain the learnable parameters of the two 141 | # nn.Linear modules which are members of the model. 142 | criterion = torch.nn.MSELoss(reduction='sum') 143 | optimizer = torch.optim.SGD(mlp.parameters(), lr=1e-4) 144 | logger = logging.getLogger(__name__) 145 | early_stopping = EarlyStopping(logger, should_decrease=True, patience=3, verbose=True, delta=1e-5) 146 | 147 | for t in range(10000): 148 | # Forward pass: Compute predicted y by passing x to the model 149 | y_pred = mlp(X) 150 | 151 | # Compute and log the loss 152 | loss = criterion(y_pred, y_true) 153 | if t % 100 == 99: 154 | logger.info('iteration %d: loss %f', t, loss.item()) 155 | early_stopping(loss.item(), mlp, 'early_stopping_test_model.pth') 156 | if early_stopping.early_stop: 157 | break 158 | 159 | # Zero gradients, perform a backward pass, and update the weights. 
160 | optimizer.zero_grad() 161 | loss.backward() 162 | optimizer.step() 163 | 164 | mlp.load_state_dict(torch.load('early_stopping_test_model.pth')) 165 | y_pred = mlp(X) 166 | loss = criterion(y_pred, y_true) 167 | logger.info('Best loss: {}'.format(loss.item())) 168 | -------------------------------------------------------------------------------- /caspr/utils/estimate_parameters.py: -------------------------------------------------------------------------------- 1 | def estimate_linear_parameters(input_dim, output_dim, bias=True): 2 | if input_dim > 0 and bias: 3 | input_dim += 1 4 | return input_dim * output_dim 5 | 6 | 7 | def estimate_embedding_parameters(df, cat_cols_, max_emb_dim): 8 | emb_num_classes = [df.select(c).distinct().count() for c in cat_cols_] 9 | emb_dims = [(x, int(min(max_emb_dim, (x + 1) // 2))) for x in emb_num_classes] 10 | emb_size = sum([d for _, d in emb_dims]) 11 | emb_num_param = sum([estimate_linear_parameters(v, d, bias=False) for v, d in emb_dims]) 12 | return emb_num_param, emb_size, emb_num_classes 13 | 14 | 15 | def estimate_transformer_parameters(hidden_dim, seq_len, pf_dim, num_layers, is_encoder=True): 16 | pos_emb_num_param = estimate_linear_parameters(seq_len, hidden_dim, bias=False) if is_encoder else 0 17 | layer_norm_num_param = hidden_dim * 2 18 | attn_num_param = estimate_linear_parameters(hidden_dim, hidden_dim) * 4 19 | layer_norm_count = 2 if is_encoder else 3 20 | attn_count = 1 if is_encoder else 2 21 | pf_num_param = estimate_linear_parameters(hidden_dim, pf_dim) + estimate_linear_parameters(pf_dim, hidden_dim) 22 | transformer_num_param = pos_emb_num_param + \ 23 | (layer_norm_num_param * layer_norm_count + attn_num_param * attn_count + pf_num_param) * num_layers 24 | return transformer_num_param 25 | 26 | 27 | def estimate_output_parameters(hidden_dim, emb_num_classes, cont_dim): 28 | output_num_param_cat = sum([estimate_linear_parameters(hidden_dim, v) for v in emb_num_classes]) 29 | output_num_param_cont = estimate_linear_parameters(hidden_dim, cont_dim) 30 | output_num_param = output_num_param_cat + output_num_param_cont 31 | return output_num_param 32 | 33 | 34 | def estimate_transformer_autoencoder_parameters(df, seq_cat_, seq_cont_, non_seq_cat_, non_seq_cont_, 35 | hidden_dim, pf_dim_enc, pf_dim_dec, num_layers_enc, 36 | num_layers_dec, seq_len, max_emb_dim=30): 37 | emb_num_param_seq, emb_size_seq, emb_num_classes_seq = estimate_embedding_parameters(df, seq_cat_, max_emb_dim) 38 | emb_num_param_non_seq, emb_size_non_seq, emb_num_classes_non_seq = estimate_embedding_parameters(df, non_seq_cat_, max_emb_dim) 39 | emb_num_param = emb_num_param_seq + emb_num_param_non_seq 40 | 41 | seq_cont_dim = len(seq_cont_) 42 | non_seq_cont_dim = len(non_seq_cont_) 43 | non_seq_dim = emb_size_non_seq + non_seq_cont_dim 44 | linear_num_param_seq = estimate_linear_parameters(seq_cont_dim + emb_size_seq, hidden_dim) 45 | linear_num_param_non_seq = estimate_linear_parameters(non_seq_dim, hidden_dim) 46 | linear_num_param = linear_num_param_seq + linear_num_param_non_seq 47 | 48 | adjust_seq_len = seq_len + int(non_seq_dim > 0) 49 | enc_num_param = estimate_transformer_parameters(hidden_dim, adjust_seq_len, pf_dim_enc, num_layers_enc) 50 | dec_num_param = estimate_transformer_parameters(hidden_dim, adjust_seq_len, pf_dim_dec, num_layers_dec, is_encoder=False) 51 | transformer_num_param = enc_num_param + dec_num_param 52 | 53 | output_num_param_seq = estimate_output_parameters(hidden_dim, emb_num_classes_seq, seq_cont_dim) 54 | 
output_num_param_non_seq = estimate_output_parameters(hidden_dim, emb_num_classes_non_seq, non_seq_cont_dim) 55 | output_num_param = output_num_param_seq + output_num_param_non_seq 56 | 57 | num_param = emb_num_param + linear_num_param + transformer_num_param + output_num_param 58 | return num_param 59 | -------------------------------------------------------------------------------- /caspr/utils/explain/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # -------------------------------------------------------------------------- 4 | 5 | """The explainer module for the CASPR library. 6 | 7 | Modules: 8 | :module1_name: A description of this specific module. 9 | """ 10 | -------------------------------------------------------------------------------- /caspr/utils/explain/utils.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def separate_pos_neg(attribution): 7 | """Separate out positive and negative attributes in the dataframe with attributes. 8 | 9 | Outputs two separated dataframes 10 | """ 11 | attribution_pos_val = attribution*(attribution >= 0) 12 | attribution_neg_val = attribution*~(attribution >= 0) 13 | return attribution_pos_val, attribution_neg_val 14 | 15 | 16 | def visualize(explanations: pd.DataFrame, separate_pos_neg_imp: bool = False, 17 | title="Average Feature Importances", axis_title="Features", save_fig: str = None): 18 | """Visualize explanations. 19 | 20 | Utility function used to create bar graph visualisations at a model level 21 | 22 | Args: 23 | explanations (pandas dataframe): Dataframe with feature attributions 24 | separate_pos_neg_imp (Boolean: Default = False): Determines if the positive and negative attributions are to be 25 | aggregated and plotted separately (two reverse sided bars) in the same plot 26 | title (String : Default = "Average Feature Importances") : Represents the title of the graph 27 | axis_title (String: Default = "Features") : Represents the title of the Y axis 28 | save_fig (String) : Contains the path where to save the image plot. 
If None: the module doesn't save the image. 29 | 30 | """ 31 | feature_names = explanations.columns 32 | imp_pos_df, imp_neg_df = separate_pos_neg(explanations) 33 | combine_importances = not separate_pos_neg_imp 34 | 35 | importances_pos = imp_pos_df.values 36 | importances_neg = imp_neg_df.values 37 | 38 | if importances_pos.ndim == 2: 39 | importances_pos = np.mean(importances_pos, axis=0) 40 | importances_neg = np.mean(importances_neg, axis=0) 41 | 42 | xlim_pos = np.max(importances_pos)*1.25 43 | xlim_neg = np.max(np.abs(importances_neg))*1.25 44 | 45 | if combine_importances: 46 | xlim_pos += xlim_neg 47 | xlim_neg = 0 48 | importances_pos += np.abs(importances_neg) 49 | 50 | else: 51 | xlim_pos = np.max([xlim_pos, xlim_neg]) 52 | xlim_neg = -1 * xlim_pos 53 | 54 | x_pos = (np.arange(len(feature_names))) 55 | 56 | # Plotting begins 57 | plt.figure(figsize=(10, 10)) 58 | width = 0.3 59 | 60 | if combine_importances: 61 | plt.barh(x_pos, importances_pos, width, align='center') 62 | else: 63 | plt.barh(x_pos, importances_pos, width, align='center') 64 | plt.barh(x_pos + width, importances_neg, width, align='center') 65 | 66 | plt.yticks(x_pos + width/2, feature_names, wrap=True) 67 | plt.ylabel(axis_title) 68 | plt.title(title) 69 | axes = plt.gca() 70 | axes.set_xlim([xlim_neg, xlim_pos]) 71 | 72 | if save_fig is not None: 73 | plt.savefig(save_fig) 74 | -------------------------------------------------------------------------------- /caspr/utils/horovod/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | 3 | # 4 | 5 | # Unless required by applicable law or agreed to in writing, software 6 | 7 | # distributed under the License is distributed on an "AS IS" BASIS, 8 | 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | 11 | # See the License for the specific language governing permissions and 12 | 13 | # limitations under the License.
14 | 15 | # 16 | 17 | # ============================================================================== -------------------------------------------------------------------------------- /caspr/utils/horovod/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import horovod.torch as hvd 4 | import torch 5 | import torch.nn as nn 6 | from torch import optim 7 | from torch.utils.data.distributed import DistributedSampler 8 | 9 | from caspr.data.common_dataset import id_collate 10 | from caspr.utils.early_stopping import DistributedEarlyStopping 11 | from caspr.utils.train import init_lr_schedulers, run_autoencoder, run_autoencoder_val 12 | 13 | BATCH_SIZE = 1024 * 32 14 | NUM_EPOCHS = 100 15 | EARLY_STOPPING_PATIENCE = 8 16 | EARLY_STOPPING_DELTA = 1e-5 17 | ROOT_RANK = 0 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def save_checkpoint(model, optimizer, epoch, name): 22 | filepath = '/checkpoint-{epoch}-{model}.pth'.format(epoch=epoch, model=name) 23 | state = { 24 | 'model': model.state_dict(), 25 | 'optimizer': optimizer.state_dict(), 26 | } 27 | torch.save(state, filepath) 28 | 29 | 30 | def metric_average(metric, name): 31 | avg_tensor = hvd.allreduce(metric, name=name) 32 | return avg_tensor.item() 33 | 34 | 35 | def determine_early_stop(early_stopper: DistributedEarlyStopping, loss_averaged, model, path, epoch, num_epochs): 36 | # Call the distributed early stopper while passing rank info 37 | # Only rank 0 is allowed to checkpoint 38 | early_stopper(loss_averaged, model, 39 | path, hvd.rank()) 40 | if early_stopper.early_stop: 41 | epoch = num_epochs 42 | # The decision on whether to stop is made by the root rank 43 | # and is then broadcast to the other nodes 44 | epoch = hvd.broadcast_object(epoch, root_rank=ROOT_RANK) 45 | 46 | # The root rank loads the latest model checkpoint and broadcasts parameters 47 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 48 | model.load_state_dict(torch.load(path)) 49 | hvd.broadcast_parameters(model.state_dict(), root_rank=ROOT_RANK) 50 | return epoch 51 | 52 | 53 | def train_hvd(dataset_train, autoenc, device, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stopping_test_model.pth'): 54 | autoenc.train() 55 | hvd.init() 56 | logger.info("Number of workers:" + str(hvd.size())) 57 | 58 | if device.type == 'cuda': 59 | # Horovod: pin GPU to local rank. 60 | torch.cuda.set_device(hvd.local_rank()) 61 | 62 | # Configure the sampler such that each worker obtains a distinct sample of input dataset. 63 | train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank()) 64 | train_loader = torch.utils.data.DataLoader( 65 | dataset_train, batch_size=batch_size, sampler=train_sampler, collate_fn=id_collate) 66 | 67 | num_epochs = epochs 68 | 69 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 70 | # An increase in learning rate compensates for the increased batch size. 71 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 72 | # Wrap the optimizer with Horovod's DistributedOptimizer.
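# Horovod's wrapper intercepts step(): each worker's gradients are averaged
# across all workers with allreduce before the underlying Adam update is
# applied, keeping every replica's parameters in sync after each batch.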
73 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 74 | 75 | scheduler_wu, scheduler_re = init_lr_schedulers( 76 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 77 | 78 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 79 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 80 | 81 | # Broadcast initial parameters so all workers start with the same parameters. 82 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 83 | 84 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 85 | 86 | losses = [] 87 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 88 | 89 | epoch = 1 90 | while epoch < num_epochs + 1: 91 | losses, _ = run_autoencoder(autoenc, optimizer, train_loader, criterion, device) 92 | loss_averaged = metric_average(torch.tensor(losses), 'avg_loss') 93 | logger.info("Average overall training loss in epoch {0} is {1}".format( 94 | epoch, loss_averaged)) 95 | 96 | epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs) 97 | 98 | if epoch <= warmup_epochs: 99 | scheduler_wu.step() 100 | scheduler_re.step(loss_averaged) 101 | 102 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 103 | if save_model: 104 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 105 | return autoenc, loss_averaged 106 | epoch = epoch+1 107 | 108 | 109 | def train_val_hvd(dataset_train, dataset_val, autoenc, device, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stopping_test_model.pth'): 110 | autoenc.train() 111 | hvd.init() 112 | logger.info("Number of workers:" + str(hvd.size())) 113 | 114 | if device.type == 'cuda': 115 | # Horovod: pin GPU to local rank. 116 | torch.cuda.set_device(hvd.local_rank()) 117 | 118 | # Configure the sampler such that each worker obtains a distinct sample of input dataset. 119 | train_sampler = DistributedSampler(dataset_train, num_replicas=hvd.size(), rank=hvd.rank()) 120 | train_loader = torch.utils.data.DataLoader( 121 | dataset_train, batch_size=batch_size, sampler=train_sampler, collate_fn=id_collate) 122 | 123 | val_sampler = DistributedSampler(dataset_val, num_replicas=hvd.size(), rank=hvd.rank()) 124 | val_loader = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, 125 | sampler=val_sampler, collate_fn=id_collate) 126 | 127 | num_epochs = epochs 128 | 129 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 130 | # An increase in learning rate compensates for the increased batch size. 131 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 132 | # Wrap the optimizer with Horovod's DistributedOptimizer. 133 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 134 | 135 | scheduler_wu, scheduler_re = init_lr_schedulers( 136 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 137 | 138 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 139 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 140 | 141 | # Broadcast initial parameters so all workers start with the same parameters. 
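# Without this broadcast every worker would start from its own random
# initialization, and averaging gradients across workers would no longer
# correspond to training a single consistent model.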
142 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 143 | 144 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 145 | 146 | losses = [] 147 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 148 | 149 | epoch = 1 150 | while epoch < num_epochs + 1: 151 | autoenc.train() 152 | losses, _ = run_autoencoder(autoenc, optimizer, train_loader, criterion, device) 153 | autoenc.eval() 154 | losses_val = run_autoencoder_val(autoenc, val_loader, criterion, device) 155 | loss_train_averaged = metric_average(torch.tensor(losses), 'avg_train_loss') 156 | loss_val_averaged = metric_average(torch.tensor(losses_val), 'avg_val_loss') 157 | 158 | logger.info("Average training loss in epoch {0} is {1}".format(epoch, loss_train_averaged)) 159 | logger.info("Average validation loss in epoch {0} is {1}".format(epoch, loss_val_averaged)) 160 | 161 | if epoch <= warmup_epochs: 162 | scheduler_wu.step() 163 | scheduler_re.step(loss_val_averaged) 164 | 165 | epoch = determine_early_stop(early_stopper, loss_val_averaged, autoenc, path, epoch, num_epochs) 166 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 167 | if save_model: 168 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 169 | return autoenc, loss_val_averaged 170 | epoch = epoch+1 171 | -------------------------------------------------------------------------------- /caspr/utils/metrics.py: -------------------------------------------------------------------------------- 1 | """Get classification metrics for CASPR models.""" 2 | 3 | # coding: utf-8 4 | import logging 5 | 6 | from sklearn.metrics import auc, classification_report, confusion_matrix, precision_recall_curve, roc_auc_score 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | def check_topk_values_if_churn(k, preds, y): 11 | """Check how many of top k churn predictions actually churned.""" 12 | 13 | pred_arr = preds.cpu() 14 | pred_arr = pred_arr.detach().numpy() 15 | topk = pred_arr.argsort()[-k:][::-1] 16 | count = 0 17 | for ind in topk: 18 | if y[ind] == 1: 19 | count += 1 20 | return count 21 | 22 | 23 | def pr_auc_score(y_true, y_score): 24 | """Get pr_auc score.""" 25 | 26 | precision, recall, _ = precision_recall_curve(y_true, y_score) 27 | pr_auc = auc(recall, precision) 28 | return pr_auc 29 | 30 | 31 | def get_metrics(y_true, y_score, threshold=0.5, digits=3): 32 | """Get classification report, confusion matrix, roc_auc score, and pr_auc score.""" 33 | 34 | y_pred = y_score > threshold 35 | 36 | report = classification_report(y_true, y_pred, digits=digits) 37 | report_dict = convert_classification_report_to_dict(report) 38 | logger.info(report) 39 | 40 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 41 | report_dict.update({'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn}) 42 | logger.info("tp: {}, fp: {}, tn: {}, fn: {}".format(tp, fp, tn, fn)) 43 | 44 | roc_auc = roc_auc_score(y_true, y_score) 45 | pr_auc = pr_auc_score(y_true, y_score) 46 | report_dict.update({'roc_auc_score': roc_auc, 'pr_auc_score': pr_auc}) 47 | logger.info("roc_auc_score = {:.4f}, pr_auc_score = {:.4f}".format(roc_auc, pr_auc)) 48 | 49 | return report_dict 50 | 51 | 52 | def convert_classification_report_to_dict(report): 53 | """Convert classification report to Dict format.""" 54 | 55 | rows = [row.split() for row in report.split('\n') if row] 56 | headers = rows[0] 57 | report_dict = {} 58 | for row in rows[1:]: 59 | if row[1] == 'avg': 60 | label, scores = ' '.join(row[:2]), row[2:] 61 | else: 62 | label, scores 
= row[0], row[1:] 63 | 64 | if label == 'accuracy': 65 | report_dict[label] = float(scores[-2]) 66 | else: 67 | report_dict[label] = dict(zip(headers, [float(score) for score in scores[:-1]] + [int(scores[-1])])) 68 | return report_dict 69 | -------------------------------------------------------------------------------- /caspr/utils/noise.py: -------------------------------------------------------------------------------- 1 | """Noise class for generating noisy data.""" 2 | 3 | import torch 4 | 5 | 6 | class Noise(torch.nn.Module): 7 | """Add different types of noise to the sequential inputs for denoising autoencoder. 8 | 9 | Usage: 10 | noise = Noise(emb_dims, gau_prob, sub_prob, shuffle_dist) 11 | seq_cat_noisy, seq_cont_noisy = noise(seq_cat, seq_cont) 12 | """ 13 | 14 | def __init__(self, emb_dims, gau_prob=0.1, sub_prob=0.1, shuffle_dist=1): 15 | """Initialize Noise objects with probabilities for different noise types. 16 | 17 | Args: 18 | emb_dims (List of tuples (x, y)): Embedding dimensions where x is the vocab size and 19 | y is the embedding size for every categorical variable. 20 | gau_prob (float): Probability of adding gaussian noise to the continuous variables. 21 | sub_prob (float): Probability of substituting a categorical value with another randomly selected one. 22 | shuffle_dist (int): The max distance that each element will be away from its original position 23 | after shuffling. 24 | """ 25 | 26 | super().__init__() 27 | 28 | self.gau_prob = gau_prob 29 | self.sub_prob = sub_prob 30 | self.shuffle_dist = shuffle_dist 31 | self.vocab_sizes = [dim[0] for dim in emb_dims] 32 | 33 | def forward(self, seq_cat_data, seq_cont_data): 34 | """Run a forward pass of the module over the data to add noise.""" 35 | 36 | return self.add_noise(seq_cat_data, seq_cont_data) 37 | 38 | def add_noise(self, seq_cat_data, seq_cont_data): 39 | """Add noise to the sequential data based on the specified probabilities. 40 | 41 | Args: 42 | seq_cat_data (Tensors): Sequential categorical data. 43 | seq_cont_data (Tensors): Sequential continuous data. 
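Returns:
    Tuple of (seq_cat_data, seq_cont_data) with substitution, gaussian and shuffle noise applied according to the configured probabilities.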
44 | """ 45 | 46 | if self.sub_prob > 0: 47 | seq_cat_data = self._word_substitute(seq_cat_data) 48 | 49 | if self.gau_prob > 0: 50 | seq_cont_data = self._word_gaussian(seq_cont_data) 51 | 52 | if self.shuffle_dist > 0: 53 | seq_cat_data, seq_cont_data = self._word_shuffle(seq_cat_data, seq_cont_data) 54 | 55 | return seq_cat_data, seq_cont_data 56 | 57 | def _word_shuffle(self, seq_cat_data, seq_cont_data): 58 | batch_size, seq_len, _ = seq_cat_data.size() 59 | base = torch.arange(seq_len, dtype=torch.float).repeat(batch_size, 1) 60 | inc = (self.shuffle_dist+1) * torch.rand((batch_size, seq_len)) 61 | _, sigma = (base + inc).sort(dim=1) 62 | return (seq_cat_data[torch.arange(batch_size).unsqueeze(1), sigma], 63 | seq_cont_data[torch.arange(batch_size).unsqueeze(1), sigma]) 64 | 65 | def _word_substitute(self, x): 66 | keep = (torch.rand(x.size(), device=x.device) > self.sub_prob) 67 | x_ = x.clone() 68 | for i in range(len(self.vocab_sizes)): 69 | x_[:, :, i].random_(0, self.vocab_sizes[i]) 70 | x_[keep] = x[keep] 71 | return x_ 72 | 73 | def _word_gaussian(self, x): 74 | gaussian = (torch.rand(x.size(), device=x.device) < self.gau_prob) 75 | x_ = x.clone() 76 | x_ += torch.randn(x.size(), device=x.device) * gaussian 77 | return x_ 78 | -------------------------------------------------------------------------------- /caspr/utils/onnx.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import torch 3 | from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_available_providers 4 | from torch.onnx import register_custom_op_symbolic 5 | 6 | from caspr.models.factory import LSTM, TRANSFORMER 7 | from caspr.utils.preprocess import get_nonempty_tensors 8 | from caspr.utils.score import get_architecture 9 | 10 | OPSET_VERSION = 12 11 | SEQ_CAT_INDEX = 0 12 | SEQ_CONT_INDEX = 1 13 | NON_SEQ_CAT_INDEX = 2 14 | NON_SEQ_CONT_INDEX = 3 15 | 16 | _onnx_opset_version = 1 17 | 18 | def register_custom_op(): 19 | """ 20 | This function registers symbolic functions for 21 | custom ops that are implemented as part of ONNX Runtime 22 | """ 23 | 24 | # Symbolic definition 25 | def inverse(g, self): 26 | return g.op("com.microsoft::Inverse", self) 27 | 28 | def gelu(g, self): 29 | return g.op("com.microsoft::Gelu", self) 30 | 31 | def triu(g, self, diagonal): 32 | return g.op("com.microsoft::Trilu", self, diagonal, upper_i=1) 33 | 34 | def tril(g, self, diagonal): 35 | return g.op("com.microsoft::Trilu", self, diagonal, upper_i=0) 36 | 37 | # Op Registration 38 | register_custom_op_symbolic('::inverse', inverse, _onnx_opset_version) 39 | register_custom_op_symbolic('::gelu', gelu, _onnx_opset_version) 40 | register_custom_op_symbolic('::triu', triu, _onnx_opset_version) 41 | register_custom_op_symbolic('::tril', tril, _onnx_opset_version) 42 | 43 | 44 | def unregister_custom_op(): 45 | """ 46 | This function unregisters symbolic functions for 47 | custom ops that are implemented as part of ONNX Runtime 48 | """ 49 | 50 | import torch.onnx.symbolic_registry as sym_registry 51 | 52 | # TODO: replace this once PyTorch supports unregister natively. 
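# The helper below walks the stable ONNX opsets at or above the requested
# version and removes the symbolic function from torch's internal registry,
# undoing what register_custom_op_symbolic registered above.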
53 | def unregister(name, opset_version): 54 | ns, kind = name.split("::") 55 | from torch.onnx.symbolic_helper import _onnx_stable_opsets 56 | 57 | for version in _onnx_stable_opsets: 58 | if version >= opset_version and sym_registry.is_registered_op(kind, ns, version): 59 | del sym_registry._registry[(ns, version)][kind] 60 | 61 | unregister('::inverse', _onnx_opset_version) 62 | unregister('::gelu', _onnx_opset_version) 63 | unregister('::triu', _onnx_opset_version) 64 | unregister('::tril', _onnx_opset_version) 65 | 66 | 67 | def get_input_names(nonempty_idx): 68 | mapping = {SEQ_CAT_INDEX: 'seq_cat', SEQ_CONT_INDEX: 'seq_cont', 69 | NON_SEQ_CAT_INDEX: 'non_seq_cat', NON_SEQ_CONT_INDEX: 'non_seq_cont'} 70 | input_names = [mapping[idx] for idx in nonempty_idx if idx in mapping] + ['nonempty_idx'] 71 | return input_names 72 | 73 | 74 | def get_dummy_inputs(model): 75 | if get_architecture(model) == TRANSFORMER: 76 | seq_cat_dim = len(model.unified_encoder.emb_seq.emb_layers) 77 | seq_cont_dim = model.unified_encoder.seq_cont_dim 78 | non_seq_cat_dim = len(model.unified_encoder.emb_non_seq.emb_layers) 79 | non_seq_cont_dim = model.unified_encoder.non_seq_cont_dim 80 | adjust_seq_len = model.unified_encoder.transformer_encoder.pos_embedding.num_embeddings 81 | seq_len = adjust_seq_len - int((non_seq_cat_dim + non_seq_cont_dim) > 0) 82 | elif get_architecture(model) == LSTM: 83 | seq_cat_dim = model.unified_encoder.seq_cat_count 84 | seq_cont_dim = model.unified_encoder.seq_cont_count 85 | non_seq_cat_dim = model.unified_encoder.non_seq_cat_count 86 | non_seq_cont_dim = model.unified_encoder.non_seq_cont_count 87 | seq_len = model.unified_encoder.seq_len 88 | 89 | device = next(model.parameters()).device 90 | seq_cat_dummy = torch.zeros((1, seq_len, seq_cat_dim), dtype=torch.long, device=device) 91 | seq_cont_dummy = torch.zeros((1, seq_len, seq_cont_dim), dtype=torch.float32, device=device) 92 | non_seq_cat_dummy = torch.zeros((1, non_seq_cat_dim), dtype=torch.long, device=device) 93 | non_seq_cont_dummy = torch.zeros((1, non_seq_cont_dim), dtype=torch.float32, device=device) 94 | 95 | dummy = (seq_cat_dummy, seq_cont_dummy, non_seq_cat_dummy, non_seq_cont_dummy) 96 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(dummy) 97 | dummy_inputs = (*nonempty_tensors, torch.tensor(nonempty_idx)) 98 | 99 | input_names = get_input_names(nonempty_idx) 100 | 101 | return dummy_inputs, input_names 102 | 103 | 104 | def export_onnx(model, model_path): 105 | model.eval() 106 | 107 | dummy_inputs, input_names = get_dummy_inputs(model) 108 | 109 | with torch.no_grad(): 110 | dummy_outputs = model.unified_encoder(*dummy_inputs) 111 | output_names = [f"output_{i}" for i in range(len(dummy_outputs))] 112 | 113 | dynamic_axes = dict.fromkeys(input_names + output_names, {0: 'batch_size'}) 114 | torch.onnx.export(model=model.unified_encoder, 115 | args=dummy_inputs, 116 | f=model_path, 117 | input_names=input_names, 118 | output_names=output_names, 119 | dynamic_axes=dynamic_axes, 120 | opset_version=OPSET_VERSION, 121 | custom_opsets={'com.microsoft': 1}, 122 | do_constant_folding=True) 123 | 124 | 125 | def to_numpy(tensor): 126 | if tensor.requires_grad: 127 | return tensor.detach().cpu().numpy() 128 | else: 129 | return tensor.cpu().numpy() 130 | 131 | 132 | class ONNXWrapper: 133 | 134 | def __init__(self, model_path_or_proto, model_type=TRANSFORMER): 135 | if isinstance(model_path_or_proto, str): 136 | with open(model_path_or_proto, 'rb') as model_file: 137 | self.model_bytes = model_file.read() 138 
| else: 139 | self.model_bytes = onnx._serialize(model_path_or_proto) 140 | self.session = self.load() 141 | self.model_type = model_type 142 | 143 | def __getstate__(self): 144 | state = self.__dict__.copy() 145 | del state['session'] 146 | return state 147 | 148 | def __setstate__(self, state): 149 | self.__dict__.update(state) 150 | self.session = self.load() 151 | 152 | def unified_encoder(self, *args): 153 | nonempty_tensors = args[:-1] 154 | inputs = list(map(to_numpy, nonempty_tensors)) 155 | ort_inputs = dict((self.session.get_inputs()[i].name, inp) for i, inp in enumerate(inputs)) 156 | return (torch.from_numpy(out) for out in self.session.run(None, ort_inputs)) 157 | 158 | def load(self, device=torch.device('cpu'), enable_all_optimization=True): 159 | sess_options = SessionOptions() 160 | sess_options.graph_optimization_level = ( 161 | GraphOptimizationLevel.ORT_ENABLE_ALL 162 | if enable_all_optimization 163 | else GraphOptimizationLevel.ORT_ENABLE_BASIC 164 | ) 165 | 166 | use_gpu = 'cuda' in device.type and 'CUDAExecutionProvider' in get_available_providers() 167 | execution_providers = ( 168 | ["CPUExecutionProvider"] if not use_gpu else ["CUDAExecutionProvider", "CPUExecutionProvider"] 169 | ) 170 | 171 | session = InferenceSession(self.model_bytes, sess_options, providers=execution_providers) 172 | return session 173 | 174 | def to(self, device): 175 | self.session = self.load(device) 176 | 177 | def cpu(self): 178 | self.to(torch.device('cpu')) 179 | 180 | def eval(self): 181 | pass 182 | -------------------------------------------------------------------------------- /caspr/utils/score.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from caspr.data.common_dataset import id_collate 7 | from caspr.models.factory import LSTM, TRANSFORMER 8 | from caspr.utils.preprocess import get_nonempty_tensors 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def run_autoencoder_score(autoenc, dataloader_test, device): 13 | 14 | embeddings = [] 15 | tgt_ids = [] 16 | 17 | for tgt_id, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_test: 18 | 19 | data = [seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data] 20 | if isinstance(autoenc, torch.nn.Module): 21 | data = [d.to(device) for d in data] 22 | 23 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 24 | 25 | tgt_ids.append(tgt_id) 26 | 27 | if get_architecture(autoenc) == TRANSFORMER: 28 | emb, _, _ = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 29 | # Concatenate across timesteps 30 | emb = emb.reshape(emb.shape[0], -1) 31 | embeddings.append(emb.detach().cpu() if isinstance(emb, torch.Tensor) else emb) 32 | 33 | elif get_architecture(autoenc) == LSTM: 34 | _, (hn, _) = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 35 | embeddings.append(hn.detach().cpu() if isinstance(hn, torch.Tensor) else hn) 36 | 37 | tgt_ids = np.concatenate(tgt_ids, axis=0) 38 | embeddings = np.concatenate(embeddings, axis=0) 39 | embeddings_with_id = np.hstack((tgt_ids, embeddings)) 40 | 41 | return embeddings_with_id 42 | 43 | def score(dataset_test, autoenc, device, batch_size=1024): 44 | autoenc.eval() 45 | test_loader = torch.utils.data.DataLoader(dataset_test, batch_size=batch_size, collate_fn=id_collate) 46 | 47 | logger.info("Performing inference on given dataset") 48 | embeddings = run_autoencoder_score(autoenc, test_loader, device) 49 | return embeddings 50 | 51 
| def get_architecture(model): 52 | return model.__class__.__name__ if isinstance(model, torch.nn.Module) else model.model_type 53 | -------------------------------------------------------------------------------- /caspr/utils/segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # -------------------------------------------------------------------------- 4 | 5 | """The segmentation module for the CASPR library. 6 | 7 | Modules: 8 | :module1_name: A description of this specific module. 9 | """ 10 | -------------------------------------------------------------------------------- /caspr/utils/segmentation/dec_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from scipy.optimize import linear_sum_assignment 4 | from sklearn.cluster import KMeans 5 | 6 | from caspr.utils.preprocess import get_nonempty_tensors 7 | 8 | 9 | def cluster_initialize(model, dataloader, device): 10 | """Initialize cluster. 11 | 12 | Args: 13 | model (nn.Module): # noqa: W0223 Pretrained encoder-decoder model 14 | dataloader (DataLoader): Data loader that provides an iterable over the given dataset 15 | device ('cpu' or 'cuda'): Describes the machine on which the code is running 16 | """ 17 | kmeans = KMeans(model.cluster_number, n_init=20) 18 | model.train() 19 | encoder_embs = [] 20 | labels = [] 21 | # form initial cluster centres 22 | for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader: 23 | seq_cat_x = seq_cat_x.to(device) 24 | seq_cont_x = seq_cont_x.to(device) 25 | non_seq_cat_x = non_seq_cat_x.to(device) 26 | non_seq_cont_x = non_seq_cont_x.to(device) 27 | 28 | data = (seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x) 29 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 30 | encoder_embs.append(model.enc(*nonempty_tensors, nonempty_idx).detach().cpu()) 31 | 32 | labels.append(y) 33 | 34 | labels = torch.cat(labels).long() 35 | 36 | predicted = kmeans.fit_predict(torch.cat(encoder_embs).numpy()) 37 | predicted_tensor = torch.tensor(np.copy(predicted), dtype=torch.long) 38 | _, accuracy = cluster_accuracy(predicted, labels.cpu().numpy()) 39 | print('Initial Cluster Acc: ', accuracy) 40 | cluster_centers = torch.tensor(kmeans.cluster_centers_, dtype=torch.float, requires_grad=True).to(device) 41 | with torch.no_grad(): 42 | # initialise the cluster centers 43 | model.state_dict()['assignment.cluster_centers'].copy_(cluster_centers) 44 | return predicted_tensor 45 | 46 | 47 | def cluster_accuracy(y_true, y_predicted, cluster_number=None): 48 | """Calculate clustering accuracy after using the linear_sum_assignment function in SciPy to determine reassignments. 
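The Hungarian algorithm chooses the cluster-to-label mapping that maximizes agreements between predictions and labels. Illustrative example: y_true = [1, 1, 0] with y_predicted = [0, 0, 1] becomes a perfect match after relabeling (cluster 0 -> label 1, cluster 1 -> label 0), so the accuracy is 1.0.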
49 | 50 | Args: 51 | y_true (List of int): list of true cluster numbers, an integer array 0-indexed 52 | y_predicted (List of int): list of predicted cluster numbers, an integer array 0-indexed 53 | cluster_number (int): number of clusters, if None then calculated from input 54 | Return: 55 | reassignment dictionary, clustering accuracy 56 | """ 57 | if cluster_number is None: 58 | cluster_number = max(y_predicted.max(), y_true.max()) + 1 # assume labels are 0-indexed 59 | count_matrix = np.zeros((cluster_number, cluster_number), dtype=np.int64) 60 | for i in range(y_predicted.size): 61 | count_matrix[y_predicted[i], y_true[i]] += 1 62 | 63 | row_ind, col_ind = linear_sum_assignment(count_matrix.max() - count_matrix) 64 | reassignment = dict(zip(row_ind, col_ind)) 65 | accuracy = count_matrix[row_ind, col_ind].sum() / y_predicted.size 66 | return reassignment, accuracy 67 | 68 | 69 | def cluster_predict(model, dataloader, device): 70 | """Predict the cluster centers for the given input data. 71 | 72 | Args: 73 | model (nn.Module): # noqa: W0223 Pretrained encoder-decoder model 74 | dataloader (DataLoader): Data loader that provides an iterable over the given dataset 75 | device ('cpu' or 'cuda'): Describes the machine on which the code is running 76 | """ 77 | features = [] 78 | labels = [] 79 | for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader: 80 | seq_cat_x = seq_cat_x.to(device) 81 | seq_cont_x = seq_cont_x.to(device) 82 | non_seq_cat_x = non_seq_cat_x.to(device) 83 | non_seq_cont_x = non_seq_cont_x.to(device) 84 | 85 | data = (seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x) 86 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 87 | features.append(model(*nonempty_tensors, nonempty_idx).detach().cpu()) 88 | 89 | labels.append(y) 90 | 91 | return torch.cat(features).max(1)[1], torch.cat(labels).long() 92 | -------------------------------------------------------------------------------- /caspr/utils/segmentation/pandas.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.cluster import KMeans 7 | from sklearn.decomposition import PCA 8 | from sklearn.manifold import TSNE 9 | from sklearn.metrics import silhouette_score 10 | from sklearn.preprocessing import MinMaxScaler, StandardScaler 11 | from tqdm import tqdm 12 | 13 | 14 | def check_sparsity(data): 15 | """Check sparsity in data.""" 16 | for c in data.columns: 17 | try: 18 | temp = pd.qcut(data[c], q=10, labels=False, duplicates='drop').value_counts()/data.shape[0] 19 | 20 | # top quantile%, unique value% 21 | print(c, np.round(temp.values[0], 2), np.round(len(data[c].unique())/data.shape[0], 2)) 22 | except Exception: 23 | print(c, np.nan, np.round(len(data[c].unique()), 2)) 24 | 25 | 26 | def quantile(df, q=5, col_features=None): 27 | """Score customers from 0 to 5 based on Engagement metrics. 
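Each feature in col_features is cut into q+1 quantile bins; features whose name contains 'R_' (recency-style metrics, where lower is better) are scored in reverse. The per-feature scores are averaged into AvgScore, which is then mapped to integer segments 1..q. Illustrative usage, assuming an RFM-style frame with hypothetical columns 'R_days', 'F_orders' and 'M_revenue':

    df = quantile(df, q=5, col_features=['R_days', 'F_orders', 'M_revenue'])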
28 | 29 | Input: dataframe containing the columns listed in col_features. 30 | Output: the same dataframe with per-feature quantile scores, AvgScore and Segment columns added. 31 | """ 32 | 33 | # create quantile scores [0, q] with q interval 34 | for c in col_features: 35 | if 'R_' in c: 36 | df[c+'_q'] = pd.qcut(df[c], q=q+1, labels=range(q, -1, -1), duplicates='drop') 37 | else: 38 | df[c+'_q'] = pd.qcut(df[c], q=q+1, labels=range(0, q+1), duplicates='drop') 39 | 40 | df['AvgScore'] = df[[c + '_q' for c in col_features]].mean(axis=1) 41 | df['AvgScore'].hist(bins=q) 42 | plt.title('AvgScore') 43 | plt.show() 44 | 45 | # generate segments 46 | df['Segment'] = np.nan 47 | for i in range(1, q+1): 48 | df.loc[(df.AvgScore <= i) & (df.AvgScore > (i-1)), 'Segment'] = i 49 | 50 | df['Segment'].hist(bins=q) 51 | plt.title('Segment') 52 | plt.show() 53 | 54 | return df 55 | 56 | 57 | def clustering(df, col_features=None, cluster_range=range(2, 10), scaling_option="minmax", 58 | pca=True, pca_param={'threshold': 0.8, 'show_plot': False}, 59 | default_cluster_size=None, default_cluster_threshold=0.1, 60 | tsne_plt=True, tsne_sample=1000, removed_outlier=False): 61 | """Perform Clustering. 62 | 63 | Options to do transformation and PCA before performing clustering 64 | - featurization 65 | - find # of clusters 66 | - fit final model 67 | """ 68 | inertias = [] 69 | sil_scores = [] 70 | 71 | # featurization 72 | df = apply_scaling(df, col_features, scaling_option, removed_outlier) 73 | 74 | if pca: 75 | df_features, n_pca, pca = apply_pca(df, col_features=col_features, pca_param=pca_param) # noqa W0612 76 | else: 77 | df_features = df[col_features].values 78 | 79 | # find # of clusters 80 | for k in tqdm(cluster_range): 81 | # kc = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=1) 82 | kc = KMeans(n_clusters=k, random_state=1, n_jobs=4) 83 | 84 | kc.fit(df_features) 85 | sil_scores.append(silhouette_score(df_features, kc.labels_)) 86 | inertias.append(kc.inertia_) 87 | 88 | results = pd.DataFrame(np.array([cluster_range, inertias, sil_scores]).T) 89 | results.columns = ['cluster_size', 'inertias', 'sil_scores'] 90 | n_final = cluster_range[np.where(sil_scores == np.max(sil_scores))[0][0]] 91 | print('optimal cluster size:', n_final, np.round(np.max(sil_scores), 2)) 92 | 93 | # fit final model 94 | if default_cluster_size is not None: 95 | # kc = KMeans(n_clusters=default_cluster_size, random_state=1) 96 | sil_score_default = results.loc[results.index == default_cluster_size, 'sil_scores'].values[0] 97 | 98 | if (np.max(sil_scores)/sil_score_default - 1) <= default_cluster_threshold: 99 | print('default is a good cluster size', sil_score_default, np.max(sil_scores)) 100 | kc = KMeans(n_clusters=default_cluster_size, random_state=1) 101 | else: 102 | print('optimal is a better cluster size', sil_score_default, np.max(sil_scores)) 103 | kc = KMeans(n_clusters=n_final, random_state=1) 104 | else: 105 | kc = KMeans(n_clusters=n_final, random_state=1) 106 | 107 | kc.fit(df_features) 108 | 109 | df['label'] = kc.labels_ 110 | 111 | # score visualization 112 | if len(cluster_range) > 1: 113 | _, axes = plt.subplots(1, 2, figsize=(10, 5)) 114 | results.plot(ax=axes[0], x='cluster_size', y='inertias') 115 | results.plot.bar(ax=axes[1], x='cluster_size', y='sil_scores') 116 | plt.show() 117 | 118 | # clustering size distribution 119 | print(df.label.value_counts().to_frame()/df.shape[0]) 120 | 121 | # tsne visualization 122 | if tsne_plt: 123 | if (tsne_sample > 0) & (tsne_sample < len(kc.labels_)): 124 | df_tsne = pd.DataFrame(df_features) 125 | df_tsne['label'] = kc.labels_ 126 | 127 |
plt_tsne(x=df_tsne.drop(columns=['label']).sample(n=tsne_sample, random_state=1).values, 128 | label=df_tsne.sample(n=tsne_sample, random_state=1).label.values) 129 | else: 130 | plt_tsne(x=df_features, label=kc.labels_) 131 | 132 | return results, df, kc 133 | 134 | 135 | def apply_scaling(df, col_features=None, scaling_option=None, removed_outlier=False): 136 | """Apply Scaling to dataframe.""" 137 | 138 | if scaling_option == 'minmax': 139 | scaler = MinMaxScaler() 140 | df[col_features] = scaler.fit_transform(df[col_features]) 141 | elif scaling_option == 'qcut': 142 | for c in col_features: 143 | df[c] = pd.qcut(df[c], q=100, labels=False, duplicates='drop') 144 | else: 145 | pass 146 | 147 | if removed_outlier: 148 | n_std = 3 149 | for c in col_features: 150 | if df[c].dtype != 'object': 151 | tic_cnt = df.shape[0] 152 | temp_mean = df[c].mean() 153 | temp_std = df[c].std() 154 | df = df[(df[c] <= (temp_mean + n_std*temp_std)) & (df[c] >= (temp_mean - n_std*temp_std))].copy() 155 | print('remove outlier', c, ':', df.shape[0] - tic_cnt) 156 | 157 | df.reset_index(drop=True, inplace=True) 158 | 159 | return df 160 | 161 | 162 | def apply_pca(df, col_features=None, pca_param={'threshold': 0.8, 'show_plot': False}): 163 | """Apply PCA transformation and return # of eigen-vectors based on threshold (20/80 rules).""" 164 | 165 | # normalize the input matrix 166 | matrix = df[col_features].values 167 | scaler = StandardScaler() 168 | scaler.fit(matrix) 169 | scaled_matrix = scaler.transform(matrix) 170 | 171 | # perform PCA 172 | pca = PCA() 173 | pca.fit(scaled_matrix) 174 | pca_samples = pca.transform(scaled_matrix) 175 | 176 | # # visualize explained variance 177 | # if pca_param['show_plot']: 178 | # fig, ax = plt.subplots(figsize=(10, 5)) 179 | # sns.set(font_scale=1) 180 | # plt.step(range(matrix.shape[1]), pca.explained_variance_ratio_.cumsum(), where='mid', 181 | # label='cumulative explained variance') 182 | # sns.barplot(np.arange(1,matrix.shape[1]+1), pca.explained_variance_ratio_, alpha=0.5, color = 'g', 183 | # label='individual explained variance') 184 | # plt.xlim(0, len(col_features)) 185 | # ax.set_xticklabels([s if int(s.get_text())%2 == 0 else '' for s in ax.get_xticklabels()]) 186 | # plt.ylabel('Explained variance', fontsize = 14) 187 | # plt.xlabel('Principal components', fontsize = 14) 188 | # plt.legend(loc='best', fontsize = 13) 189 | 190 | # define n_pca based on the threshold 191 | n_pca = np.where(pca.explained_variance_ratio_.cumsum() > pca_param['threshold'])[0][0] + 1 192 | print('# of pca components:', n_pca, '/', scaled_matrix.shape[1]) 193 | print('# of variance explained:', pca.explained_variance_ratio_.cumsum()[n_pca-1]) 194 | 195 | # see loadings of the main components 196 | df_pca_components = pd.DataFrame(pca.components_, columns=col_features) 197 | plt_bar(df_pca_components.head(n_pca).copy(), ncols=3, figsize=(10, 10), title='PCA ') 198 | 199 | # extract the main transformed features 200 | df_pca = pca_samples[:, 0:n_pca] 201 | 202 | return df_pca, n_pca, pca 203 | 204 | 205 | def profiling(df, label, col_features, col_dropped=[]): 206 | """Profile Dataframe using heatmap. 
207 | 208 | Heat-map around KPIs: absolute & relative 209 | - Useful technique to identify relative importance of each segment's attribute 210 | - Calculate average values of each cluster 211 | - Calculate average values of population 212 | - Calculate importance score by dividing them and subtracting 1 213 | (ensures 0 is returned when cluster average equals population average; e.g. a segment averaging 6 against a population average of 4 scores 6/4 - 1 = 0.5) 214 | 215 | col_dropped: automatic or apply min-max scaling to features [TO-DO] 216 | """ 217 | df['Segment'] = label 218 | 219 | # classifying cat vs. cont features 220 | cat_features = [] 221 | cont_features = [] 222 | for x in col_features: 223 | if df[x].dtypes == 'object': 224 | cat_features.append(x) 225 | else: 226 | cont_features.append(x) 227 | 228 | # customer counts 229 | df_count = df.groupby('Segment')[cont_features[0]].count() 230 | df_count.loc['All'] = df_count.sum() 231 | df_count = df_count.to_frame() 232 | df_count.columns = ['Customers'] 233 | df_count['Customers%'] = df_count.Customers/df_count.Customers.values[-1]*100 234 | 235 | # numerical features 236 | df_cont = df.groupby('Segment')[cont_features].mean() 237 | df_cont.loc['All'] = df_cont.mean() 238 | 239 | # categorical features 240 | df_cat = pd.DataFrame() 241 | for c in cat_features: 242 | df_pivot = df.pivot_table(index='Segment', 243 | columns=c, values=cont_features[0], aggfunc='count') 244 | df_pivot.loc['All'] = df_pivot.sum() 245 | df_pivot[df_pivot.columns] = df_pivot.values / df_pivot.sum(axis=1).values.reshape(-1, 1)*100 246 | df_cat = pd.concat([df_cat, df_pivot], axis=1) 247 | 248 | # combine results and calculate relative importance 249 | result_profile = pd.concat([df_count, df_cont, df_cat], axis=1) 250 | temp_all = result_profile.loc['All'] 251 | result_profile.drop('All', inplace=True) 252 | result_profile.sort_index(ascending=False, inplace=True) 253 | result_profile.loc['All'] = temp_all 254 | 255 | relative_imp = result_profile/result_profile.loc['All'] - 1 256 | relative_imp.drop('All', inplace=True) 257 | 258 | # visualization - heatmap 259 | temp = relative_imp.drop(columns=['Customers', 'Customers%'] + col_dropped).copy() 260 | plt_heatmap(temp, x_labels=temp.columns, y_labels=temp.index) 261 | 262 | # visualization - barchart by clusters 263 | plt_bar(temp) 264 | 265 | return relative_imp, result_profile 266 | 267 | 268 | def plt_tsne(x, label): 269 | """Visualize TSNE.""" 270 | tic = time.time() 271 | x_embedded = TSNE(n_components=2).fit_transform(x) 272 | print('tsne takes time: ', time.time() - tic) 273 | 274 | vis_x = x_embedded[:, 0] 275 | vis_y = x_embedded[:, 1] 276 | 277 | fig = plt.figure(figsize=(12, 8)) # noqa W0612 278 | plt.scatter(vis_x, vis_y, c=label, cmap=plt.cm.get_cmap("jet", 256)) 279 | plt.colorbar(ticks=range(256)) 280 | plt.clim(-0.5, 9.5) 281 | plt.show() 282 | 283 | 284 | def plt_heatmap(data, x_labels, y_labels): 285 | """Plot Heatmap.""" 286 | fig = plt.figure(figsize=(10, 5)) 287 | ax = fig.add_axes([1, 1, 1.1, 1.1]) 288 | 289 | plt.imshow(data, cmap='Blues', interpolation='nearest') 290 | ax.set_yticks(range(len(y_labels))) 291 | ax.set_yticklabels(y_labels) 292 | ax.set_xticks(range(len(x_labels))) 293 | ax.set_xticklabels(x_labels, rotation=60) 294 | plt.colorbar() 295 | plt.show() 296 | 297 | 298 | def plt_bar(data, ncols=3, figsize=(10, 10), title='Segment'): 299 | """Plot bars.""" 300 | 301 | data.dropna(axis=1, inplace=True) 302 | nrows = int(np.ceil(data.shape[0]/ncols)) 303 | xlim_min = data.min().min() 304 | xlim_max = data.max().max() 305 | 306 | fig =
plt.figure(figsize=figsize) 307 | for i in range(data.shape[0]): 308 | temp = data.iloc[i] 309 | ax = plt.subplot(nrows, ncols, i+1) 310 | ax.barh(range(len(temp)), temp.values, align='center') 311 | if title is not None: 312 | ax.set_title(title + str(data.index[i])) 313 | 314 | plt.xticks(rotation=45) 315 | plt.xlim(xlim_min, xlim_max) 316 | 317 | if i % ncols == 0: 318 | ax.set_yticks(range(len(temp))) 319 | ax.set_yticklabels(temp.index) 320 | 321 | fig.tight_layout() 322 | plt.show() 323 | 324 | 325 | def generate_segmentation_graphs(combined_df, profile_features, 326 | emb_features, use_profile=False, use_embedding=False): 327 | """Generate segmentation graphs. 328 | 329 | combined_df - the dataframe containing embeddings and profile features 330 | with feature names as the column names 331 | profile_features - the names of all the profile features in the data 332 | 333 | emb_features - name of the embedding features, by default they should be 'dim_0', 'dim_1'... 334 | 335 | use_profile - boolean flag to determine if we use the profile features or not 336 | 337 | use_embedding - boolean flag to determine if we use the embedding values 338 | """ 339 | 340 | # importlib.reload(segmentation_utils) 341 | 342 | df_emb = combined_df 343 | # Need to remove the dimensions of the embedding which have only one value if we use scaling 344 | col_one = [] 345 | for col in emb_features: 346 | if df_emb[col].nunique() == 1: 347 | col_one.append(col) 348 | df_emb = df_emb.drop(columns=col_one, axis=1) 349 | 350 | emb_featuresN = [] # noqa: C0103 351 | for item in emb_features: 352 | if item not in col_one: 353 | emb_featuresN.append(item) 354 | 355 | emb_features = emb_featuresN 356 | 357 | plt_heatmap(df_emb[emb_features].corr(), emb_features, emb_features) 358 | df_emb[emb_features].describe() 359 | 360 | features_to_use = [] 361 | if use_profile: 362 | features_to_use = profile_features 363 | if use_embedding: 364 | features_to_use = emb_features 365 | 366 | if use_embedding and use_profile: 367 | features_to_use = profile_features + emb_features 368 | 369 | n = 5000 370 | data_c = df_emb.sample(n=n, random_state=1).copy() 371 | 372 | results, df, kc = clustering(df=data_c.copy(), 373 | col_features=features_to_use, cluster_range=range(2, 9), scaling_option='qcut', 374 | pca=True, pca_param={'threshold': 0.8, 'show_plot': False}, 375 | default_cluster_size=None, default_cluster_threshold=0.1, 376 | tsne_plt=True, tsne_sample=1000, removed_outlier=False) 377 | 378 | col_features = emb_features + profile_features 379 | relative_imp, result_profile = profiling(data_c.copy(), kc.labels_, col_features, col_dropped=[]) 380 | 381 | 382 | def generate_combined_df(embedding_data=None, profile_data: pd.DataFrame = None): 383 | """Generate Combined DF.
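Embedding columns are named 'dim_0'..'dim_{M-1}' and concatenated column-wise with the profile features; either argument may be None, in which case the other is returned on its own. Illustrative usage (profile_df stands for any dataframe with one row per embedding row):

    emb = np.random.rand(100, 8)  # 100 customers, 8-dim embeddings
    combined = generate_combined_df(emb, profile_df)  # adds dim_0..dim_7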
384 | 385 | embedding data - The numpy array containing the embeddings in the NxM format where 386 | N = number of data entries, M = embedding dimension 387 | 388 | profile data - The dataframe containing all the profile features, with column names 389 | equal to the feature names 390 | 391 | """ 392 | if embedding_data is None: 393 | profile_data.reset_index(drop=True, inplace=True) 394 | return profile_data 395 | 396 | emb_dim = embedding_data.shape[1] 397 | column_list = [] 398 | for i in range(emb_dim): 399 | column_list.append('dim_' + str(i)) 400 | emb_df = pd.DataFrame(embedding_data, columns=column_list) 401 | emb_df.reset_index(drop=True, inplace=True) 402 | 403 | if profile_data is None: 404 | return emb_df 405 | 406 | profile_data.reset_index(drop=True, inplace=True) 407 | final_df = pd.concat([profile_data, emb_df], axis=1) 408 | return final_df 409 | -------------------------------------------------------------------------------- /caspr/utils/spark/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/spark/__init__.py -------------------------------------------------------------------------------- /caspr/utils/spark/large/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/caspr/utils/spark/large/__init__.py -------------------------------------------------------------------------------- /caspr/utils/spark/large/score.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import torch 6 | import torch.nn as nn 7 | from petastorm.pytorch import BatchedDataLoader 8 | 9 | from caspr.data.load import transform_and_load 10 | from caspr.models.model_wrapper import LSTMAutoencoder, TransformerAutoEncoder 11 | from caspr.utils.preprocess import get_nonempty_tensors 12 | from caspr.utils.spark.preprocess import remove_underscore_in_seq_col_name_list 13 | 14 | PS_HDFS_DRIVER = 'libhdfs3' 15 | # lower overhead, alternative is 'process' 16 | PS_WORKER_TYPE = 'thread' 17 | # assuming the training relies on SSD backed dbfs:/ml, Petastorm's caching can be disabled 18 | PS_CACHE_TYPE = None 19 | 20 | def get_default_parallelism(): 21 | try: 22 | return sc.defaultParallelism 23 | except NameError as _: 24 | # Spark Context not initialized (sc) 25 | return os.cpu_count() 26 | 27 | 28 | def run_autoencoder_score_peta(autoenc, steps_per_epoch, train_dataloader_iter, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 29 | 30 | embeddings = [] 31 | tgt_ids = [] 32 | 33 | for _ in range(steps_per_epoch): 34 | pd_batch = next(train_dataloader_iter) 35 | tgt_id, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load( 36 | pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps) 37 | 38 | data = (seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data) 39 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 40 | 41 | tgt_ids.append(tgt_id) 42 | 43 | if isinstance(autoenc, TransformerAutoEncoder): 44 | emb,
_, _ = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 45 | # Concatenate across timesteps 46 | emb = emb.view(emb.shape[0], -1) 47 | embeddings.append(emb.detach().cpu()) 48 | 49 | elif isinstance(autoenc, LSTMAutoencoder): 50 | _, (hn, _) = autoenc.unified_encoder(*nonempty_tensors, nonempty_idx) 51 | embeddings.append(hn.detach().cpu()) 52 | 53 | tgt_ids = pd.DataFrame(np.concatenate(tgt_ids, axis=0)) 54 | tgt_ids.columns = tgt_id_col 55 | embeddings = pd.DataFrame(np.concatenate(embeddings, axis=0)) 56 | # embeddings_with_id = np.hstack((tgt_ids, embeddings)) 57 | embeddings_with_id = pd.concat([tgt_ids, embeddings], axis=1) 58 | return embeddings_with_id 59 | 60 | 61 | def score_peta(converter_test, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024): 62 | autoenc.eval() 63 | if torch.cuda.is_available(): 64 | device = torch.cuda.current_device() 65 | else: 66 | device = torch.device("cpu") 67 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 68 | 69 | with converter_test.make_torch_dataloader(batch_size=batch_size, data_loader_fn=BatchedDataLoader, 70 | num_epochs=None, cache_type=PS_CACHE_TYPE, 71 | workers_count=get_default_parallelism(), 72 | reader_pool_type=PS_WORKER_TYPE, 73 | hdfs_driver=PS_HDFS_DRIVER) as test_dataloader: 74 | test_dataloader_iter = iter(test_dataloader) 75 | steps_per_epoch = max(1, len(converter_test) // (batch_size)) 76 | embeddings = run_autoencoder_score_peta(autoenc, steps_per_epoch, test_dataloader_iter, 77 | device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) # noqa: E1121 78 | return embeddings 79 | -------------------------------------------------------------------------------- /caspr/utils/spark/large/train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import horovod.torch as hvd 5 | import torch 6 | import torch.nn as nn 7 | from petastorm.pytorch import BatchedDataLoader 8 | from torch import optim 9 | from torch.utils.data.distributed import DistributedSampler 10 | 11 | from caspr.data.load import transform_and_load 12 | from caspr.utils.early_stopping import DistributedEarlyStopping 13 | from caspr.utils.horovod.train import determine_early_stop 14 | from caspr.utils.spark.large.score import get_default_parallelism 15 | from caspr.utils.spark.preprocess import remove_underscore_in_seq_col_name_list 16 | from caspr.utils.train import init_lr_schedulers 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def save_checkpoint(model, optimizer, epoch, name): 22 | filepath = '/checkpoint-{epoch}-{model}.pth'.format(epoch=epoch, model=name) 23 | state = { 24 | 'model': model.state_dict(), 25 | 'optimizer': optimizer.state_dict(), 26 | } 27 | torch.save(state, filepath) 28 | 29 | 30 | def metric_average(metric, name): 31 | avg_tensor = hvd.allreduce(metric, name=name) 32 | return avg_tensor.item() 33 | 34 | 35 | BATCH_SIZE = 1024 * 32 36 | NUM_EPOCHS = 100 37 | NUM_WORKERS = 4 # assume cluster consists of two workers 2x K80 each 38 | # default loader parallism is low or None, this widens the IO bottleneck when feeding each GPU 39 | PS_WORKERS_PER_CPU = 2 40 | # this version is implemented in C, vs Java (slower) default 41 | PS_HDFS_DRIVER = 'libhdfs3' 42 | # lower overhead, alternative is 'process' 43 | PS_WORKER_TYPE = 'thread' 44 | # assuming the training relies on SSD backed dbfs:/ml, Petastorm's caching can be disabled 45 | PS_CACHE_TYPE = None 46 | 
EARLY_STOPPING_PATIENCE = 8 47 | EARLY_STOPPING_DELTA = 1e-5 48 | ROOT_RANK = 0 49 | 50 | 51 | def run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, criterion, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 52 | count = 0 53 | val_start_time = time.time() 54 | running_loss = 0.0 55 | for _ in range(steps_per_epoch): 56 | pd_batch = next(train_dataloader_iter) 57 | _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load( 58 | pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps) 59 | 60 | # Track history in training 61 | torch.set_grad_enabled(True) 62 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 63 | optimizer.zero_grad() 64 | loss.backward() 65 | optimizer.step() 66 | 67 | running_loss = (running_loss * count + loss.item()) / (count + 1) 68 | count = count + 1 69 | if count % 64 == 0: 70 | logger.info("Running Loss so far: " + str(running_loss)) 71 | logger.info("Records processed so far: " + str(count*seq_cat_data.shape[0])) 72 | time_so_far = time.time() - val_start_time 73 | logger.info("Time taken since start:" + str(time_so_far)) 74 | 75 | val_end_time = time.time() 76 | 77 | logger.info("Total time taken:" + str(val_end_time - val_start_time)) 78 | logger.info("Running loss at the end of training epoch:" + str(running_loss)) 79 | return running_loss, val_end_time - val_start_time 80 | 81 | 82 | def run_autoencoder_val_peta(autoenc, steps_per_epoch, val_dataloader_iter, criterion, device, tgt_id_col, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps): 83 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 84 | count = 0 85 | val_start_time = time.time() 86 | running_loss = 0.0 87 | for _ in range(steps_per_epoch): 88 | pd_batch = next(val_dataloader_iter) 89 | _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = transform_and_load( 90 | pd_batch, device, remove_underscore_in_seq_col_name_list(tgt_id_col), remove_underscore_in_seq_col_name_list(seq_cols), remove_underscore_in_seq_col_name_list(non_seq_cols), output_col, remove_underscore_in_seq_col_name_list(cat_cols), remove_underscore_in_seq_col_name_list(cont_cols), time_steps) 91 | 92 | # No gradient tracking during validation 93 | torch.set_grad_enabled(False) 94 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 95 | 96 | running_loss = (running_loss * count + loss.item()) / (count + 1) 97 | 98 | count = count + 1 99 | if count % 64 == 0: 100 | logger.info("Running Loss so far: " + str(running_loss)) 101 | logger.info("Records processed so far: " + str(count*seq_cat_data.shape[0])) 102 | time_so_far = time.time() - val_start_time 103 | logger.info("Time taken since start:" + str(time_so_far)) 104 | 105 | val_end_time = time.time() 106 | 107 | logger.info("Total time taken:" + str(val_end_time - val_start_time)) 108 | logger.info("Running loss at the end of validation epoch:" + str(running_loss)) 109 | return running_loss, val_end_time - val_start_time 110 | 111 | 112 | def train_peta_hvd(converter_train, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024, epochs=10, learning_rate=0.01,
warmup_epochs=5, save_model=False, path='./early_stop_model.pth'): 113 | autoenc.train() 114 | hvd.init() # Initialize Horovod. 115 | logger.info("Number of workers: " + str(hvd.size())) 116 | # Horovod: pin GPU to local rank. 117 | if torch.cuda.is_available(): 118 | torch.cuda.set_device(hvd.local_rank()) 119 | device = torch.cuda.current_device() 120 | else: 121 | device = torch.device("cpu") 122 | 123 | # Dataset sharding across workers is handled by Petastorm below (cur_shard/shard_count), 124 | # so no DistributedSampler is needed here. 125 | 126 | 127 | 128 | 129 | autoenc = autoenc.to(device) 130 | num_epochs = epochs 131 | 132 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 133 | # An increase in learning rate compensates for the increased batch size. 134 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 135 | 136 | # Broadcast initial parameters so all workers start with the same parameters. 137 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 138 | hvd.broadcast_optimizer_state(optimizer, root_rank=ROOT_RANK) 139 | 140 | # Wrap the optimizer with Horovod's DistributedOptimizer. 141 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 142 | 143 | scheduler_wu, scheduler_re = init_lr_schedulers( 144 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 145 | 146 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 147 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 148 | 149 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 150 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 151 | 152 | with converter_train.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(), 153 | batch_size=batch_size, data_loader_fn=BatchedDataLoader, 154 | num_epochs=None, cache_type=PS_CACHE_TYPE, 155 | workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(), 156 | reader_pool_type=PS_WORKER_TYPE, 157 | hdfs_driver=PS_HDFS_DRIVER) as train_dataloader: 158 | train_dataloader_iter = iter(train_dataloader) 159 | steps_per_epoch = max(1, len(converter_train) // (batch_size * hvd.size())) 160 | total_time = 0 161 | 162 | epoch = 1 163 | while epoch < num_epochs + 1: 164 | loss, epoch_time = run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, 165 | criterion, device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) 166 | # Checkpoints are saved only on the root worker; see the rank check below.
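# metric_average (defined near the top of this file) wraps hvd.allreduce, whose default
# reduction is an average, so every rank sees the same loss value when stepping the
# schedulers and the early stopper below.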
167 | total_time = total_time + epoch_time 168 | loss_averaged = metric_average(torch.tensor(loss), 'avg_loss') 169 | logger.info("Average overall training loss in epoch {0} is {1}".format( 170 | epoch, loss_averaged)) 171 | 172 | if epoch <= warmup_epochs: 173 | scheduler_wu.step() 174 | scheduler_re.step(loss_averaged) 175 | 176 | epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs) 177 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 178 | if save_model: 179 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 180 | return autoenc, loss_averaged, total_time 181 | epoch = epoch+1 182 | 183 | 184 | def train_val_peta_hvd(converter_train, converter_val, autoenc, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps, batch_size=1024, epochs=10, learning_rate=0.01, warmup_epochs=5, save_model=False, path='./early_stop_model.pth'): 185 | autoenc.train() 186 | hvd.init() # Initialize Horovod. 187 | logger.info("Number of workers:" + str(hvd.size())) 188 | # Horovod: pin GPU to local rank. 189 | if torch.cuda.is_available(): 190 | torch.cuda.set_device(hvd.local_rank()) 191 | device = torch.cuda.current_device() 192 | else: 193 | device = torch.device("cpu") 194 | 195 | autoenc = autoenc.to(device) 196 | num_epochs = epochs 197 | 198 | # Effective batch size in synchronous distributed training is scaled by the number of workers. 199 | # An increase in learning rate compensates for the increased batch size. 200 | optimizer = optim.Adam(autoenc.parameters(), lr=learning_rate * hvd.size()) 201 | 202 | # Broadcast initial parameters so all workers start with the same parameters. 203 | hvd.broadcast_parameters(autoenc.state_dict(), root_rank=ROOT_RANK) 204 | hvd.broadcast_optimizer_state(optimizer, root_rank=ROOT_RANK) 205 | 206 | # Wrap the optimizer with Horovod's DistributedOptimizer. 
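# (DistributedOptimizer hooks the backward pass to allreduce-average the gradients
# across all ranks before each optimizer step, keeping the model replicas in sync.)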
207 | optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=autoenc.named_parameters()) 208 | 209 | scheduler_wu, scheduler_re = init_lr_schedulers( 210 | optimizer, warmup_epochs, reduce_patience=int(EARLY_STOPPING_PATIENCE/2), verbose=True) 211 | 212 | hvd.broadcast_object(scheduler_wu, root_rank=ROOT_RANK) 213 | hvd.broadcast_object(scheduler_re, root_rank=ROOT_RANK) 214 | 215 | criterion = [nn.MSELoss(), nn.CrossEntropyLoss()] 216 | early_stopper = DistributedEarlyStopping(logger, patience=EARLY_STOPPING_PATIENCE, delta=EARLY_STOPPING_DELTA) 217 | 218 | with converter_val.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(), 219 | batch_size=batch_size, data_loader_fn=BatchedDataLoader, 220 | num_epochs=None, cache_type=PS_CACHE_TYPE, 221 | workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(), 222 | reader_pool_type=PS_WORKER_TYPE, 223 | hdfs_driver=PS_HDFS_DRIVER) as val_dataloader, \ 224 | converter_train.make_torch_dataloader(cur_shard=hvd.rank(), shard_count=hvd.size(), 225 | batch_size=batch_size, data_loader_fn=BatchedDataLoader, 226 | num_epochs=None, cache_type=PS_CACHE_TYPE, 227 | workers_count=PS_WORKERS_PER_CPU * get_default_parallelism(), 228 | reader_pool_type=PS_WORKER_TYPE, 229 | hdfs_driver=PS_HDFS_DRIVER) as train_dataloader: 230 | 231 | val_dataloader_iter = iter(val_dataloader) 232 | steps_val = max(1, len(converter_val) // (batch_size * hvd.size())) 233 | 234 | train_dataloader_iter = iter(train_dataloader) 235 | steps_per_epoch = max(1, len(converter_train) // (batch_size * hvd.size())) 236 | total_time = 0 237 | 238 | epoch = 1 239 | while epoch < num_epochs + 1: 240 | autoenc.train() 241 | _, epoch_time = run_autoencoder_peta(autoenc, optimizer, steps_per_epoch, train_dataloader_iter, 242 | criterion, device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) 243 | autoenc.eval() 244 | val_loss, val_epoch_time = run_autoencoder_val_peta(autoenc, steps_val, val_dataloader_iter, criterion, 245 | device, tgt_id, seq_cols, non_seq_cols, output_col, cat_cols, cont_cols, time_steps) 246 | total_time = total_time + epoch_time + val_epoch_time 247 | 248 | loss_averaged = metric_average(torch.tensor(val_loss), 'avg_loss') 249 | logger.info("Average overall validation loss in epoch {0} is {1}".format( 250 | epoch, loss_averaged)) 251 | 252 | if epoch <= warmup_epochs: 253 | scheduler_wu.step() 254 | scheduler_re.step(loss_averaged) 255 | 256 | epoch = determine_early_stop(early_stopper, loss_averaged, autoenc, path, epoch, num_epochs) 257 | if hvd.rank() == ROOT_RANK and epoch == num_epochs: 258 | if save_model: 259 | save_checkpoint(autoenc, optimizer, epoch, 'encoder') 260 | return autoenc, loss_averaged, total_time 261 | epoch = epoch+1 262 | -------------------------------------------------------------------------------- /caspr/utils/spark/score.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from pyspark.sql.functions import array, col, pandas_udf 8 | from pyspark.sql.types import ArrayType, FloatType 9 | 10 | from caspr.data.common_dataset import CommonDataset, id_collate 11 | from caspr.models.factory import LSTM, TRANSFORMER 12 | from caspr.utils.preprocess import get_nonempty_tensors 13 | from caspr.utils.score import get_architecture 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def score(df, model, seq_cols, non_seq_cols, cat_cols,
cont_cols, time_steps, batch_size=16*2048): 19 | model.eval() 20 | 21 | # vectorizing continuous and discrete features separately 22 | output = df.withColumn('cont_features', array([col(f) for f in cont_cols])).drop(*cont_cols) 23 | output = output.withColumn('cat_features', array([col(f) for f in cat_cols])).drop(*cat_cols) 24 | 25 | if torch.cuda.is_available(): 26 | device = torch.device("cuda") 27 | else: 28 | device = torch.device("cpu") 29 | 30 | logger.info("Scoring on: %s" % device) 31 | 32 | # making sure the model is on CPU before the UDF is defined 33 | model.cpu() 34 | 35 | def calculate_embeddings(continuous, categorical): 36 | try: 37 | model.to(device) 38 | embeddings = [] 39 | batch_ds = CommonDataset.for_inference(continuous, categorical, seq_cols, 40 | non_seq_cols, cat_cols, cont_cols, time_steps) 41 | 42 | # nested batching to ensure Spark does not trigger CUDA OOM with larger datasets 43 | data_loader = torch.utils.data.DataLoader(batch_ds, batch_size=batch_size, collate_fn=id_collate) 44 | 45 | for _, _, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in data_loader: 46 | 47 | data = [seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data] 48 | if isinstance(model, torch.nn.Module): 49 | data = [d.to(device) for d in data] 50 | 51 | nonempty_tensors, nonempty_idx = get_nonempty_tensors(data) 52 | 53 | if get_architecture(model) == TRANSFORMER: 54 | emb, _, _ = model.unified_encoder(*nonempty_tensors, nonempty_idx) 55 | # Concatenate across timesteps 56 | emb = emb.reshape(emb.shape[0], -1) 57 | embeddings.append(emb.detach().cpu() if isinstance(emb, torch.Tensor) else emb) 58 | 59 | elif get_architecture(model) == LSTM: 60 | _, (hn, _) = model.unified_encoder(*nonempty_tensors, nonempty_idx) 61 | embeddings.append(hn.detach().cpu() if isinstance(hn, torch.Tensor) else hn) 62 | 63 | embeddings = pd.DataFrame(np.concatenate(embeddings, axis=0)) 64 | 65 | return pd.Series(embeddings.values.tolist()) 66 | 67 | finally: 68 | # can release resources here, if needed 69 | pass 70 | 71 | # Pandas UDF declaration with float[] return type 72 | score_udf = pandas_udf(calculate_embeddings, ArrayType(FloatType())) 73 | 74 | # Calculating the embeddings as an additional column and dropping the temporary vectors 75 | output = output.withColumn('embeddings', score_udf('cont_features', 'cat_features') 76 | ).drop('cont_features', 'cat_features') 77 | 78 | return output 79 | -------------------------------------------------------------------------------- /caspr/utils/train.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import logging 4 | import os 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | import torch.distributed as dist 10 | import torch.multiprocessing as mp 11 | from torch import optim 12 | from torch.nn.parallel import DistributedDataParallel as DDP 13 | 14 | from caspr.data.load import init_loaders 15 | from caspr.models.factory import CASPRFactory 16 | from caspr.models.model_wrapper import AutoencoderTeacherTraining, LSTMAutoencoder, TransformerAutoEncoder 17 | from caspr.utils.early_stopping import DistributedEarlyStopping, EarlyStopping 18 | from caspr.utils.metrics import get_metrics 19 | from caspr.utils.onnx import ONNXWrapper 20 | from caspr.utils.score import get_architecture 21 | 22 | DDP_BACKEND = "nccl" 23 | DDP_MASTER_ADDR = "localhost" 24 | DDP_MASTER_PORT = "12355" 25 | DDP_LOAD_WORKERS = 1 26 | STD_LOAD_WORKERS = 0 27 | logger = 
logging.getLogger(__name__) 28 | 29 | 30 | def run_autoencoder(autoenc, optimizer, dataloader_train, criterion, device): 31 | count = 0 32 | epoch_start_time = time.time() 33 | running_loss = 0.0 34 | 35 | for _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_train: 36 | y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = y.to(device), seq_cat_data.to( 37 | device), seq_cont_data.to(device), non_seq_cat_data.to(device), non_seq_cont_data.to(device) 38 | 39 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 40 | optimizer.zero_grad() 41 | loss.backward() 42 | optimizer.step() 43 | 44 | running_loss = (running_loss * count + loss.item()) / (count + 1) 45 | 46 | count = count + 1 47 | 48 | if count % 64 == 0: 49 | logger.info("Loss: " + str(loss.item()) + ", records processed so far: " + str(count * seq_cat_data.shape[0])) 50 | time_so_far = time.time() - epoch_start_time 51 | logger.info("Time taken since start: " + str(time_so_far)) 52 | 53 | epoch_end_time = time.time() 54 | logger.info("Epoch training time: " + str(epoch_end_time - epoch_start_time)) 55 | 56 | return running_loss, epoch_end_time - epoch_start_time 57 | 58 | 59 | def run_autoencoder_val(autoenc, dataloader_val, criterion, device): 60 | count = 0 61 | running_loss = 0.0 62 | 63 | for _, y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data in dataloader_val: 64 | y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data = y.to(device), seq_cat_data.to( 65 | device), seq_cont_data.to(device), non_seq_cat_data.to(device), non_seq_cont_data.to(device) 66 | 67 | _, loss = autoenc.run(y, seq_cat_data, seq_cont_data, non_seq_cat_data, non_seq_cont_data, criterion) 68 | 69 | running_loss = (running_loss * count + loss.item()) / (count + 1) 70 | count = count + 1 71 | 72 | if count % 64 == 0: 73 | logger.info("Loss: " + str(loss.item()) + ", records processed so far: " + str(count * seq_cat_data.shape[0])) 74 | 75 | return running_loss 76 | 77 | 78 | def run_epoch(model, epoch, dataloader, criterion, device, optimizer=None, is_train=True, get_outputs=False): 79 | model.to(device) 80 | losses = [] 81 | y_labels = [] 82 | y_preds = [] 83 | 84 | if isinstance(model, DDP): 85 | model = model.module 86 | 87 | for _, y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x in dataloader: 88 | if is_train: 89 | optimizer.zero_grad() 90 | 91 | seq_cat_x = seq_cat_x.to(device) 92 | seq_cont_x = seq_cont_x.to(device) 93 | non_seq_cat_x = non_seq_cat_x.to(device) 94 | non_seq_cont_x = non_seq_cont_x.to(device) 95 | y = y.to(device) 96 | 97 | # Forward Pass 98 | y_pred, loss = model.run(y, seq_cat_x, seq_cont_x, non_seq_cat_x, non_seq_cont_x, criterion=criterion) 99 | losses.append(loss.detach().cpu().numpy()) 100 | 101 | if get_outputs: 102 | y_labels.append(y) 103 | y_preds.append(y_pred) 104 | 105 | # Backward Pass and Optimization 106 | if is_train: 107 | loss.backward() 108 | optimizer.step() 109 | 110 | if get_outputs: 111 | y_labels = torch.cat(y_labels, 0).detach().cpu().numpy() 112 | y_preds = torch.cat(y_preds, 0).detach().cpu().numpy() 113 | 114 | mean_loss = np.mean(np.asarray(losses)) 115 | mode = 'training' if is_train else 'validation' 116 | logger.info("Average {} loss in epoch {} is {}".format(mode, epoch, mean_loss)) 117 | return y_labels, y_preds, mean_loss 118 | 119 | 120 | def init_lr_schedulers(optimizer, warmup_epochs, reduce_mode='min', reduce_factor=0.1, reduce_patience=4, verbose=True): 121 | """ 122 | In synchronous distributed training the effective batch size grows with the number of workers, mandating a proportionally larger learning rate; too aggressive a rate, however, makes good optima harder to find.
123 | This function therefore initializes two schedulers for the given optimizer: a LambdaLR warm-up that ramps the rate up linearly over warmup_epochs, and a ReduceLROnPlateau that reduces it once the monitored loss stops improving. 124 | """ 125 | 126 | warm_up = lambda epoch: epoch / warmup_epochs if warmup_epochs > 0 and epoch <= warmup_epochs else 1 127 | scheduler_wu = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=warm_up) 128 | scheduler_re = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode=reduce_mode, factor=reduce_factor, patience=reduce_patience, verbose=verbose) 129 | 130 | return scheduler_wu, scheduler_re 131 | 132 | 133 | def train_model(model, criterion, num_epochs, dataloader_train, dataloader_val, device, save_path, lr=1e-3, fix_module_names=None, 134 | should_decrease=True, patience=8, verbose=True, evaluate_downstream=False, rank=0, world_size=1, warmup_epochs=5, save_onnx=False): 135 | 136 | if isinstance(model, (LSTMAutoencoder, AutoencoderTeacherTraining, TransformerAutoEncoder)) and evaluate_downstream: 137 | raise ValueError('evaluate_downstream should be set to False when training an autoencoder') 138 | 139 | if fix_module_names: 140 | fix_modules = [module for name, module in model.named_modules() if name in fix_module_names] 141 | for module in fix_modules: 142 | for param in module.parameters(): 143 | param.requires_grad = False 144 | module.eval() 145 | 146 | optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr) 147 | 148 | scheduler_wu, scheduler_re = init_lr_schedulers(optimizer, warmup_epochs, reduce_patience=int(patience/2), verbose=verbose) 149 | 150 | if world_size > 1: 151 | early_stopping = DistributedEarlyStopping(logger, should_decrease, patience, verbose, rank=rank, save_onnx=save_onnx) 152 | else: 153 | early_stopping = EarlyStopping(logger, should_decrease, patience, verbose, save_onnx=save_onnx) 154 | 155 | for epoch in range(num_epochs): 156 | start = time.time() 157 | 158 | model.train() 159 | if fix_module_names: 160 | for module in fix_modules: 161 | module.eval() 162 | 163 | run_epoch(model, epoch, dataloader_train, criterion, device, optimizer) 164 | 165 | model.eval() 166 | with torch.no_grad(): 167 | y_labels, y_preds, mean_val_loss = run_epoch(model, epoch, dataloader_val, criterion, device, 168 | is_train=False, get_outputs=evaluate_downstream) 169 | if evaluate_downstream: 170 | get_metrics(y_labels, y_preds) 171 | 172 | end = time.time() 173 | logger.info("Time for epoch {0} is {1}\n".format(epoch, (end - start))) 174 | logger.info("Mean validation loss for epoch {0} is {1}\n".format(epoch, mean_val_loss)) 175 | 176 | if epoch <= warmup_epochs: 177 | scheduler_wu.step() 178 | scheduler_re.step(mean_val_loss) 179 | 180 | early_stopping(mean_val_loss, model, save_path) 181 | if early_stopping.early_stop: 182 | logger.info('early stopping at epoch {}'.format(epoch)) 183 | break 184 | 185 | if rank == 0: 186 | if save_onnx: 187 | model_type = get_architecture(model) 188 | model = ONNXWrapper(save_path, model_type) 189 | elif isinstance(model, DDP): 190 | model.module.load_state_dict(torch.load(save_path)) 191 | else: 192 | model.load_state_dict(torch.load(save_path)) 193 | return model 194 | 195 | 196 | def __setup_ddp(rank, world_size): 197 | 198 | os.environ['MASTER_ADDR'] = DDP_MASTER_ADDR 199 | os.environ['MASTER_PORT'] = DDP_MASTER_PORT 200 | 201 | # initialize the process group 202 | dist.init_process_group(DDP_BACKEND, rank=rank, world_size=world_size) 203 | torch.cuda.set_device(rank) 204 | 205 | 206 | def 
__do_train_ddp(rank, args): 207 | 208 | __setup_ddp(rank, args['world_size']) 209 | 210 | caspr_factory = args['caspr_factory'] 211 | 212 | model = caspr_factory.create(args['caspr_arch'], **args['hyper_params']) 213 | 214 | model = DDP(model.cuda(), device_ids=[rank]) 215 | 216 | train_loader, val_loader = init_loaders(args['ds_train'], args['ds_val'], args['batch_size'], 217 | num_workers=DDP_LOAD_WORKERS, world_size=args['world_size'], rank=rank) 218 | 219 | train_model(model, args['criterion'], args['num_epochs'], train_loader, val_loader, rank, args['save_path'], 220 | lr=args['lr'] * args['world_size'], rank=rank, world_size=args['world_size'], **args['kwargs']) 221 | 222 | dist.destroy_process_group() 223 | 224 | 225 | def train_model_ddp(caspr_factory: CASPRFactory, caspr_arch: str, hyper_params: dict, ds_train, ds_val, criterion, num_epochs, batch_size, save_path, lr=1e-3, **kwargs): 226 | """ 227 | Distributed Data Parallel implementation of CASPR training. Will use all GPUs available on the current machine. 228 | 229 | Arguments: 230 | ---------- 231 | 232 | caspr_factory: CASPR model factory for the specified dataset 233 | 234 | caspr_arch: CASPR architecture, e.g. TransformerAutoEncoder 235 | 236 | hyper_params: parameters for instantiating a new CASPR model via the factory's create() method 237 | 238 | ds_train: CommonDataset for training 239 | 240 | ds_val: CommonDataset for validation 241 | 242 | criterion, num_epochs, batch_size, save_path, lr: loss criterion (or list of criteria), number of epochs, per-process batch size, checkpoint save path and base learning rate 243 | 244 | **kwargs: any other parameters to be passed to the train_model function by the DDP worker (e.g. evaluate_downstream, verbose or patience) 245 | 246 | Returns: Trained model 247 | 248 | """ 249 | logger.info("Setting up model training using torch DDP") 250 | 251 | for arg in [caspr_factory, caspr_arch, ds_train, ds_val, criterion, num_epochs, batch_size, save_path, lr]: 252 | if not arg: 253 | raise ValueError("Illegal null argument. Check for None values and try again.") 254 | 255 | world_size = torch.cuda.device_count() 256 | 257 | if not torch.cuda.is_available() or world_size < 2: 258 | device = "cuda" if torch.cuda.is_available() else "cpu" 259 | logger.warning("DDP mode disabled. Training on %s...", device) 260 | model = caspr_factory.create(caspr_arch, device=device, **hyper_params) 261 | train_loader, val_loader = init_loaders(ds_train, ds_val, batch_size, num_workers=STD_LOAD_WORKERS) 262 | return train_model(model, criterion, num_epochs, train_loader, val_loader, device, save_path, lr, **kwargs) 263 | 264 | logger.info("DDP mode enabled, will train on %d GPUs", world_size) 265 | 266 | arguments = locals() # snapshot of all arguments, forwarded to each spawned DDP worker 267 | 268 | mp.spawn(__do_train_ddp, 269 | args=(arguments,), 270 | nprocs=world_size, 271 | join=True) 272 | 273 | model = caspr_factory.create(caspr_arch, **hyper_params) 274 | model.load_state_dict(torch.load(save_path)) 275 | return model 276 | 277 | 278 | def test_model(model, dataloader_test, criterion, device): 279 | model.eval() 280 | with torch.no_grad(): 281 | y_labels, y_preds, _ = run_epoch( 282 | model, 0, dataloader_test, criterion, device, is_train=False, get_outputs=True) 283 | return y_labels, y_preds 284 | 285 | 286 | def count_parameters(model): 287 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 288 | -------------------------------------------------------------------------------- /docs/PR_Guidelines.md: -------------------------------------------------------------------------------- 1 | # Guidelines for creating a good pull request 2 | 3 | 1. 
A PR should describe the change clearly and, most importantly, explain the motivation behind it. Filling out the PR template should satisfy this guideline. 4 | 2. If the PR is fixing a performance issue, mention the improvement and how the measurement was done (for educational purposes). 5 | 3. Do not leave comments unresolved. If PR comments have been addressed without making the requested code changes, explicitly mark them resolved with a comment explaining why you're resolving them. If you intend to address a comment in a follow-up PR, create a task and note why it cannot be fixed in this PR. Leaving comments unresolved sets the wrong precedent for other contributors: that it's acceptable to ignore comments. 6 | 4. In the interest of time, discuss the PR or its comments in person or over the phone if they're difficult to explain in writing. Document the resolution in the PR for the educational benefit of others; don't just mark the comment resolved saying 'based on offline discussion'. 7 | 5. Add comments in the PR, where things aren't obvious, to help the reviewer navigate your PR faster. If this is a big change, include a short design doc (docs/ folder). 8 | 6. Unit tests are mandatory for all PRs (except when the proposed changes are already covered by existing unit tests). 9 | 7. Do not use PRs as scratch pads for development, as they consume valuable build/CI cycles for every commit. Build and test your changes in at least one environment (Windows/Linux/macOS) before creating a PR. 10 | 8. Keep it small. If the feature is big, split it into multiple PRs. Modulo cosmetic changes, a PR with more than 10 files is notoriously hard to review. Be kind to the reviewers. 11 | 9. Separate cosmetic changes from functional changes by making them separate PRs. 12 | 10. The PR author is responsible for merging the changes once they're approved. 13 | 11. If you co-author a PR, seek review from someone else. Do not self-approve PRs. -------------------------------------------------------------------------------- /docs/images/caspr-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/docs/images/caspr-logo.png -------------------------------------------------------------------------------- /docs/images/caspr-poster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/CASPR/9cdbf270487751a0ad6862b2fea2ccc0e23a0b67/docs/images/caspr-poster.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
3 | # ----------------------------------------------------------------------------- 4 | 5 | # ----------------------------------------------------------------------------- 6 | # Setup() configuration 7 | # ----------------------------------------------------------------------------- 8 | 9 | [metadata] 10 | name = AI.Models.CASPR 11 | version = attr: caspr.__VERSION 12 | description = CASPR 13 | long_description = file: README.rst, LICENSE 14 | keywords = CASPR, Machine Learning, Deep Learning 15 | license = MIT 16 | classifiers = 17 | Programming Language :: Python :: 3.7 18 | Intended Audience :: Developers 19 | License :: OSI Approved :: MIT License 20 | Natural Language :: English 21 | Operating System :: OS Independent 22 | Topic :: Scientific/Engineering :: Artificial Intelligence 23 | url = https://powerbi.visualstudio.com/Business360%%20AI/_git/AI.Models.CASPR 24 | 25 | [options] 26 | zip_safe = False 27 | include_package_data = True 28 | packages = find: 29 | 30 | install_requires = ## base (common) requirements 31 | pandas>1.0 32 | imbalanced-learn>=0.8 33 | scikit-learn>=0.7 34 | scipy>=1.5 35 | matplotlib>=3.3 36 | torch~=1.11.0 37 | protobuf<4.0 38 | onnx~=1.10.1 39 | onnxruntime~=1.7.0 40 | 41 | [options.packages.find] 42 | include=caspr.* 43 | exclude=tests 44 | 45 | [options.extras_require] 46 | 47 | horovod = ## install for horovod + petastorm execution (spark.large module) 48 | pyspark~=3.1 49 | torchvision 50 | petastorm~=0.11 51 | horovod[pytorch,spark]>=0.22 52 | b360sparkdl>=1.0 53 | 54 | xai = ## install for explainability 55 | AI.Models.Explainer~=6.0 56 | captum>=0.2 57 | 58 | databricks = ## install on Databricks 59 | mlflow>=1.19 60 | petastorm~=0.11 61 | 62 | aml = ## install on Azure ML 63 | azureml-core>=1.32 64 | mlflow>=1.19 65 | azureml-mlflow>=1.32 66 | 67 | hdi = ## install on HDInsights 68 | pyspark~=2.4.5 69 | numpy<1.20.0 70 | pyarrow~=0.17.1 71 | 72 | test = ## install before test runs 73 | pytest 74 | pytest-cov 75 | pylint 76 | pylint-junit 77 | 78 | dev = ## install for PPE, latest 79 | AI.Models.Explainer 80 | captum 81 | imbalanced-learn 82 | matplotlib 83 | scikit-learn 84 | pandas 85 | numpy 86 | torch 87 | 88 | 89 | # ----------------------------------------------------------------------------- 90 | # Pylama Configurations 91 | # ----------------------------------------------------------------------------- 92 | # Documentation: https://pylama.readthedocs.io/en/latest/#command-line-options 93 | [pylama] 94 | format = pylint 95 | skip = */.tox/*,*/.env/* 96 | linters = isort,mccabe,pycodestyle,pydocstyle,pyflakes,pylint 97 | ignore = D202,D203,D213,D406,D407,D413,D415,D417 98 | 99 | 100 | # ----------------------------------------------------------------------------- 101 | # Linter-Specific Configurations 102 | # ----------------------------------------------------------------------------- 103 | # Possible settings: https://github.com/timothycrosley/isort/wiki/isort-Settings 104 | [pylama:isort] 105 | line_length = 120 106 | multi_line_output = 0 107 | combine_star = True 108 | use_parentheses = True 109 | combine_as_imports = True 110 | 111 | # Used by isort command 112 | [isort] 113 | line_length = 120 114 | multi_line_output = 0 115 | combine_star = True 116 | use_parentheses = True 117 | combine_as_imports = True 118 | 119 | # Source code: https://github.com/pycqa/mccabe 120 | [pylama:mccabe] 121 | 122 | # Codes: https://pycodestyle.pycqa.org/en/latest/intro.html#error-codes 123 | [pylama:pycodestyle] 124 | max_line_length 
= 120 125 | 126 | # Used by auto-formatters 127 | [pycodestyle] 128 | max_line_length = 120 129 | 130 | # Codes: http://www.pydocstyle.org/en/5.0.1/error_codes.html 131 | [pylama:pydocstyle] 132 | 133 | # Source code: https://github.com/PyCQA/pyflakes 134 | [pylama:pyflakes] 135 | max_line_length = 120 136 | statistics = True 137 | doctests = False 138 | builtins = _ 139 | 140 | # Codes: https://docs.pylint.org/en/1.6.0/features.html 141 | # Default settings: https://github.com/PyCQA/pylint/blob/master/pylintrc 142 | [pylama:pylint] 143 | max_line_length = 120 144 | logging_format_style = new 145 | attr_rgx = [a-z_][a-z0-9_]{,30}$ 146 | variable_rgx = [a-z_][a-z0-9_]{,30}$ 147 | argument_rgx = [a-z_][a-z0-9_]{,30}$ 148 | class_attribute_rgx = ([A-Za-z_][A-Za-z0-9_]{,30}|(__.*__))$ 149 | # Modules whose attributes are generated at runtime and thus attributes cannot be found using static analysis: 150 | ignored_modules = 151 | pyspark.sql.functions, torch, numpy 152 | 153 | 154 | # ----------------------------------------------------------------------------- 155 | # File-Specific Configurations 156 | # ----------------------------------------------------------------------------- 157 | [pylama:*tests/*.py] 158 | ignore = C0114,C0115,C0116,C0302,C0321,D,R0902,R0903,R0904,W0612,W0613,C0103,R0914 159 | 160 | [pylama:*caspr/models/lstm_autoencoder_sequence.py] 161 | ignore = C0103 162 | 163 | [pylama:*caspr/models/attention_mechanisms.py] 164 | ignore = C0103 165 | 166 | [pylama:*caspr/utils/train.py] 167 | ignore = W0613 168 | 169 | [pylama:*caspr/utils/spark/large/train.py] 170 | ignore = E1102, E1121 171 | 172 | [pylama:*caspr/utils/spark/large/score.py] 173 | ignore = E1121 174 | 175 | [pylama:*caspr/utils/preprocess.py] 176 | ignore = R0913, R0914 177 | 178 | [pylama:*caspr/utils/spark/preprocess.py] 179 | ignore = R0913, R0914, W0640 180 | 181 | [pylama:*caspr/utils/explain/CASPRExplainer.py] 182 | ignore = C0103, R0902, R0913, W0221 183 | 184 | [pylama:*caspr/utils/explain/utils.py] 185 | ignore = R0914 186 | 187 | [pylama:*caspr/utils/segmentation/pandas.py] 188 | ignore = W0703, W0102, R0913, R0914, W0612 189 | 190 | [pylama:*caspr/utils/segmentation/dec_utils.py] 191 | ignore = E1102, R0914 192 | 193 | [pylama:*setup.py] 194 | ignore = A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z 195 | # skip = 1 # Not currently enforced 196 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | # replaced by AI.Common build template 4 | auto_replaced = "__version__" 5 | 6 | # minor trick to circumvent version warning when building manually 7 | version = None if 'version' in auto_replaced else auto_replaced 8 | 9 | setup(version=version) 10 | --------------------------------------------------------------------------------
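A minimal end-to-end sketch of the DDP training entry point defined in caspr/utils/train.py above. It assumes ds_train and ds_val are already-prepared CommonDataset instances and that factory is a CASPRFactory configured for the same schema; those construction details are dataset-specific and not shown in this listing, so treat the names and hyperparameters below as placeholders rather than a definitive recipe.

import torch.nn as nn
from caspr.models.factory import CASPRFactory
from caspr.utils.train import count_parameters, train_model_ddp

# factory, ds_train and ds_val are assumed to exist (see caspr.models.factory
# and caspr.data.common_dataset); hyper_params is left empty for illustration.
criterion = [nn.MSELoss(), nn.CrossEntropyLoss()]  # the loss pair used throughout the training utils
model = train_model_ddp(factory, 'TransformerAutoEncoder', hyper_params={},
                        ds_train=ds_train, ds_val=ds_val, criterion=criterion,
                        num_epochs=10, batch_size=1024, save_path='./caspr_model.pth')
print('trainable parameters:', count_parameters(model))  # falls back to single-GPU/CPU when fewer than 2 GPUs are present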