├── .flake8 ├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .pylintrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.rst ├── VERSION ├── ci ├── buildspec-deploy.yml ├── buildspec-pr.yml ├── buildspec-release.yml └── scripts │ └── displaytime.sh ├── requirements.txt ├── setup.py ├── src └── sagemaker_sklearn_extension │ ├── __init__.py │ ├── contrib │ ├── README.md │ ├── __init__.py │ └── taei │ │ ├── README.md │ │ ├── __init__.py │ │ ├── images │ │ └── overview.png │ │ ├── latent_space_oversampler.py │ │ ├── models.py │ │ ├── nn_utils.py │ │ └── star_oversampler.py │ ├── decomposition │ ├── __init__.py │ └── robust_pca.py │ ├── externals │ ├── __init__.py │ ├── automl_transformer.py │ ├── header.py │ └── read_data.py │ ├── feature_extraction │ ├── __init__.py │ ├── date_time.py │ ├── sequences.py │ └── text.py │ ├── impute │ ├── __init__.py │ └── base.py │ └── preprocessing │ ├── __init__.py │ ├── base.py │ ├── data.py │ └── encoders.py ├── test ├── __init__.py ├── contrib │ └── taei │ │ ├── data │ │ └── data.csv │ │ └── test_taei.py ├── data │ └── csv │ │ ├── dictionaries.csv │ │ ├── dirty.csv │ │ ├── invalid.csv │ │ ├── kc_house_data.csv │ │ ├── missing_values.csv │ │ ├── mock_datasplitter_output │ │ ├── excel.csv │ │ ├── manual.csv │ │ ├── newline.csv │ │ └── oneline.csv │ │ └── regression_na_labels.csv ├── test_automl_transformer.py ├── test_common.py ├── test_data.py ├── test_date_time.py ├── test_feature_extraction_text.py ├── test_header.py ├── test_impute.py ├── test_preprocessing.py ├── test_preprocessing_encoders.py ├── test_read_data.py ├── test_robust_pca.py └── test_sequence_transformer.py └── tox.ini /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application_import_names = src, test 3 | import-order-style = google 4 | max-line-length = 120 5 | ignore = 6 | E203, 7 | E231 8 | W503 9 | exclude = 10 | build/ 11 | .git 12 | __pycache__ 13 | .tox 14 | venv/ 15 | max-complexity = 10 16 | require-code = True -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | ## Merge Checklist 6 | 7 | _Put an `x` in the boxes that apply. You can also fill these out after creating the PR. If you're unsure about any of them, don't hesitate to ask. We're here to help! This is simply a reminder of what we are going to look for before merging your pull request._ 8 | 9 | - [ ] I have read the [CONTRIBUTING](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/CONTRIBUTING.md) doc 10 | - [ ] I used the commit message format described in [CONTRIBUTING](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/CONTRIBUTING.md#committing-your-change) 11 | - [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) 12 | - [ ] I have updated any necessary [documentation](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/README.rst) (if appropriate) 13 | 14 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 
15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | *.class 4 | *~ 5 | *# 6 | /docs/_build 7 | /runpy 8 | /build 9 | .coverage* 10 | **/.idea 11 | **/.history 12 | **/.cache 13 | **/.eggs 14 | **/.DS_Store 15 | *.egg 16 | *.egg-info 17 | .*.swp 18 | .mypy_cache 19 | .pytest_cache 20 | tags 21 | __pycache__ 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | pip-wheel-metadata/ 46 | share/python-wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .nox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *.cover 72 | .hypothesis/ 73 | .pytest_cache/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # mkdocs documentation 105 | /site 106 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v2.5.0 (2022-02-17) 4 | 5 | ### Features 6 | 7 | * Similarity encoding 8 | 9 | ### Bug fixes and other changes 10 | 11 | * Merge pull request #40 from GiannisMitr/tsfresh_extractor_speedups 12 | * apply feature thresholds before extracting features & add cap in total generated features. 13 | * remove RobustScaler from TSFreshExtractor. 
14 | * enable parallelism in TSFeature extraction, excluding "sagemaker_serve" executions 15 | * Merge pull request #39 from zkarnin/sim_encode 16 | * Fixing broken dependency in tsfresh 17 | 18 | ## v2.4.0 (2021-09-23) 19 | 20 | ### Features 21 | 22 | * expansion control for TSFeatureExtractor 23 | 24 | ### Bug fixes and other changes 25 | 26 | * Merge pull request #38 from nikitaivkin/master 27 | 28 | ## v2.3.0 (2021-08-16) 29 | 30 | ### Features 31 | 32 | * transformers for time series 33 | 34 | ## v2.2.1 (2021-05-21) 35 | 36 | ### Bug fixes and other changes 37 | 38 | * Datetime fix 39 | 40 | ## v2.2.0 (2021-04-13) 41 | 42 | ### Features 43 | 44 | * taei contrib library 45 | 46 | ### Bug fixes and other changes 47 | 48 | * broken tests and dependencies 49 | 50 | ## v2.1.0 (2020-10-21) 51 | 52 | ### Features 53 | 54 | * adds threshold and max_categories parameter to RobustOrdinalEncoder 55 | * Add weight of evidence encoder 56 | 57 | ### Bug fixes and other changes 58 | 59 | * use named functions instead of lambdas in DateTimeDefintions because of pickle 60 | 61 | ## v2.0.0 (2020-08-13) 62 | 63 | ### Breaking changes 64 | 65 | * update sklearn dependency version to 0.23 and mlio version to 0.5 66 | 67 | ### Features 68 | 69 | * OrdinalEncoder can output np.nan instead of n for unseen values 70 | 71 | ### Bug fixes and other changes 72 | 73 | * minor performance optimizations and refactoring 74 | 75 | ## v1.2.0 (2020-07-29) 76 | 77 | ### Features 78 | 79 | * adds a `get_classes` method to `RobustLabelEncoder` 80 | 81 | ## v1.1.1 (2020-07-21) 82 | 83 | ### Bug fixes and other changes 84 | 85 | * Merge pull request #18 from ipanepen/rle-bug 86 | * test data reading when n_rows = 1 mod batch_size 87 | * bug fix: makes fit_transform behavior consistent with fit and transform 88 | * fix a minor bug in OneHotEncoder by by overloading the buggy method in ThresholdOneHotEncoder and fixing it 89 | 90 | ## v1.1.0 (2020-02-24) 91 | 92 | ### Features 93 | 94 | * dummy feature commit for RobustOrdinalEncoder & add badges to README 95 | 96 | ### Bug fixes and other changes 97 | 98 | * libprotobuf==3.11.4 is not backwards compatible, specify tox version for testing 99 | * Merge pull request #11 from ipanepen/master 100 | * fix for MemoryError in ThresholdOneHotEncoder 101 | * Adding RobustOrdinalEncoder 102 | * Specify mlio version 0.2.7 103 | 104 | ## v1.0.0 (2019-12-03) 105 | 106 | ### Bug fixes and other changes 107 | 108 | * update to 1.0.0, fix buildspec 109 | * update ci deployment credentials 110 | * Merge pull request #4 from wiltonwu/master 111 | * update documentation, remove CHANGELOG.md for 0.1.0 deployment, add date_time module 112 | * Merge pull request #2 from ipanepen/ipanepen-add-random-seed 113 | * adds np.random.seed(0) to test_preprocessing.py to ensure deterministic behavior 114 | * Initial commit 115 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws/sagemaker-scikit-learn-extension/issues), or [recently closed](https://github.com/aws/sagemaker-scikit-learn-extension/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | ### Pulling Down the Code 31 | 32 | 1. If you do not already have one, create a GitHub account by following the prompts at [Join Github](https://github.com/join). 33 | 1. Create a fork of this repository on GitHub. You should end up with a fork at `https://github.com/<username>/sagemaker-scikit-learn-extension`. 34 | 1. Follow the instructions at [Fork a Repo](https://help.github.com/en/articles/fork-a-repo) to fork a GitHub repository. 35 | 1. Clone your fork of the repository: `git clone https://github.com/<username>/sagemaker-scikit-learn-extension` where `<username>` is your GitHub username. 36 | 37 | 38 | ### Running the Unit Tests 39 | 40 | 1. Install conda or miniconda if you have not already done so. See [conda/miniconda installation instructions.](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) 41 | 1. cd into the sagemaker-scikit-learn-extension folder: `cd sagemaker-scikit-learn-extension` or `cd /environment/sagemaker-scikit-learn-extension` 42 | 1. Install test dependencies using `pip install .[test]` (or, for Zsh users: `pip install .\[test\]`) 43 | 1. Run the following tox command and verify that all code checks and unit tests pass: `tox` 44 | 1. Note that this will run unit tests, linting, package checks, and code formatting checks.
45 | 46 | You can also run a single test with the following command: `tox -e py37 -- -s -vv <path_to_test_file>::<test_function_name>` 47 | * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE` 48 | * Example: `export IGNORE_COVERAGE=- ; tox -e py37 -- -s -vv test/test_impute.py::test_robust_imputer ; unset IGNORE_COVERAGE` 49 | 50 | 51 | ### Making and Testing Your Change 52 | 53 | 1. Create a new git branch: 54 | ```shell 55 | git checkout -b my-fix-branch master 56 | ``` 57 | 1. Make your changes, **including unit tests**. 58 | 1. Include unit tests when you contribute new features or make bug fixes, as they help to: 59 | 1. Prove that your code works correctly. 60 | 1. Guard against future breaking changes to lower the maintenance cost. 61 | 1. Please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 62 | 1. Run all the unit tests as per [Running the Unit Tests](#running-the-unit-tests), and verify that all checks and tests pass. 63 | 1. Note that this also runs tools that may be necessary for the automated build to pass (ex: code reformatting by 'black'). 64 | 65 | 66 | ### Committing Your Change 67 | 68 | We use commit messages to update the project version number and generate changelog entries, so it's important for them to follow the right format. Valid commit messages include a prefix, separated from the rest of the message by a colon and a space. Here are a few examples: 69 | 70 | ``` 71 | feature: support sparse inputs for RobustStandardScaler 72 | fix: fix flake8 errors 73 | ``` 74 | 75 | Valid prefixes are listed in the table below. 76 | 77 | | Prefix | Use for... | 78 | |----------------:|:-----------------------------------------------------------------------------------------------| 79 | | `breaking` | Incompatible API changes. | 80 | | `deprecation` | Deprecating an existing API or feature, or removing something that was previously deprecated. | 81 | | `feature` | Adding a new feature. | 82 | | `fix` | Bug fixes. | 83 | | `change` | Any other code change. | 84 | | `documentation` | Documentation changes. | 85 | 86 | Some of the prefixes allow abbreviation; `break`, `feat`, `depr`, and `doc` are all valid. If you omit a prefix, the commit will be treated as a `change`. 87 | 88 | For the rest of the message, use imperative style and keep things concise but informative. See [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/) for guidance. 89 | 90 | 91 | ### Sending a Pull Request 92 | 93 | GitHub provides additional documentation on [Creating a Pull Request](https://help.github.com/articles/creating-a-pull-request/). 94 | 95 | Please remember to: 96 | * Use commit messages (and PR titles) that follow the guidelines under [Committing Your Change](#committing-your-change). 97 | * Send us a pull request, answering any default questions in the pull request interface. 98 | * Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 99 | 100 | 101 | ## Finding contributions to work on 102 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/sagemaker-scikit-learn-extension/labels/help%20wanted) issues is a great place to start.
103 | 104 | 105 | ## Code of Conduct 106 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 107 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 108 | opensource-codeofconduct@amazon.com with any additional questions or comments. 109 | 110 | 111 | ## Security issue notifications 112 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 113 | 114 | 115 | ## Licensing 116 | 117 | See the [LICENSE](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 118 | 119 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include LICENSE.txt 3 | include VERSION 4 | include README.rst -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Sagemaker Scikit Learn Extension 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | SageMaker Scikit-Learn Extension 2 | ================================ 3 | 4 | .. image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg 5 | :target: https://opensource.org/licenses/Apache-2.0 6 | :alt: License 7 | 8 | .. image:: https://img.shields.io/pypi/v/sagemaker-scikit-learn-extension.svg 9 | :target: https://pypi.python.org/pypi/sagemaker-scikit-learn-extension 10 | :alt: Latest Version 11 | 12 | .. image:: https://img.shields.io/badge/code_style-black-000000.svg 13 | :target: https://github.com/python/black 14 | :alt: Code style: black 15 | 16 | SageMaker Scikit-Learn Extension is a Python module for machine learning built on top of `scikit-learn `_. 17 | 18 | This project contains standalone scikit-learn estimators and additional tools to support SageMaker Autopilot. Many of the additional estimators are based on existing scikit-learn estimators. 
19 | 20 | 21 | User Installation 22 | ----------------- 23 | 24 | To install, 25 | 26 | :: 27 | 28 | # install from pip 29 | pip install sagemaker-scikit-learn-extension 30 | 31 | In order to use the I/O functionalities in the :code:`sagemaker_sklearn_extension.externals` module, you will also need to install the :code:`mlio` version 0.7 package via conda. The :code:`mlio` package is only available through conda at the moment. 32 | 33 | To install :code:`mlio`, 34 | 35 | :: 36 | 37 | # install mlio 38 | conda install -c mlio -c conda-forge mlio-py==0.7 39 | 40 | To see more information about mlio, see https://github.com/awslabs/ml-io. 41 | 42 | You can also install from source by cloning this repository and running a ``pip install`` command in the root directory of the repository: 43 | 44 | :: 45 | 46 | # install from source 47 | git clone https://github.com/aws/sagemaker-scikit-learn-extension.git 48 | cd sagemaker-scikit-learn-extension 49 | pip install -e . 50 | 51 | 52 | Supported Operating Systems 53 | --------------------------- 54 | 55 | SageMaker scikit-learn extension supports Unix/Linux and Mac. 56 | 57 | Supported Python Versions 58 | ------------------------- 59 | 60 | SageMaker scikit-learn extension is tested on: 61 | 62 | - Python 3.7 63 | 64 | License 65 | ------- 66 | 67 | This library is licensed under the Apache 2.0 License. 68 | 69 | Development 70 | ----------- 71 | 72 | We welcome contributions from developers of all experience levels. 73 | 74 | The SageMaker scikit-learn extension is meant to be a repository for scikit-learn estimators that don't meet scikit-learn's stringent inclusion criteria. 75 | 76 | 77 | Setup 78 | ----- 79 | 80 | We recommend using conda for development and testing. 81 | 82 | To download conda, go to the `conda installation guide <https://conda.io/projects/conda/en/latest/user-guide/install/index.html>`_. 83 | 84 | 85 | Running Tests 86 | ------------- 87 | 88 | SageMaker scikit-learn extension contains an extensive suite of unit tests. 89 | 90 | You can install the libraries needed to run the tests by running :code:`pip install --upgrade .[test]` or, for Zsh users: :code:`pip install --upgrade .\[test\]` 91 | 92 | tox uses pytest to run the unit tests in a Python 3.7 interpreter. tox also runs flake8 and pylint for style checks. 93 | 94 | conda is needed because of the dependency on mlio 0.7. 95 | 96 | To run the tests with tox, run: 97 | 98 | :: 99 | 100 | tox 101 | 102 | Running on SageMaker 103 | -------------------- 104 | 105 | To use sagemaker-scikit-learn-extension on SageMaker, you can build the `sagemaker-scikit-learn-extension-container <https://github.com/aws/sagemaker-scikit-learn-extension-container>`_.
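Example Usage
-------------

Estimators in this package follow the standard scikit-learn API, so they can be combined with scikit-learn utilities such as :code:`Pipeline`. The snippet below is a minimal, illustrative sketch using estimators listed under "Overview of Submodules"; the toy data and the default constructor arguments are assumptions for demonstration only, not part of the documented API:

::

    import numpy as np
    from sklearn.pipeline import Pipeline

    from sagemaker_sklearn_extension.impute import RobustImputer
    from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler

    # Toy numeric matrix with one missing value (illustrative data only).
    X = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0]])

    # Impute missing values, then standardize the imputed matrix.
    pipeline = Pipeline([("impute", RobustImputer()), ("scale", RobustStandardScaler())])
    X_transformed = pipeline.fit_transform(X)
    print(X_transformed.shape)  # (3, 2)

Because every transformer implements :code:`fit`/:code:`transform`, the same pattern applies to the other estimators described below.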
106 | 107 | Overview of Submodules 108 | ---------------------- 109 | 110 | * :code:`sagemaker_sklearn_extension.decomposition` 111 | * :code:`RobustPCA` dimension reduction for dense and sparse inputs 112 | * :code:`sagemaker_sklearn_extension.externals` 113 | * :code:`AutoMLTransformer` utility class encapsulating feature and target transformation functionality used in SageMaker Autopilot 114 | * :code:`Header` utility class to manage the header and target columns in tabular data 115 | * :code:`read_csv_data` reads comma separated data and returns a numpy array (uses mlio) 116 | * :code:`sagemaker_sklearn_extension.feature_extraction.date_time` 117 | * :code:`DateTimeVectorizer` convert datetime objects or strings into numeric features 118 | * :code:`sagemaker_sklearn_extension.feature_extraction.sequences` 119 | * :code:`TSFlattener` convert strings of sequences into numeric features 120 | * :code:`TSFreshFeatureExtractor` compute row-wise time series features from a numpy array (uses tsfresh) 121 | * :code:`sagemaker_sklearn_extension.feature_extraction.text` 122 | * :code:`MultiColumnTfidfVectorizer` convert collections of raw documents to a matrix of TF-IDF features 123 | * :code:`sagemaker_sklearn_extension.impute` 124 | * :code:`RobustImputer` imputer for missing values with customizable mask_function and multi-column constant imputation 125 | * :code:`RobustMissingIndicator` binary indicator for missing values with customizable mask_function 126 | * :code:`sagemaker_sklearn_extension.preprocessing` 127 | * :code:`BaseExtremeValuesTransformer` customizable transformer for columns that contain "extreme" values (columns that are heavy tailed) 128 | * :code:`LogExtremeValuesTransformer` stateful log transformer for columns that contain "extreme" values (columns that are heavy tailed) 129 | * :code:`NALabelEncoder` encoder for transforming labels to NA values 130 | * :code:`QuadraticFeatures` generate and add quadratic features to feature matrix 131 | * :code:`QuantileExtremeValuesTransformer` stateful quantiles transformer for columns that contain "extreme" values (columns that are heavy tailed) 132 | * :code:`ThresholdOneHotEncoder` encode categorical integer features as a one-hot numeric array, with optional restrictions on feature encoding 133 | * :code:`RemoveConstantColumnsTransformer` removes constant columns 134 | * :code:`RobustLabelEncoder` encode labels for seen and unseen labels 135 | * :code:`RobustStandardScaler` standardization for dense and sparse inputs 136 | * :code:`WOEEncoder` weight of evidence supervised encoder 137 | * :code:`SimilarityEncoder` encode categorical values based on their descriptive string 138 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.5.1.dev0 2 | -------------------------------------------------------------------------------- /ci/buildspec-deploy.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | build: 5 | commands: 6 | - PACKAGE_FILE="$CODEBUILD_SRC_DIR_ARTIFACT_1/sagemaker-scikit-learn-extension-*.tar.gz" 7 | - PYPI_USER=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/user --query SecretString --output text) 8 | - PYPI_PASSWORD=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/password --query SecretString --output text) 9 | 10 | - echo 'md5sum of python package:' 11 | - md5sum $PACKAGE_FILE 12 | 13 | # publish to
pypi 14 | - twine upload $PACKAGE_FILE -u $PYPI_USER -p $PYPI_PASSWORD -------------------------------------------------------------------------------- /ci/buildspec-pr.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | build: 5 | commands: 6 | # install tbb dependency 7 | - apt update -y 8 | - apt-get install -y libtbb-dev 9 | 10 | # install tox 11 | - pip install tox tox-conda==0.7.3 12 | 13 | # run linters, format verification, and package checks 14 | - start_time=`date +%s` 15 | - tox -e flake8,pylint,black-check,twine 16 | - ./ci/scripts/displaytime.sh 'flake8,pylint,twine,black-check' $start_time 17 | 18 | # run unit tests 19 | - start_time=`date +%s` 20 | - tox -e py37 21 | - ./ci/scripts/displaytime.sh 'py37 unit' $start_time 22 | 23 | # run unit tests for contrib 24 | - start_time=`date +%s` 25 | - tox -e contrib_taei_py37 26 | - ./ci/scripts/displaytime.sh 'contrib_taei_py37 unit' $start_time -------------------------------------------------------------------------------- /ci/buildspec-release.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | build: 5 | commands: 6 | # run git-secrets 7 | - git-secrets --scan-history 8 | 9 | # install tbb dependency 10 | - apt update -y 11 | - apt-get install -y libtbb-dev 12 | 13 | # install tox 14 | - pip install tox tox-conda==0.7.3 15 | 16 | # prepare release 17 | - git-release --prepare --min-version 1.0.0 18 | 19 | # run linters 20 | - tox -e flake8,pylint 21 | 22 | # run format verification 23 | - tox -e black-check 24 | 25 | # run package check 26 | - tox -e twine 27 | 28 | # run unit tests 29 | - tox -e py37 30 | 31 | # run unit tests for contrib 32 | - tox -e contrib_taei_py37 33 | 34 | # generate distribution package 35 | - python3 setup.py sdist 36 | 37 | # publish release to github 38 | - git-release --publish --min-version 1.0.0 39 | 40 | artifacts: 41 | files: 42 | - dist/sagemaker-scikit-learn-extension-*.tar.gz 43 | name: ARTIFACT_1 44 | discard-paths: yes -------------------------------------------------------------------------------- /ci/scripts/displaytime.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file. This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 
14 | 15 | set -euo pipefail 16 | 17 | echo =================== $1 execution time =================== 18 | 19 | start_time=$2 20 | end_time=`date +%s` 21 | total_time=$(expr $end_time - $start_time + 1) 22 | hours=$((total_time/60/60%24)) 23 | minutes=$((total_time/60%60)) 24 | secs=$((total_time%60)) 25 | 26 | (( $hours > 0 )) && printf '%d hours ' $hours 27 | (( $minutes > 0 )) && printf '%d minutes ' $minutes 28 | (( $hours > 0 || $minutes > 0 )) && printf 'and ' 29 | printf '%d seconds\n\n' $secs -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.16.4 2 | psutil 3 | scikit-learn==0.23.2 4 | python-dateutil==2.8.0 5 | pandas==1.2.4 6 | tsfresh==0.18.0 7 | statsmodels==0.12.2 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import os 15 | 16 | from setuptools import find_packages, setup 17 | 18 | 19 | def read(fname): 20 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 21 | 22 | 23 | def read_version(): 24 | return read("VERSION").strip() 25 | 26 | 27 | EXTRAS_REQUIRE = { 28 | "test": ["tox", "tox-conda==0.7.3", "pytest", "coverage"], 29 | "taei": ["torch==1.7.1"], 30 | } 31 | 32 | 33 | setup( 34 | name="sagemaker-scikit-learn-extension", 35 | version=read_version(), 36 | description="Open source library extension of scikit-learn for Amazon SageMaker.", 37 | packages=find_packages(where="src", exclude=("test",)), 38 | package_dir={"": "src"}, 39 | long_description=read("README.rst"), 40 | author="Amazon Web Services", 41 | url="https://github.com/aws/sagemaker-scikit-learn-extension/", 42 | license="Apache License 2.0", 43 | keywords="ML Amazon AWS AI SKLearn Scikit-Learn", 44 | classifiers=["Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License"], 45 | extras_require=EXTRAS_REQUIRE, 46 | ) 47 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | """ 15 | Amazon SageMaker extension module of sklearn 16 | ============================================ 17 | 18 | 19 | """ 20 | from . import * # noqa: F401, F403 21 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/README.md: -------------------------------------------------------------------------------- 1 | # SageMaker Scikit-Learn Extension Contrib Extensions 2 | 3 | Contrib is a package of utilities that work with Scikit-Learn extension but are not directly within the scope of the core Scikit-Learn Extension library. Currently the contrib package includes: 4 | - `taei`: Implementations of the latent space minority oversampling techniques proposed in [1] 5 | 6 | ### References 7 | [1] S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | Amazon SageMaker extension module of sklearn - contrib 16 | ====================================================== 17 | 18 | 19 | """ 20 | from . import * # noqa: F401, F403 21 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/README.md: -------------------------------------------------------------------------------- 1 | # Tabular AutoEncoder Interpolator 2 | 3 | ## Overview 4 | overview 5 | 6 | This library contains implementations of the latent space minority oversampling techniques proposed in [1] for 7 | multi-modal data. These oversamplers work by 8 | 9 | 1. Mapping the multi-modal samples to a dense continuous latent space using an autoencoder 10 | 2. Applying oversampling by interpolation in the latent space 11 | 3. Mapping the synthetic samples back to the original feature space 12 | 13 | This framework was shown to be effective in generating high-quality multi-modal synthetic data which then resulted in 14 | better prediction quality for downstream tasks. 15 | 16 | #### LatentSpaceOversampler 17 | The interpolator is implemented by `LatentSpaceOversampler` which takes two inputs at initialization: 18 | - `model` - The autoencoder used to map the samples to the latent space and back. Currently, two 19 | autoencoders are provided with the package: `AE` which is a vanilla autoencoder and `VAE` which is a variational 20 | autoencoder. 21 | - `base_oversampler` function - The oversampling function applied in the latent space. 
We have experimented with 22 | `SMOTE` from [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) and `StarOversampler` which is 23 | our light weight implementation (provided with this package) of `polynom_fit_SMOTE`[2] based on the implementation of 24 | [smote_variants](https://github.com/analyticalmindsltd/smote_variants)[3] 25 | 26 | ## Installation 27 | It is recommended to install from PyPI 28 | ``` 29 | pip install sagemaker-scikit-learn-extension[taei] 30 | 31 | # For Zsh users: 32 | pip install sagemaker-scikit-learn-extension\[taei]\ 33 | ``` 34 | 35 | ## Examples 36 | [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) is required to run the examples below as 37 | it provides the dataset and the base oversampler. Install 38 | [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) by 39 | ``` 40 | pip install imbalanced-learn==0.7 41 | ``` 42 | 43 | TAEI supports input of either a numpy.ndarray or a pandas.DataFrame object with two types of columns: 44 | - Continuous columns: numeric values, can have very large cardinality 45 | - Discrete (categorical) columns: numeric values with low cardinality. These columns need be encoded to ordinal integers 46 | before using TAEI. This could be easily done using `sagemaker_sklearn_extension.preprocessing.OrdinalEncoder` 47 | 48 | Next we load the dataset from [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) and specify 49 | which columns are continuous and which are discrete 50 | ```python 51 | import imblearn.datasets 52 | 53 | # load the datasets 54 | d = imblearn.datasets.fetch_datasets()["abalone"] 55 | # indexes of categorical features 56 | categorical_features = [0, 1, 2] 57 | # number of uniques for each categorical feature 58 | categorical_dims = [2, 2, 2] 59 | # indexes of continuous features 60 | continuous_features = [3, 4, 5, 6, 7, 8, 9] 61 | ``` 62 | 63 | ### Vanilla autoencoder + SMOTE 64 | We start with an example of wrapping SMOTE with a vanilla autoencoder 65 | ```python 66 | from imblearn.over_sampling import SMOTE 67 | from sagemaker_sklearn_extension.contrib.taei import LatentSpaceOversampler, AE 68 | 69 | ae_smote = LatentSpaceOversampler( 70 | model=AE( 71 | categorical_features=categorical_features, 72 | categorical_dims=categorical_dims, 73 | continuous_features=continuous_features, 74 | ), 75 | base_oversampler=SMOTE(sampling_strategy=0.5).fit_resample, 76 | ) 77 | ``` 78 | We train the autoencoder on the training data before using the oversampler 79 | ```python 80 | ae_smote.fit(X=d["data"], y=d["target"], verbose=True) 81 | ``` 82 | 83 | Finally, we can oversample the minority class 84 | ```python 85 | # Oversample the minority class 86 | X_oversampled, y_oversampled = ae_smote.resample(X=d["data"], y=d["target"], verbose=True) 87 | ``` 88 | Note that the base oversampler, SMOTE in our case, controls the number of minority samples generated 89 | 90 | ### Variational autoencoder + StarOversampler 91 | We demonstrate PolynomFit using the "star" topology [2] wrapped by a variational autoencoder, a combination yielding 92 | superior prediction quality in our experiments[1]. 
For PolynomFit, we use our light weight implementation, 93 | `StarOversampler`, based on the implementation of 94 | [smote_variants](https://github.com/analyticalmindsltd/smote_variants)[3] 95 | ```python 96 | from sagemaker_sklearn_extension.contrib.taei import LatentSpaceOversampler, VAE, StarOversampler 97 | 98 | vae_poly = LatentSpaceOversampler( 99 | model=VAE( 100 | categorical_features=categorical_features, 101 | categorical_dims=categorical_dims, 102 | continuous_features=continuous_features, 103 | ), 104 | base_oversampler=StarOversampler(proportion=1.0).resample 105 | ) 106 | # Train the model and oversample in a single function call 107 | X_oversampled, y_oversampled = vae_poly.fit_resample(X=d['data'], y=d['target'], verbose=True) 108 | ``` 109 | 110 | ### Save and load trained models 111 | First, store the model we trained in `vae_poly` to a file. Note that `base_oversampler` is not stored, only the trained 112 | model 113 | ```python 114 | vae_poly.save_model('/tmp/vae_model.pth') 115 | ``` 116 | We use the stored model by creating a new `LatentSpaceOversampler` and loading the trained model into it 117 | ```python 118 | vae_poly_loaded = LatentSpaceOversampler( 119 | model=None, 120 | base_oversampler=StarOversampler(proportion=1.0).resample 121 | ) 122 | vae_poly_loaded.load_model('/tmp/vae_model.pth') 123 | # Oversample the minority class using the stored model 124 | X_os, y_os = vae_poly_loaded.resample(d['data'], d['target'], verbose=True) 125 | ``` 126 | 127 | 128 | ## Citing TAEI 129 | 130 | If you use TAEI, please cite the following work: 131 | - S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 132 | 133 | ## References 134 | [1] S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 135 | 136 | [2] Gazzah, S. and Amara, N. E. B., "New Oversampling Approaches Based on Polynomial Fitting for Imbalanced Data Sets", 137 | 2008 The Eighth IAPR International Workshop on Document Analysis Systems, 2008, pp. 677-684 138 | 139 | [3] Gy\"orgy Kov\'acs. "smote-variants: a Python Implementation of 85 Minority Oversampling Techniques", Neurocomputing 140 | 366, 2019 -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | from .latent_space_oversampler import LatentSpaceOversampler 15 | from .models import AE, VAE 16 | from .star_oversampler import StarOversampler 17 | 18 | __all__ = ["LatentSpaceOversampler", "AE", "VAE", "StarOversampler"] 19 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/images/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-scikit-learn-extension/2412131311433addbae9f6ad5aa393a8bdbbe61f/src/sagemaker_sklearn_extension/contrib/taei/images/overview.png -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/latent_space_oversampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.utils import check_X_y 5 | 6 | 7 | class LatentSpaceOversampler: 8 | """ 9 | Implementation of the latent space minority oversampling techniques proposed in [1]. The model (autoencoder) is used 10 | to encode the samples to the latent space where the base oversampler is applied to generate new minority samples. 11 | The generated synthetic minority samples are decoded back to the original feature space using the decoder. 12 | Interpolation parameters such as the oversampling ratio are controlled by the base oversampler. 13 | 14 | Parameters 15 | ---------- 16 | model : (autoencoder) pytorch model 17 | A model to be used to encode the samples into the latent space before interpolation and from the latent space 18 | after interpolation 19 | base_oversampler : oversampler 20 | oversampler used to interpolate samples in the latent space 21 | device : 'cpu' or 'gpu' (default = 'cpu') 22 | Device used by pytorch for training the model and using the trained model for encoding/decoding 23 | random_state : int (default = 0) 24 | Random number generation seed 25 | 26 | References 27 | ---------- 28 | .. [1] S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 29 | 30 | """ 31 | 32 | def __init__(self, model, base_oversampler, device="cpu", random_state=0): 33 | self.model = model 34 | self.base_oversampler = base_oversampler 35 | self.device = device 36 | self.random_state = random_state 37 | 38 | def fit(self, X, y, validation_ratio=0.2, **kwargs): 39 | """ 40 | Train the model using gradient descent back propagation 41 | 42 | Parameters 43 | ---------- 44 | X : {array-like, sparse matrix} of shape (n_samples, n_features) 45 | Features matrix used to train the model 46 | y : vector-like of shape (n_samples, 1) 47 | The target vector used to train the model 48 | validation_ratio : float or None (default = 0.2) 49 | Ratio of samples to be used as validation set for early stopping in model training. 
If None then early 50 | stopping is not applied 51 | **kwargs: 52 | Additional arguments passed the the model internal fit function 53 | """ 54 | X, y = check_X_y(X, y) 55 | if validation_ratio: 56 | X_train, X_validation, y_train, y_validation = train_test_split( 57 | X, y, test_size=validation_ratio, stratify=y, random_state=self.random_state 58 | ) 59 | else: 60 | X_train = X 61 | y_train = y 62 | X_validation = None 63 | y_validation = None 64 | self.model.fit( 65 | X_train=X_train, 66 | y_train=y_train, 67 | X_validation=X_validation, 68 | y_validation=y_validation, 69 | device=self.device, 70 | **kwargs, 71 | ) 72 | return self 73 | 74 | def resample(self, X, y, verbose=False): 75 | """ 76 | Use the model and the base oversampler to generate synthetic minority samples 77 | """ 78 | X, y = check_X_y(X, y) 79 | self.model.eval() 80 | X = torch.Tensor(X) 81 | X = X.to(self.device) 82 | with torch.no_grad(): 83 | z = self.model.encode(X) 84 | z = z.cpu().numpy() 85 | if verbose: 86 | print(f"LatentSpaceOversampler: Shape before oversampling z:{z.shape}, y:{y.shape}") 87 | z_samples, y_samples = self.base_oversampler(z, y) 88 | if verbose: 89 | print(f"LatentSpaceOversampler: Shape after oversampling z:{z_samples.shape}, y:{y_samples.shape}") 90 | z_samples = z_samples[-(len(z_samples) - len(X)) :] 91 | y_samples = y_samples[-(len(y_samples) - len(y)) :].reshape(-1) 92 | z_samples = torch.Tensor(z_samples).to(self.device) 93 | with torch.no_grad(): 94 | x_samples = self.model.decode_sample(z_samples) 95 | X = torch.cat([X, x_samples], dim=0).cpu().numpy() 96 | y = np.concatenate((y, y_samples), axis=0) 97 | return X, y 98 | 99 | def fit_resample(self, X, y, verbose=False, **kwargs): 100 | return self.fit(X, y, verbose=verbose, **kwargs).resample(X, y, verbose=verbose) 101 | 102 | def save_model(self, path): 103 | torch.save(self.model, path) 104 | 105 | def load_model(self, path): 106 | self.model = torch.load(path) 107 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/nn_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import numpy as np 4 | 5 | 6 | class LambdaLogSoftmax(nn.Module): 7 | def __init__(self, dim): 8 | super().__init__() 9 | self.dim = dim 10 | 11 | def forward(self, *args, **kwargs): 12 | return nn.functional.log_softmax(dim=self.dim, *args, **kwargs) 13 | 14 | 15 | class GBN(torch.nn.Module): 16 | """ 17 | Ghost Batch Normalization 18 | https://arxiv.org/abs/1705.08741 19 | """ 20 | 21 | def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01): 22 | super(GBN, self).__init__() 23 | 24 | self.input_dim = input_dim 25 | self.virtual_batch_size = virtual_batch_size 26 | self.bn = nn.BatchNorm1d(self.input_dim, momentum=momentum) 27 | 28 | def forward(self, x): 29 | chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0) 30 | res = [self.bn(x_) for x_ in chunks] 31 | return torch.cat(res, dim=0) 32 | 33 | 34 | class EmbeddingGenerator(torch.nn.Module): 35 | """ 36 | Classical embeddings generator 37 | adopted from https://github.com/dreamquark-ai/tabnet/ 38 | """ 39 | 40 | def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dim=None): 41 | """ This is an embedding module for an entire set of features 42 | Parameters 43 | ---------- 44 | input_dim : int 45 | Number of features coming as input (number of columns) 46 | cat_dims : list of int 47 | Number of modalities for each 
categorial features 48 | If the list is empty, no embeddings will be done 49 | cat_idxs : list of int 50 | Positional index for each categorical features in inputs 51 | cat_emb_dim : int or list of int 52 | Embedding dimension for each categorical features 53 | If int, the same embdeding dimension will be used for all categorical features 54 | """ 55 | super(EmbeddingGenerator, self).__init__() 56 | if cat_dims == [] or cat_idxs == []: 57 | self.skip_embedding = True 58 | self.post_embed_dim = input_dim 59 | return 60 | if cat_emb_dim is None: 61 | # use heuristic 62 | cat_emb_dim = [min(600, round(1.6 * n_cats ** 0.56)) for n_cats in cat_dims] 63 | 64 | # heuristic 65 | self.skip_embedding = False 66 | if isinstance(cat_emb_dim, int): 67 | self.cat_emb_dims = [cat_emb_dim] * len(cat_idxs) 68 | else: 69 | self.cat_emb_dims = cat_emb_dim 70 | 71 | # check that all embeddings are provided 72 | if len(self.cat_emb_dims) != len(cat_dims): 73 | msg = """ cat_emb_dim and cat_dims must be lists of same length, got {len(self.cat_emb_dims)} 74 | and {len(cat_dims)}""" 75 | raise ValueError(msg) 76 | self.post_embed_dim = int(input_dim + np.sum(self.cat_emb_dims) - len(self.cat_emb_dims)) 77 | 78 | self.embeddings = torch.nn.ModuleList() 79 | 80 | # Sort dims by cat_idx 81 | sorted_idxs = np.argsort(cat_idxs) 82 | cat_dims = [cat_dims[i] for i in sorted_idxs] 83 | self.cat_emb_dims = [self.cat_emb_dims[i] for i in sorted_idxs] 84 | 85 | for cat_dim, emb_dim in zip(cat_dims, self.cat_emb_dims): 86 | self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim)) 87 | 88 | # record continuous indices 89 | self.continuous_idx = torch.ones(input_dim, dtype=torch.bool) 90 | self.continuous_idx[cat_idxs] = 0 91 | 92 | def forward(self, x): 93 | """ 94 | Apply embdeddings to inputs 95 | Inputs should be (batch_size, input_dim) 96 | Outputs will be of size (batch_size, self.post_embed_dim) 97 | """ 98 | if self.skip_embedding: 99 | # no embeddings required 100 | return x 101 | cols = [] 102 | cat_feat_counter = 0 103 | for feat_init_idx, is_continuous in enumerate(self.continuous_idx): 104 | # Enumerate through continuous idx boolean mask to apply embeddings 105 | if is_continuous: 106 | cols.append(x[:, feat_init_idx].float().view(-1, 1)) 107 | else: 108 | cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())) 109 | cat_feat_counter += 1 110 | # concat 111 | post_embeddings = torch.cat(cols, dim=1) 112 | return post_embeddings 113 | 114 | 115 | def weight_init(m): 116 | if isinstance(m, nn.Linear): 117 | nn.init.kaiming_uniform_(m.weight) 118 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/star_oversampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class StarOversampler: 5 | """ 6 | Implementation of the oversampler proposed in [1] using the `star` topology. The implementation is based on the 7 | implementation of https://github.com/analyticalmindsltd/smote_variants 8 | 9 | Parameters 10 | ---------- 11 | proportion: float (default = 1) 12 | proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of 13 | minority samples will be equal to the number of majority samples 14 | 15 | References 16 | ---------- 17 | .. [1] Gazzah, S. and Amara, N. E. B. 
"New Oversampling Approaches Based on Polynomial Fitting for Imbalanced Data 18 | Sets" The Eighth IAPR International Workshop on Document Analysis Systems 19 | """ 20 | 21 | def __init__(self, proportion=1.0): 22 | self.proportion = proportion 23 | 24 | def fit(self, X, y=None): 25 | pass 26 | 27 | def resample(self, X, y, verbose=False): 28 | """ 29 | Generate synthetic minority samples 30 | """ 31 | unique, counts = np.unique(y, return_counts=True) 32 | class_stats = dict(zip(unique, counts)) 33 | min_label = unique[0] if counts[0] < counts[1] else unique[1] 34 | maj_label = unique[1] if counts[0] < counts[1] else unique[0] 35 | 36 | # determine the number of samples to generate 37 | n_to_sample = self.det_n_to_sample(self.proportion, class_stats[maj_label], class_stats[min_label]) 38 | 39 | if n_to_sample == 0: 40 | if verbose: 41 | print("StarOversampler: Sampling is not needed") 42 | return X.copy(), y.copy() 43 | 44 | samples = [] 45 | # Implementation of the star topology 46 | X_min = X[y == min_label] 47 | X_mean = np.mean(X_min, axis=0) 48 | k = max([1, int(np.rint(n_to_sample / len(X_min)))]) 49 | for x in X_min: 50 | diff = X_mean - x 51 | for i in range(1, k + 1): 52 | samples.append(x + float(i) / (k + 1) * diff) 53 | return np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(min_label, len(samples))]) 54 | 55 | def det_n_to_sample(self, proportion, n_maj, n_min): 56 | """ 57 | Determines the number of samples to generate 58 | 59 | Parameters 60 | ---------- 61 | proportion: float 62 | proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of 63 | minority samples will be equal to the number of majority samples 64 | n_maj: int 65 | number of majority samples 66 | n_min: int 67 | number of minority samples 68 | """ 69 | return max([0, int((n_maj - n_min) * proportion)]) 70 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/decomposition/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | The :mod:`sagemaker_sklearn_extension.decomposition` module includes matrix decomposition algorithms. 16 | """ 17 | 18 | from .robust_pca import RobustPCA 19 | 20 | __all__ = [ 21 | "RobustPCA", 22 | ] 23 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/decomposition/robust_pca.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. 
This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from scipy.sparse import issparse 15 | 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.decomposition import PCA, TruncatedSVD 18 | from sklearn.utils.validation import check_array, check_is_fitted 19 | 20 | 21 | class RobustPCA(BaseEstimator, TransformerMixin): 22 | """RobustPCA dimension reduction for dense and sparse matrices. 23 | 24 | RobustPCA uses a different implementation of singular value decomposition depending on the input. 25 | - ``sklearn.decomposition.PCA`` for dense inputs 26 | - ``sklearn.decomposition.TruncatedSVD`` for sparse inputs 27 | 28 | Please see ``sklearn.decomposition.PCA`` or ``sklearn.decomposition.TruncatedSVD`` for more details. 29 | 30 | If input number of features (input dimension) is less than or equal to n_components (target dimension), then no dimension 31 | reduction will be performed. The output will be the same as the input. 32 | 33 | Parameters 34 | ---------- 35 | n_components : int, optional (default=1000) 36 | Desired dimensionality of output data. 37 | Must be strictly less than the number of features. If n_components is greater than or equal to the number of features, 38 | no dimension reduction will be performed. 39 | 40 | svd_solver : string, optional (default='auto') 41 | 42 | - If 'auto', the solver is selected by a default policy based on `X.shape` and `n_components`: if the input 43 | data is larger than 500x500 and the number of components to extract is lower than 80% of the smallest 44 | dimension of the data, then the more efficient 'randomized' method is enabled. Otherwise the exact full 45 | RobustPCA is computed and optionally truncated afterwards. 46 | Note: the 'auto' option is only available for dense inputs. If 'auto' and input is sparse, svd_solver will use 47 | 'randomized' 48 | - If 'full', run exact full RobustPCA calling the standard LAPACK solver via `scipy.linalg.svd` and select the 49 | components by postprocessing. 50 | Note: the 'full' option is only available for dense inputs. If 'full' and input is sparse, svd_solver will use 51 | 'randomized' 52 | - If 'arpack', run RobustPCA truncated to n_components calling ARPACK solver via `scipy.sparse.linalg.svds`. 53 | 'arpack' requires strictly 0 < n_components < min(n_samples, n_features) 54 | - If 'randomized', run randomized RobustPCA by the method of Halko et al. 55 | 56 | iterated_power : int >= 0 or 'auto', optional (default='auto') 57 | Number of iterations for the power method computed by 58 | svd_solver == 'randomized'. 59 | Note: If 'auto' and input is sparse, default for `iterated_power` is 5. 60 | 61 | tol : float >= 0, optional (default=0.) 62 | Tolerance for singular values computed by svd_solver == 'arpack'. 0 means machine precision. Ignored by 63 | randomized RobustPCA solver. 64 | 65 | random_state : int, RandomState instance, or None, optional (default=None) 66 | - If int, random_state is the seed used by the random number generator; 67 | - If RandomState instance, random_state is the random number generator; 68 | - If None, the random number generator is the RandomState instance used 69 | by np.random. Used when svd_solver == 'arpack' or 'randomized'.
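    For illustration, a minimal sketch of the reduction and pass-through behaviour on a toy dense matrix
    (the shapes below are arbitrary):

    >>> import numpy as np
    >>> from sagemaker_sklearn_extension.decomposition import RobustPCA
    >>> X = np.random.rand(20, 5)
    >>> RobustPCA(n_components=3).fit_transform(X).shape
    (20, 3)
    >>> RobustPCA(n_components=10).fit_transform(X).shape  # n_components >= n_features: returned unchanged
    (20, 5)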
70 | 71 | 72 | Attributes 73 | ---------- 74 | robust_pca_ : ``sklearn.decomposition.PCA``, ``sklearn.decomposition.TruncatedSVD``, or None 75 | - If input number of features (input dimension) is less than or equal to n_components (target dimension), then `robust_pca_` will 76 | be set to None and no dimension reduction will be performed. The output will be the same as the input. 77 | 78 | Assuming number of features is more than n_components: 79 | - If input is sparse, `robust_pca_` is ``sklearn.decomposition.TruncatedSVD``. 80 | - If input is dense, `robust_pca_` is ``sklearn.decomposition.PCA`` 81 | 82 | Notes 83 | ----- 84 | For dense inputs, ``sklearn.decomposition.PCA`` will center the input data by per-feature mean subtraction before 85 | RobustPCA. Sparse inputs will not center data. 86 | """ 87 | 88 | def __init__(self, n_components=1000, svd_solver="auto", iterated_power="auto", tol=0.0, random_state=None): 89 | self.n_components = n_components 90 | self.svd_solver = svd_solver 91 | self.iterated_power = iterated_power 92 | self.tol = tol 93 | self.random_state = random_state 94 | 95 | def fit(self, X, y=None): 96 | """Fit the model with X. 97 | 98 | Parameters 99 | ---------- 100 | X : array-like, shape (n_samples, n_features) 101 | Training data. 102 | 103 | Returns 104 | ------- 105 | self : RobustPCA 106 | """ 107 | X = check_array(X, accept_sparse=True, dtype=None) 108 | 109 | # if input dimension is less than or equal to target dimension, no reduction will be performed 110 | if X.shape[1] <= self.n_components: 111 | self.robust_pca_ = None 112 | return self 113 | 114 | # fit for sparse or dense input 115 | if issparse(X): 116 | algorithm = self.svd_solver if self.svd_solver == "arpack" else "randomized" 117 | n_iter = self.iterated_power if self.iterated_power != "auto" else 5 118 | 119 | self.robust_pca_ = TruncatedSVD( 120 | n_components=self.n_components, 121 | algorithm=algorithm, 122 | n_iter=n_iter, 123 | random_state=self.random_state, 124 | tol=self.tol, 125 | ) 126 | else: 127 | self.robust_pca_ = PCA( 128 | n_components=self.n_components, 129 | svd_solver=self.svd_solver, 130 | tol=self.tol, 131 | iterated_power=self.iterated_power, 132 | random_state=self.random_state, 133 | ) 134 | 135 | self.robust_pca_.fit(X) 136 | return self 137 | 138 | def transform(self, X, y=None): 139 | """Apply the dimensionality reduction on X. 140 | 141 | Parameters 142 | ---------- 143 | X : array-like, shape (n_samples, n_features) 144 | Training data 145 | 146 | Returns 147 | ------- 148 | X : array-like, shape (n_samples, n_features) 149 | or 150 | X_new : array-like, shape (n_samples, n_components) 151 | 152 | """ 153 | check_is_fitted(self, "robust_pca_") 154 | 155 | if self.robust_pca_: 156 | return self.robust_pca_.transform(X) 157 | return X 158 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/externals/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied.
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from .automl_transformer import AutoMLTransformer 15 | from .header import Header 16 | from .read_data import read_csv_data 17 | 18 | __all__ = [ 19 | "AutoMLTransformer", 20 | "Header", 21 | "read_csv_data", 22 | ] 23 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/externals/automl_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | 16 | from scipy.sparse import isspmatrix 17 | from sklearn.base import BaseEstimator 18 | from sklearn.base import TransformerMixin 19 | 20 | 21 | class AutoMLTransformer(BaseEstimator, TransformerMixin): 22 | """Utility class encapsulating feature and target transformation functionality used in AutoML pipelines. 23 | 24 | Parameters 25 | ---------- 26 | header : Header instance 27 | Instance of the ``Header`` class from ``sagemaker_sklearn_extension.externals``. Contains indices of the 28 | features and response in the corresponding dataset. 29 | 30 | feature_transformer : transformer instance 31 | A Scikit-Learn transformer used on the feature columns in the dataset. Should have ``fit`` and ``transform`` 32 | methods which accept 2-dimensional inputs. 33 | 34 | target_transformer : transformer instance 35 | A Scikit-Learn transformer used on the target column in the dataset. Should have ``fit``, ``transform``, and 36 | optionally ``inverse_transform`` methods which accept 1-dimensional inputs. 37 | """ 38 | 39 | def __init__(self, header, feature_transformer, target_transformer): 40 | self.header = header 41 | self.feature_transformer = feature_transformer 42 | self.target_transformer = target_transformer 43 | 44 | def fit(self, X, y): 45 | """Fit and transform target, then fit feature data using the underlying transformers. 46 | 47 | Parameters 48 | ---------- 49 | X : numpy array of shape [n_samples, n_features] 50 | The feature-only dataset. 51 | 52 | y : numpy array of shape [n_samples] 53 | The target column. 54 | 55 | Returns 56 | ------- 57 | self : AutoMLTransformer 58 | """ 59 | y_transformed = y 60 | 61 | if self.target_transformer: 62 | y_transformed = self.target_transformer.fit_transform(y) 63 | 64 | self.feature_transformer.fit(X, y_transformed) 65 | return self 66 | 67 | def transform(self, X): 68 | """Transform the dataset using the underlying transformers. 69 | 70 | Depending on the shape of the input, it transforms either the feature data, or the feature data and the target 71 | column and then concatenates them back into a single dataset. 
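        For illustration, a rough sketch of the two accepted input shapes (the ``Header``, ``StandardScaler``
        and ``LabelEncoder`` used here are an arbitrary choice of transformers, not mandated by this class):

        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder, StandardScaler
        >>> from sagemaker_sklearn_extension.externals import AutoMLTransformer, Header
        >>> header = Header(column_names=["f1", "f2", "label"], target_column_name="label")
        >>> amt = AutoMLTransformer(header, feature_transformer=StandardScaler(), target_transformer=LabelEncoder())
        >>> _ = amt.fit(np.array([[1.0, 2.0], [3.0, 4.0]]), np.array(["a", "b"]))
        >>> amt.transform(np.array([[1.0, 2.0], [3.0, 4.0]])).shape  # features only
        (2, 2)
        >>> full = np.array([[1.0, 2.0, "a"], [3.0, 4.0, "b"]], dtype=object)
        >>> amt.transform(full).shape  # features plus target: the transformed label is prepended
        (2, 3)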
72 | 73 | Parameters 74 | ---------- 75 | X : numpy array 76 | The array to transform whose shape should be either: 77 | - [n_samples, n_features], if it only contains the features; or 78 | - [n_samples, n_features + 1], if it contains the feature columns and the target column. 79 | 80 | Returns 81 | ------- 82 | array-like of shape [n_samples, n_transformed_features] or [n_samples, n_transformed_features + 1] 83 | """ 84 | n_columns = X.shape[1] 85 | n_features = len(self.header.feature_column_indices) 86 | 87 | # X contains both features and response. 88 | if n_columns == n_features + 1: 89 | y = X[:, self.header.target_column_index] 90 | y_transformed = self.label_transform(y) 91 | non_nan_indices = np.arange(y_transformed.shape[0])[~np.isnan(y_transformed)] 92 | feature_indices = np.array(self.header.feature_column_indices) 93 | X_transformed = self.feature_transformer.transform( 94 | X[non_nan_indices[:, np.newaxis], feature_indices[np.newaxis, :]] 95 | ) 96 | y_transformed_no_nans = y_transformed[non_nan_indices] 97 | return np.column_stack((y_transformed_no_nans, self._dense_array(X_transformed))) 98 | 99 | # X contains only the features. 100 | if n_columns == n_features: 101 | return self.feature_transformer.transform(X) 102 | 103 | raise ValueError( 104 | f"Received data of unknown size. Expected number of columns is {n_features}. " 105 | f"Number of columns in the received data is {n_columns}." 106 | ) 107 | 108 | def label_transform(self, y): 109 | """Apply transformation, if ``target_transformer`` has been specified. 110 | 111 | Parameters 112 | ---------- 113 | y : array-like, 1-dimensional 114 | 115 | Returns 116 | ------- 117 | array-like 118 | The transformed data. If target transformer has not been specified, simply returns the input. 119 | """ 120 | if self.target_transformer: 121 | return self.target_transformer.transform(y) 122 | 123 | return y.astype("float32") 124 | 125 | def inverse_label_transform(self, yt): 126 | """Apply inverse target transformation, if ``target_transformer`` has been specified set. 127 | 128 | Parameters 129 | ---------- 130 | yt : array-like, 1-dimensional 131 | 132 | Returns 133 | ------- 134 | array-like 135 | The inverse-transformed target. If target transformer has not been specified, simply returns the input. 136 | """ 137 | if not self.target_transformer: 138 | return yt 139 | 140 | return self.target_transformer.inverse_transform(yt) 141 | 142 | @staticmethod 143 | def _dense_array(arr): 144 | """Converts the input array to dense array. 145 | 146 | Parameters 147 | ---------- 148 | arr : numpy array or csr_matrix 149 | The array to be densified. 150 | 151 | Returns 152 | ------- 153 | array-like 154 | Dense numpy array representing arr. 155 | 156 | """ 157 | if isspmatrix(arr): 158 | return arr.todense() 159 | return arr 160 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/externals/header.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. 
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from collections import defaultdict 15 | from collections import namedtuple 16 | from collections import OrderedDict 17 | 18 | 19 | Indices = namedtuple("Indices", field_names=("column_index", "feature_index")) 20 | 21 | 22 | class Header: 23 | """ A utility class to manage the header and target column. The header contains the names for 24 | all columns in a dataset including the target column. This class validates the header, 25 | checking for presence of duplicate column names and absence of target column name. 26 | 27 | This class provides functionality to translate the column names to column indices (data set including target column) 28 | and feature indices (data set excluding target column) respectively. 29 | 30 | This class is used in the code generated by the SageMaker Pipeline Recommender algorithm. 31 | 32 | Usage 33 | ------ 34 | >>> h = Header(column_names=['a', 'b', 'c'], target_column_name='b') 35 | >>> h.as_column_indices(['a', 'c']) 36 | [0, 2] 37 | 38 | >>> h.as_feature_indices(['a', 'c']) 39 | [0, 1] 40 | 41 | >>> h.target_column_name 42 | b 43 | 44 | >>> h.target_column_index 45 | 1 46 | 47 | >>> h.as_column_indices(['b']) 48 | [1] 49 | 50 | """ 51 | 52 | def __init__(self, column_names: list, target_column_name: str): 53 | """ 54 | Parameters 55 | ---------- 56 | column_names : iterable of the column names in the order of occurrence 57 | 58 | target_column_name : str, name of the target column 59 | 60 | Raises 61 | ------ 62 | 63 | ValueError : target_column_name is not present in column_names or duplicate entries found in column_names 64 | """ 65 | 66 | self.target_column_index = None 67 | self.target_column_name = target_column_name 68 | 69 | # maintaining a dict{column_name: Indices} 70 | self._column_name_indices = OrderedDict() 71 | 72 | feature_index_offset = 0 73 | duplicate_column_indices = defaultdict(list) 74 | 75 | for i, column_name in enumerate(column_names): 76 | # already seen the column, add to duplicate_column_indices 77 | if column_name in self._column_name_indices: 78 | duplicate_column_indices[column_name].append(i) 79 | else: 80 | self._column_name_indices[column_name] = Indices(column_index=i, feature_index=i - feature_index_offset) 81 | 82 | # if it's target column, setup target_index and adjust the feature index 83 | # offset for following features columns 84 | if column_name == target_column_name: 85 | self.target_column_index = i 86 | feature_index_offset = 1 87 | self._column_name_indices[column_name] = Indices(column_index=i, feature_index=None) 88 | 89 | if self.target_column_index is None: 90 | raise ValueError( 91 | "Specified target column '{target_column_name}' is " 92 | "not a valid column name.".format(target_column_name=target_column_name) 93 | ) 94 | 95 | if duplicate_column_indices: 96 | raise ValueError( 97 | "Duplicate column names were found:\n{}".format( 98 | "\n".join( 99 | [ 100 | "{name} at index {index}".format(name=name, index=index) 101 | for (name, index) in duplicate_column_indices.items() 102 | ] 103 | ) 104 | ) 105 | ) 106 | 107 | def as_feature_indices(self, column_names: list) -> list: 108 | """ Returns list of feature indices for the given column names. 
109 | 110 | Parameters 111 | ---------- 112 | column_names : iterable containing feature names 113 | 114 | Returns 115 | ------- 116 | feature_indices : iterable containing the indices corresponding to column_names, 117 | assuming target column excluded. 118 | 119 | Raises 120 | ------ 121 | ValueError : At least one of the items in column_names is not a feature name. 122 | 123 | """ 124 | 125 | def _index(name): 126 | 127 | if self.target_column_name == name: 128 | raise ValueError( 129 | "'{}' is the target column name. " "It cannot be converted to feature index.".format(name) 130 | ) 131 | 132 | try: 133 | return self._column_name_indices[name].feature_index 134 | except KeyError: 135 | raise ValueError("'{}' is an unknown feature name".format(name)) 136 | 137 | return [_index(name) for name in column_names] 138 | 139 | def as_column_indices(self, column_names: list) -> list: 140 | """ Returns list of indices for the given column names. 141 | 142 | Parameters 143 | ---------- 144 | column_names : iterable containing column names 145 | 146 | Returns 147 | ------- 148 | column_indices : iterable containing the indices corresponding to column names, 149 | assuming target column is included in the data. 150 | 151 | Raises 152 | ------ 153 | ValueError : Unknown column name is found in column_names 154 | 155 | """ 156 | 157 | def _index(name): 158 | try: 159 | return self._column_name_indices[name].column_index 160 | except KeyError: 161 | raise ValueError("'{}' is an unknown column name.".format(name)) 162 | 163 | return [_index(name) for name in column_names] 164 | 165 | @property 166 | def feature_column_indices(self): 167 | """Returns list of feature column indices in the order in which they were provided. 168 | 169 | The order of the indices is determined by the ``column_names`` parameter. 170 | 171 | Returns 172 | ------- 173 | feature_column_indices : list of int 174 | """ 175 | return [ 176 | index_instance.column_index 177 | for index_instance in self._column_name_indices.values() 178 | if index_instance.feature_index is not None 179 | ] 180 | 181 | @property 182 | def num_columns(self): 183 | """ Returns number of columns including target column. 184 | 185 | Returns 186 | ------- 187 | num_columns : integer, Number of columns. 188 | """ 189 | return len(self._column_name_indices) 190 | 191 | @property 192 | def num_features(self): 193 | """ Returns number of features, i.e. the number of columns excluding target column. 194 | 195 | Returns 196 | ------- 197 | num_features : integer, Number of features. 198 | 199 | """ 200 | return len(self._column_name_indices) - 1 201 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/feature_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | """ 15 | The :mod:`sagemaker_sklearn_extension.feature_extraction` module deals 16 | with feature extraction from raw data. It currently includes estimators 17 | to extract features from text. This module is based on the 18 | :mod:`sklearn.feature_extraction` module. 19 | """ 20 | 21 | from . import date_time 22 | from . import sequences 23 | from . import text 24 | 25 | __all__ = ["date_time", "sequences", "text"] 26 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/feature_extraction/date_time.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from datetime import datetime 15 | from enum import Enum 16 | 17 | from dateutil import parser 18 | import numpy as np 19 | from sklearn.base import BaseEstimator, TransformerMixin 20 | from sklearn.utils.validation import check_array, check_is_fitted 21 | 22 | 23 | class DateTimeProperty: 24 | def __init__(self, extract_func, max_, min_): 25 | """Contains information about a property of a datetime object 26 | 27 | Parameters 28 | ---------- 29 | extract_func: function 30 | function mapping a datetime object to the property 31 | max_: int 32 | maximum value for the property 33 | min_: int 34 | minimum value for the property 35 | """ 36 | self.min = min_ 37 | self.max = max_ 38 | self.extract_func = extract_func 39 | 40 | 41 | def extract_week_of_year(t): 42 | return t.isocalendar()[1] if isinstance(t, datetime) else np.nan 43 | 44 | 45 | def extract_weekday(t): 46 | return t.isocalendar()[2] if isinstance(t, datetime) else np.nan 47 | 48 | 49 | def extract_year(t): 50 | return t.year if isinstance(t, datetime) else np.nan 51 | 52 | 53 | def extract_hour(t): 54 | return t.hour if isinstance(t, datetime) else np.nan 55 | 56 | 57 | def extract_month(t): 58 | return t.month if isinstance(t, datetime) else np.nan 59 | 60 | 61 | def extract_minute(t): 62 | return t.minute if isinstance(t, datetime) else np.nan 63 | 64 | 65 | def extract_quarter(t): 66 | return (t.month - 1) // 3 + 1 if isinstance(t, datetime) else np.nan 67 | 68 | 69 | def extract_second(t): 70 | return t.second if isinstance(t, datetime) else np.nan 71 | 72 | 73 | def extract_day_of_year(t): 74 | return t.timetuple().tm_yday if isinstance(t, datetime) else np.nan 75 | 76 | 77 | def extract_day_of_month(t): 78 | return t.day if isinstance(t, datetime) else np.nan 79 | 80 | 81 | class DateTimeDefinition(Enum): 82 | WEEK_OF_YEAR = DateTimeProperty(extract_week_of_year, 53, 1) 83 | WEEKDAY = DateTimeProperty(extract_weekday, 7, 1) 84 | YEAR = DateTimeProperty(extract_year, None, None) 85 | HOUR = DateTimeProperty(extract_hour, 23, 0) 86 | MONTH = DateTimeProperty(extract_month, 12, 1) 87 | MINUTE = DateTimeProperty(extract_minute, 59, 0) 88 | QUARTER = DateTimeProperty(extract_quarter, 4, 1) 89 | SECOND = DateTimeProperty(extract_second, 59, 0) 90 | DAY_OF_YEAR = 
DateTimeProperty(extract_day_of_year, 366, 1) 91 | DAY_OF_MONTH = DateTimeProperty(extract_day_of_month, 31, 1) 92 | 93 | 94 | class DateTimeVectorizer(BaseEstimator, TransformerMixin): 95 | def __init__(self, extract=None, mode="cyclic", ignore_constant_columns=True, default_datetime=None): 96 | """Converts array-like data with datetime.datetime or strings describing datetime objects into numeric features 97 | 98 | A datetime item contains categorical information: year, month, hour, day of week, etc. This information is given 99 | as the output features. The encoding of these categories can be ordinal or cyclic. The cyclic encoding of an 100 | integer i between 0 and k consists of two floats: sin(i/k), cos(i/k). This makes sure for example that the 101 | months Decembers and January are encoded to vectors that are close in Euclidean distance. 102 | 103 | Parameters 104 | ---------- 105 | extract: list of DateTimeProperty, default None 106 | Types of data to extract. See DateTimeDefinition class for options. If given None, 107 | defaults to DateTimeVectorizer.default_data 108 | mode: str, default cyclic 109 | 'ordinal': each data type is outputted to a non-negative integer, as in ordinal encoding for categorical 110 | data 111 | 'cyclic': each data type is converted to two numbers in [-1,1] so that the distance between these numbers 112 | is small for close items in the cyclic order (for example hour=23 is close to hour=0) 113 | ignore_constant_columns: bool, default True 114 | If True, fit will make sure the output columns are not constant in the training set. 115 | default_datetime: DateTime, default None 116 | Default DateTime object to use when information is missing from input array. This DateTime object is passed 117 | as a keyword argument into the dateutil.parser.parse method. If this is a datetime object and not None, 118 | elements specified in the parse method replace elements in the default object. 119 | When ignore_constant_columns is True, the filled DateTime information will be removed if constant. 120 | 121 | Attributes 122 | ---------- 123 | extract_ : list of DateTimeProperty 124 | List of DateTimeProperty objects, each providing the necessary information for extracting a single property 125 | from a datetime object. The properties corresponding to this list describe the different columns of the 126 | output of the transform function 127 | 128 | 129 | Examples 130 | -------- 131 | >>> from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer 132 | >>> import numpy as np 133 | >>> data = np.array([ 134 | ... 'Jan 3th, 2018, 1:34am', 135 | ... 'Feb 11th, 2012, 11:34:59pm', 136 | ... 
]).reshape((-1, 1)) 137 | >>> date_time = DateTimeVectorizer(mode='ordinal', ignore_constant_columns=False) 138 | >>> X = date_time.fit_transform(data) 139 | >>> print(X.shape) 140 | (2, 7) 141 | >>> print(X[0].astype(np.int)) 142 | [ 2 2018 1 34 0 0 0] 143 | >>> date_time = DateTimeVectorizer(mode='ordinal') 144 | >>> # with ignore_constant_columns=True, the minute field, which is 34 in both examples, will be filtered 145 | >>> X = date_time.fit_transform(data) 146 | >>> print(X.shape) 147 | (2, 6) 148 | >>> print(X[0].astype(np.int)) 149 | [ 2 2018 1 0 0 0] 150 | 151 | 152 | 153 | """ 154 | self.extract = extract 155 | self.mode = mode 156 | self.ignore_constant_columns = ignore_constant_columns 157 | self.default_datetime = default_datetime 158 | 159 | @staticmethod 160 | def _cyclic_transform(data, low, high): 161 | """ 162 | Converts numeric data into 2d-cyclic. 163 | 164 | The conversion of a single integer into two floats makes sure that the Euclidian distance between two (output) 165 | values is similar to the cyclic distance between the integers. For example, hour of day is a number between 0 166 | and 23. The cyclic distance between the hours 0 and 23 is 1 (and not 23). After the cyclic transform, the 167 | transformed hour 0 will be a vector very close to that of the hour 23, and far away from that of 12. 168 | 169 | Parameters 170 | ---------- 171 | data: np.array of numbers 172 | low: lower bound of the data values 173 | high: upper bound of the data values 174 | 175 | Returns 176 | ------- 177 | np.array with double the dimension in the last axis 178 | 179 | Examples 180 | -------- 181 | >>> from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer 182 | >>> output = DateTimeVectorizer._cyclic_transform(np.array([[1],[2],[3],[4]]), low=1, high=4) 183 | >>> # up to numeric precision, the outputs should be [[0,1], [1,0], [0,-1], [-1,0]] 184 | >>> print(output) 185 | [[ 0.0000000e+00 1.0000000e+00] 186 | [ 1.0000000e+00 6.1232340e-17] 187 | [ 1.2246468e-16 -1.0000000e+00] 188 | [-1.0000000e+00 -1.8369702e-16]] 189 | >>> output = DateTimeVectorizer._cyclic_transform(np.array([[1],[2],[3],[4],[5],[6],[7],[8]]), low=1, high=8) 190 | >>> print(output) 191 | [[ 0.00000000e+00 1.00000000e+00] 192 | [ 7.07106781e-01 7.07106781e-01] 193 | [ 1.00000000e+00 6.12323400e-17] 194 | [ 7.07106781e-01 -7.07106781e-01] 195 | [ 1.22464680e-16 -1.00000000e+00] 196 | [-7.07106781e-01 -7.07106781e-01] 197 | [-1.00000000e+00 -1.83697020e-16] 198 | [-7.07106781e-01 7.07106781e-01]] 199 | """ 200 | normalized = (data - low) * 2 * np.pi / (1 + high - low) 201 | sin_values = np.sin(normalized) 202 | cos_values = np.cos(normalized) 203 | 204 | shape = list(sin_values.shape) 205 | 206 | tmp_shape = tuple(shape + [1]) 207 | sin_values = sin_values.reshape(tmp_shape) 208 | cos_values = cos_values.reshape(tmp_shape) 209 | ret = np.concatenate((sin_values, cos_values), axis=len(tmp_shape) - 1) 210 | 211 | shape[-1] *= 2 212 | return ret.reshape(tuple(shape)) 213 | 214 | default_data = [ 215 | DateTimeDefinition.WEEKDAY.value, 216 | DateTimeDefinition.YEAR.value, 217 | DateTimeDefinition.HOUR.value, 218 | DateTimeDefinition.MINUTE.value, 219 | DateTimeDefinition.SECOND.value, 220 | DateTimeDefinition.MONTH.value, 221 | DateTimeDefinition.WEEK_OF_YEAR.value, 222 | ] 223 | 224 | def _to_datetime_single(self, item): 225 | if isinstance(item, datetime): 226 | return item 227 | try: 228 | return parser.parse(item, default=self.default_datetime) 229 | except ValueError: 230 | pass 231 | 
except TypeError: 232 | pass 233 | 234 | def _to_datetime_array(self, X): 235 | """Converts np array with string or datetime into datetime or None 236 | 237 | Parameters 238 | ---------- 239 | X : np.array 240 | numpy array containing data representing datetime objects 241 | 242 | Returns 243 | ------- 244 | X : np.array 245 | np.array with datetime objects of the same shape of the input. Items that could not be parsed become None 246 | 247 | """ 248 | X = np.vectorize(DateTimeVectorizer._to_datetime_single)(self, X) 249 | return X 250 | 251 | def fit(self, X, y=None): 252 | """Filter the extracted field so as not to contain constant columns. 253 | 254 | Parameters 255 | ---------- 256 | X : {array-like}, datetime.datetime or str 257 | 258 | Notes 259 | ----- 260 | If fitting with a 2d array with more than one column, any data type that is not constant in any column will 261 | remain. If for example, column 1 has year=1999 for all rows but column 2 has two or more possible year values, 262 | we will still produce an output with the year information from column 1. To avoid this, run fit on each column 263 | separately, and obtain a separate DateTimeVectorizer for each column 264 | 265 | Returns 266 | ------- 267 | self : DateTimeVectorizer 268 | """ 269 | 270 | X = check_array(X, dtype=None, force_all_finite="allow-nan") 271 | X = np.array(X) 272 | X = self._to_datetime_array(X) 273 | 274 | if self.mode not in ["cyclic", "ordinal"]: 275 | raise ValueError("mode must be either cyclic or ordinal. Current value is {}".format(self.mode)) 276 | 277 | self.extract_ = self.extract or self.default_data 278 | 279 | if self.ignore_constant_columns: 280 | new_extract = [] 281 | for col in range(X.shape[1]): 282 | # convert the current column to get the different property values 283 | transformed = self._convert(X[:, col].reshape((-1, 1)), mode="ordinal") 284 | # check for constant columns 285 | transformed_var = np.nanvar(transformed, axis=0) 286 | for i, cur_var in enumerate(transformed_var): 287 | if cur_var > 0 and self.extract_[i] not in new_extract: 288 | new_extract.append(self.extract_[i]) 289 | if not new_extract: 290 | new_extract = [self.extract_[0]] 291 | self.extract_ = new_extract 292 | 293 | return self 294 | 295 | def _convert(self, X, mode): 296 | n_cols = X.shape[1] 297 | 298 | cols = [] 299 | 300 | for datetime_property in self.extract_: 301 | # apply the function on the datetime values in the input array, create a python list. To iterate over all 302 | # items we view the input as a 1d vector 303 | cur_conversions = list(map(datetime_property.extract_func, X.reshape((-1,)))) 304 | # convert the list to a float32 numpy array 305 | cur_extract = np.array(cur_conversions, dtype=np.float32).reshape((-1, 1)) 306 | if datetime_property.min is None: 307 | # the output isn't cyclic. Leave it as is 308 | pass 309 | elif mode == "ordinal": 310 | # the output is ordinal - shift it so the minimum value is 0 311 | cur_extract -= datetime_property.min 312 | elif mode == "cyclic": 313 | # the output is cyclic - need to apply the cyclic transform 314 | cur_extract = self._cyclic_transform(cur_extract, low=datetime_property.min, high=datetime_property.max) 315 | 316 | cols.append(cur_extract) 317 | 318 | ret = np.concatenate(cols, axis=1) 319 | # the return array is in 1d form. 
We need to reshape it to bring it back to the correct 2d form 320 | ret = ret.reshape((-1, n_cols * ret.shape[1])) 321 | return ret 322 | 323 | def transform(self, X, y=None): 324 | X = check_array(X, dtype=None, force_all_finite="allow-nan") 325 | check_is_fitted(self, "extract_") 326 | 327 | X = np.array(X) 328 | X = self._to_datetime_array(X) 329 | 330 | return self._convert(X, self.mode) 331 | 332 | def _more_tags(self): 333 | return {"X_types": ["datetime.datetime", "string"]} 334 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/feature_extraction/text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import scipy.sparse as sp 16 | 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.feature_extraction.text import VectorizerMixin, TfidfVectorizer 19 | from sklearn.utils.validation import check_array, check_is_fitted 20 | 21 | 22 | class MultiColumnTfidfVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): 23 | """Applies ``sklearn.feature_extraction.text.TfidfVectorizer`` to each column in an array. 24 | 25 | Each column of text is treated separately with a unique TfidfVectorizer. The vectorizers are applied sequentially. 26 | 27 | Parameters 28 | ---------- 29 | strip_accents : {'ascii', 'unicode', None} (default=None) 30 | Remove accents and perform other character normalization during the preprocessing step. 31 | 'ascii' is a fast method that only works on characters that have an direct ASCII mapping. 32 | 'unicode' is a slightly slower method that works on any characters. 33 | None (default) does nothing. 34 | 35 | Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. 36 | 37 | lowercase : boolean (default=True) 38 | Convert all characters to lowercase before tokenizing. 39 | 40 | preprocessor : callable or None (default=None) 41 | Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams 42 | generation steps. 43 | 44 | tokenizer : callable or None (default=None) 45 | Override the string tokenization step while preserving the preprocessing and n-grams generation steps. 46 | Only applies if ``analyzer == 'word'``. 47 | 48 | stop_words : string {'english'}, list, or None (default) 49 | If 'english', a built-in stop word list for English is used. 50 | There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). 51 | 52 | If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. 53 | Only applies if ``analyzer == 'word'``. 54 | 55 | If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically 56 | detect and filter stop words based on intra corpus document frequency of terms. 
57 | 58 | token_pattern : string 59 | Regular expression denoting what constitutes a "token", only used if ``analyzer == 'word'``. The default regexp 60 | select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a 61 | token separator). 62 | 63 | ngram_range : tuple (min_n, max_n) (default=(1, 1)) 64 | The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n 65 | such that min_n <= n <= max_n will be used. 66 | 67 | analyzer : string, {'word', 'char', 'char_wb'} or callable 68 | Whether the feature should be made of word or character n-grams. 69 | Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words 70 | are padded with space. 71 | 72 | If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. 73 | 74 | max_df : float in range [0.0, 1.0] or int (default=1.0) 75 | When building the vocabulary ignore terms that have a document frequency strictly higher than the given 76 | threshold (corpus-specific stop words). 77 | If float, the parameter represents a proportion of documents, integer absolute counts. 78 | This parameter is ignored if vocabulary is not None. 79 | 80 | min_df : float in range [0.0, 1.0] or int (default=1) 81 | When building the vocabulary ignore terms that have a document frequency strictly lower than the given 82 | threshold. This value is also called cut-off in the literature. 83 | If float, the parameter represents a proportion of documents, integer absolute counts. 84 | This parameter is ignored if vocabulary is not None. 85 | 86 | max_features : int or None (default=1000) 87 | If not None, build a vocabulary that only consider the top max_features ordered by term frequency across 88 | the corpus. 89 | This parameter is ignored if vocabulary is not None. 90 | 91 | vocabulary : Mapping or iterable, optional (default=None) 92 | Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an 93 | iterable over terms. If not given, a vocabulary is determined from the input. 94 | 95 | dtype : type, optional (default=float64) 96 | Type of the matrix returned by fit_transform() or transform(). 97 | 98 | norm : 'l1', 'l2' or None, optional (default='l2') 99 | Each output row will have unit norm, either: 100 | * 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product 101 | when l2 norm has been applied. 102 | * 'l1': Sum of absolute values of vector elements is 1. 103 | See :func:`preprocessing.normalize` 104 | 105 | use_idf : boolean (default=True) 106 | Enable inverse-document-frequency reweighting. 107 | 108 | smooth_idf : boolean (default=True) 109 | Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every 110 | term in the collection exactly once. Prevents zero divisions. 111 | 112 | sublinear_tf : boolean (default=False) 113 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 114 | 115 | vocabulary_sizes : list(int) (default=None) 116 | Specify the exact vocabulary size to use while encoding each column in the input dataset. The vocabulary size 117 | of a column corresponds to the number of features in its TF-IDF encoding, before the feature matrices are 118 | concatenated. 
If the feature matrix of column ``i`` has more features than the corresponding vocabulary size, 119 | only the first ``vocabulary_sizes[i]`` features are kept. If the feature matrix of column ``i`` has fewer 120 | features than the corresponding vocabulary size, zero columns are added to the feature matrix until it has 121 | ``vocabulary_sizes[i]`` features. This parameter is useful if the total number of features of the encoding 122 | has to be constant. 123 | 124 | ignore_columns_with_zero_vocabulary_size : boolean (default=True) 125 | Allow ValueErrors thrown by ``sklearn.feature_extraction.text.TfidfVectorizer`` because of over-pruning 126 | of terms to be ignored and an empty ``scipy.sparse.csr_matrix`` to be used in place of the given columns 127 | TF-IDF document-term matrix. 128 | 129 | Attributes 130 | ---------- 131 | vectorizers_ : list of ``sklearn.feature_extraction.text.TfidfVectorizers`` 132 | List of ``sklearn.feature_extraction.text.TfidfVectorizers``. Each TfidfVectorizer is separately instantiated 133 | on an input column. len(self.vectorizers_) should equal to the number of input columns. 134 | 135 | Notes 136 | ----- 137 | MultiColumnTfidfVectorizer should be used with 2D arrays of text strings, for 1D arrays of text data, use 138 | ``sklearn.feature_extraction.text.TfidfVectorizer`` or reshape using array.reshape(-1, 1) 139 | """ 140 | 141 | def __init__( 142 | self, 143 | strip_accents=None, 144 | lowercase=True, 145 | preprocessor=None, 146 | tokenizer=None, 147 | stop_words=None, 148 | token_pattern=r"(?u)\b\w\w+\b", 149 | ngram_range=(1, 1), 150 | analyzer="word", 151 | max_df=1.0, 152 | min_df=1, 153 | max_features=1000, 154 | vocabulary=None, 155 | dtype=np.float64, 156 | norm="l2", 157 | use_idf=True, 158 | smooth_idf=True, 159 | sublinear_tf=False, 160 | vocabulary_sizes=None, 161 | ignore_columns_with_zero_vocabulary_size=True, 162 | ): 163 | self.strip_accents = strip_accents 164 | self.lowercase = lowercase 165 | self.preprocessor = preprocessor 166 | self.tokenizer = tokenizer 167 | self.stop_words = stop_words 168 | self.token_pattern = token_pattern 169 | self.ngram_range = ngram_range 170 | self.analyzer = analyzer 171 | self.max_df = max_df 172 | self.min_df = min_df 173 | self.max_features = max_features 174 | self.vocabulary = vocabulary 175 | self.dtype = dtype 176 | self.norm = norm 177 | self.use_idf = use_idf 178 | self.smooth_idf = smooth_idf 179 | self.sublinear_tf = sublinear_tf 180 | self.vocabulary_sizes = vocabulary_sizes 181 | self.ignore_columns_with_zero_vocabulary_size = ignore_columns_with_zero_vocabulary_size 182 | 183 | def _fit_vectorizer(self, col_idx, X): 184 | max_features = self.max_features 185 | 186 | # Override max_features for the current column in order to enforce the vocabulary size. 
187 | if self.max_features and self.vocabulary_sizes: 188 | max_features = min(self.max_features, self.vocabulary_sizes[col_idx]) 189 | elif self.vocabulary_sizes: 190 | max_features = self.vocabulary_sizes[col_idx] 191 | 192 | try: 193 | vectorizer = TfidfVectorizer( 194 | strip_accents=self.strip_accents, 195 | lowercase=self.lowercase, 196 | preprocessor=self.preprocessor, 197 | tokenizer=self.tokenizer, 198 | stop_words=self.stop_words, 199 | token_pattern=self.token_pattern, 200 | ngram_range=self.ngram_range, 201 | analyzer=self.analyzer, 202 | max_df=self.max_df, 203 | min_df=self.min_df, 204 | max_features=max_features, 205 | vocabulary=self.vocabulary, 206 | dtype=self.dtype, 207 | norm=self.norm, 208 | use_idf=self.use_idf, 209 | smooth_idf=self.smooth_idf, 210 | sublinear_tf=self.sublinear_tf, 211 | ) 212 | vectorizer.fit(X[:, col_idx]) 213 | except ValueError as err: 214 | zero_vocab_errors = [ 215 | "After pruning, no terms remain. Try a lower min_df or a higher max_df.", 216 | "max_df corresponds to < documents than min_df", 217 | "empty vocabulary; perhaps the documents only contain stop words", 218 | ] 219 | if str(err) in zero_vocab_errors and self.ignore_columns_with_zero_vocabulary_size: 220 | vectorizer = None 221 | else: 222 | raise 223 | return vectorizer 224 | 225 | def fit(self, X, y=None): 226 | """Build the list of TfidfVectorizers for each column. 227 | 228 | Parameters 229 | ---------- 230 | X : {array-like}, text data 231 | 232 | Returns 233 | ------- 234 | self : MultiColumnTfidfVectorizer 235 | """ 236 | X = check_array(X, dtype=None) 237 | n_columns = X.shape[1] 238 | 239 | # If specified, vocabulary size must be given for each column of the input dataset. 240 | if self.vocabulary_sizes and len(self.vocabulary_sizes) != n_columns: 241 | raise ValueError("If specified, vocabulary_sizes has to have exactly one entry per data column.") 242 | 243 | self.vectorizers_ = [self._fit_vectorizer(i, X) for i in range(n_columns)] 244 | 245 | return self 246 | 247 | def _transform_vectorizer(self, col_idx, X): 248 | if self.vectorizers_[col_idx]: 249 | tfidf_features = self.vectorizers_[col_idx].transform(X[:, col_idx]) 250 | # If the vocabulary size is specified and there are too few features, then pad the output with zeros. 251 | if self.vocabulary_sizes and tfidf_features.shape[1] < self.vocabulary_sizes[col_idx]: 252 | tfidf_features = sp.csr_matrix( 253 | (tfidf_features.data, tfidf_features.indices, tfidf_features.indptr), 254 | shape=(tfidf_features.shape[0], self.vocabulary_sizes[col_idx]), 255 | ) 256 | return tfidf_features 257 | # If ``TfidfVectorizer`` threw a value error, add an empty TF-IDF document-term matrix for the column 258 | return sp.csr_matrix((X.shape[0], 0)) 259 | 260 | def transform(self, X, y=None): 261 | """Transform documents to document term-matrix. 262 | 263 | Parameters 264 | ---------- 265 | X : 2D array of text data 266 | 267 | Returns 268 | ------- 269 | tfidf_matrix : sparse matrix, [n_samples, n_features] 270 | Tf-idf-weighted document-term matrix. 
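        Examples
        --------
        A minimal sketch on a toy two-column corpus (default parameters assumed; each column contributes
        its own vocabulary of three terms, so the concatenated output has six features):

        >>> import numpy as np
        >>> from sagemaker_sklearn_extension.feature_extraction.text import MultiColumnTfidfVectorizer
        >>> corpus = np.array([["cat sat", "dog ran"], ["cat ran", "dog sat"]])
        >>> vectorizer = MultiColumnTfidfVectorizer().fit(corpus)
        >>> vectorizer.transform(corpus).shape
        (2, 6)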
271 | """ 272 | check_is_fitted(self, "vectorizers_") 273 | X = check_array(X, dtype=None) 274 | 275 | return sp.hstack([self._transform_vectorizer(i, X) for i in range(X.shape[1])]) 276 | 277 | def _more_tags(self): 278 | return {"X_types": ["string"]} 279 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/impute/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | The :mod:`sagemaker_sklearn_extension.impute` module includes 16 | transformers that preform missing value imputation. This module 17 | is based on the :mod:`sklearn.impute` module. 18 | """ 19 | 20 | from .base import RobustImputer, RobustMissingIndicator, is_finite_numeric 21 | 22 | __all__ = ["RobustImputer", "RobustMissingIndicator", "is_finite_numeric"] 23 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/impute/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.impute import MissingIndicator, SimpleImputer 18 | from sklearn.utils.validation import check_array, check_is_fitted 19 | 20 | 21 | def is_finite_numeric(arr): 22 | """Helper function to check if values in an array can be converted to finite numeric 23 | """ 24 | 25 | def _is_finite_numeric(val): 26 | try: 27 | f = float(val) 28 | return np.isfinite(f) 29 | except ValueError: 30 | return False 31 | 32 | return np.vectorize(_is_finite_numeric)(arr) 33 | 34 | 35 | def _get_mask(X, vectorized_mask_function): 36 | """Compute boolean mask of X for vectorized_mask_function(X) == False 37 | """ 38 | return np.logical_not(vectorized_mask_function(X).astype("bool")) 39 | 40 | 41 | def _apply_mask(X, mask): 42 | X[mask] = np.nan 43 | return X 44 | 45 | 46 | class RobustImputer(BaseEstimator, TransformerMixin): 47 | """Imputer for completing missing values. 48 | 49 | Similar to sklearn.impute.SimpleImputer with added functionality 50 | - RobustImputer uses a custom mask_function to determine values to impute. 
51 | The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric 52 | which checks if a value can be converted into a float. 53 | - RobustImputer can perform multi-column imputation with different values 54 | for each column (strategy=="constant") 55 | 56 | Parameters 57 | ---------- 58 | dtype : string, type, list of types or None (default=None) 59 | Data type for output. 60 | 61 | - If left to default, numeric imputation strategies ("median" and "mean"), 62 | output array dtype will always be floating point dtype. Otherwise it will be 63 | np.dtype('O') 64 | 65 | strategy : string, optional (default='median') 66 | The imputation strategy. 67 | 68 | - If "mean", then replace missing values using the mean along 69 | each column. Can only be used with numeric data. 70 | - If "median", then replace missing values using the median along 71 | each column. Can only be used with numeric data. 72 | - If "most_frequent", then replace missing using the most frequent 73 | value along each column. Can be used with strings or numeric data. 74 | - If "constant", then replace missing values with fill_values. 75 | fill_values can be a singular value or a list of values equal to 76 | number of columns. Can be used with strings or numeric data. 77 | If fill_values is not set, fill_value will be 0 when imputing numerical 78 | data and "missing_value" for strings or object data types. 79 | 80 | fill_values : string, numerical value, or list, optional (default=None) 81 | When strategy=="constant", fill_values is used to replace all 82 | values that should be imputed. 83 | 84 | - If string or numerical value, that one value will be used to replace 85 | all values that should be imputed. 86 | - If list, fill_values must equal to number of columns of input. Each 87 | column will be imputed with the corresponding value in fill_values. 88 | fill_values[i] will replace ith column (X[:,i]). 89 | - If left to the default, fill_value will be 0 when imputing numerical 90 | data and "missing_value" for strings or object data types. 91 | 92 | mask_function : callable -> np.array, dtype('bool') (default=None) 93 | A vectorized python function, accepts np.array, returns np.array 94 | with dtype('bool') 95 | 96 | For each value, if mask_function(val) == False, that value will 97 | be imputed. mask_function is used to create a boolean mask that determines 98 | which values in the input to impute. 99 | 100 | Use np.vectorize to vectorize singular python functions. 
101 | 102 | If left to default, mask_function will be 103 | sagemaker_sklearn_extension.impute.is_finite_numeric 104 | 105 | Notes 106 | ----- 107 | only accepts 2D, non-sparse inputs 108 | """ 109 | 110 | def __init__(self, dtype=None, strategy="median", fill_values=None, mask_function=None): 111 | self.dtype = dtype 112 | self.strategy = strategy 113 | self.fill_values = fill_values 114 | self.mask_function = mask_function 115 | 116 | def _validate_input(self, X): 117 | if self._is_constant_multicolumn_imputation(): 118 | if len(self.fill_values) != X.shape[1]: 119 | raise ValueError( 120 | "'fill_values' should have length equal to number of features in X {num_features}, " 121 | "got {fill_values_length}".format(num_features=X.shape[1], fill_values_length=len(self.fill_values)) 122 | ) 123 | 124 | dtype = self.dtype or np.dtype("O") 125 | 126 | if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c": 127 | raise ValueError("Complex data not supported\n{}\n".format(X)) 128 | 129 | return check_array(X, dtype=dtype, copy=True, force_all_finite=False, ensure_2d=True) 130 | 131 | def _is_constant_multicolumn_imputation(self): 132 | return self.strategy == "constant" and isinstance(self.fill_values, (list, tuple, np.ndarray)) 133 | 134 | def fit(self, X, y=None): 135 | """Fit the imputer on X. 136 | 137 | Parameters 138 | ---------- 139 | X : {array-like}, shape (n_samples, n_features) 140 | Input data, where ``n_samples`` is the number of samples and 141 | ``n_features`` is the number of features. 142 | 143 | Returns 144 | ------- 145 | self : RobustImputer 146 | """ 147 | X = self._validate_input(X) 148 | 149 | self.vectorized_mask_function_ = self.mask_function or is_finite_numeric 150 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 151 | 152 | if self._is_constant_multicolumn_imputation(): 153 | self.simple_imputer_ = SimpleImputer(strategy=self.strategy) 154 | else: 155 | self.simple_imputer_ = SimpleImputer(strategy=self.strategy, fill_value=self.fill_values) 156 | 157 | self.simple_imputer_.fit(X) 158 | 159 | # set "SimpleImputer.statistics_" for multicolumn imputations with different column fill values 160 | # SimpleImputer cannot perform multicolumn imputation with different column fill values 161 | if self._is_constant_multicolumn_imputation(): 162 | self.simple_imputer_.statistics_ = np.asarray(self.fill_values) 163 | 164 | return self 165 | 166 | def transform(self, X): 167 | """Impute all missing values in X. 168 | 169 | Parameters 170 | ---------- 171 | X : {array-like}, shape (n_samples, n_features) 172 | The input data to complete. 173 | 174 | Returns 175 | ------- 176 | Xt : {ndarray}, shape (n_samples, n_features) 177 | The imputed input data. The data type of ``Xt`` 178 | will depend on your input dtype.
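A minimal usage sketch, assuming only the constructor arguments and methods defined above (data values and variable names below are illustrative, not taken from this module):
>>> import numpy as np
>>> from sagemaker_sklearn_extension.impute import RobustImputer
>>> X = np.array([["1", "n/a"], ["oops", "4"]], dtype=object)  # illustrative input
>>> imputer = RobustImputer(strategy="constant", fill_values=[0, -1])
>>> Xt = imputer.fit_transform(X)  # non-numeric entries are imputed column-wise with 0 and -1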
179 | """ 180 | check_is_fitted(self, ["simple_imputer_", "vectorized_mask_function_"]) 181 | X = self._validate_input(X) 182 | 183 | if X.shape[1] != self.simple_imputer_.statistics_.shape[0]: 184 | raise ValueError( 185 | "'transform' input X has {transform_dim} features per sample, " 186 | "expected {fit_dim} from 'fit' input".format( 187 | transform_dim=X.shape[1], fit_dim=self.simple_imputer_.statistics_.shape[0] 188 | ) 189 | ) 190 | 191 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 192 | 193 | return self.simple_imputer_.transform(X).astype(self.dtype) 194 | 195 | def _more_tags(self): 196 | return {"allow_nan": True} 197 | 198 | 199 | class RobustMissingIndicator(BaseEstimator, TransformerMixin): 200 | """Binary indicators for missing values. 201 | 202 | Note that this component typically should not be used in a vanilla 203 | :class:`sklearn.pipeline.Pipeline` consisting of transformers and a classifier, 204 | but rather could be added using a :class:`sklearn.pipeline.FeatureUnion` or 205 | :class:`sklearn.compose.ColumnTransformer`. 206 | 207 | Similar to sklearn.impute.MissingIndicator with added functionality 208 | - RobustMissingIndicator uses a custom mask_function to determine the boolean mask. 209 | The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric 210 | which checks whether or not a value can be converted into a float. 211 | 212 | Parameters 213 | ---------- 214 | features : str, optional (default="all") 215 | Whether the imputer mask should represent all or a subset of 216 | features. 217 | 218 | - If "missing-only", the imputer mask will only represent 219 | features containing missing values during fit time. 220 | - If "all" (default), the imputer mask will represent all features. 221 | 222 | error_on_new : boolean, optional (default=True) 223 | If True (default), transform will raise an error when there are 224 | features with missing values in transform that have no missing values 225 | in fit. This is applicable only when ``features="missing-only"``. 226 | 227 | mask_function : callable -> np.array, dtype('bool') (default=None) 228 | A vectorized python function, accepts np.array, returns np.array 229 | with dtype('bool') 230 | 231 | For each value, if mask_function(val) == False, that value will 232 | be imputed. mask_function is used to create a boolean mask that determines 233 | which values in the input to impute. 234 | 235 | Use np.vectorize to vectorize singular python functions. 236 | 237 | By default, mask_function will be 238 | sagemaker_sklearn_extension.impute.is_finite_numeric 239 | 240 | Notes 241 | ----- 242 | only accepts 2D, non-sparse inputs 243 | """ 244 | 245 | def __init__(self, features="all", error_on_new=True, mask_function=None): 246 | self.features = features 247 | self.error_on_new = error_on_new 248 | self.mask_function = mask_function 249 | 250 | def _validate_input(self, X): 251 | if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c": 252 | raise ValueError("Complex data not supported\n{}\n".format(X)) 253 | 254 | return check_array(X, dtype=np.dtype("O"), copy=True, force_all_finite=False, ensure_2d=True) 255 | 256 | def fit(self, X, y=None): 257 | """Fit the transformer on X. 258 | 259 | Parameters 260 | ---------- 261 | X : {array-like}, shape (n_samples, n_features) 262 | Input data, where ``n_samples`` is the number of samples and 263 | ``n_features`` is the number of features. 
264 | 265 | Returns 266 | ------- 267 | self : RobustMissingIndicator 268 | """ 269 | X = self._validate_input(X) 270 | 271 | self.vectorized_mask_function_ = self.mask_function or is_finite_numeric 272 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 273 | 274 | self.missing_indicator_ = MissingIndicator(features=self.features, error_on_new=self.error_on_new) 275 | self.missing_indicator_.fit(X) 276 | 277 | return self 278 | 279 | def transform(self, X): 280 | """Generate missing values indicator for X. 281 | 282 | Parameters 283 | ---------- 284 | X : {array-like}, shape (n_samples, n_features) 285 | The input data to complete. 286 | 287 | Returns 288 | ------- 289 | Xt : {ndarray}, shape (n_samples, n_features) 290 | The missing indicator for input data. The data type of ``Xt`` 291 | will be boolean. 292 | """ 293 | check_is_fitted(self, ["missing_indicator_", "vectorized_mask_function_"]) 294 | X = self._validate_input(X) 295 | 296 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 297 | 298 | return self.missing_indicator_.transform(X) 299 | 300 | def _more_tags(self): 301 | return {"allow_nan": True} 302 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from .base import BaseExtremeValueTransformer 15 | from .base import LogExtremeValuesTransformer 16 | from .base import QuantileExtremeValuesTransformer 17 | from .base import RemoveConstantColumnsTransformer 18 | from .base import log_transform 19 | from .base import quantile_transform_nonrandom 20 | from .data import QuadraticFeatures 21 | from .data import RobustStandardScaler 22 | from .encoders import NALabelEncoder 23 | from .encoders import RobustLabelEncoder 24 | from .encoders import RobustOrdinalEncoder 25 | from .encoders import ThresholdOneHotEncoder 26 | from .encoders import WOEEncoder 27 | from .encoders import SimilarityEncoder 28 | 29 | __all__ = [ 30 | "BaseExtremeValueTransformer", 31 | "LogExtremeValuesTransformer", 32 | "NALabelEncoder", 33 | "QuadraticFeatures", 34 | "QuantileExtremeValuesTransformer", 35 | "ThresholdOneHotEncoder", 36 | "RemoveConstantColumnsTransformer", 37 | "RobustLabelEncoder", 38 | "RobustOrdinalEncoder", 39 | "RobustStandardScaler", 40 | "log_transform", 41 | "quantile_transform_nonrandom", 42 | "WOEEncoder", 43 | "SimilarityEncoder", 44 | ] 45 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/preprocessing/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from itertools import combinations 15 | 16 | import numpy as np 17 | from scipy.sparse import issparse 18 | 19 | from sklearn.base import BaseEstimator, TransformerMixin 20 | from sklearn.preprocessing import StandardScaler 21 | from sklearn.utils import check_array 22 | from sklearn.utils import check_random_state 23 | from sklearn.utils.validation import check_is_fitted 24 | from sklearn.utils.validation import FLOAT_DTYPES 25 | 26 | 27 | class QuadraticFeatures(BaseEstimator, TransformerMixin): 28 | """Generate and add quadratic features to feature matrix. 29 | 30 | Generate a new feature matrix containing the original data, an optional bias column, a collection of squared 31 | features, and a collection of interaction terms. If ``max_n_features`` is not large enough to include all the 32 | squared features, then a random subset of them is added instead. If it is large enough to include all squared 33 | features, but not large enough to include all quadratic features, then all of the squared features and a random 34 | subset of the interaction features are added instead. 35 | 36 | This transformer is similar to ``PolynomialFeatures`` from the ``sklearn.preprocessing.data`` module. 37 | 38 | Parameters 39 | ---------- 40 | include_bias : boolean (default = False) 41 | Whether to include a bias column -- the feature in which all entries are set to 1.0, and which acts as the 42 | intercept term in a linear model. Note that this parameter is False by default, in contrast to the corresponding 43 | parameter in ``sklearn``'s ``PolynomialFeatures``. 44 | 45 | interaction_only : boolean (default = False) 46 | Whether to produce only interaction features, and omit the squared features. For example, if the features are 47 | [a, b], then this will include ab, but not a^2 and b^2. The bias column is not affected by this parameter. 48 | 49 | max_n_features : int (default = 1000) 50 | The maximum number of features to include in the output data matrix. Squared features are prioritized over 51 | interaction features, unless ``interaction_only`` is ``True``. Must be larger than the number of input features 52 | (plus one, if ``include_bias`` is ``True``). 53 | 54 | order : str in {'C', 'F'} (default = 'C') 55 | Order of the input array: 'C' stands for C-contiguous order, and 'F' stands for Fortran-contiguous order. 56 | 57 | random_state : int, RandomState instance, or None (default = 0) 58 | If int, ``random_state`` is the seed used by the random number generator; if ``RandomState`` instance, 59 | ``random_state`` is the random number generator; if None, the random number generator is the ``RandomState`` 60 | instance used by ``np.random``. Used to determine which feature combinations to include in the output dataset 61 | when ``max_n_features`` is too small to fit all quadratic features. 
62 | 63 | Examples 64 | -------- 65 | >>> import numpy as np 66 | >>> from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures 67 | >>> X = np.arange(1, 7).reshape((2, 3)) 68 | >>> X 69 | array([[1, 2, 3], 70 | [4, 5, 6]]) 71 | >>> QuadraticFeatures().fit_transform(X) 72 | array([[ 1, 2, 3, 1, 4, 9, 2, 3, 6], 73 | [ 4, 5, 6, 16, 25, 36, 20, 24, 30]]) 74 | >>> QuadraticFeatures(interaction_only=True, max_n_features=5).fit_transform(X) 75 | array([[ 1, 2, 3, 2, 3], 76 | [ 4, 5, 6, 20, 24]]) 77 | 78 | Attributes 79 | ---------- 80 | combinations_ : list of tuples (i, j) 81 | List of tuples with two elements, each containing the indexes of the columns that are multiplied element-wise 82 | to form a single output column. Tuples appear in the same order as the corresponding output columns. 83 | n_input_features_ : int 84 | The number of columns in the input dataset. 85 | n_output_features_ : int 86 | The number of columns in the output dataset. 87 | 88 | Notes 89 | ----- 90 | Accepts only two-dimensional, dense input arrays. 91 | """ 92 | 93 | def __init__(self, include_bias=False, interaction_only=False, max_n_features=1000, order="C", random_state=0): 94 | self.include_bias = include_bias 95 | self.interaction_only = interaction_only 96 | self.max_n_features = max_n_features 97 | self.order = order 98 | self.random_state = random_state 99 | 100 | def _build_combinations(self, n_features, random_state): 101 | """Calculate the feature pairs to be added to the input data based on parameters and number of input columns. 102 | 103 | If ``interaction_only`` is ``True``, all squared features are omitted. Otherwise, they are added before 104 | interaction features. If there is enough space--as indicated by ``max_n_features``--to add all squared features, 105 | then do so. Otherwise, take a random sub-sample. Then, if there's enough space to add all interaction features, 106 | do so. Otherwise, return a random sub-sample of those. 107 | 108 | Parameters 109 | ---------- 110 | n_features : int 111 | The number of columns in the input vector. 112 | random_state : RandomState 113 | The prepared (using ``check_random_state``) ``RandomState`` instance. 114 | """ 115 | # First calculate how many new features of each kind (squared and interaction) we can add. 116 | added_feature_budget = self.max_n_features - n_features - int(self.include_bias) 117 | if added_feature_budget <= 0: 118 | message = "max_n_features must be large enough for the output to contain more than the original dataset" 119 | if self.include_bias: 120 | message += " and bias column" 121 | raise ValueError(message) 122 | squared_feature_budget = 0 if self.interaction_only else min(added_feature_budget, n_features) 123 | interaction_feature_budget = max(0, added_feature_budget - squared_feature_budget) 124 | 125 | # Produce squared feature pairs. 126 | squared_features = [] 127 | if squared_feature_budget == n_features: 128 | # No need to reorder if we can fit all squared features. 129 | squared_features = [(i, i) for i in range(n_features)] 130 | elif squared_feature_budget > 0: 131 | # Otherwise, take a random sample of them. 132 | squared_features = [ 133 | (i, i) for i in random_state.choice(range(n_features), size=squared_feature_budget, replace=False) 134 | ] 135 | 136 | # Produce interaction feature pairs. 
137 | interaction_features = [] 138 | if interaction_feature_budget > 0: 139 | interaction_features = list(combinations(range(n_features), 2)) 140 | 141 | # Take a random sample of feature interactions if not all can fit. 142 | if len(interaction_features) > interaction_feature_budget: 143 | random_state.shuffle(interaction_features) 144 | 145 | interaction_features = interaction_features[:interaction_feature_budget] 146 | 147 | return squared_features + interaction_features 148 | 149 | def fit(self, X, y=None): 150 | """ 151 | Compute the number of output features and the combination of input features to multiply. 152 | 153 | Parameters 154 | ---------- 155 | X : array-like , shape (n_samples, n_features) 156 | The data array to transform. Must be a non-sparse two-dimensional numpy array. 157 | 158 | Returns 159 | ------- 160 | self : instance 161 | """ 162 | _, n_features = check_array(X).shape 163 | random_state = check_random_state(self.random_state) 164 | self.combinations_ = self._build_combinations(n_features, random_state) 165 | self.n_input_features_ = n_features 166 | self.n_output_features_ = n_features + len(self.combinations_) + int(self.include_bias) 167 | return self 168 | 169 | def transform(self, X): 170 | """ 171 | Transform data to the chosen quadratic features. 172 | 173 | Parameters 174 | ---------- 175 | X : array-like, shape (n_samples, n_features) 176 | The data array to transform. Must be a non-sparse and two-dimensional. 177 | 178 | Returns 179 | ------- 180 | XQ : np.ndarray, shape (n_samples, n_output_features_) 181 | The array of computed features. 182 | """ 183 | check_is_fitted(self, ["n_input_features_", "n_output_features_", "combinations_"]) 184 | X = check_array(X, order=self.order) 185 | n_samples, n_features = X.shape 186 | 187 | if n_features != self.n_input_features_: 188 | raise ValueError("X shape does not match training shape.") 189 | 190 | XQ = np.empty((n_samples, self.n_output_features_), dtype=X.dtype, order=self.order) 191 | 192 | if self.include_bias: 193 | XQ[:, 0] = 1.0 194 | X_col_range_start, X_col_range_end = 1, self.n_input_features_ + 1 195 | else: 196 | X_col_range_start, X_col_range_end = 0, self.n_input_features_ 197 | 198 | XQ[:, X_col_range_start:X_col_range_end] = X 199 | XQ[:, X_col_range_end:] = np.column_stack([X[:, i] * X[:, j] for i, j in self.combinations_]) 200 | 201 | return XQ 202 | 203 | 204 | class RobustStandardScaler(BaseEstimator, TransformerMixin): 205 | """Scaler to adaptively scale dense and sparse inputs. 206 | 207 | RobustStandardScaler uses `sklearn.preprocessing.StandardScaler` to perform standardization, but adapts 208 | the centering based on the sparsity of the data. 209 | 210 | For dense inputs, the standard score of a sample `x` is calculated as: 211 | 212 | z = (x - u) / s 213 | 214 | where `u` is the mean of the training samples, and `s` is the standard deviation of the training samples. 215 | The mean `u` is a vector of means of each feature. If the number of zeros for a feature is greater than or 216 | equal to 70% of the total number of samples, the corresponding value in `u` is set to `0` to avoid centering 217 | by mean. 218 | 219 | For sparse inputs, the standard score of a sample `x` is calculated as: 220 | 221 | z = x / s 222 | 223 | where `s` is the standard deviation of the training samples. 224 | 225 | Parameters 226 | ---------- 227 | copy : boolean, optional, default True 228 | If False, try to avoid a copy and do inplace scaling instead. 
229 | This is not guaranteed to always work inplace; e.g. if the data is 230 | not a NumPy array or scipy.sparse CSR matrix, a copy may still be 231 | returned. 232 | 233 | Attributes 234 | ---------- 235 | scaler_ : ``sklearn.preprocessing.StandardScaler`` 236 | - `scaler_` is instantiated inside the fit method and is used for computing the center and the standard deviation. 237 | 238 | """ 239 | 240 | def __init__(self, copy=True): 241 | self.copy = copy 242 | 243 | def fit(self, X, y=None): 244 | """Fit RobustStandardScaler to X. 245 | 246 | If the input is sparse, `fit` overrides `with_mean` to standardize without subtracting the mean (avoids breaking 247 | for sparse matrices). 248 | 249 | If the data is dense, the mean is adjusted for mostly-zero features and the data is centered with the adjusted mean before scaling. 250 | 251 | Parameters 252 | ---------- 253 | X : array-like, shape [n_samples, n_features] 254 | The data to standardize. 255 | 256 | Returns 257 | ------- 258 | self : RobustStandardScaler 259 | """ 260 | X = check_array( 261 | X, accept_sparse=("csr", "csc"), estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" 262 | ) 263 | 264 | with_mean = not issparse(X) 265 | 266 | self.scaler_ = StandardScaler(with_mean=with_mean, with_std=True, copy=self.copy) 267 | self.scaler_.fit(X) 268 | 269 | if self.scaler_.with_mean: 270 | nnz_mean_mask = np.where(np.count_nonzero(X, axis=0) / X.shape[0] > 0.3, 1, 0) 271 | self.scaler_.mean_ = self.scaler_.mean_ * nnz_mean_mask 272 | 273 | return self 274 | 275 | def transform(self, X): 276 | """ 277 | Standardize data by centering and scaling. 278 | 279 | Parameters 280 | ---------- 281 | X : array-like, shape (n_samples, n_features) 282 | The data array to transform. 283 | 284 | Returns 285 | ------- 286 | Xt : array-like, shape (n_samples, n_features) 287 | The array of transformed input.
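A minimal usage sketch for the dense case, mirroring the data exercised in test/test_data.py (variable names are illustrative):
>>> import numpy as np
>>> from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
>>> X = np.array([[1.0, 5.0], [2.0, 3.0], [1.0, 1.0]])
>>> Xt = RobustStandardScaler().fit_transform(X)  # centered by the sparsity-adjusted mean and scaled to unit variance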
288 | """ 289 | return self.scaler_.transform(X) 290 | 291 | def _more_tags(self): 292 | return {"allow_nan": True} 293 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-scikit-learn-extension/2412131311433addbae9f6ad5aa393a8bdbbe61f/test/__init__.py -------------------------------------------------------------------------------- /test/contrib/taei/data/data.csv: -------------------------------------------------------------------------------- 1 | 0.000,0.000,1.000,0.455,0.365,0.095,0.514,0.225,0.101,0.150,-1.000 2 | 0.000,0.000,1.000,0.350,0.265,0.090,0.226,0.100,0.049,0.070,1.000 3 | 1.000,0.000,0.000,0.530,0.420,0.135,0.677,0.257,0.141,0.210,-1.000 4 | 0.000,0.000,1.000,0.440,0.365,0.125,0.516,0.215,0.114,0.155,-1.000 5 | 0.000,1.000,0.000,0.330,0.255,0.080,0.205,0.089,0.040,0.055,1.000 6 | 0.000,1.000,0.000,0.425,0.300,0.095,0.351,0.141,0.077,0.120,-1.000 7 | 1.000,0.000,0.000,0.530,0.415,0.150,0.777,0.237,0.141,0.330,-1.000 8 | 1.000,0.000,0.000,0.545,0.425,0.125,0.768,0.294,0.149,0.260,-1.000 9 | 0.000,0.000,1.000,0.475,0.370,0.125,0.509,0.216,0.113,0.165,-1.000 10 | 1.000,0.000,0.000,0.550,0.440,0.150,0.894,0.315,0.151,0.320,-1.000 11 | 1.000,0.000,0.000,0.525,0.380,0.140,0.607,0.194,0.147,0.210,-1.000 12 | 0.000,0.000,1.000,0.430,0.350,0.110,0.406,0.168,0.081,0.135,-1.000 13 | 0.000,0.000,1.000,0.490,0.380,0.135,0.541,0.217,0.095,0.190,-1.000 14 | 1.000,0.000,0.000,0.535,0.405,0.145,0.684,0.273,0.171,0.205,-1.000 15 | 1.000,0.000,0.000,0.470,0.355,0.100,0.475,0.168,0.081,0.185,-1.000 16 | 0.000,0.000,1.000,0.500,0.400,0.130,0.664,0.258,0.133,0.240,-1.000 17 | 0.000,1.000,0.000,0.355,0.280,0.085,0.290,0.095,0.040,0.115,1.000 18 | 1.000,0.000,0.000,0.440,0.340,0.100,0.451,0.188,0.087,0.130,-1.000 19 | 0.000,0.000,1.000,0.365,0.295,0.080,0.256,0.097,0.043,0.100,1.000 20 | 0.000,0.000,1.000,0.450,0.320,0.100,0.381,0.171,0.075,0.115,-1.000 21 | 0.000,0.000,1.000,0.355,0.280,0.095,0.245,0.096,0.062,0.075,-1.000 22 | 0.000,1.000,0.000,0.380,0.275,0.100,0.226,0.080,0.049,0.085,-1.000 23 | 1.000,0.000,0.000,0.565,0.440,0.155,0.940,0.427,0.214,0.270,-1.000 24 | 1.000,0.000,0.000,0.550,0.415,0.135,0.763,0.318,0.210,0.200,-1.000 25 | 1.000,0.000,0.000,0.615,0.480,0.165,1.161,0.513,0.301,0.305,-1.000 26 | 1.000,0.000,0.000,0.560,0.440,0.140,0.928,0.383,0.188,0.300,-1.000 27 | 1.000,0.000,0.000,0.580,0.450,0.185,0.996,0.395,0.272,0.285,-1.000 28 | 0.000,0.000,1.000,0.590,0.445,0.140,0.931,0.356,0.234,0.280,-1.000 29 | 0.000,0.000,1.000,0.605,0.475,0.180,0.936,0.394,0.219,0.295,-1.000 30 | 0.000,0.000,1.000,0.575,0.425,0.140,0.864,0.393,0.227,0.200,-1.000 31 | 0.000,0.000,1.000,0.580,0.470,0.165,0.998,0.394,0.242,0.330,-1.000 32 | 1.000,0.000,0.000,0.680,0.560,0.165,1.639,0.606,0.281,0.460,-1.000 33 | 0.000,0.000,1.000,0.665,0.525,0.165,1.338,0.551,0.357,0.350,-1.000 34 | 1.000,0.000,0.000,0.680,0.550,0.175,1.798,0.815,0.393,0.455,-1.000 35 | 1.000,0.000,0.000,0.705,0.550,0.200,1.710,0.633,0.411,0.490,-1.000 36 | 0.000,0.000,1.000,0.465,0.355,0.105,0.479,0.227,0.124,0.125,-1.000 37 | 1.000,0.000,0.000,0.540,0.475,0.155,1.217,0.530,0.307,0.340,-1.000 38 | 1.000,0.000,0.000,0.450,0.355,0.105,0.522,0.237,0.117,0.145,-1.000 39 | 1.000,0.000,0.000,0.575,0.445,0.135,0.883,0.381,0.203,0.260,-1.000 40 | 0.000,0.000,1.000,0.355,0.290,0.090,0.328,0.134,0.086,0.090,-1.000 41 | 
1.000,0.000,0.000,0.450,0.335,0.105,0.425,0.186,0.091,0.115,-1.000 42 | 1.000,0.000,0.000,0.550,0.425,0.135,0.852,0.362,0.196,0.270,-1.000 43 | 0.000,1.000,0.000,0.240,0.175,0.045,0.070,0.032,0.024,0.020,-1.000 44 | 0.000,1.000,0.000,0.205,0.150,0.055,0.042,0.025,0.015,0.012,-1.000 45 | 0.000,1.000,0.000,0.210,0.150,0.050,0.042,0.018,0.013,0.015,-1.000 46 | 0.000,1.000,0.000,0.390,0.295,0.095,0.203,0.087,0.045,0.075,1.000 47 | 0.000,0.000,1.000,0.470,0.370,0.120,0.580,0.293,0.227,0.140,-1.000 48 | 1.000,0.000,0.000,0.460,0.375,0.120,0.461,0.177,0.110,0.150,1.000 49 | 0.000,1.000,0.000,0.325,0.245,0.070,0.161,0.075,0.025,0.045,-1.000 50 | 1.000,0.000,0.000,0.525,0.425,0.160,0.836,0.354,0.213,0.245,-1.000 51 | 0.000,1.000,0.000,0.520,0.410,0.120,0.595,0.238,0.111,0.190,-1.000 52 | 0.000,0.000,1.000,0.400,0.320,0.095,0.303,0.134,0.060,0.100,1.000 53 | 0.000,0.000,1.000,0.485,0.360,0.130,0.541,0.260,0.096,0.160,-1.000 54 | 1.000,0.000,0.000,0.470,0.360,0.120,0.477,0.210,0.105,0.150,-1.000 55 | 0.000,0.000,1.000,0.405,0.310,0.100,0.385,0.173,0.091,0.110,1.000 56 | 1.000,0.000,0.000,0.500,0.400,0.140,0.661,0.257,0.175,0.220,-1.000 57 | 0.000,0.000,1.000,0.445,0.350,0.120,0.443,0.192,0.096,0.135,-1.000 58 | 0.000,0.000,1.000,0.470,0.385,0.135,0.590,0.277,0.120,0.170,-1.000 59 | 0.000,1.000,0.000,0.245,0.190,0.060,0.086,0.042,0.014,0.025,-1.000 60 | 1.000,0.000,0.000,0.505,0.400,0.125,0.583,0.246,0.130,0.175,1.000 61 | 0.000,0.000,1.000,0.450,0.345,0.105,0.411,0.180,0.113,0.135,1.000 62 | 0.000,0.000,1.000,0.505,0.405,0.110,0.625,0.305,0.160,0.175,-1.000 63 | 1.000,0.000,0.000,0.530,0.410,0.130,0.697,0.302,0.194,0.200,-1.000 64 | 0.000,0.000,1.000,0.425,0.325,0.095,0.379,0.171,0.080,0.100,1.000 65 | 0.000,0.000,1.000,0.520,0.400,0.120,0.580,0.234,0.132,0.185,-1.000 66 | 0.000,0.000,1.000,0.475,0.355,0.120,0.480,0.234,0.102,0.135,-1.000 67 | 1.000,0.000,0.000,0.565,0.440,0.160,0.915,0.354,0.194,0.320,-1.000 68 | 1.000,0.000,0.000,0.595,0.495,0.185,1.285,0.416,0.224,0.485,-1.000 69 | 1.000,0.000,0.000,0.475,0.390,0.120,0.530,0.213,0.116,0.170,-1.000 70 | 0.000,1.000,0.000,0.310,0.235,0.070,0.151,0.063,0.041,0.045,-1.000 71 | 0.000,0.000,1.000,0.555,0.425,0.130,0.766,0.264,0.168,0.275,-1.000 72 | 1.000,0.000,0.000,0.400,0.320,0.110,0.353,0.141,0.099,0.100,-1.000 73 | 1.000,0.000,0.000,0.595,0.475,0.170,1.247,0.480,0.225,0.425,-1.000 74 | 0.000,0.000,1.000,0.570,0.480,0.175,1.185,0.474,0.261,0.380,-1.000 75 | 1.000,0.000,0.000,0.605,0.450,0.195,1.098,0.481,0.289,0.315,-1.000 76 | 1.000,0.000,0.000,0.600,0.475,0.150,1.008,0.443,0.221,0.280,-1.000 77 | 0.000,0.000,1.000,0.595,0.475,0.140,0.944,0.362,0.189,0.315,-1.000 78 | 1.000,0.000,0.000,0.600,0.470,0.150,0.922,0.363,0.194,0.305,-1.000 79 | 1.000,0.000,0.000,0.555,0.425,0.140,0.788,0.282,0.160,0.285,-1.000 80 | 1.000,0.000,0.000,0.615,0.475,0.170,1.103,0.469,0.235,0.345,-1.000 81 | 1.000,0.000,0.000,0.575,0.445,0.140,0.941,0.385,0.252,0.285,-1.000 82 | 0.000,0.000,1.000,0.620,0.510,0.175,1.615,0.510,0.192,0.675,-1.000 83 | 1.000,0.000,0.000,0.520,0.425,0.165,0.989,0.396,0.225,0.320,-1.000 84 | 0.000,0.000,1.000,0.595,0.475,0.160,1.317,0.408,0.234,0.580,-1.000 85 | 0.000,0.000,1.000,0.580,0.450,0.140,1.013,0.380,0.216,0.360,-1.000 86 | 1.000,0.000,0.000,0.570,0.465,0.180,1.295,0.339,0.223,0.440,-1.000 87 | 0.000,0.000,1.000,0.625,0.465,0.140,1.195,0.482,0.205,0.400,-1.000 88 | 0.000,0.000,1.000,0.560,0.440,0.160,0.865,0.331,0.207,0.260,-1.000 89 | 1.000,0.000,0.000,0.460,0.355,0.130,0.517,0.221,0.114,0.165,-1.000 90 | 
1.000,0.000,0.000,0.575,0.450,0.160,0.978,0.314,0.231,0.330,-1.000 91 | 0.000,0.000,1.000,0.565,0.425,0.135,0.811,0.341,0.168,0.255,-1.000 92 | 0.000,0.000,1.000,0.555,0.440,0.150,0.755,0.307,0.152,0.260,-1.000 93 | 0.000,0.000,1.000,0.595,0.465,0.175,1.115,0.402,0.254,0.390,-1.000 94 | 1.000,0.000,0.000,0.625,0.495,0.165,1.262,0.507,0.318,0.390,-1.000 95 | 0.000,0.000,1.000,0.695,0.560,0.190,1.494,0.588,0.343,0.485,-1.000 96 | 0.000,0.000,1.000,0.665,0.535,0.195,1.606,0.576,0.388,0.480,-1.000 97 | 0.000,0.000,1.000,0.535,0.435,0.150,0.725,0.269,0.139,0.250,-1.000 98 | 0.000,0.000,1.000,0.470,0.375,0.130,0.523,0.214,0.132,0.145,-1.000 99 | 0.000,0.000,1.000,0.470,0.370,0.130,0.522,0.201,0.133,0.165,1.000 100 | 1.000,0.000,0.000,0.475,0.375,0.125,0.579,0.278,0.085,0.155,-1.000 101 | -------------------------------------------------------------------------------- /test/contrib/taei/test_taei.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from sagemaker_sklearn_extension.contrib.taei import LatentSpaceOversampler, AE, VAE, StarOversampler 4 | 5 | 6 | def test_latent_space_oversampler(): 7 | # make torch deterministic 8 | torch.backends.cudnn.deterministic = True 9 | torch.backends.cudnn.benchmark = False 10 | 11 | d = np.genfromtxt("test/contrib/taei/data/data.csv", delimiter=",") 12 | categorical_features = [0, 1, 2] 13 | categorical_dims = [2, 2, 2] 14 | continuous_features = [3, 4, 5, 6, 7, 8, 9] 15 | 16 | star_fit_resample = StarOversampler(proportion=1.0).resample 17 | 18 | # Test AE+StarOversampler 19 | torch.manual_seed(0) 20 | ae_smote = LatentSpaceOversampler( 21 | model=AE( 22 | categorical_features=categorical_features, 23 | categorical_dims=categorical_dims, 24 | continuous_features=continuous_features, 25 | latent_dim=8, 26 | hidden_dim=[64, 32], 27 | nll_weight=0.5, 28 | ), 29 | base_oversampler=star_fit_resample, 30 | ) 31 | # Train the model 32 | ae_smote.fit(d[:, :10], d[:, 10], max_epoch=5, validation_ratio=None) 33 | # Use the model for oversampling 34 | X_os, y_os = ae_smote.resample(d[:, :10], d[:, 10]) 35 | np.testing.assert_almost_equal( 36 | X_os[-1, :], 37 | [1.0, 1.0, 0.0, 0.5661017, 0.7811485, 1.172961, 1.0983223, 1.5463793, 1.3487656, 0.605184], 38 | decimal=2, 39 | ) 40 | 41 | # Test VAE+StarOversampler 42 | torch.manual_seed(0) 43 | vae_smote = LatentSpaceOversampler( 44 | model=VAE( 45 | categorical_features=categorical_features, 46 | categorical_dims=categorical_dims, 47 | continuous_features=continuous_features, 48 | latent_dim=16, 49 | hidden_dim=32, 50 | nll_weight=0.1, 51 | kld_weight=0.5, 52 | ), 53 | base_oversampler=star_fit_resample, 54 | ) 55 | # Train and use the model in one function call 56 | X_os, y_os = vae_smote.fit_resample(d[:, :10], d[:, 10], max_epoch=50, early_stopping=1) 57 | np.testing.assert_almost_equal( 58 | X_os[-1, :], 59 | [0.0, 1.0, 1.0, 0.5926914, 0.4106686, 0.3133996, 0.0246359, 0.4813618, -0.1365427, -0.0096727], 60 | decimal=2, 61 | ) 62 | 63 | # Test storing and loading models 64 | vae_smote.save_model("/tmp/vae_model.pth") 65 | vae_smote_loaded = LatentSpaceOversampler(model=None, base_oversampler=star_fit_resample) 66 | vae_smote_loaded.load_model("/tmp/vae_model.pth") 67 | X_os_loaded, y_os_loaded = vae_smote_loaded.resample(d[:, :10], d[:, 10]) 68 | np.testing.assert_almost_equal(X_os, X_os_loaded) 69 | -------------------------------------------------------------------------------- /test/data/csv/dirty.csv: 
-------------------------------------------------------------------------------- 1 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,a ,bcd,this is ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 2 | ,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,\n,bcd~,"this is, ml test\n table",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 3 | t,,222222,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,", ",bcd,this is ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,28 days 4 | t,23,,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,//,bcd,this is ml 'test table ,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,28 days 5 | t,23,222222,,123.456,9.999999999,6.80,6.6666666666666666666666666666, , ,"this is ""ml test"" table ",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,1 year 6 | t,23,222222,0,,9.999999999,6.80,6.6666666666666666666666666666,\\,",","this, is. ""ml test"" \\\ntable ",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,1 year 7 | t,23,111111,888888888888,123.456,,6.78,6.6666666666666666666666666666,a ,bcd,this is ml test table,2019-10-09,00:00:00,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-3 days 8 | t,23,111111,888888888888,123.456,9.999999999,,6.6666666666666666666666666666,' ,\\,this is ~~~ ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 9 | t,23,111111,888888888888,123.456,9.999999999,6.78,,"""""","",this is ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 10 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,,"""""","this is""'""// ml test table",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 11 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666, ,,"this is ml test table""""",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 12 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666, ,bc/d,,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,02:00:00 13 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,\n,bcd\n,this is ml test table\n,,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 14 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,a ,bcd ,this is ml test table ,2019-10-09,,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 15 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,a;,bcd;,this is ml test table;,2019-10-09,20:22:02,,2019-10-09 12:22:02+00,-14 days 16 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,あ ,あいう,this is あいう ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,,-14 days 17 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,好 ,可变字符,this is ml 测试 table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00, 18 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,", "," 19 | bcdf 20 | ","this is 21 | ml test ' table \",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 22 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,", ",\bcdf',"this is "" "" ml test ' table //",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 23 | 
-------------------------------------------------------------------------------- /test/data/csv/invalid.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 2 | 5,6,7,8 3 | 9,10,11,12 4 | 13,14,15,16,extra 5 | 17,18,19,20 6 | 21,22,23,24 7 | 25,26,27,28 8 | 29,30,31,32 -------------------------------------------------------------------------------- /test/data/csv/missing_values.csv: -------------------------------------------------------------------------------- 1 | -1, rheumatoid arthritis expert tip info article treatment option support, understand rheumatoid arthritis everyday health, understand rheumatoid arthritis, understand rheumatoid arthritis everyday health root root act consumer root content everyday solution understand rheumatoid arthritis future ra treatment advance rheumatoid arthritis treatment expect future lead researcher ra treatment research exercise ra check tip slideshow help create workout program ra fitness tip question doctor print list rheumatoid arthritis question doctor visit list ra question understand rheumatoid arthritis tip manage rheumatoid arthritis pain mak key change help manage rheumatoid arthritis pain ease joint pain strive eat balance diet help healthy weight sufficient vitamin mineral counter chronic inflammation tip ease rheumatoid arthritis pain cause rheumatoid arthritis inflammation rheumatoid arthritis ra symptom cause inflammation learn inflammation lead ra symptom joint pain stiffness plus discover cause rheumatoid arthritis rheumatoid arthritis inflammation rheumatoid arthritis management research rheumatoid arthritis research lifestyle choice day impact ra symptom example people ra experience depression treate depression help people manage ra control ra rheumatoid arthritis expert yoga ra yoga safe exercise option person rheumatoid arthritis read dr susan lee answer root act consumer rheumatoid arthritis poll lifestyle change help manage ra pain please select option eat healthy balance diet muscle strengthen aerobic exercise sleep night try reduce manage stress technique haven lifestyle change toolkit healthy recipe shop list meal planner recipe box tool root act consumer enter search term register sign newsletter home health common condition add adhd addiction allergy alternative health alzheimer disease anxiety disorder arthritis asthma autism autoimmune disorder bipolar disorder pain breast cancer cancer cardiovascular health cold flu dental health depression diabete diet nutrition digestive health dvt emotional health epilepsy erectile dysfunction family health fibromyalgia fitness gerd headache migraine healthy home healthy live heart health cholesterol hiv aid hypertension ib incontinence kid health health menopause multiple sclerosis osteoporosis pain management pet health psoriasis rheumatoid arthritis schizophrenia senior health sexual health skin beauty sleep stop smok stroke swine flu weight women health yeast infection condition drug symptom checker flu checkup abdominal pain arm pain pain body ach breast pain breathing difficulty chest pain congestion cough diarrhea ear pain excessive sweate faintness fatigue fever ga headache irregular period joint pain leg pain mouth lesion nausea neck pain rash rectal bleed skin lump sore throat vaginal itch vomite food fitness calorie counter healthy recipe search recipe diet nutrition weight fitness community profile blog discussion photo albums everyday health health tool bmi calculator bmr calculator body fat calculator brain game conversion 
calculator glossary glucose tracker meal planner calorie counter photo gallery recipe box symptom checker video weight tracker everyday health edit profile inbox discussion blog friend tool copyright everyday health inc everydayhealth com everyday health inc help ad policy advertise link feedback advertise notice site third party advertisement site collect information visit site website provide advertisement service obtain information advertise practice choice online behavioral advertise please click material web site provide educational purpose medical advice diagnosis treatment additional information site subject term privacy policy site comply honcode standard trustworthy health information verify 2 | -1, ,practice location, , practice location arthritis rheumatology pllc rheumatology specialist anju varghese board certify internal medicine rheumatology practice limit rheumatology subtitle text home practice location patient resource rockland county ny medical park dr lower level pomona ny phone fax route exit palisad parkway westchester county ny north broadway nd floor yonker ny phone fax st john riverside hospital exit sawmill river parkway accept patient participate health insurance health plan content copyright host exchange 3 | -1, siemen water remediation water scarce resource siemen help preserve, siemen usa, , siemen usa skip content siemen skip site identifier siemen usa close site id layer skip language selection skip generic navigation contact skip search search industry energy healthcare business product industry solution motor drive build technology industry automation financial solution solution service lighte osram sylvania product lifecycle management mobility water technology power generation power transmission power distribution automation control protection electrical compression expansion ventilation mechanical drive service financial solution solution service diagnostic image therapy hear aid product laboratory diagnostics build technology financial solution solution service consumer product corporate research government solution information communication siemen financial solution solution service siemen usa investor relation press job career business siemen global website answer america renewable energy smart grid technology medical image electronic healthcare record green build commuter rail system employee siemen unit commit answer america toughest question close productfinder layer close logo layer siemen corporation corporate information privacy policy term digital id 4 | -1, symptom muscle weakness genetic disease symptom include search learn, , , page found 5 | 1, animal animal wild sa official tourism website, , , page found 6 | -1, dr enrico fazzini parkinson disease specialist nyu movement disorder neurologist www theparkinsonsdoctor com, , , page found 7 | -1, ulcerative colitis uc quiz ulcerative colitis, colitis treatment endless path, colitis treatment endless path, -------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/excel.csv: -------------------------------------------------------------------------------- 1 | 1,2,"a 2 | b",2.0 3 | "c,d,e",f,3.0,'hi' 4 | """hi""","h""i",h'i,bye 5 | -------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/manual.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 5,6,7,8 9,10,11,12 13,14,15,16 17,18,19,20 21,22,23,24 25,26,27,28 29,30,31,32 
-------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/newline.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 2 | 2,3,4,5 3 | 3,4,5,6 4 | 4,5,6,7 5 | 5,6,7,8 6 | 6,7,8,9 7 | 7,8,9,10 8 | 8,9,10,11 9 | 9,10,11,12 10 | 10,11,12,13 11 | -------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/oneline.csv: -------------------------------------------------------------------------------- 1 | col0,col1,col2,class -------------------------------------------------------------------------------- /test/data/csv/regression_na_labels.csv: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 1.1 2 | 4, string, 5, 2.2 3 | 6, 7, 8, string 4 | 9, 10, 11, inf 5 | 12, 13, 14, 3.3 -------------------------------------------------------------------------------- /test/test_automl_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | from scipy.sparse import csr_matrix 17 | 18 | from sklearn.decomposition import PCA 19 | from sklearn.impute import SimpleImputer 20 | from sklearn.pipeline import Pipeline 21 | from sklearn.preprocessing import LabelEncoder 22 | from sklearn.preprocessing import FunctionTransformer 23 | from sagemaker_sklearn_extension.externals import AutoMLTransformer 24 | from sagemaker_sklearn_extension.externals import Header 25 | from sagemaker_sklearn_extension.externals import read_csv_data 26 | from sagemaker_sklearn_extension.preprocessing import NALabelEncoder 27 | from sagemaker_sklearn_extension.impute import RobustImputer 28 | 29 | 30 | def to_csr(X): 31 | return csr_matrix(X.shape, dtype=np.int8) 32 | 33 | 34 | impute_pca_pipeline = Pipeline(steps=[("impute", SimpleImputer()), ("pca", PCA(n_components=2))]) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "feature_transformer, target_transformer, " "expected_X_transformed_shape, expected_Xy_transformed_shape", 39 | [ 40 | (impute_pca_pipeline, LabelEncoder(), (10, 2), (10, 3)), 41 | (impute_pca_pipeline, NALabelEncoder(), (10, 2), (9, 3)), 42 | (FunctionTransformer(to_csr, validate=False), None, (10, 3), (9, 4)), 43 | ], 44 | ) 45 | def test_automl_transformer( 46 | feature_transformer, target_transformer, expected_X_transformed_shape, expected_Xy_transformed_shape 47 | ): 48 | X = np.arange(0, 3 * 10).reshape((10, 3)).astype(np.str) 49 | y = np.array([0] * 5 + [1] * 4 + [np.nan]).astype(np.str) 50 | 51 | header = Header(column_names=["x1", "x2", "x3", "class"], target_column_name="class") 52 | automl_transformer = AutoMLTransformer( 53 | header=header, feature_transformer=feature_transformer, target_transformer=target_transformer, 54 | ) 55 | 56 | model = automl_transformer.fit(X, y) 57 | 58 | X_transformed = 
model.transform(X) 59 | assert X_transformed.shape == expected_X_transformed_shape 60 | 61 | Xy = np.column_stack([X, y]) 62 | 63 | Xy_transformed = model.transform(Xy) 64 | assert Xy_transformed.shape == expected_Xy_transformed_shape 65 | 66 | with pytest.raises(ValueError): 67 | model.transform(X[:, 2:]) 68 | 69 | 70 | def test_automl_transformer_regression(): 71 | """Tests that rows in a regression dataset where the target column is not a finite numeric are imputed""" 72 | data = read_csv_data(source="test/data/csv/regression_na_labels.csv") 73 | X = data[:, :3] 74 | y = data[:, 3] 75 | header = Header(column_names=["x1", "x2", "x3", "class"], target_column_name="class") 76 | automl_transformer = AutoMLTransformer( 77 | header=header, 78 | feature_transformer=RobustImputer(strategy="constant", fill_values=0), 79 | target_transformer=NALabelEncoder(), 80 | ) 81 | model = automl_transformer.fit(X, y) 82 | X_transformed = model.transform(X) 83 | assert X_transformed.shape == X.shape 84 | 85 | Xy = np.concatenate((X, y.reshape(-1, 1)), axis=1) 86 | 87 | Xy_transformed = model.transform(Xy) 88 | assert Xy_transformed.shape == (3, 4) 89 | assert np.array_equal( 90 | Xy_transformed, np.array([[1.1, 1.0, 2.0, 3.0], [2.2, 4.0, 0.0, 5.0], [3.3, 12.0, 13.0, 14.0]]) 91 | ) 92 | -------------------------------------------------------------------------------- /test/test_common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | General tests for all estimators in sagemaker-sklearn-extension. 
16 | """ 17 | import pytest 18 | 19 | from sklearn.utils.estimator_checks import check_estimator 20 | 21 | from sagemaker_sklearn_extension.feature_extraction.text import MultiColumnTfidfVectorizer 22 | from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer 23 | from sagemaker_sklearn_extension.feature_extraction.sequences import TSFeatureExtractor 24 | from sagemaker_sklearn_extension.feature_extraction.sequences import TSFlattener 25 | from sagemaker_sklearn_extension.feature_extraction.sequences import TSFreshFeatureExtractor 26 | from sagemaker_sklearn_extension.impute import RobustImputer 27 | from sagemaker_sklearn_extension.impute import RobustMissingIndicator 28 | from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer 29 | from sagemaker_sklearn_extension.preprocessing import NALabelEncoder 30 | from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures 31 | from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer 32 | from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer 33 | from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder 34 | from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler 35 | from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder 36 | from sagemaker_sklearn_extension.preprocessing import WOEEncoder 37 | 38 | 39 | @pytest.mark.parametrize( 40 | "Estimator", 41 | [ 42 | DateTimeVectorizer(), 43 | LogExtremeValuesTransformer(), 44 | MultiColumnTfidfVectorizer(), 45 | NALabelEncoder(), 46 | QuadraticFeatures(), 47 | QuantileExtremeValuesTransformer(), 48 | RobustImputer(), 49 | RemoveConstantColumnsTransformer(), 50 | RobustLabelEncoder(), 51 | RobustMissingIndicator(), 52 | RobustStandardScaler(), 53 | ThresholdOneHotEncoder(), 54 | WOEEncoder(), 55 | TSFeatureExtractor(), 56 | TSFlattener(), 57 | TSFreshFeatureExtractor(), 58 | ], 59 | ) 60 | def test_all_estimators(Estimator): 61 | return check_estimator(Estimator) 62 | -------------------------------------------------------------------------------- /test/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | import numpy as np 15 | import pytest 16 | from scipy.sparse import csr_matrix, issparse 17 | 18 | from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures, RobustStandardScaler 19 | 20 | 21 | def _n_choose_2(n): 22 | """Calculates the number of 2-combinations of n elements.""" 23 | return (n * (n - 1)) // 2 24 | 25 | 26 | X = np.array([[1.0, 5.0], [2.0, 3.0], [1.0, 1.0],]) 27 | X_sparse = csr_matrix(X) 28 | X_standardized = (X - np.mean(X, axis=0)) / np.std(X, axis=0) 29 | 30 | X_small = np.arange(6).reshape((2, 3)) 31 | X_small_n_rows, X_small_n_cols = X_small.shape 32 | X_small_n_col_combinations = _n_choose_2(X_small_n_cols) 33 | 34 | X_low_nnz = np.array( 35 | [[1.0, 5.0, 0], [2.0, 0.0, 0], [2.0, 1.0, 0], [1.0, 0.0, 1], [2.0, 3.0, 0], [3.0, 0.0, 3], [4.0, 5.0, 0],] 36 | ) 37 | low_nnz_mask = np.where((np.count_nonzero(X_low_nnz, axis=0) / X_low_nnz.shape[0]) > 0.3, 1, 0) 38 | X_low_nnz_standardized = (X_low_nnz - np.mean(X_low_nnz, axis=0) * low_nnz_mask) / np.std(X_low_nnz, axis=0) 39 | 40 | 41 | def test_quadratic_features_explicit(): 42 | """Explicitly test the return value for a small float-filled input matrix.""" 43 | X_observed = QuadraticFeatures().fit_transform(X_standardized) 44 | X_expected = np.hstack( 45 | [ 46 | X_standardized, 47 | (X_standardized[:, 0] * X_standardized[:, 0]).reshape((-1, 1)), 48 | (X_standardized[:, 1] * X_standardized[:, 1]).reshape((-1, 1)), 49 | (X_standardized[:, 0] * X_standardized[:, 1]).reshape((-1, 1)), 50 | ] 51 | ) 52 | np.testing.assert_array_equal(X_observed, X_expected) 53 | 54 | 55 | def test_quadratic_features_max_n_features(): 56 | """Test that small but valid ``max_n_features`` produces a non-complete set of combinations.""" 57 | transformer = QuadraticFeatures(max_n_features=5) 58 | transformer.fit(X_small) 59 | assert len(transformer.combinations_) == 5 - X_small_n_cols 60 | 61 | 62 | @pytest.mark.parametrize( 63 | ["include_bias", "max_n_features"], 64 | [ 65 | # Exactly at limit of what's allowed. 66 | (False, X_small_n_col_combinations), 67 | (True, X_small_n_col_combinations + 1), 68 | # Smaller than limit of what's allowed. 
69 | (False, X_small_n_col_combinations - 1), 70 | (True, X_small_n_col_combinations - 1), 71 | ], 72 | ) 73 | def test_quadratic_features_max_n_features_too_small(include_bias, max_n_features): 74 | """Test that when the ``max_n_features`` parameter is too small, an exception is raised.""" 75 | transformer = QuadraticFeatures(include_bias=include_bias, max_n_features=max_n_features,) 76 | with pytest.raises(ValueError): 77 | transformer.fit(X_small) 78 | 79 | 80 | def test_quadratic_features_random_state_invariance(): 81 | """Test that the exact same input is produced when using the same random seed.""" 82 | transformer1 = QuadraticFeatures(random_state=0) 83 | transformer2 = QuadraticFeatures(random_state=0) 84 | X1 = transformer1.fit_transform(X_small) 85 | X2 = transformer2.fit_transform(X_small) 86 | assert np.all(X1 == X2) 87 | 88 | 89 | @pytest.mark.parametrize( 90 | ["include_bias", "interaction_only", "n_output_features"], 91 | [ 92 | (False, False, X_small_n_cols + 2 * X_small_n_col_combinations), 93 | (True, False, X_small_n_cols + 2 * X_small_n_col_combinations + 1), 94 | (False, True, X_small_n_cols + X_small_n_col_combinations), 95 | (True, True, X_small_n_cols + X_small_n_col_combinations + 1), 96 | ], 97 | ) 98 | def test_quadratic_features_shape(include_bias, interaction_only, n_output_features): 99 | """Test that various parameter values produce expected resulting data shapes.""" 100 | transformer = QuadraticFeatures(include_bias=include_bias, interaction_only=interaction_only,) 101 | XQ = transformer.fit_transform(X_small) 102 | assert XQ.shape == (X_small_n_rows, n_output_features) 103 | 104 | 105 | def test_quadratic_features_single_column_input_explicit(): 106 | """Test that using a single-column matrix as input produces the expected output.""" 107 | X_observed = QuadraticFeatures().fit_transform(X_standardized[:, 0].reshape((-1, 1))) 108 | X_expected = np.hstack([X_standardized[:, [0]], (X_standardized[:, 0] * X_standardized[:, 0]).reshape((-1, 1)),]) 109 | np.testing.assert_array_equal(X_observed, X_expected) 110 | 111 | 112 | def test_robust_standard_scaler_dense(): 113 | scaler = RobustStandardScaler() 114 | X_observed = scaler.fit_transform(X) 115 | 116 | np.testing.assert_array_equal(X_observed, X_standardized) 117 | 118 | 119 | def test_robust_standard_scaler_sparse(): 120 | scaler = RobustStandardScaler() 121 | X_observed = scaler.fit_transform(X_sparse) 122 | 123 | assert issparse(X_observed) 124 | np.testing.assert_array_almost_equal(X_observed.toarray(), X / np.std(X, axis=0)) 125 | 126 | 127 | def test_robust_standard_dense_with_low_nnz_columns(): 128 | scaler = RobustStandardScaler() 129 | X_observed = scaler.fit_transform(X_low_nnz) 130 | np.testing.assert_array_almost_equal(X_observed, X_low_nnz_standardized) 131 | -------------------------------------------------------------------------------- /test/test_date_time.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. 
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from datetime import datetime 15 | import numpy as np 16 | import pytest 17 | 18 | from dateutil import parser 19 | 20 | from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer, DateTimeDefinition 21 | 22 | 23 | data_array = [ 24 | [parser.parse("Jan 5th, 2012, 12:34am")], 25 | [parser.parse("Feb 2, 2011, 2:34:04am")], 26 | [parser.parse("Jan 1st, 2012, 11:59:59pm")], 27 | [parser.parse("Dec 2th, 2012, 12:00am")], 28 | [parser.parse("Jan 3th, 2012, 12:34am")], 29 | [parser.parse("Jan 3th, 2018, 1:34am")], 30 | ] 31 | 32 | data = np.array(data_array) 33 | 34 | 35 | @pytest.mark.parametrize("data_shape", [(2, 3), (2, 3, 4), (2,)]) 36 | def test_cyclic_transform_outputs_correct_shape(data_shape): 37 | size = int(np.prod(data_shape)) 38 | data = np.arange(size).reshape(data_shape) 39 | ret = DateTimeVectorizer._cyclic_transform(data, low=0, high=size - 1) 40 | 41 | new_shape = list(data_shape) 42 | new_shape[-1] *= 2 43 | new_shape = tuple(new_shape) 44 | assert ret.shape == new_shape 45 | 46 | ret = ret.reshape((-1, 2)) 47 | ret = ret ** 2 48 | assert np.linalg.norm(np.sum(ret, axis=1) - 1) < 1e-8 49 | 50 | 51 | @pytest.mark.parametrize("mode", ["ordinal", "cyclic"]) 52 | def test_fit_transform_works_with_non_np_input(mode): 53 | dtv = DateTimeVectorizer( 54 | mode=mode, 55 | extract=[ 56 | DateTimeDefinition.HOUR.value, 57 | DateTimeDefinition.SECOND.value, 58 | DateTimeDefinition.YEAR.value, 59 | DateTimeDefinition.MONTH.value, 60 | ], 61 | ) 62 | output = dtv.fit_transform(data_array) 63 | assert output.shape[0] == len(data_array) 64 | assert output.shape[1] > 1 65 | 66 | 67 | @pytest.mark.parametrize("data_shape", [(2, 3), (2, 3, 4), (2,)]) 68 | def test_cyclic_transform_outputs_correct_cyclic_values(data_shape): 69 | size = int(np.prod(data_shape)) 70 | data = np.arange(size).reshape(data_shape) 71 | ret = DateTimeVectorizer._cyclic_transform(data, low=0, high=size - 1) 72 | ret = ret.reshape((-1, 2)) 73 | ret = ret ** 2 74 | assert np.linalg.norm(np.sum(ret, axis=1) - 1) < 1e-8 75 | 76 | 77 | def test_fit_eliminates_constant_columns(): 78 | dtv = DateTimeVectorizer( 79 | mode="ordinal", 80 | extract=[ 81 | DateTimeDefinition.HOUR.value, 82 | DateTimeDefinition.SECOND.value, 83 | DateTimeDefinition.YEAR.value, 84 | DateTimeDefinition.MONTH.value, 85 | ], 86 | ) 87 | # taking only odd items. Year and month are always the same. 88 | cur_data = data.reshape((-1, 2))[:, 0].reshape((-1, 1)) 89 | dtv = dtv.fit(cur_data) 90 | # Year and month are constants, make sure they are out 91 | assert dtv.extract_ == [DateTimeDefinition.HOUR.value, DateTimeDefinition.SECOND.value] 92 | 93 | 94 | @pytest.mark.parametrize("mode", ["ordinal", "cyclic"]) 95 | def test_fit_eliminates_constant_columns_multicol_input(mode): 96 | # set up data. 
Properties: 97 | # Hour: Constant throughout - eliminate 98 | # Year: Constant in both, but has a different value across columns - should eliminate 99 | # Month: Constant in column 2, not in 1 - should not eliminate 100 | # Day of month: not constant in either column - should not eliminate 101 | col1 = [ 102 | parser.parse("Jan 5th, 2012"), 103 | parser.parse("Feb 2, 2012"), 104 | parser.parse("Jan 1st, 2012"), 105 | ] 106 | col2 = [ 107 | parser.parse("Dec 2th, 2013"), 108 | parser.parse("Dec 3th, 2013"), 109 | parser.parse("Dec 3th, 2013"), 110 | ] 111 | 112 | cur_data = np.array([col1, col2]).T 113 | 114 | dtv = DateTimeVectorizer( 115 | mode=mode, 116 | extract=[ 117 | DateTimeDefinition.HOUR.value, 118 | DateTimeDefinition.DAY_OF_MONTH.value, 119 | DateTimeDefinition.YEAR.value, 120 | DateTimeDefinition.MONTH.value, 121 | ], 122 | ) 123 | # fit on the full two-column data 124 | dtv = dtv.fit(cur_data) 125 | # Hour and year are constant, make sure they are out 126 | assert dtv.extract_ == [DateTimeDefinition.DAY_OF_MONTH.value, DateTimeDefinition.MONTH.value] 127 | 128 | 129 | def test_transform_categorical(): 130 | extract_keys = [k for k in dir(DateTimeDefinition) if not k.startswith("_")] 131 | extract = [DateTimeDefinition.__dict__[k].value for k in extract_keys] 132 | dtv = DateTimeVectorizer(mode="ordinal", extract=extract, ignore_constant_columns=False) 133 | dtv.fit(data) 134 | output = dtv.transform(data) 135 | 136 | assert np.all(output >= 0) 137 | 138 | loc_year = extract_keys.index("YEAR") 139 | np.testing.assert_array_equal(output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])) 140 | 141 | loc_month = extract_keys.index("MONTH") 142 | np.testing.assert_array_equal(output[:, loc_month], np.array([0, 1, 0, 11, 0, 0])) 143 | 144 | 145 | def test_transform_cyclic_leaves_year(): 146 | extract_keys = [k for k in dir(DateTimeDefinition) if not k.startswith("_")] 147 | extract = [DateTimeDefinition.__dict__[k].value for k in extract_keys] 148 | 149 | dtv = DateTimeVectorizer(mode="cyclic", extract=extract, ignore_constant_columns=False) 150 | dtv.fit(data) 151 | output = dtv.transform(data) 152 | 153 | loc_year = extract_keys.index("YEAR") 154 | loc_year *= 2 155 | np.testing.assert_array_equal(output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])) 156 | 157 | assert output.shape[1] == len(extract) * 2 - 1 158 | 159 | 160 | def test_fit_transform_cyclic_leaves_year(): 161 | extract_keys = [k for k in dir(DateTimeDefinition) if not k.startswith("_")] 162 | extract = [DateTimeDefinition.__dict__[k].value for k in extract_keys] 163 | 164 | dtv = DateTimeVectorizer(mode="cyclic", extract=extract, ignore_constant_columns=False) 165 | output = dtv.fit_transform(data) 166 | 167 | loc_year = extract_keys.index("YEAR") 168 | loc_year *= 2 169 | np.testing.assert_array_equal(output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])) 170 | 171 | assert output.shape[1] == len(dtv.extract_) * 2 - 1 172 | 173 | 174 | def test_fit_transform_accepts_mixed_str_datetime(): 175 | cur_data_array = data_array + [["Feb 12th, 15:33, 2011"], ["Nov 5th, 1am, 1975"], [432], [None], ["Feb 45th, 2018"]] 176 | 177 | dtv = DateTimeVectorizer(mode="ordinal") 178 | processed = dtv.fit_transform(cur_data_array) 179 | year_location = dtv.extract_.index(DateTimeDefinition.YEAR.value) 180 | assert processed[0, year_location] == 2012 181 | assert processed[-4, year_location] == 1975 182 | assert np.isnan(processed[-3, year_location]) 183 | assert 
np.isnan(processed[-2, year_location]) 184 | assert np.isnan(processed[-1, year_location]) 185 | 186 | dtv = DateTimeVectorizer(mode="cyclic") 187 | processed = dtv.fit_transform(cur_data_array) 188 | assert all(np.isnan(processed[-1])) 189 | assert not any(np.isnan(processed[-4])) 190 | assert not any(np.isnan(processed[0])) 191 | 192 | 193 | def test_fit_transform_default_datetime(): 194 | cur_data_array = [["Monday"], ["Tuesday"], ["Friday"]] 195 | 196 | dtv = DateTimeVectorizer(mode="ordinal", ignore_constant_columns=False, default_datetime=datetime(1900, 1, 1)) 197 | processed = dtv.fit_transform(cur_data_array) 198 | year_location = dtv.extract_.index(DateTimeDefinition.YEAR.value) 199 | month_location = dtv.extract_.index(DateTimeDefinition.MONTH.value) 200 | weekday_location = dtv.extract_.index(DateTimeDefinition.WEEKDAY.value) 201 | 202 | assert processed[0, year_location] == 1900 203 | assert processed[0, month_location] == 0 204 | assert processed[0, weekday_location] == 0 205 | 206 | assert processed[1, year_location] == 1900 207 | assert processed[1, month_location] == 0 208 | assert processed[1, weekday_location] == 1 209 | 210 | assert processed[2, year_location] == 1900 211 | assert processed[2, month_location] == 0 212 | assert processed[2, weekday_location] == 4 213 | -------------------------------------------------------------------------------- /test/test_feature_extraction_text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | import numpy as np 15 | import pytest 16 | import scipy.sparse as sp 17 | 18 | from sagemaker_sklearn_extension.feature_extraction.text import MultiColumnTfidfVectorizer 19 | 20 | from sklearn.feature_extraction.text import TfidfVectorizer 21 | 22 | 23 | corpus = np.array( 24 | [ 25 | ["Cats eat rats.", "Rats are mammals."], 26 | ["Dogs chase cats.", "Cats have ears."], 27 | ["People like dogs.", "People are mammals."], 28 | ["People hate rats.", "Rats are quite smart."], 29 | ] 30 | ) 31 | 32 | 33 | def test_multi_column_tfidf_vectorizer(): 34 | vec = MultiColumnTfidfVectorizer() 35 | output = vec.fit_transform(corpus) 36 | 37 | assert isinstance(output, sp.coo.coo_matrix) 38 | 39 | observed = output.todense() 40 | expected = np.hstack( 41 | [ 42 | TfidfVectorizer().fit_transform(corpus[:, 0]).todense(), 43 | TfidfVectorizer().fit_transform(corpus[:, 1]).todense(), 44 | ] 45 | ) 46 | 47 | np.testing.assert_array_equal(observed, expected) 48 | 49 | 50 | def test_multi_column_tfidf_vectorizer_fit_dim_error(): 51 | with pytest.raises(ValueError): 52 | vec = MultiColumnTfidfVectorizer() 53 | vec.fit(corpus[0]) 54 | 55 | 56 | def test_multi_column_tfidf_vectorizer_transform_dim_error(): 57 | with pytest.raises(ValueError): 58 | vec = MultiColumnTfidfVectorizer() 59 | vec.fit(corpus) 60 | vec.transform(corpus[0]) 61 | 62 | 63 | def test_multi_column_tfidf_vectorizer_vocabulary_sizes_large(): 64 | vocabulary_sizes = [TfidfVectorizer().fit_transform(corpus[:, i]).shape[1] + 1 for i in range(corpus.shape[1])] 65 | vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=vocabulary_sizes) 66 | observed = vectorizer.fit_transform(corpus) 67 | assert observed.shape[1] == sum(vocabulary_sizes) 68 | assert sp.issparse(observed) 69 | 70 | 71 | def test_multi_column_tfidf_vectorizer_vocabulary_sizes_small(): 72 | vocabulary_sizes = [TfidfVectorizer().fit_transform(corpus[:, i]).shape[1] - 1 for i in range(corpus.shape[1])] 73 | vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=vocabulary_sizes) 74 | observed = vectorizer.fit_transform(corpus) 75 | assert observed.shape[1] == sum(vocabulary_sizes) 76 | assert sp.issparse(observed) 77 | 78 | 79 | def test_multi_column_tfidf_vectorizer_vocabulary_sizes_error(): 80 | with pytest.raises(ValueError): 81 | vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=[1]) 82 | vectorizer.fit(corpus) 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "kwargs, data, shape", 87 | [ 88 | ({"min_df": 0.9}, corpus, (4, 0)), 89 | ({"max_df": 0.1}, corpus, (4, 0)), 90 | ({"max_df": 0.9941}, np.array([[""], [""], [""]]), (3, 0)), 91 | ], 92 | ) 93 | def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_on(kwargs, data, shape): 94 | """Tests for empty matrix when no terms remain after pruning""" 95 | vec = MultiColumnTfidfVectorizer(**kwargs) 96 | output = vec.fit_transform(data) 97 | assert output.shape == shape 98 | 99 | 100 | @pytest.mark.parametrize( 101 | "kwargs, data", 102 | [ 103 | ({"min_df": 0.9, "ignore_columns_with_zero_vocabulary_size": False}, corpus), 104 | ({"max_df": 0.1, "ignore_columns_with_zero_vocabulary_size": False}, corpus), 105 | ({"max_df": 0.9941, "ignore_columns_with_zero_vocabulary_size": False}, np.array([[""], [""], [""]])), 106 | ], 107 | ) 108 | def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_off(kwargs, data): 109 | """Tests for ValueError when no terms remain after pruning and `ignore_columns_with_zero_vocabulary_size=False`""" 110 | with pytest.raises(ValueError): 111 | vec = 
MultiColumnTfidfVectorizer(**kwargs) 112 | vec.fit_transform(data) 113 | 114 | 115 | @pytest.mark.parametrize("kwargs, output_shape", [({"min_df": 0.9}, (4, 3)), ({"max_df": 0.9}, (4, 8))]) 116 | def test_multi_column_tfidf_vectorizer_one_column_zero_output_tokens(kwargs, output_shape): 117 | """Tests that a TF-IDF document-term matrix is still returned when only one column breaks""" 118 | corpus = np.array( 119 | [ 120 | ["Cats eat rats.", "Rats are mammals."], 121 | ["Dogs chase cats.", "Rats are mammals."], 122 | ["People like dogs.", "Rats are mammals."], 123 | ["People hate rats.", "Rats are mammals."], 124 | ] 125 | ) 126 | 127 | vec = MultiColumnTfidfVectorizer(**kwargs) 128 | output = vec.fit_transform(corpus) 129 | assert output.shape == output_shape 130 | -------------------------------------------------------------------------------- /test/test_header.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import pytest 15 | 16 | from sagemaker_sklearn_extension.externals import Header 17 | 18 | 19 | @pytest.mark.parametrize("names, col_idx, feature_idx", [(["a"], [0], [0]), (["a", "c"], [0, 2], [0, 1])]) 20 | def test_header_happy(names, col_idx, feature_idx): 21 | h = Header(column_names=["a", "b", "c"], target_column_name="b") 22 | assert h.target_column_index == 1 23 | assert h.as_feature_indices(names) == feature_idx 24 | assert h.as_column_indices(names) == col_idx 25 | assert h.num_features == 2 26 | assert h.num_columns == 3 27 | 28 | 29 | def test_header_errors_target_missing(): 30 | with pytest.raises(ValueError): 31 | Header(column_names=["a", "b"], target_column_name="c") 32 | 33 | 34 | @pytest.mark.parametrize("column_names, target_column", [(["a", "b", "b", "c"], "c"), (["a", "b", "c", "c"], "c")]) 35 | def test_header_errors_duplicate_columns(column_names, target_column): 36 | with pytest.raises(ValueError): 37 | Header(column_names=column_names, target_column_name=target_column) 38 | 39 | 40 | @pytest.mark.parametrize( 41 | "names, error_regex", 42 | [(["unknown"], "'unknown' is an unknown feature name"), (["b"], "'b' is the target column name.")], 43 | ) 44 | def test_header_error_as_feature_indices(names, error_regex): 45 | h = Header(column_names=["a", "b", "c"], target_column_name="b") 46 | assert h.target_column_index == 1 47 | with pytest.raises(ValueError) as err: 48 | h.as_feature_indices(names) 49 | err.match(error_regex) 50 | 51 | 52 | def test_header_error_as_column_index(): 53 | h = Header(column_names=["a", "b", "c"], target_column_name="b") 54 | assert h.target_column_index == 1 55 | with pytest.raises(ValueError): 56 | h.as_column_indices(["unknown"]) 57 | 58 | 59 | def test_header_feature_column_index_order(): 60 | h = Header(column_names=["a", "b", "c", "d"], target_column_name="c") 61 | assert h.feature_column_indices == [0, 1, 3] 62 | 
-------------------------------------------------------------------------------- /test/test_impute.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | 17 | from sklearn.utils.testing import assert_array_equal 18 | 19 | from sagemaker_sklearn_extension.impute import RobustImputer, RobustMissingIndicator, is_finite_numeric 20 | 21 | X_impute = np.array([[np.nan, 2, np.inf], [4, np.inf, 6], [10, np.nan, 10]]) 22 | X_impute_boolean_mask = np.array([[True, False, True], [False, True, False], [False, True, False]]) 23 | X_impute_string = X_impute.astype("O") 24 | X_impute_mixed = np.array([["2", "a"], ["inf", "nan"], ["-1e2", "10.0"], ["0.0", "foobar"], ["-inf", "8"]]) 25 | X_impute_mixed_boolean_mask = np.array([[False, True], [True, True], [False, False], [False, True], [True, False]]) 26 | X_impute_categorical = np.array([["hot dog"], ["hot dog"], ["hot dog"], ["banana"]]) 27 | X_imputed_median = np.array([[7.0, 2.0, 8.0], [4.0, 2.0, 6.0], [10.0, 2.0, 10.0]]) 28 | X_imputed_constant = np.array([[1.0, 2.0, 13.0], [4.0, 7.0, 6.0], [10.0, 7.0, 10.0]]) 29 | X_imputed_mixed = np.array([[2.0, 9.0], [0.0, 9.0], [-1e2, 10.0], [0.0, 9.0], [0.0, 8.0]]) 30 | X_imputed_categorical = np.array([["hot dog"], ["hot dog"], ["hot dog"], ["not hot dog"]]) 31 | 32 | transform_error_msg = "'transform' input X has 4 features per sample, expected 3 from 'fit' input" 33 | fill_values_error_msg = "'fill_values' should have length equal to number of features in X 3, got 5" 34 | 35 | 36 | @pytest.mark.parametrize( 37 | "val, expected", [(np.array([1738, "10", np.inf, np.nan, "foobar"]), np.array([True, True, False, False, False]))] 38 | ) 39 | def test_is_finite_numeric(val, expected): 40 | observed = is_finite_numeric(val) 41 | assert_array_equal(observed, expected) 42 | 43 | 44 | @pytest.mark.parametrize( 45 | "X, X_expected, strategy, fill_values", 46 | [ 47 | (X_impute_mixed, X_imputed_mixed, "median", None), 48 | (X_impute, X_imputed_median, "median", None), 49 | (X_impute_string, X_imputed_median, "median", None), 50 | (X_impute, X_imputed_constant, "constant", [1.0, 7.0, 13.0]), 51 | (X_impute_string, X_imputed_constant, "constant", [1.0, 7.0, 13.0]), 52 | ], 53 | ) 54 | def test_robust_imputer(X, X_expected, strategy, fill_values): 55 | robust_imputer = RobustImputer(strategy=strategy, fill_values=fill_values) 56 | robust_imputer.fit(X) 57 | X_observed = robust_imputer.transform(X) 58 | 59 | assert_array_equal(X_observed, X_expected) 60 | 61 | 62 | def test_robust_imputer_categorical_custom_function(): 63 | robust_imputer = RobustImputer( 64 | dtype=np.dtype("O"), strategy="constant", fill_values="not hot dog", mask_function=lambda x: x == "hot dog" 65 | ) 66 | robust_imputer.fit(X_impute_categorical) 67 | X_observed = robust_imputer.transform(X_impute_categorical) 68 | 69 | assert_array_equal(X_observed, 
X_imputed_categorical) 70 | 71 | 72 | def test_robust_imputer_transform_dim_error(): 73 | with pytest.raises(ValueError, match=transform_error_msg): 74 | robust_imputer = RobustImputer() 75 | robust_imputer.fit(X_impute) 76 | robust_imputer.transform(np.zeros((3, 4))) 77 | 78 | 79 | def test_robust_imputer_fill_values_dim_error(): 80 | with pytest.raises(ValueError, match=fill_values_error_msg): 81 | robust_imputer = RobustImputer(strategy="constant", fill_values=np.zeros(5)) 82 | robust_imputer.fit(X_impute) 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "X, boolean_mask_X", [(X_impute_mixed, X_impute_mixed_boolean_mask), (X_impute, X_impute_boolean_mask)] 87 | ) 88 | def test_robust_missing_indicator(X, boolean_mask_X): 89 | robust_indicator = RobustMissingIndicator() 90 | robust_indicator.fit(X) 91 | boolean_mask_X_observed = robust_indicator.transform(X) 92 | 93 | assert_array_equal(boolean_mask_X_observed, boolean_mask_X) 94 | -------------------------------------------------------------------------------- /test/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | 17 | from sagemaker_sklearn_extension.preprocessing import ( 18 | LogExtremeValuesTransformer, 19 | QuantileExtremeValuesTransformer, 20 | RemoveConstantColumnsTransformer, 21 | log_transform, 22 | quantile_transform_nonrandom, 23 | ) 24 | 25 | np.random.seed(0) 26 | 27 | X_zeros = np.zeros((10, 10)) 28 | X_extreme_vals = np.array( 29 | [ 30 | [0.0, 0.0, 0.0], 31 | [-1.0, 1.0, 1.0], 32 | [-2.0, 2.0, 2.0], 33 | [-3.0, 3.0, 3.0], 34 | [-4.0, 4.0, 4.0], 35 | [-5.0, 5.0, 5.0], 36 | [-6.0, 6.0, 6.0], 37 | [-7.0, 7.0, 7.0], 38 | [-8.0, 8.0, 8.0], 39 | [-9.0, 9.0, 9.0], 40 | [-10.0, 10.0, 10.0], 41 | [-1e5, 1e6, 11.0], 42 | ] 43 | ) 44 | X_log_extreme_vals = np.column_stack( 45 | [log_transform(X_extreme_vals.copy()[:, 0]), log_transform(X_extreme_vals.copy()[:, 1]), X_extreme_vals[:, 2]] 46 | ) 47 | X_quantile_extreme_vals = np.column_stack( 48 | [ 49 | quantile_transform_nonrandom(X_extreme_vals.copy()[:, 0]), 50 | quantile_transform_nonrandom(X_extreme_vals.copy()[:, 1]), 51 | X_extreme_vals[:, 2], 52 | ] 53 | ) 54 | X_all_positive = 5 * np.random.random((100, 1)) + 20 55 | X_extreme_all_positive = np.vstack([np.random.random((90, 1)) + 100, np.array(10 * [[5]], dtype=np.float64)]) 56 | X_log_extreme_all_positive = np.array([log_transform(X_extreme_all_positive.copy()[:, 0])]).reshape(-1, 1) 57 | X_all_uniques = np.arange(20).reshape(4, 5) 58 | X_one_val = np.column_stack([np.arange(20).reshape(4, 5), np.array([1, 1, 1, 1])]) 59 | X_nans = np.array([[np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]]) 60 | X_no_uniques = np.zeros((4, 5)) 61 | 62 | 63 | @pytest.mark.parametrize( 64 | "X, X_expected", 65 | [ 66 | (X_all_uniques, X_all_uniques), 67 | (X_one_val, X_one_val[:, :5]), 68 | (X_nans, 
np.empty((0, 3))), 69 | (X_no_uniques, np.empty((0, 5))), 70 | ], 71 | ) 72 | def test_remove_constant_columns_transformer(X, X_expected): 73 | transformer = RemoveConstantColumnsTransformer() 74 | X_observed = transformer.fit_transform(X) 75 | 76 | np.testing.assert_array_equal(X_observed, X_expected) 77 | 78 | 79 | @pytest.mark.parametrize( 80 | ["X", "X_expected"], 81 | [ 82 | (X_extreme_vals, X_log_extreme_vals,), 83 | (X_zeros, X_zeros), 84 | (X_all_positive, X_all_positive), 85 | (X_extreme_all_positive, X_log_extreme_all_positive), 86 | ], 87 | ) 88 | def test_log_extreme_value_transformer(X, X_expected): 89 | transformer = LogExtremeValuesTransformer(threshold_std=2.0) 90 | X_observed = transformer.fit_transform(X) 91 | 92 | np.testing.assert_array_almost_equal(X_observed, X_expected) 93 | 94 | 95 | def test_log_extreme_value_transformer_state(): 96 | t = LogExtremeValuesTransformer(threshold_std=2.0) 97 | X_observed = t.fit_transform(X_extreme_vals) 98 | 99 | np.testing.assert_array_almost_equal(t.nonnegative_cols_, [1, 2]) 100 | np.testing.assert_array_almost_equal(X_observed, X_log_extreme_vals) 101 | 102 | 103 | @pytest.mark.parametrize( 104 | ["X", "X_expected"], 105 | [(X_extreme_vals, X_quantile_extreme_vals), (X_zeros, X_zeros), (X_all_positive, X_all_positive),], 106 | ) 107 | def test_extreme_value_transformer(X, X_expected): 108 | transformer = QuantileExtremeValuesTransformer(threshold_std=2.0) 109 | X_observed = transformer.fit_transform(X) 110 | 111 | np.testing.assert_array_almost_equal(X_observed, X_expected) 112 | -------------------------------------------------------------------------------- /test/test_read_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | import psutil 15 | 16 | import csv 17 | from contextlib import contextmanager 18 | import json 19 | import numpy as np 20 | import os 21 | import pytest 22 | 23 | from mlio import list_files 24 | from mlio import InMemoryStore, SageMakerPipe 25 | from mlio import File as mlio_file 26 | from sagemaker_sklearn_extension.externals.read_data import _convert_megabytes_to_bytes 27 | from sagemaker_sklearn_extension.externals.read_data import _get_data 28 | from sagemaker_sklearn_extension.externals.read_data import _get_reader 29 | from sagemaker_sklearn_extension.externals.read_data import _get_size_total 30 | from sagemaker_sklearn_extension.externals.read_data import _read_to_fit_memory 31 | from sagemaker_sklearn_extension.externals.read_data import read_csv_data 32 | 33 | 34 | DATA_FILES = [ 35 | "test/data/csv/mock_datasplitter_output/manual.csv", 36 | "test/data/csv/mock_datasplitter_output/newline.csv", 37 | "test/data/csv/mock_datasplitter_output/excel.csv", 38 | "test/data/csv/mock_datasplitter_output/oneline.csv", 39 | "test/data/csv/missing_values.csv", 40 | "test/data/csv/dictionaries.csv", 41 | "test/data/csv/dirty.csv", 42 | ] 43 | DATA_FILES_SHAPE = [(8, 4), (10, 4), (3, 4), (1, 4), (7, 5), (147, 18), (19, 16)] 44 | LARGE_DATA_4MB = "test/data/csv/kc_house_data.csv" 45 | BUFFER_DATA = ( 46 | "1,2,3,4\n" 47 | + "5,6,7,8\n" 48 | + "9,10,11,12\n" 49 | + "13,14,15,16\n" 50 | + "17,18,19,20\n" 51 | + "21,22,23,24\n" 52 | + "25,26,27,28\n" 53 | + "29,30,31,32" 54 | ) 55 | 56 | 57 | @contextmanager 58 | def managed_env_var(cfg): 59 | os.environ.update({"SM_INPUT_DATA_CONFIG": json.dumps(cfg)}) 60 | try: 61 | yield os.environ 62 | finally: 63 | os.environ.pop("SM_INPUT_DATA_CONFIG") 64 | 65 | 66 | csv1 = [ 67 | ["1.0", 2.0, "3", 4, ""], 68 | ["a,b", "c\nd", "f", '"""', np.nan], 69 | ] 70 | csv2 = [ 71 | [10, "2\r\n4", "hello", 4.0, "!"], 72 | [" space", "", "space ", "\n", "hello\n"], 73 | ['{a: 5, b: "hello"}', "[a, b, 2]", "[]", "nan", " "], 74 | ] 75 | 76 | 77 | @pytest.fixture(scope="session") 78 | def csv_data_dir(tmpdir_factory): 79 | """Fixture which fills a temporary directory with (multiple) csv file(s).""" 80 | csv_data_directory = tmpdir_factory.mktemp("csv_file_paths") 81 | csv_file1 = csv_data_directory.join("file_1.csv") 82 | csv_file2 = csv_data_directory.join("file_2.csv") 83 | 84 | with open(csv_file1.strpath, "w") as csv_file_handle: 85 | csv_writer = csv.writer(csv_file_handle, dialect="excel") 86 | csv_writer.writerows(csv1) 87 | with open(csv_file2.strpath, "w") as csv_file_handle: 88 | csv_writer = csv.writer(csv_file_handle, dialect="excel") 89 | csv_writer.writerows(csv2) 90 | 91 | return str(csv_data_directory) 92 | 93 | 94 | def test_excel_dialect(csv_data_dir): 95 | """Test that read_csv_data function properly reads files in the excel dialect.""" 96 | generated_contents = read_csv_data(source=csv_data_dir + "/file_1.csv") 97 | 98 | assert generated_contents.shape == (len(csv1), len(csv1[0])) 99 | assert np.all(generated_contents == np.array([[str(v) for v in row] for row in csv1], dtype=np.str)) 100 | 101 | 102 | def test_directory_content(csv_data_dir): 103 | """Test that read_csv_data function reads content correctly from a directory""" 104 | generated_contents = read_csv_data(source=csv_data_dir) 105 | correct_array = csv1 + csv2 106 | assert generated_contents.shape == (len(correct_array), len(correct_array[0])) 107 | assert np.all(generated_contents == np.array([[str(v) for v in row] for row in correct_array], dtype=np.str)) 108 | 109 | 110 | def 
test_get_reader_pipe_mode(): 111 | """Test for getting a 'CsvReader' object with 'Pipe' mode""" 112 | with managed_env_var({"abc": {"TrainingInputMode": "Pipe"}}): 113 | reader = _get_data(source="abc") 114 | assert isinstance(reader[0], SageMakerPipe) 115 | 116 | 117 | def test_get_reader_file_mode(): 118 | """Test for getting a 'CsvReader' object with 'File' mode""" 119 | source = "test/data/csv/mock_datasplitter_output" 120 | with managed_env_var({os.path.basename(source): {"TrainingInputMode": "File"}}): 121 | reader = _get_data(source=source) 122 | assert isinstance(reader[0], mlio_file) 123 | 124 | 125 | def test_get_reader_mlio_file_object(): 126 | """Test for getting a 'CsvReader' with a mlio.File object source""" 127 | source = "test/data/csv/mock_datasplitter_output" 128 | files = list_files(source, pattern="*") 129 | reader = _get_data(source=files[0]) 130 | assert isinstance(reader[0], mlio_file) 131 | 132 | 133 | def test_get_reader_inmemory_mode(): 134 | """Test for getting a 'CsvReader' object with 'InMemory' mode""" 135 | buffer = BUFFER_DATA.encode() 136 | reader = _get_data(source=buffer) 137 | assert isinstance(reader[0], InMemoryStore) 138 | 139 | 140 | def test_read_csv_data_inmemory_mode(): 141 | """Test to make sure 'InMemory' mode reads in content correctly""" 142 | generated_contents = read_csv_data(source=BUFFER_DATA.encode()) 143 | correct_array = [] 144 | for i in range(8): 145 | correct_array.append([i * 4 + j for j in range(1, 5)]) 146 | assert generated_contents.shape == (len(correct_array), len(correct_array[0])) 147 | assert np.all(generated_contents == np.array([[str(v) for v in row] for row in correct_array], dtype=np.str)) 148 | 149 | 150 | def test_read_empty_buffer(): 151 | """Test for getting an empty array if the buffer is empty""" 152 | generated_contents = read_csv_data(source="".encode()) 153 | assert generated_contents.size == 0 154 | 155 | 156 | def test_get_reader_no_env_var(): 157 | """Test for getting a 'CsvReader' object with no environmental variable""" 158 | reader = _get_data(source="test/data/csv/mock_datasplitter_output") 159 | assert isinstance(reader[0], mlio_file) 160 | 161 | 162 | @pytest.mark.parametrize("cfg, expected_error", [({}, KeyError), ({"abc": {}}, KeyError),]) 163 | def test_get_reader_error_malformed_channel_cfg(cfg, expected_error): 164 | """Test for reading from an invalid channel""" 165 | with pytest.raises(expected_error): 166 | with managed_env_var(cfg): 167 | _get_reader(source="abc", batch_size=1000) 168 | 169 | 170 | def test_get_reader_incorrect_path(): 171 | """Test for reading from a path that doesn't exist""" 172 | with pytest.raises(FileNotFoundError): 173 | _get_reader(source="incorrect", batch_size=100) 174 | 175 | 176 | def test_read_csv_data_invalid_csv(): 177 | with pytest.raises(RuntimeError): 178 | read_csv_data(source="test/data/csv/invalid.csv") 179 | 180 | 181 | @pytest.mark.parametrize("data_file, shape", [(file, shape) for file, shape in zip(DATA_FILES, DATA_FILES_SHAPE)]) 182 | def test_read_csv_data(data_file, shape): 183 | """Test for reading individual csv data files""" 184 | array = read_csv_data(source=data_file, batch_size=1, fit_memory_percent=100.0, output_dtype="U") 185 | assert array.shape == shape 186 | assert array.dtype.kind in {"U", "S"} 187 | 188 | 189 | def test_read_csv_data_directory(): 190 | """Test for reading from a directory of data""" 191 | array = read_csv_data(source="test/data/csv/mock_datasplitter_output", fit_memory_percent=100.0) 192 | assert array.shape == (22, 4) 
193 | 194 | 195 | def test_read_csv_data_sample_append(): 196 | """Test for reading data in chunks.""" 197 | array = read_csv_data(source=LARGE_DATA_4MB, fit_memory_percent=100.0) 198 | assert array.shape == (38223, 21) 199 | 200 | 201 | def test_read_csv_data_samples(): 202 | """Test for sample case where the entire dataset doesn't fit into the available memory""" 203 | total_memory_in_bytes = psutil.virtual_memory().total 204 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 205 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 206 | sample_data = read_csv_data( 207 | source=LARGE_DATA_4MB, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="U" 208 | ) 209 | assert sample_data.dtype.kind == "U" 210 | assert _convert_megabytes_to_bytes(1.9) < sample_data.nbytes <= two_mb_in_bytes 211 | 212 | 213 | def test_read_csv_data_split(): 214 | X, y = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="U") 215 | yX = read_csv_data(LARGE_DATA_4MB, output_dtype="U") 216 | assert X.shape == (38223, 20) 217 | assert y.shape == (38223,) 218 | assert np.array_equal(np.hstack((y.reshape(-1, 1), X)).astype(str), yX) 219 | assert X.dtype.kind == "U" 220 | assert y.dtype.kind == "U" 221 | 222 | 223 | def test_read_csv_data_split_limited(): 224 | total_memory_in_bytes = psutil.virtual_memory().total 225 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 226 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 227 | X, y = read_csv_data( 228 | LARGE_DATA_4MB, target_column_index=0, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="U" 229 | ) 230 | assert _convert_megabytes_to_bytes(1.9) < (X.nbytes + y.nbytes) <= two_mb_in_bytes 231 | assert X.dtype.kind == "U" 232 | assert y.dtype.kind == "U" 233 | 234 | 235 | def test_read_csv_data_samples_object(): 236 | """Test for sample case where the entire dataset doesn't fit into the available memory""" 237 | total_memory_in_bytes = psutil.virtual_memory().total 238 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 239 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 240 | sample_data = read_csv_data( 241 | source=LARGE_DATA_4MB, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="object" 242 | ) 243 | array_memory = _get_size_total(sample_data) 244 | assert _convert_megabytes_to_bytes(1.9) < array_memory <= two_mb_in_bytes 245 | assert sample_data.dtype.kind == "O" 246 | 247 | 248 | def test_read_csv_data_split_object(): 249 | X, y = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="O") 250 | yX = read_csv_data(LARGE_DATA_4MB, output_dtype="O") 251 | assert X.shape == (38223, 20) 252 | assert y.shape == (38223,) 253 | assert np.array_equal(np.hstack((y.reshape(-1, 1), X)), yX) 254 | assert X.dtype.kind == "O" 255 | assert y.dtype.kind == "O" 256 | 257 | 258 | def test_read_csv_data_split_limited_object(): 259 | total_memory_in_bytes = psutil.virtual_memory().total 260 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 261 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 262 | X, y = read_csv_data( 263 | LARGE_DATA_4MB, target_column_index=0, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="O" 264 | ) 265 | arrays_memory = _get_size_total(X) + _get_size_total(y) 266 | assert _convert_megabytes_to_bytes(1.9) < arrays_memory <= two_mb_in_bytes 267 | assert X.dtype.kind == "O" 268 | assert y.dtype.kind == "O" 269 | 270 | 271 | @pytest.mark.parametrize("output_dtype", ["O", "U"]) 272 | def 
test_read_to_fit_memory_dangling_element(tmpdir_factory, output_dtype): 273 | """Test that data is read in correctly when `len(data) = 1 mod batch_size`.""" 274 | data = np.zeros((10, 10)).astype(str) 275 | for i in range(data.shape[0]): 276 | data[i, i] = str(i + 1) 277 | data_dir = tmpdir_factory.mktemp("ten_line_csv") 278 | data_file = data_dir.join("ten_lines.csv") 279 | np.savetxt(data_file.strpath, data, delimiter=",", newline="\n", fmt="%s") 280 | 281 | X_read, y_read = _read_to_fit_memory( 282 | _get_reader(data_dir.strpath, 3), 283 | psutil.virtual_memory().total, 284 | output_dtype=output_dtype, 285 | target_column_index=0, 286 | ) 287 | assert np.array_equal(data[:, 1:], X_read) 288 | assert np.array_equal(data[:, 0], y_read) 289 | 290 | 291 | def test_list_alphabetical(): 292 | """Test for checking 'list_files' returns alphabetically""" 293 | path = "test/data/csv/mock_datasplitter_output" 294 | mlio_list_files = list_files(path, pattern="*") 295 | alphabetical_files = [] 296 | for file in ["excel.csv", "manual.csv", "newline.csv", "oneline.csv"]: 297 | alphabetical_files.extend(list_files(path + "/" + file, pattern="*")) 298 | assert mlio_list_files == alphabetical_files 299 | 300 | 301 | def test_list_recursive(): 302 | """Test for checking 'list_files' lists recursively""" 303 | assert len(list_files("test/data/csv", pattern="*")) == 10 304 | -------------------------------------------------------------------------------- /test/test_robust_pca.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | from scipy.sparse import csr_matrix 17 | 18 | from sklearn import datasets 19 | from sklearn.decomposition import PCA, TruncatedSVD 20 | 21 | from sagemaker_sklearn_extension.decomposition import RobustPCA 22 | 23 | 24 | X_iris = datasets.load_iris().data 25 | X_iris_sparse = csr_matrix(X_iris) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | ["X", "n_components", "X_expected"], 30 | [ 31 | # Dense input 32 | (X_iris, 2, PCA(n_components=2).fit_transform(X_iris)), 33 | # Sparse input 34 | (X_iris_sparse, 2, TruncatedSVD().fit_transform(X_iris_sparse)), 35 | # n_components > X.shape[1], no dimension reduction 36 | (X_iris, 1000, X_iris), 37 | ], 38 | ) 39 | def test_svd(X, n_components, X_expected): 40 | svd = RobustPCA(n_components=n_components) 41 | X_observed = svd.fit_transform(X) 42 | 43 | np.testing.assert_array_almost_equal(X_observed, X_expected) 44 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 
5 | 6 | [tox] 7 | envlist = black-format,flake8,pylint,twine,py37,contrib_taei_py37 8 | skip_missing_interpreters = False 9 | 10 | [testenv:black-format] 11 | # Used during development (before committing) to format .py files. 12 | basepython = python3 13 | deps = black==19.10b0 14 | commands = 15 | black -l 120 ./ 16 | 17 | [testenv:black-check] 18 | # Used by automated build steps to check that all files are properly formatted. 19 | basepython = python3 20 | deps = black==19.10b0 21 | commands = 22 | black -l 120 --check ./ 23 | 24 | [testenv:flake8] 25 | basepython = python3 26 | skipdist = true 27 | skip_install = true 28 | deps = flake8 29 | commands = 30 | flake8 31 | 32 | [testenv:pylint] 33 | basepython = python3 34 | skipdist = true 35 | skip_install = true 36 | deps = pylint==2.3.1 37 | commands = 38 | python -m pylint --rcfile=.pylintrc -j 0 src/sagemaker_sklearn_extension 39 | 40 | [testenv:twine] 41 | basepython = python3 42 | # twine check was added starting in 1.12.0 43 | deps = twine>=1.12.0 44 | # https://github.com/pypa/twine/blob/master/docs/changelog.rst 45 | # https://packaging.python.org/guides/making-a-pypi-friendly-readme/#validating-restructuredtext-markup 46 | commands = 47 | python setup.py sdist 48 | twine check dist/*.tar.gz 49 | 50 | [testenv:py37] 51 | # {posargs} can be passed in by additional arguments specified when invoking tox. 52 | # Can be used to specify which tests to run, e.g.: tox -- -s 53 | usedevelop = True 54 | deps = 55 | -r{toxinidir}/requirements.txt 56 | .[test] 57 | conda_deps = 58 | mlio-py=0.7 59 | libprotobuf=3.13.0 60 | conda_channels = 61 | conda-forge 62 | mlio 63 | commands = 64 | coverage run --source src/sagemaker_sklearn_extension --omit src/sagemaker_sklearn_extension/contrib/* -m pytest --ignore-glob=test/contrib/* --verbose {posargs} 65 | coverage report --fail-under=90 66 | 67 | [testenv:contrib_taei_py37] 68 | # {posargs} can be passed in by additional arguments specified when invoking tox. 69 | # Can be used to specify which tests to run, e.g.: tox -- -s 70 | usedevelop = True 71 | deps = 72 | -r{toxinidir}/requirements.txt 73 | .[test] 74 | .[taei] 75 | conda_deps = 76 | mlio-py=0.7 77 | libprotobuf=3.13.0 78 | conda_channels = 79 | conda-forge 80 | mlio 81 | commands = 82 | coverage run --source src/sagemaker_sklearn_extension/contrib/taei -m pytest test/contrib/taei --verbose {posargs} 83 | coverage report --fail-under=90 84 | --------------------------------------------------------------------------------