├── .flake8 ├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .pylintrc ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.rst ├── VERSION ├── ci ├── buildspec-deploy.yml ├── buildspec-pr.yml ├── buildspec-release.yml └── scripts │ └── displaytime.sh ├── requirements.txt ├── setup.py ├── src └── sagemaker_sklearn_extension │ ├── __init__.py │ ├── contrib │ ├── README.md │ ├── __init__.py │ └── taei │ │ ├── README.md │ │ ├── __init__.py │ │ ├── images │ │ └── overview.png │ │ ├── latent_space_oversampler.py │ │ ├── models.py │ │ ├── nn_utils.py │ │ └── star_oversampler.py │ ├── decomposition │ ├── __init__.py │ └── robust_pca.py │ ├── externals │ ├── __init__.py │ ├── automl_transformer.py │ ├── header.py │ └── read_data.py │ ├── feature_extraction │ ├── __init__.py │ ├── date_time.py │ ├── sequences.py │ └── text.py │ ├── impute │ ├── __init__.py │ └── base.py │ └── preprocessing │ ├── __init__.py │ ├── base.py │ ├── data.py │ └── encoders.py ├── test ├── __init__.py ├── contrib │ └── taei │ │ ├── data │ │ └── data.csv │ │ └── test_taei.py ├── data │ └── csv │ │ ├── dictionaries.csv │ │ ├── dirty.csv │ │ ├── invalid.csv │ │ ├── kc_house_data.csv │ │ ├── missing_values.csv │ │ ├── mock_datasplitter_output │ │ ├── excel.csv │ │ ├── manual.csv │ │ ├── newline.csv │ │ └── oneline.csv │ │ └── regression_na_labels.csv ├── test_automl_transformer.py ├── test_common.py ├── test_data.py ├── test_date_time.py ├── test_feature_extraction_text.py ├── test_header.py ├── test_impute.py ├── test_preprocessing.py ├── test_preprocessing_encoders.py ├── test_read_data.py ├── test_robust_pca.py └── test_sequence_transformer.py └── tox.ini /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | application_import_names = src, test 3 | import-order-style = google 4 | max-line-length = 120 5 | ignore = 6 | E203, 7 | E231 8 | W503 9 | exclude = 10 | build/ 11 | .git 12 | __pycache__ 13 | .tox 14 | venv/ 15 | max-complexity = 10 16 | require-code = True -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | ## Merge Checklist 6 | 7 | _Put an `x` in the boxes that apply. You can also fill these out after creating the PR. If you're unsure about any of them, don't hesitate to ask. We're here to help! This is simply a reminder of what we are going to look for before merging your pull request._ 8 | 9 | - [ ] I have read the [CONTRIBUTING](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/CONTRIBUTING.md) doc 10 | - [ ] I used the commit message format described in [CONTRIBUTING](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/CONTRIBUTING.md#committing-your-change) 11 | - [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) 12 | - [ ] I have updated any necessary [documentation](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/README.rst) (if appropriate) 13 | 14 | By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. 
15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyo 3 | *.class 4 | *~ 5 | *# 6 | /docs/_build 7 | /runpy 8 | /build 9 | .coverage* 10 | **/.idea 11 | **/.history 12 | **/.cache 13 | **/.eggs 14 | **/.DS_Store 15 | *.egg 16 | *.egg-info 17 | .*.swp 18 | .mypy_cache 19 | .pytest_cache 20 | tags 21 | __pycache__ 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | build/ 34 | develop-eggs/ 35 | dist/ 36 | downloads/ 37 | eggs/ 38 | .eggs/ 39 | lib/ 40 | lib64/ 41 | parts/ 42 | sdist/ 43 | var/ 44 | wheels/ 45 | pip-wheel-metadata/ 46 | share/python-wheels/ 47 | *.egg-info/ 48 | .installed.cfg 49 | *.egg 50 | MANIFEST 51 | 52 | # PyInstaller 53 | # Usually these files are written by a python script from a template 54 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 55 | *.manifest 56 | *.spec 57 | 58 | # Installer logs 59 | pip-log.txt 60 | pip-delete-this-directory.txt 61 | 62 | # Unit test / coverage reports 63 | htmlcov/ 64 | .tox/ 65 | .nox/ 66 | .coverage 67 | .coverage.* 68 | .cache 69 | nosetests.xml 70 | coverage.xml 71 | *.cover 72 | .hypothesis/ 73 | .pytest_cache/ 74 | 75 | # Translations 76 | *.mo 77 | *.pot 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # mkdocs documentation 105 | /site 106 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v2.5.0 (2022-02-17) 4 | 5 | ### Features 6 | 7 | * Similarity encoding 8 | 9 | ### Bug fixes and other changes 10 | 11 | * Merge pull request #40 from GiannisMitr/tsfresh_extractor_speedups 12 | * apply feature thresholds before extracting features & add cap in total generated features. 13 | * remove RobustScaler from TSFreshExtractor. 
14 | * enable parallelism in TSFeature extraction, excluding "sagemaker_serve" executions 15 | * Merge pull request #39 from zkarnin/sim_encode 16 | * Fixing broken dependency in tsfresh 17 | 18 | ## v2.4.0 (2021-09-23) 19 | 20 | ### Features 21 | 22 | * expansion control for TSFeatureExtractor 23 | 24 | ### Bug fixes and other changes 25 | 26 | * Merge pull request #38 from nikitaivkin/master 27 | 28 | ## v2.3.0 (2021-08-16) 29 | 30 | ### Features 31 | 32 | * transformers for time series 33 | 34 | ## v2.2.1 (2021-05-21) 35 | 36 | ### Bug fixes and other changes 37 | 38 | * Datetime fix 39 | 40 | ## v2.2.0 (2021-04-13) 41 | 42 | ### Features 43 | 44 | * taei contrib library 45 | 46 | ### Bug fixes and other changes 47 | 48 | * broken tests and dependencies 49 | 50 | ## v2.1.0 (2020-10-21) 51 | 52 | ### Features 53 | 54 | * adds threshold and max_categories parameter to RobustOrdinalEncoder 55 | * Add weight of evidence encoder 56 | 57 | ### Bug fixes and other changes 58 | 59 | * use named functions instead of lambdas in DateTimeDefintions because of pickle 60 | 61 | ## v2.0.0 (2020-08-13) 62 | 63 | ### Breaking changes 64 | 65 | * update sklearn dependency version to 0.23 and mlio version to 0.5 66 | 67 | ### Features 68 | 69 | * OrdinalEncoder can output np.nan instead of n for unseen values 70 | 71 | ### Bug fixes and other changes 72 | 73 | * minor performance optimizations and refactoring 74 | 75 | ## v1.2.0 (2020-07-29) 76 | 77 | ### Features 78 | 79 | * adds a `get_classes` method to `RobustLabelEncoder` 80 | 81 | ## v1.1.1 (2020-07-21) 82 | 83 | ### Bug fixes and other changes 84 | 85 | * Merge pull request #18 from ipanepen/rle-bug 86 | * test data reading when n_rows = 1 mod batch_size 87 | * bug fix: makes fit_transform behavior consistent with fit and transform 88 | * fix a minor bug in OneHotEncoder by by overloading the buggy method in ThresholdOneHotEncoder and fixing it 89 | 90 | ## v1.1.0 (2020-02-24) 91 | 92 | ### Features 93 | 94 | * dummy feature commit for RobustOrdinalEncoder & add badges to README 95 | 96 | ### Bug fixes and other changes 97 | 98 | * libprotobuf==3.11.4 is not backwards compatible, specify tox version for testing 99 | * Merge pull request #11 from ipanepen/master 100 | * fix for MemoryError in ThresholdOneHotEncoder 101 | * Adding RobustOrdinalEncoder 102 | * Specify mlio version 0.2.7 103 | 104 | ## v1.0.0 (2019-12-03) 105 | 106 | ### Bug fixes and other changes 107 | 108 | * update to 1.0.0, fix buildspec 109 | * update ci deployment credentials 110 | * Merge pull request #4 from wiltonwu/master 111 | * update documentation, remove CHANGELOG.md for 0.1.0 deployment, add date_time module 112 | * Merge pull request #2 from ipanepen/ipanepen-add-random-seed 113 | * adds np.random.seed(0) to test_preprocessing.py to ensure deterministic behavior 114 | * Initial commit 115 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws/sagemaker-scikit-learn-extension/issues), or [recently closed](https://github.com/aws/sagemaker-scikit-learn-extension/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | ### Pulling Down the Code 31 | 32 | 1. If you do not already have one, create a GitHub account by following the prompts at [Join Github](https://github.com/join). 33 | 1. Create a fork of this repository on GitHub. You should end up with a fork at `https://github.com/<username>/sagemaker-scikit-learn-extension`. 34 | 1. Follow the instructions at [Fork a Repo](https://help.github.com/en/articles/fork-a-repo) to fork a GitHub repository. 35 | 1. Clone your fork of the repository: `git clone https://github.com/<username>/sagemaker-scikit-learn-extension` where `<username>` is your GitHub username. 36 | 37 | 38 | ### Running the Unit Tests 39 | 40 | 1. Install conda or miniconda if you have not already done so. See [conda/miniconda installation instructions.](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) 41 | 1. cd into the sagemaker-scikit-learn-extension folder: `cd sagemaker-scikit-learn-extension` or `cd /environment/sagemaker-scikit-learn-extension` 42 | 1. Install test dependencies using `pip install .[test]` (or, for Zsh users: `pip install .\[test\]`) 43 | 1. Run the following tox command and verify that all code checks and unit tests pass: `tox` 44 | 1. Note that this will run unit tests, linting, package checks, and code formatting checks.
45 | 46 | You can also run a single test with the following command: `tox -e py37 -- -s -vv <path_to_test_file>::<test_function_name>` 47 | * Note that the coverage test will fail if you only run a single test, so make sure to surround the command with `export IGNORE_COVERAGE=-` and `unset IGNORE_COVERAGE` 48 | * Example: `export IGNORE_COVERAGE=- ; tox -e py37 -- -s -vv test/test_impute.py::test_robust_imputer ; unset IGNORE_COVERAGE` 49 | 50 | 51 | ### Making and Testing Your Change 52 | 53 | 1. Create a new git branch: 54 | ```shell 55 | git checkout -b my-fix-branch master 56 | ``` 57 | 1. Make your changes, **including unit tests**. 58 | 1. Include unit tests when you contribute new features or make bug fixes, as they help to: 59 | 1. Prove that your code works correctly. 60 | 1. Guard against future breaking changes to lower the maintenance cost. 61 | 1. Please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 62 | 1. Run all the unit tests as per [Running the Unit Tests](#running-the-unit-tests), and verify that all checks and tests pass. 63 | 1. Note that this also runs tools that may be necessary for the automated build to pass (ex: code reformatting by 'black'). 64 | 65 | 66 | ### Committing Your Change 67 | 68 | We use commit messages to update the project version number and generate changelog entries, so it's important for them to follow the right format. Valid commit messages include a prefix, separated from the rest of the message by a colon and a space. Here are a few examples: 69 | 70 | ``` 71 | feature: support sparse inputs for RobustStandardScaler 72 | fix: fix flake8 errors 73 | ``` 74 | 75 | Valid prefixes are listed in the table below. 76 | 77 | | Prefix | Use for... | 78 | |----------------:|:-----------------------------------------------------------------------------------------------| 79 | | `breaking` | Incompatible API changes. | 80 | | `deprecation` | Deprecating an existing API or feature, or removing something that was previously deprecated. | 81 | | `feature` | Adding a new feature. | 82 | | `fix` | Bug fixes. | 83 | | `change` | Any other code change. | 84 | | `documentation` | Documentation changes. | 85 | 86 | Some of the prefixes allow abbreviation; `break`, `feat`, `depr`, and `doc` are all valid. If you omit a prefix, the commit will be treated as a `change`. 87 | 88 | For the rest of the message, use imperative style and keep things concise but informative. See [How to Write a Git Commit Message](https://chris.beams.io/posts/git-commit/) for guidance. 89 | 90 | 91 | ### Sending a Pull Request 92 | 93 | GitHub provides additional documentation on [Creating a Pull Request](https://help.github.com/articles/creating-a-pull-request/). 94 | 95 | Please remember to: 96 | * Use commit messages (and PR titles) that follow the guidelines under [Committing Your Change](#committing-your-change). 97 | * Send us a pull request, answering any default questions in the pull request interface. 98 | * Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 99 | 100 | 101 | ## Finding contributions to work on 102 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/sagemaker-scikit-learn-extension/labels/help%20wanted) issues is a great place to start.
103 | 104 | 105 | ## Code of Conduct 106 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 107 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 108 | opensource-codeofconduct@amazon.com with any additional questions or comments. 109 | 110 | 111 | ## Security issue notifications 112 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 113 | 114 | 115 | ## Licensing 116 | 117 | See the [LICENSE](https://github.com/aws/sagemaker-scikit-learn-extension/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 118 | 119 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include LICENSE.txt 3 | include VERSION 4 | include README.rst -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Sagemaker Scikit Learn Extension 2 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | SageMaker Scikit-Learn Extension 2 | ================================ 3 | 4 | .. image:: https://img.shields.io/badge/License-Apache%202.0-blue.svg 5 | :target: https://opensource.org/licenses/Apache-2.0 6 | :alt: License 7 | 8 | .. image:: https://img.shields.io/pypi/v/sagemaker-scikit-learn-extension.svg 9 | :target: https://pypi.python.org/pypi/sagemaker-scikit-learn-extension 10 | :alt: Latest Version 11 | 12 | .. image:: https://img.shields.io/badge/code_style-black-000000.svg 13 | :target: https://github.com/python/black 14 | :alt: Code style: black 15 | 16 | SageMaker Scikit-Learn Extension is a Python module for machine learning built on top of `scikit-learn `_. 17 | 18 | This project contains standalone scikit-learn estimators and additional tools to support SageMaker Autopilot. Many of the additional estimators are based on existing scikit-learn estimators. 
19 | 20 | 21 | User Installation 22 | ----------------- 23 | 24 | To install, 25 | 26 | :: 27 | 28 | # install from pip 29 | pip install sagemaker-scikit-learn-extension 30 | 31 | In order to use the I/O functionalities in the :code:`sagemaker_sklearn_extension.externals` module, you will also need to install the :code:`mlio` version 0.7 package via conda. The :code:`mlio` package is only available through conda at the moment. 32 | 33 | To install :code:`mlio`, 34 | 35 | :: 36 | 37 | # install mlio 38 | conda install -c mlio -c conda-forge mlio-py==0.7 39 | 40 | To see more information about mlio, see https://github.com/awslabs/ml-io. 41 | 42 | You can also install from source by cloning this repository and running a ``pip install`` command in the root directory of the repository: 43 | 44 | :: 45 | 46 | # install from source 47 | git clone https://github.com/aws/sagemaker-scikit-learn-extension.git 48 | cd sagemaker-scikit-learn-extension 49 | pip install -e . 50 | 51 | 52 | Supported Operating Systems 53 | --------------------------- 54 | 55 | SageMaker scikit-learn extension supports Unix/Linux and Mac. 56 | 57 | Supported Python Versions 58 | ------------------------- 59 | 60 | SageMaker scikit-learn extension is tested on: 61 | 62 | - Python 3.7 63 | 64 | License 65 | ------- 66 | 67 | This library is licensed under the Apache 2.0 License. 68 | 69 | Development 70 | ----------- 71 | 72 | We welcome contributions from developers of all experience levels. 73 | 74 | The SageMaker scikit-learn extension is meant to be a repository for scikit-learn estimators that don't meet scikit-learn's stringent inclusion criteria. 75 | 76 | 77 | Setup 78 | ----- 79 | 80 | We recommend using conda for development and testing. 81 | 82 | To download conda, go to the `conda installation guide <https://conda.io/projects/conda/en/latest/user-guide/install/index.html>`_. 83 | 84 | 85 | Running Tests 86 | ------------- 87 | 88 | SageMaker scikit-learn extension contains an extensive suite of unit tests. 89 | 90 | You can install the libraries needed to run the tests by running :code:`pip install --upgrade .[test]` or, for Zsh users: :code:`pip install --upgrade .\[test\]` 91 | 92 | tox uses pytest to run the unit tests in a Python 3.7 interpreter. tox also runs flake8 and pylint for style checks. 93 | 94 | conda is needed because of the dependency on mlio 0.7. 95 | 96 | To run the tests with tox, run: 97 | 98 | :: 99 | 100 | tox 101 | 102 | Running on SageMaker 103 | -------------------- 104 | 105 | To use sagemaker-scikit-learn-extension on SageMaker, you can build the `sagemaker-scikit-learn-extension-container <https://github.com/aws/sagemaker-scikit-learn-extension-container>`_.
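Example Usage
-------------

Estimators in this package follow the standard scikit-learn API, so they can be combined with scikit-learn utilities such as :code:`Pipeline`. The snippet below is a minimal, illustrative sketch using estimators listed under "Overview of Submodules"; the toy data and the default constructor arguments are assumptions for demonstration only, not part of the documented API:

::

    import numpy as np
    from sklearn.pipeline import Pipeline

    from sagemaker_sklearn_extension.impute import RobustImputer
    from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler

    # Toy numeric matrix with one missing value (illustrative data only).
    X = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0]])

    # Impute missing values, then standardize the imputed matrix.
    pipeline = Pipeline([("impute", RobustImputer()), ("scale", RobustStandardScaler())])
    X_transformed = pipeline.fit_transform(X)
    print(X_transformed.shape)  # (3, 2)

Because every transformer implements :code:`fit`/:code:`transform`, the same pattern applies to the other estimators described below.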
106 | 107 | Overview of Submodules 108 | ---------------------- 109 | 110 | * :code:`sagemaker_sklearn_extension.decomposition` 111 | * :code:`RobustPCA` dimension reduction for dense and sparse inputs 112 | * :code:`sagemaker_sklearn_extension.externals` 113 | * :code:`AutoMLTransformer` utility class encapsulating feature and target transformation functionality used in SageMaker Autopilot 114 | * :code:`Header` utility class to manage the header and target columns in tabular data 115 | * :code:`read_csv_data` reads comma separated data and returns a numpy array (uses mlio) 116 | * :code:`sagemaker_sklearn_extension.feature_extraction.date_time` 117 | * :code:`DateTimeVectorizer` convert datetime objects or strings into numeric features 118 | * :code:`sagemaker_sklearn_extension.feature_extraction.sequences` 119 | * :code:`TSFlattener` convert strings of sequences into numeric features 120 | * :code:`TSFreshFeatureExtractor` compute row-wise time series features from a numpy array (uses tsfresh) 121 | * :code:`sagemaker_sklearn_extension.feature_extraction.text` 122 | * :code:`MultiColumnTfidfVectorizer` convert collections of raw documents to a matrix of TF-IDF features 123 | * :code:`sagemaker_sklearn_extension.impute` 124 | * :code:`RobustImputer` imputer for missing values with customizable mask_function and multi-column constant imputation 125 | * :code:`RobustMissingIndicator` binary indicator for missing values with customizable mask_function 126 | * :code:`sagemaker_sklearn_extension.preprocessing` 127 | * :code:`BaseExtremeValuesTransformer` customizable transformer for columns that contain "extreme" values (columns that are heavy tailed) 128 | * :code:`LogExtremeValuesTransformer` stateful log transformer for columns that contain "extreme" values (columns that are heavy tailed) 129 | * :code:`NALabelEncoder` encoder for transforming labels to NA values 130 | * :code:`QuadraticFeatures` generate and add quadratic features to feature matrix 131 | * :code:`QuantileExtremeValuesTransformer` stateful quantiles transformer for columns that contain "extreme" values (columns that are heavy tailed) 132 | * :code:`ThresholdOneHotEncoder` encode categorical integer features as a one-hot numeric array, with optional restrictions on feature encoding 133 | * :code:`RemoveConstantColumnsTransformer` removes constant columns 134 | * :code:`RobustLabelEncoder` encode labels for seen and unseen labels 135 | * :code:`RobustStandardScaler` standardization for dense and sparse inputs 136 | * :code:`WOEEncoder` weight of evidence supervised encoder 137 | * :code:`SimilarityEncoder` encode categorical values based on their descriptive string 138 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.5.1.dev0 2 | -------------------------------------------------------------------------------- /ci/buildspec-deploy.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | build: 5 | commands: 6 | - PACKAGE_FILE="$CODEBUILD_SRC_DIR_ARTIFACT_1/sagemaker-scikit-learn-extension-*.tar.gz" 7 | - PYPI_USER=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/user --query SecretString --output text) 8 | - PYPI_PASSWORD=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/password --query SecretString --output text) 9 | 10 | - echo 'md5sum of python package:' 11 | - md5sum $PACKAGE_FILE 12 | 13 | # publish to
pypi 14 | - twine upload $PACKAGE_FILE -u $PYPI_USER -p $PYPI_PASSWORD -------------------------------------------------------------------------------- /ci/buildspec-pr.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | build: 5 | commands: 6 | # install tbb dependency 7 | - apt update -y 8 | - apt-get install -y libtbb-dev 9 | 10 | # install tox 11 | - pip install tox tox-conda==0.7.3 12 | 13 | # run linters, format verification, and package checks 14 | - start_time=`date +%s` 15 | - tox -e flake8,pylint,black-check,twine 16 | - ./ci/scripts/displaytime.sh 'flake8,pylint,twine,black-check' $start_time 17 | 18 | # run unit tests 19 | - start_time=`date +%s` 20 | - tox -e py37 21 | - ./ci/scripts/displaytime.sh 'py37 unit' $start_time 22 | 23 | # run unit tests for contrib 24 | - start_time=`date +%s` 25 | - tox -e contrib_taei_py37 26 | - ./ci/scripts/displaytime.sh 'contrib_taei_py37 unit' $start_time -------------------------------------------------------------------------------- /ci/buildspec-release.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | build: 5 | commands: 6 | # run git-secrets 7 | - git-secrets --scan-history 8 | 9 | # install tbb dependency 10 | - apt update -y 11 | - apt-get install -y libtbb-dev 12 | 13 | # install tox 14 | - pip install tox tox-conda==0.7.3 15 | 16 | # prepare release 17 | - git-release --prepare --min-version 1.0.0 18 | 19 | # run linters 20 | - tox -e flake8,pylint 21 | 22 | # run format verification 23 | - tox -e black-check 24 | 25 | # run package check 26 | - tox -e twine 27 | 28 | # run unit tests 29 | - tox -e py37 30 | 31 | # run unit tests for contrib 32 | - tox -e contrib_taei_py37 33 | 34 | # generate distribution package 35 | - python3 setup.py sdist 36 | 37 | # publish release to github 38 | - git-release --publish --min-version 1.0.0 39 | 40 | artifacts: 41 | files: 42 | - dist/sagemaker-scikit-learn-extension-*.tar.gz 43 | name: ARTIFACT_1 44 | discard-paths: yes -------------------------------------------------------------------------------- /ci/scripts/displaytime.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You 5 | # may not use this file except in compliance with the License. A copy of 6 | # the License is located at 7 | # 8 | # http://aws.amazon.com/apache2.0/ 9 | # 10 | # or in the "license" file accompanying this file. This file is 11 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 12 | # ANY KIND, either express or implied. See the License for the specific 13 | # language governing permissions and limitations under the License. 
14 | 15 | set -euo pipefail 16 | 17 | echo =================== $1 execution time =================== 18 | 19 | start_time=$2 20 | end_time=`date +%s` 21 | total_time=$(expr $end_time - $start_time + 1) 22 | hours=$((total_time/60/60%24)) 23 | minutes=$((total_time/60%60)) 24 | secs=$((total_time%60)) 25 | 26 | (( $hours > 0 )) && printf '%d hours ' $hours 27 | (( $minutes > 0 )) && printf '%d minutes ' $minutes 28 | (( $hours > 0 || $minutes > 0 )) && printf 'and ' 29 | printf '%d seconds\n\n' $secs -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.16.4 2 | psutil 3 | scikit-learn==0.23.2 4 | python-dateutil==2.8.0 5 | pandas==1.2.4 6 | tsfresh==0.18.0 7 | statsmodels==0.12.2 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import os 15 | 16 | from setuptools import find_packages, setup 17 | 18 | 19 | def read(fname): 20 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 21 | 22 | 23 | def read_version(): 24 | return read("VERSION").strip() 25 | 26 | 27 | EXTRAS_REQUIRE = { 28 | "test": ["tox", "tox-conda==0.7.3", "pytest", "coverage"], 29 | "taei": ["torch==1.7.1"], 30 | } 31 | 32 | 33 | setup( 34 | name="sagemaker-scikit-learn-extension", 35 | version=read_version(), 36 | description="Open source library extension of scikit-learn for Amazon SageMaker.", 37 | packages=find_packages(where="src", exclude=("test",)), 38 | package_dir={"": "src"}, 39 | long_description=read("README.rst"), 40 | author="Amazon Web Services", 41 | url="https://github.com/aws/sagemaker-scikit-learn-extension/", 42 | license="Apache License 2.0", 43 | keywords="ML Amazon AWS AI SKLearn Scikit-Learn", 44 | classifiers=["Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License"], 45 | extras_require=EXTRAS_REQUIRE, 46 | ) 47 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | """ 15 | Amazon SageMaker extension module of sklearn 16 | ============================================ 17 | 18 | 19 | """ 20 | from . import * # noqa: F401, F403 21 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/README.md: -------------------------------------------------------------------------------- 1 | # SageMaker Scikit-Learn Extension Contrib Extensions 2 | 3 | Contrib is a package of utilities that work with Scikit-Learn extension but are not directly within the scope of the core Scikit-Learn Extension library. Currently the contrib package includes: 4 | - `taei`: Implementations of the latent space minority oversampling techniques proposed in [1] 5 | 6 | ### References 7 | [1] S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | Amazon SageMaker extension module of sklearn - contrib 16 | ====================================================== 17 | 18 | 19 | """ 20 | from . import * # noqa: F401, F403 21 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/README.md: -------------------------------------------------------------------------------- 1 | # Tabular AutoEncoder Interpolator 2 | 3 | ## Overview 4 | overview 5 | 6 | This library contains implementations of the latent space minority oversampling techniques proposed in [1] for 7 | multi-modal data. These oversamplers work by 8 | 9 | 1. Mapping the multi-modal samples to a dense continuous latent space using an autoencoder 10 | 2. Applying oversampling by interpolation in the latent space 11 | 3. Mapping the synthetic samples back to the original feature space 12 | 13 | This framework was shown to be effective in generating high-quality multi-modal synthetic data which then resulted in 14 | better prediction quality for downstream tasks. 15 | 16 | #### LatentSpaceOversampler 17 | The interpolator is implemented by `LatentSpaceOversampler` which takes two inputs at initialization: 18 | - `model` - The autoencoder used to map the samples to the latent space and back. Currently, two 19 | autoencoders are provided with the package: `AE` which is a vanilla autoencoder and `VAE` which is a variational 20 | autoencoder. 21 | - `base_oversampler` function - The oversampling function applied in the latent space. 
We have experimented with 22 | `SMOTE` from [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) and `StarOversampler` which is 23 | our light weight implementation (provided with this package) of `polynom_fit_SMOTE`[2] based on the implementation of 24 | [smote_variants](https://github.com/analyticalmindsltd/smote_variants)[3] 25 | 26 | ## Installation 27 | It is recommended to install from PyPI 28 | ``` 29 | pip install sagemaker-scikit-learn-extension[taei] 30 | 31 | # For Zsh users: 32 | pip install sagemaker-scikit-learn-extension\[taei]\ 33 | ``` 34 | 35 | ## Examples 36 | [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) is required to run the examples below as 37 | it provides the dataset and the base oversampler. Install 38 | [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) by 39 | ``` 40 | pip install imbalanced-learn==0.7 41 | ``` 42 | 43 | TAEI supports input of either a numpy.ndarray or a pandas.DataFrame object with two types of columns: 44 | - Continuous columns: numeric values, can have very large cardinality 45 | - Discrete (categorical) columns: numeric values with low cardinality. These columns need be encoded to ordinal integers 46 | before using TAEI. This could be easily done using `sagemaker_sklearn_extension.preprocessing.OrdinalEncoder` 47 | 48 | Next we load the dataset from [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn) and specify 49 | which columns are continuous and which are discrete 50 | ```python 51 | import imblearn.datasets 52 | 53 | # load the datasets 54 | d = imblearn.datasets.fetch_datasets()["abalone"] 55 | # indexes of categorical features 56 | categorical_features = [0, 1, 2] 57 | # number of uniques for each categorical feature 58 | categorical_dims = [2, 2, 2] 59 | # indexes of continuous features 60 | continuous_features = [3, 4, 5, 6, 7, 8, 9] 61 | ``` 62 | 63 | ### Vanilla autoencoder + SMOTE 64 | We start with an example of wrapping SMOTE with a vanilla autoencoder 65 | ```python 66 | from imblearn.over_sampling import SMOTE 67 | from sagemaker_sklearn_extension.contrib.taei import LatentSpaceOversampler, AE 68 | 69 | ae_smote = LatentSpaceOversampler( 70 | model=AE( 71 | categorical_features=categorical_features, 72 | categorical_dims=categorical_dims, 73 | continuous_features=continuous_features, 74 | ), 75 | base_oversampler=SMOTE(sampling_strategy=0.5).fit_resample, 76 | ) 77 | ``` 78 | We train the autoencoder on the training data before using the oversampler 79 | ```python 80 | ae_smote.fit(X=d["data"], y=d["target"], verbose=True) 81 | ``` 82 | 83 | Finally, we can oversample the minority class 84 | ```python 85 | # Oversample the minority class 86 | X_oversampled, y_oversampled = ae_smote.resample(X=d["data"], y=d["target"], verbose=True) 87 | ``` 88 | Note that the base oversampler, SMOTE in our case, controls the number of minority samples generated 89 | 90 | ### Variational autoencoder + StarOversampler 91 | We demonstrate PolynomFit using the "star" topology [2] wrapped by a variational autoencoder, a combination yielding 92 | superior prediction quality in our experiments[1]. 
For PolynomFit, we use our light weight implementation, 93 | `StarOversampler`, based on the implementation of 94 | [smote_variants](https://github.com/analyticalmindsltd/smote_variants)[3] 95 | ```python 96 | from sagemaker_sklearn_extension.contrib.taei import LatentSpaceOversampler, VAE, StarOversampler 97 | 98 | vae_poly = LatentSpaceOversampler( 99 | model=VAE( 100 | categorical_features=categorical_features, 101 | categorical_dims=categorical_dims, 102 | continuous_features=continuous_features, 103 | ), 104 | base_oversampler=StarOversampler(proportion=1.0).resample 105 | ) 106 | # Train the model and oversample in a single function call 107 | X_oversampled, y_oversampled = vae_poly.fit_resample(X=d['data'], y=d['target'], verbose=True) 108 | ``` 109 | 110 | ### Save and load trained models 111 | First, store the model we trained in `vae_poly` to a file. Note that `base_oversampler` is not stored, only the trained 112 | model 113 | ```python 114 | vae_poly.save_model('/tmp/vae_model.pth') 115 | ``` 116 | We use the stored model by creating a new `LatentSpaceOversampler` and loading the trained model into it 117 | ```python 118 | vae_poly_loaded = LatentSpaceOversampler( 119 | model=None, 120 | base_oversampler=StarOversampler(proportion=1.0).resample 121 | ) 122 | vae_poly_loaded.load_model('/tmp/vae_model.pth') 123 | # Oversample the minority class using the stored model 124 | X_os, y_os = vae_poly_loaded.resample(d['data'], d['target'], verbose=True) 125 | ``` 126 | 127 | 128 | ## Citing TAEI 129 | 130 | If you use TAEI, please cite the following work: 131 | - S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 132 | 133 | ## References 134 | [1] S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 135 | 136 | [2] Gazzah, S. and Amara, N. E. B., "New Oversampling Approaches Based on Polynomial Fitting for Imbalanced Data Sets", 137 | 2008 The Eighth IAPR International Workshop on Document Analysis Systems, 2008, pp. 677-684 138 | 139 | [3] Gy\"orgy Kov\'acs. "smote-variants: a Python Implementation of 85 Minority Oversampling Techniques", Neurocomputing 140 | 366, 2019 -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | from .latent_space_oversampler import LatentSpaceOversampler 15 | from .models import AE, VAE 16 | from .star_oversampler import StarOversampler 17 | 18 | __all__ = ["LatentSpaceOversampler", "AE", "VAE", "StarOversampler"] 19 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/images/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-scikit-learn-extension/2412131311433addbae9f6ad5aa393a8bdbbe61f/src/sagemaker_sklearn_extension/contrib/taei/images/overview.png -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/latent_space_oversampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.utils import check_X_y 5 | 6 | 7 | class LatentSpaceOversampler: 8 | """ 9 | Implementation of the latent space minority oversampling techniques proposed in [1]. The model (autoencoder) is used 10 | to encode the samples to the latent space where the base oversampler is applied to generate new minority samples. 11 | The generated synthetic minority samples are decoded back to the original feature space using the decoder. 12 | Interpolation parameters such as the oversampling ratio are controlled by the base oversampler. 13 | 14 | Parameters 15 | ---------- 16 | model : (autoencoder) pytorch model 17 | A model to be used to encode the samples into the latent space before interpolation and from the latent space 18 | after interpolation 19 | base_oversampler : oversampler 20 | oversampler used to interpolate samples in the latent space 21 | device : 'cpu' or 'gpu' (default = 'cpu') 22 | Device used by pytorch for training the model and using the trained model for encoding/decoding 23 | random_state : int (default = 0) 24 | Random number generation seed 25 | 26 | References 27 | ---------- 28 | .. [1] S. Darabi and Y. Elor "Synthesising Multi-Modal Minority Samples for Tabular Data" 29 | 30 | """ 31 | 32 | def __init__(self, model, base_oversampler, device="cpu", random_state=0): 33 | self.model = model 34 | self.base_oversampler = base_oversampler 35 | self.device = device 36 | self.random_state = random_state 37 | 38 | def fit(self, X, y, validation_ratio=0.2, **kwargs): 39 | """ 40 | Train the model using gradient descent back propagation 41 | 42 | Parameters 43 | ---------- 44 | X : {array-like, sparse matrix} of shape (n_samples, n_features) 45 | Features matrix used to train the model 46 | y : vector-like of shape (n_samples, 1) 47 | The target vector used to train the model 48 | validation_ratio : float or None (default = 0.2) 49 | Ratio of samples to be used as validation set for early stopping in model training. 
If None then early 50 | stopping is not applied 51 | **kwargs: 52 | Additional arguments passed the the model internal fit function 53 | """ 54 | X, y = check_X_y(X, y) 55 | if validation_ratio: 56 | X_train, X_validation, y_train, y_validation = train_test_split( 57 | X, y, test_size=validation_ratio, stratify=y, random_state=self.random_state 58 | ) 59 | else: 60 | X_train = X 61 | y_train = y 62 | X_validation = None 63 | y_validation = None 64 | self.model.fit( 65 | X_train=X_train, 66 | y_train=y_train, 67 | X_validation=X_validation, 68 | y_validation=y_validation, 69 | device=self.device, 70 | **kwargs, 71 | ) 72 | return self 73 | 74 | def resample(self, X, y, verbose=False): 75 | """ 76 | Use the model and the base oversampler to generate synthetic minority samples 77 | """ 78 | X, y = check_X_y(X, y) 79 | self.model.eval() 80 | X = torch.Tensor(X) 81 | X = X.to(self.device) 82 | with torch.no_grad(): 83 | z = self.model.encode(X) 84 | z = z.cpu().numpy() 85 | if verbose: 86 | print(f"LatentSpaceOversampler: Shape before oversampling z:{z.shape}, y:{y.shape}") 87 | z_samples, y_samples = self.base_oversampler(z, y) 88 | if verbose: 89 | print(f"LatentSpaceOversampler: Shape after oversampling z:{z_samples.shape}, y:{y_samples.shape}") 90 | z_samples = z_samples[-(len(z_samples) - len(X)) :] 91 | y_samples = y_samples[-(len(y_samples) - len(y)) :].reshape(-1) 92 | z_samples = torch.Tensor(z_samples).to(self.device) 93 | with torch.no_grad(): 94 | x_samples = self.model.decode_sample(z_samples) 95 | X = torch.cat([X, x_samples], dim=0).cpu().numpy() 96 | y = np.concatenate((y, y_samples), axis=0) 97 | return X, y 98 | 99 | def fit_resample(self, X, y, verbose=False, **kwargs): 100 | return self.fit(X, y, verbose=verbose, **kwargs).resample(X, y, verbose=verbose) 101 | 102 | def save_model(self, path): 103 | torch.save(self.model, path) 104 | 105 | def load_model(self, path): 106 | self.model = torch.load(path) 107 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/nn_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import numpy as np 4 | 5 | 6 | class LambdaLogSoftmax(nn.Module): 7 | def __init__(self, dim): 8 | super().__init__() 9 | self.dim = dim 10 | 11 | def forward(self, *args, **kwargs): 12 | return nn.functional.log_softmax(dim=self.dim, *args, **kwargs) 13 | 14 | 15 | class GBN(torch.nn.Module): 16 | """ 17 | Ghost Batch Normalization 18 | https://arxiv.org/abs/1705.08741 19 | """ 20 | 21 | def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01): 22 | super(GBN, self).__init__() 23 | 24 | self.input_dim = input_dim 25 | self.virtual_batch_size = virtual_batch_size 26 | self.bn = nn.BatchNorm1d(self.input_dim, momentum=momentum) 27 | 28 | def forward(self, x): 29 | chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0) 30 | res = [self.bn(x_) for x_ in chunks] 31 | return torch.cat(res, dim=0) 32 | 33 | 34 | class EmbeddingGenerator(torch.nn.Module): 35 | """ 36 | Classical embeddings generator 37 | adopted from https://github.com/dreamquark-ai/tabnet/ 38 | """ 39 | 40 | def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dim=None): 41 | """ This is an embedding module for an entire set of features 42 | Parameters 43 | ---------- 44 | input_dim : int 45 | Number of features coming as input (number of columns) 46 | cat_dims : list of int 47 | Number of modalities for each 
categorial features 48 | If the list is empty, no embeddings will be done 49 | cat_idxs : list of int 50 | Positional index for each categorical features in inputs 51 | cat_emb_dim : int or list of int 52 | Embedding dimension for each categorical features 53 | If int, the same embdeding dimension will be used for all categorical features 54 | """ 55 | super(EmbeddingGenerator, self).__init__() 56 | if cat_dims == [] or cat_idxs == []: 57 | self.skip_embedding = True 58 | self.post_embed_dim = input_dim 59 | return 60 | if cat_emb_dim is None: 61 | # use heuristic 62 | cat_emb_dim = [min(600, round(1.6 * n_cats ** 0.56)) for n_cats in cat_dims] 63 | 64 | # heuristic 65 | self.skip_embedding = False 66 | if isinstance(cat_emb_dim, int): 67 | self.cat_emb_dims = [cat_emb_dim] * len(cat_idxs) 68 | else: 69 | self.cat_emb_dims = cat_emb_dim 70 | 71 | # check that all embeddings are provided 72 | if len(self.cat_emb_dims) != len(cat_dims): 73 | msg = """ cat_emb_dim and cat_dims must be lists of same length, got {len(self.cat_emb_dims)} 74 | and {len(cat_dims)}""" 75 | raise ValueError(msg) 76 | self.post_embed_dim = int(input_dim + np.sum(self.cat_emb_dims) - len(self.cat_emb_dims)) 77 | 78 | self.embeddings = torch.nn.ModuleList() 79 | 80 | # Sort dims by cat_idx 81 | sorted_idxs = np.argsort(cat_idxs) 82 | cat_dims = [cat_dims[i] for i in sorted_idxs] 83 | self.cat_emb_dims = [self.cat_emb_dims[i] for i in sorted_idxs] 84 | 85 | for cat_dim, emb_dim in zip(cat_dims, self.cat_emb_dims): 86 | self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim)) 87 | 88 | # record continuous indices 89 | self.continuous_idx = torch.ones(input_dim, dtype=torch.bool) 90 | self.continuous_idx[cat_idxs] = 0 91 | 92 | def forward(self, x): 93 | """ 94 | Apply embdeddings to inputs 95 | Inputs should be (batch_size, input_dim) 96 | Outputs will be of size (batch_size, self.post_embed_dim) 97 | """ 98 | if self.skip_embedding: 99 | # no embeddings required 100 | return x 101 | cols = [] 102 | cat_feat_counter = 0 103 | for feat_init_idx, is_continuous in enumerate(self.continuous_idx): 104 | # Enumerate through continuous idx boolean mask to apply embeddings 105 | if is_continuous: 106 | cols.append(x[:, feat_init_idx].float().view(-1, 1)) 107 | else: 108 | cols.append(self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())) 109 | cat_feat_counter += 1 110 | # concat 111 | post_embeddings = torch.cat(cols, dim=1) 112 | return post_embeddings 113 | 114 | 115 | def weight_init(m): 116 | if isinstance(m, nn.Linear): 117 | nn.init.kaiming_uniform_(m.weight) 118 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/contrib/taei/star_oversampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class StarOversampler: 5 | """ 6 | Implementation of the oversampler proposed in [1] using the `star` topology. The implementation is based on the 7 | implementation of https://github.com/analyticalmindsltd/smote_variants 8 | 9 | Parameters 10 | ---------- 11 | proportion: float (default = 1) 12 | proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of 13 | minority samples will be equal to the number of majority samples 14 | 15 | References 16 | ---------- 17 | .. [1] Gazzah, S. and Amara, N. E. B. 
"New Oversampling Approaches Based on Polynomial Fitting for Imbalanced Data 18 | Sets" The Eighth IAPR International Workshop on Document Analysis Systems 19 | """ 20 | 21 | def __init__(self, proportion=1.0): 22 | self.proportion = proportion 23 | 24 | def fit(self, X, y=None): 25 | pass 26 | 27 | def resample(self, X, y, verbose=False): 28 | """ 29 | Generate synthetic minority samples 30 | """ 31 | unique, counts = np.unique(y, return_counts=True) 32 | class_stats = dict(zip(unique, counts)) 33 | min_label = unique[0] if counts[0] < counts[1] else unique[1] 34 | maj_label = unique[1] if counts[0] < counts[1] else unique[0] 35 | 36 | # determine the number of samples to generate 37 | n_to_sample = self.det_n_to_sample(self.proportion, class_stats[maj_label], class_stats[min_label]) 38 | 39 | if n_to_sample == 0: 40 | if verbose: 41 | print("StarOversampler: Sampling is not needed") 42 | return X.copy(), y.copy() 43 | 44 | samples = [] 45 | # Implementation of the star topology 46 | X_min = X[y == min_label] 47 | X_mean = np.mean(X_min, axis=0) 48 | k = max([1, int(np.rint(n_to_sample / len(X_min)))]) 49 | for x in X_min: 50 | diff = X_mean - x 51 | for i in range(1, k + 1): 52 | samples.append(x + float(i) / (k + 1) * diff) 53 | return np.vstack([X, np.vstack(samples)]), np.hstack([y, np.repeat(min_label, len(samples))]) 54 | 55 | def det_n_to_sample(self, proportion, n_maj, n_min): 56 | """ 57 | Determines the number of samples to generate 58 | 59 | Parameters 60 | ---------- 61 | proportion: float 62 | proportion of the difference of n_maj and n_min to sample e.g. 1.0 means that after sampling the number of 63 | minority samples will be equal to the number of majority samples 64 | n_maj: int 65 | number of majority samples 66 | n_min: int 67 | number of minority samples 68 | """ 69 | return max([0, int((n_maj - n_min) * proportion)]) 70 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/decomposition/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | The :mod:`sagemaker_sklearn_extension.decomposition` module includes matrix decomposition algorithms. 16 | """ 17 | 18 | from .robust_pca import RobustPCA 19 | 20 | __all__ = [ 21 | "RobustPCA", 22 | ] 23 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/decomposition/robust_pca.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. 
This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from scipy.sparse import issparse 15 | 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.decomposition import PCA, TruncatedSVD 18 | from sklearn.utils.validation import check_array, check_is_fitted 19 | 20 | 21 | class RobustPCA(BaseEstimator, TransformerMixin): 22 | """RobustPCA dimension reduction for dense and sparse matrices. 23 | 24 | RobustPCA uses a different implementation of singular value decomposition depending on the input. 25 | - ``sklearn.decomposition.PCA`` for dense inputs 26 | - ``sklearn.decomposition.TruncatedSVD`` for sparse inputs 27 | 28 | Please see ``sklearn.decomposition.PCA`` or ``sklearn.decomposition.TruncatedSVD`` for more details. 29 | 30 | If input number of features (input dimension) is less than or equal to n_components (target dimension), then no dimension 31 | reduction will be performed. The output will be the same as the input. 32 | 33 | Parameters 34 | ---------- 35 | n_components : int, optional (default=1000) 36 | Desired dimensionality of output data. 37 | Must be strictly less than the number of features. If n_components is greater than or equal to the number of features, 38 | no dimension reduction will be performed. 39 | 40 | svd_solver : string, optional (default='auto') 41 | 42 | - If 'auto', the solver is selected by a default policy based on `X.shape` and `n_components`: if the input 43 | data is larger than 500x500 and the number of components to extract is lower than 80% of the smallest 44 | dimension of the data, then the more efficient 'randomized' method is enabled. Otherwise the exact full 45 | RobustPCA is computed and optionally truncated afterwards. 46 | Note: the 'auto' option is only available for dense inputs. If 'auto' and input is sparse, svd_solver will use 47 | 'randomized' 48 | - If 'full', run exact full RobustPCA calling the standard LAPACK solver via `scipy.linalg.svd` and select the 49 | components by postprocessing. 50 | Note: the 'full' option is only available for dense inputs. If 'full' and input is sparse, svd_solver will use 51 | 'randomized' 52 | - If 'arpack', run RobustPCA truncated to n_components calling ARPACK solver via `scipy.sparse.linalg.svds`. 53 | 'arpack' requires strictly 0 < n_components < min(n_samples, n_features) 54 | - If 'randomized', run randomized RobustPCA by the method of Halko et al. 55 | 56 | iterated_power : int >= 0 or 'auto', optional (default='auto') 57 | Number of iterations for the power method computed by 58 | svd_solver == 'randomized'. 59 | Note: If 'auto' and input is sparse, default for `iterated_power` is 5. 60 | 61 | tol : float >= 0, optional (default=0.) 62 | Tolerance for singular values computed by svd_solver == 'arpack'. 0 means machine precision. Ignored by 63 | randomized RobustPCA solver. 64 | 65 | random_state : int, RandomState instance, or None, optional (default=None) 66 | - If int, random_state is the seed used by the random number generator; 67 | - If RandomState instance, random_state is the random number generator; 68 | - If None, the random number generator is the RandomState instance used 69 | by np.random. Used when svd_solver == 'arpack' or 'randomized'.
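    For illustration, a minimal sketch of the reduction and pass-through behaviour on a toy dense matrix
    (the shapes below are arbitrary):

    >>> import numpy as np
    >>> from sagemaker_sklearn_extension.decomposition import RobustPCA
    >>> X = np.random.rand(20, 5)
    >>> RobustPCA(n_components=3).fit_transform(X).shape
    (20, 3)
    >>> RobustPCA(n_components=10).fit_transform(X).shape  # n_components >= n_features: returned unchanged
    (20, 5)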
70 | 71 | 72 | Attributes 73 | ---------- 74 | robust_pca_ : ``sklearn.decomposition.PCA``, ``sklearn.decomposition.TruncatedSVD``, or None 75 | - If input number of features (input dimension) is less than or equal to n_components (target dimension), then `robust_pca_` will 76 | be set to None and no dimension reduction will be performed. The output will be the same as the input. 77 | 78 | Assuming number of features is more than n_components: 79 | - If input is sparse, `robust_pca_` is ``sklearn.decomposition.TruncatedSVD``. 80 | - If input is dense, `robust_pca_` is ``sklearn.decomposition.PCA`` 81 | 82 | Notes 83 | ----- 84 | For dense inputs, ``sklearn.decomposition.PCA`` will center the input data by per-feature mean subtraction before 85 | RobustPCA. Sparse inputs will not center data. 86 | """ 87 | 88 | def __init__(self, n_components=1000, svd_solver="auto", iterated_power="auto", tol=0.0, random_state=None): 89 | self.n_components = n_components 90 | self.svd_solver = svd_solver 91 | self.iterated_power = iterated_power 92 | self.tol = tol 93 | self.random_state = random_state 94 | 95 | def fit(self, X, y=None): 96 | """Fit the model with X. 97 | 98 | Parameters 99 | ---------- 100 | X : array-like, shape (n_samples, n_features) 101 | Training data. 102 | 103 | Returns 104 | ------- 105 | self : RobustPCA 106 | """ 107 | X = check_array(X, accept_sparse=True, dtype=None) 108 | 109 | # if input dimension is less than or equal to target dimension, no reduction will be performed 110 | if X.shape[1] <= self.n_components: 111 | self.robust_pca_ = None 112 | return self 113 | 114 | # fit for sparse or dense input 115 | if issparse(X): 116 | algorithm = self.svd_solver if self.svd_solver == "arpack" else "randomized" 117 | n_iter = self.iterated_power if self.iterated_power != "auto" else 5 118 | 119 | self.robust_pca_ = TruncatedSVD( 120 | n_components=self.n_components, 121 | algorithm=algorithm, 122 | n_iter=n_iter, 123 | random_state=self.random_state, 124 | tol=self.tol, 125 | ) 126 | else: 127 | self.robust_pca_ = PCA( 128 | n_components=self.n_components, 129 | svd_solver=self.svd_solver, 130 | tol=self.tol, 131 | iterated_power=self.iterated_power, 132 | random_state=self.random_state, 133 | ) 134 | 135 | self.robust_pca_.fit(X) 136 | return self 137 | 138 | def transform(self, X, y=None): 139 | """Apply the dimensionality reduction on X. 140 | 141 | Parameters 142 | ---------- 143 | X : array-like, shape (n_samples, n_features) 144 | Training data 145 | 146 | Returns 147 | ------- 148 | X : array-like, shape (n_samples, n_features) 149 | or 150 | X_new : array-like, shape (n_samples, n_components) 151 | 152 | """ 153 | check_is_fitted(self, "robust_pca_") 154 | 155 | if self.robust_pca_: 156 | return self.robust_pca_.transform(X) 157 | return X 158 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/externals/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied.
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from .automl_transformer import AutoMLTransformer 15 | from .header import Header 16 | from .read_data import read_csv_data 17 | 18 | __all__ = [ 19 | "AutoMLTransformer", 20 | "Header", 21 | "read_csv_data", 22 | ] 23 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/externals/automl_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | 16 | from scipy.sparse import isspmatrix 17 | from sklearn.base import BaseEstimator 18 | from sklearn.base import TransformerMixin 19 | 20 | 21 | class AutoMLTransformer(BaseEstimator, TransformerMixin): 22 | """Utility class encapsulating feature and target transformation functionality used in AutoML pipelines. 23 | 24 | Parameters 25 | ---------- 26 | header : Header instance 27 | Instance of the ``Header`` class from ``sagemaker_sklearn_extension.externals``. Contains indices of the 28 | features and response in the corresponding dataset. 29 | 30 | feature_transformer : transformer instance 31 | A Scikit-Learn transformer used on the feature columns in the dataset. Should have ``fit`` and ``transform`` 32 | methods which accept 2-dimensional inputs. 33 | 34 | target_transformer : transformer instance 35 | A Scikit-Learn transformer used on the target column in the dataset. Should have ``fit``, ``transform``, and 36 | optionally ``inverse_transform`` methods which accept 1-dimensional inputs. 37 | """ 38 | 39 | def __init__(self, header, feature_transformer, target_transformer): 40 | self.header = header 41 | self.feature_transformer = feature_transformer 42 | self.target_transformer = target_transformer 43 | 44 | def fit(self, X, y): 45 | """Fit and transform target, then fit feature data using the underlying transformers. 46 | 47 | Parameters 48 | ---------- 49 | X : numpy array of shape [n_samples, n_features] 50 | The feature-only dataset. 51 | 52 | y : numpy array of shape [n_samples] 53 | The target column. 54 | 55 | Returns 56 | ------- 57 | self : AutoMLTransformer 58 | """ 59 | y_transformed = y 60 | 61 | if self.target_transformer: 62 | y_transformed = self.target_transformer.fit_transform(y) 63 | 64 | self.feature_transformer.fit(X, y_transformed) 65 | return self 66 | 67 | def transform(self, X): 68 | """Transform the dataset using the underlying transformers. 69 | 70 | Depending on the shape of the input, it transforms either the feature data, or the feature data and the target 71 | column and then concatenates them back into a single dataset. 
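        For illustration, a rough sketch of the two accepted input shapes (the ``Header``, ``StandardScaler``
        and ``LabelEncoder`` used here are an arbitrary choice of transformers, not mandated by this class):

        >>> import numpy as np
        >>> from sklearn.preprocessing import LabelEncoder, StandardScaler
        >>> from sagemaker_sklearn_extension.externals import AutoMLTransformer, Header
        >>> header = Header(column_names=["f1", "f2", "label"], target_column_name="label")
        >>> amt = AutoMLTransformer(header, feature_transformer=StandardScaler(), target_transformer=LabelEncoder())
        >>> _ = amt.fit(np.array([[1.0, 2.0], [3.0, 4.0]]), np.array(["a", "b"]))
        >>> amt.transform(np.array([[1.0, 2.0], [3.0, 4.0]])).shape  # features only
        (2, 2)
        >>> full = np.array([[1.0, 2.0, "a"], [3.0, 4.0, "b"]], dtype=object)
        >>> amt.transform(full).shape  # features plus target: the transformed label is prepended
        (2, 3)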
72 | 73 | Parameters 74 | ---------- 75 | X : numpy array 76 | The array to transform whose shape should be either: 77 | - [n_samples, n_features], if it only contains the features; or 78 | - [n_samples, n_features + 1], if it contains the feature columns and the target column. 79 | 80 | Returns 81 | ------- 82 | array-like of shape [n_samples, n_transformed_features] or [n_samples, n_transformed_features + 1] 83 | """ 84 | n_columns = X.shape[1] 85 | n_features = len(self.header.feature_column_indices) 86 | 87 | # X contains both features and response. 88 | if n_columns == n_features + 1: 89 | y = X[:, self.header.target_column_index] 90 | y_transformed = self.label_transform(y) 91 | non_nan_indices = np.arange(y_transformed.shape[0])[~np.isnan(y_transformed)] 92 | feature_indices = np.array(self.header.feature_column_indices) 93 | X_transformed = self.feature_transformer.transform( 94 | X[non_nan_indices[:, np.newaxis], feature_indices[np.newaxis, :]] 95 | ) 96 | y_transformed_no_nans = y_transformed[non_nan_indices] 97 | return np.column_stack((y_transformed_no_nans, self._dense_array(X_transformed))) 98 | 99 | # X contains only the features. 100 | if n_columns == n_features: 101 | return self.feature_transformer.transform(X) 102 | 103 | raise ValueError( 104 | f"Received data of unknown size. Expected number of columns is {n_features}. " 105 | f"Number of columns in the received data is {n_columns}." 106 | ) 107 | 108 | def label_transform(self, y): 109 | """Apply transformation, if ``target_transformer`` has been specified. 110 | 111 | Parameters 112 | ---------- 113 | y : array-like, 1-dimensional 114 | 115 | Returns 116 | ------- 117 | array-like 118 | The transformed data. If target transformer has not been specified, simply returns the input. 119 | """ 120 | if self.target_transformer: 121 | return self.target_transformer.transform(y) 122 | 123 | return y.astype("float32") 124 | 125 | def inverse_label_transform(self, yt): 126 | """Apply inverse target transformation, if ``target_transformer`` has been specified set. 127 | 128 | Parameters 129 | ---------- 130 | yt : array-like, 1-dimensional 131 | 132 | Returns 133 | ------- 134 | array-like 135 | The inverse-transformed target. If target transformer has not been specified, simply returns the input. 136 | """ 137 | if not self.target_transformer: 138 | return yt 139 | 140 | return self.target_transformer.inverse_transform(yt) 141 | 142 | @staticmethod 143 | def _dense_array(arr): 144 | """Converts the input array to dense array. 145 | 146 | Parameters 147 | ---------- 148 | arr : numpy array or csr_matrix 149 | The array to be densified. 150 | 151 | Returns 152 | ------- 153 | array-like 154 | Dense numpy array representing arr. 155 | 156 | """ 157 | if isspmatrix(arr): 158 | return arr.todense() 159 | return arr 160 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/externals/header.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. 
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from collections import defaultdict 15 | from collections import namedtuple 16 | from collections import OrderedDict 17 | 18 | 19 | Indices = namedtuple("Indices", field_names=("column_index", "feature_index")) 20 | 21 | 22 | class Header: 23 | """ A utility class to manage the header and target column. The header contains the names for 24 | all columns in a dataset including the target column. This class validates the header, 25 | checking for presence of duplicate column names and absence of target column name. 26 | 27 | This class provides functionality to translate the column names to column indices (data set including target column) 28 | and feature indices (data set excluding target column) respectively. 29 | 30 | This class is used in the code generated by the SageMaker Pipeline Recommender algorithm. 31 | 32 | Usage 33 | ------ 34 | >>> h = Header(column_names=['a', 'b', 'c'], target_column_name='b') 35 | >>> h.as_column_indices(['a', 'c']) 36 | [0, 2] 37 | 38 | >>> h.as_feature_indices(['a', 'c']) 39 | [0, 1] 40 | 41 | >>> h.target_column_name 42 | b 43 | 44 | >>> h.target_column_index 45 | 1 46 | 47 | >>> h.as_column_indices(['b']) 48 | [1] 49 | 50 | """ 51 | 52 | def __init__(self, column_names: list, target_column_name: str): 53 | """ 54 | Parameters 55 | ---------- 56 | column_names : iterable of the column names in the order of occurrence 57 | 58 | target_column_name : str, name of the target column 59 | 60 | Raises 61 | ------ 62 | 63 | ValueError : target_column_name is not present in column_names or duplicate entries found in column_names 64 | """ 65 | 66 | self.target_column_index = None 67 | self.target_column_name = target_column_name 68 | 69 | # maintaining a dict{column_name: Indices} 70 | self._column_name_indices = OrderedDict() 71 | 72 | feature_index_offset = 0 73 | duplicate_column_indices = defaultdict(list) 74 | 75 | for i, column_name in enumerate(column_names): 76 | # already seen the column, add to duplicate_column_indices 77 | if column_name in self._column_name_indices: 78 | duplicate_column_indices[column_name].append(i) 79 | else: 80 | self._column_name_indices[column_name] = Indices(column_index=i, feature_index=i - feature_index_offset) 81 | 82 | # if it's target column, setup target_index and adjust the feature index 83 | # offset for following features columns 84 | if column_name == target_column_name: 85 | self.target_column_index = i 86 | feature_index_offset = 1 87 | self._column_name_indices[column_name] = Indices(column_index=i, feature_index=None) 88 | 89 | if self.target_column_index is None: 90 | raise ValueError( 91 | "Specified target column '{target_column_name}' is " 92 | "not a valid column name.".format(target_column_name=target_column_name) 93 | ) 94 | 95 | if duplicate_column_indices: 96 | raise ValueError( 97 | "Duplicate column names were found:\n{}".format( 98 | "\n".join( 99 | [ 100 | "{name} at index {index}".format(name=name, index=index) 101 | for (name, index) in duplicate_column_indices.items() 102 | ] 103 | ) 104 | ) 105 | ) 106 | 107 | def as_feature_indices(self, column_names: list) -> list: 108 | """ Returns list of feature indices for the given column names. 
109 | 110 | Parameters 111 | ---------- 112 | column_names : iterable containing feature names 113 | 114 | Returns 115 | ------- 116 | feature_indices : iterable containing the indices corresponding to column_names, 117 | assuming target column excluded. 118 | 119 | Raises 120 | ------ 121 | ValueError : At least one of the items in column_names is not a feature name. 122 | 123 | """ 124 | 125 | def _index(name): 126 | 127 | if self.target_column_name == name: 128 | raise ValueError( 129 | "'{}' is the target column name. " "It cannot be converted to feature index.".format(name) 130 | ) 131 | 132 | try: 133 | return self._column_name_indices[name].feature_index 134 | except KeyError: 135 | raise ValueError("'{}' is an unknown feature name".format(name)) 136 | 137 | return [_index(name) for name in column_names] 138 | 139 | def as_column_indices(self, column_names: list) -> list: 140 | """ Returns list of indices for the given column names. 141 | 142 | Parameters 143 | ---------- 144 | column_names : iterable containing column names 145 | 146 | Returns 147 | ------- 148 | column_indices : iterable containing the indices corresponding to column names, 149 | assuming target column is included in the data. 150 | 151 | Raises 152 | ------ 153 | ValueError : Unknown column name is found in column_names 154 | 155 | """ 156 | 157 | def _index(name): 158 | try: 159 | return self._column_name_indices[name].column_index 160 | except KeyError: 161 | raise ValueError("'{}' is an unknown column name.".format(name)) 162 | 163 | return [_index(name) for name in column_names] 164 | 165 | @property 166 | def feature_column_indices(self): 167 | """Returns list of feature column indices in the order in which they were provided. 168 | 169 | The order of the indices is determined by the ``column_names`` parameter. 170 | 171 | Returns 172 | ------- 173 | feature_column_indices : list of int 174 | """ 175 | return [ 176 | index_instance.column_index 177 | for index_instance in self._column_name_indices.values() 178 | if index_instance.feature_index is not None 179 | ] 180 | 181 | @property 182 | def num_columns(self): 183 | """ Returns number of columns including target column. 184 | 185 | Returns 186 | ------- 187 | num_columns : integer, Number of columns. 188 | """ 189 | return len(self._column_name_indices) 190 | 191 | @property 192 | def num_features(self): 193 | """ Returns number of features, i.e. the number of columns excluding target column. 194 | 195 | Returns 196 | ------- 197 | num_features : integer, Number of features. 198 | 199 | """ 200 | return len(self._column_name_indices) - 1 201 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/feature_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | """ 15 | The :mod:`sagemaker_sklearn_extension.feature_extraction` module deals 16 | with feature extraction from raw data. It currently includes estimators 17 | to extract features from text. This module is based on the 18 | :mod:`sklearn.feature_extraction` module. 19 | """ 20 | 21 | from . import date_time 22 | from . import sequences 23 | from . import text 24 | 25 | __all__ = ["date_time", "sequences", "text"] 26 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/feature_extraction/date_time.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from datetime import datetime 15 | from enum import Enum 16 | 17 | from dateutil import parser 18 | import numpy as np 19 | from sklearn.base import BaseEstimator, TransformerMixin 20 | from sklearn.utils.validation import check_array, check_is_fitted 21 | 22 | 23 | class DateTimeProperty: 24 | def __init__(self, extract_func, max_, min_): 25 | """Contains information about a property of a datetime object 26 | 27 | Parameters 28 | ---------- 29 | extract_func: function 30 | function mapping a datetime object to the property 31 | max_: int 32 | maximum value for the property 33 | min_: int 34 | minimum value for the property 35 | """ 36 | self.min = min_ 37 | self.max = max_ 38 | self.extract_func = extract_func 39 | 40 | 41 | def extract_week_of_year(t): 42 | return t.isocalendar()[1] if isinstance(t, datetime) else np.nan 43 | 44 | 45 | def extract_weekday(t): 46 | return t.isocalendar()[2] if isinstance(t, datetime) else np.nan 47 | 48 | 49 | def extract_year(t): 50 | return t.year if isinstance(t, datetime) else np.nan 51 | 52 | 53 | def extract_hour(t): 54 | return t.hour if isinstance(t, datetime) else np.nan 55 | 56 | 57 | def extract_month(t): 58 | return t.month if isinstance(t, datetime) else np.nan 59 | 60 | 61 | def extract_minute(t): 62 | return t.minute if isinstance(t, datetime) else np.nan 63 | 64 | 65 | def extract_quarter(t): 66 | return (t.month - 1) // 3 + 1 if isinstance(t, datetime) else np.nan 67 | 68 | 69 | def extract_second(t): 70 | return t.second if isinstance(t, datetime) else np.nan 71 | 72 | 73 | def extract_day_of_year(t): 74 | return t.timetuple().tm_yday if isinstance(t, datetime) else np.nan 75 | 76 | 77 | def extract_day_of_month(t): 78 | return t.day if isinstance(t, datetime) else np.nan 79 | 80 | 81 | class DateTimeDefinition(Enum): 82 | WEEK_OF_YEAR = DateTimeProperty(extract_week_of_year, 53, 1) 83 | WEEKDAY = DateTimeProperty(extract_weekday, 7, 1) 84 | YEAR = DateTimeProperty(extract_year, None, None) 85 | HOUR = DateTimeProperty(extract_hour, 23, 0) 86 | MONTH = DateTimeProperty(extract_month, 12, 1) 87 | MINUTE = DateTimeProperty(extract_minute, 59, 0) 88 | QUARTER = DateTimeProperty(extract_quarter, 4, 1) 89 | SECOND = DateTimeProperty(extract_second, 59, 0) 90 | DAY_OF_YEAR = 
DateTimeProperty(extract_day_of_year, 366, 1) 91 | DAY_OF_MONTH = DateTimeProperty(extract_day_of_month, 31, 1) 92 | 93 | 94 | class DateTimeVectorizer(BaseEstimator, TransformerMixin): 95 | def __init__(self, extract=None, mode="cyclic", ignore_constant_columns=True, default_datetime=None): 96 | """Converts array-like data with datetime.datetime or strings describing datetime objects into numeric features 97 | 98 | A datetime item contains categorical information: year, month, hour, day of week, etc. This information is given 99 | as the output features. The encoding of these categories can be ordinal or cyclic. The cyclic encoding of an 100 | integer i between 0 and k consists of two floats: sin(i/k), cos(i/k). This makes sure for example that the 101 | months Decembers and January are encoded to vectors that are close in Euclidean distance. 102 | 103 | Parameters 104 | ---------- 105 | extract: list of DateTimeProperty, default None 106 | Types of data to extract. See DateTimeDefinition class for options. If given None, 107 | defaults to DateTimeVectorizer.default_data 108 | mode: str, default cyclic 109 | 'ordinal': each data type is outputted to a non-negative integer, as in ordinal encoding for categorical 110 | data 111 | 'cyclic': each data type is converted to two numbers in [-1,1] so that the distance between these numbers 112 | is small for close items in the cyclic order (for example hour=23 is close to hour=0) 113 | ignore_constant_columns: bool, default True 114 | If True, fit will make sure the output columns are not constant in the training set. 115 | default_datetime: DateTime, default None 116 | Default DateTime object to use when information is missing from input array. This DateTime object is passed 117 | as a keyword argument into the dateutil.parser.parse method. If this is a datetime object and not None, 118 | elements specified in the parse method replace elements in the default object. 119 | When ignore_constant_columns is True, the filled DateTime information will be removed if constant. 120 | 121 | Attributes 122 | ---------- 123 | extract_ : list of DateTimeProperty 124 | List of DateTimeProperty objects, each providing the necessary information for extracting a single property 125 | from a datetime object. The properties corresponding to this list describe the different columns of the 126 | output of the transform function 127 | 128 | 129 | Examples 130 | -------- 131 | >>> from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer 132 | >>> import numpy as np 133 | >>> data = np.array([ 134 | ... 'Jan 3th, 2018, 1:34am', 135 | ... 'Feb 11th, 2012, 11:34:59pm', 136 | ... 
]).reshape((-1, 1)) 137 | >>> date_time = DateTimeVectorizer(mode='ordinal', ignore_constant_columns=False) 138 | >>> X = date_time.fit_transform(data) 139 | >>> print(X.shape) 140 | (2, 7) 141 | >>> print(X[0].astype(np.int)) 142 | [ 2 2018 1 34 0 0 0] 143 | >>> date_time = DateTimeVectorizer(mode='ordinal') 144 | >>> # with ignore_constant_columns=True, the minute field, which is 34 in both examples, will be filtered 145 | >>> X = date_time.fit_transform(data) 146 | >>> print(X.shape) 147 | (2, 6) 148 | >>> print(X[0].astype(np.int)) 149 | [ 2 2018 1 0 0 0] 150 | 151 | 152 | 153 | """ 154 | self.extract = extract 155 | self.mode = mode 156 | self.ignore_constant_columns = ignore_constant_columns 157 | self.default_datetime = default_datetime 158 | 159 | @staticmethod 160 | def _cyclic_transform(data, low, high): 161 | """ 162 | Converts numeric data into 2d-cyclic. 163 | 164 | The conversion of a single integer into two floats makes sure that the Euclidian distance between two (output) 165 | values is similar to the cyclic distance between the integers. For example, hour of day is a number between 0 166 | and 23. The cyclic distance between the hours 0 and 23 is 1 (and not 23). After the cyclic transform, the 167 | transformed hour 0 will be a vector very close to that of the hour 23, and far away from that of 12. 168 | 169 | Parameters 170 | ---------- 171 | data: np.array of numbers 172 | low: lower bound of the data values 173 | high: upper bound of the data values 174 | 175 | Returns 176 | ------- 177 | np.array with double the dimension in the last axis 178 | 179 | Examples 180 | -------- 181 | >>> from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer 182 | >>> output = DateTimeVectorizer._cyclic_transform(np.array([[1],[2],[3],[4]]), low=1, high=4) 183 | >>> # up to numeric precision, the outputs should be [[0,1], [1,0], [0,-1], [-1,0]] 184 | >>> print(output) 185 | [[ 0.0000000e+00 1.0000000e+00] 186 | [ 1.0000000e+00 6.1232340e-17] 187 | [ 1.2246468e-16 -1.0000000e+00] 188 | [-1.0000000e+00 -1.8369702e-16]] 189 | >>> output = DateTimeVectorizer._cyclic_transform(np.array([[1],[2],[3],[4],[5],[6],[7],[8]]), low=1, high=8) 190 | >>> print(output) 191 | [[ 0.00000000e+00 1.00000000e+00] 192 | [ 7.07106781e-01 7.07106781e-01] 193 | [ 1.00000000e+00 6.12323400e-17] 194 | [ 7.07106781e-01 -7.07106781e-01] 195 | [ 1.22464680e-16 -1.00000000e+00] 196 | [-7.07106781e-01 -7.07106781e-01] 197 | [-1.00000000e+00 -1.83697020e-16] 198 | [-7.07106781e-01 7.07106781e-01]] 199 | """ 200 | normalized = (data - low) * 2 * np.pi / (1 + high - low) 201 | sin_values = np.sin(normalized) 202 | cos_values = np.cos(normalized) 203 | 204 | shape = list(sin_values.shape) 205 | 206 | tmp_shape = tuple(shape + [1]) 207 | sin_values = sin_values.reshape(tmp_shape) 208 | cos_values = cos_values.reshape(tmp_shape) 209 | ret = np.concatenate((sin_values, cos_values), axis=len(tmp_shape) - 1) 210 | 211 | shape[-1] *= 2 212 | return ret.reshape(tuple(shape)) 213 | 214 | default_data = [ 215 | DateTimeDefinition.WEEKDAY.value, 216 | DateTimeDefinition.YEAR.value, 217 | DateTimeDefinition.HOUR.value, 218 | DateTimeDefinition.MINUTE.value, 219 | DateTimeDefinition.SECOND.value, 220 | DateTimeDefinition.MONTH.value, 221 | DateTimeDefinition.WEEK_OF_YEAR.value, 222 | ] 223 | 224 | def _to_datetime_single(self, item): 225 | if isinstance(item, datetime): 226 | return item 227 | try: 228 | return parser.parse(item, default=self.default_datetime) 229 | except ValueError: 230 | pass 231 | 
except TypeError: 232 | pass 233 | 234 | def _to_datetime_array(self, X): 235 | """Converts np array with string or datetime into datetime or None 236 | 237 | Parameters 238 | ---------- 239 | X : np.array 240 | numpy array containing data representing datetime objects 241 | 242 | Returns 243 | ------- 244 | X : np.array 245 | np.array with datetime objects of the same shape of the input. Items that could not be parsed become None 246 | 247 | """ 248 | X = np.vectorize(DateTimeVectorizer._to_datetime_single)(self, X) 249 | return X 250 | 251 | def fit(self, X, y=None): 252 | """Filter the extracted field so as not to contain constant columns. 253 | 254 | Parameters 255 | ---------- 256 | X : {array-like}, datetime.datetime or str 257 | 258 | Notes 259 | ----- 260 | If fitting with a 2d array with more than one column, any data type that is not constant in any column will 261 | remain. If for example, column 1 has year=1999 for all rows but column 2 has two or more possible year values, 262 | we will still produce an output with the year information from column 1. To avoid this, run fit on each column 263 | separately, and obtain a separate DateTimeVectorizer for each column 264 | 265 | Returns 266 | ------- 267 | self : DateTimeVectorizer 268 | """ 269 | 270 | X = check_array(X, dtype=None, force_all_finite="allow-nan") 271 | X = np.array(X) 272 | X = self._to_datetime_array(X) 273 | 274 | if self.mode not in ["cyclic", "ordinal"]: 275 | raise ValueError("mode must be either cyclic or ordinal. Current value is {}".format(self.mode)) 276 | 277 | self.extract_ = self.extract or self.default_data 278 | 279 | if self.ignore_constant_columns: 280 | new_extract = [] 281 | for col in range(X.shape[1]): 282 | # convert the current column to get the different property values 283 | transformed = self._convert(X[:, col].reshape((-1, 1)), mode="ordinal") 284 | # check for constant columns 285 | transformed_var = np.nanvar(transformed, axis=0) 286 | for i, cur_var in enumerate(transformed_var): 287 | if cur_var > 0 and self.extract_[i] not in new_extract: 288 | new_extract.append(self.extract_[i]) 289 | if not new_extract: 290 | new_extract = [self.extract_[0]] 291 | self.extract_ = new_extract 292 | 293 | return self 294 | 295 | def _convert(self, X, mode): 296 | n_cols = X.shape[1] 297 | 298 | cols = [] 299 | 300 | for datetime_property in self.extract_: 301 | # apply the function on the datetime values in the input array, create a python list. To iterate over all 302 | # items we view the input as a 1d vector 303 | cur_conversions = list(map(datetime_property.extract_func, X.reshape((-1,)))) 304 | # convert the list to a float32 numpy array 305 | cur_extract = np.array(cur_conversions, dtype=np.float32).reshape((-1, 1)) 306 | if datetime_property.min is None: 307 | # the output isn't cyclic. Leave it as is 308 | pass 309 | elif mode == "ordinal": 310 | # the output is ordinal - shift it so the minimum value is 0 311 | cur_extract -= datetime_property.min 312 | elif mode == "cyclic": 313 | # the output is cyclic - need to apply the cyclic transform 314 | cur_extract = self._cyclic_transform(cur_extract, low=datetime_property.min, high=datetime_property.max) 315 | 316 | cols.append(cur_extract) 317 | 318 | ret = np.concatenate(cols, axis=1) 319 | # the return array is in 1d form. 
We need to reshape it to bring it back to the correct 2d form 320 | ret = ret.reshape((-1, n_cols * ret.shape[1])) 321 | return ret 322 | 323 | def transform(self, X, y=None): 324 | X = check_array(X, dtype=None, force_all_finite="allow-nan") 325 | check_is_fitted(self, "extract_") 326 | 327 | X = np.array(X) 328 | X = self._to_datetime_array(X) 329 | 330 | return self._convert(X, self.mode) 331 | 332 | def _more_tags(self): 333 | return {"X_types": ["datetime.datetime", "string"]} 334 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/feature_extraction/text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import scipy.sparse as sp 16 | 17 | from sklearn.base import BaseEstimator, TransformerMixin 18 | from sklearn.feature_extraction.text import VectorizerMixin, TfidfVectorizer 19 | from sklearn.utils.validation import check_array, check_is_fitted 20 | 21 | 22 | class MultiColumnTfidfVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): 23 | """Applies ``sklearn.feature_extraction.text.TfidfVectorizer`` to each column in an array. 24 | 25 | Each column of text is treated separately with a unique TfidfVectorizer. The vectorizers are applied sequentially. 26 | 27 | Parameters 28 | ---------- 29 | strip_accents : {'ascii', 'unicode', None} (default=None) 30 | Remove accents and perform other character normalization during the preprocessing step. 31 | 'ascii' is a fast method that only works on characters that have an direct ASCII mapping. 32 | 'unicode' is a slightly slower method that works on any characters. 33 | None (default) does nothing. 34 | 35 | Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. 36 | 37 | lowercase : boolean (default=True) 38 | Convert all characters to lowercase before tokenizing. 39 | 40 | preprocessor : callable or None (default=None) 41 | Override the preprocessing (string transformation) stage while preserving the tokenizing and n-grams 42 | generation steps. 43 | 44 | tokenizer : callable or None (default=None) 45 | Override the string tokenization step while preserving the preprocessing and n-grams generation steps. 46 | Only applies if ``analyzer == 'word'``. 47 | 48 | stop_words : string {'english'}, list, or None (default) 49 | If 'english', a built-in stop word list for English is used. 50 | There are several known issues with 'english' and you should consider an alternative (see :ref:`stop_words`). 51 | 52 | If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. 53 | Only applies if ``analyzer == 'word'``. 54 | 55 | If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically 56 | detect and filter stop words based on intra corpus document frequency of terms. 
57 | 58 | token_pattern : string 59 | Regular expression denoting what constitutes a "token", only used if ``analyzer == 'word'``. The default regexp 60 | select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a 61 | token separator). 62 | 63 | ngram_range : tuple (min_n, max_n) (default=(1, 1)) 64 | The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n 65 | such that min_n <= n <= max_n will be used. 66 | 67 | analyzer : string, {'word', 'char', 'char_wb'} or callable 68 | Whether the feature should be made of word or character n-grams. 69 | Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words 70 | are padded with space. 71 | 72 | If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input. 73 | 74 | max_df : float in range [0.0, 1.0] or int (default=1.0) 75 | When building the vocabulary ignore terms that have a document frequency strictly higher than the given 76 | threshold (corpus-specific stop words). 77 | If float, the parameter represents a proportion of documents, integer absolute counts. 78 | This parameter is ignored if vocabulary is not None. 79 | 80 | min_df : float in range [0.0, 1.0] or int (default=1) 81 | When building the vocabulary ignore terms that have a document frequency strictly lower than the given 82 | threshold. This value is also called cut-off in the literature. 83 | If float, the parameter represents a proportion of documents, integer absolute counts. 84 | This parameter is ignored if vocabulary is not None. 85 | 86 | max_features : int or None (default=1000) 87 | If not None, build a vocabulary that only consider the top max_features ordered by term frequency across 88 | the corpus. 89 | This parameter is ignored if vocabulary is not None. 90 | 91 | vocabulary : Mapping or iterable, optional (default=None) 92 | Either a Mapping (e.g., a dict) where keys are terms and values are indices in the feature matrix, or an 93 | iterable over terms. If not given, a vocabulary is determined from the input. 94 | 95 | dtype : type, optional (default=float64) 96 | Type of the matrix returned by fit_transform() or transform(). 97 | 98 | norm : 'l1', 'l2' or None, optional (default='l2') 99 | Each output row will have unit norm, either: 100 | * 'l2': Sum of squares of vector elements is 1. The cosine similarity between two vectors is their dot product 101 | when l2 norm has been applied. 102 | * 'l1': Sum of absolute values of vector elements is 1. 103 | See :func:`preprocessing.normalize` 104 | 105 | use_idf : boolean (default=True) 106 | Enable inverse-document-frequency reweighting. 107 | 108 | smooth_idf : boolean (default=True) 109 | Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every 110 | term in the collection exactly once. Prevents zero divisions. 111 | 112 | sublinear_tf : boolean (default=False) 113 | Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 114 | 115 | vocabulary_sizes : list(int) (default=None) 116 | Specify the exact vocabulary size to use while encoding each column in the input dataset. The vocabulary size 117 | of a column corresponds to the number of features in its TF-IDF encoding, before the feature matrices are 118 | concatenated. 
If the feature matrix of column ``i`` has more features than the corresponding vocabulary size, 119 | only the first ``vocabulary_sizes[i]`` features are kept. If the feature matrix of column ``i`` has fewer 120 | features than the corresponding vocabulary size, zero columns are added to the feature matrix until it has 121 | ``vocabulary_sizes[i]`` features. This parameter is useful if the total number of features of the encoding 122 | has to be constant. 123 | 124 | ignore_columns_with_zero_vocabulary_size : boolean (default=True) 125 | Allow ValueErrors thrown by ``sklearn.feature_extraction.text.TfidfVectorizer`` because of over-pruning 126 | of terms to be ignored and an empty ``scipy.sparse.csr_matrix`` to be used in place of the given columns 127 | TF-IDF document-term matrix. 128 | 129 | Attributes 130 | ---------- 131 | vectorizers_ : list of ``sklearn.feature_extraction.text.TfidfVectorizers`` 132 | List of ``sklearn.feature_extraction.text.TfidfVectorizers``. Each TfidfVectorizer is separately instantiated 133 | on an input column. len(self.vectorizers_) should equal to the number of input columns. 134 | 135 | Notes 136 | ----- 137 | MultiColumnTfidfVectorizer should be used with 2D arrays of text strings, for 1D arrays of text data, use 138 | ``sklearn.feature_extraction.text.TfidfVectorizer`` or reshape using array.reshape(-1, 1) 139 | """ 140 | 141 | def __init__( 142 | self, 143 | strip_accents=None, 144 | lowercase=True, 145 | preprocessor=None, 146 | tokenizer=None, 147 | stop_words=None, 148 | token_pattern=r"(?u)\b\w\w+\b", 149 | ngram_range=(1, 1), 150 | analyzer="word", 151 | max_df=1.0, 152 | min_df=1, 153 | max_features=1000, 154 | vocabulary=None, 155 | dtype=np.float64, 156 | norm="l2", 157 | use_idf=True, 158 | smooth_idf=True, 159 | sublinear_tf=False, 160 | vocabulary_sizes=None, 161 | ignore_columns_with_zero_vocabulary_size=True, 162 | ): 163 | self.strip_accents = strip_accents 164 | self.lowercase = lowercase 165 | self.preprocessor = preprocessor 166 | self.tokenizer = tokenizer 167 | self.stop_words = stop_words 168 | self.token_pattern = token_pattern 169 | self.ngram_range = ngram_range 170 | self.analyzer = analyzer 171 | self.max_df = max_df 172 | self.min_df = min_df 173 | self.max_features = max_features 174 | self.vocabulary = vocabulary 175 | self.dtype = dtype 176 | self.norm = norm 177 | self.use_idf = use_idf 178 | self.smooth_idf = smooth_idf 179 | self.sublinear_tf = sublinear_tf 180 | self.vocabulary_sizes = vocabulary_sizes 181 | self.ignore_columns_with_zero_vocabulary_size = ignore_columns_with_zero_vocabulary_size 182 | 183 | def _fit_vectorizer(self, col_idx, X): 184 | max_features = self.max_features 185 | 186 | # Override max_features for the current column in order to enforce the vocabulary size. 
187 | if self.max_features and self.vocabulary_sizes: 188 | max_features = min(self.max_features, self.vocabulary_sizes[col_idx]) 189 | elif self.vocabulary_sizes: 190 | max_features = self.vocabulary_sizes[col_idx] 191 | 192 | try: 193 | vectorizer = TfidfVectorizer( 194 | strip_accents=self.strip_accents, 195 | lowercase=self.lowercase, 196 | preprocessor=self.preprocessor, 197 | tokenizer=self.tokenizer, 198 | stop_words=self.stop_words, 199 | token_pattern=self.token_pattern, 200 | ngram_range=self.ngram_range, 201 | analyzer=self.analyzer, 202 | max_df=self.max_df, 203 | min_df=self.min_df, 204 | max_features=max_features, 205 | vocabulary=self.vocabulary, 206 | dtype=self.dtype, 207 | norm=self.norm, 208 | use_idf=self.use_idf, 209 | smooth_idf=self.smooth_idf, 210 | sublinear_tf=self.sublinear_tf, 211 | ) 212 | vectorizer.fit(X[:, col_idx]) 213 | except ValueError as err: 214 | zero_vocab_errors = [ 215 | "After pruning, no terms remain. Try a lower min_df or a higher max_df.", 216 | "max_df corresponds to < documents than min_df", 217 | "empty vocabulary; perhaps the documents only contain stop words", 218 | ] 219 | if str(err) in zero_vocab_errors and self.ignore_columns_with_zero_vocabulary_size: 220 | vectorizer = None 221 | else: 222 | raise 223 | return vectorizer 224 | 225 | def fit(self, X, y=None): 226 | """Build the list of TfidfVectorizers for each column. 227 | 228 | Parameters 229 | ---------- 230 | X : {array-like}, text data 231 | 232 | Returns 233 | ------- 234 | self : MultiColumnTfidfVectorizer 235 | """ 236 | X = check_array(X, dtype=None) 237 | n_columns = X.shape[1] 238 | 239 | # If specified, vocabulary size must be given for each column of the input dataset. 240 | if self.vocabulary_sizes and len(self.vocabulary_sizes) != n_columns: 241 | raise ValueError("If specified, vocabulary_sizes has to have exactly one entry per data column.") 242 | 243 | self.vectorizers_ = [self._fit_vectorizer(i, X) for i in range(n_columns)] 244 | 245 | return self 246 | 247 | def _transform_vectorizer(self, col_idx, X): 248 | if self.vectorizers_[col_idx]: 249 | tfidf_features = self.vectorizers_[col_idx].transform(X[:, col_idx]) 250 | # If the vocabulary size is specified and there are too few features, then pad the output with zeros. 251 | if self.vocabulary_sizes and tfidf_features.shape[1] < self.vocabulary_sizes[col_idx]: 252 | tfidf_features = sp.csr_matrix( 253 | (tfidf_features.data, tfidf_features.indices, tfidf_features.indptr), 254 | shape=(tfidf_features.shape[0], self.vocabulary_sizes[col_idx]), 255 | ) 256 | return tfidf_features 257 | # If ``TfidfVectorizer`` threw a value error, add an empty TF-IDF document-term matrix for the column 258 | return sp.csr_matrix((X.shape[0], 0)) 259 | 260 | def transform(self, X, y=None): 261 | """Transform documents to document term-matrix. 262 | 263 | Parameters 264 | ---------- 265 | X : 2D array of text data 266 | 267 | Returns 268 | ------- 269 | tfidf_matrix : sparse matrix, [n_samples, n_features] 270 | Tf-idf-weighted document-term matrix. 
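        Examples
        --------
        A minimal sketch on a toy two-column corpus (default parameters assumed; each column contributes
        its own vocabulary of three terms, so the concatenated output has six features):

        >>> import numpy as np
        >>> from sagemaker_sklearn_extension.feature_extraction.text import MultiColumnTfidfVectorizer
        >>> corpus = np.array([["cat sat", "dog ran"], ["cat ran", "dog sat"]])
        >>> vectorizer = MultiColumnTfidfVectorizer().fit(corpus)
        >>> vectorizer.transform(corpus).shape
        (2, 6)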
271 | """ 272 | check_is_fitted(self, "vectorizers_") 273 | X = check_array(X, dtype=None) 274 | 275 | return sp.hstack([self._transform_vectorizer(i, X) for i in range(X.shape[1])]) 276 | 277 | def _more_tags(self): 278 | return {"X_types": ["string"]} 279 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/impute/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | The :mod:`sagemaker_sklearn_extension.impute` module includes 16 | transformers that preform missing value imputation. This module 17 | is based on the :mod:`sklearn.impute` module. 18 | """ 19 | 20 | from .base import RobustImputer, RobustMissingIndicator, is_finite_numeric 21 | 22 | __all__ = ["RobustImputer", "RobustMissingIndicator", "is_finite_numeric"] 23 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/impute/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | 16 | from sklearn.base import BaseEstimator, TransformerMixin 17 | from sklearn.impute import MissingIndicator, SimpleImputer 18 | from sklearn.utils.validation import check_array, check_is_fitted 19 | 20 | 21 | def is_finite_numeric(arr): 22 | """Helper function to check if values in an array can be converted to finite numeric 23 | """ 24 | 25 | def _is_finite_numeric(val): 26 | try: 27 | f = float(val) 28 | return np.isfinite(f) 29 | except ValueError: 30 | return False 31 | 32 | return np.vectorize(_is_finite_numeric)(arr) 33 | 34 | 35 | def _get_mask(X, vectorized_mask_function): 36 | """Compute boolean mask of X for vectorized_mask_function(X) == False 37 | """ 38 | return np.logical_not(vectorized_mask_function(X).astype("bool")) 39 | 40 | 41 | def _apply_mask(X, mask): 42 | X[mask] = np.nan 43 | return X 44 | 45 | 46 | class RobustImputer(BaseEstimator, TransformerMixin): 47 | """Imputer for completing missing values. 48 | 49 | Similar to sklearn.impute.SimpleImputer with added functionality 50 | - RobustImputer uses a custom mask_function to determine values to impute. 
51 | The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric 52 | which checks if a value can be converted into a float. 53 | - RobustImputer can perform multi-column imputation with different values 54 | for each column (strategy=="constant") 55 | 56 | Parameters 57 | ---------- 58 | dtype : string, type, list of types or None (default=None) 59 | Data type for output. 60 | 61 | - If left to default, numeric imputation strategies ("median" and "mean"), 62 | output array dtype will always be floating point dtype. Otherwise it will be 63 | np.dtype('O') 64 | 65 | strategy : string, optional (default='median') 66 | The imputation strategy. 67 | 68 | - If "mean", then replace missing values using the mean along 69 | each column. Can only be used with numeric data. 70 | - If "median", then replace missing values using the median along 71 | each column. Can only be used with numeric data. 72 | - If "most_frequent", then replace missing using the most frequent 73 | value along each column. Can be used with strings or numeric data. 74 | - If "constant", then replace missing values with fill_values. 75 | fill_values can be a singular value or a list of values equal to 76 | number of columns. Can be used with strings or numeric data. 77 | If fill_values is not set, fill_value will be 0 when imputing numerical 78 | data and "missing_value" for strings or object data types. 79 | 80 | fill_values : string, numerical value, or list, optional (default=None) 81 | When strategy=="constant", fill_values is used to replace all 82 | values that should be imputed. 83 | 84 | - If string or numerical value, that one value will be used to replace 85 | all values that should be imputed. 86 | - If list, fill_values must equal to number of columns of input. Each 87 | column will be imputed with the corresponding value in fill_values. 88 | fill_values[i] will replace ith column (X[:,i]). 89 | - If left to the default, fill_value will be 0 when imputing numerical 90 | data and "missing_value" for strings or object data types. 91 | 92 | mask_function : callable -> np.array, dtype('bool') (default=None) 93 | A vectorized python function, accepts np.array, returns np.array 94 | with dtype('bool') 95 | 96 | For each value, if mask_function(val) == False, that value will 97 | be imputed. mask_function is used to create a boolean mask that determines 98 | which values in the input to impute. 99 | 100 | Use np.vectorize to vectorize singular python functions. 
101 | 102 | If left to default, mask_function will be 103 | sagemaker_sklearn_extension.impute.is_finite_numeric 104 | 105 | Notes 106 | ----- 107 | only accepts 2D, non-sparse inputs 108 | """ 109 | 110 | def __init__(self, dtype=None, strategy="median", fill_values=None, mask_function=None): 111 | self.dtype = dtype 112 | self.strategy = strategy 113 | self.fill_values = fill_values 114 | self.mask_function = mask_function 115 | 116 | def _validate_input(self, X): 117 | if self._is_constant_multicolumn_imputation(): 118 | if len(self.fill_values) != X.shape[1]: 119 | raise ValueError( 120 | "'fill_values' should have length equal to number of features in X {num_features}, " 121 | "got {fill_values_length}".format(num_features=X.shape[1], fill_values_length=len(self.fill_values)) 122 | ) 123 | 124 | dtype = self.dtype or np.dtype("O") 125 | 126 | if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c": 127 | raise ValueError("Complex data not supported\n{}\n".format(X)) 128 | 129 | return check_array(X, dtype=dtype, copy=True, force_all_finite=False, ensure_2d=True) 130 | 131 | def _is_constant_multicolumn_imputation(self): 132 | return self.strategy == "constant" and isinstance(self.fill_values, (list, tuple, np.ndarray)) 133 | 134 | def fit(self, X, y=None): 135 | """Fit the imputer on X. 136 | 137 | Parameters 138 | ---------- 139 | X : {array-like}, shape (n_samples, n_features) 140 | Input data, where ``n_samples`` is the number of samples and 141 | ``n_features`` is the number of features. 142 | 143 | Returns 144 | ------- 145 | self : RobustImputer 146 | """ 147 | X = self._validate_input(X) 148 | 149 | self.vectorized_mask_function_ = self.mask_function or is_finite_numeric 150 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 151 | 152 | if self._is_constant_multicolumn_imputation(): 153 | self.simple_imputer_ = SimpleImputer(strategy=self.strategy) 154 | else: 155 | self.simple_imputer_ = SimpleImputer(strategy=self.strategy, fill_value=self.fill_values) 156 | 157 | self.simple_imputer_.fit(X) 158 | 159 | # set "SimpleImputer.statistics_" for multicolumn imputations with different column fill values 160 | # SimpleImputer cannot perform multicolumn imputation with different column fill values 161 | if self._is_constant_multicolumn_imputation(): 162 | self.simple_imputer_.statistics_ = np.asarray(self.fill_values) 163 | 164 | return self 165 | 166 | def transform(self, X): 167 | """Impute all missing values in X. 168 | 169 | Parameters 170 | ---------- 171 | X : {array-like}, shape (n_samples, n_features) 172 | The input data to complete. 173 | 174 | Returns 175 | ------- 176 | Xt : {ndarray}, shape (n_samples, n_features) 177 | The imputed input data. The data type of ``Xt`` 178 | will depend on your input dtype.
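A minimal usage sketch, assuming only the constructor arguments and methods defined above (data values and variable names below are illustrative, not taken from this module):
>>> import numpy as np
>>> from sagemaker_sklearn_extension.impute import RobustImputer
>>> X = np.array([["1", "n/a"], ["oops", "4"]], dtype=object)  # illustrative input
>>> imputer = RobustImputer(strategy="constant", fill_values=[0, -1])
>>> Xt = imputer.fit_transform(X)  # non-numeric entries are imputed column-wise with 0 and -1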
179 | """ 180 | check_is_fitted(self, ["simple_imputer_", "vectorized_mask_function_"]) 181 | X = self._validate_input(X) 182 | 183 | if X.shape[1] != self.simple_imputer_.statistics_.shape[0]: 184 | raise ValueError( 185 | "'transform' input X has {transform_dim} features per sample, " 186 | "expected {fit_dim} from 'fit' input".format( 187 | transform_dim=X.shape[1], fit_dim=self.simple_imputer_.statistics_.shape[0] 188 | ) 189 | ) 190 | 191 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 192 | 193 | return self.simple_imputer_.transform(X).astype(self.dtype) 194 | 195 | def _more_tags(self): 196 | return {"allow_nan": True} 197 | 198 | 199 | class RobustMissingIndicator(BaseEstimator, TransformerMixin): 200 | """Binary indicators for missing values. 201 | 202 | Note that this component typically should not be used in a vanilla 203 | :class:`sklearn.pipeline.Pipeline` consisting of transformers and a classifier, 204 | but rather could be added using a :class:`sklearn.pipeline.FeatureUnion` or 205 | :class:`sklearn.compose.ColumnTransformer`. 206 | 207 | Similar to sklearn.impute.MissingIndicator with added functionality 208 | - RobustMissingIndicator uses a custom mask_function to determine the boolean mask. 209 | The default mask_function is sagemaker_sklearn_extension.impute.is_finite_numeric 210 | which checks whether or not a value can be converted into a float. 211 | 212 | Parameters 213 | ---------- 214 | features : str, optional (default="all") 215 | Whether the imputer mask should represent all or a subset of 216 | features. 217 | 218 | - If "missing-only", the imputer mask will only represent 219 | features containing missing values during fit time. 220 | - If "all" (default), the imputer mask will represent all features. 221 | 222 | error_on_new : boolean, optional (default=True) 223 | If True (default), transform will raise an error when there are 224 | features with missing values in transform that have no missing values 225 | in fit. This is applicable only when ``features="missing-only"``. 226 | 227 | mask_function : callable -> np.array, dtype('bool') (default=None) 228 | A vectorized python function, accepts np.array, returns np.array 229 | with dtype('bool') 230 | 231 | For each value, if mask_function(val) == False, that value will 232 | be imputed. mask_function is used to create a boolean mask that determines 233 | which values in the input to impute. 234 | 235 | Use np.vectorize to vectorize singular python functions. 236 | 237 | By default, mask_function will be 238 | sagemaker_sklearn_extension.impute.is_finite_numeric 239 | 240 | Notes 241 | ----- 242 | only accepts 2D, non-sparse inputs 243 | """ 244 | 245 | def __init__(self, features="all", error_on_new=True, mask_function=None): 246 | self.features = features 247 | self.error_on_new = error_on_new 248 | self.mask_function = mask_function 249 | 250 | def _validate_input(self, X): 251 | if hasattr(X, "dtype") and X.dtype is not None and hasattr(X.dtype, "kind") and X.dtype.kind == "c": 252 | raise ValueError("Complex data not supported\n{}\n".format(X)) 253 | 254 | return check_array(X, dtype=np.dtype("O"), copy=True, force_all_finite=False, ensure_2d=True) 255 | 256 | def fit(self, X, y=None): 257 | """Fit the transformer on X. 258 | 259 | Parameters 260 | ---------- 261 | X : {array-like}, shape (n_samples, n_features) 262 | Input data, where ``n_samples`` is the number of samples and 263 | ``n_features`` is the number of features. 
264 | 265 | Returns 266 | ------- 267 | self : RobustMissingIndicator 268 | """ 269 | X = self._validate_input(X) 270 | 271 | self.vectorized_mask_function_ = self.mask_function or is_finite_numeric 272 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 273 | 274 | self.missing_indicator_ = MissingIndicator(features=self.features, error_on_new=self.error_on_new) 275 | self.missing_indicator_.fit(X) 276 | 277 | return self 278 | 279 | def transform(self, X): 280 | """Generate missing values indicator for X. 281 | 282 | Parameters 283 | ---------- 284 | X : {array-like}, shape (n_samples, n_features) 285 | The input data to complete. 286 | 287 | Returns 288 | ------- 289 | Xt : {ndarray}, shape (n_samples, n_features) 290 | The missing indicator for input data. The data type of ``Xt`` 291 | will be boolean. 292 | """ 293 | check_is_fitted(self, ["missing_indicator_", "vectorized_mask_function_"]) 294 | X = self._validate_input(X) 295 | 296 | X = _apply_mask(X, _get_mask(X, self.vectorized_mask_function_)) 297 | 298 | return self.missing_indicator_.transform(X) 299 | 300 | def _more_tags(self): 301 | return {"allow_nan": True} 302 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from .base import BaseExtremeValueTransformer 15 | from .base import LogExtremeValuesTransformer 16 | from .base import QuantileExtremeValuesTransformer 17 | from .base import RemoveConstantColumnsTransformer 18 | from .base import log_transform 19 | from .base import quantile_transform_nonrandom 20 | from .data import QuadraticFeatures 21 | from .data import RobustStandardScaler 22 | from .encoders import NALabelEncoder 23 | from .encoders import RobustLabelEncoder 24 | from .encoders import RobustOrdinalEncoder 25 | from .encoders import ThresholdOneHotEncoder 26 | from .encoders import WOEEncoder 27 | from .encoders import SimilarityEncoder 28 | 29 | __all__ = [ 30 | "BaseExtremeValueTransformer", 31 | "LogExtremeValuesTransformer", 32 | "NALabelEncoder", 33 | "QuadraticFeatures", 34 | "QuantileExtremeValuesTransformer", 35 | "ThresholdOneHotEncoder", 36 | "RemoveConstantColumnsTransformer", 37 | "RobustLabelEncoder", 38 | "RobustOrdinalEncoder", 39 | "RobustStandardScaler", 40 | "log_transform", 41 | "quantile_transform_nonrandom", 42 | "WOEEncoder", 43 | "SimilarityEncoder", 44 | ] 45 | -------------------------------------------------------------------------------- /src/sagemaker_sklearn_extension/preprocessing/data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 
You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from itertools import combinations 15 | 16 | import numpy as np 17 | from scipy.sparse import issparse 18 | 19 | from sklearn.base import BaseEstimator, TransformerMixin 20 | from sklearn.preprocessing import StandardScaler 21 | from sklearn.utils import check_array 22 | from sklearn.utils import check_random_state 23 | from sklearn.utils.validation import check_is_fitted 24 | from sklearn.utils.validation import FLOAT_DTYPES 25 | 26 | 27 | class QuadraticFeatures(BaseEstimator, TransformerMixin): 28 | """Generate and add quadratic features to feature matrix. 29 | 30 | Generate a new feature matrix containing the original data, an optional bias column, a collection of squared 31 | features, and a collection of interaction terms. If ``max_n_features`` is not large enough to include all the 32 | squared features, then a random subset of them is added instead. If it is large enough to include all squared 33 | features, but not large enough to include all quadratic features, then all of the squared features and a random 34 | subset of the interaction features are added instead. 35 | 36 | This transformer is similar to ``PolynomialFeatures`` from the ``sklearn.preprocessing.data`` module. 37 | 38 | Parameters 39 | ---------- 40 | include_bias : boolean (default = False) 41 | Whether to include a bias column -- the feature in which all entries are set to 1.0, and which acts as the 42 | intercept term in a linear model. Note that this parameter is False by default, in contrast to the corresponding 43 | parameter in ``sklearn``'s ``PolynomialFeatures``. 44 | 45 | interaction_only : boolean (default = False) 46 | Whether to produce only interaction features, and omit the squared features. For example, if the features are 47 | [a, b], then this will include ab, but not a^2 and b^2. The bias column is not affected by this parameter. 48 | 49 | max_n_features : int (default = 1000) 50 | The maximum number of features to include in the output data matrix. Squared features are prioritized over 51 | interaction features, unless ``interaction_only`` is ``True``. Must be larger than the number of input features 52 | (plus one, if ``include_bias`` is ``True``). 53 | 54 | order : str in {'C', 'F'} (default = 'C') 55 | Order of the input array: 'C' stands for C-contiguous order, and 'F' stands for Fortran-contiguous order. 56 | 57 | random_state : int, RandomState instance, or None (default = 0) 58 | If int, ``random_state`` is the seed used by the random number generator; if ``RandomState`` instance, 59 | ``random_state`` is the random number generator; if None, the random number generator is the ``RandomState`` 60 | instance used by ``np.random``. Used to determine which feature combinations to include in the output dataset 61 | when ``max_n_features`` is too small to fit all quadratic features. 
62 | 63 | Examples 64 | -------- 65 | >>> import numpy as np 66 | >>> from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures 67 | >>> X = np.arange(1, 7).reshape((2, 3)) 68 | >>> X 69 | array([[1, 2, 3], 70 | [4, 5, 6]]) 71 | >>> QuadraticFeatures().fit_transform(X) 72 | array([[ 1, 2, 3, 1, 4, 9, 2, 3, 6], 73 | [ 4, 5, 6, 16, 25, 36, 20, 24, 30]]) 74 | >>> QuadraticFeatures(interaction_only=True, max_n_features=5).fit_transform(X) 75 | array([[ 1, 2, 3, 2, 3], 76 | [ 4, 5, 6, 20, 24]]) 77 | 78 | Attributes 79 | ---------- 80 | combinations_ : list of tuples (i, j) 81 | List of tuples with two elements, each containing the indexes of the columns that are multiplied element-wise 82 | to form a single output column. Tuples appear in the same order as the corresponding output columns. 83 | n_input_features_ : int 84 | The number of columns in the input dataset. 85 | n_output_features_ : int 86 | The number of columns in the output dataset. 87 | 88 | Notes 89 | ----- 90 | Accepts only two-dimensional, dense input arrays. 91 | """ 92 | 93 | def __init__(self, include_bias=False, interaction_only=False, max_n_features=1000, order="C", random_state=0): 94 | self.include_bias = include_bias 95 | self.interaction_only = interaction_only 96 | self.max_n_features = max_n_features 97 | self.order = order 98 | self.random_state = random_state 99 | 100 | def _build_combinations(self, n_features, random_state): 101 | """Calculate the feature pairs to be added to the input data based on parameters and number of input columns. 102 | 103 | If ``interaction_only`` is ``True``, all squared features are omitted. Otherwise, they are added before 104 | interaction features. If there is enough space--as indicated by ``max_n_features``--to add all squared features, 105 | then do so. Otherwise, take a random sub-sample. Then, if there's enough space to add all interaction features, 106 | do so. Otherwise, return a random sub-sample of those. 107 | 108 | Parameters 109 | ---------- 110 | n_features : int 111 | The number of columns in the input vector. 112 | random_state : RandomState 113 | The prepared (using ``check_random_state``) ``RandomState`` instance. 114 | """ 115 | # First calculate how many new features of each kind (squared and interaction) we can add. 116 | added_feature_budget = self.max_n_features - n_features - int(self.include_bias) 117 | if added_feature_budget <= 0: 118 | message = "max_n_features must be large enough for the output to contain more than the original dataset" 119 | if self.include_bias: 120 | message += " and bias column" 121 | raise ValueError(message) 122 | squared_feature_budget = 0 if self.interaction_only else min(added_feature_budget, n_features) 123 | interaction_feature_budget = max(0, added_feature_budget - squared_feature_budget) 124 | 125 | # Produce squared feature pairs. 126 | squared_features = [] 127 | if squared_feature_budget == n_features: 128 | # No need to reorder if we can fit all squared features. 129 | squared_features = [(i, i) for i in range(n_features)] 130 | elif squared_feature_budget > 0: 131 | # Otherwise, take a random sample of them. 132 | squared_features = [ 133 | (i, i) for i in random_state.choice(range(n_features), size=squared_feature_budget, replace=False) 134 | ] 135 | 136 | # Produce interaction feature pairs. 
137 | interaction_features = [] 138 | if interaction_feature_budget > 0: 139 | interaction_features = list(combinations(range(n_features), 2)) 140 | 141 | # Take a random sample of feature interactions if not all can fit. 142 | if len(interaction_features) > interaction_feature_budget: 143 | random_state.shuffle(interaction_features) 144 | 145 | interaction_features = interaction_features[:interaction_feature_budget] 146 | 147 | return squared_features + interaction_features 148 | 149 | def fit(self, X, y=None): 150 | """ 151 | Compute the number of output features and the combination of input features to multiply. 152 | 153 | Parameters 154 | ---------- 155 | X : array-like , shape (n_samples, n_features) 156 | The data array to transform. Must be a non-sparse two-dimensional numpy array. 157 | 158 | Returns 159 | ------- 160 | self : instance 161 | """ 162 | _, n_features = check_array(X).shape 163 | random_state = check_random_state(self.random_state) 164 | self.combinations_ = self._build_combinations(n_features, random_state) 165 | self.n_input_features_ = n_features 166 | self.n_output_features_ = n_features + len(self.combinations_) + int(self.include_bias) 167 | return self 168 | 169 | def transform(self, X): 170 | """ 171 | Transform data to the chosen quadratic features. 172 | 173 | Parameters 174 | ---------- 175 | X : array-like, shape (n_samples, n_features) 176 | The data array to transform. Must be a non-sparse and two-dimensional. 177 | 178 | Returns 179 | ------- 180 | XQ : np.ndarray, shape (n_samples, n_output_features_) 181 | The array of computed features. 182 | """ 183 | check_is_fitted(self, ["n_input_features_", "n_output_features_", "combinations_"]) 184 | X = check_array(X, order=self.order) 185 | n_samples, n_features = X.shape 186 | 187 | if n_features != self.n_input_features_: 188 | raise ValueError("X shape does not match training shape.") 189 | 190 | XQ = np.empty((n_samples, self.n_output_features_), dtype=X.dtype, order=self.order) 191 | 192 | if self.include_bias: 193 | XQ[:, 0] = 1.0 194 | X_col_range_start, X_col_range_end = 1, self.n_input_features_ + 1 195 | else: 196 | X_col_range_start, X_col_range_end = 0, self.n_input_features_ 197 | 198 | XQ[:, X_col_range_start:X_col_range_end] = X 199 | XQ[:, X_col_range_end:] = np.column_stack([X[:, i] * X[:, j] for i, j in self.combinations_]) 200 | 201 | return XQ 202 | 203 | 204 | class RobustStandardScaler(BaseEstimator, TransformerMixin): 205 | """Scaler to adaptively scale dense and sparse inputs. 206 | 207 | RobustStandardScaler uses `sklearn.preprocessing.StandardScaler` to perform standardization, but adapts 208 | the centering based on the sparsity of the data. 209 | 210 | For dense inputs, the standard score of a sample `x` is calculated as: 211 | 212 | z = (x - u) / s 213 | 214 | where `u` is the mean of the training samples, and `s` is the standard deviation of the training samples. 215 | The mean `u` is a vector of means of each feature. If the number of zeros for a feature is greater than or 216 | equal to 70% of the total number of samples, the corresponding value in `u` is set to `0` to avoid centering 217 | by mean. 218 | 219 | For sparse inputs, the standard score of a sample `x` is calculated as: 220 | 221 | z = x / s 222 | 223 | where `s` is the standard deviation of the training samples. 224 | 225 | Parameters 226 | ---------- 227 | copy : boolean, optional, default True 228 | If False, try to avoid a copy and do inplace scaling instead. 
229 | This is not guaranteed to always work inplace; e.g. if the data is 230 | not a NumPy array or scipy.sparse CSR matrix, a copy may still be 231 | returned. 232 | 233 | Attributes 234 | ---------- 235 | scaler_ : ``sklearn.preprocessing.StandardScaler`` 236 | - `scaler_` is instantiated inside the fit method and is used for computing the center and the standard deviation. 237 | 238 | """ 239 | 240 | def __init__(self, copy=True): 241 | self.copy = copy 242 | 243 | def fit(self, X, y=None): 244 | """Fit RobustStandardScaler to X. 245 | 246 | If the input is sparse, `fit` overrides `with_mean` to standardize without subtracting the mean (avoids breaking 247 | for sparse matrices). 248 | 249 | If the data is dense, the mean is adjusted for mostly-zero features and the data is centered with the adjusted mean before scaling. 250 | 251 | Parameters 252 | ---------- 253 | X : array-like, shape [n_samples, n_features] 254 | The data to standardize. 255 | 256 | Returns 257 | ------- 258 | self : RobustStandardScaler 259 | """ 260 | X = check_array( 261 | X, accept_sparse=("csr", "csc"), estimator=self, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" 262 | ) 263 | 264 | with_mean = not issparse(X) 265 | 266 | self.scaler_ = StandardScaler(with_mean=with_mean, with_std=True, copy=self.copy) 267 | self.scaler_.fit(X) 268 | 269 | if self.scaler_.with_mean: 270 | nnz_mean_mask = np.where(np.count_nonzero(X, axis=0) / X.shape[0] > 0.3, 1, 0) 271 | self.scaler_.mean_ = self.scaler_.mean_ * nnz_mean_mask 272 | 273 | return self 274 | 275 | def transform(self, X): 276 | """ 277 | Standardize data by centering and scaling. 278 | 279 | Parameters 280 | ---------- 281 | X : array-like, shape (n_samples, n_features) 282 | The data array to transform. 283 | 284 | Returns 285 | ------- 286 | Xt : array-like, shape (n_samples, n_features) 287 | The array of transformed input.
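A minimal usage sketch for the dense case, mirroring the data exercised in test/test_data.py (variable names are illustrative):
>>> import numpy as np
>>> from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
>>> X = np.array([[1.0, 5.0], [2.0, 3.0], [1.0, 1.0]])
>>> Xt = RobustStandardScaler().fit_transform(X)  # centered by the sparsity-adjusted mean and scaled to unit variance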
288 | """ 289 | return self.scaler_.transform(X) 290 | 291 | def _more_tags(self): 292 | return {"allow_nan": True} 293 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/sagemaker-scikit-learn-extension/2412131311433addbae9f6ad5aa393a8bdbbe61f/test/__init__.py -------------------------------------------------------------------------------- /test/contrib/taei/data/data.csv: -------------------------------------------------------------------------------- 1 | 0.000,0.000,1.000,0.455,0.365,0.095,0.514,0.225,0.101,0.150,-1.000 2 | 0.000,0.000,1.000,0.350,0.265,0.090,0.226,0.100,0.049,0.070,1.000 3 | 1.000,0.000,0.000,0.530,0.420,0.135,0.677,0.257,0.141,0.210,-1.000 4 | 0.000,0.000,1.000,0.440,0.365,0.125,0.516,0.215,0.114,0.155,-1.000 5 | 0.000,1.000,0.000,0.330,0.255,0.080,0.205,0.089,0.040,0.055,1.000 6 | 0.000,1.000,0.000,0.425,0.300,0.095,0.351,0.141,0.077,0.120,-1.000 7 | 1.000,0.000,0.000,0.530,0.415,0.150,0.777,0.237,0.141,0.330,-1.000 8 | 1.000,0.000,0.000,0.545,0.425,0.125,0.768,0.294,0.149,0.260,-1.000 9 | 0.000,0.000,1.000,0.475,0.370,0.125,0.509,0.216,0.113,0.165,-1.000 10 | 1.000,0.000,0.000,0.550,0.440,0.150,0.894,0.315,0.151,0.320,-1.000 11 | 1.000,0.000,0.000,0.525,0.380,0.140,0.607,0.194,0.147,0.210,-1.000 12 | 0.000,0.000,1.000,0.430,0.350,0.110,0.406,0.168,0.081,0.135,-1.000 13 | 0.000,0.000,1.000,0.490,0.380,0.135,0.541,0.217,0.095,0.190,-1.000 14 | 1.000,0.000,0.000,0.535,0.405,0.145,0.684,0.273,0.171,0.205,-1.000 15 | 1.000,0.000,0.000,0.470,0.355,0.100,0.475,0.168,0.081,0.185,-1.000 16 | 0.000,0.000,1.000,0.500,0.400,0.130,0.664,0.258,0.133,0.240,-1.000 17 | 0.000,1.000,0.000,0.355,0.280,0.085,0.290,0.095,0.040,0.115,1.000 18 | 1.000,0.000,0.000,0.440,0.340,0.100,0.451,0.188,0.087,0.130,-1.000 19 | 0.000,0.000,1.000,0.365,0.295,0.080,0.256,0.097,0.043,0.100,1.000 20 | 0.000,0.000,1.000,0.450,0.320,0.100,0.381,0.171,0.075,0.115,-1.000 21 | 0.000,0.000,1.000,0.355,0.280,0.095,0.245,0.096,0.062,0.075,-1.000 22 | 0.000,1.000,0.000,0.380,0.275,0.100,0.226,0.080,0.049,0.085,-1.000 23 | 1.000,0.000,0.000,0.565,0.440,0.155,0.940,0.427,0.214,0.270,-1.000 24 | 1.000,0.000,0.000,0.550,0.415,0.135,0.763,0.318,0.210,0.200,-1.000 25 | 1.000,0.000,0.000,0.615,0.480,0.165,1.161,0.513,0.301,0.305,-1.000 26 | 1.000,0.000,0.000,0.560,0.440,0.140,0.928,0.383,0.188,0.300,-1.000 27 | 1.000,0.000,0.000,0.580,0.450,0.185,0.996,0.395,0.272,0.285,-1.000 28 | 0.000,0.000,1.000,0.590,0.445,0.140,0.931,0.356,0.234,0.280,-1.000 29 | 0.000,0.000,1.000,0.605,0.475,0.180,0.936,0.394,0.219,0.295,-1.000 30 | 0.000,0.000,1.000,0.575,0.425,0.140,0.864,0.393,0.227,0.200,-1.000 31 | 0.000,0.000,1.000,0.580,0.470,0.165,0.998,0.394,0.242,0.330,-1.000 32 | 1.000,0.000,0.000,0.680,0.560,0.165,1.639,0.606,0.281,0.460,-1.000 33 | 0.000,0.000,1.000,0.665,0.525,0.165,1.338,0.551,0.357,0.350,-1.000 34 | 1.000,0.000,0.000,0.680,0.550,0.175,1.798,0.815,0.393,0.455,-1.000 35 | 1.000,0.000,0.000,0.705,0.550,0.200,1.710,0.633,0.411,0.490,-1.000 36 | 0.000,0.000,1.000,0.465,0.355,0.105,0.479,0.227,0.124,0.125,-1.000 37 | 1.000,0.000,0.000,0.540,0.475,0.155,1.217,0.530,0.307,0.340,-1.000 38 | 1.000,0.000,0.000,0.450,0.355,0.105,0.522,0.237,0.117,0.145,-1.000 39 | 1.000,0.000,0.000,0.575,0.445,0.135,0.883,0.381,0.203,0.260,-1.000 40 | 0.000,0.000,1.000,0.355,0.290,0.090,0.328,0.134,0.086,0.090,-1.000 41 | 
1.000,0.000,0.000,0.450,0.335,0.105,0.425,0.186,0.091,0.115,-1.000 42 | 1.000,0.000,0.000,0.550,0.425,0.135,0.852,0.362,0.196,0.270,-1.000 43 | 0.000,1.000,0.000,0.240,0.175,0.045,0.070,0.032,0.024,0.020,-1.000 44 | 0.000,1.000,0.000,0.205,0.150,0.055,0.042,0.025,0.015,0.012,-1.000 45 | 0.000,1.000,0.000,0.210,0.150,0.050,0.042,0.018,0.013,0.015,-1.000 46 | 0.000,1.000,0.000,0.390,0.295,0.095,0.203,0.087,0.045,0.075,1.000 47 | 0.000,0.000,1.000,0.470,0.370,0.120,0.580,0.293,0.227,0.140,-1.000 48 | 1.000,0.000,0.000,0.460,0.375,0.120,0.461,0.177,0.110,0.150,1.000 49 | 0.000,1.000,0.000,0.325,0.245,0.070,0.161,0.075,0.025,0.045,-1.000 50 | 1.000,0.000,0.000,0.525,0.425,0.160,0.836,0.354,0.213,0.245,-1.000 51 | 0.000,1.000,0.000,0.520,0.410,0.120,0.595,0.238,0.111,0.190,-1.000 52 | 0.000,0.000,1.000,0.400,0.320,0.095,0.303,0.134,0.060,0.100,1.000 53 | 0.000,0.000,1.000,0.485,0.360,0.130,0.541,0.260,0.096,0.160,-1.000 54 | 1.000,0.000,0.000,0.470,0.360,0.120,0.477,0.210,0.105,0.150,-1.000 55 | 0.000,0.000,1.000,0.405,0.310,0.100,0.385,0.173,0.091,0.110,1.000 56 | 1.000,0.000,0.000,0.500,0.400,0.140,0.661,0.257,0.175,0.220,-1.000 57 | 0.000,0.000,1.000,0.445,0.350,0.120,0.443,0.192,0.096,0.135,-1.000 58 | 0.000,0.000,1.000,0.470,0.385,0.135,0.590,0.277,0.120,0.170,-1.000 59 | 0.000,1.000,0.000,0.245,0.190,0.060,0.086,0.042,0.014,0.025,-1.000 60 | 1.000,0.000,0.000,0.505,0.400,0.125,0.583,0.246,0.130,0.175,1.000 61 | 0.000,0.000,1.000,0.450,0.345,0.105,0.411,0.180,0.113,0.135,1.000 62 | 0.000,0.000,1.000,0.505,0.405,0.110,0.625,0.305,0.160,0.175,-1.000 63 | 1.000,0.000,0.000,0.530,0.410,0.130,0.697,0.302,0.194,0.200,-1.000 64 | 0.000,0.000,1.000,0.425,0.325,0.095,0.379,0.171,0.080,0.100,1.000 65 | 0.000,0.000,1.000,0.520,0.400,0.120,0.580,0.234,0.132,0.185,-1.000 66 | 0.000,0.000,1.000,0.475,0.355,0.120,0.480,0.234,0.102,0.135,-1.000 67 | 1.000,0.000,0.000,0.565,0.440,0.160,0.915,0.354,0.194,0.320,-1.000 68 | 1.000,0.000,0.000,0.595,0.495,0.185,1.285,0.416,0.224,0.485,-1.000 69 | 1.000,0.000,0.000,0.475,0.390,0.120,0.530,0.213,0.116,0.170,-1.000 70 | 0.000,1.000,0.000,0.310,0.235,0.070,0.151,0.063,0.041,0.045,-1.000 71 | 0.000,0.000,1.000,0.555,0.425,0.130,0.766,0.264,0.168,0.275,-1.000 72 | 1.000,0.000,0.000,0.400,0.320,0.110,0.353,0.141,0.099,0.100,-1.000 73 | 1.000,0.000,0.000,0.595,0.475,0.170,1.247,0.480,0.225,0.425,-1.000 74 | 0.000,0.000,1.000,0.570,0.480,0.175,1.185,0.474,0.261,0.380,-1.000 75 | 1.000,0.000,0.000,0.605,0.450,0.195,1.098,0.481,0.289,0.315,-1.000 76 | 1.000,0.000,0.000,0.600,0.475,0.150,1.008,0.443,0.221,0.280,-1.000 77 | 0.000,0.000,1.000,0.595,0.475,0.140,0.944,0.362,0.189,0.315,-1.000 78 | 1.000,0.000,0.000,0.600,0.470,0.150,0.922,0.363,0.194,0.305,-1.000 79 | 1.000,0.000,0.000,0.555,0.425,0.140,0.788,0.282,0.160,0.285,-1.000 80 | 1.000,0.000,0.000,0.615,0.475,0.170,1.103,0.469,0.235,0.345,-1.000 81 | 1.000,0.000,0.000,0.575,0.445,0.140,0.941,0.385,0.252,0.285,-1.000 82 | 0.000,0.000,1.000,0.620,0.510,0.175,1.615,0.510,0.192,0.675,-1.000 83 | 1.000,0.000,0.000,0.520,0.425,0.165,0.989,0.396,0.225,0.320,-1.000 84 | 0.000,0.000,1.000,0.595,0.475,0.160,1.317,0.408,0.234,0.580,-1.000 85 | 0.000,0.000,1.000,0.580,0.450,0.140,1.013,0.380,0.216,0.360,-1.000 86 | 1.000,0.000,0.000,0.570,0.465,0.180,1.295,0.339,0.223,0.440,-1.000 87 | 0.000,0.000,1.000,0.625,0.465,0.140,1.195,0.482,0.205,0.400,-1.000 88 | 0.000,0.000,1.000,0.560,0.440,0.160,0.865,0.331,0.207,0.260,-1.000 89 | 1.000,0.000,0.000,0.460,0.355,0.130,0.517,0.221,0.114,0.165,-1.000 90 | 
1.000,0.000,0.000,0.575,0.450,0.160,0.978,0.314,0.231,0.330,-1.000 91 | 0.000,0.000,1.000,0.565,0.425,0.135,0.811,0.341,0.168,0.255,-1.000 92 | 0.000,0.000,1.000,0.555,0.440,0.150,0.755,0.307,0.152,0.260,-1.000 93 | 0.000,0.000,1.000,0.595,0.465,0.175,1.115,0.402,0.254,0.390,-1.000 94 | 1.000,0.000,0.000,0.625,0.495,0.165,1.262,0.507,0.318,0.390,-1.000 95 | 0.000,0.000,1.000,0.695,0.560,0.190,1.494,0.588,0.343,0.485,-1.000 96 | 0.000,0.000,1.000,0.665,0.535,0.195,1.606,0.576,0.388,0.480,-1.000 97 | 0.000,0.000,1.000,0.535,0.435,0.150,0.725,0.269,0.139,0.250,-1.000 98 | 0.000,0.000,1.000,0.470,0.375,0.130,0.523,0.214,0.132,0.145,-1.000 99 | 0.000,0.000,1.000,0.470,0.370,0.130,0.522,0.201,0.133,0.165,1.000 100 | 1.000,0.000,0.000,0.475,0.375,0.125,0.579,0.278,0.085,0.155,-1.000 101 | -------------------------------------------------------------------------------- /test/contrib/taei/test_taei.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from sagemaker_sklearn_extension.contrib.taei import LatentSpaceOversampler, AE, VAE, StarOversampler 4 | 5 | 6 | def test_latent_space_oversampler(): 7 | # make torch deterministic 8 | torch.backends.cudnn.deterministic = True 9 | torch.backends.cudnn.benchmark = False 10 | 11 | d = np.genfromtxt("test/contrib/taei/data/data.csv", delimiter=",") 12 | categorical_features = [0, 1, 2] 13 | categorical_dims = [2, 2, 2] 14 | continuous_features = [3, 4, 5, 6, 7, 8, 9] 15 | 16 | star_fit_resample = StarOversampler(proportion=1.0).resample 17 | 18 | # Test AE+StarOversampler 19 | torch.manual_seed(0) 20 | ae_smote = LatentSpaceOversampler( 21 | model=AE( 22 | categorical_features=categorical_features, 23 | categorical_dims=categorical_dims, 24 | continuous_features=continuous_features, 25 | latent_dim=8, 26 | hidden_dim=[64, 32], 27 | nll_weight=0.5, 28 | ), 29 | base_oversampler=star_fit_resample, 30 | ) 31 | # Train the model 32 | ae_smote.fit(d[:, :10], d[:, 10], max_epoch=5, validation_ratio=None) 33 | # Use the model for oversampling 34 | X_os, y_os = ae_smote.resample(d[:, :10], d[:, 10]) 35 | np.testing.assert_almost_equal( 36 | X_os[-1, :], 37 | [1.0, 1.0, 0.0, 0.5661017, 0.7811485, 1.172961, 1.0983223, 1.5463793, 1.3487656, 0.605184], 38 | decimal=2, 39 | ) 40 | 41 | # Test VAE+StarOversampler 42 | torch.manual_seed(0) 43 | vae_smote = LatentSpaceOversampler( 44 | model=VAE( 45 | categorical_features=categorical_features, 46 | categorical_dims=categorical_dims, 47 | continuous_features=continuous_features, 48 | latent_dim=16, 49 | hidden_dim=32, 50 | nll_weight=0.1, 51 | kld_weight=0.5, 52 | ), 53 | base_oversampler=star_fit_resample, 54 | ) 55 | # Train and use the model in one function call 56 | X_os, y_os = vae_smote.fit_resample(d[:, :10], d[:, 10], max_epoch=50, early_stopping=1) 57 | np.testing.assert_almost_equal( 58 | X_os[-1, :], 59 | [0.0, 1.0, 1.0, 0.5926914, 0.4106686, 0.3133996, 0.0246359, 0.4813618, -0.1365427, -0.0096727], 60 | decimal=2, 61 | ) 62 | 63 | # Test storing and loading models 64 | vae_smote.save_model("/tmp/vae_model.pth") 65 | vae_smote_loaded = LatentSpaceOversampler(model=None, base_oversampler=star_fit_resample) 66 | vae_smote_loaded.load_model("/tmp/vae_model.pth") 67 | X_os_loaded, y_os_loaded = vae_smote_loaded.resample(d[:, :10], d[:, 10]) 68 | np.testing.assert_almost_equal(X_os, X_os_loaded) 69 | -------------------------------------------------------------------------------- /test/data/csv/dirty.csv: 
-------------------------------------------------------------------------------- 1 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,a ,bcd,this is ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 2 | ,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,\n,bcd~,"this is, ml test\n table",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 3 | t,,222222,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,", ",bcd,this is ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,28 days 4 | t,23,,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,//,bcd,this is ml 'test table ,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,28 days 5 | t,23,222222,,123.456,9.999999999,6.80,6.6666666666666666666666666666, , ,"this is ""ml test"" table ",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,1 year 6 | t,23,222222,0,,9.999999999,6.80,6.6666666666666666666666666666,\\,",","this, is. ""ml test"" \\\ntable ",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,1 year 7 | t,23,111111,888888888888,123.456,,6.78,6.6666666666666666666666666666,a ,bcd,this is ml test table,2019-10-09,00:00:00,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-3 days 8 | t,23,111111,888888888888,123.456,9.999999999,,6.6666666666666666666666666666,' ,\\,this is ~~~ ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 9 | t,23,111111,888888888888,123.456,9.999999999,6.78,,"""""","",this is ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 10 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,,"""""","this is""'""// ml test table",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 11 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666, ,,"this is ml test table""""",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 12 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666, ,bc/d,,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,02:00:00 13 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,\n,bcd\n,this is ml test table\n,,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 14 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,a ,bcd ,this is ml test table ,2019-10-09,,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 15 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,a;,bcd;,this is ml test table;,2019-10-09,20:22:02,,2019-10-09 12:22:02+00,-14 days 16 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,あ ,あいう,this is あいう ml test table,2019-10-09,20:22:02,2019-10-09 20:22:02,,-14 days 17 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,好 ,可变字符,this is ml 测试 table,2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00, 18 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,", "," 19 | bcdf 20 | ","this is 21 | ml test ' table \",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 22 | t,23,111111,888888888888,123.456,9.999999999,6.78,6.6666666666666666666666666666,", ",\bcdf',"this is "" "" ml test ' table //",2019-10-09,20:22:02,2019-10-09 20:22:02,2019-10-09 12:22:02+00,-14 days 23 | 
-------------------------------------------------------------------------------- /test/data/csv/invalid.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 2 | 5,6,7,8 3 | 9,10,11,12 4 | 13,14,15,16,extra 5 | 17,18,19,20 6 | 21,22,23,24 7 | 25,26,27,28 8 | 29,30,31,32 -------------------------------------------------------------------------------- /test/data/csv/missing_values.csv: -------------------------------------------------------------------------------- 1 | -1, rheumatoid arthritis expert tip info article treatment option support, understand rheumatoid arthritis everyday health, understand rheumatoid arthritis, understand rheumatoid arthritis everyday health root root act consumer root content everyday solution understand rheumatoid arthritis future ra treatment advance rheumatoid arthritis treatment expect future lead researcher ra treatment research exercise ra check tip slideshow help create workout program ra fitness tip question doctor print list rheumatoid arthritis question doctor visit list ra question understand rheumatoid arthritis tip manage rheumatoid arthritis pain mak key change help manage rheumatoid arthritis pain ease joint pain strive eat balance diet help healthy weight sufficient vitamin mineral counter chronic inflammation tip ease rheumatoid arthritis pain cause rheumatoid arthritis inflammation rheumatoid arthritis ra symptom cause inflammation learn inflammation lead ra symptom joint pain stiffness plus discover cause rheumatoid arthritis rheumatoid arthritis inflammation rheumatoid arthritis management research rheumatoid arthritis research lifestyle choice day impact ra symptom example people ra experience depression treate depression help people manage ra control ra rheumatoid arthritis expert yoga ra yoga safe exercise option person rheumatoid arthritis read dr susan lee answer root act consumer rheumatoid arthritis poll lifestyle change help manage ra pain please select option eat healthy balance diet muscle strengthen aerobic exercise sleep night try reduce manage stress technique haven lifestyle change toolkit healthy recipe shop list meal planner recipe box tool root act consumer enter search term register sign newsletter home health common condition add adhd addiction allergy alternative health alzheimer disease anxiety disorder arthritis asthma autism autoimmune disorder bipolar disorder pain breast cancer cancer cardiovascular health cold flu dental health depression diabete diet nutrition digestive health dvt emotional health epilepsy erectile dysfunction family health fibromyalgia fitness gerd headache migraine healthy home healthy live heart health cholesterol hiv aid hypertension ib incontinence kid health health menopause multiple sclerosis osteoporosis pain management pet health psoriasis rheumatoid arthritis schizophrenia senior health sexual health skin beauty sleep stop smok stroke swine flu weight women health yeast infection condition drug symptom checker flu checkup abdominal pain arm pain pain body ach breast pain breathing difficulty chest pain congestion cough diarrhea ear pain excessive sweate faintness fatigue fever ga headache irregular period joint pain leg pain mouth lesion nausea neck pain rash rectal bleed skin lump sore throat vaginal itch vomite food fitness calorie counter healthy recipe search recipe diet nutrition weight fitness community profile blog discussion photo albums everyday health health tool bmi calculator bmr calculator body fat calculator brain game conversion 
calculator glossary glucose tracker meal planner calorie counter photo gallery recipe box symptom checker video weight tracker everyday health edit profile inbox discussion blog friend tool copyright everyday health inc everydayhealth com everyday health inc help ad policy advertise link feedback advertise notice site third party advertisement site collect information visit site website provide advertisement service obtain information advertise practice choice online behavioral advertise please click material web site provide educational purpose medical advice diagnosis treatment additional information site subject term privacy policy site comply honcode standard trustworthy health information verify 2 | -1, ,practice location, , practice location arthritis rheumatology pllc rheumatology specialist anju varghese board certify internal medicine rheumatology practice limit rheumatology subtitle text home practice location patient resource rockland county ny medical park dr lower level pomona ny phone fax route exit palisad parkway westchester county ny north broadway nd floor yonker ny phone fax st john riverside hospital exit sawmill river parkway accept patient participate health insurance health plan content copyright host exchange 3 | -1, siemen water remediation water scarce resource siemen help preserve, siemen usa, , siemen usa skip content siemen skip site identifier siemen usa close site id layer skip language selection skip generic navigation contact skip search search industry energy healthcare business product industry solution motor drive build technology industry automation financial solution solution service lighte osram sylvania product lifecycle management mobility water technology power generation power transmission power distribution automation control protection electrical compression expansion ventilation mechanical drive service financial solution solution service diagnostic image therapy hear aid product laboratory diagnostics build technology financial solution solution service consumer product corporate research government solution information communication siemen financial solution solution service siemen usa investor relation press job career business siemen global website answer america renewable energy smart grid technology medical image electronic healthcare record green build commuter rail system employee siemen unit commit answer america toughest question close productfinder layer close logo layer siemen corporation corporate information privacy policy term digital id 4 | -1, symptom muscle weakness genetic disease symptom include search learn, , , page found 5 | 1, animal animal wild sa official tourism website, , , page found 6 | -1, dr enrico fazzini parkinson disease specialist nyu movement disorder neurologist www theparkinsonsdoctor com, , , page found 7 | -1, ulcerative colitis uc quiz ulcerative colitis, colitis treatment endless path, colitis treatment endless path, -------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/excel.csv: -------------------------------------------------------------------------------- 1 | 1,2,"a 2 | b",2.0 3 | "c,d,e",f,3.0,'hi' 4 | """hi""","h""i",h'i,bye 5 | -------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/manual.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 5,6,7,8 9,10,11,12 13,14,15,16 17,18,19,20 21,22,23,24 25,26,27,28 29,30,31,32 
-------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/newline.csv: -------------------------------------------------------------------------------- 1 | 1,2,3,4 2 | 2,3,4,5 3 | 3,4,5,6 4 | 4,5,6,7 5 | 5,6,7,8 6 | 6,7,8,9 7 | 7,8,9,10 8 | 8,9,10,11 9 | 9,10,11,12 10 | 10,11,12,13 11 | -------------------------------------------------------------------------------- /test/data/csv/mock_datasplitter_output/oneline.csv: -------------------------------------------------------------------------------- 1 | col0,col1,col2,class -------------------------------------------------------------------------------- /test/data/csv/regression_na_labels.csv: -------------------------------------------------------------------------------- 1 | 1, 2, 3, 1.1 2 | 4, string, 5, 2.2 3 | 6, 7, 8, string 4 | 9, 10, 11, inf 5 | 12, 13, 14, 3.3 -------------------------------------------------------------------------------- /test/test_automl_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | from scipy.sparse import csr_matrix 17 | 18 | from sklearn.decomposition import PCA 19 | from sklearn.impute import SimpleImputer 20 | from sklearn.pipeline import Pipeline 21 | from sklearn.preprocessing import LabelEncoder 22 | from sklearn.preprocessing import FunctionTransformer 23 | from sagemaker_sklearn_extension.externals import AutoMLTransformer 24 | from sagemaker_sklearn_extension.externals import Header 25 | from sagemaker_sklearn_extension.externals import read_csv_data 26 | from sagemaker_sklearn_extension.preprocessing import NALabelEncoder 27 | from sagemaker_sklearn_extension.impute import RobustImputer 28 | 29 | 30 | def to_csr(X): 31 | return csr_matrix(X.shape, dtype=np.int8) 32 | 33 | 34 | impute_pca_pipeline = Pipeline(steps=[("impute", SimpleImputer()), ("pca", PCA(n_components=2))]) 35 | 36 | 37 | @pytest.mark.parametrize( 38 | "feature_transformer, target_transformer, " "expected_X_transformed_shape, expected_Xy_transformed_shape", 39 | [ 40 | (impute_pca_pipeline, LabelEncoder(), (10, 2), (10, 3)), 41 | (impute_pca_pipeline, NALabelEncoder(), (10, 2), (9, 3)), 42 | (FunctionTransformer(to_csr, validate=False), None, (10, 3), (9, 4)), 43 | ], 44 | ) 45 | def test_automl_transformer( 46 | feature_transformer, target_transformer, expected_X_transformed_shape, expected_Xy_transformed_shape 47 | ): 48 | X = np.arange(0, 3 * 10).reshape((10, 3)).astype(np.str) 49 | y = np.array([0] * 5 + [1] * 4 + [np.nan]).astype(np.str) 50 | 51 | header = Header(column_names=["x1", "x2", "x3", "class"], target_column_name="class") 52 | automl_transformer = AutoMLTransformer( 53 | header=header, feature_transformer=feature_transformer, target_transformer=target_transformer, 54 | ) 55 | 56 | model = automl_transformer.fit(X, y) 57 | 58 | X_transformed = 
model.transform(X) 59 | assert X_transformed.shape == expected_X_transformed_shape 60 | 61 | Xy = np.column_stack([X, y]) 62 | 63 | Xy_transformed = model.transform(Xy) 64 | assert Xy_transformed.shape == expected_Xy_transformed_shape 65 | 66 | with pytest.raises(ValueError): 67 | model.transform(X[:, 2:]) 68 | 69 | 70 | def test_automl_transformer_regression(): 71 | """Tests that rows in a regression dataset where the target column is not a finite numeric are imputed""" 72 | data = read_csv_data(source="test/data/csv/regression_na_labels.csv") 73 | X = data[:, :3] 74 | y = data[:, 3] 75 | header = Header(column_names=["x1", "x2", "x3", "class"], target_column_name="class") 76 | automl_transformer = AutoMLTransformer( 77 | header=header, 78 | feature_transformer=RobustImputer(strategy="constant", fill_values=0), 79 | target_transformer=NALabelEncoder(), 80 | ) 81 | model = automl_transformer.fit(X, y) 82 | X_transformed = model.transform(X) 83 | assert X_transformed.shape == X.shape 84 | 85 | Xy = np.concatenate((X, y.reshape(-1, 1)), axis=1) 86 | 87 | Xy_transformed = model.transform(Xy) 88 | assert Xy_transformed.shape == (3, 4) 89 | assert np.array_equal( 90 | Xy_transformed, np.array([[1.1, 1.0, 2.0, 3.0], [2.2, 4.0, 0.0, 5.0], [3.3, 12.0, 13.0, 14.0]]) 91 | ) 92 | -------------------------------------------------------------------------------- /test/test_common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | """ 15 | General tests for all estimators in sagemaker-sklearn-extension. 
16 | """ 17 | import pytest 18 | 19 | from sklearn.utils.estimator_checks import check_estimator 20 | 21 | from sagemaker_sklearn_extension.feature_extraction.text import MultiColumnTfidfVectorizer 22 | from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer 23 | from sagemaker_sklearn_extension.feature_extraction.sequences import TSFeatureExtractor 24 | from sagemaker_sklearn_extension.feature_extraction.sequences import TSFlattener 25 | from sagemaker_sklearn_extension.feature_extraction.sequences import TSFreshFeatureExtractor 26 | from sagemaker_sklearn_extension.impute import RobustImputer 27 | from sagemaker_sklearn_extension.impute import RobustMissingIndicator 28 | from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer 29 | from sagemaker_sklearn_extension.preprocessing import NALabelEncoder 30 | from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures 31 | from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer 32 | from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer 33 | from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder 34 | from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler 35 | from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder 36 | from sagemaker_sklearn_extension.preprocessing import WOEEncoder 37 | 38 | 39 | @pytest.mark.parametrize( 40 | "Estimator", 41 | [ 42 | DateTimeVectorizer(), 43 | LogExtremeValuesTransformer(), 44 | MultiColumnTfidfVectorizer(), 45 | NALabelEncoder(), 46 | QuadraticFeatures(), 47 | QuantileExtremeValuesTransformer(), 48 | RobustImputer(), 49 | RemoveConstantColumnsTransformer(), 50 | RobustLabelEncoder(), 51 | RobustMissingIndicator(), 52 | RobustStandardScaler(), 53 | ThresholdOneHotEncoder(), 54 | WOEEncoder(), 55 | TSFeatureExtractor(), 56 | TSFlattener(), 57 | TSFreshFeatureExtractor(), 58 | ], 59 | ) 60 | def test_all_estimators(Estimator): 61 | return check_estimator(Estimator) 62 | -------------------------------------------------------------------------------- /test/test_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | import numpy as np 15 | import pytest 16 | from scipy.sparse import csr_matrix, issparse 17 | 18 | from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures, RobustStandardScaler 19 | 20 | 21 | def _n_choose_2(n): 22 | """Calculates the number of 2-combinations of n elements.""" 23 | return (n * (n - 1)) // 2 24 | 25 | 26 | X = np.array([[1.0, 5.0], [2.0, 3.0], [1.0, 1.0],]) 27 | X_sparse = csr_matrix(X) 28 | X_standardized = (X - np.mean(X, axis=0)) / np.std(X, axis=0) 29 | 30 | X_small = np.arange(6).reshape((2, 3)) 31 | X_small_n_rows, X_small_n_cols = X_small.shape 32 | X_small_n_col_combinations = _n_choose_2(X_small_n_cols) 33 | 34 | X_low_nnz = np.array( 35 | [[1.0, 5.0, 0], [2.0, 0.0, 0], [2.0, 1.0, 0], [1.0, 0.0, 1], [2.0, 3.0, 0], [3.0, 0.0, 3], [4.0, 5.0, 0],] 36 | ) 37 | low_nnz_mask = np.where((np.count_nonzero(X_low_nnz, axis=0) / X_low_nnz.shape[0]) > 0.3, 1, 0) 38 | X_low_nnz_standardized = (X_low_nnz - np.mean(X_low_nnz, axis=0) * low_nnz_mask) / np.std(X_low_nnz, axis=0) 39 | 40 | 41 | def test_quadratic_features_explicit(): 42 | """Explicitly test the return value for a small float-filled input matrix.""" 43 | X_observed = QuadraticFeatures().fit_transform(X_standardized) 44 | X_expected = np.hstack( 45 | [ 46 | X_standardized, 47 | (X_standardized[:, 0] * X_standardized[:, 0]).reshape((-1, 1)), 48 | (X_standardized[:, 1] * X_standardized[:, 1]).reshape((-1, 1)), 49 | (X_standardized[:, 0] * X_standardized[:, 1]).reshape((-1, 1)), 50 | ] 51 | ) 52 | np.testing.assert_array_equal(X_observed, X_expected) 53 | 54 | 55 | def test_quadratic_features_max_n_features(): 56 | """Test that small but valid ``max_n_features`` produces a non-complete set of combinations.""" 57 | transformer = QuadraticFeatures(max_n_features=5) 58 | transformer.fit(X_small) 59 | assert len(transformer.combinations_) == 5 - X_small_n_cols 60 | 61 | 62 | @pytest.mark.parametrize( 63 | ["include_bias", "max_n_features"], 64 | [ 65 | # Exactly at limit of what's allowed. 66 | (False, X_small_n_col_combinations), 67 | (True, X_small_n_col_combinations + 1), 68 | # Smaller than limit of what's allowed. 
69 | (False, X_small_n_col_combinations - 1), 70 | (True, X_small_n_col_combinations - 1), 71 | ], 72 | ) 73 | def test_quadratic_features_max_n_features_too_small(include_bias, max_n_features): 74 | """Test that when the ``max_n_features`` parameter is too small, an exception is raised.""" 75 | transformer = QuadraticFeatures(include_bias=include_bias, max_n_features=max_n_features,) 76 | with pytest.raises(ValueError): 77 | transformer.fit(X_small) 78 | 79 | 80 | def test_quadratic_features_random_state_invariance(): 81 | """Test that the exact same input is produced when using the same random seed.""" 82 | transformer1 = QuadraticFeatures(random_state=0) 83 | transformer2 = QuadraticFeatures(random_state=0) 84 | X1 = transformer1.fit_transform(X_small) 85 | X2 = transformer2.fit_transform(X_small) 86 | assert np.all(X1 == X2) 87 | 88 | 89 | @pytest.mark.parametrize( 90 | ["include_bias", "interaction_only", "n_output_features"], 91 | [ 92 | (False, False, X_small_n_cols + 2 * X_small_n_col_combinations), 93 | (True, False, X_small_n_cols + 2 * X_small_n_col_combinations + 1), 94 | (False, True, X_small_n_cols + X_small_n_col_combinations), 95 | (True, True, X_small_n_cols + X_small_n_col_combinations + 1), 96 | ], 97 | ) 98 | def test_quadratic_features_shape(include_bias, interaction_only, n_output_features): 99 | """Test that various parameter values produce expected resulting data shapes.""" 100 | transformer = QuadraticFeatures(include_bias=include_bias, interaction_only=interaction_only,) 101 | XQ = transformer.fit_transform(X_small) 102 | assert XQ.shape == (X_small_n_rows, n_output_features) 103 | 104 | 105 | def test_quadratic_features_single_column_input_explicit(): 106 | """Test that using a single-column matrix as input produces the expected output.""" 107 | X_observed = QuadraticFeatures().fit_transform(X_standardized[:, 0].reshape((-1, 1))) 108 | X_expected = np.hstack([X_standardized[:, [0]], (X_standardized[:, 0] * X_standardized[:, 0]).reshape((-1, 1)),]) 109 | np.testing.assert_array_equal(X_observed, X_expected) 110 | 111 | 112 | def test_robust_standard_scaler_dense(): 113 | scaler = RobustStandardScaler() 114 | X_observed = scaler.fit_transform(X) 115 | 116 | np.testing.assert_array_equal(X_observed, X_standardized) 117 | 118 | 119 | def test_robust_standard_scaler_sparse(): 120 | scaler = RobustStandardScaler() 121 | X_observed = scaler.fit_transform(X_sparse) 122 | 123 | assert issparse(X_observed) 124 | np.testing.assert_array_almost_equal(X_observed.toarray(), X / np.std(X, axis=0)) 125 | 126 | 127 | def test_robust_standard_dense_with_low_nnz_columns(): 128 | scaler = RobustStandardScaler() 129 | X_observed = scaler.fit_transform(X_low_nnz) 130 | np.testing.assert_array_almost_equal(X_observed, X_low_nnz_standardized) 131 | -------------------------------------------------------------------------------- /test/test_date_time.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. 
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | from datetime import datetime 15 | import numpy as np 16 | import pytest 17 | 18 | from dateutil import parser 19 | 20 | from sagemaker_sklearn_extension.feature_extraction.date_time import DateTimeVectorizer, DateTimeDefinition 21 | 22 | 23 | data_array = [ 24 | [parser.parse("Jan 5th, 2012, 12:34am")], 25 | [parser.parse("Feb 2, 2011, 2:34:04am")], 26 | [parser.parse("Jan 1st, 2012, 11:59:59pm")], 27 | [parser.parse("Dec 2th, 2012, 12:00am")], 28 | [parser.parse("Jan 3th, 2012, 12:34am")], 29 | [parser.parse("Jan 3th, 2018, 1:34am")], 30 | ] 31 | 32 | data = np.array(data_array) 33 | 34 | 35 | @pytest.mark.parametrize("data_shape", [(2, 3), (2, 3, 4), (2,)]) 36 | def test_cyclic_transform_outputs_correct_shape(data_shape): 37 | size = int(np.prod(data_shape)) 38 | data = np.arange(size).reshape(data_shape) 39 | ret = DateTimeVectorizer._cyclic_transform(data, low=0, high=size - 1) 40 | 41 | new_shape = list(data_shape) 42 | new_shape[-1] *= 2 43 | new_shape = tuple(new_shape) 44 | assert ret.shape == new_shape 45 | 46 | ret = ret.reshape((-1, 2)) 47 | ret = ret ** 2 48 | assert np.linalg.norm(np.sum(ret, axis=1) - 1) < 1e-8 49 | 50 | 51 | @pytest.mark.parametrize("mode", ["ordinal", "cyclic"]) 52 | def test_fit_transform_works_with_non_np_input(mode): 53 | dtv = DateTimeVectorizer( 54 | mode=mode, 55 | extract=[ 56 | DateTimeDefinition.HOUR.value, 57 | DateTimeDefinition.SECOND.value, 58 | DateTimeDefinition.YEAR.value, 59 | DateTimeDefinition.MONTH.value, 60 | ], 61 | ) 62 | output = dtv.fit_transform(data_array) 63 | assert output.shape[0] == len(data_array) 64 | assert output.shape[1] > 1 65 | 66 | 67 | @pytest.mark.parametrize("data_shape", [(2, 3), (2, 3, 4), (2,)]) 68 | def test_cyclic_transform_outputs_correct_cyclic_values(data_shape): 69 | size = int(np.prod(data_shape)) 70 | data = np.arange(size).reshape(data_shape) 71 | ret = DateTimeVectorizer._cyclic_transform(data, low=0, high=size - 1) 72 | ret = ret.reshape((-1, 2)) 73 | ret = ret ** 2 74 | assert np.linalg.norm(np.sum(ret, axis=1) - 1) < 1e-8 75 | 76 | 77 | def test_fit_eliminates_constant_columns(): 78 | dtv = DateTimeVectorizer( 79 | mode="ordinal", 80 | extract=[ 81 | DateTimeDefinition.HOUR.value, 82 | DateTimeDefinition.SECOND.value, 83 | DateTimeDefinition.YEAR.value, 84 | DateTimeDefinition.MONTH.value, 85 | ], 86 | ) 87 | # taking only odd items. Year and month are always the same. 88 | cur_data = data.reshape((-1, 2))[:, 0].reshape((-1, 1)) 89 | dtv = dtv.fit(cur_data) 90 | # Year and month are constants, make sure they are out 91 | assert dtv.extract_ == [DateTimeDefinition.HOUR.value, DateTimeDefinition.SECOND.value] 92 | 93 | 94 | @pytest.mark.parametrize("mode", ["ordinal", "cyclic"]) 95 | def test_fit_eliminates_constant_columns_multicol_input(mode): 96 | # set up data. 
Properties: 97 | # Hour: Constant throughout - eliminate 98 | # Year: Constant in both, but has a different value across columns - should eliminate 99 | # Month: Constant in column 2, not in 1 - should not eliminate 100 | # Day of month: not constant in either column - should not eliminate 101 | col1 = [ 102 | parser.parse("Jan 5th, 2012"), 103 | parser.parse("Feb 2, 2012"), 104 | parser.parse("Jan 1st, 2012"), 105 | ] 106 | col2 = [ 107 | parser.parse("Dec 2th, 2013"), 108 | parser.parse("Dec 3th, 2013"), 109 | parser.parse("Dec 3th, 2013"), 110 | ] 111 | 112 | cur_data = np.array([col1, col2]).T 113 | 114 | dtv = DateTimeVectorizer( 115 | mode=mode, 116 | extract=[ 117 | DateTimeDefinition.HOUR.value, 118 | DateTimeDefinition.DAY_OF_MONTH.value, 119 | DateTimeDefinition.YEAR.value, 120 | DateTimeDefinition.MONTH.value, 121 | ], 122 | ) 123 | # fit on the full two-column data 124 | dtv = dtv.fit(cur_data) 125 | # Hour and year are constant, make sure they are out 126 | assert dtv.extract_ == [DateTimeDefinition.DAY_OF_MONTH.value, DateTimeDefinition.MONTH.value] 127 | 128 | 129 | def test_transform_categorical(): 130 | extract_keys = [k for k in dir(DateTimeDefinition) if not k.startswith("_")] 131 | extract = [DateTimeDefinition.__dict__[k].value for k in extract_keys] 132 | dtv = DateTimeVectorizer(mode="ordinal", extract=extract, ignore_constant_columns=False) 133 | dtv.fit(data) 134 | output = dtv.transform(data) 135 | 136 | assert np.all(output >= 0) 137 | 138 | loc_year = extract_keys.index("YEAR") 139 | np.testing.assert_array_equal(output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])) 140 | 141 | loc_month = extract_keys.index("MONTH") 142 | np.testing.assert_array_equal(output[:, loc_month], np.array([0, 1, 0, 11, 0, 0])) 143 | 144 | 145 | def test_transform_cyclic_leaves_year(): 146 | extract_keys = [k for k in dir(DateTimeDefinition) if not k.startswith("_")] 147 | extract = [DateTimeDefinition.__dict__[k].value for k in extract_keys] 148 | 149 | dtv = DateTimeVectorizer(mode="cyclic", extract=extract, ignore_constant_columns=False) 150 | dtv.fit(data) 151 | output = dtv.transform(data) 152 | 153 | loc_year = extract_keys.index("YEAR") 154 | loc_year *= 2 155 | np.testing.assert_array_equal(output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])) 156 | 157 | assert output.shape[1] == len(extract) * 2 - 1 158 | 159 | 160 | def test_fit_transform_cyclic_leaves_year(): 161 | extract_keys = [k for k in dir(DateTimeDefinition) if not k.startswith("_")] 162 | extract = [DateTimeDefinition.__dict__[k].value for k in extract_keys] 163 | 164 | dtv = DateTimeVectorizer(mode="cyclic", extract=extract, ignore_constant_columns=False) 165 | output = dtv.fit_transform(data) 166 | 167 | loc_year = extract_keys.index("YEAR") 168 | loc_year *= 2 169 | np.testing.assert_array_equal(output[:, loc_year], np.array([2012, 2011, 2012, 2012, 2012, 2018])) 170 | 171 | assert output.shape[1] == len(dtv.extract_) * 2 - 1 172 | 173 | 174 | def test_fit_transform_accepts_mixed_str_datetime(): 175 | cur_data_array = data_array + [["Feb 12th, 15:33, 2011"], ["Nov 5th, 1am, 1975"], [432], [None], ["Feb 45th, 2018"]] 176 | 177 | dtv = DateTimeVectorizer(mode="ordinal") 178 | processed = dtv.fit_transform(cur_data_array) 179 | year_location = dtv.extract_.index(DateTimeDefinition.YEAR.value) 180 | assert processed[0, year_location] == 2012 181 | assert processed[-4, year_location] == 1975 182 | assert np.isnan(processed[-3, year_location]) 183 | assert 
np.isnan(processed[-2, year_location]) 184 | assert np.isnan(processed[-1, year_location]) 185 | 186 | dtv = DateTimeVectorizer(mode="cyclic") 187 | processed = dtv.fit_transform(cur_data_array) 188 | assert all(np.isnan(processed[-1])) 189 | assert not any(np.isnan(processed[-4])) 190 | assert not any(np.isnan(processed[0])) 191 | 192 | 193 | def test_fit_transform_default_datetime(): 194 | cur_data_array = [["Monday"], ["Tuesday"], ["Friday"]] 195 | 196 | dtv = DateTimeVectorizer(mode="ordinal", ignore_constant_columns=False, default_datetime=datetime(1900, 1, 1)) 197 | processed = dtv.fit_transform(cur_data_array) 198 | year_location = dtv.extract_.index(DateTimeDefinition.YEAR.value) 199 | month_location = dtv.extract_.index(DateTimeDefinition.MONTH.value) 200 | weekday_location = dtv.extract_.index(DateTimeDefinition.WEEKDAY.value) 201 | 202 | assert processed[0, year_location] == 1900 203 | assert processed[0, month_location] == 0 204 | assert processed[0, weekday_location] == 0 205 | 206 | assert processed[1, year_location] == 1900 207 | assert processed[1, month_location] == 0 208 | assert processed[1, weekday_location] == 1 209 | 210 | assert processed[2, year_location] == 1900 211 | assert processed[2, month_location] == 0 212 | assert processed[2, weekday_location] == 4 213 | -------------------------------------------------------------------------------- /test/test_feature_extraction_text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | import numpy as np 15 | import pytest 16 | import scipy.sparse as sp 17 | 18 | from sagemaker_sklearn_extension.feature_extraction.text import MultiColumnTfidfVectorizer 19 | 20 | from sklearn.feature_extraction.text import TfidfVectorizer 21 | 22 | 23 | corpus = np.array( 24 | [ 25 | ["Cats eat rats.", "Rats are mammals."], 26 | ["Dogs chase cats.", "Cats have ears."], 27 | ["People like dogs.", "People are mammals."], 28 | ["People hate rats.", "Rats are quite smart."], 29 | ] 30 | ) 31 | 32 | 33 | def test_multi_column_tfidf_vectorizer(): 34 | vec = MultiColumnTfidfVectorizer() 35 | output = vec.fit_transform(corpus) 36 | 37 | assert isinstance(output, sp.coo.coo_matrix) 38 | 39 | observed = output.todense() 40 | expected = np.hstack( 41 | [ 42 | TfidfVectorizer().fit_transform(corpus[:, 0]).todense(), 43 | TfidfVectorizer().fit_transform(corpus[:, 1]).todense(), 44 | ] 45 | ) 46 | 47 | np.testing.assert_array_equal(observed, expected) 48 | 49 | 50 | def test_multi_column_tfidf_vectorizer_fit_dim_error(): 51 | with pytest.raises(ValueError): 52 | vec = MultiColumnTfidfVectorizer() 53 | vec.fit(corpus[0]) 54 | 55 | 56 | def test_multi_column_tfidf_vectorizer_transform_dim_error(): 57 | with pytest.raises(ValueError): 58 | vec = MultiColumnTfidfVectorizer() 59 | vec.fit(corpus) 60 | vec.transform(corpus[0]) 61 | 62 | 63 | def test_multi_column_tfidf_vectorizer_vocabulary_sizes_large(): 64 | vocabulary_sizes = [TfidfVectorizer().fit_transform(corpus[:, i]).shape[1] + 1 for i in range(corpus.shape[1])] 65 | vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=vocabulary_sizes) 66 | observed = vectorizer.fit_transform(corpus) 67 | assert observed.shape[1] == sum(vocabulary_sizes) 68 | assert sp.issparse(observed) 69 | 70 | 71 | def test_multi_column_tfidf_vectorizer_vocabulary_sizes_small(): 72 | vocabulary_sizes = [TfidfVectorizer().fit_transform(corpus[:, i]).shape[1] - 1 for i in range(corpus.shape[1])] 73 | vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=vocabulary_sizes) 74 | observed = vectorizer.fit_transform(corpus) 75 | assert observed.shape[1] == sum(vocabulary_sizes) 76 | assert sp.issparse(observed) 77 | 78 | 79 | def test_multi_column_tfidf_vectorizer_vocabulary_sizes_error(): 80 | with pytest.raises(ValueError): 81 | vectorizer = MultiColumnTfidfVectorizer(vocabulary_sizes=[1]) 82 | vectorizer.fit(corpus) 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "kwargs, data, shape", 87 | [ 88 | ({"min_df": 0.9}, corpus, (4, 0)), 89 | ({"max_df": 0.1}, corpus, (4, 0)), 90 | ({"max_df": 0.9941}, np.array([[""], [""], [""]]), (3, 0)), 91 | ], 92 | ) 93 | def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_on(kwargs, data, shape): 94 | """Tests for empty matrix when no terms remain after pruning""" 95 | vec = MultiColumnTfidfVectorizer(**kwargs) 96 | output = vec.fit_transform(data) 97 | assert output.shape == shape 98 | 99 | 100 | @pytest.mark.parametrize( 101 | "kwargs, data", 102 | [ 103 | ({"min_df": 0.9, "ignore_columns_with_zero_vocabulary_size": False}, corpus), 104 | ({"max_df": 0.1, "ignore_columns_with_zero_vocabulary_size": False}, corpus), 105 | ({"max_df": 0.9941, "ignore_columns_with_zero_vocabulary_size": False}, np.array([[""], [""], [""]])), 106 | ], 107 | ) 108 | def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_off(kwargs, data): 109 | """Tests for ValueError when no terms remain after pruning and `ignore_columns_with_zero_vocabulary_size=False`""" 110 | with pytest.raises(ValueError): 111 | vec = 
MultiColumnTfidfVectorizer(**kwargs) 112 | vec.fit_transform(data) 113 | 114 | 115 | @pytest.mark.parametrize("kwargs, output_shape", [({"min_df": 0.9}, (4, 3)), ({"max_df": 0.9}, (4, 8))]) 116 | def test_multi_column_tfidf_vectorizer_one_column_zero_output_tokens(kwargs, output_shape): 117 | """Tests that a TF-IDF document-term matrix is still returned when only one column breaks""" 118 | corpus = np.array( 119 | [ 120 | ["Cats eat rats.", "Rats are mammals."], 121 | ["Dogs chase cats.", "Rats are mammals."], 122 | ["People like dogs.", "Rats are mammals."], 123 | ["People hate rats.", "Rats are mammals."], 124 | ] 125 | ) 126 | 127 | vec = MultiColumnTfidfVectorizer(**kwargs) 128 | output = vec.fit_transform(corpus) 129 | assert output.shape == output_shape 130 | -------------------------------------------------------------------------------- /test/test_header.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import pytest 15 | 16 | from sagemaker_sklearn_extension.externals import Header 17 | 18 | 19 | @pytest.mark.parametrize("names, col_idx, feature_idx", [(["a"], [0], [0]), (["a", "c"], [0, 2], [0, 1])]) 20 | def test_header_happy(names, col_idx, feature_idx): 21 | h = Header(column_names=["a", "b", "c"], target_column_name="b") 22 | assert h.target_column_index == 1 23 | assert h.as_feature_indices(names) == feature_idx 24 | assert h.as_column_indices(names) == col_idx 25 | assert h.num_features == 2 26 | assert h.num_columns == 3 27 | 28 | 29 | def test_header_errors_target_missing(): 30 | with pytest.raises(ValueError): 31 | Header(column_names=["a", "b"], target_column_name="c") 32 | 33 | 34 | @pytest.mark.parametrize("column_names, target_column", [(["a", "b", "b", "c"], "c"), (["a", "b", "c", "c"], "c")]) 35 | def test_header_errors_duplicate_columns(column_names, target_column): 36 | with pytest.raises(ValueError): 37 | Header(column_names=column_names, target_column_name=target_column) 38 | 39 | 40 | @pytest.mark.parametrize( 41 | "names, error_regex", 42 | [(["unknown"], "'unknown' is an unknown feature name"), (["b"], "'b' is the target column name.")], 43 | ) 44 | def test_header_error_as_feature_indices(names, error_regex): 45 | h = Header(column_names=["a", "b", "c"], target_column_name="b") 46 | assert h.target_column_index == 1 47 | with pytest.raises(ValueError) as err: 48 | h.as_feature_indices(names) 49 | err.match(error_regex) 50 | 51 | 52 | def test_header_error_as_column_index(): 53 | h = Header(column_names=["a", "b", "c"], target_column_name="b") 54 | assert h.target_column_index == 1 55 | with pytest.raises(ValueError): 56 | h.as_column_indices(["unknown"]) 57 | 58 | 59 | def test_header_feature_column_index_order(): 60 | h = Header(column_names=["a", "b", "c", "d"], target_column_name="c") 61 | assert h.feature_column_indices == [0, 1, 3] 62 | 
-------------------------------------------------------------------------------- /test/test_impute.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | 17 | from sklearn.utils.testing import assert_array_equal 18 | 19 | from sagemaker_sklearn_extension.impute import RobustImputer, RobustMissingIndicator, is_finite_numeric 20 | 21 | X_impute = np.array([[np.nan, 2, np.inf], [4, np.inf, 6], [10, np.nan, 10]]) 22 | X_impute_boolean_mask = np.array([[True, False, True], [False, True, False], [False, True, False]]) 23 | X_impute_string = X_impute.astype("O") 24 | X_impute_mixed = np.array([["2", "a"], ["inf", "nan"], ["-1e2", "10.0"], ["0.0", "foobar"], ["-inf", "8"]]) 25 | X_impute_mixed_boolean_mask = np.array([[False, True], [True, True], [False, False], [False, True], [True, False]]) 26 | X_impute_categorical = np.array([["hot dog"], ["hot dog"], ["hot dog"], ["banana"]]) 27 | X_imputed_median = np.array([[7.0, 2.0, 8.0], [4.0, 2.0, 6.0], [10.0, 2.0, 10.0]]) 28 | X_imputed_constant = np.array([[1.0, 2.0, 13.0], [4.0, 7.0, 6.0], [10.0, 7.0, 10.0]]) 29 | X_imputed_mixed = np.array([[2.0, 9.0], [0.0, 9.0], [-1e2, 10.0], [0.0, 9.0], [0.0, 8.0]]) 30 | X_imputed_categorical = np.array([["hot dog"], ["hot dog"], ["hot dog"], ["not hot dog"]]) 31 | 32 | transform_error_msg = "'transform' input X has 4 features per sample, expected 3 from 'fit' input" 33 | fill_values_error_msg = "'fill_values' should have length equal to number of features in X 3, got 5" 34 | 35 | 36 | @pytest.mark.parametrize( 37 | "val, expected", [(np.array([1738, "10", np.inf, np.nan, "foobar"]), np.array([True, True, False, False, False]))] 38 | ) 39 | def test_is_finite_numeric(val, expected): 40 | observed = is_finite_numeric(val) 41 | assert_array_equal(observed, expected) 42 | 43 | 44 | @pytest.mark.parametrize( 45 | "X, X_expected, strategy, fill_values", 46 | [ 47 | (X_impute_mixed, X_imputed_mixed, "median", None), 48 | (X_impute, X_imputed_median, "median", None), 49 | (X_impute_string, X_imputed_median, "median", None), 50 | (X_impute, X_imputed_constant, "constant", [1.0, 7.0, 13.0]), 51 | (X_impute_string, X_imputed_constant, "constant", [1.0, 7.0, 13.0]), 52 | ], 53 | ) 54 | def test_robust_imputer(X, X_expected, strategy, fill_values): 55 | robust_imputer = RobustImputer(strategy=strategy, fill_values=fill_values) 56 | robust_imputer.fit(X) 57 | X_observed = robust_imputer.transform(X) 58 | 59 | assert_array_equal(X_observed, X_expected) 60 | 61 | 62 | def test_robust_imputer_categorical_custom_function(): 63 | robust_imputer = RobustImputer( 64 | dtype=np.dtype("O"), strategy="constant", fill_values="not hot dog", mask_function=lambda x: x == "hot dog" 65 | ) 66 | robust_imputer.fit(X_impute_categorical) 67 | X_observed = robust_imputer.transform(X_impute_categorical) 68 | 69 | assert_array_equal(X_observed, 
X_imputed_categorical) 70 | 71 | 72 | def test_robust_imputer_transform_dim_error(): 73 | with pytest.raises(ValueError, match=transform_error_msg): 74 | robust_imputer = RobustImputer() 75 | robust_imputer.fit(X_impute) 76 | robust_imputer.transform(np.zeros((3, 4))) 77 | 78 | 79 | def test_robust_imputer_fill_values_dim_error(): 80 | with pytest.raises(ValueError, match=fill_values_error_msg): 81 | robust_imputer = RobustImputer(strategy="constant", fill_values=np.zeros(5)) 82 | robust_imputer.fit(X_impute) 83 | 84 | 85 | @pytest.mark.parametrize( 86 | "X, boolean_mask_X", [(X_impute_mixed, X_impute_mixed_boolean_mask), (X_impute, X_impute_boolean_mask)] 87 | ) 88 | def test_robust_missing_indicator(X, boolean_mask_X): 89 | robust_indicator = RobustMissingIndicator() 90 | robust_indicator.fit(X) 91 | boolean_mask_X_observed = robust_indicator.transform(X) 92 | 93 | assert_array_equal(boolean_mask_X_observed, boolean_mask_X) 94 | -------------------------------------------------------------------------------- /test/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | 17 | from sagemaker_sklearn_extension.preprocessing import ( 18 | LogExtremeValuesTransformer, 19 | QuantileExtremeValuesTransformer, 20 | RemoveConstantColumnsTransformer, 21 | log_transform, 22 | quantile_transform_nonrandom, 23 | ) 24 | 25 | np.random.seed(0) 26 | 27 | X_zeros = np.zeros((10, 10)) 28 | X_extreme_vals = np.array( 29 | [ 30 | [0.0, 0.0, 0.0], 31 | [-1.0, 1.0, 1.0], 32 | [-2.0, 2.0, 2.0], 33 | [-3.0, 3.0, 3.0], 34 | [-4.0, 4.0, 4.0], 35 | [-5.0, 5.0, 5.0], 36 | [-6.0, 6.0, 6.0], 37 | [-7.0, 7.0, 7.0], 38 | [-8.0, 8.0, 8.0], 39 | [-9.0, 9.0, 9.0], 40 | [-10.0, 10.0, 10.0], 41 | [-1e5, 1e6, 11.0], 42 | ] 43 | ) 44 | X_log_extreme_vals = np.column_stack( 45 | [log_transform(X_extreme_vals.copy()[:, 0]), log_transform(X_extreme_vals.copy()[:, 1]), X_extreme_vals[:, 2]] 46 | ) 47 | X_quantile_extreme_vals = np.column_stack( 48 | [ 49 | quantile_transform_nonrandom(X_extreme_vals.copy()[:, 0]), 50 | quantile_transform_nonrandom(X_extreme_vals.copy()[:, 1]), 51 | X_extreme_vals[:, 2], 52 | ] 53 | ) 54 | X_all_positive = 5 * np.random.random((100, 1)) + 20 55 | X_extreme_all_positive = np.vstack([np.random.random((90, 1)) + 100, np.array(10 * [[5]], dtype=np.float64)]) 56 | X_log_extreme_all_positive = np.array([log_transform(X_extreme_all_positive.copy()[:, 0])]).reshape(-1, 1) 57 | X_all_uniques = np.arange(20).reshape(4, 5) 58 | X_one_val = np.column_stack([np.arange(20).reshape(4, 5), np.array([1, 1, 1, 1])]) 59 | X_nans = np.array([[np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]]) 60 | X_no_uniques = np.zeros((4, 5)) 61 | 62 | 63 | @pytest.mark.parametrize( 64 | "X, X_expected", 65 | [ 66 | (X_all_uniques, X_all_uniques), 67 | (X_one_val, X_one_val[:, :5]), 68 | (X_nans, 
np.empty((0, 3))), 69 | (X_no_uniques, np.empty((0, 5))), 70 | ], 71 | ) 72 | def test_remove_constant_columns_transformer(X, X_expected): 73 | transformer = RemoveConstantColumnsTransformer() 74 | X_observed = transformer.fit_transform(X) 75 | 76 | np.testing.assert_array_equal(X_observed, X_expected) 77 | 78 | 79 | @pytest.mark.parametrize( 80 | ["X", "X_expected"], 81 | [ 82 | (X_extreme_vals, X_log_extreme_vals,), 83 | (X_zeros, X_zeros), 84 | (X_all_positive, X_all_positive), 85 | (X_extreme_all_positive, X_log_extreme_all_positive), 86 | ], 87 | ) 88 | def test_log_extreme_value_transformer(X, X_expected): 89 | transformer = LogExtremeValuesTransformer(threshold_std=2.0) 90 | X_observed = transformer.fit_transform(X) 91 | 92 | np.testing.assert_array_almost_equal(X_observed, X_expected) 93 | 94 | 95 | def test_log_extreme_value_transformer_state(): 96 | t = LogExtremeValuesTransformer(threshold_std=2.0) 97 | X_observed = t.fit_transform(X_extreme_vals) 98 | 99 | np.testing.assert_array_almost_equal(t.nonnegative_cols_, [1, 2]) 100 | np.testing.assert_array_almost_equal(X_observed, X_log_extreme_vals) 101 | 102 | 103 | @pytest.mark.parametrize( 104 | ["X", "X_expected"], 105 | [(X_extreme_vals, X_quantile_extreme_vals), (X_zeros, X_zeros), (X_all_positive, X_all_positive),], 106 | ) 107 | def test_extreme_value_transformer(X, X_expected): 108 | transformer = QuantileExtremeValuesTransformer(threshold_std=2.0) 109 | X_observed = transformer.fit_transform(X) 110 | 111 | np.testing.assert_array_almost_equal(X_observed, X_expected) 112 | -------------------------------------------------------------------------------- /test/test_read_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | import psutil 15 | 16 | import csv 17 | from contextlib import contextmanager 18 | import json 19 | import numpy as np 20 | import os 21 | import pytest 22 | 23 | from mlio import list_files 24 | from mlio import InMemoryStore, SageMakerPipe 25 | from mlio import File as mlio_file 26 | from sagemaker_sklearn_extension.externals.read_data import _convert_megabytes_to_bytes 27 | from sagemaker_sklearn_extension.externals.read_data import _get_data 28 | from sagemaker_sklearn_extension.externals.read_data import _get_reader 29 | from sagemaker_sklearn_extension.externals.read_data import _get_size_total 30 | from sagemaker_sklearn_extension.externals.read_data import _read_to_fit_memory 31 | from sagemaker_sklearn_extension.externals.read_data import read_csv_data 32 | 33 | 34 | DATA_FILES = [ 35 | "test/data/csv/mock_datasplitter_output/manual.csv", 36 | "test/data/csv/mock_datasplitter_output/newline.csv", 37 | "test/data/csv/mock_datasplitter_output/excel.csv", 38 | "test/data/csv/mock_datasplitter_output/oneline.csv", 39 | "test/data/csv/missing_values.csv", 40 | "test/data/csv/dictionaries.csv", 41 | "test/data/csv/dirty.csv", 42 | ] 43 | DATA_FILES_SHAPE = [(8, 4), (10, 4), (3, 4), (1, 4), (7, 5), (147, 18), (19, 16)] 44 | LARGE_DATA_4MB = "test/data/csv/kc_house_data.csv" 45 | BUFFER_DATA = ( 46 | "1,2,3,4\n" 47 | + "5,6,7,8\n" 48 | + "9,10,11,12\n" 49 | + "13,14,15,16\n" 50 | + "17,18,19,20\n" 51 | + "21,22,23,24\n" 52 | + "25,26,27,28\n" 53 | + "29,30,31,32" 54 | ) 55 | 56 | 57 | @contextmanager 58 | def managed_env_var(cfg): 59 | os.environ.update({"SM_INPUT_DATA_CONFIG": json.dumps(cfg)}) 60 | try: 61 | yield os.environ 62 | finally: 63 | os.environ.pop("SM_INPUT_DATA_CONFIG") 64 | 65 | 66 | csv1 = [ 67 | ["1.0", 2.0, "3", 4, ""], 68 | ["a,b", "c\nd", "f", '"""', np.nan], 69 | ] 70 | csv2 = [ 71 | [10, "2\r\n4", "hello", 4.0, "!"], 72 | [" space", "", "space ", "\n", "hello\n"], 73 | ['{a: 5, b: "hello"}', "[a, b, 2]", "[]", "nan", " "], 74 | ] 75 | 76 | 77 | @pytest.fixture(scope="session") 78 | def csv_data_dir(tmpdir_factory): 79 | """Fixture which fills a temporary directory with (multiple) csv file(s).""" 80 | csv_data_directory = tmpdir_factory.mktemp("csv_file_paths") 81 | csv_file1 = csv_data_directory.join("file_1.csv") 82 | csv_file2 = csv_data_directory.join("file_2.csv") 83 | 84 | with open(csv_file1.strpath, "w") as csv_file_handle: 85 | csv_writer = csv.writer(csv_file_handle, dialect="excel") 86 | csv_writer.writerows(csv1) 87 | with open(csv_file2.strpath, "w") as csv_file_handle: 88 | csv_writer = csv.writer(csv_file_handle, dialect="excel") 89 | csv_writer.writerows(csv2) 90 | 91 | return str(csv_data_directory) 92 | 93 | 94 | def test_excel_dialect(csv_data_dir): 95 | """Test that read_csv_data function properly reads files in the excel dialect.""" 96 | generated_contents = read_csv_data(source=csv_data_dir + "/file_1.csv") 97 | 98 | assert generated_contents.shape == (len(csv1), len(csv1[0])) 99 | assert np.all(generated_contents == np.array([[str(v) for v in row] for row in csv1], dtype=np.str)) 100 | 101 | 102 | def test_directory_content(csv_data_dir): 103 | """Test that read_csv_data function reads content correctly from a directory""" 104 | generated_contents = read_csv_data(source=csv_data_dir) 105 | correct_array = csv1 + csv2 106 | assert generated_contents.shape == (len(correct_array), len(correct_array[0])) 107 | assert np.all(generated_contents == np.array([[str(v) for v in row] for row in correct_array], dtype=np.str)) 108 | 109 | 110 | def 
test_get_reader_pipe_mode(): 111 | """Test for getting a 'CsvReader' object with 'Pipe' mode""" 112 | with managed_env_var({"abc": {"TrainingInputMode": "Pipe"}}): 113 | reader = _get_data(source="abc") 114 | assert isinstance(reader[0], SageMakerPipe) 115 | 116 | 117 | def test_get_reader_file_mode(): 118 | """Test for getting a 'CsvReader' object with 'File' mode""" 119 | source = "test/data/csv/mock_datasplitter_output" 120 | with managed_env_var({os.path.basename(source): {"TrainingInputMode": "File"}}): 121 | reader = _get_data(source=source) 122 | assert isinstance(reader[0], mlio_file) 123 | 124 | 125 | def test_get_reader_mlio_file_object(): 126 | """Test for getting a 'CsvReader' with a mlio.File object source""" 127 | source = "test/data/csv/mock_datasplitter_output" 128 | files = list_files(source, pattern="*") 129 | reader = _get_data(source=files[0]) 130 | assert isinstance(reader[0], mlio_file) 131 | 132 | 133 | def test_get_reader_inmemory_mode(): 134 | """Test for getting a 'CsvReader' object with 'InMemory' mode""" 135 | buffer = BUFFER_DATA.encode() 136 | reader = _get_data(source=buffer) 137 | assert isinstance(reader[0], InMemoryStore) 138 | 139 | 140 | def test_read_csv_data_inmemory_mode(): 141 | """Test to make sure 'InMemory' mode reads in content correctly""" 142 | generated_contents = read_csv_data(source=BUFFER_DATA.encode()) 143 | correct_array = [] 144 | for i in range(8): 145 | correct_array.append([i * 4 + j for j in range(1, 5)]) 146 | assert generated_contents.shape == (len(correct_array), len(correct_array[0])) 147 | assert np.all(generated_contents == np.array([[str(v) for v in row] for row in correct_array], dtype=np.str)) 148 | 149 | 150 | def test_read_empty_buffer(): 151 | """Test for getting an empty array if the buffer is empty""" 152 | generated_contents = read_csv_data(source="".encode()) 153 | assert generated_contents.size == 0 154 | 155 | 156 | def test_get_reader_no_env_var(): 157 | """Test for getting a 'CsvReader' object with no environmental variable""" 158 | reader = _get_data(source="test/data/csv/mock_datasplitter_output") 159 | assert isinstance(reader[0], mlio_file) 160 | 161 | 162 | @pytest.mark.parametrize("cfg, expected_error", [({}, KeyError), ({"abc": {}}, KeyError),]) 163 | def test_get_reader_error_malformed_channel_cfg(cfg, expected_error): 164 | """Test for reading from an invalid channel""" 165 | with pytest.raises(expected_error): 166 | with managed_env_var(cfg): 167 | _get_reader(source="abc", batch_size=1000) 168 | 169 | 170 | def test_get_reader_incorrect_path(): 171 | """Test for reading from a path that doesn't exist""" 172 | with pytest.raises(FileNotFoundError): 173 | _get_reader(source="incorrect", batch_size=100) 174 | 175 | 176 | def test_read_csv_data_invalid_csv(): 177 | with pytest.raises(RuntimeError): 178 | read_csv_data(source="test/data/csv/invalid.csv") 179 | 180 | 181 | @pytest.mark.parametrize("data_file, shape", [(file, shape) for file, shape in zip(DATA_FILES, DATA_FILES_SHAPE)]) 182 | def test_read_csv_data(data_file, shape): 183 | """Test for reading individual csv data files""" 184 | array = read_csv_data(source=data_file, batch_size=1, fit_memory_percent=100.0, output_dtype="U") 185 | assert array.shape == shape 186 | assert array.dtype.kind in {"U", "S"} 187 | 188 | 189 | def test_read_csv_data_directory(): 190 | """Test for reading from a directory of data""" 191 | array = read_csv_data(source="test/data/csv/mock_datasplitter_output", fit_memory_percent=100.0) 192 | assert array.shape == (22, 4) 
193 | 194 | 195 | def test_read_csv_data_sample_append(): 196 | """Test for reading data in chunks.""" 197 | array = read_csv_data(source=LARGE_DATA_4MB, fit_memory_percent=100.0) 198 | assert array.shape == (38223, 21) 199 | 200 | 201 | def test_read_csv_data_samples(): 202 | """Test for sample case where the entire dataset doesn't fit into the available memory""" 203 | total_memory_in_bytes = psutil.virtual_memory().total 204 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 205 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 206 | sample_data = read_csv_data( 207 | source=LARGE_DATA_4MB, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="U" 208 | ) 209 | assert sample_data.dtype.kind == "U" 210 | assert _convert_megabytes_to_bytes(1.9) < sample_data.nbytes <= two_mb_in_bytes 211 | 212 | 213 | def test_read_csv_data_split(): 214 | X, y = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="U") 215 | yX = read_csv_data(LARGE_DATA_4MB, output_dtype="U") 216 | assert X.shape == (38223, 20) 217 | assert y.shape == (38223,) 218 | assert np.array_equal(np.hstack((y.reshape(-1, 1), X)).astype(str), yX) 219 | assert X.dtype.kind == "U" 220 | assert y.dtype.kind == "U" 221 | 222 | 223 | def test_read_csv_data_split_limited(): 224 | total_memory_in_bytes = psutil.virtual_memory().total 225 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 226 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 227 | X, y = read_csv_data( 228 | LARGE_DATA_4MB, target_column_index=0, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="U" 229 | ) 230 | assert _convert_megabytes_to_bytes(1.9) < (X.nbytes + y.nbytes) <= two_mb_in_bytes 231 | assert X.dtype.kind == "U" 232 | assert y.dtype.kind == "U" 233 | 234 | 235 | def test_read_csv_data_samples_object(): 236 | """Test for sample case where the entire dataset doesn't fit into the available memory""" 237 | total_memory_in_bytes = psutil.virtual_memory().total 238 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 239 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 240 | sample_data = read_csv_data( 241 | source=LARGE_DATA_4MB, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="object" 242 | ) 243 | array_memory = _get_size_total(sample_data) 244 | assert _convert_megabytes_to_bytes(1.9) < array_memory <= two_mb_in_bytes 245 | assert sample_data.dtype.kind == "O" 246 | 247 | 248 | def test_read_csv_data_split_object(): 249 | X, y = read_csv_data(LARGE_DATA_4MB, target_column_index=0, output_dtype="O") 250 | yX = read_csv_data(LARGE_DATA_4MB, output_dtype="O") 251 | assert X.shape == (38223, 20) 252 | assert y.shape == (38223,) 253 | assert np.array_equal(np.hstack((y.reshape(-1, 1), X)), yX) 254 | assert X.dtype.kind == "O" 255 | assert y.dtype.kind == "O" 256 | 257 | 258 | def test_read_csv_data_split_limited_object(): 259 | total_memory_in_bytes = psutil.virtual_memory().total 260 | two_mb_in_bytes = _convert_megabytes_to_bytes(2) 261 | fraction_of_memory_to_use = two_mb_in_bytes / total_memory_in_bytes 262 | X, y = read_csv_data( 263 | LARGE_DATA_4MB, target_column_index=0, fit_memory_percent=fraction_of_memory_to_use * 100, output_dtype="O" 264 | ) 265 | arrays_memory = _get_size_total(X) + _get_size_total(y) 266 | assert _convert_megabytes_to_bytes(1.9) < arrays_memory <= two_mb_in_bytes 267 | assert X.dtype.kind == "O" 268 | assert y.dtype.kind == "O" 269 | 270 | 271 | @pytest.mark.parametrize("output_dtype", ["O", "U"]) 272 | def 
test_read_to_fit_memory_dangling_element(tmpdir_factory, output_dtype): 273 | """Test that data is read in correctly when `len(data) = 1 mod batch_size`.""" 274 | data = np.zeros((10, 10)).astype(str) 275 | for i in range(data.shape[0]): 276 | data[i, i] = str(i + 1) 277 | data_dir = tmpdir_factory.mktemp("ten_line_csv") 278 | data_file = data_dir.join("ten_lines.csv") 279 | np.savetxt(data_file.strpath, data, delimiter=",", newline="\n", fmt="%s") 280 | 281 | X_read, y_read = _read_to_fit_memory( 282 | _get_reader(data_dir.strpath, 3), 283 | psutil.virtual_memory().total, 284 | output_dtype=output_dtype, 285 | target_column_index=0, 286 | ) 287 | assert np.array_equal(data[:, 1:], X_read) 288 | assert np.array_equal(data[:, 0], y_read) 289 | 290 | 291 | def test_list_alphabetical(): 292 | """Test for checking 'list_files' returns alphabetically""" 293 | path = "test/data/csv/mock_datasplitter_output" 294 | mlio_list_files = list_files(path, pattern="*") 295 | alphabetical_files = [] 296 | for file in ["excel.csv", "manual.csv", "newline.csv", "oneline.csv"]: 297 | alphabetical_files.extend(list_files(path + "/" + file, pattern="*")) 298 | assert mlio_list_files == alphabetical_files 299 | 300 | 301 | def test_list_recursive(): 302 | """Test for checking 'list_files' lists recursively""" 303 | assert len(list_files("test/data/csv", pattern="*")) == 10 304 | -------------------------------------------------------------------------------- /test/test_robust_pca.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import numpy as np 15 | import pytest 16 | from scipy.sparse import csr_matrix 17 | 18 | from sklearn import datasets 19 | from sklearn.decomposition import PCA, TruncatedSVD 20 | 21 | from sagemaker_sklearn_extension.decomposition import RobustPCA 22 | 23 | 24 | X_iris = datasets.load_iris().data 25 | X_iris_sparse = csr_matrix(X_iris) 26 | 27 | 28 | @pytest.mark.parametrize( 29 | ["X", "n_components", "X_expected"], 30 | [ 31 | # Dense input 32 | (X_iris, 2, PCA(n_components=2).fit_transform(X_iris)), 33 | # Sparse input 34 | (X_iris_sparse, 2, TruncatedSVD().fit_transform(X_iris_sparse)), 35 | # n_components > X.shape[1], no dimension reduction 36 | (X_iris, 1000, X_iris), 37 | ], 38 | ) 39 | def test_svd(X, n_components, X_expected): 40 | svd = RobustPCA(n_components=n_components) 41 | X_observed = svd.fit_transform(X) 42 | 43 | np.testing.assert_array_almost_equal(X_observed, X_expected) 44 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 
5 | 6 | [tox] 7 | envlist = black-format,flake8,pylint,twine,py37,contrib_taei_py37 8 | skip_missing_interpreters = False 9 | 10 | [testenv:black-format] 11 | # Used during development (before committing) to format .py files. 12 | basepython = python3 13 | deps = black==19.10b0 14 | commands = 15 | black -l 120 ./ 16 | 17 | [testenv:black-check] 18 | # Used by automated build steps to check that all files are properly formatted. 19 | basepython = python3 20 | deps = black==19.10b0 21 | commands = 22 | black -l 120 --check ./ 23 | 24 | [testenv:flake8] 25 | basepython = python3 26 | skipdist = true 27 | skip_install = true 28 | deps = flake8 29 | commands = 30 | flake8 31 | 32 | [testenv:pylint] 33 | basepython = python3 34 | skipdist = true 35 | skip_install = true 36 | deps = pylint==2.3.1 37 | commands = 38 | python -m pylint --rcfile=.pylintrc -j 0 src/sagemaker_sklearn_extension 39 | 40 | [testenv:twine] 41 | basepython = python3 42 | # twine check was added starting in 1.12.0 43 | deps = twine>=1.12.0 44 | # https://github.com/pypa/twine/blob/master/docs/changelog.rst 45 | # https://packaging.python.org/guides/making-a-pypi-friendly-readme/#validating-restructuredtext-markup 46 | commands = 47 | python setup.py sdist 48 | twine check dist/*.tar.gz 49 | 50 | [testenv:py37] 51 | # {posargs} can be passed in by additional arguments specified when invoking tox. 52 | # Can be used to specify which tests to run, e.g.: tox -- -s 53 | usedevelop = True 54 | deps = 55 | -r{toxinidir}/requirements.txt 56 | .[test] 57 | conda_deps = 58 | mlio-py=0.7 59 | libprotobuf=3.13.0 60 | conda_channels = 61 | conda-forge 62 | mlio 63 | commands = 64 | coverage run --source src/sagemaker_sklearn_extension --omit src/sagemaker_sklearn_extension/contrib/* -m pytest --ignore-glob=test/contrib/* --verbose {posargs} 65 | coverage report --fail-under=90 66 | 67 | [testenv:contrib_taei_py37] 68 | # {posargs} can be passed in by additional arguments specified when invoking tox. 69 | # Can be used to specify which tests to run, e.g.: tox -- -s 70 | usedevelop = True 71 | deps = 72 | -r{toxinidir}/requirements.txt 73 | .[test] 74 | .[taei] 75 | conda_deps = 76 | mlio-py=0.7 77 | libprotobuf=3.13.0 78 | conda_channels = 79 | conda-forge 80 | mlio 81 | commands = 82 | coverage run --source src/sagemaker_sklearn_extension/contrib/taei -m pytest test/contrib/taei --verbose {posargs} 83 | coverage report --fail-under=90 84 | --------------------------------------------------------------------------------