├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── CI-python-AutoML.yml │ ├── CI-python-minimal.yml │ ├── CI-python.yml │ ├── code-scan.yml │ ├── linkcheck.yml │ ├── python-linting.yml │ └── release-ml-wrappers.yml ├── .gitignore ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── docs ├── WrapperSpecifications.md ├── object-detection-schema.md └── release-process.md ├── python ├── README.md ├── docs │ ├── api_reference.rst │ ├── code_of_conduct.rst │ ├── conf.py │ ├── contributing.rst │ ├── dataset_wrapping.rst │ ├── dependencies.rst │ ├── getting_started.rst │ ├── image_model_wrapping.rst │ ├── index.rst │ ├── license_information.rst │ ├── model_wrapper_specifications.rst │ ├── model_wrapping.rst │ ├── object_detection_model_wrapping.rst │ ├── overview.rst │ ├── privacy_policy.rst │ ├── pytorch_model_wrapping.rst │ ├── support.rst │ ├── supported_frameworks.rst │ ├── supported_models.rst │ ├── tensorflow_model_wrapping.rst │ ├── text_model_wrapping.rst │ └── versioning.rst ├── ml_wrappers │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── constants.py │ │ ├── gpu_kmeans.py │ │ └── warnings_suppressor.py │ ├── dataset │ │ ├── __init__.py │ │ ├── dataset_utils.py │ │ ├── dataset_wrapper.py │ │ └── timestamp_featurizer.py │ ├── model │ │ ├── __init__.py │ │ ├── base_wrapped_model.py │ │ ├── endpoint_wrapper.py │ │ ├── evaluator.py │ │ ├── fastai_wrapper.py │ │ ├── function_wrapper.py │ │ ├── image_model_wrapper.py │ │ ├── model_utils.py │ │ ├── model_wrapper.py │ │ ├── openai_wrapper.py │ │ ├── predictions_wrapper.py │ │ ├── pytorch_wrapper.py │ │ ├── tensorflow_wrapper.py │ │ ├── text_model_wrapper.py │ │ ├── wrapped_classification_model.py │ │ ├── wrapped_classification_without_proba_model.py │ │ └── wrapped_regression_model.py │ └── version.py ├── setup.cfg └── setup.py ├── requirements-automl.txt ├── requirements-dev.txt ├── requirements-doc.txt 
├── requirements-linting.txt ├── requirements-test.txt └── tests ├── automl ├── test_automl_image_model_wrapper.py └── test_automl_image_object_detection_model_wrapper.py ├── common_text_utils.py ├── common_utils.py ├── common_vision_utils.py ├── conftest.py ├── constants.py ├── main ├── test_dataset_wrapper.py ├── test_endpoint_wrapper.py ├── test_image_model_wrapper.py ├── test_model_wrapper.py ├── test_openai_wrapper.py ├── test_predictions_wrapper.py ├── test_pytorch_model_wrapper.py ├── test_text_model_wrapper.py ├── test_tf_model_wrapper.py └── test_timestamp_featurizer.py ├── minimal └── test_minimal.py ├── train_wrapper_utils.py └── wrapper_validator.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 119 3 | max-complexity = 20 4 | exclude = .git/, __pycache__/, dist/ 5 | ignore = G001, B023, B902 6 | show-source = True 7 | statistics = True 8 | count = True 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. 
iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/workflows/CI-python-AutoML.yml: -------------------------------------------------------------------------------- 1 | name: CI Python AutoML 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: '30 5 * * *' 10 | 11 | jobs: 12 | ci-python-automl: 13 | strategy: 14 | matrix: 15 | packageDirectory: ["ml_wrappers"] 16 | operatingSystem: [ubuntu-latest] 17 | pythonVersion: ['3.9'] 18 | 19 | runs-on: ${{ matrix.operatingSystem }} 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: conda-incubator/setup-miniconda@v3 24 | with: 25 | auto-update-conda: true 26 | python-version: ${{ matrix.pythonVersion }} 27 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 28 | name: Use Homebrew to install libomp on MacOS 29 | shell: bash -l {0} 30 | run: | 31 | brew install libomp 32 | - if: ${{ matrix.pythonVersion != '3.6' }} 33 | name: Install numpy 34 | shell: bash -l {0} 35 | run: | 36 | conda install --yes --quiet "numpy<2.0" -c conda-forge 37 | - if: ${{ matrix.operatingSystem != 'macos-latest' }} 38 | name: Install pytorch on non-MacOS 39 | shell: bash -l {0} 40 | run: | 41 | conda install --yes --quiet pytorch==2.2.2 torchvision captum cpuonly -c pytorch 42 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 43 | name: Install Anaconda packages on MacOS, which should not include cpuonly according to official docs 44 | shell: bash -l {0} 45 | run: | 46 | conda install --yes --quiet pytorch==2.2.2 torchvision captum -c pytorch 47 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 48 | name: Install lightgbm from conda on MacOS 49 | shell: bash -l {0} 50 | run: | 51 | conda install --yes -c conda-forge lightgbm 52 | - name: Install automl dependencies 53 | shell: bash -l {0} 54 | run: | 55 | pip install -r requirements-automl.txt 56 | - name: Install package 57 | shell: bash -l {0} 58 | run: | 59 | pip install 
-e ./python 60 | - name: Install test dependencies 61 | shell: bash -l {0} 62 | run: | 63 | pip install -r requirements-test.txt 64 | - name: Test with pytest 65 | shell: bash -l {0} 66 | run: | 67 | pytest ./tests/automl -s -v --durations=10 --cov='ml_wrappers' --cov-report=xml --cov-report=html 68 | - name: Upload code coverage results 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: ${{ matrix.packageDirectory }}-code-coverage-results 72 | path: htmlcov 73 | # Use always() to always run this step to publish test results when there are test failures 74 | if: ${{ always() }} 75 | - if: ${{ (matrix.operatingSystem == 'windows-latest') && (matrix.pythonVersion == '3.7') }} 76 | name: Upload to codecov 77 | id: codecovupload1 78 | uses: codecov/codecov-action@v3 79 | with: 80 | token: ${{ secrets.CODECOV_TOKEN }} 81 | directory: . 82 | env_vars: OS,PYTHON 83 | fail_ci_if_error: false 84 | files: ./coverage.xml 85 | flags: unittests 86 | name: codecov-umbrella 87 | verbose: true 88 | - if: ${{ (steps.codecovupload1.outcome == 'failure') && (matrix.pythonVersion == '3.7') && (matrix.operatingSystem == 'windows-latest') }} 89 | name: Retry upload to codecov 90 | id: codecovupload2 91 | uses: codecov/codecov-action@v3 92 | with: 93 | token: ${{ secrets.CODECOV_TOKEN }} 94 | directory: . 
95 | env_vars: OS,PYTHON 96 | fail_ci_if_error: false 97 | files: ./coverage.xml 98 | flags: unittests 99 | name: codecov-umbrella 100 | verbose: true 101 | - name: Set codecov status 102 | if: ${{ (matrix.pythonVersion == '3.7') && (matrix.operatingSystem == 'windows-latest') }} 103 | shell: bash 104 | run: | 105 | if ${{ (steps.codecovupload1.outcome == 'success') || (steps.codecovupload2.outcome == 'success') }} ; then 106 | echo fine 107 | else 108 | exit 1 109 | fi 110 | -------------------------------------------------------------------------------- /.github/workflows/CI-python-minimal.yml: -------------------------------------------------------------------------------- 1 | name: CI Python minimal environment 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: '30 5 * * *' 10 | 11 | jobs: 12 | ci-python-minimal: 13 | strategy: 14 | matrix: 15 | packageDirectory: ["ml_wrappers"] 16 | operatingSystem: [ubuntu-latest, macos-latest, windows-latest] 17 | pythonVersion: ['3.9', '3.10', '3.11'] 18 | 19 | runs-on: ${{ matrix.operatingSystem }} 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: conda-incubator/setup-miniconda@v3 24 | with: 25 | auto-update-conda: true 26 | python-version: ${{ matrix.pythonVersion }} 27 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 28 | name: Use Homebrew to install libomp on MacOS 29 | shell: bash -l {0} 30 | run: | 31 | brew install libomp 32 | - name: Install package 33 | shell: bash -l {0} 34 | run: | 35 | pip install -e ./python 36 | - name: Install test dependencies 37 | shell: bash -l {0} 38 | run: | 39 | pip install -r requirements-test.txt 40 | - name: Test with pytest 41 | shell: bash -l {0} 42 | run: | 43 | pytest ./tests/minimal -s -v --durations=10 --cov='ml_wrappers' --cov-report=xml --cov-report=html 44 | - name: Upload code coverage results 45 | uses: actions/upload-artifact@v4 46 | with: 47 | name: ${{ matrix.packageDirectory }}-${{ 
matrix.pythonVersion }}-${{ matrix.operatingSystem }}-code-coverage-results 48 | path: htmlcov 49 | # Use always() to always run this step to publish test results when there are test failures 50 | if: ${{ always() }} 51 | - if: ${{ (matrix.operatingSystem == 'windows-latest') && (matrix.pythonVersion == '3.9') }} 52 | name: Upload to codecov 53 | id: codecovupload1 54 | uses: codecov/codecov-action@v3 55 | with: 56 | token: ${{ secrets.CODECOV_TOKEN }} 57 | directory: . 58 | env_vars: OS,PYTHON 59 | fail_ci_if_error: false 60 | files: ./coverage.xml 61 | flags: unittests 62 | name: codecov-umbrella 63 | verbose: true 64 | - if: ${{ (steps.codecovupload1.outcome == 'failure') && (matrix.pythonVersion == '3.9') && (matrix.operatingSystem == 'windows-latest') }} 65 | name: Retry upload to codecov 66 | id: codecovupload2 67 | uses: codecov/codecov-action@v3 68 | with: 69 | token: ${{ secrets.CODECOV_TOKEN }} 70 | directory: . 71 | env_vars: OS,PYTHON 72 | fail_ci_if_error: false 73 | files: ./coverage.xml 74 | flags: unittests 75 | name: codecov-umbrella 76 | verbose: true 77 | - name: Set codecov status 78 | if: ${{ (matrix.pythonVersion == '3.9') && (matrix.operatingSystem == 'windows-latest') }} 79 | shell: bash 80 | run: | 81 | if ${{ (steps.codecovupload1.outcome == 'success') || (steps.codecovupload2.outcome == 'success') }} ; then 82 | echo fine 83 | else 84 | exit 1 85 | fi 86 | -------------------------------------------------------------------------------- /.github/workflows/CI-python.yml: -------------------------------------------------------------------------------- 1 | name: CI Python 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: '30 5 * * *' 10 | 11 | jobs: 12 | ci-python: 13 | strategy: 14 | matrix: 15 | packageDirectory: ["ml_wrappers"] 16 | operatingSystem: [ubuntu-latest, macos-latest, windows-latest] 17 | pythonVersion: ['3.9', '3.10', '3.11'] 18 | openaiVersion: ['0.28.1', 
'openai-latest'] 19 | exclude: 20 | - openaiVersion: '0.28.1' 21 | pythonVersion: '3.9' 22 | - openaiVersion: '0.28.1' 23 | pythonVersion: '3.10' 24 | - openaiVersion: '0.28.1' 25 | operatingSystem: 'macos-latest' 26 | - openaiVersion: '0.28.1' 27 | operatingSystem: 'windows-latest' 28 | 29 | runs-on: ${{ matrix.operatingSystem }} 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | - uses: conda-incubator/setup-miniconda@v3 34 | with: 35 | auto-update-conda: true 36 | python-version: ${{ matrix.pythonVersion }} 37 | channels: conda-forge 38 | 39 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 40 | name: Use Homebrew to install libomp on MacOS 41 | shell: bash -l {0} 42 | run: | 43 | brew install libomp 44 | 45 | - if: ${{ matrix.operatingSystem == 'windows-latest' }} 46 | name: Install pytorch on windows for python 3.9 to 3.11 47 | shell: bash -l {0} 48 | run: | 49 | conda install --yes --quiet pytorch torchvision captum cpuonly "libtiff<4.5.0" -c pytorch -c conda-forge --strict-channel-priority 50 | 51 | - if: ${{ matrix.operatingSystem == 'ubuntu-latest' }} 52 | name: Install pytorch on ubuntu for python 3.9 to 3.11 53 | shell: bash -l {0} 54 | run: | 55 | conda install --yes --quiet pytorch torchvision captum cpuonly -c pytorch -c conda-forge --strict-channel-priority 56 | 57 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 58 | name: Install pytorch on MacOS for python 3.9 to 3.11 59 | shell: bash -l {0} 60 | run: | 61 | conda install --yes --quiet pytorch torchvision captum "protobuf<5.26.0" -c pytorch -c conda-forge 62 | 63 | - if: ${{ matrix.operatingSystem == 'macos-latest' }} 64 | name: Install lightgbm from conda on MacOS 65 | shell: bash -l {0} 66 | run: | 67 | conda install --yes lightgbm -c conda-forge 68 | 69 | - name: Install backwards-compatible tf-keras for transformers 70 | shell: bash -l {0} 71 | run: | 72 | pip install tf-keras 73 | 74 | - name: Install package 75 | shell: bash -l {0} 76 | run: | 77 | pip install -e ./python 78 
| 79 | - name: Install dev dependencies 80 | shell: bash -l {0} 81 | run: | 82 | pip install -r requirements-dev.txt 83 | 84 | - name: Install test dependencies 85 | shell: bash -l {0} 86 | run: | 87 | pip install -r requirements-test.txt 88 | 89 | - if: ${{ matrix.openaiVersion != 'openai-latest' }} 90 | name: Install openai version ${{ matrix.openaiVersion }} 91 | shell: bash -l {0} 92 | run: | 93 | pip install openai==${{ matrix.openaiVersion }} 94 | 95 | - name: Test with pytest 96 | shell: bash -l {0} 97 | run: | 98 | pytest ./tests/main -s -v --durations=10 --cov='ml_wrappers' --cov-report=xml --cov-report=html 99 | 100 | - name: Upload code coverage results 101 | uses: actions/upload-artifact@v4 102 | with: 103 | name: ${{ matrix.packageDirectory }}-${{ matrix.openaiVersion }}-${{ matrix.pythonVersion }}-${{ matrix.operatingSystem }}-code-coverage-results 104 | path: htmlcov 105 | # Use always() to always run this step to publish test results when there are test failures 106 | if: ${{ always() }} 107 | 108 | - if: ${{ (matrix.operatingSystem == 'windows-latest') && (matrix.pythonVersion == '3.9') }} 109 | name: Upload to codecov 110 | id: codecovupload1 111 | uses: codecov/codecov-action@v3 112 | with: 113 | token: ${{ secrets.CODECOV_TOKEN }} 114 | directory: . 115 | env_vars: OS,PYTHON 116 | fail_ci_if_error: false 117 | files: ./coverage.xml 118 | flags: unittests 119 | name: codecov-umbrella 120 | verbose: true 121 | 122 | - if: ${{ (steps.codecovupload1.outcome == 'failure') && (matrix.pythonVersion == '3.9') && (matrix.operatingSystem == 'windows-latest') }} 123 | name: Retry upload to codecov 124 | id: codecovupload2 125 | uses: codecov/codecov-action@v3 126 | with: 127 | token: ${{ secrets.CODECOV_TOKEN }} 128 | directory: . 
129 | env_vars: OS,PYTHON 130 | fail_ci_if_error: false 131 | files: ./coverage.xml 132 | flags: unittests 133 | name: codecov-umbrella 134 | verbose: true 135 | 136 | - name: Set codecov status 137 | if: ${{ (matrix.pythonVersion == '3.9') && (matrix.operatingSystem == 'windows-latest') }} 138 | shell: bash 139 | run: | 140 | if ${{ (steps.codecovupload1.outcome == 'success') || (steps.codecovupload2.outcome == 'success') }} ; then 141 | echo fine 142 | else 143 | exit 1 144 | fi 145 | -------------------------------------------------------------------------------- /.github/workflows/code-scan.yml: -------------------------------------------------------------------------------- 1 | name: CI code scan 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | schedule: 9 | - cron: '30 5 * * *' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: ["python"] 24 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 25 | # Learn more: 26 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 27 | 28 | steps: 29 | - name: Checkout repository 30 | uses: actions/checkout@v4 31 | 32 | # Initializes the CodeQL tools for scanning. 33 | - name: Initialize CodeQL 34 | uses: github/codeql-action/init@v1 35 | with: 36 | languages: ${{ matrix.language }} 37 | # If you wish to specify custom queries, you can do so here or in a config file. 38 | # By default, queries listed here will override any specified in a config file. 39 | # Prefix the list here with "+" to use these queries and those in the config file. 
40 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 41 | 42 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 43 | # If this step fails, then you should remove it and run the build manually (see below) 44 | - name: Autobuild 45 | uses: github/codeql-action/autobuild@v1 46 | 47 | # ℹ️ Command-line programs to run using the OS shell. 48 | # 📚 https://git.io/JvXDl 49 | 50 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 51 | # and modify them (or add more) to build your code if your project 52 | # uses a compiled language 53 | 54 | #- run: | 55 | # make bootstrap 56 | # make release 57 | 58 | - name: Perform CodeQL Analysis 59 | uses: github/codeql-action/analyze@v1 60 | -------------------------------------------------------------------------------- /.github/workflows/linkcheck.yml: -------------------------------------------------------------------------------- 1 | 2 | # This is a basic workflow to help you get started with link checks in md files 3 | 4 | name: Link check 5 | 6 | # Controls when the workflow will run 7 | on: 8 | # Triggers the workflow on push or pull request events but only for the "main" branch 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | 14 | # Allows you to run this workflow manually from the Actions tab 15 | workflow_dispatch: 16 | 17 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 18 | jobs: 19 | markdown-link-check: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v4 23 | - uses: gaurav-nelson/github-action-markdown-link-check@v1 24 | with: 25 | use-verbose-mode: 'yes' 26 | -------------------------------------------------------------------------------- /.github/workflows/python-linting.yml: -------------------------------------------------------------------------------- 1 | # This workflow will lint python code with flake8. 
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python linting 5 | 6 | on: 7 | push: 8 | branches: [ main ] 9 | pull_request: 10 | branches: [ main ] 11 | schedule: 12 | - cron: '30 5 * * *' 13 | 14 | jobs: 15 | build: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python 3.10 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: '3.10' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements-linting.txt 28 | - name: Check sorted python imports using isort 29 | run: | 30 | isort . -c 31 | - name: Lint code with flake8 32 | run: | 33 | flake8 . 34 | 35 | -------------------------------------------------------------------------------- /.github/workflows/release-ml-wrappers.yml: -------------------------------------------------------------------------------- 1 | name: Release ml-wrappers to PyPI 2 | 3 | # trigger manually only ("collaborator" or more permissions required) 4 | on: 5 | workflow_dispatch: 6 | inputs: 7 | releaseType: 8 | description: "Test or Prod PyPI?" 9 | required: true 10 | default: "Test" 11 | 12 | jobs: 13 | release-build: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: fail if Test nor Prod 18 | if: ${{ ! (github.event.inputs.releaseType == 'Test' || github.event.inputs.releaseType == 'Prod') }} 19 | run: | 20 | echo "Only Test or Prod can be used." 
21 | exit 1 22 | 23 | - uses: actions/checkout@v4 24 | 25 | - uses: conda-incubator/setup-miniconda@v3 26 | with: 27 | auto-update-conda: true 28 | python-version: 3.9 29 | 30 | - name: Install pytorch on non-MacOS 31 | shell: bash -l {0} 32 | run: | 33 | conda install --yes --quiet pytorch torchvision captum cpuonly -c pytorch -c conda-forge --strict-channel-priority 34 | 35 | - name: update and upgrade pip, setuptools, wheel, and twine 36 | shell: bash -l {0} 37 | run: | 38 | python -m pip install --upgrade pip 39 | pip install --upgrade setuptools wheel twine 40 | 41 | - name: Install backwards-compatible tf-keras for transformers 42 | shell: bash -l {0} 43 | run: | 44 | pip install tf-keras 45 | 46 | - name: Install dev dependencies 47 | shell: bash -l {0} 48 | run: | 49 | pip install -r requirements-dev.txt 50 | 51 | - name: Install test dependencies 52 | shell: bash -l {0} 53 | run: | 54 | pip install -r requirements-test.txt 55 | 56 | - name: pip freeze 57 | shell: bash -l {0} 58 | run: pip freeze 59 | 60 | - name: build wheel for ml-wrappers 61 | shell: bash -l {0} 62 | run: python setup.py sdist bdist_wheel 63 | working-directory: python 64 | 65 | # run tests before publishing to PyPI 66 | - name: install ml-wrappers wheel locally 67 | shell: bash -l {0} 68 | run: find ./dist/ -name '*.whl' -exec pip install {} \; 69 | working-directory: python 70 | 71 | - name: run ml-wrappers tests 72 | shell: bash -l {0} 73 | run: pytest ./tests/main 74 | 75 | - name: Upload a ml-wrappers build result 76 | uses: actions/upload-artifact@v4 77 | with: 78 | name: ml_wrappers-${{ github.event.inputs.releaseType }} 79 | path: python/dist/ 80 | 81 | # publish to PyPI 82 | - name: Publish ml-wrappers package to Test PyPI 83 | if: ${{ github.event.inputs.releaseType == 'Test' }} 84 | uses: pypa/gh-action-pypi-publish@release/v1 85 | with: 86 | user: __token__ 87 | password: ${{ secrets.TEST_PYPI_API_TOKEN_ML_WRAPPERS }} 88 | repository_url: https://test.pypi.org/legacy/ 89 | 
packages_dir: python/dist/ 90 | - name: Publish ml-wrappers package to PyPI 91 | if: ${{ github.event.inputs.releaseType == 'Prod' }} 92 | uses: pypa/gh-action-pypi-publish@release/v1 93 | with: 94 | user: __token__ 95 | password: ${{ secrets.PYPI_API_TOKEN_ML_WRAPPERS }} 96 | packages_dir: python/dist/ 97 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | build: 5 | os: ubuntu-20.04 6 | tools: 7 | python: "3.8" 8 | 9 | sphinx: 10 | builder: html 11 | configuration: python/docs/conf.py 12 | 13 | python: 14 | install: 15 | - requirements: requirements-doc.txt 16 | - method: pip 17 | path: python 18 | 19 | formats: 20 | - epub 21 | - pdf 22 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Wrappers 2 | ![PyPI](https://img.shields.io/pypi/v/ml-wrappers) 3 | ![MIT license](https://img.shields.io/badge/License-MIT-blue.svg) 4 | ![versions](https://img.shields.io/pypi/pyversions/ml-wrappers) 5 | [![Downloads](https://static.pepy.tech/badge/ml-wrappers)](https://pepy.tech/project/ml-wrappers) 6 | 7 | [![CI Python minimal environment](https://github.com/microsoft/ml-wrappers/actions/workflows/CI-python-minimal.yml/badge.svg)](https://github.com/microsoft/ml-wrappers/actions/workflows/CI-python-minimal.yml) 8 | [![CI 
Python](https://github.com/microsoft/ml-wrappers/actions/workflows/CI-python.yml/badge.svg)](https://github.com/microsoft/ml-wrappers/actions/workflows/CI-python.yml) 9 | [![CI Python AutoML](https://github.com/microsoft/ml-wrappers/actions/workflows/CI-python-AutoML.yml/badge.svg)](https://github.com/microsoft/ml-wrappers/actions/workflows/CI-python-AutoML.yml) 10 | 11 | 12 | ## Overview and Motivation 13 | Responsible AI tools should be able to work with a broad spectrum of machine learning models and datasets. Much of this functionality is based on the ability to call predict or predict_proba on a model and get back the predicted values or probabilities in a specific format. 14 | 15 | However, there are many different models outside of scikit-learn and even within scikit-learn which have unusual outputs or require the input in a specific format. Some, like pytorch, don’t even have the predict/predict_proba function specification. 16 | 17 | We initially started adding wrappers in the https://github.com/interpretml/interpret-community repository but found that they are needed by other teams as well, including https://github.com/fairlearn/fairlearn and https://github.com/microsoft/responsible-ai-toolbox, hence the code has been moved to this repository. Anyone is welcome to use or contribute to these model and dataset wrappers. 18 | 19 | These wrappers handle a variety of frameworks, including pytorch, tensorflow, keras wrappers on tensorflow, variations on scikit-learn models (such as the SVC classification model that doesn’t have a predict_proba function), lightgbm and xgboost, as well as certain strange pipelines we have encountered from customers and internal users in the past. 20 | 21 | The dataset wrapper handles a variety of different dataset types and converts them to a common numpy or scipy sparse format for internal code to handle in one simple way. 
Hence, the code doesn’t have to worry about whether the current input is pandas or some other format, so it doesn’t have to include if/else branches everywhere in the code. 22 | 23 | The dataset wrapper simply converts the input to the common format, and after the common code finishes running, we convert the representation back to the original format, which can be handled by the original model. 24 | 25 | Currently supported data types include: 26 | 27 | - numpy.ndarray 28 | - pandas.DataFrame 29 | - pandas.Series 30 | - scipy.sparse.csr_matrix 31 | - shap.DenseData 32 | - torch.Tensor 33 | - tensorflow.python.data.ops.dataset_ops.BatchDataset 34 | 35 | For more information about the common format from the wrappers, please see the [Wrapper Specifications](https://github.com/microsoft/ml-wrappers/tree/main/docs/WrapperSpecifications.md) documentation. 36 | 37 | ## Installation 38 | 39 | To install the package, simply run: 40 | 41 | ``` 42 | pip install ml-wrappers 43 | ``` 44 | 45 | ## Code example of wrap_model 46 | 47 | ```python 48 | from ml_wrappers import wrap_model 49 | wrapped_model = wrap_model(model, input, model_task='regression') 50 | # Use wrapped model in any common code 51 | ``` 52 | 53 | ## Code example of DatasetWrapper 54 | 55 | ```python 56 | from ml_wrappers import DatasetWrapper 57 | wrapped_dataset = DatasetWrapper(input) 58 | numpy_or_scipy = wrapped_dataset.dataset 59 | # Perform some operations on common converted numpy or scipy dataset 60 | ... 61 | # Get back the original dataset type after modifications 62 | modified_input = wrapped_dataset.typed_dataset(numpy_or_scipy) 63 | ``` 64 | 65 | 66 | 67 | ## Contributing 68 | 69 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 70 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 71 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 
72 | 73 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 74 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 75 | provided by the bot. You will only need to do this once across all repos using our CLA. 76 | 77 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 78 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 79 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 80 | 81 | ## Trademarks 82 | 83 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 84 | trademarks or logos is subject to and must follow 85 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks). 86 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 87 | Any use of third-party trademarks or logos is subject to those third parties' policies. 88 | 89 | 90 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## Security 5 | 6 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
7 | 8 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 9 | 10 | ## Reporting Security Issues 11 | 12 | **Please do not report security vulnerabilities through public GitHub issues.** 13 | 14 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 15 | 16 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 17 | 18 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/en-us/msrc?rtc=1). 19 | 20 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 21 | 22 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 23 | * Full paths of source file(s) related to the manifestation of the issue 24 | * The location of the affected source code (tag/branch/commit or direct URL) 25 | * Any special configuration required to reproduce the issue 26 | * Step-by-step instructions to reproduce the issue 27 | * Proof-of-concept or exploit code (if possible) 28 | * Impact of the issue, including how an attacker might exploit the issue 29 | 30 | This information will help us triage your report more quickly. 
31 | 32 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://www.microsoft.com/en-us/msrc/bounty?rtc=1) page for more details about our active programs. 33 | 34 | ## Preferred Languages 35 | 36 | We prefer all communications to be in English. 37 | 38 | ## Policy 39 | 40 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 41 | 42 | 43 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | ## Microsoft Support Policy 10 | 11 | Support for this project, ml-wrappers, is limited to the resources listed above. 12 | -------------------------------------------------------------------------------- /docs/WrapperSpecifications.md: -------------------------------------------------------------------------------- 1 | # Wrapper Specifications: How to infer classification or regression model type 2 | 3 | In the ML Wrappers SDK there needs to be a clear understanding of the model type to have a solid contract for users and visualizations. 4 | 5 | For example, in the machine learning interpretability space for blackbox models such as in the https://github.com/interpretml/interpret-community/ library, this means that the user can pass in a function from a classifier or regressor, or a model that is a classifier or regressor. For model-specific explainers, the user would pass in the model directly. We can usually infer whether the model is a classifier or regressor in most cases. 
6 | 7 | - Functions - We can evaluate the function on the data and look at the output to understand if the model is a classifier or regressor. In general, if the user passes a function that returns a 1D array, we can infer it is a regressor. If the function returns a 2D array, we can infer it is a classifier. There is a tricky case where the function may return a 2D array of 1 column. In this case, we can throw an exception and force the user to specify model_task=(infer, classifier, regressor), and not allow automatic inferencing. The user can override this behavior if they specify an optional parameter model_task=(infer, classifier, regressor), which will have the value model_task=infer by default. 8 | 9 | - If they specify model_task=infer: 10 | - We will try to infer whether the function is for classification or regression based on the specifications above. 11 | - If they specify model_task=classifier and: 12 | - They have a 2D array - run function, treat output as classifier 13 | - They have a 1D array - add wrapper function to convert output to 2D array. Run function on samples and assert all values are probabilities. If they are not all 1, convert to a 2D array with 2 columns [1-p, p]. If they are greater than 1, throw exception. 14 | - They pass in classes parameter - run function, treat output as classifier 15 | - If they have model_task=regressor and: 16 | - They have a 2D array - if it has 1 column, treat it as regressor, if more than one column throw exception 17 | - They have a 1D array - run function, treat output as regressor 18 | - They pass in a classes parameter - throw exception, since user specified they are not using a classifier 19 | 20 | Note for some types of frameworks, like catboost, we have found that the prediction results (in this case the predicted probabilities) for a single instance may be of a different shape than prediction results for multiple instances.
In this scenario, we can call the model for both single and multiple instances and compare the output dimensionality, and if they differ by one, wrap the prediction function to add an additional dimension if a single instance is predicted on. 21 | 22 | - Models - We can convert the model to a function and then use the specifications listed above. We convert the model to a function by either using the predict_proba function, or, if it is not available, the predict function. In some specific cases, we may be able to get additional information from the model to help us decide which function to use. Specifically, if we know that the model is a Keras model, the model will always have a predict_proba method available. In this case, we can look at the shape of predict_proba, and if it has multiple columns or is a single column with values outside the range of [0, 1], we can by default use predict instead. Otherwise, we can use predict_proba. If the user specified model_task=classifier, this will always override the behavior for Keras models and specify whether to use predict or predict_proba. Also, if the user specifies that model_task=classifier, but the model does not have a predict_proba function, we can wrap the function in a one-hot vector of probabilities. After the model is converted to a function that conforms to our specifications, we can wrap that in our model wrapper, which can contain a reference to the original model in cases where it may be needed or for debugging. 23 | 24 | - Supported Frameworks - Our library can directly support the most popular machine learning frameworks. In general, based on the description above, the library can support models and functions in scikit-learn. However, we can extend support to other frameworks with the model wrapper concept. Currently, the list of supported frameworks, or frameworks we plan to support, are: 25 | - Scikit-Learn - This framework is directly supported by our APIs. 
26 | - LightGBM - We can wrap the function into a scikit-learn compatible wrapper. 27 | - XGBoost - We can wrap the function into a scikit-learn compatible wrapper. 28 | - Catboost - We can wrap the function into a scikit-learn compatible wrapper. 29 | - Keras with Tensorflow backend - Keras has both a predict_proba and predict function on all models, so it is difficult to know for sure if the model is a classifier or regressor. We can force the user to specify whether the model is a classifier or regressor in case only a single column is output, and then wrap the model in a model wrapper. If the user specifies the model is a regressor we can fix the structure to be 2D. 30 | - Pytorch - Pytorch does not have a predict or predict_proba function, but the model can be called on the dataset directly to get probabilities. The probabilities can then be transformed into predicted labels for classifiers. Similarly to Keras, we can force the user to specify whether the model is a classifier or regressor in case only a single column is output, and then wrap the model in a model wrapper. If the user specifies the model is a regressor we can fix the structure to be 2D. 31 | - ONNX - ONNX is not yet supported, but we plan to support it in the future. We can use a model wrapper to conform to the predict and predict_proba specifications the SDK requires. 32 | 33 | We would like to support caffe/caffe2 and other ML frameworks in the future as well. Please feel free to contribute to this repository. -------------------------------------------------------------------------------- /docs/object-detection-schema.md: -------------------------------------------------------------------------------- 1 | # Object Detection Scenario Documentation 2 | 3 | ML-Wrappers supports model wrapping of Pytorch object detection methods. We convert the model to a function by either using the predict_proba function, or, if it is not available, the predict function. 
4 | 5 | ## Schema 6 | For each image in the dataset, the model is used to generate predictions. Then, the predictions are filtered 7 | using non-maximal suppression (based on the iou threshold parameter). 8 | 9 | The predictions are a list of Pytorch tensors. Each tensor is composed of the labels, boxes (bounding boxes), scores. Example: 10 | 11 | ``` 12 | detections = [{'boxes': tensor([[ 97.0986, 170.7908, 241.4255, 516.5880]], grad_fn=), 'labels': tensor([2]), 'scores': tensor([0.9905], grad_fn=)}] 13 | 14 | predict_output = [[[2.0, 97.09860229492188, 170.7908172607422, 241.425537109375, 516.5879516601562, 0.9904877543449402]]] 15 | ``` 16 | 17 | ## Limitations 18 | This wrapper functionality only supports Pytorch machine learning models. 19 | -------------------------------------------------------------------------------- /docs/release-process.md: -------------------------------------------------------------------------------- 1 | # Release process for ml-wrappers 2 | 3 | When ready to release, create a separate PR in ml-wrappers to bump up the version in the version.py file under the python/ml_wrappers directory: 4 | 5 | ``` 6 | _major = '0' 7 | _minor = 8 | _patch = 9 | ``` 10 | 11 | In the notes make sure to mention all of the changes that have been introduced since the last release. Usually you can take the main description in the PR. 12 | 13 | After the PR has been merged, checkout the master branch and get the latest code. 14 | 15 | ## Release notes 16 | 17 | On the main page, click on releases, and select "Draft a new release". 18 | 19 | In "tag version", enter the version in the format v0.*.*, for example v0.10.0. Keep the target as master branch. 20 | 21 | In release title, enter either "Patch release v0.*.*" or "Release v0.*.*". 22 | 23 | In the release notes, enter the same release notes as in the PR above for all changes that have been made to the package.
24 | 25 | ## PyPI release 26 | 27 | For a guide on the PyPI release process, please see: 28 | 29 | https://packaging.python.org/tutorials/packaging-projects/ 30 | 31 | ### PyPI file 32 | 33 | Create a .pypirc file in the users home directory, it should look similar to: 34 | 35 | ``` 36 | [distutils] 37 | index-servers = 38 | pypi 39 | pypitest 40 | 41 | [pypi] 42 | repository: https://upload.pypi.org/legacy/ 43 | username: interpret-community 44 | password: PASSWORD_REMOVED 45 | 46 | [pypitest] 47 | repository: https://test.pypi.org/legacy/ 48 | username: interpret-community 49 | password: PASSWORD_REMOVED 50 | ``` 51 | 52 | Note interpret-community PyPI user is currently used to publish ml-wrappers to PyPI but this may change in the future. 53 | 54 | ### Clean repo 55 | 56 | Make sure the repo is clean prior to release on the master branch, run: 57 | 58 | ``` 59 | git clean -fdx 60 | ``` 61 | 62 | ### Creating wheel 63 | 64 | Generate the wheel file. First activate your release environment, this can be any conda environment on the release machine: 65 | ``` 66 | conda activate my_env 67 | ``` 68 | Then update setuptools and wheel, always make sure you have the latest version installed before releasing to PyPI: 69 | ``` 70 | pip install --upgrade setuptools wheel 71 | ``` 72 | Generate the wheel where setup.py is located: 73 | ``` 74 | cd (ml-wrappers location)\python 75 | python setup.py sdist bdist_wheel 76 | ``` 77 | If using WSL, it may be necessary to use 78 | ``` 79 | python setup.py sdist bdist_wheel --bdist-dir ~/temp/bdistwheel 80 | ``` 81 | You should see the following files in the dist directory: 82 | ``` 83 | dist/ 84 | ml-wrappers-0.0.1-py3-none-any.whl 85 | ml-wrappers-0.0.1.tar.gz 86 | ``` 87 | 88 | Upgrade twine before uploading to PyPI: 89 | ``` 90 | pip install --upgrade twine 91 | ``` 92 | 93 | Note: you may need to specify --user on some environments: 94 | ``` 95 | pip install --user --upgrade twine 96 | ``` 97 | 98 | Run twine upload to the PyPI 
test repository: 99 | ``` 100 | twine upload --repository pypitest dist/* 101 | ``` 102 | The twine install location may not be on PATH by default; either add it or call twine using its full path. 103 | 104 | Validate that the page looks correct on the PyPI release page. 105 | 106 | OPTIONAL: 107 | You can install and validate the package locally: 108 | 109 | pip install --index-url https://test.pypi.org/simple/ --no-deps ml-wrappers 110 | 111 | Run twine upload to the PyPI repository: 112 | ``` 113 | twine upload --repository pypi dist/* 114 | ``` -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Wrappers SDK for Python 2 | 3 | ### This package has been tested with Python 3.9, 3.10 and 3.11 4 | 5 | The Machine Learning Wrappers SDK provides a unified wrapper for various ML frameworks - to have one uniform scikit-learn format predict and predict_proba functions. 6 | 7 | Highlights of the package include: 8 | 9 | - A dataset wrapper to handle scipy sparse, pandas and numpy datasets in a uniform manner. 10 | - A model wrapper to handle models from various frameworks uniformly, including scikit-learn, tensorflow, pytorch, lightgbm and xgboost 11 | 12 | Please see the github website for the documentation and sample notebooks: 13 | https://github.com/microsoft/ml-wrappers 14 | -------------------------------------------------------------------------------- /python/docs/api_reference.rst: -------------------------------------------------------------------------------- 1 | .. _API Reference: 2 | 3 | API Reference 4 | ============= 5 | 6 | .. contents:: Table of Contents 7 | :local: 8 | 9 | ml_wrappers 10 | ----------- 11 | 12 | .. automodule:: ml_wrappers 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | ml_wrappers.common 18 | ------------------ 19 | 20 | .. 
automodule:: ml_wrappers.common 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | ml_wrappers.common.constants 26 | --------------------------- 27 | 28 | .. automodule:: ml_wrappers.common.constants 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | 33 | ml_wrappers.dataset 34 | ------------------- 35 | 36 | .. automodule:: ml_wrappers.dataset 37 | :members: 38 | :undoc-members: 39 | :show-inheritance: 40 | 41 | ml_wrappers.dataset.dataset_utils 42 | -------------------------------- 43 | 44 | .. automodule:: ml_wrappers.dataset.dataset_utils 45 | :members: 46 | :undoc-members: 47 | :show-inheritance: 48 | 49 | ml_wrappers.dataset.dataset_wrapper 50 | ---------------------------------- 51 | 52 | .. automodule:: ml_wrappers.dataset.dataset_wrapper 53 | :members: 54 | :undoc-members: 55 | :show-inheritance: 56 | 57 | ml_wrappers.dataset.timestamp_featurizer 58 | --------------------------------------- 59 | 60 | .. automodule:: ml_wrappers.dataset.timestamp_featurizer 61 | :members: 62 | :undoc-members: 63 | :show-inheritance: 64 | 65 | ml_wrappers.model 66 | ----------------- 67 | 68 | .. automodule:: ml_wrappers.model 69 | :members: 70 | :undoc-members: 71 | :show-inheritance: 72 | 73 | ml_wrappers.model.base_wrapped_model 74 | ----------------------------------- 75 | 76 | .. automodule:: ml_wrappers.model.base_wrapped_model 77 | :members: 78 | :undoc-members: 79 | :show-inheritance: 80 | 81 | ml_wrappers.model.evaluator 82 | --------------------------- 83 | 84 | .. automodule:: ml_wrappers.model.evaluator 85 | :members: 86 | :undoc-members: 87 | :show-inheritance: 88 | 89 | ml_wrappers.model.fastai_wrapper 90 | ------------------------------- 91 | 92 | .. automodule:: ml_wrappers.model.fastai_wrapper 93 | :members: 94 | :undoc-members: 95 | :show-inheritance: 96 | 97 | ml_wrappers.model.function_wrapper 98 | --------------------------------- 99 | 100 | .. 
automodule:: ml_wrappers.model.function_wrapper 101 | :members: 102 | :undoc-members: 103 | :show-inheritance: 104 | 105 | ml_wrappers.model.image_model_wrapper 106 | ------------------------------------ 107 | 108 | .. automodule:: ml_wrappers.model.image_model_wrapper 109 | :members: 110 | :undoc-members: 111 | :show-inheritance: 112 | 113 | ml_wrappers.model.model_utils 114 | ---------------------------- 115 | 116 | .. automodule:: ml_wrappers.model.model_utils 117 | :members: 118 | :undoc-members: 119 | :show-inheritance: 120 | 121 | ml_wrappers.model.model_wrapper 122 | ------------------------------ 123 | 124 | .. automodule:: ml_wrappers.model.model_wrapper 125 | :members: 126 | :undoc-members: 127 | :show-inheritance: 128 | 129 | ml_wrappers.model.predictions_wrapper 130 | ------------------------------------ 131 | 132 | .. automodule:: ml_wrappers.model.predictions_wrapper 133 | :members: 134 | :undoc-members: 135 | :show-inheritance: 136 | 137 | ml_wrappers.model.pytorch_wrapper 138 | -------------------------------- 139 | 140 | .. automodule:: ml_wrappers.model.pytorch_wrapper 141 | :members: 142 | :undoc-members: 143 | :show-inheritance: 144 | 145 | ml_wrappers.model.tensorflow_wrapper 146 | ----------------------------------- 147 | 148 | .. automodule:: ml_wrappers.model.tensorflow_wrapper 149 | :members: 150 | :undoc-members: 151 | :show-inheritance: 152 | 153 | ml_wrappers.model.text_model_wrapper 154 | ----------------------------------- 155 | 156 | .. automodule:: ml_wrappers.model.text_model_wrapper 157 | :members: 158 | :undoc-members: 159 | :show-inheritance: 160 | 161 | ml_wrappers.model.wrapped_classification_model 162 | --------------------------------------------- 163 | 164 | .. 
automodule:: ml_wrappers.model.wrapped_classification_model 165 | :members: 166 | :undoc-members: 167 | :show-inheritance: 168 | 169 | ml_wrappers.model.wrapped_classification_without_proba_model 170 | ----------------------------------------------------------- 171 | 172 | .. automodule:: ml_wrappers.model.wrapped_classification_without_proba_model 173 | :members: 174 | :undoc-members: 175 | :show-inheritance: 176 | 177 | ml_wrappers.model.wrapped_regression_model 178 | ----------------------------------------- 179 | 180 | .. automodule:: ml_wrappers.model.wrapped_regression_model 181 | :members: 182 | :undoc-members: 183 | :show-inheritance: 184 | 185 | ml_wrappers.version 186 | ------------------- 187 | 188 | .. automodule:: ml_wrappers.version 189 | :members: 190 | :undoc-members: 191 | :show-inheritance: -------------------------------------------------------------------------------- /python/docs/code_of_conduct.rst: -------------------------------------------------------------------------------- 1 | .. _code_of_conduct: 2 | 3 | Code of Conduct 4 | =============== 5 | 6 | This project has adopted the `Microsoft Open Source Code of Conduct `_. 7 | 8 | Resources: 9 | 10 | - `Microsoft Open Source Code of Conduct `_ 11 | - `Microsoft Code of Conduct FAQ `_ 12 | - Contact `opencode@microsoft.com `_ with questions or concerns -------------------------------------------------------------------------------- /python/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | 3 | # -- Path setup -------------------------------------------------------------- 4 | 5 | import os 6 | import sys 7 | 8 | sys.path.insert(0, os.path.abspath('../../python')) 9 | 10 | 11 | # -- Project information ----------------------------------------------------- 12 | 13 | project = 'ml_wrappers' 14 | author = 'Microsoft Corporation' 15 | 16 | # The full version, including alpha/beta/rc tags 17 | release = '0.4.12' 18 | 19 | 20 | # -- General configuration --------------------------------------------------- 21 | 22 | # Add any Sphinx extension module names here, as strings. They can be 23 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 24 | # ones. 25 | extensions = [ 26 | 'sphinx.ext.autodoc', 27 | 'sphinx.ext.viewcode', 28 | 'sphinx.ext.githubpages', 29 | 'sphinx.ext.napoleon', 30 | 'sphinx.ext.mathjax', 31 | 'sphinx.ext.todo', 32 | 'sphinx.ext.coverage', 33 | 'sphinx.ext.ifconfig', 34 | 'sphinx.ext.intersphinx', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.inheritance_diagram', 37 | 'sphinx.ext.autosummary' 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ['_templates'] 42 | 43 | # List of patterns, relative to source directory, that match files and 44 | # directories to ignore when looking for source files. 45 | # This pattern also affects html_static_path and html_extra_path. 46 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 47 | 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | html_theme = 'sphinx_rtd_theme' 54 | 55 | # Add any paths that contain custom static files (such as style sheets) here, 56 | # relative to this directory. They are copied after the builtin static files, 57 | # so a file named "default.css" will overwrite the builtin "default.css". 
58 | html_static_path = ['_static'] 59 | 60 | # -- Extension configuration ------------------------------------------------- 61 | 62 | # -- Options for intersphinx extension --------------------------------------- 63 | 64 | # Example configuration for intersphinx: refer to the Python standard library. 65 | intersphinx_mapping = {'https://docs.python.org/': None} 66 | 67 | # -- Options for todo extension ---------------------------------------------- 68 | 69 | # If true, `todo` and `todoList` produce output, else they produce nothing. 70 | todo_include_todos = True 71 | -------------------------------------------------------------------------------- /python/docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. _contributing: 2 | 3 | Contributing 4 | ============ 5 | 6 | We welcome contributions and suggestions! Please see the `CONTRIBUTING.md `_ file for more details. 7 | 8 | Feature Request 9 | --------------- 10 | 11 | If you have a feature request related to this project, please follow the template provided in the `feature_request.md `_ file. This template will guide you to describe the problem, the solution you'd like, any alternative solutions you've considered, and any additional context or screenshots about the feature request. 12 | 13 | Bug Report 14 | ---------- 15 | 16 | If you encounter a bug and want to report it, please use the template provided in the `bug_report.md `_ file. This template will guide you to describe the bug, the steps to reproduce it, the expected behavior, any screenshots, and any additional context about the problem. 17 | 18 | Support 19 | ------- 20 | 21 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new Issue. Support for this project, ml-wrappers, is limited to the resources listed above. 
For more details, please see the `SUPPORT.md `_ file. 22 | 23 | Microsoft Open Source Code of Conduct 24 | ------------------------------------- 25 | 26 | This project has adopted the `Microsoft Open Source Code of Conduct `_. For more information, see the `CODE_OF_CONDUCT.md `_ file. 27 | 28 | Security 29 | -------- 30 | 31 | Microsoft takes the security of our software products and services seriously. If you believe you have found a security vulnerability in any Microsoft-owned repository, please report it to us as described in the `SECURITY.md `_ file. 32 | 33 | Release Process 34 | --------------- 35 | 36 | When ready to release, create a separate PR in ml-wrappers to bump up the version in the version.py file under the python/ml_wrappers directory. For more details, please see the `release-process.md `_ file. 37 | 38 | License 39 | ------- 40 | 41 | This project is licensed under the MIT License. For more details, please see the `LICENSE.txt `_ file. -------------------------------------------------------------------------------- /python/docs/dataset_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. _dataset_wrapping: 2 | 3 | Dataset Wrapping 4 | ================ 5 | 6 | The ``DatasetWrapper`` class in the ``ml_wrappers`` package provides a uniform interface for handling datasets across different explainers. It supports various data types including numpy arrays, pandas DataFrame, pandas Series, scipy sparse matrices, shap.DenseData, torch.Tensor, and tensorflow.python.data.ops.dataset_ops.BatchDataset. 7 | 8 | .. code-block:: python 9 | 10 | from ml_wrappers.dataset import DatasetWrapper 11 | 12 | # Initialize the dataset wrapper 13 | wrapper = DatasetWrapper(dataset) 14 | 15 | Here, ``dataset`` is a matrix of feature vector examples (# examples x # features) for initializing the explainer. 
16 | 17 | The ``DatasetWrapper`` class also provides methods for operations such as summarizing data, taking the subset or sampling. It also provides an option to clear all references after use in explainers for memory optimization. 18 | 19 | .. code-block:: python 20 | 21 | # Initialize the dataset wrapper with clear_references option 22 | wrapper = DatasetWrapper(dataset, clear_references=True) 23 | 24 | The ``DatasetWrapper`` class also provides a method for sampling examples from the dataset. If the number of rows in the dataset is less than a lower bound, it returns the full dataset. If the number of rows is more than an upper bound, it samples randomly. It also provides an option to resample based on the optimal number of clusters. 25 | 26 | .. code-block:: python 27 | 28 | # Sample examples from the dataset 29 | sampled_dataset = wrapper.sample_examples() 30 | 31 | The ``DatasetWrapper`` class also provides a method to clear all references for memory optimization. 32 | 33 | .. code-block:: python 34 | 35 | # Clear all references 36 | wrapper._clear() 37 | 38 | The ``DatasetWrapper`` class is part of the ``ml_wrappers.dataset`` module, which also includes the ``CustomTimestampFeaturizer`` class for timestamp featurization. 39 | 40 | .. code-block:: python 41 | 42 | from ml_wrappers.dataset import CustomTimestampFeaturizer 43 | 44 | # Initialize the timestamp featurizer 45 | featurizer = CustomTimestampFeaturizer() -------------------------------------------------------------------------------- /python/docs/dependencies.rst: -------------------------------------------------------------------------------- 1 | .. _dependencies: 2 | 3 | Dependencies 4 | ============ 5 | 6 | The ml-wrappers library has several dependencies that are required for it to function correctly. These dependencies are listed in various files throughout the repository. 
Here are the main dependencies: 7 | 8 | python/ml_wrappers.egg-info/dependency_links.txt 9 | ------------------------------------------------ 10 | 11 | This file does not list any specific dependencies. 12 | 13 | requirements-test.txt 14 | --------------------- 15 | 16 | - pytest 17 | - pytest-cov 18 | - rai-test-utils==0.3.0 19 | 20 | requirements-linting.txt 21 | ------------------------ 22 | 23 | - flake8==4.0.1 24 | - flake8-bugbear==21.11.29 25 | - flake8-blind-except==0.1.1 26 | - flake8-breakpoint 27 | - flake8-builtins==1.5.3 28 | - flake8-logging-format==0.6.0 29 | - flake8-pytest-style 30 | - isort 31 | 32 | python/ml_wrappers.egg-info/requires.txt 33 | ---------------------------------------- 34 | 35 | - numpy 36 | - pandas 37 | - scipy 38 | - scikit-learn 39 | 40 | requirements-dev.txt 41 | -------------------- 42 | 43 | - lightgbm 44 | - xgboost 45 | - catboost 46 | - tensorflow 47 | - shap 48 | - transformers<4.40.0 49 | - datasets 50 | - raiutils 51 | - fastai 52 | - vision_explanation_methods 53 | - mlflow 54 | - joblib<1.3.0; python_version <= '3.7' 55 | - scikeras 56 | - openai; python_version >= '3.7' 57 | 58 | requirements-automl.txt 59 | ----------------------- 60 | 61 | - mlflow 62 | - azureml-automl-dnn-vision 63 | - vision_explanation_methods 64 | 65 | Please note that the versions of these dependencies are subject to change and it is always a good idea to check the latest version of the library for the most up-to-date information. -------------------------------------------------------------------------------- /python/docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | .. _getting_started: 2 | 3 | Getting Started 4 | =============== 5 | 6 | This documentation provides an overview of the ML Wrappers SDK, which is designed to provide a uniform format for wrapping datasets and models. 
7 | 8 | Installation 9 | ------------ 10 | 11 | The ML Wrappers SDK can be installed via pip: 12 | 13 | .. code-block:: bash 14 | 15 | pip install ml-wrappers 16 | 17 | Supported Models 18 | ---------------- 19 | 20 | The ML Wrappers SDK supports the following models: 21 | 22 | - Scikit-Learn 23 | - LightGBM 24 | - XGBoost 25 | - Catboost 26 | - Keras with Tensorflow backend 27 | - Pytorch 28 | - ONNX (planned for future support) 29 | 30 | For more details, please refer to the :ref:`supported_models` section. 31 | 32 | Supported Frameworks 33 | -------------------- 34 | 35 | The ML Wrappers SDK supports the following frameworks: 36 | 37 | - Scikit-Learn 38 | - LightGBM 39 | - XGBoost 40 | - Catboost 41 | - Keras with Tensorflow backend 42 | - Pytorch 43 | - ONNX (planned for future support) 44 | 45 | For more details, please refer to the :ref:`supported_frameworks` section. 46 | 47 | Model Wrapping 48 | -------------- 49 | 50 | The ML Wrappers SDK provides a way to wrap models into a uniform format. This is done by either using the predict_proba function, or, if it is not available, the predict function. For more details, please refer to the :ref:`model_wrapping` section. 51 | 52 | Dataset Wrapping 53 | ---------------- 54 | 55 | The ML Wrappers SDK provides a way to wrap datasets into a uniform format. This is done using the DatasetWrapper class. For more details, please refer to the :ref:`dataset_wrapping` section. 56 | 57 | License Information 58 | ------------------- 59 | 60 | The ML Wrappers SDK is licensed under the MIT License. For more details, please refer to the :ref:`license_information` section. 61 | 62 | Support 63 | ------- 64 | 65 | Support for this project is limited to the resources listed in the :ref:`support` section. -------------------------------------------------------------------------------- /python/docs/image_model_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. 
_image_model_wrapping: 2 | 3 | Image Model Wrapping 4 | ==================== 5 | 6 | The ML-Wrappers SDK supports model wrapping for vision-based models. The wrapping process is handled by the ``wrap_model`` function, which takes in a model, data, and a model task as parameters. The model task can be one of the following: ``ModelTask.IMAGE_CLASSIFICATION``, ``ModelTask.MULTILABEL_IMAGE_CLASSIFICATION``, or ``ModelTask.OBJECT_DETECTION``. 7 | 8 | The ``wrap_model`` function determines the type of the model and wraps it accordingly. For instance, if the model is a FastAI model, it is wrapped as a ``WrappedFastAIImageClassificationModel``. If the model is an AutoML model, it is wrapped as a ``WrappedMlflowAutomlImagesClassificationModel`` or a ``WrappedMlflowAutomlObjectDetectionModel`` depending on the model task. If the model is a callable pipeline, it is wrapped as a ``WrappedTransformerImageClassificationModel``. 9 | 10 | For object detection models, the ``wrap_model`` function can also take in an additional parameter, ``classes``, which is a list of class labels. The function returns the wrapped model and the model task. 11 | 12 | The wrapped model can then be used for various tasks such as validation and prediction. For instance, the ``validate_wrapped_classification_model`` function can be used to validate a wrapped classification model. 13 | 14 | The ML-Wrappers SDK also provides support for PyTorch models. The ``PytorchDRiseWrapper`` and ``WrappedObjectDetectionModel`` classes are used to wrap PyTorch models for object detection tasks. 15 | 16 | .. note:: 17 | The ML-Wrappers SDK currently only supports PyTorch machine learning models for object detection tasks. 18 | 19 | For more information on how to use the ML-Wrappers SDK for image model wrapping, refer to the `tests/main/test_image_model_wrapper.py` file in the repository. 
-------------------------------------------------------------------------------- /python/docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ml-wrappers documentation master file, created by 2 | sphinx-quickstart on Tue Mar 22 13:30:40 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to ml-wrappers's documentation! 7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | overview 14 | getting_started 15 | supported_models 16 | supported_frameworks 17 | model_wrapping 18 | model_wrapper_specifications 19 | dataset_wrapping 20 | pytorch_model_wrapping 21 | tensorflow_model_wrapping 22 | image_model_wrapping 23 | text_model_wrapping 24 | object_detection_model_wrapping 25 | api_reference 26 | contributing 27 | support 28 | versioning 29 | dependencies 30 | license_information 31 | code_of_conduct 32 | privacy_policy 33 | 34 | Indices and tables 35 | ================== 36 | 37 | * :ref:`genindex` 38 | * :ref:`modindex` 39 | * :ref:`search` -------------------------------------------------------------------------------- /python/docs/license_information.rst: -------------------------------------------------------------------------------- 1 | .. _license_information: 2 | 3 | License Information 4 | =================== 5 | 6 | The ml-wrappers project is licensed under the MIT License. The full license text is as follows: 7 | 8 | :: 9 | 10 | MIT License 11 | 12 | Copyright (c) Microsoft Corporation. 
13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a copy 15 | of this software and associated documentation files (the "Software"), to deal 16 | in the Software without restriction, including without limitation the rights 17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the Software is 19 | furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in all 22 | copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE. 31 | 32 | For more information about the license, please refer to the `LICENSE` file in the repository. -------------------------------------------------------------------------------- /python/docs/model_wrapper_specifications.rst: -------------------------------------------------------------------------------- 1 | .. _model_wrapper_specifications: 2 | 3 | Model Wrapper Specifications 4 | ============================ 5 | 6 | In the ML Wrappers SDK, there needs to be a clear understanding of the model type to have a solid contract for users and visualizations. This is particularly important for blackbox models such as those used in the `interpret-community` library. The user can pass in a function from a classifier or regressor, or a model that is a classifier or regressor. For model-specific explainers, the user would pass in the model directly. 
We can usually infer whether the model is a classifier or regressor in most cases. 7 | 8 | Functions 9 | --------- 10 | 11 | We can evaluate the function on the data and look at the output to understand if the model is a classifier or regressor. In general, if the user passes a function that returns a 1D array, we can infer it is a regressor. If the function returns a 2D array, we can infer it is a classifier. There is a tricky case where the function may return a 2D array of 1 column. In this case, we can throw an exception and force the user to specify model_task=(infer, classifier, regressor), and not allow automatic inferencing. The user can override this behavior if they specify an optional parameter model_task=(infer, classifier, regressor), which will have the value model_task=infer by default. 12 | 13 | Models 14 | ------ 15 | 16 | We can convert the model to a function and then use the specifications listed above. We convert the model to a function by either using the predict_proba function, or, if it is not available, the predict function. In some specific cases, we may be able to get additional information from the model to help us decide which function to use. Specifically, if we know that the model is a Keras model, the model will always have a predict_proba method available. In this case, we can look at the shape of predict_proba, and if it has multiple columns or is a single column with values outside the range of [0, 1], we can by default use predict instead. Otherwise, we can use predict_proba. If the user specified model_task=classifier, this will always override the behavior for Keras models and specify whether to use predict or predict_proba. Also, if the user specifies that model_task=classifier, but the model does not have a predict_proba function, we can wrap the function in a one-hot vector of probabilities. 
After the model is converted to a function that conforms to our specifications, we can wrap that in our model wrapper, which can contain a reference to the original model in cases where it may be needed or for debugging. 17 | 18 | Supported Frameworks 19 | -------------------- 20 | 21 | Our library can directly support the most popular machine learning frameworks. In general, based on the description above, the library can support models and functions in scikit-learn. However, we can extend support to other frameworks with the model wrapper concept. Currently, the list of supported frameworks, or frameworks we plan to support, are: 22 | 23 | - Scikit-Learn 24 | - LightGBM 25 | - XGBoost 26 | - Catboost 27 | - Keras with Tensorflow backend 28 | - Pytorch 29 | - ONNX 30 | 31 | We would like to support caffe/caffe2 and other ML frameworks in the future as well. -------------------------------------------------------------------------------- /python/docs/model_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. _model_wrapping: 2 | 3 | Model Wrapping 4 | ============== 5 | 6 | The ML Wrappers SDK provides a clear understanding of the model type to have a solid contract for users and visualizations. This is particularly important in the machine learning interpretability space for blackbox models. 7 | 8 | The user can pass in a function from a classifier or regressor, or a model that is a classifier or regressor. For model-specific explainers, the user would pass in the model directly. We can usually infer whether the model is a classifier or regressor in most cases. 9 | 10 | Functions 11 | --------- 12 | 13 | We can evaluate the function on the data and look at the output to understand if the model is a classifier or regressor. In general, if the user passes a function that returns a 1D array, we can infer it is a regressor. If the function returns a 2D array, we can infer it is a classifier. 
There is a tricky case where the function may return a 2D array of 1 column. In this case, we can throw an exception and force the user to specify model_task=(infer, classifier, regressor), and not allow automatic inferencing. The user can override this behavior if they specify an optional parameter model_task=(infer, classifier, regressor), which will have the value model_task=infer by default. 14 | 15 | Models 16 | ------ 17 | 18 | We can convert the model to a function and then use the specifications listed above. We convert the model to a function by either using the predict_proba function, or, if it is not available, the predict function. In some specific cases, we may be able to get additional information from the model to help us decide which function to use. Specifically, if we know that the model is a Keras model, the model will always have a predict_proba method available. In this case, we can look at the shape of predict_proba, and if it has multiple columns or is a single column with values outside the range of [0, 1], we can by default use predict instead. Otherwise, we can use predict_proba. If the user specified model_task=classifier, this will always override the behavior for Keras models and specify whether to use predict or predict_proba. Also, if the user specifies that model_task=classifier, but the model does not have a predict_proba function, we can wrap the function in a one-hot vector of probabilities. After the model is converted to a function that conforms to our specifications, we can wrap that in our model wrapper, which can contain a reference to the original model in cases where it may be needed or for debugging. 19 | 20 | Supported Frameworks 21 | -------------------- 22 | 23 | Our library can directly support the most popular machine learning frameworks. In general, based on the description above, the library can support models and functions in scikit-learn. 
However, we can extend support to other frameworks with the model wrapper concept. Currently, the list of supported frameworks, or frameworks we plan to support, are: 24 | 25 | - Scikit-Learn 26 | - LightGBM 27 | - XGBoost 28 | - Catboost 29 | - Keras with Tensorflow backend 30 | - Pytorch 31 | - ONNX (planned) 32 | 33 | We would like to support caffe/caffe2 and other ML frameworks in the future as well. -------------------------------------------------------------------------------- /python/docs/object_detection_model_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. _object_detection_model_wrapping: 2 | 3 | Object Detection Model Wrapping 4 | =============================== 5 | 6 | ML-Wrappers supports model wrapping of Pytorch object detection methods. The model is converted to a function by either using the predict_proba function, or, if it is not available, the predict function. 7 | 8 | Schema 9 | ------ 10 | For each image in the dataset, the model is used to generate predictions. Then, the predictions are filtered using non-maximal suppression (based on the iou threshold parameter). 11 | 12 | The predictions are a list of Pytorch tensors. Each tensor is composed of the labels, boxes (bounding boxes), and scores. 13 | 14 | Example: 15 | 16 | .. code-block:: python 17 | 18 | detections = [{'boxes': tensor([[ 97.0986, 170.7908, 241.4255, 516.5880]], grad_fn=), 'labels': tensor([2]), 'scores': tensor([0.9905], grad_fn=)}] 19 | 20 | predict_output = [[[2.0, 97.09860229492188, 170.7908172607422, 241.425537109375, 516.5879516601562, 0.9904877543449402]]] 21 | 22 | Limitations 23 | ----------- 24 | This wrapper functionality only supports Pytorch machine learning models. 25 | 26 | Model Wrapping 27 | -------------- 28 | The model wrapping process involves the following steps: 29 | 30 | 1. Processing the raw detections to generate bounding boxes, class scores, and objectness scores. 31 | 2. 
Applying non-maximal suppression and score filtering based on the iou threshold and score threshold parameters. 32 | 3. Creating a list of detection records from the image predictions. 33 | 34 | Example: 35 | 36 | .. code-block:: python 37 | 38 | class WrappedObjectDetectionModel: 39 | """A class for wrapping a object detection model in the scikit-learn style.""" 40 | 41 | def __init__(self, model: Any, number_of_classes: int, device=Device.AUTO.value) -> None: 42 | """Initialize the WrappedObjectDetectionModel with the model and evaluation function.""" 43 | self._device = torch.device(_get_device(device)) 44 | model.eval() 45 | model.to(self._device) 46 | 47 | self._model = model 48 | self._number_of_classes = number_of_classes 49 | 50 | def predict(self, x, iou_threshold: float = 0.5, score_threshold: float = 0.5): 51 | """Create a list of detection records from the image predictions.""" 52 | detections = [] 53 | for image in x: 54 | if type(image) == Tensor: 55 | raw_detections = self._model(image.to(self._device).unsqueeze(0)) 56 | else: 57 | raw_detections = self._model(T.ToTensor()(image).to(self._device).unsqueeze(0)) 58 | 59 | Supported Frameworks 60 | -------------------- 61 | The following machine learning frameworks are supported: 62 | 63 | - Scikit-Learn 64 | - LightGBM 65 | - XGBoost 66 | - Catboost 67 | - Keras with Tensorflow backend 68 | - Pytorch 69 | 70 | ONNX is not yet supported, but there are plans to support it in the future. Other ML frameworks like caffe/caffe2 are also planned to be supported in the future. -------------------------------------------------------------------------------- /python/docs/overview.rst: -------------------------------------------------------------------------------- 1 | .. _overview: 2 | 3 | Overview 4 | ======== 5 | 6 | The ml-wrappers project is a Python library that provides a unified interface for wrapping machine learning models and datasets. 
It is designed to make it easier to work with different types of models and datasets, and to facilitate the process of explaining and interpreting machine learning models. 7 | 8 | The library includes support for a variety of machine learning frameworks, including Scikit-Learn, LightGBM, XGBoost, Catboost, Keras with Tensorflow backend, Pytorch, and ONNX. It also provides a mechanism for inferring whether a model is a classifier or regressor, and for wrapping models in a way that conforms to the specifications required by the library. 9 | 10 | The ml-wrappers library also provides a DatasetWrapper class that makes it easier to perform operations such as summarizing data, taking subsets of data, and sampling data. This class can handle a variety of data types, including numpy arrays, pandas DataFrames, pandas Series, scipy sparse matrices, and more. 11 | 12 | In addition to wrapping models and datasets, the ml-wrappers library also provides a number of utilities for working with machine learning models. These include functions for evaluating models, generating augmented data, and more. 13 | 14 | The library is released under the MIT License and adheres to the Microsoft Open Source Code of Conduct. It is maintained by Microsoft and contributions are welcome. -------------------------------------------------------------------------------- /python/docs/privacy_policy.rst: -------------------------------------------------------------------------------- 1 | .. _privacy_policy: 2 | 3 | Privacy Policy 4 | ============== 5 | 6 | The ml-wrappers project does not collect any personal data. As an open-source project, it is designed to be used locally on your machine. Any data used for model training or prediction remains on your local machine and is not sent or shared with any external entities. 
7 | 8 | However, please note that if you choose to contribute to the project by submitting pull requests or issues, your username and any information you include in these submissions will be publicly visible. We recommend that you do not include any sensitive personal information in these submissions. 9 | 10 | For more information about Microsoft's privacy policies, please visit the `Microsoft Privacy Statement `_. 11 | 12 | If you have any questions or concerns about privacy in relation to the use of ml-wrappers, please contact us at opencode@microsoft.com. -------------------------------------------------------------------------------- /python/docs/pytorch_model_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. _pytorch_model_wrapping: 2 | 3 | Pytorch Model Wrapping 4 | ======================= 5 | 6 | The ML Wrappers library provides support for wrapping Pytorch models. This is achieved through the use of model wrappers and utilities specifically designed for Pytorch models. 7 | 8 | .. code-block:: python 9 | 10 | import logging 11 | import numpy as np 12 | import pandas as pd 13 | 14 | module_logger = logging.getLogger(__name__) 15 | module_logger.setLevel(logging.INFO) 16 | 17 | try: 18 | import torch 19 | except ImportError: 20 | module_logger.debug('Could not import torch, required if using a PyTorch model') 21 | 22 | try: 23 | from torchvision.transforms import ToTensor 24 | except ImportError: 25 | module_logger.debug('Could not import torchvision, required if using' + 26 | ' a vision PyTorch model') 27 | 28 | The library attempts to import the necessary Pytorch and torchvision modules. If these imports fail, a debug message is logged indicating that these modules are required when using a Pytorch model. 29 | 30 | The library provides a WrappedPytorchModel class for wrapping Pytorch models. This class is used in the wrap_model function to wrap the model if it is a Pytorch model. 31 | 32 | .. 
code-block:: python 33 | 34 | class WrappedPytorchModel(object): 35 | def __init__(self, model): 36 | self._model = model 37 | 38 | def predict(self, dataset): 39 | return self._model(dataset) 40 | 41 | def predict_proba(self, dataset): 42 | return self._model(dataset) 43 | 44 | The WrappedPytorchModel class provides a predict and predict_proba method, which call the model's predict method on the given dataset. 45 | 46 | The library also provides a PytorchModelInitializer class for initializing Pytorch models. This class is used in the wrapped_pytorch_model_initializer function to initialize the model. 47 | 48 | .. code-block:: python 49 | 50 | class PytorchModelInitializer(): 51 | def __init__(self, model_initializer, model_task): 52 | self._model_initializer = model_initializer 53 | self._model_task = model_task 54 | 55 | def __call__(self, X_train, y_train): 56 | fitted_model = self._model_initializer(X_train, y_train) 57 | wrapped_pytorch_model = WrappedPytorchModel(fitted_model) 58 | validate_wrapped_pytorch_model(wrapped_pytorch_model, X_train, 59 | self._model_task) 60 | return wrapped_pytorch_model 61 | 62 | The PytorchModelInitializer class provides a __call__ method, which initializes the model and wraps it using the WrappedPytorchModel class. 63 | 64 | .. note:: 65 | 66 | The ML Wrappers library only supports Pytorch machine learning models. -------------------------------------------------------------------------------- /python/docs/support.rst: -------------------------------------------------------------------------------- 1 | .. _support: 2 | 3 | Support 4 | ======= 5 | 6 | How to file issues and get help 7 | ------------------------------- 8 | 9 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing issues before filing new issues to avoid duplicates. For new issues, file your bug or feature request as a new issue. 
10 | 11 | Microsoft Support Policy 12 | ------------------------ 13 | 14 | Support for this project, ml-wrappers, is limited to the resources listed above. 15 | 16 | Feature request 17 | --------------- 18 | 19 | If you have a feature request related to this project, you can suggest an idea through the GitHub Issues. Please provide a clear and concise description of what the problem is and the solution you'd like. Also, describe any alternative solutions or features you've considered. You can add any other context or screenshots about the feature request. 20 | 21 | Bug report 22 | ---------- 23 | 24 | If you encounter a bug, you can create a report to help us improve. Please provide a clear and concise description of what the bug is. Include steps to reproduce the behavior and what you expected to happen. If applicable, add screenshots to help explain your problem. Also, provide information about your desktop or smartphone, including the OS, browser, and version. Add any other context about the problem. 25 | 26 | Microsoft Open Source Code of Conduct 27 | ------------------------------------- 28 | 29 | This project has adopted the `Microsoft Open Source Code of Conduct `_. For more information, see the `Microsoft Code of Conduct FAQ `_ or contact `opencode@microsoft.com `_ with any additional questions or comments. 30 | 31 | Security 32 | -------- 33 | 34 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations. If you believe you have found a security vulnerability in any Microsoft-owned repository, please report it to us as described in the `Microsoft's definition of a security vulnerability `_. Please do not report security vulnerabilities through public GitHub issues. Instead, report them to the Microsoft Security Response Center (MSRC) at `https://msrc.microsoft.com/create-report `_. 
-------------------------------------------------------------------------------- /python/docs/supported_frameworks.rst: -------------------------------------------------------------------------------- 1 | .. _supported_frameworks: 2 | 3 | Supported Frameworks 4 | ==================== 5 | 6 | The ml-wrappers library supports a variety of machine learning frameworks. The following frameworks are directly supported or can be supported with the model wrapper concept: 7 | 8 | - **Scikit-Learn**: This framework is directly supported by our APIs. 9 | 10 | - **LightGBM**: The functions of this framework can be wrapped into a scikit-learn compatible wrapper. 11 | 12 | - **XGBoost**: The functions of this framework can be wrapped into a scikit-learn compatible wrapper. 13 | 14 | - **Catboost**: The functions of this framework can be wrapped into a scikit-learn compatible wrapper. 15 | 16 | - **Keras with Tensorflow backend**: Keras has both a predict_proba and predict function on all models, so it is difficult to know for sure if the model is a classifier or regressor. We can force the user to specify whether the model is a classifier or regressor in case only a single column is output, and then wrap the model in a model wrapper. If the user specifies the model is a regressor we can fix the structure to be 2D. 17 | 18 | - **Pytorch**: Pytorch does not have a predict or predict_proba function, but the model can be called on the dataset directly to get probabilities. The probabilities can then be transformed into predicted labels for classifiers. Similarly to Keras, we can force the user to specify whether the model is a classifier or regressor in case only a single column is output, and then wrap the model in a model wrapper. If the user specifies the model is a regressor we can fix the structure to be 2D. 19 | 20 | - **ONNX**: ONNX is not yet supported, but we plan to support it in the future. 
We can use a model wrapper to conform to the predict and predict_proba specifications the SDK requires. 21 | 22 | We would like to support caffe/caffe2 and other ML frameworks in the future as well. Contributions to this repository are welcome. -------------------------------------------------------------------------------- /python/docs/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | The ML-Wrappers library supports a variety of machine learning models. The following sections provide an overview of the supported models. 7 | 8 | Scikit-Learn 9 | ------------ 10 | 11 | Scikit-Learn models are directly supported by our APIs. 12 | 13 | LightGBM 14 | -------- 15 | 16 | LightGBM models can be wrapped into a scikit-learn compatible wrapper. 17 | 18 | XGBoost 19 | ------- 20 | 21 | XGBoost models can be wrapped into a scikit-learn compatible wrapper. 22 | 23 | Catboost 24 | -------- 25 | 26 | Catboost models can be wrapped into a scikit-learn compatible wrapper. 27 | 28 | Keras with Tensorflow backend 29 | ----------------------------- 30 | 31 | Keras models have both a predict_proba and predict function on all models, so it is difficult to know for sure if the model is a classifier or regressor. We can force the user to specify whether the model is a classifier or regressor in case only a single column is output, and then wrap the model in a model wrapper. If the user specifies the model is a regressor we can fix the structure to be 2D. 32 | 33 | Pytorch 34 | ------- 35 | 36 | Pytorch does not have a predict or predict_proba function, but the model can be called on the dataset directly to get probabilities. The probabilities can then be transformed into predicted labels for classifiers. 
Similarly to Keras, we can force the user to specify whether the model is a classifier or regressor in case only a single column is output, and then wrap the model in a model wrapper. If the user specifies the model is a regressor we can fix the structure to be 2D. 37 | 38 | ONNX 39 | ---- 40 | 41 | ONNX is not yet supported, but we plan to support it in the future. We can use a model wrapper to conform to the predict and predict_proba specifications the SDK requires. 42 | 43 | Future Support 44 | -------------- 45 | 46 | We would like to support caffe/caffe2 and other ML frameworks in the future as well. Contributions to this repository are welcome. -------------------------------------------------------------------------------- /python/docs/tensorflow_model_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. _tensorflow_model_wrapping: 2 | 3 | Tensorflow Model Wrapping 4 | ========================= 5 | 6 | The ML Wrappers library provides support for wrapping Tensorflow models to conform to the required specifications for model explanations. This is achieved through the ``WrappedTensorflowModel`` class and the ``is_sequential`` function. 7 | 8 | WrappedTensorflowModel 9 | ---------------------- 10 | 11 | The ``WrappedTensorflowModel`` class is used to wrap a Tensorflow model. This class is initialized with the model to be wrapped. It provides the ``predict`` method for making predictions using the wrapped Tensorflow model. 12 | 13 | .. code-block:: python 14 | 15 | class WrappedTensorflowModel(object): 16 | def __init__(self, model): 17 | self._model = model 18 | 19 | def predict(self, dataset): 20 | if isinstance(dataset, pd.DataFrame): 21 | dataset = dataset.values 22 | return self._model.predict(dataset) 23 | 24 | is_sequential 25 | ------------- 26 | 27 | The ``is_sequential`` function checks if a given model is a sequential model. It returns True if the model is a sequential model and False otherwise. 
28 | 29 | .. code-block:: python 30 | 31 | def is_sequential(model): 32 | return str(type(model)).endswith("keras.engine.sequential.Sequential'>") 33 | 34 | Tensorflow Model Initializer 35 | ---------------------------- 36 | 37 | The Tensorflow Model Initializer is a class that initializes a Tensorflow model and wraps it using the ``WrappedTensorflowModel`` class. It also validates the wrapped Tensorflow model. 38 | 39 | .. code-block:: python 40 | 41 | class TensorflowModelInitializer(): 42 | def __init__(self, model_initializer, model_task): 43 | self._model_initializer = model_initializer 44 | self._model_task = model_task 45 | 46 | def __call__(self, X_train, y_train): 47 | fitted_model = self._model_initializer(X_train, y_train) 48 | wrapped_tf_model = WrappedTensorflowModel(fitted_model) 49 | validate_wrapped_tf_model(wrapped_tf_model, X_train, self._model_task) 50 | return wrapped_tf_model 51 | 52 | The ``wrapped_tensorflow_model_initializer`` function returns an instance of the TensorflowModelInitializer class. 53 | 54 | .. code-block:: python 55 | 56 | def wrapped_tensorflow_model_initializer(model_initializer, model_task): 57 | return TensorflowModelInitializer(model_initializer, model_task) 58 | 59 | Supported Frameworks 60 | -------------------- 61 | 62 | The ML Wrappers library supports a variety of machine learning frameworks. For Tensorflow models, the library can wrap the model in a model wrapper if the user specifies whether the model is a classifier or regressor in case only a single column is output. If the user specifies the model is a regressor, the structure can be fixed to be 2D. 63 | 64 | .. note:: 65 | 66 | The library can directly support the most popular machine learning frameworks. However, support can be extended to other frameworks with the model wrapper concept. 
-------------------------------------------------------------------------------- /python/docs/text_model_wrapping.rst: -------------------------------------------------------------------------------- 1 | .. _text_model_wrapping: 2 | 3 | Text Model Wrapping 4 | =================== 5 | 6 | The ml-wrappers library provides support for wrapping text-based models. This includes both classification and question-answering models. 7 | 8 | WrappedQuestionAnsweringModel 9 | ----------------------------- 10 | 11 | The WrappedQuestionAnsweringModel class is used for wrapping a Transformers model in the scikit-learn style. 12 | 13 | .. code-block:: python 14 | 15 | class WrappedQuestionAnsweringModel(object): 16 | """A class for wrapping a Transformers model in the scikit-learn style.""" 17 | 18 | def __init__(self, model): 19 | """Initialize the WrappedQuestionAnsweringModel.""" 20 | self._model = model 21 | 22 | def predict(self, dataset): 23 | """Predict the output using the wrapped Transformers model. 24 | 25 | :param dataset: The dataset to predict on. 26 | :type dataset: ml_wrappers.DatasetWrapper 27 | """ 28 | output = [] 29 | for context, question in zip(dataset['context'], dataset['questions']): 30 | answer = self._model({'context': context, 'question': question}) 31 | output.append(answer['answer']) 32 | return output 33 | 34 | WrappedTextClassificationModel 35 | ------------------------------ 36 | 37 | The WrappedTextClassificationModel class is used for wrapping a Transformers model in the scikit-learn style. 38 | 39 | .. code-block:: python 40 | 41 | class WrappedTextClassificationModel(object): 42 | """A class for wrapping a Transformers model in the scikit-learn style.""" 43 | 44 | def __init__(self, model, multilabel=False): 45 | """Initialize the WrappedTextClassificationModel.""" 46 | self._model = model 47 | if not shap_installed: 48 | raise ImportError("SHAP is not installed. 
Please install it " + 49 | "to use WrappedTextClassificationModel.") 50 | self._wrapped_model = models.TransformersPipeline(model) 51 | self._multilabel = multilabel 52 | 53 | def predict(self, dataset): 54 | """Predict the output using the wrapped Transformers model. 55 | 56 | :param dataset: The dataset to predict on. 57 | :type dataset: ml_wrappers.DatasetWrapper 58 | """ 59 | pipeline_dicts = self._wrapped_model.inner_model(dataset) 60 | output = [] 61 | for val in pipeline_dicts: 62 | if not isinstance(val, list): 63 | val = [val] 64 | scores = [obj["score"] for obj in val] 65 | if self._multilabel: 66 | threshold = MULTILABEL_THRESHOLD 67 | labels = np.where(np.array(scores) > threshold) 68 | predictions = np.zeros(len(scores)) 69 | predictions[labels] = 1 70 | output.append(predictions) 71 | else: 72 | max_score_index = np.argmax(scores) 73 | output.append(max_score_index) 74 | return np.array(output) 75 | 76 | def predict_proba(self, dataset): 77 | """Predict the output probability using the Transformers model. 78 | 79 | :param dataset: The dataset to predict_proba on. 80 | :type dataset: ml_wrappers.DatasetWrapper 81 | """ 82 | return self._wrapped_model(dataset) 83 | 84 | The wrap_model function is used to wrap the model. It takes as input the model, the data, and the model task (in this case, text classification or question answering). The function returns a wrapped model that can be used for further processing or evaluation. 85 | 86 | .. code-block:: python 87 | 88 | from ml_wrappers import wrap_model 89 | from ml_wrappers.common.constants import ModelTask 90 | 91 | wrapped_model = wrap_model(model, data, ModelTask.TEXT_CLASSIFICATION) 92 | 93 | For more information on how to use these classes and functions, please refer to the source code and the provided examples. 
-------------------------------------------------------------------------------- /python/docs/versioning.rst: -------------------------------------------------------------------------------- 1 | .. _versioning: 2 | 3 | Versioning 4 | ========== 5 | 6 | The version of the ml-wrappers package is defined in the ``version.py`` file located in the ``python/ml_wrappers`` directory. The version is specified using three variables: ``_major``, ``_minor``, and ``_patch``. 7 | 8 | .. code-block:: python 9 | 10 | name = 'ml_wrappers' 11 | _major = '0' 12 | _minor = '4' 13 | _patch = '12' 14 | version = '{}.{}.{}'.format(_major, _minor, _patch) 15 | 16 | The version follows the format of ``major.minor.patch``. 17 | 18 | - ``major``: This is incremented for major changes or redesigns in the package. 19 | - ``minor``: This is incremented for minor changes or additions of new features. 20 | - ``patch``: This is incremented for bug fixes or minor improvements. 21 | 22 | When ready to release a new version, create a separate PR in ml-wrappers to bump up the version in the ``version.py`` file. In the notes, make sure to mention all of the changes that have been introduced since the last release. 23 | 24 | .. code-block:: python 25 | 26 | _major = '0' 27 | _minor = '<new minor version>' 28 | _patch = '<new patch version>' 29 | 30 | After the PR has been merged, check out the master branch and get the latest code. For more details on the release process, refer to the `Release Process `_ section. -------------------------------------------------------------------------------- /python/ml_wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # --------------------------------------------------------- 4 | 5 | """Module for wrapping datasets and models in one uniform format.
6 | """ 7 | from .dataset import DatasetWrapper 8 | from .model import wrap_model 9 | from .version import name, version 10 | 11 | __all__ = ['DatasetWrapper', 'wrap_model'] 12 | 13 | import atexit 14 | # Setup logging infrustructure 15 | import logging 16 | import os 17 | 18 | # Only log to disk if environment variable specified 19 | ml_wrappers_c_logs = os.environ.get('ML_WRAPPERS_C_LOGS') 20 | if ml_wrappers_c_logs is not None: 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | os.makedirs(os.path.dirname(ml_wrappers_c_logs), exist_ok=True) 24 | handler = logging.FileHandler(ml_wrappers_c_logs, mode='w') 25 | handler.setLevel(logging.INFO) 26 | logger.addHandler(handler) 27 | logger.info('Initializing logging file for ml-wrappers') 28 | 29 | def close_handler(): 30 | handler.close() 31 | logger.removeHandler(handler) 32 | atexit.register(close_handler) 33 | 34 | __name__ = name 35 | __version__ = version 36 | -------------------------------------------------------------------------------- /python/ml_wrappers/common/__init__.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # --------------------------------------------------------- 4 | 5 | """Defines a common directory shared across ML model and dataset wrappers.""" 6 | -------------------------------------------------------------------------------- /python/ml_wrappers/common/gpu_kmeans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
The code is based on the similar utility function from SHAP:
https://github.com/slundberg/shap/blob/9411b68e8057a6c6f3621765b89b24d82bee13d4/shap/utils/_legacy.py
This version makes use of cuml kmeans instead of sklearn for speed.
"""

import numpy as np

try:
    import cuml
    from cuml import KMeans
    from cuml.preprocessing import SimpleImputer
    rapids_installed = True
except BaseException:  # noqa: B036
    rapids_installed = False
from scipy.sparse import issparse


def kmeans(X, k, round_values=True):
    """Summarize a dataset with k mean samples weighted by cluster size.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.ndarray or pandas.DataFrame or any scipy.sparse matrix
    :param k: Number of means to use for approximation.
    :type k: int
    :param round_values: If True, round each dimension of every mean sample
        to the nearest value present in X[:, i] so that discrete features
        always receive a valid value.
    :type round_values: bool
    :return: The summarized background data.
    :rtype: DenseData
    """
    if not rapids_installed:
        raise RuntimeError(
            "cuML is required to use GPU explainers. Check https://rapids.ai/start.html \
                for more information on how to install it.")
    if cuml.__version__ >= '21.08':
        # Newer cuml exposes a dedicated sampling helper that also returns
        # the cluster labels needed for the per-cluster weights.
        from cuml.explainer.sampling import kmeans_sampling
        summary, group_names, labels = kmeans_sampling(X, k, round_values, detailed=True)
        return DenseData(summary, group_names, None, 1.0 * np.bincount(labels))

    # Fallback path kept for backward compatibility with older cuml versions.
    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values

    # Impute any missing values before clustering.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imputer.fit_transform(X)

    fitted = KMeans(n_clusters=k, random_state=0).fit(X)

    if round_values:
        for center_idx in range(k):
            for feat_idx in range(X.shape[1]):
                # sparse support courtesy of @PrimozGodec
                column = X[:, feat_idx].toarray().flatten() if issparse(X) else X[:, feat_idx]
                nearest = np.argmin(np.abs(column - fitted.cluster_centers_[center_idx, feat_idx]))
                fitted.cluster_centers_[center_idx, feat_idx] = X[nearest, feat_idx]
    return DenseData(
        fitted.cluster_centers_,
        group_names,
        None,
        1.0 * np.bincount(fitted.labels_)
    )


class Data:
    """Marker base class for summarized data containers."""

    def __init__(self):
        pass


class DenseData(Data):
    """Holds a dense data summary plus feature grouping and sample weights."""

    def __init__(self, data, group_names, *args):
        """Initialize the DenseData summary.

        :param data: The summarized data matrix.
        :type data: numpy.ndarray
        :param group_names: Names for each feature group.
        :type group_names: list
        :param args: Optional positional extras: groups (list of index
            arrays, or None for one group per feature) and sample weights.
        """
        default_groups = [np.array([i]) for i in range(len(group_names))]
        has_groups = len(args) > 0 and args[0] is not None
        self.groups = args[0] if has_groups else default_groups

        total_group_len = sum(len(g) for g in self.groups)
        num_samples = data.shape[0]
        transposed = False
        if total_group_len != data.shape[1]:
            # Group count matches the row axis instead: treat as transposed.
            transposed = True
            num_samples = data.shape[1]

        valid = ((not transposed and total_group_len == data.shape[1]) or
                 (transposed and total_group_len == data.shape[0]))
        assert valid, "# of names must match data matrix!"

        self.weights = args[1] if len(args) > 1 else np.ones(num_samples)
        self.weights /= np.sum(self.weights)
        weight_len = len(self.weights)
        valid = ((not transposed and weight_len == data.shape[0]) or
                 (transposed and weight_len == data.shape[1]))
        assert valid, "# weights must match data matrix!"

        self.transposed = transposed
        self.group_names = group_names
        self.data = data
        self.groups_size = len(self.groups)
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Suppresses warnings on imports."""

import os
import warnings

TF_CPP_MIN_LOG_LEVEL = 'TF_CPP_MIN_LOG_LEVEL'


class tf_warnings_suppressor(object):
    """Context manager to suppress warnings from tensorflow."""

    def __init__(self):
        """Initialize the tf_warnings_suppressor."""
        self._entered = False
        # Remember the log level configured before suppression so it can be
        # restored on exit ('0' is the tensorflow default when unset).
        if TF_CPP_MIN_LOG_LEVEL in os.environ:
            self._default_tf_log_level = os.environ[TF_CPP_MIN_LOG_LEVEL]
        else:
            self._default_tf_log_level = '0'

    def __enter__(self):
        """Begins suppressing tensorflow warnings."""
        if self._entered:
            raise RuntimeError("Cannot enter %r twice" % self)
        self._entered = True
        os.environ[TF_CPP_MIN_LOG_LEVEL] = '2'

    def __exit__(self, *exc_info):
        """Finishes suppressing tensorflow warnings.

        :param exc_info: Exception information from the with-block, if any.
        """
        if not self._entered:
            raise RuntimeError("Cannot exit %r without entering first" % self)
        os.environ[TF_CPP_MIN_LOG_LEVEL] = self._default_tf_log_level
        # Reset the guard so the suppressor can be reused; previously a
        # second `with` block on the same instance raised spuriously even
        # after a clean exit.
        self._entered = False


class shap_warnings_suppressor(object):
    """Context manager to suppress warnings from shap."""

    def __init__(self):
        """Initialize the shap_warnings_suppressor."""
        self._catch_warnings = warnings.catch_warnings()
        self._tf_warnings_suppressor = tf_warnings_suppressor()
        self._entered = False

    def __enter__(self):
        """Begins suppressing shap warnings.

        :return: The list of recorded warnings from warnings.catch_warnings.
        """
        if self._entered:
            raise RuntimeError("Cannot enter %r twice" % self)
        self._entered = True
        self._tf_warnings_suppressor.__enter__()
        # A warnings.catch_warnings instance is single-use in CPython, so a
        # fresh one is created on each entry to keep this suppressor reusable.
        self._catch_warnings = warnings.catch_warnings()
        log = self._catch_warnings.__enter__()
        warnings.filterwarnings('ignore', 'Starting from version 2.2.1', UserWarning)
        return log

    def __exit__(self, *exc_info):
        """Finishes suppressing shap warnings.

        :param exc_info: Exception information from the with-block, if any.
        """
        if not self._entered:
            raise RuntimeError("Cannot exit %r without entering first" % self)
        # Forward exception info to the nested managers and reset the guard
        # so this suppressor can also be reused.
        self._tf_warnings_suppressor.__exit__(*exc_info)
        self._catch_warnings.__exit__(*exc_info)
        self._entered = False
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines helpful utilities for the DatasetWrapper."""

import logging

import numpy as np
from scipy.sparse import csr_matrix, issparse
from scipy.sparse import vstack as sparse_vstack
from sklearn.utils import shuffle
from sklearn.utils.sparsefuncs import csc_median_axis_0

from ..common.gpu_kmeans import kmeans
from ..common.warnings_suppressor import shap_warnings_suppressor

with shap_warnings_suppressor():
    try:
        import shap
        shap_installed = True
    except BaseException:  # noqa: B036
        shap_installed = False

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)


def _generate_augmented_data(x, max_num_of_augmentations=np.inf):
    """Augment x by appending x with itself shuffled columnwise many times.

    :param x: data that has to be augmented, array or sparse matrix of 2 dimensions
    :type x: numpy.ndarray or scipy.sparse.csr_matrix
    :param max_num_of_augmentations: number of times we stack permuted x to augment
        (the docstring previously documented a nonexistent ``max_augment_data_size``
        parameter).
    :type max_num_of_augmentations: int
    :return: augmented data with roughly number of rows that are equal to number of columns
    :rtype: numpy.ndarray or scipy.sparse.csr_matrix
    """
    x_augmented = x
    vstack = sparse_vstack if issparse(x) else np.vstack
    for i in range(min(x.shape[1] // x.shape[0] - 1, max_num_of_augmentations)):
        x_permuted = shuffle(x.T, random_state=i).T
        x_augmented = vstack([x_augmented, x_permuted])

    return x_augmented


def _summarize_data(X, k=10, use_gpu=False, to_round_values=True):
    """Summarize a dataset.

    For dense dataset, use k mean samples weighted by the number of data
    points they each represent.
    For sparse dataset, use a sparse row for the background with calculated
    median for dense columns.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
    :param k: Number of cluster centroids to use for approximation.
    :type k: int
    :param use_gpu: Whether to use the cuml (GPU) kmeans implementation
        instead of the shap implementation.
    :type use_gpu: bool
    :param to_round_values: When using kmeans, for each element of every cluster
        centroid to match the nearest value from X in the corresponding dimension.
        This ensures discrete features always get a valid value.
        Ignored for sparse data sample.
    :type to_round_values: bool
    :return: summarized numpy array or csr_matrix object.
    :rtype: numpy.ndarray or scipy.sparse.csr_matrix or DenseData
    """
    is_sparse = issparse(X)
    if not str(type(X)).endswith(".DenseData'>"):
        if is_sparse:
            module_logger.debug('Creating sparse data summary as csr matrix')
            # calculate median of sparse background data
            median_dense = csc_median_axis_0(X.tocsc())
            return csr_matrix(median_dense)
        elif len(X) > 10 * k:
            module_logger.debug('Create dense data summary with k-means')
            # use kmeans to summarize the examples for initialization
            # if there are more than 10 x k of them
            if use_gpu:
                return kmeans(X, k, to_round_values)
            else:
                if not shap_installed:
                    raise RuntimeError('shap is required to compute dataset summary in DatasetWrapper')
                return shap.kmeans(X, k, to_round_values)
    return X


def _convert_batch_dataset_to_numpy(batch_dataset):
    """Convert a TensorFlow batch dataset to a numpy array.

    :param batch_dataset: batch dataset to convert
    :type batch_dataset: BatchDataset
    :return: data, feature names and batch size
    :rtype: numpy.ndarray, list, int
    """
    batches = []
    set_keys = False
    features = []
    batch_size = 0
    for data, _ in batch_dataset:
        columns = []
        for column in data.values():
            columns.append(np.array(column))
        if not set_keys:
            # Feature names and batch size come from the first batch only.
            for key in data.keys():
                features.append(key)
            batch_size = columns[0].shape[0]
            set_keys = True
        batches.append(np.stack(columns, axis=1))
    converted_data = np.vstack(batches)
    return converted_data, features, batch_size
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines a custom timestamp featurizer for converting timestamp columns to numeric."""

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, TransformerMixin


class CustomTimestampFeaturizer(BaseEstimator, TransformerMixin):
    """An estimator for featurizing timestamp columns to numeric data.

    :param features: Optional feature column names.
    :type features: list[str]
    :param return_pandas: Whether to return the transformed dataset as a pandas DataFrame.
    :type return_pandas: bool
    :param modify_in_place: Whether to modify the original dataset in place.
    :type modify_in_place: bool
    """

    def __init__(self, features=None, return_pandas=False, modify_in_place=False):
        """Initialize the CustomTimestampFeaturizer.

        :param features: Optional feature column names.
        :type features: list[str]
        :param return_pandas: Whether to return the transformed dataset as a pandas DataFrame.
        :type return_pandas: bool
        :param modify_in_place: Whether to modify the original dataset in place.
        :type modify_in_place: bool
        """
        self.features = features
        self.return_pandas = return_pandas
        self.modify_in_place = modify_in_place
        self._time_col_names = []

    def fit(self, X, y=None):
        """Fits the CustomTimestampFeaturizer.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        :param y: Optional target values (None for unsupervised transformations).
        :return: The fitted featurizer.
        :rtype: CustomTimestampFeaturizer
        """
        # Previously summarized (DenseData) or sparse input must already be
        # numeric, so there is nothing to featurize in those cases.
        if str(type(X)).endswith(".DenseData'>") or issparse(X):
            return self
        # Temporarily view numpy input as pandas for uniform timestamp handling.
        frame = pd.DataFrame(X, columns=self.features) if isinstance(X, np.ndarray) else X
        self._time_col_names = [col for col in frame.columns if is_datetime(frame[col])]
        # Record the minimum timestamp per column; transform uses it as the
        # origin when computing elapsed-time features.
        self._min = [frame[col].map(lambda ts: ts.timestamp()).min()
                     for col in self._time_col_names]
        return self

    def transform(self, X):
        """Transforms the timestamp columns to numeric type in the given dataset.

        Specifically, extracts the year, month, day, hour, minute, second and
        time since the minimum timestamp in the training dataset.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        :return: The transformed dataset.
        :rtype: numpy.ndarray or scipy.sparse.csr_matrix
        """
        frame = X
        if self._time_col_names:
            if isinstance(X, np.ndarray):
                # Temporarily convert to pandas for easier and uniform
                # timestamp handling.
                frame = pd.DataFrame(X, columns=self.features)
            elif not self.modify_in_place:
                # If originally pandas, copy to avoid changing the caller's
                # dataset.
                frame = X.copy()
            for idx, col in enumerate(self._time_col_names):
                # Extract the calendar components into new suffixed columns.
                for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
                    frame[col + '_' + part] = frame[col].map(
                        lambda ts, attr=part: getattr(ts, attr))
                # Replace the column itself with the difference from the
                # training minimum, keeping its name so the position of other
                # columns stays stable for downstream transformations.
                frame[col] = frame[col].map(lambda ts: ts.timestamp() - self._min[idx])
            if not self.return_pandas:
                frame = frame.values
        return frame
-------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # --------------------------------------------------------- 4 | 5 | """Common infrastructure, class hierarchy and utilities for model explanations.""" 6 | 7 | from .endpoint_wrapper import EndpointWrapperModel 8 | from .model_wrapper import _wrap_model, wrap_model 9 | from .openai_wrapper import OpenaiWrapperModel 10 | from .pytorch_wrapper import WrappedPytorchModel 11 | from .tensorflow_wrapper import WrappedTensorflowModel, is_sequential 12 | from .wrapped_classification_model import WrappedClassificationModel 13 | from .wrapped_regression_model import WrappedRegressionModel 14 | 15 | __all__ = ['EndpointWrapperModel', 'OpenaiWrapperModel', 16 | 'WrappedClassificationModel', 'WrappedPytorchModel', 17 | 'WrappedRegressionModel', 'WrappedTensorflowModel', 18 | '_wrap_model', 'is_sequential', 'wrap_model'] 19 | -------------------------------------------------------------------------------- /python/ml_wrappers/model/base_wrapped_model.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
# ---------------------------------------------------------

"""Defines a base class for wrapping models."""


class BaseWrappedModel(object):
    """A base class for WrappedClassificationModel and WrappedRegressionModel."""

    def __init__(self, model, eval_function, examples, model_task):
        """Initialize the BaseWrappedModel.

        :param model: The wrapped model.
        :type model: object
        :param eval_function: The prediction function chosen for the model.
        :type eval_function: function
        :param examples: The model evaluation examples, used to rebuild the
            eval function after unpickling.
        :type examples: ml_wrappers.DatasetWrapper
        :param model_task: Whether the model is a classification or
            regression model.
        :type model_task: str
        """
        self._eval_function = eval_function
        self._model = model
        self._examples = examples
        self._model_task = model_task

    def __getstate__(self):
        """Influence how BaseWrappedModel is pickled.

        Removes _eval_function which may not be serializable, but only when
        _examples is available to re-create it on unpickling.

        :return: The state to be pickled, with _eval_function removed.
        :rtype: dict
        """
        odict = self.__dict__.copy()
        if self._examples is not None:
            del odict['_eval_function']
        return odict

    def __setstate__(self, state):
        """Influence how BaseWrappedModel is unpickled.

        Re-creates the _eval_function which may not be serializable.

        :param state: A dictionary of deserialized state.
        :type state: dict
        """
        self.__dict__.update(state)
        if self._examples is not None:
            # Imported lazily to avoid a circular import between the model
            # wrappers and the evaluator module.
            from ml_wrappers.model.evaluator import _eval_model
            eval_function, _ = _eval_model(self._model, self._examples, self._model_task)
            self._eval_function = eval_function
# ---------------------------------------------------------

import pandas as pd

from ..common.constants import ModelTask, SKLearn
from .function_wrapper import (_convert_to_two_cols, _FunctionWrapper,
                               _MultiVsSingleInstanceFunctionResolver)
from .pytorch_wrapper import WrappedPytorchModel
from .tensorflow_wrapper import WrappedTensorflowModel, is_sequential


def _is_classification_task(task):
    """Return True if the given task is one of the classification tasks.

    :param task: The task to check.
    :type task: str
    :return: True if the task is a classification task.
    :rtype: bool
    """
    return task in (ModelTask.CLASSIFICATION, ModelTask.IMAGE_CLASSIFICATION)


def _eval_model(model, examples, model_task):
    """Return function from model and specify the ML Domain using model evaluation on examples.

    :param model: The model to evaluate on the examples.
    :type model: model with a predict or predict_proba function
    :param examples: The model evaluation examples.
    :type examples: ml_wrappers.DatasetWrapper
    :param model_task: Optional parameter to specify whether the model is a
        classification or regression model. In most cases the type of the
        model can be inferred based on the shape of the output, where a
        classifier has a predict_proba method and outputs a 2 dimensional
        array, while a regressor has a predict method and outputs a
        1 dimensional array.
    :type model_task: str
    :return: The function chosen from given model and chosen domain.
    :rtype: (function, str)
    """
    wraps_dnn = (is_sequential(model) or
                 isinstance(model, (WrappedPytorchModel, WrappedTensorflowModel)))
    if not wraps_dnn:
        # Plain (e.g. sklearn-style) model: prefer predict_proba unless the
        # user explicitly asked for regression semantics.
        if hasattr(model, SKLearn.PREDICT_PROBA) and model_task != ModelTask.REGRESSION:
            return _eval_function(model.predict_proba, examples, model_task)
        return _eval_function(model.predict, examples, model_task)
    if model_task == ModelTask.REGRESSION:
        return _eval_function(model.predict, examples, ModelTask.REGRESSION)
    if model_task == ModelTask.IMAGE_CLASSIFICATION:
        examples_dataset = examples.dataset
        if isinstance(examples_dataset, pd.DataFrame):
            return _eval_function(model.predict_proba, examples,
                                  model_task, wrapped=True)
        is_pytorch_image_model = True
        expander = _FunctionWrapper(model.predict_proba,
                                    len(examples_dataset[0].shape),
                                    is_pytorch_image_model)
        return _eval_function(expander._function_input_expand_wrapper,
                              examples, model_task, wrapped=True)
    sample_output = model.predict_proba(examples.typed_wrapper_func(examples.dataset[0:1]))
    if sample_output.shape[1] == 1 and model_task == ModelTask.UNKNOWN:
        raise Exception("Please specify model_task to disambiguate model type since "
                        "result of calling function is 2D array of one column.")
    return _eval_function(model.predict_proba, examples, ModelTask.CLASSIFICATION)


def _eval_function(function, examples, model_task, wrapped=False):
    """Return function and specify the ML Domain using function evaluation on examples.

    :param function: The prediction function to evaluate on the examples.
    :type function: function
    :param examples: The model evaluation examples.
    :type examples: ml_wrappers.DatasetWrapper
    :param model_task: Optional parameter to specify whether the model is a
        classification or regression model. In most cases the type of the
        model can be inferred based on the shape of the output, where a
        classifier has a predict_proba method and outputs a 2 dimensional
        array, while a regressor has a predict method and outputs a
        1 dimensional array.
    :type model_task: str
    :param wrapped: Indicates if function has already been wrapped.
    :type wrapped: bool
    :return: The function chosen from given model and chosen domain.
    :rtype: (function, str)
    """
    # Try the function on a single example first; if it fails, retry with a
    # wrapper that converts a 1D array to 2D for functions that only accept
    # 2D arrays as input.
    examples_dataset = examples.dataset
    if str(type(examples_dataset)).endswith(".DenseData'>"):
        examples_dataset = examples_dataset.data
    try:
        single_output = function(examples.typed_wrapper_func(examples_dataset[0]))
        if single_output is None:
            raise Exception("Wrapped function returned None in model wrapper when called on dataset")
        batch_output = function(examples.typed_wrapper_func(examples_dataset[0:1]))
        if batch_output.shape != single_output.shape:
            if len(batch_output.shape) == len(single_output.shape) + 1:
                resolver = _MultiVsSingleInstanceFunctionResolver(function)
                return _eval_function(resolver._add_single_predict_dimension, examples, model_task)
            raise Exception("Wrapped function dimensions for single and multi predict unresolvable")
    except Exception as ex:
        # If function has already been wrapped, re-throw the error to
        # prevent an unbounded wrap-and-retry recursion.
        if wrapped:
            raise ex
        expander = _FunctionWrapper(function, len(examples_dataset[0].shape))
        return _eval_function(expander._function_input_expand_wrapper, examples,
                              model_task, wrapped=True)
    output_rank = len(single_output.shape)
    if output_rank == 2:
        if single_output.shape[1] == 1:
            # A 2D, single-column result is ambiguous: force the caller to
            # say whether this is a classifier or a regressor.
            if model_task == ModelTask.UNKNOWN:
                if isinstance(single_output, pd.DataFrame):
                    return (function, ModelTask.REGRESSION)
                raise Exception("Please specify model_task to disambiguate model type since "
                                "result of calling function is 2D array of one column.")
            if _is_classification_task(model_task):
                return _convert_to_two_cols(function, examples_dataset)
            # model_task == ModelTask.REGRESSION: the user asked for a
            # regressor, so flatten the single-column 2D output down to 1D.
            flattener = _FunctionWrapper(function)
            return (flattener._function_flatten, model_task)
        if model_task == ModelTask.UNKNOWN or _is_classification_task(model_task):
            return (function, ModelTask.CLASSIFICATION)
        raise Exception("Invalid shape for prediction: "
                        "Regression function cannot output 2D array with multiple columns")
    if output_rank == 1:
        if model_task == ModelTask.UNKNOWN:
            return (function, ModelTask.REGRESSION)
        if _is_classification_task(model_task):
            return _convert_to_two_cols(function, examples_dataset)
        return (function, model_task)
    if output_rank == 0:
        # A scalar came back: normalize it to a flattened array output.
        flattener = _FunctionWrapper(function)
        return (flattener._function_flatten, model_task)
    raise Exception("Failed to wrap function, may require custom wrapper for input function or model")
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines model wrappers and utilities for fastai tabular models."""

import numpy as np

FASTAI_TABULAR_MODEL_SUFFIX = "fastai.tabular.learner.TabularLearner'>"


def _is_fastai_tabular_model(model):
    """Determine whether the given model is a fastai tabular learner.

    :param model: The model to check.
    :type model: object
    :return: True if the model is a fastai model, False otherwise.
    :rtype: bool
    """
    # Compare against the stringified type so that fastai does not have
    # to be installed for this check to run.
    return str(type(model)).endswith(FASTAI_TABULAR_MODEL_SUFFIX)


class WrappedFastAITabularModel(object):
    """A class for wrapping a FastAI tabular model in the scikit-learn style."""

    def __init__(self, model):
        """Initialize the WrappedFastAITabularModel.

        :param model: The model to wrap.
        :type model: fastai.learner.Learner
        """
        self._model = model
        first_loader = self._model.dls[0]
        # Remember the categorical and continuous feature columns so that
        # extra (non-feature) columns can be dropped before predicting.
        self.cat_cols = first_loader.dataset.cat_names
        self.cont_cols = first_loader.dataset.cont_names

    def _fastai_predict(self, dataset, index, model=None):
        """Predict the output using the wrapped FastAI model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :param index: The index into the predicted data.
            Index 1 is for the predicted class and index
            2 is for the predicted probability.
        :type index: int
        :param model: The model to use for prediction.
            If None, the wrapped model is used.
        :type model: fastai.learner.Learner
        :return: The predicted data.
        :rtype: numpy.ndarray
        """
        model = self._model if model is None else model
        feature_cols = self.cat_cols + self.cont_cols
        results = []
        # fastai learners predict one row at a time
        for row_idx in range(len(dataset)):
            features = dataset.iloc[row_idx][feature_cols]
            results.append(np.array(model.predict(features)[index]))
        results = np.array(results)
        # Convert boolean class labels to integer class indices.
        if index == 1 and results.dtype == bool:
            results = results.astype(int)
        return results

    def _fastai_predict_without_callbacks(self, dataset, index):
        """Predict the output using the wrapped FastAI model without callbacks.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :param index: The index into the predicted data.
            Index 1 is for the predicted class and index
            2 is for the predicted probability.
        :type index: int
        :return: The predicted data.
        :rtype: numpy.ndarray
        """
        # Temporarily strip every non-default callback, since user
        # callbacks can fail during inference-only predict calls.
        default_cbs = ('TrainEvalCallback', 'Recorder', 'CastToTensor')
        extra_cbs = [cb for cb in self._model.cbs
                     if cb.__class__.__name__ not in default_cbs]
        with self._model.removed_cbs(extra_cbs) as model:
            return self._fastai_predict(dataset, index, model)

    def predict(self, dataset):
        """Predict the output value using the wrapped FastAI model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted values.
        :rtype: numpy.ndarray
        """
        try:
            return self._fastai_predict(dataset, 1)
        except Exception:
            # Retry without user callbacks, which may break inference.
            return self._fastai_predict_without_callbacks(dataset, 1)

    def predict_proba(self, dataset):
        """Predict the output probability using the FastAI model.

        :param dataset: The dataset to predict_proba on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted probabilities.
        :rtype: numpy.ndarray
        """
        try:
            return self._fastai_predict(dataset, 2)
        except Exception:
            return self._fastai_predict_without_callbacks(dataset, 2)
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines common model utilities."""

from ml_wrappers.common.warnings_suppressor import shap_warnings_suppressor

with shap_warnings_suppressor():
    try:
        from shap.utils import safe_isinstance
        shap_installed = True
    except BaseException:  # noqa: B036
        shap_installed = False


# Probability threshold used to turn multilabel scores into 0/1 indicators.
MULTILABEL_THRESHOLD = 0.5


def _is_transformers_pipeline(model):
    """Checks if the model is a transformers pipeline.

    :param model: The model to check.
    :type model: object
    :return: True if the model is a transformers pipeline, False otherwise.
    :rtype: bool
    """
    # safe_isinstance matches by qualified name, so transformers does not
    # need to be importable; without shap the check cannot run at all.
    if not shap_installed:
        return False
    return safe_isinstance(model, "transformers.pipelines.Pipeline")


def _is_callable_pipeline(model):
    """Checks if the model is a callable pipeline.

    Returns False if the model exposes a predict or predict_proba method.

    :param model: The model to check.
    :type model: object
    :return: True if the model is a callable pipeline, False otherwise.
    :rtype: bool
    """
    looks_like_sklearn = (hasattr(model, 'predict')
                          or hasattr(model, 'predict_proba'))
    return callable(model) and not looks_like_sklearn
# --------------------------------------------------------------------------
# python/ml_wrappers/model/model_wrapper.py (module preamble)
# --------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines helpful model wrapper and utils for implicitly rewrapping the model
to conform to explainer contracts."""

import logging
import warnings
from typing import Union

import numpy as np
from ml_wrappers.model.wrapped_classification_model import \
    WrappedClassificationModel
from ml_wrappers.model.wrapped_classification_without_proba_model import \
    WrappedClassificationWithoutProbaModel
from ml_wrappers.model.wrapped_regression_model import WrappedRegressionModel
from sklearn.linear_model import SGDClassifier

from ..common.constants import (Device, ModelTask, SKLearn, image_model_tasks,
                                text_model_tasks)
from ..dataset.dataset_wrapper import DatasetWrapper
from .evaluator import _eval_function, _eval_model
from .fastai_wrapper import WrappedFastAITabularModel, _is_fastai_tabular_model
from .image_model_wrapper import _wrap_image_model
from .pytorch_wrapper import WrappedPytorchModel
from .tensorflow_wrapper import WrappedTensorflowModel, is_sequential
from .text_model_wrapper import _is_transformers_pipeline, _wrap_text_model

# NOTE(review): this filter is installed inside catch_warnings() with an
# empty body, so it is reverted as soon as the block exits and has no
# lasting effect -- confirm whether it was meant to wrap the imports above.
with warnings.catch_warnings():
    warnings.filterwarnings(
        'ignore', 'Starting from version 2.2.1', UserWarning)


module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)


try:
    import torch.nn as nn
except ImportError:
    module_logger.debug(
        'Could not import torch, required if using a PyTorch model')


def wrap_model(model, examples, model_task: str = ModelTask.UNKNOWN,
               num_classes: int = None,
               classes: Union[list, np.ndarray] = None,
               device=Device.AUTO.value):
    """If needed, wraps the model in a common API based on model task and
    prediction function contract.

    :param model: The model to evaluate on the examples.
    :type model: model with a predict or predict_proba function.
    :param examples: The model evaluation examples.
        Note the examples will be wrapped in a DatasetWrapper, if not
        wrapped when input.
    :type examples: ml_wrappers.DatasetWrapper or numpy.ndarray
        or pandas.DataFrame or pandas.Series or scipy.sparse.csr_matrix
        or shap.DenseData or torch.Tensor
    :param model_task: Optional parameter to specify whether the model
        is a classification or regression model.
        In most cases, the type of the model can be inferred
        based on the shape of the output, where a classifier
        has a predict_proba method and outputs a 2 dimensional
        array, while a regressor has a predict method and
        outputs a 1 dimensional array.
    :type model_task: str
    :param num_classes: Optional parameter specifying the number of classes
        in the dataset.
    :type num_classes: int
    :param classes: Optional parameter specifying a list of class names in
        the dataset.
    :type classes: list or numpy.ndarray
    :param device: Optional parameter specifying the device to move the
        model to. If not specified, then cpu is the default.
    :type device: str, for instance: 'cpu', 'cuda'
    :return: The wrapper model.
    :rtype: model
    """
    if model_task == ModelTask.UNKNOWN and _is_transformers_pipeline(model):
        # TODO: can we also dynamically figure out the task if it was
        # originally unknown for text scenarios?
        raise ValueError("ModelTask must be specified for text-based models")
    if model_task in text_model_tasks:
        return _wrap_text_model(model, examples, model_task, False)[0]
    if model_task in image_model_tasks:
        return _wrap_image_model(model, examples, model_task,
                                 False, num_classes, classes,
                                 device)[0]
    return _wrap_model(model, examples, model_task, False)[0]


def _wrap_model(model, examples, model_task, is_function):
    """If needed, wraps the model or function in a common API based on model
    task and prediction function contract.

    :param model: The model or function to evaluate on the examples.
    :type model: function or model with a predict or predict_proba function
    :param examples: The model evaluation examples.
        Note the examples will be wrapped in a DatasetWrapper, if not
        wrapped when input.
    :type examples: ml_wrappers.DatasetWrapper or numpy.ndarray
        or pandas.DataFrame or pandas.Series or scipy.sparse.csr_matrix
        or shap.DenseData or torch.Tensor
    :param model_task: Optional parameter to specify whether the model
        is a classification or regression model.
        In most cases, the type of the model can be inferred
        based on the shape of the output, where a classifier
        has a predict_proba method and outputs a 2 dimensional
        array, while a regressor has a predict method and
        outputs a 1 dimensional array.
    :type model_task: str
    :param is_function: Whether the model parameter is a plain function
        rather than a model object.
    :type is_function: bool
    :return: The function chosen from given model and chosen domain, or
        model wrapping the function and chosen domain.
    :rtype: (function, str) or (model, str)
    """
    if not isinstance(examples, DatasetWrapper):
        examples = DatasetWrapper(examples)
    if is_function:
        return _eval_function(model, examples, model_task)
    try:
        if isinstance(model, nn.Module):
            # Wrap the model in an extra layer that converts the numpy
            # array to a pytorch Variable and adds predict and
            # predict_proba functions
            model = WrappedPytorchModel(model)
    except (NameError, AttributeError):
        # nn is undefined when the torch import at module scope failed
        module_logger.debug(
            'Could not import torch, required if using a pytorch model')
    if _is_fastai_tabular_model(model):
        model = WrappedFastAITabularModel(model)
    if is_sequential(model):
        model = WrappedTensorflowModel(model)
    if _classifier_without_proba(model):
        model = WrappedClassificationWithoutProbaModel(model)
    eval_function, eval_ml_domain = _eval_model(
        model, examples, model_task)
    if eval_ml_domain == ModelTask.CLASSIFICATION:
        return WrappedClassificationModel(model, eval_function, examples), \
            eval_ml_domain
    return WrappedRegressionModel(model, eval_function, examples), \
        eval_ml_domain


def _classifier_without_proba(model):
    """Returns True if the given model is a classifier without predict_proba,
    e.g. SGDClassifier.

    :param model: The model to evaluate on the examples.
    :type model: model with a predict or predict_proba function
    :return: True if the given model is a classifier without predict_proba.
    :rtype: bool
    """
    return isinstance(model, SGDClassifier) and not \
        hasattr(model, SKLearn.PREDICT_PROBA)
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines model wrappers and utilities for pytorch models."""

import logging

import numpy as np
import pandas as pd

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)


try:
    import torch
except ImportError:
    module_logger.debug('Could not import torch, required if using a PyTorch model')

try:
    from torchvision.transforms import ToTensor
except ImportError:
    module_logger.debug('Could not import torchvision, required if using'
                        ' a vision PyTorch model')


class WrappedPytorchModel(object):
    """A class for wrapping a PyTorch model.

    Since the dataset is unavailable at initialization time, it cannot be
    inferred here whether the model is a classifier or a regressor. A
    predict_classes method is therefore provided for classification, while
    predict returns either regression values or classification
    probabilities.
    """

    def __init__(self, model, image_to_tensor=False):
        """Initialize the WrappedPytorchModel with the model and evaluation function.

        :param model: The PyTorch model to wrap.
        :type model: torch.nn.Module
        :param image_to_tensor: Whether to convert the image to tensor.
        :type image_to_tensor: bool
        """
        self._model = model
        # Set eval automatically for user for batchnorm and dropout layers
        self._model.eval()
        self._image_to_tensor = image_to_tensor

    def _convert_to_tensor(self, dataset):
        """Convert the dataset to a pytorch tensor.

        For image datasets, ToTensor from torchvision is used, which moves
        the channel to the first dimension and, for 2D images, adds a
        third dimension.

        :param dataset: The dataset to convert.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The converted dataset.
        :rtype: torch.Tensor
        """
        data = dataset
        if isinstance(data, pd.DataFrame):
            if self._image_to_tensor:
                data = data.iloc[0]
            data = data.values
        # A 1D array of arrays is stacked into a single 3D array instead
        # of an array of 2D arrays.
        if len(data.shape) == 1:
            if self._image_to_tensor and len(data[0].shape) == 2:
                # Append a channel dimension to 2D grayscale images.
                for idx in range(data.shape[0]):
                    data[idx] = np.expand_dims(data[idx], axis=2)
            data = np.stack(data)
        if not isinstance(data, torch.Tensor):
            if self._image_to_tensor:
                # torchvision can only convert one image at a time.
                # Note the pytorch wrapper expects the extra row dimension
                # to be expanded in the evaluator for the image case;
                # otherwise this would not work for a single image passed
                # to a predict call.
                converted_rows = []
                for row_idx in range(data.shape[0]):
                    row = data[row_idx]
                    if not isinstance(row, torch.Tensor):
                        row = ToTensor()(row)
                    converted_rows.append(row)
                data = torch.stack(converted_rows)
            else:
                data = torch.Tensor(data)
        return data

    def predict(self, dataset):
        """Predict the output using the wrapped PyTorch model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The prediction results.
        :rtype: numpy.ndarray
        """
        tensor_data = self._convert_to_tensor(dataset)
        with torch.no_grad():
            output = self._model(tensor_data).numpy()
        # Reshape a 1D output to 2D when the input was a single row.
        if len(dataset.shape) == 1:
            output = output.reshape(1, -1)
        return output

    def predict_classes(self, dataset):
        """Predict the class using the wrapped PyTorch model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted classes.
        :rtype: numpy.ndarray
        """
        tensor_data = self._convert_to_tensor(dataset)
        with torch.no_grad():
            raw_output = self._model(tensor_data)
        rank = len(raw_output.shape)
        if rank == 1 or (rank > 1 and raw_output.shape[1] == 1):
            # Single-output models: threshold the score at 0.5.
            labels = np.where(raw_output.numpy() > 0.5, 1, 0)
        else:
            # Multi-output models: take the argmax across columns.
            labels = torch.max(raw_output, 1)[1].numpy()
        # Reshape a 1D output to 2D when the input was a single row.
        if len(dataset.shape) == 1:
            labels = labels.reshape(1, -1)
        return labels

    def predict_proba(self, dataset):
        """Predict the output probability using the wrapped PyTorch model.

        :param dataset: The dataset to predict_proba on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted probabilities.
        :rtype: numpy.ndarray
        """
        return self.predict(dataset)
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines model wrappers and utilities for tensorflow models."""

import numpy as np
import pandas as pd

# Name of the legacy predict_classes method, removed after tensorflow 2.6.
PREDICT_CLASSES = 'predict_classes'


def is_sequential(model):
    """Returns True if the model is a sequential model.

    Note the model class name can be
    keras.src.engine.sequential.Sequential,
    keras.engine.sequential.Sequential or
    tensorflow.python.keras.engine.sequential.Sequential
    depending on the tensorflow version.
    In the 2.13 version, the namespace changed from
    keras.engine to keras.src.engine, and in 2.17 it changed
    again to keras.src.models.
    The check includes all of these cases.

    :param model: The model to check.
    :type model: tf.keras.Model
    :return: True if the model is a sequential model.
    :rtype: bool
    """
    # str.endswith accepts a tuple of suffixes, so a single call covers
    # every known namespace without building a list for any().
    sequential_suffixes = (
        # pre-2.13 namespace
        "keras.engine.sequential.Sequential'>",
        # namespace introduced in tensorflow 2.13
        "keras.src.engine.sequential.Sequential'>",
        # namespace introduced in tensorflow 2.17
        "keras.src.models.sequential.Sequential'>",
    )
    return str(type(model)).endswith(sequential_suffixes)


class WrappedTensorflowModel(object):
    """A class for wrapping a TensorFlow model.

    Note at time of initialization, since we don't have
    access to the dataset, we can't infer if this is for
    classification or regression case. Hence, we add
    the predict_classes method for classification, and keep
    predict for either outputting values in regression or
    probabilities in classification.
    """

    def __init__(self, model):
        """Initialize the WrappedTensorflowModel with the model.

        :param model: The model to wrap.
        :type model: tf.keras.Model
        """
        self._model = model

    def predict(self, dataset):
        """Predict the output using the wrapped TensorFlow model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The prediction results.
        :rtype: numpy.ndarray
        """
        # Convert the data to numpy
        if isinstance(dataset, pd.DataFrame):
            dataset = dataset.values
        return self._model.predict(dataset)

    def predict_classes(self, dataset):
        """Predict the class using the wrapped TensorFlow model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted classes.
        :rtype: numpy.ndarray
        """
        # Note predict_classes was removed for models after
        # tensorflow version 2.6; prefer it when still available.
        if hasattr(self._model, PREDICT_CLASSES):
            return self._model.predict_classes(dataset)
        probabilities = self.predict_proba(dataset)
        return np.argmax(probabilities, axis=1)

    def predict_proba(self, dataset):
        """Predict the output probability using the wrapped TensorFlow model.

        :param dataset: The dataset to predict_proba on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted probabilities.
        :rtype: numpy.ndarray
        """
        return self.predict(dataset)
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines wrappers for text-based models."""

import numpy as np
from ml_wrappers.common.constants import ModelTask
from ml_wrappers.common.warnings_suppressor import shap_warnings_suppressor
from ml_wrappers.model.model_utils import (MULTILABEL_THRESHOLD,
                                           _is_transformers_pipeline)

with shap_warnings_suppressor():
    try:
        from shap import models
        shap_installed = True
    except BaseException:  # noqa: B036
        shap_installed = False


def _wrap_text_model(model, examples, model_task, is_function):
    """If needed, wraps the model or function in a common API.

    Wraps the model based on model task and prediction function contract.

    :param model: The model or function to evaluate on the examples.
    :type model: function or model with a predict or predict_proba function
    :param examples: The model evaluation examples.
        Note the examples will be wrapped in a DatasetWrapper, if not
        wrapped when input.
    :type examples: ml_wrappers.DatasetWrapper or numpy.ndarray
        or pandas.DataFrame or pandas.Series or scipy.sparse.csr_matrix
        or shap.DenseData or torch.Tensor
    :param model_task: Parameter to specify whether the model is a
        'text_classification', 'sentiment_analysis', 'question_answering',
        'entailment' or 'summarizations' text model.
    :type model_task: str
    :return: The function chosen from given model and chosen domain, or
        model wrapping the function and chosen domain.
    :rtype: (function, str) or (model, str)
    """
    wrapped_model = model
    # Non-pipeline models are passed through unchanged.
    if _is_transformers_pipeline(model):
        if model_task == ModelTask.TEXT_CLASSIFICATION:
            wrapped_model = WrappedTextClassificationModel(model)
        elif model_task == ModelTask.QUESTION_ANSWERING:
            wrapped_model = WrappedQuestionAnsweringModel(model)
        elif model_task == ModelTask.MULTILABEL_TEXT_CLASSIFICATION:
            wrapped_model = WrappedTextClassificationModel(model,
                                                           multilabel=True)
    return wrapped_model, model_task


class WrappedTextClassificationModel(object):
    """A class for wrapping a Transformers model in the scikit-learn style."""

    def __init__(self, model, multilabel=False):
        """Initialize the WrappedTextClassificationModel.

        :param model: The transformers pipeline to wrap.
        :param multilabel: Whether predictions are multilabel indicators
            rather than a single class index.
        :type multilabel: bool
        """
        self._model = model
        if not shap_installed:
            raise ImportError("SHAP is not installed. Please install it "
                              "to use WrappedTextClassificationModel.")
        self._wrapped_model = models.TransformersPipeline(model)
        self._multilabel = multilabel

    def predict(self, dataset):
        """Predict the output using the wrapped Transformers model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: Class indices, or a 0/1 indicator matrix when multilabel.
        :rtype: numpy.ndarray
        """
        raw_outputs = self._wrapped_model.inner_model(dataset)
        results = []
        for entry in raw_outputs:
            entries = entry if isinstance(entry, list) else [entry]
            scores = [item["score"] for item in entries]
            if self._multilabel:
                # Jagged, thresholded array of labels the model predicted.
                positive = np.where(np.array(scores) > MULTILABEL_THRESHOLD)
                # Indicator matrix of labels since numpy does not support
                # jagged arrays; this matches the format used by
                # sklearn.multioutput.MultiOutputClassifier.predict.
                indicator = np.zeros(len(scores))
                indicator[positive] = 1
                results.append(indicator)
            else:
                results.append(np.argmax(scores))
        return np.array(results)

    def predict_proba(self, dataset):
        """Predict the output probability using the Transformers model.

        :param dataset: The dataset to predict_proba on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted probabilities.
        :rtype: numpy.ndarray
        """
        return self._wrapped_model(dataset)


class WrappedQuestionAnsweringModel(object):
    """A class for wrapping a Transformers model in the scikit-learn style."""

    def __init__(self, model):
        """Initialize the WrappedQuestionAnsweringModel.

        :param model: The question-answering pipeline to wrap.
        """
        self._model = model

    def predict(self, dataset):
        """Predict the output using the wrapped Transformers model.

        :param dataset: The dataset to predict on; must provide 'context'
            and 'questions' columns.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted answer strings.
        :rtype: numpy.ndarray
        """
        answers = [
            self._model({'context': context, 'question': question})['answer']
            for context, question in zip(dataset['context'],
                                         dataset['questions'])
        ]
        return np.array(answers)
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines a class for wrapping classification models."""

import numpy as np
import pandas as pd
from ml_wrappers.common.constants import ModelTask, SKLearn
from ml_wrappers.model.base_wrapped_model import BaseWrappedModel
from ml_wrappers.model.function_wrapper import _FunctionWrapper
from ml_wrappers.model.pytorch_wrapper import WrappedPytorchModel
from ml_wrappers.model.tensorflow_wrapper import (WrappedTensorflowModel,
                                                  is_sequential)


class WrappedClassificationModel(BaseWrappedModel):
    """A class for wrapping a classification model."""

    def __init__(self, model, eval_function, examples=None):
        """Initialize the WrappedClassificationModel with the model and evaluation function."""
        super(WrappedClassificationModel, self).__init__(
            model, eval_function, examples, ModelTask.CLASSIFICATION)

    def predict(self, dataset):
        """Predict the output using the wrapped classification model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted classes.
        :rtype: numpy.ndarray
        """
        model = self._model
        uses_predict_classes = (is_sequential(model)
                                or isinstance(model, (WrappedPytorchModel,
                                                      WrappedTensorflowModel)))
        if uses_predict_classes:
            predict_classes_fn = self._wrap_function(model.predict_classes)
            return predict_classes_fn(dataset).flatten()
        predict_fn = self._wrap_function(model.predict)
        predictions = predict_fn(dataset)
        if isinstance(predictions, pd.DataFrame):
            predictions = predictions.values.ravel()
        # Handle the possible case where the model has only a predict
        # function that outputs probabilities. Note this differs from
        # WrappedClassificationWithoutProbaModel, where predict_proba is
        # missing but predict already outputs classes.
        if not hasattr(model, SKLearn.PREDICT_PROBA):
            if len(predictions.shape) == 1:
                return np.argmax(predictions)
            return np.argmax(predictions, axis=1)
        # Collapse a two-dimensional, single-column prediction array.
        if len(predictions.shape) == 2 and predictions.shape[1] == 1:
            predictions = predictions.ravel()
        return np.array(predictions)

    def predict_proba(self, dataset):
        """Predict the output probability using the wrapped model.

        :param dataset: The dataset to predict_proba on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted probabilities.
        :rtype: numpy.ndarray
        """
        probabilities = self._eval_function(dataset)
        if isinstance(probabilities, pd.DataFrame):
            probabilities = probabilities.values
        return probabilities

    def _wrap_function(self, function):
        """Wrap a function to conform to the prediction input contracts.

        If the stored eval function was wrapped with
        _function_input_expand_wrapper, re-wrap the given function the same
        way so both accept identically shaped input.

        :param function: The function to wrap.
        :type function: function
        :return: The wrapped function.
        :rtype: function
        """
        expand_wrapper = _FunctionWrapper._function_input_expand_wrapper
        if self._eval_function.__name__ == expand_wrapper.__name__:
            base_dims = self._eval_function.__self__._base_dims
            rewrapped = _FunctionWrapper(function, base_dims)
            function = rewrapped._function_input_expand_wrapper
        return function
73 | :type function: function 74 | :return: The wrapped function. 75 | :rtype: function 76 | """ 77 | eval_function = self._eval_function 78 | exp_wrapper = _FunctionWrapper._function_input_expand_wrapper 79 | exp_wrapper_name = exp_wrapper.__name__ 80 | if eval_function.__name__ == exp_wrapper_name: 81 | base_dims = eval_function.__self__._base_dims 82 | function_wrapper = _FunctionWrapper(function, base_dims) 83 | function = function_wrapper._function_input_expand_wrapper 84 | return function 85 | -------------------------------------------------------------------------------- /python/ml_wrappers/model/wrapped_classification_without_proba_model.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # --------------------------------------------------------- 4 | 5 | """Defines a class for wrapping classifiers without predict_proba.""" 6 | 7 | import numpy as np 8 | 9 | 10 | class WrappedClassificationWithoutProbaModel(object): 11 | """A class for wrapping a classifier without a predict_proba method. 12 | 13 | Note: the classifier may not output numeric values for its predictions. 14 | We generate a trival boolean version of predict_proba 15 | """ 16 | 17 | def __init__(self, model): 18 | """Initialize the WrappedClassificationWithoutProbaModel with the model.""" 19 | self._model = model 20 | # Create a map from classes to index 21 | self._classes_to_index = {} 22 | for index, i in enumerate(self._model.classes_): 23 | self._classes_to_index[i] = index 24 | self._num_classes = len(self._model.classes_) 25 | 26 | def predict(self, dataset): 27 | """Predict the output using the wrapped regression model. 28 | 29 | :param dataset: The dataset to predict on. 
class WrappedRegressionModel(BaseWrappedModel):
    """A class for wrapping a regression model."""

    def __init__(self, model, eval_function, examples=None):
        """Initialize the WrappedRegressionModel with the model and evaluation function."""
        super(WrappedRegressionModel, self).__init__(
            model, eval_function, examples, ModelTask.REGRESSION)

    def predict(self, dataset):
        """Predict the output using the wrapped regression model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The predicted values.
        """
        raw_predictions = self._eval_function(dataset)
        # Flatten DataFrame outputs into a one-dimensional array
        if isinstance(raw_predictions, pd.DataFrame):
            return raw_predictions.values.ravel()
        return raw_predictions
# Package identity and semantic version parts
name = 'ml_wrappers'
_major = '0'
_minor = '6'
_patch = '0'
# Assemble the dotted semantic version string
version = '.'.join((_major, _minor, _patch))
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Setup file for ml-wrappers package."""
import os
import shutil

from setuptools import find_packages, setup

# Execute version.py to pull `name` and `version` into this namespace
with open('ml_wrappers/version.py') as f:
    exec(compile(f.read(), f.name, 'exec'))

README_FILE = 'README.md'
LICENSE_FILE = 'LICENSE.txt'

# Note: used when generating the wheel but not on pip install of the package
if os.path.exists('../LICENSE'):
    shutil.copyfile('../LICENSE', LICENSE_FILE)


CLASSIFIERS = [
    'Development Status :: 5 - Production/Stable',
    'Intended Audience :: Developers',
    'Intended Audience :: Science/Research',
    'License :: OSI Approved :: MIT License',
    'Programming Language :: Python :: 3',
    'Programming Language :: Python :: 3.9',
    'Programming Language :: Python :: 3.10',
    'Programming Language :: Python :: 3.11',
    'Topic :: Scientific/Engineering :: Artificial Intelligence',
    'Operating System :: Microsoft :: Windows',
    'Operating System :: MacOS',
    'Operating System :: POSIX :: Linux'
]

DEPENDENCIES = [
    'numpy',
    'packaging',
    'pandas',
    'scipy',
    'scikit-learn'
]

# Long description comes straight from the README
with open(README_FILE, 'r', encoding='utf-8') as f:
    README = f.read()

setup(
    name=name,  # noqa: F821
    version=version,  # noqa: F821
    description='Machine Learning Wrappers SDK for Python',
    long_description=README,
    long_description_content_type='text/markdown',
    author='Microsoft Corp',
    author_email='ilmat@microsoft.com',
    license='MIT License',
    url='https://github.com/microsoft/ml-wrappers',
    classifiers=CLASSIFIERS,
    packages=find_packages(exclude=["*.tests"]),
    install_requires=DEPENDENCIES,
    zip_safe=False
)
-------------------------------------------------------------------------------- 1 | tensorflow 2 | azureml-automl-dnn-vision 3 | vision_explanation_methods -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | lightgbm 2 | xgboost 3 | catboost 4 | tensorflow 5 | # due to macos errors with old absl-py, remove with next absl-py release 6 | protobuf<5.26.0; platform_system == 'Darwin' 7 | shap 8 | transformers<4.40.0 9 | datasets 10 | raiutils 11 | fastai 12 | vision_explanation_methods 13 | mlflow 14 | joblib<1.3.0; python_version <= '3.7' 15 | scikeras 16 | openai; python_version >= '3.7' 17 | -------------------------------------------------------------------------------- /requirements-doc.txt: -------------------------------------------------------------------------------- 1 | sphinx==4.3.0 2 | pyyaml -------------------------------------------------------------------------------- /requirements-linting.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | flake8-bugbear 3 | flake8-blind-except 4 | flake8-breakpoint 5 | flake8-builtins 6 | flake8-logging-format 7 | flake8-pytest-style 8 | flake8-all-not-strings 9 | isort 10 | -------------------------------------------------------------------------------- /requirements-test.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov 3 | rai-test-utils==0.3.0 4 | -------------------------------------------------------------------------------- /tests/automl/test_automl_image_model_wrapper.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
@pytest.mark.usefixtures('_clean_dir')
class TestImageModelWrapper(object):
    """Tests wrap_model on AutoML image classification models."""

    # Skip for older versions of python as azureml-automl-dnn-vision
    # works with 3.9 only
    @pytest.mark.skipif(
        sys.version_info < (3, 9),
        reason=('azureml-automl-dnn-vision not supported '
                'for older versions of python'))
    @pytest.mark.skipif(
        sys.version_info >= (3, 10),
        reason=('azureml-automl-dnn-vision not supported '
                'for newer versions of python'))
    def test_wrap_automl_image_classification_model(self):
        data = load_fridge_dataset()
        model_name = ModelNames.SERESNEXT
        multilabel = False
        with tempfile.TemporaryDirectory() as tmp_output_dir:

            task_type = shared_constants.Tasks.IMAGE_CLASSIFICATION
            number_of_classes = 10
            model_wrapper = ModelFactory().get_model_wrapper(
                model_name,
                number_of_classes,
                multilabel=multilabel,
                device="cpu",
                distributed=False,
                local_rank=0)

            # mock for Mlflow model generation
            model_file = os.path.join(tmp_output_dir, "model.pt")
            torch.save({
                'model_name': model_name,
                'number_of_classes': number_of_classes,
                'model_state': copy.deepcopy(model_wrapper.state_dict()),
                'specs': {
                    'multilabel': model_wrapper.multilabel,
                    'model_settings': model_wrapper.model_settings,
                    'labels': model_wrapper.labels
                },

            }, model_file)
            settings_file = os.path.join(
                tmp_output_dir,
                shared_constants.MLFlowLiterals.MODEL_SETTINGS_FILENAME)
            remote_path = os.path.join(tmp_output_dir, "outputs")

            with open(settings_file, 'w') as f:
                json.dump({}, f)

            conda_env = {
                'channels': ['conda-forge', 'pytorch'],
                'dependencies': [
                    'python=3.9',
                    'numpy==1.26.4',
                    'pytorch==2.2.0',
                    'torchvision==0.17.2',
                    {'pip': ['azureml-automl-dnn-vision']}
                ],
                'name': 'azureml-automl-dnn-vision-env'
            }

            mlflow_model_wrapper = MLFlowImagesModelWrapper(
                model_settings={},
                task_type=task_type,
                scoring_method=_get_scoring_method(task_type)
            )
            print("Saving mlflow model at {}".format(remote_path))
            mlflow.pyfunc.save_model(
                path=remote_path,
                python_model=mlflow_model_wrapper,
                artifacts={"model": model_file,
                           "settings": settings_file},
                conda_env=conda_env,
                signature=_get_mlflow_signature(task_type))
            mlflow_model = mlflow.pyfunc.load_model(remote_path)

            # load the paths as base64 images
            data = load_base64_images(data)
            wrapped_model = wrap_model(
                mlflow_model, data, ModelTask.IMAGE_CLASSIFICATION)
            validate_wrapped_classification_model(wrapped_model, data)
def load_squad_dataset():
    """Load the SQuAD train split as a pandas DataFrame.

    :return: DataFrame with 'context', 'questions' and 'answers' columns,
        where 'answers' is the first answer text for each question.
    :rtype: pandas.DataFrame
    """
    dataset = datasets.load_dataset("squad", split="train")
    # Take the first answer text for each question
    answers = [row['text'][0] for row in dataset['answers']]
    # Column access pulls whole columns in one pass instead of the much
    # slower row-by-row iteration over the Arrow-backed dataset
    data = pd.DataFrame({'context': dataset['context'],
                         'questions': dataset['question'],
                         'answers': answers})
    return data
class FetchCovid19Model(object):
    """Downloads and extracts the covid19 events model archive."""

    def __init__(self):
        """Initialize the fetcher; no state is required."""
        pass

    def fetch(self):
        """Download the model zip archive and extract it locally."""
        archive_name = COVID19_EVENTS_MODEL_NAME + '.zip'
        url = ('https://publictestdatasets.blob.core.windows.net/models/' +
               COVID19_EVENTS_MODEL_NAME + '.zip')
        urlretrieve(url, archive_name)
        # Extract into a directory named after the model
        with zipfile.ZipFile(archive_name, 'r') as unzip:
            unzip.extractall(COVID19_EVENTS_MODEL_NAME)
@pytest.fixture
def _clean_dir():
    """Run the test inside a fresh temporary working directory.

    Restores the previous working directory and removes the temporary
    directory on teardown; the previous implementation leaked the
    mkdtemp directory and left the process chdir'd into it for the
    rest of the session.
    """
    old_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as new_path:
        print("tmp test directory: " + new_path)
        os.chdir(new_path)
        try:
            yield new_path
        finally:
            # chdir out before cleanup: a process cannot delete its
            # own cwd on Windows
            os.chdir(old_cwd)
@pytest.fixture(scope='session')
def wine():
    """Session-scoped fixture providing the wine classification dataset."""
    x_train, x_test, y_train, y_test, features, classes = create_wine_data()
    keys = (DatasetConstants.X_TRAIN, DatasetConstants.X_TEST,
            DatasetConstants.Y_TRAIN, DatasetConstants.Y_TEST,
            DatasetConstants.FEATURES, DatasetConstants.CLASSES)
    values = (x_train, x_test, y_train, y_test, features, classes)
    return dict(zip(keys, values))
@pytest.fixture(scope='session')
def housing():
    """Session-scoped fixture providing the housing regression dataset."""
    x_train, x_test, y_train, y_test, features = create_housing_data()
    keys = (DatasetConstants.X_TRAIN, DatasetConstants.X_TEST,
            DatasetConstants.Y_TRAIN, DatasetConstants.Y_TEST,
            DatasetConstants.FEATURES)
    return dict(zip(keys, (x_train, x_test, y_train, y_test, features)))
# Default text encoding used across the tests
UTF8 = 'utf-8'


class DatasetConstants(object):
    """Dataset related constants.

    Keys used by the dataset fixtures to label train/test splits,
    feature names and class labels.
    """
    CATEGORICAL = 'categorical'
    CLASSES = 'classes'
    FEATURES = 'features'
    NUMERIC = 'numeric'
    X_TEST = 'x_test'
    X_TRAIN = 'x_train'
    Y_TEST = 'y_test'
    Y_TRAIN = 'y_train'


class ModelType(object):
    """Model type constants.

    Identifiers for the kind of model under test.
    """
    XGBOOST = 'xgboost'
    TREE = 'tree'
    DEFAULT = 'default'
38 | assert np.array_equal(numpy_converted, test_array) 39 | 40 | test_series = test_dataframe.squeeze().reset_index(drop=True) 41 | wrapper = DatasetWrapper(dataset=test_series) 42 | series_converted = wrapper.typed_dataset 43 | assert_series_equal(series_converted, test_series, 44 | check_names=False) 45 | 46 | sparse_matrix = csr_matrix((3, 4), 47 | dtype=np.int8) 48 | wrapper = DatasetWrapper(dataset=sparse_matrix) 49 | sparse_matrix_converted = wrapper.typed_dataset 50 | assert_sparse_equal(sparse_matrix_converted, sparse_matrix) 51 | 52 | background = _summarize_data(test_dataframe.values) 53 | DatasetWrapper(dataset=background) 54 | 55 | torch_input = torch.rand(100, 3) 56 | wrapper = DatasetWrapper(dataset=torch_input) 57 | torch_converted = wrapper.typed_dataset 58 | assert torch.all(torch.eq(torch_converted, torch_input)) 59 | 60 | tensor_slices = (dict(test_dataframe), None) 61 | tf_batch_dataset = tf.data.Dataset.from_tensor_slices(tensor_slices).batch(32) 62 | wrapper = DatasetWrapper(dataset=tf_batch_dataset) 63 | tf_batch_dataset_converted = wrapper.typed_dataset 64 | assert_batch_equal(tf_batch_dataset_converted, tf_batch_dataset) 65 | 66 | def test_unsupported_types(self): 67 | test_dataframe = pd.DataFrame(data=[[1, 2, 3]], columns=['c1,', 'c2', 'c3']) 68 | test_array = test_dataframe.values 69 | test_list = test_array.tolist() 70 | 71 | with pytest.raises( 72 | TypeError, 73 | match='Got type which is not supported in DatasetWrapper'): 74 | DatasetWrapper(test_list) 75 | -------------------------------------------------------------------------------- /tests/main/test_endpoint_wrapper.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 
class MockRead():
    """Mock class for urllib.request.urlopen().read()"""

    def __init__(self, json_data, fail_read=False):
        """Initialize the MockRead class.

        :param json_data: The json data to return from the read method.
        :type json_data: str
        :param fail_read: If True, the first read call raises an HTTPError;
            subsequent calls succeed.
        :type fail_read: bool
        """
        self.json_data = json_data
        self.fail_read = fail_read

    def read(self):
        """Return the json data, simulating one transient server error.

        :return: The json data.
        :rtype: str
        """
        if not self.fail_read:
            return self.json_data
        # Fail only on the first call so that a retry succeeds
        self.fail_read = False
        raise urllib.error.HTTPError('url', 500, 'Internal Server Error', {}, None)
def mock_api_key_auto_refresh_method():
    """Mock method for auto refreshing the API key.

    :return: The mock API key.
    :rtype: str
    """
    refreshed_key = 'mock_key'
    return refreshed_key
Corporation. All rights reserved. 3 | # --------------------------------------------------------- 4 | 5 | """Tests for wrap_model function""" 6 | 7 | import sys 8 | 9 | import pandas as pd 10 | import pytest 11 | from common_utils import (create_catboost_classifier, 12 | create_catboost_regressor, 13 | create_fastai_tabular_classifier, 14 | create_fastai_tabular_classifier_multimetric, 15 | create_fastai_tabular_regressor, 16 | create_keras_classifier, create_keras_regressor, 17 | create_lightgbm_classifier, 18 | create_lightgbm_regressor, 19 | create_pytorch_multiclass_classifier, 20 | create_pytorch_regressor, 21 | create_scikit_keras_multiclass_classifier, 22 | create_scikit_keras_regressor, 23 | create_sklearn_linear_regressor, 24 | create_sklearn_logistic_regressor, create_tf_model, 25 | create_xgboost_classifier, create_xgboost_regressor) 26 | from constants import DatasetConstants 27 | from ml_wrappers import wrap_model 28 | from ml_wrappers.dataset.dataset_wrapper import DatasetWrapper 29 | from train_wrapper_utils import (train_classification_model_numpy, 30 | train_classification_model_pandas, 31 | train_regression_model_numpy, 32 | train_regression_model_pandas) 33 | from wrapper_validator import validate_wrapped_regression_model 34 | 35 | try: 36 | import tensorflow as tf 37 | except ImportError: 38 | pass 39 | 40 | 41 | @pytest.mark.usefixtures('_clean_dir') 42 | class TestModelWrapper(object): 43 | def test_wrap_sklearn_logistic_regression_model(self, iris): 44 | train_classification_model_numpy( 45 | create_sklearn_logistic_regressor, iris) 46 | train_classification_model_pandas( 47 | create_sklearn_logistic_regressor, iris) 48 | train_classification_model_numpy( 49 | create_sklearn_logistic_regressor, iris, 50 | use_dataset_wrapper=False) 51 | train_classification_model_pandas( 52 | create_sklearn_logistic_regressor, iris, 53 | use_dataset_wrapper=False) 54 | 55 | def test_wrap_pytorch_classification_model(self, iris): 56 | 
    # Skip for older versions due to latest fastai not supporting 3.6
    @pytest.mark.skipif(sys.version_info.minor <= 6,
                        reason='Fastai not supported for older versions')
    # Skip if using macos due to fastai failing on latest macos
    @pytest.mark.skipif(sys.platform == 'darwin',
                        reason='Fastai not supported for latest macos')
    def test_wrap_fastai_classification_model(self, iris):
        """Wrap a fastai tabular classifier trained on the iris dataset."""
        train_classification_model_pandas(create_fastai_tabular_classifier, iris)
fastai failing on latest macos 95 | @pytest.mark.skipif(sys.platform == 'darwin', 96 | reason='Fastai not supported for latest macos') 97 | def test_wrap_fastai_classification_model_multimetric(self, iris): 98 | iris = iris.copy() 99 | data_to_transform = [DatasetConstants.Y_TRAIN, DatasetConstants.Y_TEST] 100 | for data in data_to_transform: 101 | iris[data][iris[data] == 2] = 1 102 | train_classification_model_pandas( 103 | create_fastai_tabular_classifier_multimetric, iris, 104 | validate_single_row=True) 105 | 106 | def test_wrap_sklearn_linear_regression_model(self, housing): 107 | train_regression_model_numpy( 108 | create_sklearn_linear_regressor, housing) 109 | train_regression_model_pandas( 110 | create_sklearn_linear_regressor, housing) 111 | train_regression_model_numpy( 112 | create_sklearn_linear_regressor, housing, 113 | use_dataset_wrapper=False) 114 | train_regression_model_pandas( 115 | create_sklearn_linear_regressor, housing, 116 | use_dataset_wrapper=False) 117 | 118 | def test_wrap_pytorch_regression_model(self, housing): 119 | train_regression_model_numpy( 120 | create_pytorch_regressor, housing) 121 | 122 | def test_wrap_xgboost_regression_model(self, housing): 123 | train_regression_model_numpy(create_xgboost_regressor, housing) 124 | train_regression_model_pandas(create_xgboost_regressor, housing) 125 | 126 | def test_wrap_catboost_regression_model(self, housing): 127 | train_regression_model_numpy(create_catboost_regressor, housing) 128 | train_regression_model_pandas(create_catboost_regressor, housing) 129 | 130 | def test_wrap_lightgbm_regression_model(self, housing): 131 | train_regression_model_numpy(create_lightgbm_regressor, housing) 132 | train_regression_model_pandas(create_lightgbm_regressor, housing) 133 | 134 | def test_wrap_keras_regression_model(self, housing): 135 | train_regression_model_numpy(create_keras_regressor, housing) 136 | train_regression_model_pandas(create_keras_regressor, housing) 137 | 138 | def 
test_wrap_scikit_keras_regression_model(self, housing):
        """Wrap a scikit-learn-style Keras regressor; validate on numpy and pandas inputs."""
        train_regression_model_numpy(create_scikit_keras_regressor, housing)
        train_regression_model_pandas(create_scikit_keras_regressor, housing)

    # Skip for older versions due to latest fastai not supporting 3.6
    @pytest.mark.skipif(sys.version_info.minor <= 6,
                        reason='Fastai not supported for older versions')
    # Skip if using macos due to fastai failing on latest macos
    @pytest.mark.skipif(sys.platform == 'darwin',
                        reason='Fastai not supported for latest macos')
    def test_wrap_fastai_regression_model(self, iris):
        """Wrap a fastai tabular regressor; validate on pandas input.

        NOTE(review): this regression test uses the iris fixture, unlike the
        other regression tests which use housing — confirm this is intended.
        """
        train_regression_model_pandas(create_fastai_tabular_regressor, iris)

    def test_batch_dataset(self, housing):
        """Wrap a TF model trained on a batched tf.data.Dataset and validate it."""
        X_train = housing[DatasetConstants.X_TRAIN]
        X_test = housing[DatasetConstants.X_TEST]
        y_train = housing[DatasetConstants.Y_TRAIN]
        y_test = housing[DatasetConstants.Y_TEST]
        features = housing[DatasetConstants.FEATURES]
        X_train_df = pd.DataFrame(X_train, columns=list(features))
        X_test_df = pd.DataFrame(X_test, columns=list(features))
        # Build (features-dict, labels) tuples and batch them for TF training.
        inp = (dict(X_train_df), y_train)
        inp_ds = tf.data.Dataset.from_tensor_slices(inp).batch(32)
        val = (dict(X_test_df), y_test)
        val_ds = tf.data.Dataset.from_tensor_slices(val).batch(32)
        model = create_tf_model(inp_ds, val_ds, features)
        wrapped_dataset = DatasetWrapper(val_ds)
        wrapped_model = wrap_model(model, wrapped_dataset, model_task='regression')
        validate_wrapped_regression_model(wrapped_model, val_ds)
--------------------------------------------------------------------------------
/tests/main/test_pytorch_model_wrapper.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Tests for WrappedPytorchModel"""

import pytest
from common_utils import (create_pytorch_multiclass_classifier,
                          create_pytorch_regressor)
from ml_wrappers.common.constants import ModelTask
from ml_wrappers.model import WrappedPytorchModel
from train_wrapper_utils import (train_classification_model_numpy,
                                 train_regression_model_numpy)
from wrapper_validator import validate_wrapped_pytorch_model


@pytest.mark.usefixtures('_clean_dir')
class TestPytorchModelWrapper(object):
    def test_wrap_pytorch_classification_model(self, iris):
        """Train, wrap and validate a PyTorch multiclass classifier."""
        wrapped_init = wrapped_pytorch_model_initializer(
            create_pytorch_multiclass_classifier,
            model_task=ModelTask.CLASSIFICATION)
        train_classification_model_numpy(wrapped_init, iris)
        train_classification_model_numpy(wrapped_init, iris,
                                         use_dataset_wrapper=False)

    def test_wrap_pytorch_regression_model(self, housing):
        """Train, wrap and validate a PyTorch regressor."""
        wrapped_init = wrapped_pytorch_model_initializer(
            create_pytorch_regressor, model_task=ModelTask.REGRESSION)
        train_regression_model_numpy(
            wrapped_init, housing)


class PytorchModelInitializer():
    """Callable initializer that trains a model, wraps it in
    WrappedPytorchModel and validates the wrapper before returning it."""

    def __init__(self, model_initializer, model_task):
        # model_initializer: callable (X_train, y_train) -> fitted model
        self._model_initializer = model_initializer
        self._model_task = model_task

    def __call__(self, X_train, y_train):
        """Train via the stored initializer, wrap and validate the result."""
        fitted_model = self._model_initializer(X_train, y_train)
        wrapped_pytorch_model = WrappedPytorchModel(fitted_model)
        validate_wrapped_pytorch_model(wrapped_pytorch_model, X_train,
                                       self._model_task)
        return wrapped_pytorch_model


def wrapped_pytorch_model_initializer(model_initializer, model_task):
    """Return a PytorchModelInitializer for the given initializer and task."""
    return PytorchModelInitializer(model_initializer, model_task)
--------------------------------------------------------------------------------
/tests/main/test_text_model_wrapper.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Tests for wrap_model function on text-based models"""

import pytest
from common_text_utils import (EMOTION, create_multilabel_text_pipeline,
                               create_question_answering_pipeline,
                               create_text_classification_pipeline,
                               load_covid19_emergency_event_dataset,
                               load_emotion_dataset, load_squad_dataset)
from ml_wrappers import wrap_model
from ml_wrappers.common.constants import ModelTask
from wrapper_validator import (validate_wrapped_classification_model,
                               validate_wrapped_multilabel_model,
                               validate_wrapped_question_answering_model)


@pytest.mark.usefixtures('_clean_dir')
class TestTextModelWrapper(object):
    @pytest.mark.skip("Need to update wrapper as only text pairs now supported")
    def test_wrap_transformers_model(self):
        """Wrap a transformers text-classification pipeline and validate it."""
        emotion_data = load_emotion_dataset()
        # Drop the label column; the wrapper takes raw documents only.
        docs = emotion_data[:10].drop(columns=EMOTION).values.tolist()
        pred = create_text_classification_pipeline()
        wrapped_model = wrap_model(pred, docs, ModelTask.TEXT_CLASSIFICATION)
        validate_wrapped_classification_model(wrapped_model, docs)

    def test_wrap_question_answering_model(self):
        """Wrap a question-answering pipeline on SQuAD rows and validate it."""
        squad_data = load_squad_dataset()
        # Keep question/context columns; answers are the labels.
        docs = squad_data[:10].drop(columns=['answers'])
        pred = create_question_answering_pipeline()
        wrapped_model = wrap_model(pred, docs, ModelTask.QUESTION_ANSWERING)
        validate_wrapped_question_answering_model(wrapped_model, docs)

    def test_wrap_multilabel_model(self):
        """Wrap a multilabel text pipeline and validate label-count output."""
        covid19_data = load_covid19_emergency_event_dataset()
        docs = covid19_data[:10]['text'].values.tolist()
        pred = create_multilabel_text_pipeline()
        wrapped_model = wrap_model(
            pred, docs, ModelTask.MULTILABEL_TEXT_CLASSIFICATION)
        num_labels = pred.model.num_labels
        validate_wrapped_multilabel_model(wrapped_model, docs, num_labels)
--------------------------------------------------------------------------------
/tests/main/test_tf_model_wrapper.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Tests for WrappedTensorflowModel"""

import pytest
import tensorflow as tf
from common_utils import (create_keras_classifier, create_keras_regressor,
                          create_scikit_keras_regressor)
from ml_wrappers.common.constants import ModelTask
from ml_wrappers.model import WrappedTensorflowModel
from ml_wrappers.model.tensorflow_wrapper import is_sequential
from train_wrapper_utils import (train_classification_model_numpy,
                                 train_classification_model_pandas,
                                 train_regression_model_numpy,
                                 train_regression_model_pandas)
from wrapper_validator import validate_wrapped_tf_model


@pytest.mark.usefixtures('_clean_dir')
class TestTensorflowModelWrapper(object):
    def test_wrap_keras_classification_model(self, iris):
        """Train, wrap and validate a Keras classifier."""
        wrapped_init = wrapped_tensorflow_model_initializer(
            create_keras_classifier, model_task=ModelTask.CLASSIFICATION)
        train_classification_model_numpy(wrapped_init, iris)
        train_classification_model_pandas(wrapped_init, iris)

    def test_wrap_keras_regression_model(self, housing):
        """Train, wrap and validate a Keras regressor."""
        wrapped_init = wrapped_tensorflow_model_initializer(
            create_keras_regressor, model_task=ModelTask.REGRESSION)
        train_regression_model_numpy(wrapped_init, housing)
        train_regression_model_pandas(wrapped_init, housing)

    def test_wrap_scikit_keras_regression_model(self, housing):
        """Train, wrap and validate a scikit-learn-style Keras regressor."""
        wrapped_init =
wrapped_tensorflow_model_initializer(
            create_scikit_keras_regressor, model_task=ModelTask.REGRESSION)
        train_regression_model_numpy(wrapped_init, housing)
        train_regression_model_pandas(wrapped_init, housing)

    def test_validate_is_sequential(self):
        """is_sequential recognizes a tf.keras.Sequential instance."""
        sequential_layer = tf.keras.Sequential(layers=None, name=None)
        assert is_sequential(sequential_layer)


class TensorflowModelInitializer():
    """Callable initializer that trains a model, wraps it in
    WrappedTensorflowModel and validates the wrapper before returning it."""

    def __init__(self, model_initializer, model_task):
        # model_initializer: callable (X_train, y_train) -> fitted model
        self._model_initializer = model_initializer
        self._model_task = model_task

    def __call__(self, X_train, y_train):
        """Train via the stored initializer, wrap and validate the result."""
        fitted_model = self._model_initializer(X_train, y_train)
        wrapped_tf_model = WrappedTensorflowModel(fitted_model)
        validate_wrapped_tf_model(wrapped_tf_model, X_train, self._model_task)
        return wrapped_tf_model


def wrapped_tensorflow_model_initializer(model_initializer, model_task):
    """Return a TensorflowModelInitializer for the given initializer and task."""
    return TensorflowModelInitializer(model_initializer, model_task)
--------------------------------------------------------------------------------
/tests/main/test_timestamp_featurizer.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Tests for CustomTimestampFeaturizer."""

import pandas as pd
import pytest
from constants import DatasetConstants
from ml_wrappers.dataset import CustomTimestampFeaturizer
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from rai_test_utils.datasets.tabular import create_timeseries_data


@pytest.mark.usefixtures('_clean_dir')
class TestTimestampFeaturizer(object):

    def test_working(self):
        """Sanity check that the test module loads and runs."""
        assert True

    def test_no_timestamps(self, iris):
        """Transforming data with no timestamp columns is a no-op."""
        # create pandas dataframes without any timestamps
        x_train = pd.DataFrame(data=iris[DatasetConstants.X_TRAIN], columns=iris[DatasetConstants.FEATURES])
        x_test = pd.DataFrame(data=iris[DatasetConstants.X_TEST], columns=iris[DatasetConstants.FEATURES])
        featurizer = CustomTimestampFeaturizer(iris[DatasetConstants.FEATURES]).fit(x_train)
        result = featurizer.transform(x_test)
        # Assert result is same as before, pandas dataframe
        assert isinstance(result, pd.DataFrame)
        # Assert the result is the same as the original passed in data (no featurization was done)
        assert result.equals(x_test)

    @pytest.mark.parametrize(("sample_cnt_per_grain", "grains_dict"), [
        (240, {}),
        (20, {'fruit': ['apple', 'grape'], 'store': [100, 200, 50]})])
    def test_timestamp_featurization(self, sample_cnt_per_grain, grains_dict):
        """Featurizing timeseries data removes the timestamp column and adds features."""
        # create timeseries data
        X, _ = create_timeseries_data(sample_cnt_per_grain, 'time', 'y', grains_dict)
        original_cols = list(X.columns.values)
        # featurize and validate the timestamp column
        featurizer = CustomTimestampFeaturizer(original_cols).fit(X)
        result = featurizer.transform(X)
        # Form a temporary dataframe for validation
        tmp_result = pd.DataFrame(result)
        # Assert there are no timestamp columns
        assert ([column for column in tmp_result.columns if is_datetime(tmp_result[column])] == [])
        # Assert the expected number of columns: the single time column is
        # expanded into 6 featurized columns added to the original columns
        assert (result.shape[1] == len(original_cols) + 6)

    @pytest.mark.parametrize(("return_pandas"), [True, False])
    def test_separate_fit_with_no_features(self, return_pandas):
        """Separate fit/transform and fit_transform give the same featurized shape."""
        sample_cnt_per_grain = 20
        grains_dict = {'fruit': ['apple', 'grape'], 'store': [100, 200, 50]}
        # create timeseries data
        X, _ = create_timeseries_data(sample_cnt_per_grain, 'time', 'y', grains_dict)
        original_cols = list(X.columns.values)
        # featurize and validate the timestamp column as a separate fit call and fit_transform
        # Note: in this case we don't pass the feature names to the constructor
        ctf1 = CustomTimestampFeaturizer(return_pandas=return_pandas)
        ctf2 = CustomTimestampFeaturizer(return_pandas=return_pandas)
        ctf1.fit(X)
        result1 = ctf1.transform(X)
        result2 = ctf2.fit_transform(X)
        for result in [result1, result2]:
            if not return_pandas:
                assert not isinstance(result, pd.DataFrame)
                # Form a temporary dataframe for validation
                result = pd.DataFrame(result)
            # Assert there are no timestamp columns
            assert ([column for column in result.columns if is_datetime(result[column])] == [])
            # Assert the expected number of columns: the single time column is
            # expanded into 6 featurized columns added to the original columns
            assert (result.shape[1] == len(original_cols) + 6)
--------------------------------------------------------------------------------
/tests/minimal/test_minimal.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Tests minimal imports and functions from ml-wrappers"""

import pytest


@pytest.mark.usefixtures('_clean_dir')
# NOTE(review): class name has a typo ("Minial" -> "Minimal"); left unchanged
# here because renaming would alter the public identifier pytest discovers.
class TestMinialImports(object):
    def test_main_import(self):
        """Smoke test: the top-level ml_wrappers package imports cleanly."""
        import ml_wrappers  # noqa

    def test_import_wrap_model(self):
        """Smoke test: wrap_model is importable from the package root."""
        from ml_wrappers import wrap_model  # noqa

    def test_import_constants(self):
        """Smoke test: ModelTask is importable from common constants."""
        from ml_wrappers.common.constants import ModelTask  # noqa
--------------------------------------------------------------------------------
/tests/train_wrapper_utils.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Utilities for calling the wrap_model function and validating the results."""

import pandas as pd
from constants import DatasetConstants
from ml_wrappers import wrap_model
from ml_wrappers.common.constants import ModelTask
from ml_wrappers.dataset.dataset_wrapper import DatasetWrapper
from wrapper_validator import (validate_wrapped_classification_model,
                               validate_wrapped_regression_model)


def train_classification_model_numpy(model_initializer, dataset,
                                     use_dataset_wrapper=True):
    """Train a classifier on numpy arrays, wrap it, and validate the wrapper.

    :param model_initializer: Callable (X_train, y_train) -> fitted model.
    :param dataset: Dict keyed by DatasetConstants with train/test splits.
    :param use_dataset_wrapper: If True, also exercise the DatasetWrapper path.
    """
    X_train = dataset[DatasetConstants.X_TRAIN]
    X_test = dataset[DatasetConstants.X_TEST]
    y_train = dataset[DatasetConstants.Y_TRAIN]
    model = model_initializer(X_train, y_train)
    if use_dataset_wrapper:
        X_test_wrapped = DatasetWrapper(X_test)
    else:
        X_test_wrapped = X_test
    wrapped_model = wrap_model(model, X_test_wrapped,
                               model_task=ModelTask.CLASSIFICATION)
    validate_wrapped_classification_model(wrapped_model, X_test)


def train_classification_model_pandas(model_initializer, dataset,
                                      use_dataset_wrapper=True,
                                      validate_single_row=False):
    """Train a classifier on pandas DataFrames, wrap it, and validate the wrapper.

    :param model_initializer: Callable (X_train, y_train) -> fitted model.
    :param dataset: Dict keyed by DatasetConstants with train/test splits.
    :param use_dataset_wrapper: If True, also exercise the DatasetWrapper path.
    :param validate_single_row: If True, additionally validate on one test row.
    """
    X_train = pd.DataFrame(data=dataset[DatasetConstants.X_TRAIN],
                           columns=dataset[DatasetConstants.FEATURES])
    X_test = pd.DataFrame(data=dataset[DatasetConstants.X_TEST],
                          columns=dataset[DatasetConstants.FEATURES])
    y_train = dataset[DatasetConstants.Y_TRAIN]
    model = model_initializer(X_train, y_train)
    if use_dataset_wrapper:
        X_test_wrapped = DatasetWrapper(X_test)
    else:
        X_test_wrapped = X_test
    wrapped_model = wrap_model(model, X_test_wrapped,
                               model_task=ModelTask.CLASSIFICATION)
    if validate_single_row:
        validate_wrapped_classification_model(wrapped_model, X_test.iloc[0:1])
    validate_wrapped_classification_model(wrapped_model, X_test)


def train_regression_model_numpy(model_initializer, dataset,
                                 use_dataset_wrapper=True):
    """Train a regressor on numpy arrays, wrap it, and validate the wrapper.

    :param model_initializer: Callable (X_train, y_train) -> fitted model.
    :param dataset: Dict keyed by DatasetConstants with train/test splits.
    :param use_dataset_wrapper: If True, also exercise the DatasetWrapper path.
    """
    X_train = dataset[DatasetConstants.X_TRAIN]
    X_test = dataset[DatasetConstants.X_TEST]
    y_train = dataset[DatasetConstants.Y_TRAIN]
    model = model_initializer(X_train, y_train)
    if use_dataset_wrapper:
        X_test_wrapped = DatasetWrapper(X_test)
    else:
        X_test_wrapped = X_test
    wrapped_model = wrap_model(model, X_test_wrapped,
                               model_task=ModelTask.REGRESSION)
    validate_wrapped_regression_model(wrapped_model, X_test)


def train_regression_model_pandas(model_initializer, dataset,
                                  use_dataset_wrapper=True):
    """Train a regressor on pandas DataFrames, wrap it, and validate the wrapper.

    :param model_initializer: Callable (X_train, y_train) -> fitted model.
    :param dataset: Dict keyed by DatasetConstants with train/test splits.
    :param use_dataset_wrapper: If True, also exercise the DatasetWrapper path.
    """
    X_train = pd.DataFrame(data=dataset[DatasetConstants.X_TRAIN],
                           columns=dataset[DatasetConstants.FEATURES])
    X_test = pd.DataFrame(data=dataset[DatasetConstants.X_TEST],
                          columns=dataset[DatasetConstants.FEATURES])
    y_train = dataset[DatasetConstants.Y_TRAIN]
    model = model_initializer(X_train, y_train)
    if use_dataset_wrapper:
        X_test_wrapped = DatasetWrapper(X_test)
    else:
        X_test_wrapped = X_test
    wrapped_model = wrap_model(model, X_test_wrapped,
                               model_task=ModelTask.REGRESSION)
validate_wrapped_regression_model(wrapped_model, X_test)
--------------------------------------------------------------------------------
/tests/wrapper_validator.py:
--------------------------------------------------------------------------------
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Utilities for validating wrapped models."""

from ml_wrappers.common.constants import ModelTask, SKLearn
from ml_wrappers.model import WrappedPytorchModel, WrappedTensorflowModel

# Name of the extra predict_classes method exposed by the pytorch and
# tensorflow wrappers (see validate_wrapped_pred_classes_model below).
PREDICT_CLASSES = 'predict_classes'


def validate_wrapped_classification_model(wrapped_model, X_test):
    """Assert a wrapped classifier exposes and runs predict/predict_proba."""
    # validate wrapped model has predict and predict_proba functions
    function_names = [SKLearn.PREDICT, SKLearn.PREDICT_PROBA]
    validate_functions(wrapped_model, function_names)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    probabilities = wrapped_model.predict_proba(X_test)
    # validate predictions and probabilities have correct shape
    assert len(predictions.shape) == 1
    assert len(probabilities.shape) == 2


def validate_wrapped_object_detection_custom_model(wrapped_model, X_test, has_predict_proba=True):
    """Assert a wrapped custom object-detection model predicts 2 results.

    :param has_predict_proba: Whether predict_proba should also be validated.
    """
    # validate wrapped model has predict (and optionally predict_proba)
    function_names = [SKLearn.PREDICT, SKLearn.PREDICT_PROBA] \
        if has_predict_proba else [SKLearn.PREDICT]
    validate_functions(wrapped_model, function_names)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    # validate predictions and probabilities have correct shape
    # NOTE(review): the concatenated message is missing a space before "Got"
    # (renders as "...to be 2.Got N..."); left as-is since it is a runtime string.
    assert len(predictions) == 2, "Expected number of predictions to be 2." + \
        f"Got {len(predictions)} predictions"
    if has_predict_proba:
        probabilities = wrapped_model.predict_proba(X_test)
        assert len(probabilities) == 2, "Expected number of probabilities to be 2." + \
            f"Got {len(probabilities)} probabilities"


def validate_wrapped_object_detection_mlflow_drise_model(
        wrapped_model, X_test):
    """Assert a wrapped mlflow D-RISE object-detection model predicts 1 result."""
    # validate wrapped model has a predict function (predict_proba not required)
    function_names = [SKLearn.PREDICT]
    validate_functions(wrapped_model, function_names)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    # validate predictions have correct shape
    assert len(predictions) == 1


def validate_wrapped_object_detection_model(wrapped_model, X_test,
                                            num_predictions=3):
    """Assert a wrapped object-detection model predicts num_predictions results."""
    # validate wrapped model has predict and predict_proba functions
    function_names = [SKLearn.PREDICT, SKLearn.PREDICT_PROBA]
    validate_functions(wrapped_model, function_names)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    probabilities = wrapped_model.predict_proba(X_test)
    # validate predictions and probabilities have correct shape
    assert len(predictions) == num_predictions
    assert len(probabilities) == num_predictions


def validate_wrapped_multilabel_model(wrapped_model, X_test, num_labels):
    """Assert a wrapped multilabel classifier outputs num_labels columns."""
    # validate wrapped model has predict and predict_proba functions
    function_names = [SKLearn.PREDICT, SKLearn.PREDICT_PROBA]
    validate_functions(wrapped_model, function_names)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    probabilities = wrapped_model.predict_proba(X_test)
    # validate predictions and probabilities have correct shape
    assert len(predictions.shape) == 2
    assert len(probabilities.shape) == 2
    assert predictions.shape[1] == num_labels
    assert
probabilities.shape[1] == num_labels


def validate_wrapped_regression_model(wrapped_model, X_test):
    """Assert a wrapped regressor has predict but no predict_proba."""
    # validate wrapped model has predict function and NO predict_proba function
    assert hasattr(wrapped_model, SKLearn.PREDICT)
    assert not hasattr(wrapped_model, SKLearn.PREDICT_PROBA)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    # validate predictions have correct shape
    assert len(predictions.shape) == 1


def validate_wrapped_question_answering_model(wrapped_model, X_test):
    """Assert a wrapped QA model predicts one string answer per row."""
    # validate wrapped model has predict but NOT predict_proba
    assert hasattr(wrapped_model, SKLearn.PREDICT)
    assert not hasattr(wrapped_model, SKLearn.PREDICT_PROBA)
    # validate we can call the model on the dataset
    predictions = wrapped_model.predict(X_test)
    # validate predictions have correct shape
    assert len(predictions) == len(X_test)
    assert isinstance(predictions[0], str)


def validate_wrapped_tf_model(wrapped_tf_model, X_test, model_task):
    """Assert the tensorflow wrapper type and its predict_classes contract."""
    assert isinstance(wrapped_tf_model, WrappedTensorflowModel)
    validate_wrapped_pred_classes_model(wrapped_tf_model, X_test, model_task)


def validate_wrapped_pytorch_model(wrapped_pytorch_model, X_test, model_task):
    """Assert the pytorch wrapper type and its predict_classes contract."""
    assert isinstance(wrapped_pytorch_model, WrappedPytorchModel)
    validate_wrapped_pred_classes_model(
        wrapped_pytorch_model, X_test, model_task)


def validate_wrapped_pred_classes_model(wrapped_model, X_test, model_task):
    """Assert a wrapper exposes predict, predict_proba and predict_classes,
    and that the task-appropriate outputs have the expected shapes."""
    function_names = [SKLearn.PREDICT, SKLearn.PREDICT_PROBA, PREDICT_CLASSES]
    validate_functions(wrapped_model, function_names)
    # validate we can call the model on the dataset
    if model_task == ModelTask.CLASSIFICATION:
        probabilities = wrapped_model.predict_proba(X_test)
        predictions = wrapped_model.predict_classes(X_test)
        # validate predictions and probabilities have correct shape
        assert len(predictions.shape) == 1
        assert len(probabilities.shape) == 2
    else:
        predictions = wrapped_model.predict(X_test)
        # validate predictions have correct shape (1-D or a single column)
        assert len(predictions.shape) == 1 or predictions.shape[1] == 1


def validate_functions(wrapped_model, function_names):
    """Assert the wrapped model has every attribute named in function_names."""
    for function_name in function_names:
        assert hasattr(wrapped_model, function_name)
--------------------------------------------------------------------------------