├── .github └── workflows │ ├── formatting.yml │ ├── nv-a6000-fastgen.yml │ ├── nv-v100-legacy.yml │ ├── release.yml │ └── setup-venv │ └── action.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .style.yapf ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── docs ├── CNAME ├── Makefile ├── images │ ├── fast-gen-overview.png │ ├── fastgen-24-01-hero-dark.png │ ├── fastgen-24-01-hero-light.png │ ├── fastgen-arch-dark.png │ ├── fastgen-arch-light.png │ ├── fastgen-hero-dark.png │ ├── fastgen-hero-light.png │ ├── fastgen-hero.png │ ├── fastgen-overview-dark.png │ ├── fastgen-overview-light.png │ ├── mii-arch-dark.png │ ├── mii-arch-light.png │ ├── mii-dark.svg │ └── mii-white.svg ├── make.bat ├── requirements.txt └── source │ ├── api.rst │ ├── conf.py │ ├── config.rst │ ├── deployment.rst │ ├── index.rst │ ├── install.rst │ ├── parallelism.rst │ ├── pipeline.rst │ ├── quick-start.rst │ ├── replicas.rst │ ├── response.rst │ └── rest.rst ├── examples ├── README.md └── chat_templates │ └── template_alpaca.jinja ├── mii ├── __init__.py ├── aml_related │ ├── __init__.py │ ├── templates.py │ └── utils.py ├── api.py ├── backend │ ├── __init__.py │ ├── client.py │ └── server.py ├── batching │ ├── __init__.py │ ├── constants.py │ ├── data_classes.py │ ├── generation │ │ ├── __init__.py │ │ ├── logit_processors.py │ │ ├── samplers.py │ │ └── stop_criterion.py │ ├── postprocess.py │ ├── ragged_batching.py │ └── utils.py ├── config.py ├── constants.py ├── entrypoints │ ├── __init__.py │ ├── api_server.py │ ├── data_models.py │ └── openai_api_server.py ├── errors.py ├── grpc_related │ ├── __init__.py │ ├── modelresponse_server.py │ ├── proto │ │ ├── __init__.py │ │ ├── build_script.sh │ │ ├── modelresponse.proto │ │ ├── modelresponse_pb2.py │ │ └── modelresponse_pb2_grpc.py │ ├── restful_gateway.py │ └── task_methods.py ├── launch │ ├── __init__.py │ └── multi_gpu_server.py ├── legacy │ ├── README.md │ ├── __init__.py │ ├── aml_related │ │ ├── __init__.py │ │ ├── templates.py │ │ └── utils.py │ ├── client.py │ ├── config.py │ ├── constants.py │ ├── deployment.py │ ├── docs │ │ ├── CNAME │ │ ├── GPT-NeoX.md │ │ └── images │ │ │ ├── azure-cost.png │ │ │ ├── bert.png │ │ │ ├── bloom.png │ │ │ ├── gpt.png │ │ │ ├── hero-dark.png │ │ │ ├── hero-transparent.png │ │ │ ├── hero.png │ │ │ ├── llm-latency-sd-latency.png │ │ │ ├── mii-arch.png │ │ │ ├── mii-dark.svg │ │ │ ├── mii-white.svg │ │ │ ├── multi-gpu-latency.png │ │ │ ├── opt-bloom.png │ │ │ ├── opt.png │ │ │ ├── roberta.png │ │ │ ├── sd-hero-dark.png │ │ │ ├── sd-hero-light.png │ │ │ ├── sd-latency.png │ │ │ └── tput-llms.png │ ├── examples │ │ ├── aml │ │ │ ├── fill-mask-example.py │ │ │ ├── text-generation-bloom.py │ │ │ └── text-generation-bloom560m-example.py │ │ ├── benchmark │ │ │ └── txt2img │ │ │ │ ├── README.md │ │ │ │ ├── baseline-sd.py │ │ │ │ ├── mii-sd.py │ │ │ │ ├── requirements.txt │ │ │ │ └── utils.py │ │ ├── local │ │ │ ├── chat │ │ │ │ ├── README.md │ │ │ │ ├── chat-client-example.py │ │ │ │ └── chat-server-example.py │ │ │ ├── conversational-example.py │ │ │ ├── conversational-query-example.py │ │ │ ├── fill-mask-example.py │ │ │ ├── question-answering-example.py │ │ │ ├── question-answering-query-example.py │ │ │ ├── text-classification-example.py │ │ │ ├── text-classification-query-example.py │ │ │ ├── text-generation-bloom-example.py │ │ │ ├── text-generation-bloom560m-example.py │ │ │ ├── text-generation-fbopt-example.py │ │ │ ├── 
text-generation-query-example.py │ │ │ ├── text-generation-zero-example.py │ │ │ ├── token-classification-example.py │ │ │ ├── token-classification-query-example.py │ │ │ └── txt2img-example.py │ │ └── non_persistent │ │ │ └── text-generation-bloom560-example.py │ ├── grpc_related │ │ ├── __init__.py │ │ ├── modelresponse_server.py │ │ ├── proto │ │ │ ├── __init__.py │ │ │ ├── build_script.sh │ │ │ ├── legacymodelresponse.proto │ │ │ ├── legacymodelresponse_pb2.py │ │ │ └── legacymodelresponse_pb2_grpc.py │ │ └── restful_gateway.py │ ├── launch │ │ ├── __init__.py │ │ └── multi_gpu_server.py │ ├── logging.py │ ├── method_table.py │ ├── models │ │ ├── __init__.py │ │ ├── load_models.py │ │ ├── providers │ │ │ ├── __init__.py │ │ │ ├── diffusers.py │ │ │ ├── eleutherai.py │ │ │ ├── huggingface.py │ │ │ └── utils.py │ │ ├── score │ │ │ ├── __init__.py │ │ │ ├── generate.py │ │ │ └── score_template.py │ │ └── utils.py │ ├── server.py │ ├── terminate.py │ └── utils.py ├── logging.py ├── modeling │ ├── __init__.py │ ├── models.py │ └── tokenizers.py ├── score │ ├── __init__.py │ ├── generate.py │ └── score_template.py └── utils.py ├── pyproject.toml ├── release ├── bump_patch_version.py ├── check_release_version.py └── release.sh ├── requirements ├── requirements-dev.txt └── requirements.txt ├── scripts ├── check-license.py └── model_download.py ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── legacy │ ├── __init__.py │ ├── conftest.py │ ├── pytest.ini │ ├── test_config.py │ ├── test_deployment_options.py │ ├── test_local_deployment.py │ └── test_non_persistent_deployment.py ├── pytest.ini ├── test_arg_parsing.py ├── test_config.py ├── test_deployment.py ├── test_model_support.py ├── test_pipeline.py └── test_ragged_batching.py └── version.txt /.github/workflows/formatting.yml: -------------------------------------------------------------------------------- 1 | name: Formatting 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | '**' 8 | schedule: 9 | - cron: "0 0 * * *" 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | 17 | # formatting and basic install on cpu-only machine 18 | formatting: 19 | runs-on: ubuntu-22.04 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - name: environment 25 | run: | 26 | which python 27 | python --version 28 | 29 | - name: Install dependencies 30 | run: | 31 | grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install 32 | 33 | - name: Formatting checks 34 | run: | 35 | pre-commit run --all-files 36 | -------------------------------------------------------------------------------- /.github/workflows/nv-a6000-fastgen.yml: -------------------------------------------------------------------------------- 1 | name: nv-a6000-fastgen 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | pull_request: 8 | paths-ignore: 9 | - 'mii/legacy/**' 10 | - 'tests/legacy/**' 11 | - '.github/workflows/nv-v100-legacy.yml' 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | unit-tests: 19 | runs-on: [self-hosted, nvidia, a6000] 20 | container: 21 | image: nvcr.io/nvidia/pytorch:24.03-py3 22 | ports: 23 | - 80 24 | options: --gpus all --shm-size "8G" 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Check container state 30 | run: | 31 | ldd --version 32 | nvcc --version 33 | nvidia-smi 34 | python -c "import torch; 
print('torch:', torch.__version__, torch)" 35 | python -c "import torch; print('CUDA available:', torch.cuda.is_available())" 36 | - name: Install transformers 37 | run: | 38 | git clone --depth=1 https://github.com/huggingface/transformers 39 | cd transformers 40 | git rev-parse --short HEAD 41 | python -m pip install . 42 | - name: Install deepspeed 43 | run: | 44 | git clone --depth=1 https://github.com/deepspeedai/DeepSpeed 45 | cd DeepSpeed 46 | python -m pip install . 47 | ds_report 48 | - name: Install MII 49 | run: | 50 | pip install .[dev] 51 | - name: Python environment 52 | run: | 53 | python -m pip list 54 | - name: Unit tests 55 | run: | 56 | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch 57 | cd tests 58 | python -m pytest --color=yes --durations=0 --verbose -rF ./ 59 | -------------------------------------------------------------------------------- /.github/workflows/nv-v100-legacy.yml: -------------------------------------------------------------------------------- 1 | name: nv-v100-legacy 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | pull_request: 8 | paths: 9 | - 'mii/__init__.py' 10 | - 'mii/legacy/**' 11 | - 'tests/legacy/**' 12 | - '.github/workflows/nv-v100-legacy.yml' 13 | - 'requirements/**' 14 | - 'setup.py' 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | unit-tests: 22 | runs-on: [self-hosted, nvidia, cu121, v100] 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - id: setup-venv 28 | uses: ./.github/workflows/setup-venv 29 | 30 | - name: Install pytorch 31 | run: | 32 | pip3 install -U --cache-dir /blob/torch_cache/ torch --index-url https://download.pytorch.org/whl/cu121 33 | python -c "import torch; print('torch:', torch.__version__, torch)" 34 | python -c "import torch; print('CUDA available:', torch.cuda.is_available())" 35 | 36 | - name: Install dependencies 37 | run: | 38 | pip install git+https://github.com/deepspeedai/DeepSpeed.git 39 | pip install git+https://github.com/huggingface/transformers.git@v4.42.4 40 | pip install -U accelerate 41 | ds_report 42 | 43 | - name: Python environment 44 | run: | 45 | pip list 46 | 47 | - name: Install MII 48 | run: | 49 | pip install .[dev] 50 | 51 | - name: Unit tests 52 | run: | 53 | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch 54 | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi 55 | cd tests/legacy 56 | TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose ./ 57 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish DeepSpeed-MII release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-22.04 11 | environment: release-env 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | ref: "main" 17 | - name: Get release version from tag 18 | run: | 19 | echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV 20 | - name: Check release version 21 | run: | 22 | python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }} 23 | - name: Build DeepSpeed-MII 24 | run: | 25 | pip install build 26 | MII_BUILD_STRING=" " python -m build --wheel 27 | - name: Publish to PyPI 28 | uses: pypa/gh-action-pypi-publish@release/v1 29 | with: 30 | 
password: ${{ secrets.PYPI_API_TOKEN }} 31 | repository-url: https://upload.pypi.org/legacy/ 32 | - name: Bump version 33 | run: | 34 | python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }} 35 | - name: Create Pull Request 36 | uses: peter-evans/create-pull-request@v6 37 | with: 38 | token: ${{ secrets.GH_PAT }} 39 | add-paths: | 40 | version.txt 41 | body: | 42 | **Auto-generated PR to update version.txt after a DeepSpeed release** 43 | Released version - ${{ env.RELEASE_VERSION }} 44 | Author - @${{ github.actor }} 45 | branch: AutoPR/${{ env.RELEASE_VERSION }} 46 | assignees: ${{ github.actor }} 47 | title: "Update version.txt after ${{ env.RELEASE_VERSION }} release" 48 | author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com> 49 | -------------------------------------------------------------------------------- /.github/workflows/setup-venv/action.yml: -------------------------------------------------------------------------------- 1 | name: Create Virtual Environment 2 | 3 | runs: 4 | using: "composite" 5 | steps: 6 | - id: update-env 7 | run: | 8 | sudo apt-get update 9 | sudo apt-get install -y libaio-dev 10 | python -m pip install --user --upgrade pip 11 | python -m pip install --user --upgrade virtualenv 12 | shell: bash 13 | - id: create-venv 14 | run: | 15 | python -m venv unit-test-venv 16 | source ./unit-test-venv/bin/activate 17 | python -m pip install --upgrade pip 18 | pip install wheel # required after pip>=23.1 19 | echo PATH=$PATH >> $GITHUB_ENV # Make it so venv is inherited for other steps 20 | shell: bash 21 | - id: print-env 22 | run: | 23 | which python 24 | python --version 25 | if [[ -z "${AISC_NODE_INSTANCE_ID}" ]]; then 26 | echo "Not on self-hosted node" 27 | else 28 | echo "JobID: ${AISC_NODE_INSTANCE_ID}" 29 | fi 30 | if command -v nvidia-smi; then 31 | nvidia-smi 32 | which nvcc 33 | nvcc --version 34 | elif command -v rocm-smi; then 35 | rocm-smi --showhw 36 | which hipcc 37 | hipcc --version 38 | fi 39 | shell: bash 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | *.log 4 | *~ 5 | .idea 6 | build 7 | dist 8 | *.so 9 | *.egg-info 10 | build.txt 11 | .vscode 12 | .theia 13 | .cache 14 | __pycache__ 15 | mii/version.py 16 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: meta 3 | hooks: 4 | - id: check-hooks-apply 5 | - id: check-useless-excludes 6 | 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.0.1 9 | hooks: 10 | - id: check-case-conflict 11 | - id: check-yaml 12 | - id: destroyed-symlinks 13 | - id: end-of-file-fixer 14 | exclude: docs/CNAME 15 | - id: fix-byte-order-marker 16 | - id: fix-encoding-pragma 17 | args: [--remove] 18 | - id: mixed-line-ending 19 | args: [--fix=lf] 20 | - id: requirements-txt-fixer 21 | - id: trailing-whitespace 22 | 23 | - repo: https://github.com/pre-commit/mirrors-yapf 24 | rev: v0.31.0 25 | hooks: 26 | - id: yapf 27 | 28 | - repo: https://github.com/codespell-project/codespell 29 | rev: v2.1.0 30 | hooks: 31 | - id: codespell 32 | args: [ 33 | # Do not check files that are automatically generated 34 | '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json', 35 | '--ignore-regex=\\n', # Do not count the 'n' in an 
escaped newline as part of a word 36 | '--ignore-words-list=unsupport,aks', # Word used in error messages that need rewording 37 | --check-filenames, 38 | --check-hidden 39 | ] 40 | 41 | - repo: https://github.com/pycqa/flake8 42 | rev: 4.0.1 43 | hooks: 44 | - id: flake8 45 | args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401,mii/grpc_related/proto/modelresponse_pb2.py:F821,F401,mii/legacy/grpc_related/proto/legacymodelresponse_pb2.py:F821,F401'] 46 | 47 | - repo: local 48 | hooks: 49 | - id: check-license 50 | name: check-license 51 | entry: ./scripts/check-license.py 52 | language: script 53 | files: \.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr|sh)$ 54 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.10" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements.txt 11 | 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | SPLIT_ALL_COMMA_SEPARATED_VALUES = true 3 | COLUMN_LIMIT = 89 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @tohtana @tjruwase @loadams 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | DeepSpeed-MII welcomes your contributions! 3 | 4 | ## Prerequisites 5 | We use [pre-commit](https://pre-commit.com/) to ensure that formatting is 6 | consistent across DeepSpeed. First, ensure that `pre-commit` is installed from either 7 | installing DeepSpeed or `pip install pre-commit`. Next, the pre-commit hooks must be 8 | installed once before commits can be made: 9 | ```bash 10 | pre-commit install 11 | ``` 12 | 13 | Afterwards, our suite of formatting tests run automatically before each `git commit`. You 14 | can also run these manually: 15 | ```bash 16 | pre-commit run --all-files 17 | ``` 18 | If a formatting test fails, it will fix the modified code in place and abort 19 | the `git commit`. After looking over the changes, you can `git add ` 20 | and then repeat the previous `git commit` command. 21 | 22 | ## Developer Certificate of Origin 23 | This project welcomes contributions and suggestions. 
All contributions to deepspeedai projects 24 | require commits to be signed off with a [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin) 25 | (DCO) declaring that you have the right to, and actually do, grant us the rights to use your contribution. 26 | 27 | When you submit a pull request, the DCO app will check for the presence of signed commits. 28 | Information about how this check works is here: https://github.com/dcoapp/app?tab=readme-ov-file#how-it-works 29 | 30 | ## Code of Conduct 31 | This project has adopted the [Microsoft Open Source Code of 32 | Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the 33 | [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact 34 | [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or 35 | comments. 36 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/CNAME -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/images/fast-gen-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fast-gen-overview.png -------------------------------------------------------------------------------- /docs/images/fastgen-24-01-hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-24-01-hero-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-24-01-hero-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-24-01-hero-light.png -------------------------------------------------------------------------------- /docs/images/fastgen-arch-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-arch-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-arch-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-arch-light.png -------------------------------------------------------------------------------- /docs/images/fastgen-hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-hero-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-hero-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-hero-light.png -------------------------------------------------------------------------------- /docs/images/fastgen-hero.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-hero.png -------------------------------------------------------------------------------- /docs/images/fastgen-overview-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-overview-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-overview-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-overview-light.png -------------------------------------------------------------------------------- 
/docs/images/mii-arch-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/mii-arch-dark.png -------------------------------------------------------------------------------- /docs/images/mii-arch-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/mii-arch-light.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | asyncio 2 | autodoc_pydantic>=2.0.0 3 | deepspeed>=0.15.0 4 | grpcio 5 | grpcio-tools 6 | sphinx==7.1.2 7 | sphinx-prompt 8 | sphinx-rtd-theme==1.3.0rc1 9 | sphinx_autodoc_typehints 10 | sphinx_copybutton 11 | torch 12 | transformers 13 | ujson 14 | zmq 15 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | DeepSpeed-MII provides a very simple API to deploy your LLM: 5 | 6 | .. autofunction:: mii.pipeline 7 | 8 | The :func:`mii.pipeline` API is a great way to try DeepSpeed-MII with ragged 9 | batching and dynamic splitfuse. The pipeline is non-persistent and only exists 10 | for the lifetime of the python script where it is used. For examples of how to 11 | use :func:`mii.pipeline` please see :doc:`pipeline`. 12 | 13 | .. autofunction:: mii.serve 14 | 15 | The :func:`mii.serve` API is intended for production use cases, where a 16 | persistent model deployment is necessary. The persistent deployment utilizes 17 | ragged batching and dynamic splitfuse to deliver high throughput and low latency 18 | to multiple clients in parallel. For examples of how to use :func:`mii.serve` 19 | please see :doc:`deployment`. 20 | 21 | .. autofunction:: mii.client 22 | 23 | The :func:`mii.client` API allows multiple processes to connect to a persistent 24 | deployment created with :func:`mii.serve`. For examples of how to use 25 | :func:`mii.client` please see :doc:`deployment`. 
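For orientation, the short sketch below simply combines the examples from :doc:`pipeline` and :doc:`deployment` to show how the three APIs relate. It is illustrative only (in practice you would typically use either the pipeline or the persistent deployment, not both in one script) and uses the same ``mistralai/Mistral-7B-v0.1`` model as the rest of these docs:

.. code-block:: python

   import mii

   # Non-persistent pipeline: exists only for the lifetime of this script.
   pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")
   print(pipe(["DeepSpeed is"], max_new_tokens=128))

   # Persistent deployment: starts a gRPC server and returns a client.
   client = mii.serve("mistralai/Mistral-7B-v0.1")
   print(client(["Seattle is"], max_new_tokens=128))

   # Any other process can attach to the running deployment...
   client = mii.client("mistralai/Mistral-7B-v0.1")
   # ...and shut it down when finished.
   client.terminate_server()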
26 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | # Configuration file for the Sphinx documentation builder. 6 | import os 7 | import sys 8 | 9 | sys.path.insert(0, os.path.abspath('../../')) 10 | 11 | # -- Project information 12 | 13 | project = 'DeepSpeed-MII' 14 | copyright = '2023, Microsoft' 15 | author = 'Microsoft' 16 | 17 | with open("../../version.txt", "r") as f: 18 | release = f.readline().rstrip() 19 | 20 | # -- General configuration 21 | 22 | extensions = [ 23 | 'sphinx.ext.duration', 24 | 'sphinx.ext.doctest', 25 | 'sphinx.ext.autodoc', 26 | 'sphinx.ext.autosummary', 27 | 'sphinx.ext.intersphinx', 28 | 'sphinx.ext.viewcode', 29 | 'sphinx_autodoc_typehints', 30 | 'sphinx_copybutton', 31 | 'sphinx-prompt', 32 | 'sphinxcontrib.autodoc_pydantic', 33 | ] 34 | 35 | intersphinx_mapping = { 36 | 'python': ('https://docs.python.org/3/', 37 | None), 38 | 'sphinx': ('https://www.sphinx-doc.org/en/master/', 39 | None), 40 | } 41 | intersphinx_disabled_domains = ['std'] 42 | 43 | # sphinx_autodoc_typehints config 44 | typehints_defaults = "braces" 45 | 46 | # autodoc_pyandtic config 47 | autodoc_pydantic_model_show_field_summary = False 48 | autodoc_pydantic_field_signature_prefix = ' ' 49 | autodoc_pydantic_model_signature_prefix = 'class' 50 | autodoc_pydantic_model_show_json = False 51 | autodoc_pydantic_model_show_config_summary = False 52 | autodoc_pydantic_model_show_config_member = False 53 | autodoc_pydantic_model_show_validator_summary = False 54 | autodoc_pydantic_model_show_validator_members = False 55 | autodoc_pydantic_model_summary_list_order = 'bysource' 56 | autodoc_pydantic_model_member_order = 'bysource' 57 | autodoc_pydantic_field_list_validators = False 58 | 59 | # sphinx_copybutton config 60 | copybutton_prompt_text = r">>> |\$ |\(.venv\) \$ " 61 | copybutton_prompt_is_regexp = True 62 | 63 | #autodoc_mock_imports = ["deepspeed", "torch"] 64 | autodoc_member_order = 'bysource' 65 | autosummary_generate = True 66 | 67 | templates_path = ['_templates'] 68 | 69 | # -- Options for HTML output 70 | 71 | html_theme = 'sphinx_rtd_theme' 72 | html_theme_options = { 73 | "logo_only": True, 74 | } 75 | html_logo = "../images/mii-dark.svg" 76 | logo_only = True 77 | 78 | # -- Options for EPUB output 79 | epub_show_urls = 'footnote' 80 | -------------------------------------------------------------------------------- /docs/source/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | 4 | The config classes described here are used to customize :doc:`pipeline` and :doc:`deployment`. 5 | 6 | .. _model_configuration: 7 | 8 | Model Configuration 9 | ------------------- 10 | 11 | The :class:`ModelConfig ` is used to stand up a 12 | DeepSpeed inference engine and provides a large amount of control to users. This 13 | class is automatically generated from user-provided arguments to 14 | :func:`mii.pipeline` and :func:`mii.serve`. The fields can be provided in a 15 | ``model_config`` dictionary or as keyword arguments. 16 | 17 | For example, to change the default ``max_length`` for token generation of a 18 | pipeline, the following are equivalent: 19 | 20 | As a keyword argument: 21 | 22 | .. 
code-block:: python 23 | 24 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", max_length=2048) 25 | 26 | As a ``model_config`` dictionary: 27 | 28 | .. code-block:: python 29 | 30 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", model_config={"max_length": 2048}) 31 | 32 | .. autopydantic_model:: mii.config.ModelConfig 33 | 34 | .. _mii_configuration: 35 | 36 | MII Server Configuration 37 | ------------------------ 38 | 39 | The :class:`MIIConfig ` is used to stand up a 40 | DeepSpeed-MII `gRPC `_ server and provide a large amount of 41 | control to users. This class is automatically generated from user-provided 42 | arguments to :func:`mii.serve`. The fields can be provided in a ``mii_config`` 43 | dictionary or as keyword arguments. 44 | 45 | For example, to change the base port number used to communicate with a 46 | persistent deployment and the default ``max_length`` for token generation, the 47 | following are equivalent: 48 | 49 | As keyword arguments: 50 | 51 | .. code-block:: python 52 | 53 | client = mii.serve("mistralai/Mistral-7B-v0.1", port_number=50055, max_length=2048) 54 | 55 | As ``model_config`` and ``mii_config`` dictionaries: 56 | 57 | .. code-block:: python 58 | 59 | client = mii.serve("mistralai/Mistral-7B-v0.1", mii_config={"port_number": 50055}, model_config={"max_length": 2048}) 60 | 61 | .. autopydantic_model:: mii.config.MIIConfig 62 | 63 | Text-Generation Configuration 64 | ----------------------------- 65 | 66 | The :class:`GenerateParamsConfig ` is used to 67 | process user-provided keyword arguments passed to :class:`MIIPipeline 68 | ` and :class:`MIIClient 69 | ` when doing text-generation. 70 | 71 | .. autopydantic_model:: mii.config.GenerateParamsConfig 72 | :exclude-members: prompt_length 73 | -------------------------------------------------------------------------------- /docs/source/deployment.rst: -------------------------------------------------------------------------------- 1 | Persistent Deployments 2 | ====================== 3 | 4 | A persistent model deployment can be created with the :func:`mii.serve` API. This 5 | stands up a gRPC server and returns a :class:`MIIClient 6 | ` object that can be used to send generation 7 | requests to the inference server. The inference server will persist after the 8 | python script exits and until it is explicitly terminated. 9 | 10 | To connect to an existing deployment, the :func:`mii.client` API is used. This 11 | will connect with an existing gRPC server and return a :class:`MIIClient 12 | ` object. 13 | 14 | MIIClient 15 | --------- 16 | 17 | .. autoclass:: 18 | mii.backend.client.MIIClient 19 | 20 | .. automethod:: __call__ 21 | 22 | .. automethod:: generate 23 | 24 | .. automethod:: terminate_server 25 | 26 | :class:`MIIClient ` is a callable class that 27 | provides a simplified interface for generating text for prompt inputs on a 28 | persistent model deployment. To create a persistent deployment, you must only 29 | provide the HuggingFace model name (or path to a locally stored model) to the 30 | :func:`mii.serve` API. DeepSpeed-MII will automatically load the model weights, 31 | create an inference engine, stand up a gRPC server, and return the callable 32 | client. An example is provided below: 33 | 34 | ..
code-block:: python 35 | 36 | import mii 37 | client = mii.serve("mistralai/Mistral-7B-v0.1") 38 | response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 39 | print(response) 40 | 41 | Because the deployment is persistent, this server will continue running until it 42 | is explicitly shut down. This allows users to connect to a deployment from other 43 | processes using the :func:`mii.client` API: 44 | 45 | .. code-block:: python 46 | 47 | import mii 48 | client = mii.client("mistralai/Mistral-7B-v0.1") 49 | response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 50 | print(response) 51 | 52 | When a server needs to be shut down, this can be done from any client object: 53 | 54 | .. code-block:: python 55 | 56 | import mii 57 | client = mii.client("mistralai/Mistral-7B-v0.1") 58 | client.terminate_server() 59 | 60 | Deployment Configuration 61 | ------------------------ 62 | 63 | While we prioritize offering a simple interface for loading models into 64 | production-ready persistent deployments, we also provide many configuration 65 | options for our persistent deployment. 66 | 67 | **Any of the fields in** :class:`ModelConfig ` **and** 68 | :class:`MIIConfig ` **can be passed as keyword 69 | arguments or in respective** ``model_config`` **and** ``mii_config`` 70 | **dictionaries to the** :func:`mii.serve` **API. Please see** :ref:`Model 71 | Configuration ` **and** :ref:`MII Server Configuration 72 | ` **for more information.** 73 | 74 | 75 | Generate Options 76 | ---------------- 77 | 78 | Text-generation behavior using the callable :class:`MIIClient 79 | ` class can be customized with several keyword 80 | arguments. A full list of the available options can be found in 81 | :class:`GenerateParamsConfig `. 82 | 83 | The generate options affect only the prompt(s) passed in a given call to the client. 84 | For example, the generation length can be controlled on a per-prompt basis and 85 | override the default ``max_length``: 86 | 87 | .. code-block:: python 88 | 89 | response_long = client(prompt, max_length=1024) 90 | response_short = client(prompt, max_length=128) 91 | 92 | .. _deployment_model_parallelism: 93 | 94 | Model Parallelism 95 | ----------------- 96 | 97 | Our persistent deployment supports splitting models across multiple GPUs using 98 | tensor parallelism. To enable model parallelism, pass the ``tensor_parallel`` 99 | argument to :func:`mii.serve`: 100 | 101 | .. code-block:: python 102 | 103 | client = mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=2) 104 | 105 | .. _deployment_model_replicas: 106 | 107 | Model Replicas 108 | -------------- 109 | 110 | The persistent deployment can also create multiple model replicas. Passing the 111 | ``replica_num`` argument to :func:`mii.serve` enables this feature: 112 | 113 | .. code-block:: python 114 | 115 | client = mii.serve("mistralai/Mistral-7B-v0.1", replica_num=2) 116 | 117 | With multiple model replicas, the incoming requests from clients will be 118 | forwarded to the replicas in a round-robin fashion by an intermediate 119 | load-balancer process. For example, if 4 requests with ids ``0, 1, 2, 3`` are 120 | sent to the persistent deployment, then ``replica 0`` will process requests 121 | ``0`` and ``2`` while ``replica 1`` will process requests ``1`` and ``3``. 122 | 123 | Model replicas also compose with model parallelism. For example, 2 replicas can 124 | be created, each split across 2 GPUs, on a system with 4 GPUs total: 125 | 126 | ..
code-block:: python 127 | 128 | client = mii.serve("mistralai/Mistral-7B-v0.1", replica_num=2, tensor_parallel=2) 129 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | DeepSpeed-MII 2 | ============= 3 | 4 | .. image:: ../images/mii-white.svg 5 | :width: 600 6 | 7 | .. note:: 8 | 9 | This project is under active development. 10 | 11 | 12 | Introducing MII, an open-source Python library designed by DeepSpeed to 13 | democratize powerful model inference with a focus on high-throughput, low 14 | latency, and cost-effectiveness. 15 | 16 | MII v0.1 introduced several features as part of our `DeepSpeed-FastGen release 17 | `_ 18 | such as blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor 19 | parallelism, and high-performance CUDA kernels to support fast high throughput 20 | text-generation with LLMs. The latest version of MII delivers up to 2.5 times 21 | higher effective throughput compared to leading systems such as vLLM. For 22 | detailed performance results please see our `DeepSpeed-FastGen release blog 23 | `_ 24 | and the `latest DeepSpeed-FastGen blog 25 | `_. 26 | 27 | MII-Legacy 28 | ---------- 29 | 30 | We first `announced MII `_ in 31 | 2022. Since then, MII has undergone a large refactoring effort to bring support 32 | of DeepSpeed-FastGen. MII-Legacy, which covers all prior releases up to v0.0.9, 33 | provides support for running inference for a wide variety of language model 34 | tasks. We also support accelerating `text2image models like Stable Diffusion 35 | `_. 36 | For more details on our previous releases please see our `legacy APIs 37 | `_. 38 | 39 | 40 | Contents 41 | -------- 42 | 43 | .. toctree:: 44 | :maxdepth: 1 45 | 46 | quick-start 47 | install 48 | api 49 | pipeline 50 | deployment 51 | response 52 | config 53 | rest 54 | parallelism 55 | replicas 56 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | PyPI Install 5 | ------------ 6 | 7 | The quickest way to get started with DeepSpeed-MII is to install it from `PyPI 8 | `_ using pip: 9 | 10 | .. code-block:: console 11 | 12 | (.venv) $ pip install deepspeed-mii 13 | 14 | Source Install 15 | -------------- 16 | 17 | If you want the latest changes on the ``main`` repository branch, you can use 18 | pip to install from source: 19 | 20 | .. code-block:: console 21 | 22 | (.venv) $ pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git 23 | 24 | Or you can clone the repository and install: 25 | 26 | .. code-block:: console 27 | 28 | (.venv) $ git clone https://github.com/deepspeedai/DeepSpeed-MII.git 29 | (.venv) $ pip install ./DeepSpeed-MII 30 | -------------------------------------------------------------------------------- /docs/source/parallelism.rst: -------------------------------------------------------------------------------- 1 | Model Parallelism 2 | ================= 3 | 4 | DeepSpeed-MII supports model parallelism via tensor parallelism for splitting models across multiple GPUs. 5 | 6 | For model parallelism with :doc:`pipeline`, please see :ref:`Pipeline Model 7 | Parallelism `. 8 | 9 | For model parallelism with :doc:`deployment`, please see :ref:`Persistent 10 | Deployment Model Parallelism `. 
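For quick reference, the persistent-deployment form (covered in detail in the section linked above) looks like this:

.. code-block:: python

   import mii

   # Split the model across 2 GPUs with tensor parallelism.
   client = mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=2)

The non-persistent pipeline instead relies on the ``deepspeed`` launcher (e.g., ``deepspeed --num_gpus 2 example.py``) to control the number of devices, as described in the pipeline section linked above.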
11 | -------------------------------------------------------------------------------- /docs/source/pipeline.rst: -------------------------------------------------------------------------------- 1 | Non-Persistent Pipelines 2 | ======================== 3 | 4 | A non-persistent pipeline can be created with the :func:`mii.pipeline` API. This 5 | returns a non-persistent :class:`MIIPipeline 6 | ` object that is destroyed when the 7 | python script exits. 8 | 9 | MIIPipeline 10 | ----------- 11 | 12 | .. autoclass:: 13 | mii.batching.ragged_batching.MIIPipeline 14 | 15 | .. automethod:: __call__ 16 | 17 | :class:`MIIPipeline ` is a callable 18 | class that provides a simplified interface for generating text for prompt 19 | inputs. To create a pipeline, you must only provide the HuggingFace model name 20 | (or path to a locally stored model) to the :func:`mii.pipeline` API. 21 | DeepSpeed-MII will automatically load the model weights, create an inference 22 | engine, and return the callable pipeline. A simple 4-line example is provided below: 23 | 24 | .. code-block:: python 25 | 26 | import mii 27 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 28 | response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 29 | print(response) 30 | 31 | Pipeline Configuration 32 | ---------------------- 33 | 34 | While we prioritize offering a simple interface to load models and run 35 | text-generation, we also provide many configuration options for users that want 36 | to customize the pipeline. 37 | 38 | **Any of the fields in** :class:`ModelConfig ` **can be 39 | passed as keyword arguments or in a** ``model_config`` **dictionary to the** 40 | :func:`mii.pipeline` **API. Please see** :ref:`Model Configuration 41 | ` **for more information.** 42 | 43 | Generate Options 44 | ---------------- 45 | 46 | The text-generation behavior of the callable :class:`MIIPipeline 47 | ` class can be modified with several 48 | keyword arguments. A full list of the available options can be found in 49 | :class:`GenerateParamsConfig `. 50 | 51 | The generate options affect only the prompt(s) passed in a given call to the 52 | pipeline. For example, you can control per-prompt generation length: 53 | 54 | .. code-block:: python 55 | 56 | response_long = pipeline(prompt, max_length=1024) 57 | response_short = pipeline(prompt, max_length=128) 58 | 59 | .. _pipeline_model_parallelism: 60 | 61 | Model Parallelism 62 | ----------------- 63 | 64 | Our pipeline object supports splitting models across multiple GPUs using tensor 65 | parallelism. You must use the ``deepspeed`` launcher to enable tensor parallelism 66 | with the non-persistent pipeline, where the number of devices is controlled by 67 | the ``--num_gpus `` option. 68 | 69 | As an example, consider the following ``example.py`` python script: 70 | 71 | .. code-block:: python 72 | 73 | # example.py 74 | import mii 75 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 76 | 77 | To run this pipeline on a single GPU, use ``python`` or ``deepspeed --num_gpus 1``: 78 | 79 | .. code-block:: console 80 | 81 | (.venv) $ python example.py 82 | 83 | To enable tensor parallelism across 2 GPUs, use ``deepspeed --num_gpus 2``: 84 | 85 | .. code-block:: console 86 | 87 | (.venv) $ deepspeed --num_gpus 2 example.py 88 | 89 | Because the ``deepspeed`` launcher will run multiple processes of 90 | ``example.py``, anything in the script will be executed by each process. For 91 | example, consider the following script: 92 | 93 | ..
code-block:: python 94 | 95 | # example.py 96 | import os 97 | import mii 98 | local_rank = int(os.getenv("LOCAL_RANK", 0)) 99 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 100 | response = pipe("DeepSpeed is", max_length=16) 101 | print(f"rank {local_rank} response: {response}") 102 | 103 | By default, the response is returned to only the rank 0 process. When run 104 | with ``deepspeed --num_gpus 2 example.py`` the following output is produced: 105 | 106 | .. code-block:: console 107 | 108 | (.venv) $ deepspeed --num_gpus 2 example.py 109 | rank 0 response: [a library for parallelizing and accelerating PyTorch.] 110 | rank 1 response: [] 111 | 112 | This behavior can be changed by enabling ``all_rank_output`` when creating the 113 | pipeline (i.e., ``pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", 114 | all_rank_output=True)``): 115 | 116 | .. code-block:: console 117 | 118 | (.venv) $ deepspeed --num_gpus 2 example.py 119 | rank 0 response: [a library for parallelizing and accelerating PyTorch.] 120 | rank 1 response: [a library for parallelizing and accelerating PyTorch.] 121 | -------------------------------------------------------------------------------- /docs/source/quick-start.rst: -------------------------------------------------------------------------------- 1 | FastGen Quick Start Guide 2 | ========================= 3 | 4 | This guide is aimed at getting you quickly up and running with DeepSpeed-MII and DeepSpeed-FastGen. 5 | 6 | Requirements 7 | ------------ 8 | 9 | - 1 or more NVIDIA GPUs with >=sm_80 compute capability (e.g., A100, A6000) 10 | - `PyTorch `_ installed in your local Python environment 11 | 12 | Install 13 | ------- 14 | 15 | Install the latest version of DeepSpeed-MII with the following: 16 | 17 | .. code-block:: console 18 | 19 | (.venv) $ pip install -U deepspeed-mii 20 | 21 | Run a Non-Persistent Pipeline 22 | ----------------------------- 23 | 24 | A pipeline provides a non-persistent instance of the model for running 25 | inference. When the script running this code exits, the model will also be 26 | destroyed. The pipeline is ideal for doing quick tests or in cases where the 27 | best performance is not necessary. 28 | 29 | Copy the following code block into an ``example.py`` file on your local machine. 30 | Run it with ``deepspeed --num_gpus example.py``. 31 | 32 | .. code-block:: python 33 | 34 | import mii 35 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 36 | response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 37 | for r in response: 38 | print(r.generated_text) 39 | 40 | .. note:: 41 | 42 | Depending on your internet connection, the download of model weights could 43 | take a few minutes. If you wish to try a smaller model, replace 44 | ``mistralai/Mistral-7B-v0.1`` with ``facebook/opt-125m`` in the above code. 45 | 46 | If the code successfully runs, you should see the generated text printed in your terminal. 47 | 48 | Run a Persistent Deployment 49 | --------------------------- 50 | 51 | In contrast to the pipeline, deployments create a server process that persists 52 | beyond the execution of the python script. These deployments are intended for 53 | production use cases and allow for multiple clients to connect while providing 54 | the best performance from DeepSpeed-FastGen. 55 | 56 | Copy the following code block into a ``serve.py`` file on your local machine. 57 | Run it with ``python serve.py``. 58 | 59 | ..
code-block:: python 60 | 61 | import mii 62 | mii.serve("mistralai/Mistral-7B-v0.1") 63 | 64 | You should see logging messages indicating the server is starting and a final 65 | log message of ``server has started on ports [50051]``. 66 | 67 | Now copy the following code block into a ``client.py`` file on your local 68 | machine. Run it with ``python client.py``. 69 | 70 | .. code-block:: python 71 | 72 | import mii 73 | client = mii.client("mistralai/Mistral-7B-v0.1") 74 | response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 75 | for r in response: 76 | print(r.generated_text) 77 | 78 | If the code successfully runs, you should see the generated text printed in your 79 | terminal. You can run this client script as many times (and from as many 80 | different processes) as you like and the model deployment will remain active. 81 | 82 | Finally copy the following code block into a ``terminate.py`` file on your local 83 | machine. Run it with ``python terminate.py``. 84 | 85 | .. code-block:: python 86 | 87 | import mii 88 | client = mii.client("mistralai/Mistral-7B-v0.1") 89 | client.terminate_server() 90 | 91 | This will shutdown the model deployment and free GPU memory. 92 | -------------------------------------------------------------------------------- /docs/source/replicas.rst: -------------------------------------------------------------------------------- 1 | Model Replicas 2 | ============== 3 | 4 | DeepSpeed-MII supports creating multiple replicas of a model with 5 | :doc:`deployment`. Please see :ref:`Persistent Deployment Model Replicas 6 | `. 7 | -------------------------------------------------------------------------------- /docs/source/response.rst: -------------------------------------------------------------------------------- 1 | Response Objects 2 | ================ 3 | 4 | Generated text from :doc:`pipeline` and :doc:`deployment` are wrapped in the 5 | :class:`Response ` class. 6 | 7 | .. autoclass:: 8 | mii.batching.data_classes.Response 9 | :members: 10 | 11 | Printing a :class:`Response ` object will 12 | print only the ``generated_text`` attribute. Details about the generation can be 13 | accessed as python attributes of the class: 14 | 15 | .. code-block:: python 16 | 17 | responses = pipeline(["DeepSpeed is", "Seattle is"], max_length=128) 18 | for r in responses: 19 | print(f"generated length: {r.generated_length}, finish reason: {r.finish_reason}") 20 | 21 | The reason that a text-generation request completed will be one of the values 22 | found in the :class:`GenerationFinishReason 23 | ` enum: 24 | 25 | .. autoclass:: 26 | mii.constants.GenerationFinishReason 27 | :inherited-members: 28 | -------------------------------------------------------------------------------- /docs/source/rest.rst: -------------------------------------------------------------------------------- 1 | RESTful API 2 | =========== 3 | 4 | With a :doc:`deployment`, a RESTful API can be created. This allows users to 5 | send requests to the server via ``HTTP POST`` methods (e.g., using ``curl`` or 6 | the Python ``requests`` module). The RESTful API can be enabled with the 7 | ``enable_restful_api`` option using :func:`mii.serve`: 8 | 9 | .. 
code-block:: python 10 | 11 | client = mii.serve( 12 | "mistralai/Mistral-7B-v0.1", 13 | deployment_name="test_dep", 14 | enable_restful_api=True, 15 | restful_api_port=28080, 16 | ) 17 | 18 | It is useful to provide a ``deployment_name`` and ``restful_api_port`` when 19 | enabling the RESTful API as it will be used to provide an address where requests 20 | can be sent. The address for sending requests will be 21 | ``http://{HOST}:{RESTFUL_API_PORT}/mii/{DEPLOYMENT_NAME}``. In the above 22 | example, this will be ``http://localhost:28080/mii/test_dep``. 23 | 24 | To send a request to the RESTful API, use the ``HTTP POST`` method. For example, using ``curl``: 25 | 26 | .. code-block:: console 27 | 28 | (.venv) $ curl --header "Content-Type: application/json" --request POST -d '{"prompts": ["DeepSpeed is", "Seattle is"], "max_length": 128}' http://localhost:28080/mii/test_dep 29 | 30 | or using the Python ``requests`` module: 31 | 32 | .. code-block:: python 33 | 34 | import json 35 | import requests 36 | url = f"http://localhost:28080/mii/test_dep" 37 | params = {"prompts": ["DeepSpeed is", "Seattle is"], "max_length": 128} 38 | json_params = json.dumps(params) 39 | output = requests.post( 40 | url, data=json_params, headers={"Content-Type": "application/json"} 41 | ) 42 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # MII Examples 2 | Please see [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII. 3 | -------------------------------------------------------------------------------- /examples/chat_templates/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} 30 | -------------------------------------------------------------------------------- /mii/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .api import client, serve, pipeline 6 | 7 | from .legacy import MIIServer, MIIClient, mii_query_handle, deploy, terminate, DeploymentType, TaskType, aml_output_path, MIIConfig, ModelConfig, get_supported_models 8 | 9 | __version__ = "0.0.0" 10 | try: 11 | from .version import __version__ 12 | except ImportError: 13 | pass 14 | -------------------------------------------------------------------------------- /mii/aml_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .templates import * 6 | from .utils import get_acr_name, generate_aml_scripts, aml_output_path 7 | -------------------------------------------------------------------------------- /mii/backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .client import MIIClient 6 | from .server import MIIServer 7 | -------------------------------------------------------------------------------- /mii/backend/client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import asyncio 6 | import grpc 7 | import requests 8 | from typing import Dict, Any, Callable, List, Union 9 | 10 | from mii.batching.data_classes import Response 11 | from mii.config import MIIConfig 12 | from mii.constants import GRPC_MAX_MSG_SIZE 13 | from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc 14 | from mii.grpc_related.task_methods import TASK_METHODS_DICT 15 | 16 | 17 | def create_channel(host, port): 18 | return grpc.aio.insecure_channel( 19 | f"{host}:{port}", 20 | options=[ 21 | ("grpc.max_send_message_length", 22 | GRPC_MAX_MSG_SIZE), 23 | ("grpc.max_receive_message_length", 24 | GRPC_MAX_MSG_SIZE), 25 | ], 26 | ) 27 | 28 | 29 | class MIIClient: 30 | """ 31 | Client for sending generation requests to a persistent deployment created 32 | with :func:`mii.serve`. Use :func:`mii.client` to create an instance of this 33 | class. 34 | 35 | :param mii_config: MII config for the persistent deployment to connect with. 36 | :param host: hostname where the persistent deployment is running. 37 | """ 38 | def __init__(self, mii_config: MIIConfig, host: str = "localhost") -> None: 39 | self.mii_config = mii_config 40 | self.task = mii_config.model_conf.task 41 | self.port = mii_config.port_number 42 | self.asyncio_loop = asyncio.get_event_loop() 43 | channel = create_channel(host, self.port) 44 | # This stub allows interaction the client to send/receive messages with 45 | # the load balancer process 46 | self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) 47 | 48 | def __call__(self, *args, **kwargs) -> List[Response]: 49 | """ 50 | All args and kwargs get passed directly to 51 | :meth:`~mii.backend.client.MIIClient.generate`. 52 | 53 | :return: A list of :class:`Response` objects containing the generated 54 | text for all prompts. 
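        A minimal usage sketch (assumes a persistent deployment of this model
        was already started with ``mii.serve``; the model name shown is only an
        example):

        .. code-block:: python

            client = mii.client("mistralai/Mistral-7B-v0.1")
            responses = client(["DeepSpeed is"], max_new_tokens=64)
            print(responses[0].generated_text)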
55 | """ 56 | return self.generate(*args, **kwargs) 57 | 58 | async def _request_async_response(self, prompts, **query_kwargs): 59 | task_methods = TASK_METHODS_DICT[self.task] 60 | proto_request = task_methods.pack_request_to_proto(prompts, **query_kwargs) 61 | proto_response = await getattr(self.stub, task_methods.method)(proto_request) 62 | return task_methods.unpack_response_from_proto(proto_response) 63 | 64 | async def _request_async_response_stream(self, prompts, **query_kwargs): 65 | task_methods = TASK_METHODS_DICT[self.task] 66 | proto_request = task_methods.pack_request_to_proto(prompts, **query_kwargs) 67 | assert hasattr(task_methods, "method_stream_out"), f"{self.task} does not support streaming response" 68 | async for response in getattr(self.stub, 69 | task_methods.method_stream_out)(proto_request): 70 | yield task_methods.unpack_response_from_proto(response) 71 | 72 | def generate(self, 73 | prompts: Union[str, 74 | List[str]], 75 | streaming_fn: Callable = None, 76 | **generate_kwargs: Dict) -> List[Response]: 77 | """ 78 | Generates text for the given prompts. 79 | 80 | :param prompts: The string or list of strings used as prompts for generation. 81 | :param streaming_fn: Streaming support is currently a WIP. 82 | :param \\*\\*generate_kwargs: Generation keywords. A full list can be found here. 83 | 84 | :return: A list of :class:`Response` objects containing the generated 85 | text for all prompts. 86 | """ # noqa: W605 87 | if isinstance(prompts, str): 88 | prompts = [prompts] 89 | if streaming_fn is not None: 90 | if len(prompts) > 1: 91 | raise RuntimeError( 92 | "MII client streaming only supports a single prompt input.") 93 | generate_kwargs["stream"] = True 94 | return self._generate_stream(streaming_fn, prompts, **generate_kwargs) 95 | 96 | return self.asyncio_loop.run_until_complete( 97 | self._request_async_response(prompts, 98 | **generate_kwargs)) 99 | 100 | def _generate_stream(self, 101 | callback, 102 | prompts: List[str], 103 | **query_kwargs: Dict[str, 104 | Any]) -> None: 105 | async def put_result(): 106 | response_stream = self._request_async_response_stream( 107 | prompts, 108 | **query_kwargs) 109 | 110 | while True: 111 | try: 112 | response = await response_stream.__anext__() 113 | callback(response) 114 | except StopAsyncIteration: 115 | break 116 | 117 | self.asyncio_loop.run_until_complete(put_result()) 118 | 119 | async def terminate_async(self) -> None: 120 | await self.stub.Terminate( 121 | modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) 122 | 123 | def terminate_server(self) -> None: 124 | """ 125 | Terminates the persistent deployment server. This can be called from any 126 | client. 127 | """ 128 | self.asyncio_loop.run_until_complete(self.terminate_async()) 129 | if self.mii_config.enable_restful_api: 130 | requests.get( 131 | f"http://localhost:{self.mii_config.restful_api_port}/terminate") 132 | -------------------------------------------------------------------------------- /mii/batching/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from .ragged_batching import MIIAsyncPipeline, MIIPipeline 7 | -------------------------------------------------------------------------------- /mii/batching/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # Processing method key names 7 | TOP_K_NAME = "TopK" 8 | TOP_P_NAME = "TopP" 9 | TEMP_NAME = "Temp" 10 | SAMPLER_NAME = "Sampler" 11 | STOP_NAME = "Stop" 12 | -------------------------------------------------------------------------------- /mii/batching/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/batching/generation/logit_processors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import abc 6 | from typing import List, Optional 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | FLOAT_PAD = -float("inf") 12 | 13 | 14 | class BaseLogitProcessor(abc.ABC): 15 | def __call__(self, logits: torch.Tensor) -> torch.Tensor: 16 | return self.forward(logits) 17 | 18 | @abc.abstractmethod 19 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 20 | ... 21 | 22 | def get_key(self) -> str: 23 | return self.__class__.__name__ 24 | 25 | 26 | class TopKLogitProcessor(BaseLogitProcessor): 27 | def __init__(self, top_k: int) -> None: 28 | self.top_k = top_k 29 | 30 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 31 | # Remove all tokens with a probability less than the 32 | # last token of the top-k 33 | indices_to_remove = logits < torch.topk(logits, self.top_k)[0][..., -1, None] 34 | logits[indices_to_remove] = FLOAT_PAD 35 | return logits 36 | 37 | def get_key(self) -> str: 38 | return super().get_key() + f"_top_k={self.top_k}" 39 | 40 | 41 | class TopPLogitProcessor(BaseLogitProcessor): 42 | def __init__(self, top_p: float) -> None: 43 | assert 0.0 <= top_p <= 1.0 44 | self.top_p = top_p 45 | 46 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 47 | # convert to 1D 48 | sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) 49 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 50 | 51 | # Remove tokens with cumulative probability above the threshold 52 | sorted_indices_to_remove = cumulative_probs > self.top_p 53 | # Shift the indices to the right to keep also the first token 54 | # above the threshold 55 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 56 | sorted_indices_to_remove[..., 0] = 0 57 | 58 | indices_to_remove = sorted_indices_to_remove.scatter(1, 59 | sorted_indices, 60 | sorted_indices_to_remove) 61 | return logits.masked_fill(indices_to_remove, FLOAT_PAD) 62 | 63 | def get_key(self) -> str: 64 | return super().get_key() + f"_top_p={self.top_p}" 65 | 66 | 67 | class TemperatureLogitProcessor(BaseLogitProcessor): 68 | def __init__(self, temperature: float) -> None: 69 | self.temperature = temperature 70 | assert self.temperature > 0.0 71 | 72 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 73 | return logits / self.temperature 74 | 75 | def get_key(self) -> str: 76 | return super().get_key() + f"_temperature={self.temperature}" 77 | 78 | 79 | class PipelineLogitProcessor(BaseLogitProcessor): 80 | def __init__(self, pipeline: List[BaseLogitProcessor]) -> None: 81 | assert all(isinstance(step, BaseLogitProcessor) for step in pipeline) 82 | self.pipeline = pipeline 83 | 84 | def 
forward(self, logits: torch.Tensor) -> torch.Tensor: 85 | for step in self.pipeline: 86 | logits = step(logits) 87 | return logits 88 | 89 | def get_key(self) -> str: 90 | return super().get_key( 91 | ) + f"_{'_'.join(step.get_key() for step in self.pipeline)}" 92 | 93 | 94 | class NucleusSamplingLogitProcessor(BaseLogitProcessor): 95 | def __init__(self, 96 | top_k: Optional[int] = None, 97 | top_p: Optional[float] = None) -> None: 98 | assert top_k is not None or top_p is not None 99 | if top_k is None: 100 | self._processor = TopPLogitProcessor(top_p) 101 | elif top_p is None: 102 | self._processor = TopKLogitProcessor(top_k) 103 | else: 104 | self._processor = PipelineLogitProcessor( 105 | [TopKLogitProcessor(top_k), 106 | TopPLogitProcessor(top_p)]) 107 | 108 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 109 | return self._processor(logits) 110 | 111 | def get_key(self) -> str: 112 | return super().get_key() + f"_{self._processor.get_key()}" 113 | -------------------------------------------------------------------------------- /mii/batching/generation/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import abc 6 | from typing import Tuple 7 | 8 | import torch 9 | from torch.distributions import Categorical 10 | 11 | 12 | class BaseGenerationSampler(abc.ABC): 13 | @abc.abstractmethod 14 | def __call__( 15 | self, 16 | logits: torch.Tensor, 17 | ) -> Tuple[torch.LongTensor, 18 | torch.Tensor]: 19 | """ 20 | Given the logits, return the next token to add to the sequence, as well 21 | as the log probability of the token 22 | 23 | Args: 24 | logits (torch.Tensor): 25 | The logits from the model. Shape: (batch_size, vocab_size) 26 | 27 | Returns: 28 | Tuple[torch.LongTensor, torch.Tensor]: 29 | The next token to add to the sequence, and the log probability 30 | of the token. Shape: (batch_size,) and (batch_size,) 31 | """ 32 | ... 33 | 34 | def get_key(self) -> str: 35 | return self.__class__.__name__ 36 | 37 | 38 | class LogitsSampler(BaseGenerationSampler): 39 | def __call__( 40 | self, 41 | logits: torch.Tensor, 42 | ) -> Tuple[torch.LongTensor, 43 | torch.Tensor]: 44 | logits = logits.float() 45 | sampler = Categorical(logits=logits) 46 | next_tokens = sampler.sample() 47 | #logprobs = sampler.log_prob(next_tokens) 48 | return next_tokens #, logprobs 49 | 50 | 51 | class GreedySampler(BaseGenerationSampler): 52 | def __call__(self, logits: torch.Tensor) -> Tuple[torch.LongTensor, torch.Tensor]: 53 | logits = logits.float() 54 | #sampler = Categorical(logits=logits) 55 | next_tokens = logits.argmax(dim=-1) 56 | #logprobs = sampler.log_prob(next_tokens) 57 | return next_tokens #, logprobs 58 | -------------------------------------------------------------------------------- /mii/batching/generation/stop_criterion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import abc 6 | from typing import List, Union 7 | 8 | import torch 9 | 10 | # from megatron import get_tokenizer 11 | # from megatron.tokenizer.tokenizer import AbstractTokenizer 12 | 13 | 14 | class BaseGenerationStopCriterion(abc.ABC): 15 | def __init__(self, tokenizer): 16 | self.tokenizer = tokenizer 17 | 18 | def __call__(self, tokens: torch.LongTensor) -> torch.BoolTensor: 19 | return self.forward(tokens) 20 | 21 | @abc.abstractmethod 22 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 23 | ... 24 | 25 | def get_key(self) -> str: 26 | return self.__class__.__name__ 27 | 28 | 29 | class TokenStopCriterion(BaseGenerationStopCriterion): 30 | def __init__(self, token: Union[str, int], tokenizer) -> None: 31 | super().__init__(tokenizer=tokenizer) 32 | if isinstance(token, str): 33 | token_id = self.tokenizer.convert_tokens_to_ids(token) 34 | else: 35 | token_id = token 36 | self.stop_token_id = token_id 37 | 38 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 39 | retval = torch.zeros_like(tokens, dtype=torch.bool) 40 | retval |= tokens == self.stop_token_id 41 | return retval 42 | 43 | def get_key(self) -> str: 44 | return self.__class__.__name__ + f"_token_id={self.stop_token_id}" 45 | 46 | 47 | class EosGenerationStopCriterion(BaseGenerationStopCriterion): 48 | def __init__(self, tokenizer): 49 | super().__init__(tokenizer=tokenizer) 50 | if hasattr(self.tokenizer, "eod"): 51 | self.eos_id = self.tokenizer.eod 52 | elif hasattr(self.tokenizer, "eos_token_id"): 53 | self.eos_id = self.tokenizer.eos_token_id 54 | elif hasattr(self.tokenizer, "eos_token"): 55 | self.eos_id = self.tokenizer.eos_token 56 | else: 57 | raise ValueError( 58 | "Tokenizer must have either an `eod` or `eos_token` attribute.") 59 | 60 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 61 | return tokens == self.eos_id 62 | 63 | 64 | class NewLineDelimitedStopCriterion(BaseGenerationStopCriterion): 65 | def __init__(self, tokenizer): 66 | super().__init__(tokenizer=tokenizer) 67 | self.stop_token_ids = list( 68 | set([self.tokenizer.tokenize(x)[0] for x in ["\n", 69 | "\r\n", 70 | "\n\n", 71 | ".\n\n"]])) 72 | 73 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 74 | retval = torch.zeros_like(tokens, dtype=torch.bool) 75 | for stop_token_id in self.stop_token_ids: 76 | retval |= tokens == stop_token_id 77 | return retval 78 | 79 | 80 | class PipelinedCriterion(BaseGenerationStopCriterion): 81 | def __init__( 82 | self, 83 | criteria: List[BaseGenerationStopCriterion], 84 | tokenizer, 85 | ): 86 | super().__init__(tokenizer=tokenizer) 87 | self.criteria = criteria 88 | 89 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 90 | retval = torch.zeros_like(tokens, dtype=torch.bool) 91 | for criterion in self.criteria: 92 | retval |= criterion(tokens) 93 | return retval 94 | 95 | def get_key(self) -> str: 96 | return super().get_key( 97 | ) + f"_{'_'.join(criterion.get_key() for criterion in self.criteria)}" 98 | -------------------------------------------------------------------------------- /mii/batching/postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from typing import TYPE_CHECKING, Any, Dict, List 6 | 7 | import torch 8 | 9 | if TYPE_CHECKING: 10 | from mii.batching.ragged_batching import RaggedRequestBatch 11 | 12 | 13 | def run_batch_processing(input_tensor: torch.Tensor, 14 | requests: "RaggedRequestBatch", 15 | processor_fns: Dict[str, 16 | Any]) -> torch.Tensor: 17 | """ 18 | Runs the post-processing steps for batched requests. If we apply the 19 | post-processing one-by-one for each request performance takes a big hit. 20 | Instead, we identify all the requests that need to be processed by a given 21 | post-processor, sampler, etc. and perform the action on a batch of requests. 22 | """ 23 | idx_list: List[int] = [] 24 | output_list: List[torch.Tensor] = [] 25 | 26 | # Apply all the post-processing functions 27 | for key, process_fn in processor_fns.items(): 28 | 29 | # Get the index of tensors that need to be processed 30 | idx = [i for i, r in enumerate(requests) if key in r.post_processing] 31 | if not idx: 32 | # Short circuit if there is not work to do 33 | continue 34 | 35 | # Run post processing on the filtered inputs 36 | filtered_input = input_tensor[idx] 37 | idx_list.extend(idx) 38 | output_list.append(process_fn(filtered_input)) 39 | 40 | # If there was no work done, return the input tensor 41 | if not output_list: 42 | return input_tensor 43 | 44 | # If there are unprocessed requests, append them to the output 45 | unprocessed_idx = list(set(range(len(requests))).difference(idx_list)) 46 | if unprocessed_idx: 47 | idx_list.append(unprocessed_idx) 48 | output_list.append(input_tensor[unprocessed_idx]) 49 | 50 | # Concatenate and return the output 51 | output = torch.cat(output_list, dim=0) 52 | return output[torch.argsort(torch.tensor(idx_list))] 53 | 54 | 55 | def run_batch_logit_processing(input_logits: torch.Tensor, 56 | requests: "RaggedRequestBatch", 57 | processor_map: Dict[str, 58 | Any]) -> torch.Tensor: 59 | top_k_fns = {k: v for k, v in processor_map.items() if "TopK" in k} 60 | top_p_fns = {k: v for k, v in processor_map.items() if "TopP" in k} 61 | temp_fns = {k: v for k, v in processor_map.items() if "Temp" in k} 62 | 63 | # Apply TopK, TopP, and Temperature in sequence 64 | output_logits = input_logits 65 | for fns in (top_k_fns, top_p_fns, temp_fns): 66 | output_logits = run_batch_processing(output_logits, requests, fns) 67 | return output_logits 68 | 69 | 70 | def run_batch_sampler(input_logits: torch.Tensor, 71 | requests: "RaggedRequestBatch", 72 | processor_map: Dict[str, 73 | Any]) -> torch.Tensor: 74 | sampler_fns = {k: v for k, v in processor_map.items() if "Sampler" in k} 75 | next_tokens = run_batch_processing(input_logits, requests, sampler_fns) 76 | return next_tokens 77 | 78 | 79 | def run_batch_stop_criterion(next_tokens: torch.Tensor, 80 | requests: "RaggedRequestBatch", 81 | processor_map: Dict[str, 82 | Any]) -> torch.Tensor: 83 | stop_fns = {k: v for k, v in processor_map.items() if "Stop" in k} 84 | done_tokens = run_batch_processing(next_tokens, requests, stop_fns) 85 | done_tokens = torch.any(done_tokens.view((len(requests), -1)), dim=1) 86 | 87 | return done_tokens 88 | -------------------------------------------------------------------------------- /mii/batching/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from functools import wraps 6 | 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | from mii.logging import logger 10 | 11 | 12 | def sync_debug(func): 13 | @wraps(func) 14 | def wrapper(self, *args, **kwargs): 15 | if self.sync_debug: 16 | get_accelerator().synchronize() 17 | logger.debug(f"Calling {func.__name__} with args: {args}, kwargs: {kwargs}") 18 | result = func(self, *args, **kwargs) 19 | if self.sync_debug: 20 | get_accelerator().synchronize() 21 | logger.debug(f"Finished calling {func.__name__}") 22 | return result 23 | 24 | return wrapper 25 | 26 | 27 | def profiler(func): 28 | @wraps(func) 29 | def wrapper(self, *args, **kwargs): 30 | if not self.profile_model_time: 31 | return func(self, *args, **kwargs) 32 | 33 | self._timers(func.__name__).start() 34 | result = func(self, *args, **kwargs) 35 | self._timers(func.__name__).stop() 36 | self._profiled_times[func.__name__].append( 37 | self._timers(func.__name__).elapsed(reset=True)) 38 | return result 39 | 40 | return wrapper 41 | -------------------------------------------------------------------------------- /mii/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from enum import Enum 6 | 7 | 8 | class DeploymentType(str, Enum): 9 | LOCAL = "local" 10 | AML = "aml" 11 | NON_PERSISTENT = "non-persistent" 12 | 13 | 14 | class TaskType(str, Enum): 15 | TEXT_GENERATION = "text-generation" 16 | 17 | 18 | class ModelProvider(str, Enum): 19 | HUGGING_FACE = "hugging-face" 20 | 21 | 22 | class GenerationFinishReason(str, Enum): 23 | """ Reason for text-generation to stop. """ 24 | 25 | STOP = "stop" 26 | """ Reached an EoS token. """ 27 | 28 | LENGTH = "length" 29 | """ Reached ``max_length`` or ``max_new_tokens``. """ 30 | 31 | NONE = "none" 32 | 33 | 34 | SUPPORTED_MODEL_TYPES = { 35 | 'opt': ModelProvider.HUGGING_FACE, 36 | 'llama': ModelProvider.HUGGING_FACE 37 | } 38 | 39 | REQUIRED_KEYS_PER_TASK = { 40 | TaskType.TEXT_GENERATION: ["query"], 41 | } 42 | 43 | MII_CACHE_PATH = "MII_CACHE_PATH" 44 | MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" 45 | 46 | MII_HF_CACHE_EXPIRATION = "MII_HF_CACHE_EXPIRATION" 47 | MII_HF_CACHE_EXPIRATION_DEFAULT = 60 * 60 # 1 hour 48 | 49 | MII_DEBUG_MODE = "MII_DEBUG_MODE" 50 | MII_DEBUG_MODE_DEFAULT = "0" 51 | 52 | MII_DEBUG_DEPLOY_KEY = "MII_DEBUG_DEPLOY_KEY" 53 | 54 | MII_DEBUG_BRANCH = "MII_DEBUG_BRANCH" 55 | MII_DEBUG_BRANCH_DEFAULT = "main" 56 | 57 | MII_MODEL_PATH_DEFAULT = "/tmp/mii_models" 58 | 59 | GRPC_MAX_MSG_SIZE = 2**27 # ~100MB 60 | 61 | TERMINATE_METHOD = "Terminate" 62 | 63 | LB_MAX_WORKER_THREADS = 256 64 | 65 | SERVER_SHUTDOWN_TIMEOUT = 10 66 | 67 | RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT = 1 68 | RESTFUL_API_PATH = "mii" 69 | 70 | STREAM_RESPONSE_QUEUE_TIMEOUT = 600 71 | ZMQ_RECV_TIMEOUT = 5 * 1000 72 | -------------------------------------------------------------------------------- /mii/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | 7 | class DeploymentNotFoundError(Exception): 8 | pass 9 | 10 | 11 | class UnknownArgument(Exception): 12 | pass 13 | -------------------------------------------------------------------------------- /mii/grpc_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/build_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto 7 | 8 | # update import to be global wrt mii 9 | sed -i 's/modelresponse_pb2/mii.grpc_related.proto.modelresponse_pb2/g' modelresponse_pb2_grpc.py 10 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/modelresponse.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | syntax = "proto3"; 16 | 17 | /*option java_multiple_files = true; 18 | option java_package = "io.grpc.examples.helloworld"; 19 | option java_outer_classname = "HelloWorldProto"; 20 | option objc_class_prefix = "HLW";*/ 21 | 22 | import "google/protobuf/empty.proto"; 23 | 24 | package modelresponse; 25 | 26 | service ModelResponse { 27 | rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} 28 | rpc GeneratorReply (MultiStringRequest) returns (MultiGenerationReply) {} 29 | rpc GeneratorReplyStream (MultiStringRequest) returns (stream MultiGenerationReply) {} 30 | } 31 | 32 | message Dictionary { 33 | map values = 1; 34 | } 35 | 36 | message Value { 37 | oneof oneof_values { 38 | string svalue = 1; 39 | int64 ivalue = 2; 40 | float fvalue = 3; 41 | bool bvalue = 4; 42 | Dictionary mvalue = 5; 43 | } 44 | } 45 | 46 | message SingleStringRequest { 47 | string request = 1; 48 | map query_kwargs = 2; 49 | } 50 | 51 | message MultiStringRequest { 52 | repeated string request = 1; 53 | map query_kwargs = 2; 54 | } 55 | 56 | message SingleGenerationReply { 57 | string response = 1; 58 | string finish_reason = 2; 59 | int64 prompt_tokens = 3; 60 | int64 generated_tokens = 4; 61 | float time_taken = 5; 62 | float model_time_taken = 6; 63 | } 64 | 65 | message MultiGenerationReply { 66 | repeated SingleGenerationReply response = 1; 67 | } 68 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/modelresponse_pb2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | # Generated by the protocol buffer compiler. DO NOT EDIT! 6 | # source: modelresponse.proto 7 | """Generated protocol buffer code.""" 8 | from google.protobuf import descriptor as _descriptor 9 | from google.protobuf import descriptor_pool as _descriptor_pool 10 | from google.protobuf import symbol_database as _symbol_database 11 | from google.protobuf.internal import builder as _builder 12 | # @@protoc_insertion_point(imports) 13 | 14 | _sym_db = _symbol_database.Default() 15 | 16 | from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 17 | 18 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( 19 | b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"\x88\x01\n\nDictionary\x12\x35\n\x06values\x18\x01 \x03(\x0b\x32%.modelresponse.Dictionary.ValuesEntry\x1a\x43\n\x0bValuesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\x8c\x01\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x12+\n\x06mvalue\x18\x05 \x01(\x0b\x32\x19.modelresponse.DictionaryH\x00\x42\x0e\n\x0coneof_values\"\xbb\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\xb9\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 
\x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\x9f\x01\n\x15SingleGenerationReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x03\x12\x18\n\x10generated_tokens\x18\x04 \x01(\x03\x12\x12\n\ntime_taken\x18\x05 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x06 \x01(\x02\"N\n\x14MultiGenerationReply\x12\x36\n\x08response\x18\x01 \x03(\x0b\x32$.modelresponse.SingleGenerationReply2\x8e\x02\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12Z\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a#.modelresponse.MultiGenerationReply\"\x00\x12\x62\n\x14GeneratorReplyStream\x12!.modelresponse.MultiStringRequest\x1a#.modelresponse.MultiGenerationReply\"\x00\x30\x01\x62\x06proto3' 20 | ) 21 | 22 | _globals = globals() 23 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 24 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) 25 | if _descriptor._USE_C_DESCRIPTORS == False: 26 | DESCRIPTOR._options = None 27 | _DICTIONARY_VALUESENTRY._options = None 28 | _DICTIONARY_VALUESENTRY._serialized_options = b'8\001' 29 | _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None 30 | _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' 31 | _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None 32 | _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' 33 | _globals['_DICTIONARY']._serialized_start = 68 34 | _globals['_DICTIONARY']._serialized_end = 204 35 | _globals['_DICTIONARY_VALUESENTRY']._serialized_start = 137 36 | _globals['_DICTIONARY_VALUESENTRY']._serialized_end = 204 37 | _globals['_VALUE']._serialized_start = 207 38 | _globals['_VALUE']._serialized_end = 347 39 | _globals['_SINGLESTRINGREQUEST']._serialized_start = 350 40 | _globals['_SINGLESTRINGREQUEST']._serialized_end = 537 41 | _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 465 42 | _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 537 43 | _globals['_MULTISTRINGREQUEST']._serialized_start = 540 44 | _globals['_MULTISTRINGREQUEST']._serialized_end = 725 45 | _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 465 46 | _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 537 47 | _globals['_SINGLEGENERATIONREPLY']._serialized_start = 728 48 | _globals['_SINGLEGENERATIONREPLY']._serialized_end = 887 49 | _globals['_MULTIGENERATIONREPLY']._serialized_start = 889 50 | _globals['_MULTIGENERATIONREPLY']._serialized_end = 967 51 | _globals['_MODELRESPONSE']._serialized_start = 970 52 | _globals['_MODELRESPONSE']._serialized_end = 1240 53 | # @@protoc_insertion_point(module_scope) 54 | -------------------------------------------------------------------------------- /mii/grpc_related/restful_gateway.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import asyncio 6 | import time 7 | import threading 8 | 9 | from flask import Flask, request, jsonify 10 | from flask_restful import Resource, Api 11 | from werkzeug.serving import make_server 12 | 13 | import mii 14 | from mii.constants import RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT, RESTFUL_API_PATH 15 | 16 | 17 | def shutdown(thread): 18 | time.sleep(RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT) 19 | thread.server.shutdown() 20 | 21 | 22 | def createRestfulGatewayApp(deployment_name, server_thread): 23 | class RestfulGatewayService(Resource): 24 | def __init__(self): 25 | super().__init__() 26 | loop = asyncio.new_event_loop() 27 | asyncio.set_event_loop(loop) 28 | self.client = mii.client(deployment_name) 29 | 30 | def post(self): 31 | data = request.get_json() 32 | result = self.client.generate(**data) 33 | return jsonify([r.to_msg_dict() for r in result]) 34 | 35 | app = Flask("RestfulGateway") 36 | 37 | @app.route("/terminate", methods=["GET"]) 38 | def terminate(): 39 | # Need to shutdown *after* completing the request 40 | threading.Thread(target=shutdown, args=(server_thread, )).start() 41 | return "Shutting down RESTful API gateway server" 42 | 43 | @app.route("/healthz", methods=["GET"]) 44 | def healthz(): 45 | return "ok" 46 | 47 | api = Api(app) 48 | path = "/{}/{}".format(RESTFUL_API_PATH, deployment_name) 49 | api.add_resource(RestfulGatewayService, path) 50 | 51 | return app 52 | 53 | 54 | class RestfulGatewayThread(threading.Thread): 55 | def __init__(self, deployment_name, rest_host, rest_port, rest_procs): 56 | threading.Thread.__init__(self) 57 | 58 | app = createRestfulGatewayApp(deployment_name, self) 59 | self.server = make_server(rest_host, 60 | rest_port, 61 | app, 62 | threaded=False, 63 | processes=rest_procs) 64 | self.ctx = app.app_context() 65 | self.ctx.push() 66 | 67 | self._stop_event = threading.Event() 68 | 69 | def run(self): 70 | self.server.serve_forever() 71 | self._stop_event.set() 72 | 73 | def get_stop_event(self): 74 | return self._stop_event 75 | -------------------------------------------------------------------------------- /mii/grpc_related/task_methods.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import Any, Dict, List, Tuple 8 | 9 | from google.protobuf.message import Message 10 | 11 | from mii.batching.data_classes import Response 12 | from mii.constants import TaskType 13 | from mii.grpc_related.proto import modelresponse_pb2 14 | from mii.utils import kwarg_dict_to_proto, unpack_proto_query_kwargs 15 | 16 | 17 | def single_string_request_to_proto(self, request_dict, **query_kwargs): 18 | return modelresponse_pb2.SingleStringRequest( 19 | request=request_dict["query"], 20 | query_kwargs=kwarg_dict_to_proto(query_kwargs)) 21 | 22 | 23 | def single_string_response_to_proto(self, response, time_taken, model_time_taken): 24 | return modelresponse_pb2.SingleStringReply(response=f"{response}", 25 | time_taken=time_taken, 26 | model_time_taken=model_time_taken) 27 | 28 | 29 | class TaskMethods(ABC): 30 | @property 31 | @abstractmethod 32 | def method(self): 33 | ... 34 | 35 | @abstractmethod 36 | def pack_request_to_proto(self, request, **query_kwargs): 37 | ... 38 | 39 | @abstractmethod 40 | def unpack_request_from_proto(self, proto_request): 41 | ... 
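    # Note: each pack_*/unpack_* pair defined by this interface is expected to
    # stay symmetric, so a request or response serialized on one side of the
    # gRPC channel can be reconstructed unchanged on the other side.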
42 | 43 | @abstractmethod 44 | def pack_response_to_proto(self, response): 45 | ... 46 | 47 | @abstractmethod 48 | def unpack_response_from_proto(self, proto_response): 49 | ... 50 | 51 | 52 | class TextGenerationMethods(TaskMethods): 53 | @property 54 | def method(self): 55 | return "GeneratorReply" 56 | 57 | @property 58 | def method_stream_out(self): 59 | return "GeneratorReplyStream" 60 | 61 | def pack_request_to_proto(self, 62 | prompts: List[str], 63 | **query_kwargs: Dict[str, 64 | Any]) -> Message: 65 | proto_request = modelresponse_pb2.MultiStringRequest( 66 | request=prompts, 67 | query_kwargs=kwarg_dict_to_proto(query_kwargs), 68 | ) 69 | return proto_request 70 | 71 | def unpack_request_from_proto(self, 72 | proto_request: Message) -> Tuple[List[str], 73 | Dict[str, 74 | Any]]: 75 | prompts = [r for r in proto_request.request] 76 | kwargs = unpack_proto_query_kwargs(proto_request.query_kwargs) 77 | return prompts, kwargs 78 | 79 | def pack_response_to_proto(self, responses: List[Response]) -> Message: 80 | proto_responses = [] 81 | for r in responses: 82 | proto_responses.append( 83 | modelresponse_pb2.SingleGenerationReply( 84 | response=r.generated_text, 85 | finish_reason=str(r.finish_reason.value), 86 | prompt_tokens=r.prompt_length, 87 | generated_tokens=r.generated_length, 88 | time_taken=-1, 89 | model_time_taken=-1, 90 | )) 91 | 92 | return modelresponse_pb2.MultiGenerationReply(response=proto_responses, ) 93 | 94 | def unpack_response_from_proto(self, response: Message) -> List[Response]: 95 | response_batch = [] 96 | for r in response.response: 97 | response_batch.append( 98 | Response( 99 | generated_text=r.response, 100 | prompt_length=r.prompt_tokens, 101 | generated_length=r.generated_tokens, 102 | finish_reason=r.finish_reason, 103 | )) 104 | return response_batch 105 | 106 | 107 | TASK_METHODS_DICT = { 108 | TaskType.TEXT_GENERATION: TextGenerationMethods(), 109 | } 110 | -------------------------------------------------------------------------------- /mii/launch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/launch/multi_gpu_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import argparse 6 | import base64 7 | import json 8 | import os 9 | 10 | from mii.config import ModelConfig 11 | from mii.grpc_related.modelresponse_server import serve_inference, serve_load_balancing 12 | from mii.grpc_related.restful_gateway import RestfulGatewayThread 13 | from mii.api import async_pipeline 14 | 15 | 16 | def b64_encoded_config(config_str: str) -> ModelConfig: 17 | # str -> bytes 18 | b64_bytes = config_str.encode() 19 | # decode b64 bytes -> json bytes 20 | config_bytes = base64.urlsafe_b64decode(b64_bytes) 21 | # convert json bytes -> str -> dict 22 | config_dict = json.loads(config_bytes.decode()) 23 | # return mii.ModelConfig object 24 | return ModelConfig(**config_dict) 25 | 26 | 27 | def main() -> None: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--deployment-name", type=str, help="Name of deployment") 30 | parser.add_argument( 31 | "--model-config", 32 | type=b64_encoded_config, 33 | help="base64 encoded model config", 34 | ) 35 | parser.add_argument( 36 | "--server-port", 37 | type=int, 38 | default=0, 39 | help="Port to user for DeepSpeed inference server.", 40 | ) 41 | parser.add_argument("--zmq-port", type=int, default=0, help="Port to use for ZMQ.") 42 | parser.add_argument("--load-balancer", 43 | action="store_true", 44 | help="Launch load balancer process.") 45 | parser.add_argument( 46 | "--load-balancer-port", 47 | type=int, 48 | default=0, 49 | help="Port to use for load balancer.", 50 | ) 51 | parser.add_argument( 52 | "--restful-gateway", 53 | action="store_true", 54 | help="Launches restful gateway process.", 55 | ) 56 | parser.add_argument( 57 | "--restful-gateway-port", 58 | type=int, 59 | default=0, 60 | help="Port to use for restful gateway.", 61 | ) 62 | parser.add_argument("--restful-gateway-host", 63 | type=str, 64 | default="localhost", 65 | help="Host to use for restful gateway.") 66 | parser.add_argument("--restful-gateway-procs", 67 | type=int, 68 | default=32, 69 | help="Number of processes to use for restful gateway.") 70 | args = parser.parse_args() 71 | assert not ( 72 | args.load_balancer and args.restful_gateway 73 | ), "Select only load-balancer OR restful-gateway." 74 | 75 | if args.restful_gateway: 76 | assert args.restful_gateway_port, "--restful-gateway-port must be provided." 77 | print(f"Starting RESTful API gateway on port: {args.restful_gateway_port}") 78 | gateway_thread = RestfulGatewayThread( 79 | deployment_name=args.deployment_name, 80 | rest_host=args.restful_gateway_host, 81 | rest_port=args.restful_gateway_port, 82 | rest_procs=args.restful_gateway_procs, 83 | ) 84 | stop_event = gateway_thread.get_stop_event() 85 | gateway_thread.start() 86 | stop_event.wait() 87 | 88 | elif args.load_balancer: 89 | assert args.load_balancer_port, "--load-balancer-port must be provided." 90 | print(f"Starting load balancer on port: {args.load_balancer_port}") 91 | serve_load_balancing(args.model_config, args.load_balancer_port) 92 | 93 | else: 94 | assert args.server_port, "--server-port must be provided." 
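        # Each inference rank derives its own gRPC port by offsetting the base
        # --server-port with LOCAL_RANK (computed below), so ranks launched on
        # the same host each listen on a distinct port.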
95 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 96 | port = args.server_port + local_rank 97 | args.model_config.zmq_port_number = args.zmq_port 98 | inference_pipeline = async_pipeline(args.model_config) 99 | print(f"Starting server on port: {port}") 100 | serve_inference(inference_pipeline, port) 101 | 102 | 103 | if __name__ == "__main__": 104 | # python -m mii.launch.multi_gpu_server 105 | main() 106 | -------------------------------------------------------------------------------- /mii/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import grpc 6 | from .server import MIIServer 7 | from .client import MIIClient, mii_query_handle 8 | from .deployment import deploy 9 | from .terminate import terminate 10 | from .constants import DeploymentType, TaskType 11 | from .aml_related.utils import aml_output_path 12 | from .config import MIIConfig, ModelConfig 13 | from .utils import get_supported_models 14 | from .grpc_related.proto import legacymodelresponse_pb2_grpc as modelresponse_pb2_grpc 15 | 16 | __version__ = "0.0.0" 17 | non_persistent_models = {} 18 | try: 19 | from .version import __version__ 20 | except ImportError: 21 | pass 22 | -------------------------------------------------------------------------------- /mii/legacy/aml_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .templates import * 6 | from .utils import get_acr_name, generate_aml_scripts, aml_output_path 7 | -------------------------------------------------------------------------------- /mii/legacy/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from enum import Enum 6 | 7 | 8 | class DeploymentType(str, Enum): 9 | LOCAL = "local" 10 | AML = "aml" 11 | NON_PERSISTENT = "non-persistent" 12 | 13 | 14 | class TaskType(str, Enum): 15 | TEXT_GENERATION = "text-generation" 16 | TEXT_CLASSIFICATION = "text-classification" 17 | QUESTION_ANSWERING = "question-answering" 18 | FILL_MASK = "fill-mask" 19 | TOKEN_CLASSIFICATION = "token-classification" 20 | TEXT2IMG = "text-to-image" 21 | ZERO_SHOT_IMAGE_CLASSIFICATION = "zero-shot-image-classification" 22 | INPAINTING = "text-to-image-inpainting" 23 | 24 | 25 | class ModelProvider(str, Enum): 26 | HUGGING_FACE = "hugging-face" 27 | ELEUTHER_AI = "eleuther-ai" 28 | DIFFUSERS = "diffusers" 29 | 30 | 31 | SUPPORTED_MODEL_TYPES = { 32 | 'roberta': ModelProvider.HUGGING_FACE, 33 | 'xlm-roberta': ModelProvider.HUGGING_FACE, 34 | 'gpt2': ModelProvider.HUGGING_FACE, 35 | 'distilbert': ModelProvider.HUGGING_FACE, 36 | 'bert': ModelProvider.HUGGING_FACE, 37 | 'gpt_neo': ModelProvider.HUGGING_FACE, 38 | 'gptj': ModelProvider.HUGGING_FACE, 39 | 'opt': ModelProvider.HUGGING_FACE, 40 | 'bloom': ModelProvider.HUGGING_FACE, 41 | 'gpt-neox': ModelProvider.ELEUTHER_AI, 42 | 'stable-diffusion': ModelProvider.DIFFUSERS, 43 | 'llama': ModelProvider.HUGGING_FACE, 44 | 'clip': ModelProvider.HUGGING_FACE 45 | } 46 | 47 | REQUIRED_KEYS_PER_TASK = { 48 | TaskType.TEXT_GENERATION: ["query"], 49 | TaskType.TEXT_CLASSIFICATION: ["query"], 50 | TaskType.QUESTION_ANSWERING: ["context", 51 | "question"], 52 | TaskType.FILL_MASK: ["query"], 53 | TaskType.TOKEN_CLASSIFICATION: ["query"], 54 | TaskType.TEXT2IMG: ["prompt"], 55 | TaskType.ZERO_SHOT_IMAGE_CLASSIFICATION: ["image", 56 | "candidate_labels"], 57 | TaskType.INPAINTING: [ 58 | "prompt", 59 | "image", 60 | "mask_image", 61 | ] 62 | } 63 | 64 | MII_CACHE_PATH = "MII_CACHE_PATH" 65 | MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" 66 | 67 | MII_HF_CACHE_EXPIRATION = "MII_HF_CACHE_EXPIRATION" 68 | MII_HF_CACHE_EXPIRATION_DEFAULT = 60 * 60 # 1 hour 69 | 70 | MII_DEBUG_MODE = "MII_DEBUG_MODE" 71 | MII_DEBUG_MODE_DEFAULT = "0" 72 | 73 | MII_DEBUG_DEPLOY_KEY = "MII_DEBUG_DEPLOY_KEY" 74 | 75 | MII_DEBUG_BRANCH = "MII_DEBUG_BRANCH" 76 | MII_DEBUG_BRANCH_DEFAULT = "main" 77 | 78 | MII_MODEL_PATH_DEFAULT = "/tmp/mii_models" 79 | 80 | GRPC_MAX_MSG_SIZE = 2**27 # ~100MB 81 | 82 | TERMINATE_METHOD = "Terminate" 83 | CREATE_SESSION_METHOD = "CreateSession" 84 | DESTROY_SESSION_METHOD = "DestroySession" 85 | 86 | LB_MAX_WORKER_THREADS = 32 87 | 88 | SERVER_SHUTDOWN_TIMEOUT = 10 89 | 90 | RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT = 1 91 | RESTFUL_API_PATH = "mii" 92 | -------------------------------------------------------------------------------- /mii/legacy/deployment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii.legacy as mii 7 | 8 | from .logging import logger 9 | from .models.score import create_score_file 10 | from .models import load_models 11 | from .config import MIIConfig, DeploymentType 12 | 13 | 14 | def support_legacy_api( 15 | task, 16 | model, 17 | deployment_type=DeploymentType.LOCAL, 18 | model_path="", 19 | enable_deepspeed=True, 20 | enable_zero=False, 21 | ds_config=None, 22 | mii_config=None, 23 | version=1, 24 | ): 25 | if ds_config is None: 26 | ds_config = {} 27 | if mii_config is None: 28 | mii_config = {} 29 | 30 | model_config = { 31 | "task": task, 32 | "model": model, 33 | "model_path": model_path, 34 | "enable_deepspeed": enable_deepspeed, 35 | "enable_zero": enable_zero, 36 | "ds_config": ds_config, 37 | } 38 | # TODO do this in a single for loop 39 | for key, val in mii_config.items(): 40 | if key not in MIIConfig.model_fields.keys(): 41 | model_config[key] = val 42 | mii_config = { 43 | k: v 44 | for k, 45 | v in mii_config.items() if k in MIIConfig.model_fields.keys() 46 | } 47 | mii_config["version"] = version 48 | mii_config["deployment_type"] = deployment_type 49 | 50 | return model_config, mii_config 51 | 52 | 53 | def deploy( 54 | deployment_name: str, 55 | model_config: dict = None, 56 | mii_config: dict = None, 57 | *args, 58 | **kwargs, 59 | ): 60 | if mii_config is None: 61 | mii_config = {} 62 | 63 | if args or kwargs: 64 | assert ( 65 | not model_config 66 | ), "We do not support mixture of legacy and new API options, use latest API." 67 | kwargs["mii_config"] = mii_config 68 | model_config, mii_config = support_legacy_api(*args, **kwargs) 69 | 70 | mii_config["deployment_name"] = deployment_name 71 | mii_config["model_conf"] = model_config 72 | mii_config = mii.config.MIIConfig(**mii_config) 73 | 74 | if mii_config.model_conf.enable_deepspeed: 75 | logger.info( 76 | "************* MII is using DeepSpeed Optimizations to accelerate your model *************" 77 | ) 78 | else: 79 | logger.info( 80 | "************* DeepSpeed Optimizations not enabled. 
Please use enable_deepspeed to get better performance *************" 81 | ) 82 | 83 | if mii_config.deployment_type != DeploymentType.NON_PERSISTENT: 84 | create_score_file(mii_config) 85 | 86 | if mii_config.deployment_type == DeploymentType.AML: 87 | _deploy_aml(mii_config) 88 | elif mii_config.deployment_type == DeploymentType.LOCAL: 89 | _deploy_local(mii_config) 90 | elif mii_config.deployment_type == DeploymentType.NON_PERSISTENT: 91 | _deploy_nonpersistent(mii_config) 92 | 93 | 94 | def _deploy_local(mii_config): 95 | mii.utils.import_score_file(mii_config.deployment_name, DeploymentType.LOCAL).init() 96 | 97 | 98 | def _deploy_aml(mii_config): 99 | acr_name = mii.aml_related.utils.get_acr_name() 100 | mii.aml_related.utils.generate_aml_scripts( 101 | acr_name=acr_name, 102 | deployment_name=mii_config.deployment_name, 103 | model_name=mii_config.model_conf.model, 104 | task_name=mii_config.model_conf.task, 105 | replica_num=mii_config.model_conf.replica_num, 106 | instance_type=mii_config.instance_type, 107 | version=mii_config.version, 108 | ) 109 | print( 110 | f"AML deployment assets at {mii.aml_related.utils.aml_output_path(mii_config.deployment_name)}" 111 | ) 112 | print("Please run 'deploy.sh' to bring your deployment online") 113 | 114 | 115 | def _deploy_nonpersistent(mii_config): 116 | assert ( 117 | int(os.getenv("WORLD_SIZE", "1")) 118 | == mii_config.model_conf.tensor_parallel 119 | ), "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" 120 | deployment_name = mii_config.deployment_name 121 | mii.non_persistent_models[deployment_name] = ( 122 | load_models(mii_config.model_conf), 123 | mii_config.model_conf.task, 124 | ) 125 | -------------------------------------------------------------------------------- /mii/legacy/docs/CNAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/CNAME -------------------------------------------------------------------------------- /mii/legacy/docs/GPT-NeoX.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoX with MII 2 | In this document, we provide the steps to setup MII for doing a local deployment of the [GPT-NeoX model](https://github.com/EleutherAI/gpt-neox). 3 | 4 | ## Setup Environment 5 | We recommend using a conda environment or virtual environment for installing all dependencies: 6 | ```bash 7 | # conda 8 | conda create --name MII-GPT-NeoX 9 | conda activate MII-GPT-NeoX 10 | # python virtualenv 11 | python3 -m venv MII-GPT-NeoX 12 | source ./MII-GPT-NeoX/bin/activate 13 | ``` 14 | --- 15 | 📌 **Note:** You should use Python3 <= 3.8. We recommend Python 3.8 16 | 17 | --- 18 | 19 | ## Install MII 20 | ```bash 21 | git clone https://github.com/deepspeedai/DeepSpeed-MII.git 22 | cd DeepSpeed-MII 23 | pip install .[local] 24 | pip install . 25 | ``` 26 | 27 | ## Install DeepSpeed-GPT-NeoX 28 | ```bash 29 | git clone -b ds-updates https://github.com/deepspeedai/DeepSpeed-gpt-neox.git 30 | cd deepspeed-gpt-neox 31 | pip install -r requirements/requirements-inference.txt 32 | pip install . 33 | python ./megatron/fused_kernels/setup.py install 34 | cd .. 
35 | ``` 36 | 37 | ## Download Checkpoint 38 | You can download the checkpoint file with: 39 | ```bash 40 | wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints 41 | ``` 42 | or you can download with your favorite bittorrent client: [slim_weights.torrent](https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent) 43 | 44 | Remember the location where you save the checkpoint directory and we will refer to this location as `{CKPT_DIR}` 45 | 46 | --- 47 | 📌 **Note:** The checkpoint file is nearly 40GB in size and may take a long time to download 48 | 49 | --- 50 | 51 | ## Run GPT-NeoX with MII 52 | Modify the example file `examples/local/text-generation-neox-example.py`: 53 | - Change the `tensor_parallel` value in the `mii_config` dict to the number of GPUs on your system 54 | - Change the `local_model_path` in `mii.deploy()` call to `{CKPT_DIR}` 55 | 56 | To run the example: 57 | - Start the server with `python3 examples/local/text-generation-neox-example.py` 58 | - Wait for the server to initialize 59 | - Run a query with `python3 examples/local/text-generation-query-example.py` 60 | -------------------------------------------------------------------------------- /mii/legacy/docs/images/azure-cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/azure-cost.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/bert.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/bloom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/bloom.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/gpt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/gpt.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/hero-dark.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/hero-transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/hero-transparent.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/hero.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/hero.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/llm-latency-sd-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/llm-latency-sd-latency.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/mii-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/mii-arch.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/multi-gpu-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/multi-gpu-latency.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/opt-bloom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/opt-bloom.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/opt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/opt.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/roberta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/roberta.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/sd-hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/sd-hero-dark.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/sd-hero-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/sd-hero-light.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/sd-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/sd-latency.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/tput-llms.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/tput-llms.png -------------------------------------------------------------------------------- /mii/legacy/examples/aml/fill-mask-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | name = "bert-base-uncased" 8 | print(f"Deploying {name}...") 9 | 10 | mii.deploy(task='fill-mask', 11 | model=name, 12 | deployment_name=name + "-deployment", 13 | deployment_type=mii.constants.DeploymentType.AML) 14 | -------------------------------------------------------------------------------- /mii/legacy/examples/aml/text-generation-bloom.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = { 8 | "dtype": "fp16", 9 | "tensor_parallel": 8, 10 | "meta_tensor": True, 11 | } 12 | name = "microsoft/bloom-deepspeed-inference-fp16" 13 | 14 | mii.deploy(task='text-generation', 15 | model=name, 16 | deployment_name="bloom-deployment", 17 | deployment_type=mii.constants.DeploymentType.AML, 18 | mii_config=mii_configs) 19 | -------------------------------------------------------------------------------- /mii/legacy/examples/aml/text-generation-bloom560m-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = { 8 | "tensor_parallel": 1, 9 | "dtype": "fp16", 10 | "aml_model_path": "models/bloom-560m" 11 | } 12 | mii.deploy(task='text-generation', 13 | model="bigscience/bloom-560m", 14 | deployment_name="bloom560m-deployment", 15 | deployment_type=mii.constants.DeploymentType.AML, 16 | mii_config=mii_configs) 17 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/baseline-sd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import diffusers 8 | from utils import benchmark 9 | 10 | # Get HF auth key from environment or replace with key 11 | hf_auth_key = os.environ["HF_AUTH_TOKEN"] 12 | 13 | trials = 10 14 | batch_size = 1 15 | save_path = "." 
16 | 17 | # Setup the stable diffusion pipeline via the diffusers pipeline api 18 | pipe = diffusers.StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", 19 | use_auth_token=hf_auth_key, 20 | torch_dtype=torch.float16, 21 | revision="fp16").to("cuda") 22 | 23 | # Create batch size number of prompts 24 | prompts = ["a photo of an astronaut riding a horse on mars"] * batch_size 25 | 26 | # Example usage of diffusers pipeline 27 | results = pipe(prompts) 28 | for idx, img in enumerate(results.images): 29 | img.save(os.path.join(save_path, f"baseline-img{idx}.png")) 30 | 31 | # Evaluate performance of pipeline 32 | benchmark(pipe, prompts, save_path, trials, "baseline") 33 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/mii-sd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii 7 | from utils import benchmark 8 | 9 | # Get HF auth key from environment or replace with key 10 | hf_auth_key = os.environ["HF_AUTH_TOKEN"] 11 | 12 | trials = 10 13 | batch_size = 1 14 | save_path = "." 15 | deploy_name = "sd_deploy" 16 | 17 | # Deploy Stable Diffusion w. MII 18 | mii_config = {"dtype": "fp16", "hf_auth_token": hf_auth_key} 19 | mii.deploy(task='text-to-image', 20 | model="CompVis/stable-diffusion-v1-4", 21 | deployment_name=deploy_name, 22 | mii_config=mii_config) 23 | 24 | # Example usage of MII deployment 25 | pipe = mii.mii_query_handle(deploy_name) 26 | prompts = {"query": ["a photo of an astronaut riding a horse on mars"] * batch_size} 27 | results = pipe.query(prompts) 28 | for idx, img in enumerate(results.images): 29 | img.save(os.path.join(save_path, f"mii-img{idx}.png")) 30 | 31 | # Evaluate performance of MII 32 | benchmark(pipe.query, prompts, save_path, trials, "mii") 33 | 34 | # Tear down the persistent deployment 35 | mii.terminate(deploy_name) 36 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed>=0.7.4 2 | deepspeed-mii>=0.0.3 3 | diffusers>=0.6.0 4 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import time 8 | import deepspeed 9 | import mii 10 | import numpy 11 | import diffusers 12 | import transformers 13 | 14 | from packaging import version 15 | 16 | assert version.parse(diffusers.__version__) >= version.parse('0.7.1'), "diffusers must be 0.7.1+" 17 | assert version.parse(mii.__version__) >= version.parse("0.0.3"), "mii must be 0.0.3+" 18 | assert version.parse(deepspeed.__version__) >= version.parse("0.7.5"), "deepspeed must be 0.7.5+" 19 | assert version.parse(transformers.__version__) >= version.parse("4.24.0"), "transformers must be 4.24.0+" 20 | 21 | 22 | def benchmark(func, inputs, save_path=".", trials=5, tag="", save=True): 23 | # Turn off the tqdm progress bar 24 | if hasattr(func, "set_progress_bar_config"): 25 | func.set_progress_bar_config(disable=True) 26 | 27 | durations = [] 28 | for trial in range(trials): 29 | torch.cuda.synchronize() 30 | start = time.perf_counter() 31 | with torch.inference_mode(): 32 | results = func(inputs) 33 | torch.cuda.synchronize() 34 | duration = time.perf_counter() - start 35 | durations.append(duration) 36 | print(f"trial={trial}, time_taken={duration:.4f}") 37 | if save: 38 | for idx, img in enumerate(results.images): 39 | img.save(os.path.join(save_path, f"{tag}-trial{trial}-img{idx}.png")) 40 | print(f"median duration: {numpy.median(durations):.4f}") 41 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/chat/README.md: -------------------------------------------------------------------------------- 1 | # Multi-turn Conversation Example for Chat Applications 2 | 3 | MII can manage multi-turn conversations, enabling users to easily create their own chat applications. 4 | The scripts in this folder provide a complete example of a multi-turn conversation scenario. 5 | 6 | ## Starting the server 7 | 8 | Starting the server for your chat application requires nothing special. 9 | Just make sure that the model supports `text-generation` and is trained for conversations. 10 | 11 | The example script uses [AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed), which was trained using [DeepSpeed-Chat](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md). 12 | 13 | ```python 14 | name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" 15 | ... 16 | mii.deploy(task='text-generation', model=name, deployment_name="chat_example_deployment") 17 | ``` 18 | 19 | ## Running multi-turn conversation 20 | 21 | The client creates a *session* so that MII can keep track of the conversation. 22 | `create_session` creates a new session with a given name. 23 | 24 | ```python 25 | # You can set a session name 26 | session_id = "chat_example_session" 27 | # You need to call `create_session` before you start a multi-turn conversation session 28 | generator.create_session(session_id) 29 | ``` 30 | 31 | The session ID is given as a keyword argument as shown below. 32 | Besides this, you can call the `query` function just as you would in normal MII inference. 33 | Note that the prompt may need to be designed according to your model. 34 | 35 | ```python 36 | query_template = "Human: {}\n Assistant: " 37 | print("# Start a conversation session.
Type 'q' to exit.") 38 | 39 | while True: 40 | user_input = input("You: ") 41 | if user_input == "q": 42 | break 43 | result = generator.query({"query": query_template.format(user_input)}, 44 | session_id=session_id, 45 | max_new_tokens=128) 46 | print(f"Bot: {result.response[0].replace('', '')}") 47 | ``` 48 | 49 | Once the user finishes the session, you need to call `destroy_session` to free the internal data for the session. 50 | 51 | ```python 52 | generator.destroy_session(session_id) 53 | ``` 54 | 55 | The following is an example conversation: 56 | ```bash 57 | $ python chat-client-example.py 58 | # Start a conversation session. Type 'q' to exit. 59 | You: Can you tell me about deep learning? 60 | Bot: Yes, it is a type of artificial intelligence that learns from data. It can process large amounts of data quickly and accurately, and it can develop sophisticated models to analyze data. Deep learning techniques are being applied to a wide variety of tasks, including image recognition, speech recognition, recommendation systems, and self-driving cars. 61 | You: I want to try it. 62 | Bot: Yes, it is a very powerful technology. It has the potential to revolutionize many fields, including artificial intelligence, data analysis, and machine learning. 63 | You: Is it hard to learn? 64 | Bot: Yes, it is a challenging field of study. It requires a lot of training and practice to develop effective models. It also requires a high level of computational power and data storage. 65 | You: Where can I start? 66 | Bot: Yes, it is a good idea to start with a basic model. It will help you develop the necessary skills and knowledge to progress to more advanced models.. 67 | ``` 68 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/chat/chat-client-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # Run `chat-server-example.py` before running this script 8 | generator = mii.mii_query_handle("chat_example_deployment") 9 | 10 | # You can set a session name 11 | session_id = "chat_example_session" 12 | # You need to call `create_session` before you start a multi-turn conversation session 13 | generator.create_session(session_id) 14 | 15 | print("# Start a conversation session. Type 'q' to exit.") 16 | query_template = "Human: {}\n Assistant: " 17 | while True: 18 | user_input = input("You: ") 19 | if user_input == "q": 20 | break 21 | 22 | # A session ID is given as a keyword argument 23 | result = generator.query({"query": query_template.format(user_input)}, 24 | session_id=session_id, 25 | max_new_tokens=128) 26 | print(f"Bot: {result.response[0].replace('', '').strip()}") 27 | 28 | # You need to destroy the session after finishing the conversation 29 | generator.destroy_session(session_id) 30 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/chat/chat-server-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {'tensor_parallel': 1} 8 | 9 | # This checkpoint is created using DeepSpeed-Chat 10 | # https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md 11 | name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" 12 | 13 | print(f"Deploying {name}...") 14 | 15 | # Deploy as "text-generation" task 16 | mii.deploy(task='text-generation', model=name, deployment_name="chat_example_deployment") 17 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/conversational-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {'tensor_parallel': 1} 8 | 9 | # gpt2 10 | name = "microsoft/DialoGPT-large" 11 | 12 | print(f"Deploying {name}...") 13 | 14 | mii.deploy(task='text-generation', model=name, deployment_name=name + "_deployment") 15 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/conversational-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # gpt2 8 | name = "microsoft/DialoGPT-large" 9 | 10 | print(f"Querying {name}...") 11 | 12 | conv_id = 0 13 | text = "DeepSpeed is the greatest" 14 | 15 | generator = mii.mii_query_handle(name + "_deployment") 16 | result = generator.query({ 17 | 'text': text, 18 | 'conversation_id': conv_id, 19 | 'past_user_inputs': [], 20 | 'generated_responses': [] 21 | }) 22 | 23 | print(result) 24 | print(f"time_taken: {result.time_taken}") 25 | 26 | text = "How is DeepSpeed?" 27 | result = generator.query({ 28 | 'text': text, 29 | 'conversation_id': result.conversation_id, 30 | 'past_user_inputs': result.past_user_inputs, 31 | 'generated_responses': result.generated_responses 32 | }) 33 | 34 | print(result) 35 | print("time_taken:", result.time_taken) 36 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/fill-mask-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-q", "--query", action="store_true", help="query") 10 | args = parser.parse_args() 11 | 12 | name = "bert-base-uncased" 13 | mask = "[MASK]" 14 | 15 | if not args.query: 16 | print(f"Deploying {name}...") 17 | mii.deploy(task='fill-mask', model=name, deployment_name=name + "_deployment") 18 | else: 19 | print(f"Querying {name}...") 20 | generator = mii.mii_query_handle(name + "_deployment") 21 | result = generator.query({'query': f"Hello I'm a {mask} model."}) 22 | print(result.response) 23 | print("time_taken:", result.time_taken) 24 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/question-answering-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_config = {'tensor_parallel': 1, 'port_number': 50050} 8 | 9 | name = "deepset/roberta-large-squad2" 10 | mii.deploy(task="question-answering", 11 | model=name, 12 | deployment_name=name + "-qa-deployment", 13 | mii_config=mii_config) 14 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/question-answering-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | name = "deepset/roberta-large-squad2" 8 | 9 | generator = mii.mii_query_handle(name + "-qa-deployment") 10 | results = generator.query({ 11 | 'question': "What is the greatest?", 12 | 'context': "DeepSpeed is the greatest" 13 | }) 14 | print(results.response) 15 | print(f"time_taken: {results.time_taken}") 16 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-classification-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # gpt2 8 | name = "microsoft/DialogRPT-human-vs-rand" 9 | 10 | # roberta 11 | name = "roberta-large-mnli" 12 | 13 | print(f"Deploying {name}...") 14 | 15 | mii.deploy(task='text-classification', model=name, deployment_name=name + "_deployment") 16 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-classification-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # gpt2 8 | name = "microsoft/DialogRPT-human-vs-rand" 9 | 10 | # roberta 11 | name = "roberta-large-mnli" 12 | 13 | print(f"Querying {name}...") 14 | 15 | generator = mii.mii_query_handle(name + "_deployment") 16 | result = generator.query({'query': "DeepSpeed is the greatest"}) 17 | print(result.response) 18 | print("time_taken:", result.time_taken) 19 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-bloom-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = { 8 | "dtype": "fp16", 9 | "tensor_parallel": 8, 10 | "port_number": 50950, 11 | "meta_tensor": True, 12 | } 13 | name = "microsoft/bloom-deepspeed-inference-fp16" 14 | 15 | mii.deploy(task='text-generation', 16 | model=name, 17 | deployment_name=name + "_deployment", 18 | model_path="/data/bloom-mp", 19 | mii_config=mii_configs) 20 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-bloom560m-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {"tensor_parallel": 1, "dtype": "fp16"} 8 | mii.deploy(task='text-generation', 9 | model="bigscience/bloom-560m", 10 | deployment_name="bloom560m_deployment", 11 | mii_config=mii_configs) 12 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-fbopt-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_config = {'dtype': 'fp16'} 8 | 9 | name = "facebook/opt-1.3b" 10 | 11 | ds_config = { 12 | "fp16": { 13 | "enabled": True 14 | }, 15 | "bf16": { 16 | "enabled": False 17 | }, 18 | "zero_optimization": { 19 | "stage": 3, 20 | "offload_param": { 21 | "device": "cpu", 22 | }, 23 | }, 24 | "train_micro_batch_size_per_gpu": 1, 25 | } 26 | 27 | mii.deploy(task='text-generation', 28 | model=name, 29 | deployment_name=name + "_deployment", 30 | model_path=".cache/models/" + name, 31 | mii_config=mii_config, 32 | enable_deepspeed=False, 33 | enable_zero=True, 34 | ds_config=ds_config) 35 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--deployment', 10 | '-d', 11 | type=str, 12 | required=True, 13 | help="deployment_name set in the MII deployment") 14 | args = parser.parse_args() 15 | 16 | generator = mii.mii_query_handle(args.deployment) 17 | result = generator.query({'query': ["DeepSpeed is the", "Seattle is"]}) 18 | print(result.response) 19 | print("time_taken:", result.time_taken) 20 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-zero-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | from transformers import AutoConfig 7 | 8 | mii_config = {"dtype": "fp16"} 9 | 10 | name = "gpt2-xl" 11 | 12 | config = AutoConfig.from_pretrained(name) 13 | model_hidden_size = config.n_embd 14 | 15 | ds_config = { 16 | "fp16": { 17 | "enabled": True 18 | }, 19 | "bf16": { 20 | "enabled": False 21 | }, 22 | "aio": { 23 | "block_size": 262144, 24 | "queue_depth": 32, 25 | "thread_count": 1, 26 | "single_submit": False, 27 | "overlap_events": True 28 | }, 29 | "zero_optimization": { 30 | "stage": 3, 31 | "offload_param": { 32 | "device": "cpu", 33 | }, 34 | "overlap_comm": True, 35 | "contiguous_gradients": True, 36 | "reduce_bucket_size": model_hidden_size * model_hidden_size, 37 | "stage3_prefetch_bucket_size": 0.1 * model_hidden_size * model_hidden_size, 38 | "stage3_max_live_parameters": 1e8, 39 | "stage3_max_reuse_distance": 1e8, 40 | "stage3_param_persistence_threshold": 10 * model_hidden_size 41 | }, 42 | "train_micro_batch_size_per_gpu": 1, 43 | } 44 | 45 | mii.deploy(task='text-generation', 46 | model=name, 47 | deployment_name=name + "_deployment", 48 | model_path=".cache/models/" + name, 49 | mii_config=mii_config, 50 | enable_deepspeed=False, 51 | enable_zero=True, 52 | ds_config=ds_config) 53 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/token-classification-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # roberta 8 | name = "Jean-Baptiste/roberta-large-ner-english" 9 | 10 | print(f"Deploying {name}...") 11 | 12 | mii.deploy(task='token-classification', model=name, deployment_name=name + "_deployment") 13 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/token-classification-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # roberta 8 | name = "Jean-Baptiste/roberta-large-ner-english" 9 | 10 | print(f"Querying {name}...") 11 | 12 | generator = mii.mii_query_handle(name + "_deployment") 13 | result = generator.query({'query': "My name is jean-baptiste and I live in montreal."}) 14 | print(result.response) 15 | print("time_taken:", result.time_taken) 16 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/txt2img-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-q", "--query", action="store_true", help="query") 11 | args = parser.parse_args() 12 | 13 | if not args.query: 14 | mii_configs = { 15 | "tensor_parallel": 16 | 1, 17 | "enable_cuda_graph": 18 | True, 19 | "replace_with_kernel_inject": 20 | True, 21 | "dtype": 22 | "fp16", 23 | "hf_auth_token": 24 | os.environ.get("HF_AUTH_TOKEN", 25 | "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"), 26 | "port_number": 27 | 50050 28 | } 29 | mii.deploy(task='text-to-image', 30 | model="runwayml/stable-diffusion-v1-5", 31 | deployment_name="sd_deploy", 32 | mii_config=mii_configs) 33 | print( 34 | "\nText to image model deployment complete! To use this deployment, run the following command: python txt2img-example.py --query\n" 35 | ) 36 | else: 37 | generator = mii.mii_query_handle("sd_deploy") 38 | result = generator.query({ 39 | 'query': 40 | ["a panda in space with a rainbow", 41 | "a soda can on top a snowy mountain"] 42 | }) 43 | for idx, img in enumerate(result.images): 44 | img.save(f"test-{idx}.png") 45 | -------------------------------------------------------------------------------- /mii/legacy/examples/non_persistent/text-generation-bloom560-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {"tensor_parallel": 1, "dtype": "fp16"} 8 | name = "bloom560m" 9 | mii.deploy(task='text-generation', 10 | model="bigscience/bloom-560m", 11 | deployment_name=name + "_deployment", 12 | deployment_type=mii.constants.DeploymentType.NON_PERSISTENT, 13 | mii_config=mii_configs) 14 | generator = mii.mii_query_handle(name + "_deployment") 15 | result = generator.query({'query': ["DeepSpeed is the", "Seattle is"]}) 16 | print(result) 17 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/proto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/proto/build_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./legacymodelresponse.proto 7 | 8 | # update import to be global wrt mii 9 | sed -i 's/legacymodelresponse_pb2/mii.legacy.grpc_related.proto.legacymodelresponse_pb2/g' legacymodelresponse_pb2_grpc.py 10 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/proto/legacymodelresponse.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto3"; 16 | 17 | /*option java_multiple_files = true; 18 | option java_package = "io.grpc.examples.helloworld"; 19 | option java_outer_classname = "HelloWorldProto"; 20 | option objc_class_prefix = "HLW";*/ 21 | 22 | import "google/protobuf/empty.proto"; 23 | 24 | package legacymodelresponse; 25 | 26 | service ModelResponse { 27 | rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} 28 | rpc CreateSession (SessionID) returns (google.protobuf.Empty) {} 29 | rpc DestroySession (SessionID) returns (google.protobuf.Empty) {} 30 | rpc GeneratorReply (MultiStringRequest) returns (MultiStringReply) {} 31 | rpc ClassificationReply (SingleStringRequest) returns (SingleStringReply) {} 32 | rpc QuestionAndAnswerReply(QARequest) returns (SingleStringReply) {} 33 | rpc FillMaskReply(SingleStringRequest) returns (SingleStringReply) {} 34 | rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} 35 | rpc Txt2ImgReply(Text2ImageRequest) returns (ImageReply) {} 36 | rpc ZeroShotImgClassificationReply (ZeroShotImgClassificationRequest) returns (SingleStringReply) {} 37 | rpc InpaintingReply(InpaintingRequest) returns (ImageReply) {} 38 | } 39 | 40 | message Value { 41 | oneof oneof_values { 42 | string svalue = 1; 43 | int64 ivalue = 2; 44 | float fvalue = 3; 45 | bool bvalue = 4; 46 | } 47 | } 48 | 49 | message SessionID { 50 | string session_id = 1; 51 | } 52 | 53 | message SingleStringRequest { 54 | string request = 1; 55 | map<string, Value> query_kwargs = 2; 56 | } 57 | 58 | message MultiStringRequest { 59 | repeated string request = 1; 60 | map<string, Value> query_kwargs = 2; 61 | } 62 | 63 | message SingleStringReply { 64 | string response = 1; 65 | float time_taken = 2; 66 | float model_time_taken = 3; 67 | } 68 | 69 | message MultiStringReply { 70 | repeated string response = 1; 71 | float time_taken = 2; 72 | float model_time_taken = 3; 73 | } 74 | 75 | message QARequest { 76 | string question = 1; 77 | string context = 2; 78 | map<string, Value> query_kwargs = 3; 79 | } 80 | 81 | message ImageReply { 82 | repeated bytes images = 1; 83 | repeated bool nsfw_content_detected = 2; 84 | string mode = 3; 85 | int64 size_w = 4; 86 | int64 size_h = 5; 87 | float time_taken = 6; 88 | } 89 | 90 | message Text2ImageRequest { 91 | repeated string prompt = 1; 92 | repeated string negative_prompt = 2; 93 | map<string, Value> query_kwargs = 3; 94 | } 95 | 96 | message ZeroShotImgClassificationRequest { 97 | string image = 1; 98 | repeated string candidate_labels = 2; 99 | map<string, Value> query_kwargs = 3; 100 | } 101 | 102 | message InpaintingRequest { 103 | repeated string prompt = 1; 104 | repeated bytes image = 2; 105 | repeated bytes mask_image = 3; 106 | repeated string negative_prompt = 4; 107 | map<string, Value> query_kwargs = 5; 108 | } 109 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/restful_gateway.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import time 6 | import threading 7 | import mii.legacy as mii 8 | from flask import Flask, request 9 | from flask_restful import Resource, Api 10 | from werkzeug.serving import make_server 11 | from mii.legacy.constants import RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT, RESTFUL_API_PATH 12 | from google.protobuf.json_format import MessageToJson 13 | 14 | 15 | def shutdown(thread): 16 | time.sleep(RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT) 17 | thread.server.shutdown() 18 | 19 | 20 | def createRestfulGatewayApp(deployment_name, task, lb_port, server_thread): 21 | # client must be thread-safe 22 | client = mii.MIIClient(task, "localhost", lb_port) 23 | 24 | class RestfulGatewayService(Resource): 25 | def __init__(self): 26 | super().__init__() 27 | 28 | def post(self): 29 | data = request.get_json() 30 | kwargs = data["kwargs"] if "kwargs" in data else {} 31 | result = client.query(data["request"], **kwargs) 32 | return MessageToJson(result) 33 | 34 | app = Flask("RestfulGateway") 35 | 36 | @app.route("/terminate", methods=["GET"]) 37 | def terminate(): 38 | # Need to shutdown *after* completing the request 39 | threading.Thread(target=shutdown, args=(server_thread, )).start() 40 | return "Shutting down RESTful API gateway server" 41 | 42 | api = Api(app) 43 | path = "/{}/{}".format(RESTFUL_API_PATH, deployment_name) 44 | api.add_resource(RestfulGatewayService, path) 45 | 46 | return app 47 | 48 | 49 | class RestfulGatewayThread(threading.Thread): 50 | def __init__(self, deployment_name, task, lb_port, rest_port): 51 | threading.Thread.__init__(self) 52 | 53 | app = createRestfulGatewayApp(deployment_name, task, lb_port, self) 54 | self.server = make_server("127.0.0.1", rest_port, app) 55 | self.ctx = app.app_context() 56 | self.ctx.push() 57 | 58 | self._stop_event = threading.Event() 59 | 60 | def run(self): 61 | self.server.serve_forever() 62 | self._stop_event.set() 63 | 64 | def get_stop_event(self): 65 | return self._stop_event 66 | -------------------------------------------------------------------------------- /mii/legacy/launch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/launch/multi_gpu_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import argparse 7 | import base64 8 | import json 9 | 10 | from mii.legacy.config import ModelConfig 11 | from mii.legacy.models.load_models import load_models 12 | from mii.legacy.grpc_related.modelresponse_server import serve_inference, serve_load_balancing 13 | from mii.legacy.grpc_related.restful_gateway import RestfulGatewayThread 14 | 15 | 16 | def b64_encoded_config(config_str): 17 | # str -> bytes 18 | b64_bytes = config_str.encode() 19 | # decode b64 bytes -> json bytes 20 | config_bytes = base64.urlsafe_b64decode(b64_bytes) 21 | # convert json bytes -> str -> dict 22 | config_dict = json.loads(config_bytes.decode()) 23 | # return mii.ModelConfig object 24 | return ModelConfig(**config_dict) 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--deployment-name", type=str, help="Name of deployment") 30 | parser.add_argument( 31 | "--model-config", 32 | type=b64_encoded_config, 33 | help="base64 encoded model config", 34 | ) 35 | parser.add_argument( 36 | "--server-port", 37 | type=int, 38 | default=0, 39 | help="Port to user for DeepSpeed inference server.", 40 | ) 41 | parser.add_argument("--load-balancer", 42 | action="store_true", 43 | help="Launch load balancer process.") 44 | parser.add_argument( 45 | "--load-balancer-port", 46 | type=int, 47 | default=0, 48 | help="Port to use for load balancer.", 49 | ) 50 | parser.add_argument( 51 | "--restful-gateway", 52 | action="store_true", 53 | help="Launches restful gateway process.", 54 | ) 55 | parser.add_argument( 56 | "--restful-gateway-port", 57 | type=int, 58 | default=0, 59 | help="Port to use for restful gateway.", 60 | ) 61 | args = parser.parse_args() 62 | assert not ( 63 | args.load_balancer and args.restful_gateway 64 | ), "Select only load-balancer OR restful-gateway." 65 | 66 | if args.restful_gateway: 67 | assert args.restful_gateway_port, "--restful-gateway-port must be provided." 68 | print(f"Starting RESTful API gateway on port: {args.restful_gateway_port}") 69 | gateway_thread = RestfulGatewayThread( 70 | deployment_name=args.deployment_name, 71 | task=args.model_config.task, 72 | lb_port=args.load_balancer_port, 73 | rest_port=args.restful_gateway_port, 74 | ) 75 | stop_event = gateway_thread.get_stop_event() 76 | gateway_thread.start() 77 | stop_event.wait() 78 | 79 | elif args.load_balancer: 80 | assert args.load_balancer_port, "--load-balancer-port must be provided." 81 | print(f"Starting load balancer on port: {args.load_balancer_port}") 82 | serve_load_balancing(args.model_config, args.load_balancer_port) 83 | 84 | else: 85 | assert args.server_port, "--server-port must be provided." 86 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 87 | port = args.server_port + local_rank 88 | 89 | inference_pipeline = load_models(args.model_config) 90 | 91 | print(f"Starting server on port: {port}") 92 | serve_inference(inference_pipeline, port) 93 | 94 | 95 | if __name__ == "__main__": 96 | # python -m mii.launch.multi_gpu_server 97 | main() 98 | -------------------------------------------------------------------------------- /mii/legacy/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import sys 6 | import logging 7 | 8 | log_levels = { 9 | "debug": logging.DEBUG, 10 | "info": logging.INFO, 11 | "warning": logging.WARNING, 12 | "error": logging.ERROR, 13 | "critical": logging.CRITICAL, 14 | } 15 | 16 | 17 | class LoggerFactory: 18 | @staticmethod 19 | def create_logger(name=None, level=logging.INFO): 20 | """create a logger 21 | Args: 22 | name (str): name of the logger 23 | level: level of logger 24 | Raises: 25 | ValueError is name is None 26 | """ 27 | 28 | if name is None: 29 | raise ValueError("name for logger cannot be None") 30 | 31 | formatter = logging.Formatter( 32 | "[%(asctime)s] [%(levelname)s] " 33 | "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") 34 | 35 | logger_ = logging.getLogger(name) 36 | logger_.setLevel(level) 37 | logger_.propagate = False 38 | ch = logging.StreamHandler(stream=sys.stdout) 39 | ch.setLevel(level) 40 | ch.setFormatter(formatter) 41 | logger_.addHandler(ch) 42 | return logger_ 43 | 44 | 45 | logger = LoggerFactory.create_logger(name="MII_legacy", level=logging.INFO) 46 | -------------------------------------------------------------------------------- /mii/legacy/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .score import create_score_file 6 | from .load_models import load_models 7 | -------------------------------------------------------------------------------- /mii/legacy/models/load_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii.legacy as mii 7 | import torch 8 | import inspect 9 | import deepspeed 10 | from deepspeed.runtime.config import DeepSpeedConfig 11 | from deepspeed.runtime.zero.config import ZeroStageEnum 12 | 13 | 14 | def load_models(model_config): 15 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 16 | world_size = int(os.getenv("WORLD_SIZE", "1")) 17 | 18 | inf_config = { 19 | "tensor_parallel": { 20 | "tp_size": model_config.tensor_parallel, 21 | "mpu": None 22 | }, 23 | "dtype": model_config.dtype, 24 | "replace_method": "auto", 25 | "enable_cuda_graph": model_config.enable_cuda_graph, 26 | "checkpoint": None, 27 | "config": None, 28 | "training_mp_size": 1, 29 | "replace_with_kernel_inject": model_config.replace_with_kernel_inject, 30 | "max_tokens": model_config.max_tokens, 31 | "min_tokens": model_config.max_tokens, 32 | } 33 | 34 | provider = model_config.provider 35 | if provider == mii.constants.ModelProvider.HUGGING_FACE: 36 | from mii.legacy.models.providers.huggingface import hf_provider 37 | 38 | inference_pipeline = hf_provider(model_config) 39 | if model_config.meta_tensor: 40 | inf_config["checkpoint"] = inference_pipeline.checkpoint_dict 41 | if model_config.dtype == torch.int8: 42 | # Support for older DeepSpeed versions 43 | if ("enable_qkv_quantization" 44 | in inspect.signature(deepspeed.init_inference).parameters): 45 | inf_config["enable_qkv_quantization"] = True 46 | elif provider == mii.constants.ModelProvider.ELEUTHER_AI: 47 | assert False, "Eleuther AI support is currently disabled." 
48 | # TODO: Re-enable EleutherAI model support 49 | """ 50 | from mii.models.providers.eleutherai import eleutherai_provider 51 | assert mii_config.dtype == torch.half, "gpt-neox only support fp16" 52 | assert mii_config.enable_cuda_graph == False, "Provider EleutherAI not supported with Cuda Graphs" 53 | from megatron import mpu 54 | inf_config["tensor_parallel"]["mpu"] = mpu 55 | inference_pipeline = eleutherai_provider(model_path, 56 | model_name, 57 | task_name, 58 | mii_config) 59 | inf_config["training_mp_size"] = 2 60 | inf_config["config"] = inference_pipeline.neox_args 61 | """ 62 | elif provider == mii.constants.ModelProvider.DIFFUSERS: 63 | from mii.legacy.models.providers.diffusers import diffusers_provider 64 | inference_pipeline = diffusers_provider(model_config) 65 | else: 66 | raise ValueError(f"Unknown model provider {provider}") 67 | print( 68 | f"> --------- MII Settings: ds_optimize={model_config.enable_deepspeed}, replace_with_kernel_inject={model_config.replace_with_kernel_inject}, enable_cuda_graph={model_config.enable_cuda_graph} " 69 | ) 70 | if model_config.enable_deepspeed: 71 | engine = deepspeed.init_inference(getattr(inference_pipeline, 72 | "model", 73 | inference_pipeline), 74 | config=inf_config) 75 | if model_config.profile_model_time: 76 | engine.profile_model_time() 77 | if hasattr(inference_pipeline, "model"): 78 | inference_pipeline.model = engine 79 | 80 | elif model_config.enable_zero: 81 | ds_config = DeepSpeedConfig(model_config.ds_config) 82 | assert ( 83 | ds_config.zero_optimization_stage == ZeroStageEnum.weights 84 | ), "DeepSpeed ZeRO inference is only supported for ZeRO-3" 85 | 86 | # initialise Deepspeed ZeRO and store only the engine object 87 | ds_engine = deepspeed.initialize(model=inference_pipeline.model, 88 | config=model_config.ds_config)[0] 89 | ds_engine.module.eval() # inference 90 | inference_pipeline.model = ds_engine.module 91 | 92 | if model_config.load_with_sys_mem: 93 | inference_pipeline.device = torch.device(f"cuda:{local_rank}") 94 | 95 | # Free up memory used when initially loading models 96 | # so nvidia-smi reports correct amount of memory used. 97 | torch.cuda.empty_cache() 98 | 99 | return inference_pipeline 100 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/diffusers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | 8 | from .utils import attempt_load 9 | from mii.config import ModelConfig 10 | 11 | 12 | def diffusers_provider(model_config: ModelConfig): 13 | from diffusers import DiffusionPipeline 14 | 15 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 16 | 17 | kwargs = model_config.pipeline_kwargs 18 | if model_config.dtype == torch.half: 19 | kwargs["torch_dtype"] = torch.float16 20 | kwargs["revision"] = "fp16" 21 | 22 | pipeline = attempt_load(DiffusionPipeline.from_pretrained, 23 | model_config.model, 24 | model_config.model_path, 25 | kwargs=kwargs) 26 | pipeline = pipeline.to(f"cuda:{local_rank}") 27 | pipeline.set_progress_bar_config(disable=True) 28 | return pipeline 29 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/eleutherai.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from mii.utils import is_aml, mii_cache_path 7 | 8 | 9 | def attempt_load(load_fn, model_name, model_path, cache_path=None, kwargs={}): 10 | try: 11 | value = load_fn(model_name, **kwargs) 12 | except Exception as ex: 13 | if is_aml(): 14 | print( 15 | f"Attempted load but failed - {str(ex)}, retrying using model_path={model_path}" 16 | ) 17 | value = load_fn(model_path, **kwargs) 18 | else: 19 | cache_path = cache_path or mii_cache_path() 20 | print( 21 | f"Attempted load but failed - {str(ex)}, retrying using cache_dir={cache_path}" 22 | ) 23 | value = load_fn(model_name, cache_dir=cache_path, **kwargs) 24 | return value 25 | -------------------------------------------------------------------------------- /mii/legacy/models/score/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .generate import create_score_file, generated_score_path 6 | -------------------------------------------------------------------------------- /mii/legacy/models/score/generate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii.legacy as mii 7 | import pprint 8 | from mii.legacy.logging import logger 9 | from mii.legacy.constants import DeploymentType 10 | 11 | 12 | def create_score_file(mii_config): 13 | if len(mii.__path__) > 1: 14 | logger.warning( 15 | f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" 16 | ) 17 | 18 | with open(os.path.join(mii.__path__[0], 19 | "models/score/score_template.py"), 20 | "r") as fd: 21 | score_src = fd.read() 22 | 23 | # update score file w. 
global config dict 24 | config_dict = mii_config.dict() 25 | source_with_config = f"{score_src}\n" 26 | source_with_config += f"mii_config = {pprint.pformat(config_dict, indent=4)}" 27 | 28 | with open( 29 | generated_score_path(mii_config.deployment_name, 30 | mii_config.deployment_type), 31 | "w") as fd: 32 | fd.write(source_with_config) 33 | fd.write("\n") 34 | 35 | 36 | def generated_score_path(deployment_name, deployment_type): 37 | if deployment_type == DeploymentType.LOCAL: 38 | score_path = os.path.join(mii.utils.mii_cache_path(), deployment_name) 39 | elif deployment_type == DeploymentType.AML: 40 | score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_name), 41 | "code") 42 | if not os.path.isdir(score_path): 43 | os.makedirs(score_path) 44 | return os.path.join(score_path, "score.py") 45 | -------------------------------------------------------------------------------- /mii/legacy/models/score/score_template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # flake8: noqa 7 | import os 8 | import json 9 | import time 10 | import torch 11 | 12 | import mii.legacy as mii 13 | 14 | model = None 15 | 16 | 17 | def init(): 18 | global mii_config 19 | mii_config = mii.MIIConfig(**mii_config) 20 | 21 | # For AML deployments, we stand up multiple nginx server workers, one for 22 | # each replica. This is so that we can properly run multiple requests in 23 | # parallel on the different replicas. However, each worker will run this 24 | # generated score.py and try to stand up an entire MII deployment 25 | # (load-balancer, replicas, etc.). We want only one worker to spawn the 26 | # load-balancer and replicas. We take advantage of the nginx worker PIDs 27 | # being consecutive to achieve that here. 28 | start_server = True 29 | if mii.utils.is_aml() and (int(os.getpid()) % mii_config.replica_num != 0): 30 | start_server = False 31 | 32 | if start_server: 33 | mii.MIIServer(mii_config) 34 | 35 | global model 36 | model = None 37 | 38 | # In AML deployments both the GRPC client and server are used in the same process 39 | if mii.utils.is_aml(): 40 | model = mii.MIIClient(mii_config=mii_config) 41 | 42 | 43 | def run(request): 44 | global mii_config, model 45 | assert ( 46 | model is not None 47 | ), "grpc client has not been setup when this model was created" 48 | 49 | request_dict = json.loads(request) 50 | 51 | query_dict = mii.utils.extract_query_dict(mii_config.task, request_dict) 52 | 53 | response = model.query(query_dict, **request_dict) 54 | 55 | time_taken = response.time_taken 56 | if not isinstance(response.response, str): 57 | response = [r for r in response.response] 58 | return json.dumps({"responses": response, "time": time_taken}) 59 | 60 | 61 | ### Auto-generated config will be appended below at run-time 62 | -------------------------------------------------------------------------------- /mii/legacy/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import io 7 | from mii.legacy.utils import mii_cache_path 8 | 9 | 10 | def supported_models_from_huggingface(): 11 | return ["gpt2", "deepset/roberta-large-squad2"] 12 | 13 | 14 | """TODO make this more robust. 
If the pipeline has already been imported then 15 | this might not work since the cache is set by the first import""" 16 | 17 | 18 | def _download_hf_model_to_path(task, model_name, model_path): 19 | 20 | os.environ["TRANSFORMERS_CACHE"] = model_path 21 | from transformers import pipeline 22 | 23 | inference_pipeline = pipeline(task, model=model_name) 24 | 25 | 26 | """generic method that will allow downloading all models that we support. 27 | Currently only supports HF models, but will be extended to support model checkpoints 28 | from other sources""" 29 | 30 | 31 | def download_model_and_get_path(task, model_name): 32 | 33 | model_path = os.path.join(mii_cache_path(), model_name) 34 | if not os.path.isdir(model_path): 35 | os.makedirs(model_path) 36 | 37 | if model_name in supported_models_from_huggingface(): 38 | _download_hf_model_to_path(task, model_name, model_path) 39 | else: 40 | assert False, "Only models from HF supported so far" 41 | 42 | return model_path 43 | 44 | 45 | class ImageResponse: 46 | def __init__(self, response): 47 | self._response = response 48 | self.nsfw_content_detected = response.nsfw_content_detected 49 | self._deserialized_images = None 50 | 51 | @property 52 | def images(self): 53 | if self._deserialized_images is None: 54 | from PIL import Image 55 | 56 | images = [] 57 | for idx, img_bytes in enumerate(self._response.images): 58 | size = (self._response.size_w, self._response.size_h) 59 | img = Image.frombytes(self._response.mode, size, img_bytes) 60 | images.append(img) 61 | self._deserialized_images = images 62 | return self._deserialized_images 63 | 64 | 65 | def convert_bytes_to_pil_image(image_bytes: bytes): 66 | """Converts bytes to a PIL Image object.""" 67 | if not isinstance(image_bytes, bytes): 68 | return image_bytes 69 | 70 | from PIL import Image 71 | image = Image.open(io.BytesIO(image_bytes)) 72 | return image 73 | -------------------------------------------------------------------------------- /mii/legacy/terminate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import grpc 6 | 7 | import mii.legacy as mii 8 | from mii.legacy.logging import logger 9 | 10 | 11 | def terminate(deployment_name): 12 | logger.info(f"Terminating server for {deployment_name}") 13 | generator = mii.mii_query_handle(deployment_name) 14 | if deployment_name in mii.non_persistent_models: 15 | generator.terminate() 16 | return 17 | try: 18 | generator.query({"query": ""}) 19 | except grpc.aio._call.AioRpcError as error: 20 | if error._code == grpc.StatusCode.UNAVAILABLE: 21 | logger.warn(f"Server for {deployment_name} not found") 22 | else: 23 | pass 24 | except (KeyError, TypeError) as error: 25 | pass 26 | 27 | generator.terminate() 28 | mii.client.terminate_restful_gateway(deployment_name) 29 | -------------------------------------------------------------------------------- /mii/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import logging 6 | import sys 7 | 8 | log_levels = { 9 | "debug": logging.DEBUG, 10 | "info": logging.INFO, 11 | "warning": logging.WARNING, 12 | "error": logging.ERROR, 13 | "critical": logging.CRITICAL, 14 | } 15 | 16 | 17 | class LoggerFactory: 18 | @staticmethod 19 | def create_logger(name=None, level=logging.INFO): 20 | """create a logger 21 | Args: 22 | name (str): name of the logger 23 | level: level of logger 24 | Raises: 25 | ValueError is name is None 26 | """ 27 | 28 | if name is None: 29 | raise ValueError("name for logger cannot be None") 30 | 31 | formatter = logging.Formatter( 32 | "[%(asctime)s] [%(levelname)s] " 33 | "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") 34 | 35 | logger_ = logging.getLogger(name) 36 | logger_.setLevel(level) 37 | logger_.propagate = False 38 | ch = logging.StreamHandler(stream=sys.stdout) 39 | ch.setLevel(level) 40 | ch.setFormatter(formatter) 41 | logger_.addHandler(ch) 42 | return logger_ 43 | 44 | 45 | logger = LoggerFactory.create_logger(name="MII", level=logging.INFO) 46 | -------------------------------------------------------------------------------- /mii/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/modeling/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from deepspeed.inference import build_hf_engine, InferenceEngineV2 7 | 8 | from mii.config import ModelConfig 9 | from mii.constants import ModelProvider 10 | from mii.utils import init_distributed 11 | 12 | 13 | def load_model(model_config: ModelConfig) -> InferenceEngineV2: 14 | init_distributed(model_config) 15 | provider = model_config.provider 16 | if provider == ModelProvider.HUGGING_FACE: 17 | inference_engine = build_hf_engine( 18 | path=model_config.model_name_or_path, 19 | engine_config=model_config.inference_engine_config) 20 | else: 21 | raise ValueError(f"Unknown model provider {provider}") 22 | 23 | return inference_engine 24 | -------------------------------------------------------------------------------- /mii/modeling/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import TYPE_CHECKING, Union 8 | 9 | import torch 10 | from transformers import AutoTokenizer 11 | 12 | from mii.constants import ModelProvider 13 | 14 | if TYPE_CHECKING: 15 | from mii.config import ModelConfig 16 | 17 | 18 | class MIITokenizerWrapper(ABC): 19 | def __init__(self, tokenizer: object) -> None: 20 | self.tokenizer = tokenizer 21 | 22 | @property 23 | @abstractmethod 24 | def vocab_size(self) -> int: 25 | ... 26 | 27 | @property 28 | @abstractmethod 29 | def eos_token_id(self) -> int: 30 | ... 31 | 32 | @abstractmethod 33 | def encode(self, input: str) -> torch.Tensor: 34 | ... 35 | 36 | @abstractmethod 37 | def decode(self, tokens: torch.Tensor) -> str: 38 | ... 
39 | 40 | 41 | class HFTokenizer(MIITokenizerWrapper): 42 | def __init__(self, tokenizer: Union[str, object]) -> None: 43 | if isinstance(tokenizer, str): 44 | tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True) 45 | tokenizer.pad_token = tokenizer.eos_token 46 | super().__init__(tokenizer) 47 | 48 | @property 49 | def vocab_size(self) -> int: 50 | return len(self.tokenizer) 51 | 52 | @property 53 | def eos_token_id(self) -> int: 54 | eos_token_attrs = ["eod", "eos_token_id", "eos_token", "eod_id"] 55 | for attr in eos_token_attrs: 56 | if getattr(self.tokenizer, attr, None) is not None: 57 | return getattr(self.tokenizer, attr) 58 | raise ValueError(f"Tokenizer must have one of {eos_token_attrs} attributes.") 59 | 60 | def encode(self, input: str) -> torch.Tensor: 61 | return self.tokenizer.encode(input, return_tensors="pt").flatten() 62 | 63 | def convert_tokens_to_ids(self, input: str) -> int: 64 | return self.tokenizer.convert_tokens_to_ids(input) 65 | 66 | def decode(self, tokens: torch.Tensor) -> str: 67 | return self.tokenizer.decode(tokens) 68 | 69 | 70 | def load_tokenizer(model_config: "ModelConfig") -> MIITokenizerWrapper: 71 | provider = model_config.provider 72 | if provider == ModelProvider.HUGGING_FACE: 73 | tokenizer = HFTokenizer(model_config.tokenizer) 74 | else: 75 | raise ValueError(f"Unknown model provider {provider}") 76 | 77 | return tokenizer 78 | -------------------------------------------------------------------------------- /mii/score/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .generate import create_score_file, generated_score_path 6 | -------------------------------------------------------------------------------- /mii/score/generate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii 7 | import pprint 8 | from mii.logging import logger 9 | from mii.constants import DeploymentType 10 | 11 | 12 | def create_score_file(mii_config): 13 | if len(mii.__path__) > 1: 14 | logger.warning( 15 | f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" 16 | ) 17 | 18 | with open(os.path.join(mii.__path__[0], "score/score_template.py"), "r") as fd: 19 | score_src = fd.read() 20 | 21 | # update score file w. 
global config dict 22 | config_dict = mii_config.model_dump() 23 | source_with_config = f"{score_src}\n" 24 | source_with_config += f"mii_config = {pprint.pformat(config_dict, indent=4)}" 25 | 26 | with open( 27 | generated_score_path(mii_config.deployment_name, 28 | mii_config.deployment_type), 29 | "w") as fd: 30 | fd.write(source_with_config) 31 | fd.write("\n") 32 | 33 | 34 | def generated_score_path(deployment_name, deployment_type): 35 | if deployment_type == DeploymentType.LOCAL: 36 | score_path = os.path.join(mii.utils.mii_cache_path(), deployment_name) 37 | elif deployment_type == DeploymentType.AML: 38 | score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_name), 39 | "code") 40 | if not os.path.isdir(score_path): 41 | os.makedirs(score_path) 42 | return os.path.join(score_path, "score.py") 43 | -------------------------------------------------------------------------------- /mii/score/score_template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # flake8: noqa 7 | import os 8 | import json 9 | import time 10 | import torch 11 | 12 | import mii 13 | 14 | model = None 15 | 16 | 17 | def init(): 18 | global mii_config 19 | mii_config = mii.config.MIIConfig(**mii_config) 20 | 21 | # For AML deployments, we stand up multiple nginx server workers, one for 22 | # each replica. This is so that we can properly run multiple requests in 23 | # parallel on the different replicas. However, each worker will run this 24 | # generated score.py and try to stand up an entire MII deployment 25 | # (load-balancer, replicas, etc.). We want only one worker to spawn the 26 | # load-balancer and replicas. We take advantage of the nginx worker PIDs 27 | # being consecutive to achieve that here. 
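    # For illustration (the PID values here are hypothetical): among any
    # replica_num consecutive integers exactly one is divisible by replica_num,
    # so with replica_num == 4 and nginx workers holding PIDs 1000, 1001, 1002
    # and 1003, exactly one worker sees os.getpid() % mii_config.replica_num == 0
    # and stands up the server; the others only create the gRPC client below.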
28 | start_server = True 29 | if mii.utils.is_aml() and (int(os.getpid()) % mii_config.replica_num != 0): 30 | start_server = False 31 | 32 | if start_server: 33 | mii.backend.MIIServer(mii_config) 34 | 35 | global model 36 | model = None 37 | 38 | # In AML deployments both the GRPC client and server are used in the same process 39 | if mii.utils.is_aml(): 40 | model = mii.backend.MIIClient(mii_config=mii_config) 41 | 42 | 43 | def run(request): 44 | global mii_config, model 45 | assert ( 46 | model is not None 47 | ), "grpc client has not been setup when this model was created" 48 | 49 | request_dict = json.loads(request) 50 | 51 | query_dict = mii.utils.extract_query_dict(mii_config.task, request_dict) 52 | 53 | response = model.query(query_dict, **request_dict) 54 | 55 | time_taken = response.time_taken 56 | if not isinstance(response.response, str): 57 | response = [r for r in response.response] 58 | return json.dumps({"responses": response, "time": time_taken}) 59 | 60 | 61 | ### Auto-generated config will be appended below at run-time 62 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "build", 4 | "setuptools>=64", 5 | "wheel" 6 | ] 7 | # Use legacy backend to import local packages in setup.py 8 | build-backend = "setuptools.build_meta:__legacy__" 9 | -------------------------------------------------------------------------------- /release/bump_patch_version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import argparse 7 | from packaging import version as pkg_version 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument( 12 | "--current_version", 13 | type=str, 14 | help="The current version being published to help set the next version.") 15 | 16 | args = parser.parse_args() 17 | 18 | current_version = pkg_version.parse(args.current_version) 19 | 20 | with open('./version.txt', 'w') as fd: 21 | fd.write( 22 | f'{current_version.major}.{current_version.minor}.{current_version.micro + 1}\n') 23 | 24 | print( 25 | f'{current_version} -> {current_version.major}.{current_version.minor}.{current_version.micro + 1}' 26 | ) 27 | -------------------------------------------------------------------------------- /release/check_release_version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import argparse 7 | from packaging import version as pkg_version 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument("--release_version", 12 | type=str, 13 | help="The new version being published.") 14 | 15 | args = parser.parse_args() 16 | 17 | release_version = pkg_version.parse(args.release_version) 18 | 19 | with open('./version.txt') as fd: 20 | repo_version = pkg_version.parse(fd.read()) 21 | 22 | assert repo_version == release_version, f"{repo_version=} does not match {release_version=}, unable to proceed" 23 | -------------------------------------------------------------------------------- /release/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | cd .. 8 | 9 | if [ ! -f ~/.pypirc ]; then 10 | echo 'create .pypirc in order to upload to PyPI' 11 | exit 1 12 | fi 13 | 14 | version=$1 15 | 16 | if [ -z $version ]; then 17 | echo "please provide version number for release" 18 | exit 1 19 | fi 20 | 21 | if [[ $version == *"v"* ]]; then 22 | echo "please only include version number without 'v' prefix" 23 | exit 1 24 | fi 25 | 26 | if [ "${version}" != `cat version.txt` ]; then 27 | echo "version=${version} does not match version.txt" 28 | cat version.txt 29 | exit 1 30 | fi 31 | 32 | python -c "import twine" 33 | if [ $? != 0 ]; then 34 | echo 'please install twine via pip' 35 | exit 1 36 | fi 37 | 38 | MII_BUILD_STRING="" python -m build --wheel 39 | WHL=deepspeed_mii-${version}-py3-none-any.whl 40 | 41 | if [ ! -f dist/${WHL} ]; then 42 | echo "prepared version does not match version given ($version), bump version first?" 43 | ls dist 44 | exit 1 45 | fi 46 | 47 | python -m twine upload --verbose dist/${WHL} --repository mii 48 | 49 | git tag v${version} 50 | git push origin v${version} 51 | 52 | echo "bumping up patch version" 53 | cd - 54 | python bump_patch_version.py 55 | -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | clang-format==18.1.3 2 | einops 3 | pre-commit>=2.20.0 4 | pytest 5 | pytest-forked 6 | sentencepiece 7 | tiktoken 8 | transformers-stream-generator 9 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | asyncio 3 | deepspeed>=0.15.0 4 | deepspeed-kernels 5 | fastapi 6 | fastchat 7 | Flask-RESTful 8 | grpcio 9 | grpcio-tools 10 | Pillow 11 | pydantic>=2.0.0 12 | pyzmq 13 | safetensors 14 | shortuuid 15 | torch 16 | transformers 17 | ujson 18 | Werkzeug 19 | -------------------------------------------------------------------------------- /scripts/check-license.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | from __future__ import annotations 7 | '''Copyright The Microsoft DeepSpeed Team''' 8 | """ 9 | Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py 10 | """ 11 | 12 | import subprocess 13 | import sys 14 | 15 | 16 | def err(s: str) -> None: 17 | print(s, file=sys.stderr) 18 | 19 | 20 | COPYRIGHT = [ 21 | r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", 22 | r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$", 23 | r"^\(\/\/\|#\) DeepSpeed Team$" 24 | ] 25 | 26 | success = True 27 | failures = [] 28 | for f in sys.argv[1:]: 29 | for copyright_line in COPYRIGHT: 30 | if not success: 31 | break 32 | res = subprocess.run(["git", 33 | "grep", 34 | "--quiet", 35 | "-e", 36 | copyright_line, 37 | f], 38 | capture_output=True) 39 | if res.returncode == 1: 40 | success = False 41 | failures.append(f) 42 | elif res.returncode == 2: 43 | err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") 44 | err(res.stderr.decode("utf-8")) 45 | sys.exit(2) 46 | 47 | if not success: 48 | err(f'{failures}: Missing license at top of file') 49 | err(res.stdout.decode("utf-8")) 50 | sys.exit(1) 51 | -------------------------------------------------------------------------------- /scripts/model_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | import os 7 | import argparse 8 | 9 | from huggingface_hub import HfApi 10 | from transformers import AutoConfig, AutoTokenizer, AutoModel 11 | 12 | 13 | def dir_path(path_str): 14 | if os.path.isdir(path_str): 15 | return path_str 16 | elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": 17 | os.makedirs(path_str) 18 | return path_str 19 | else: 20 | raise NotADirectoryError(path_str) 21 | 22 | 23 | class HFModelNotFoundError(Exception): 24 | def __init__(self, model_str): 25 | super().__init__(f"HuggingFace model not found: '{model_str}'") 26 | 27 | 28 | def hf_model(model_str): 29 | api = HfApi() 30 | models = [m.id for m in api.list_models()] 31 | if model_str in models: 32 | return model_str 33 | else: 34 | raise HFModelNotFoundError(model_str) 35 | 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--model_path", 39 | '-o', 40 | type=dir_path, 41 | required=True, 42 | help="Output directory for downloaded model files") 43 | parser.add_argument("--model_name", 44 | '-m', 45 | type=hf_model, 46 | required=True, 47 | help="HuggingFace model name") 48 | args = parser.parse_args() 49 | 50 | for auto_func in [AutoConfig, AutoTokenizer, AutoModel]: 51 | auto_func.from_pretrained(args.model_name, cache_dir=args.model_path) 52 | 53 | print(f"Cached files for '{args.model_name}' downloaded to '{args.model_path}'") 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import sys 7 | import subprocess 8 | from setuptools import setup, find_packages 9 | 10 | 11 | def fetch_requirements(path): 12 | with open(path, 'r') as fd: 13 | return [r.strip() for r in fd.readlines()] 14 | 15 | 16 | install_requires = fetch_requirements('requirements/requirements.txt') 17 | 18 | extras_require = {"dev": fetch_requirements('requirements/requirements-dev.txt')} 19 | 20 | 21 | def command_exists(cmd): 22 | if sys.platform == "win32": 23 | result = subprocess.Popen(f'{cmd}', stdout=subprocess.PIPE, shell=True) 24 | return result.wait() == 1 25 | else: 26 | result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) 27 | return result.wait() == 0 28 | 29 | 30 | # Write out version/git info 31 | git_hash_cmd = "git rev-parse --short HEAD" 32 | git_branch_cmd = "git rev-parse --abbrev-ref HEAD" 33 | if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: 34 | try: 35 | result = subprocess.check_output(git_hash_cmd, shell=True) 36 | git_hash = result.decode('utf-8').strip() 37 | result = subprocess.check_output(git_branch_cmd, shell=True) 38 | git_branch = result.decode('utf-8').strip() 39 | except subprocess.CalledProcessError: 40 | git_hash = "unknown" 41 | git_branch = "unknown" 42 | else: 43 | git_hash = "unknown" 44 | git_branch = "unknown" 45 | 46 | # Parse the MII version string from version.txt 47 | version_str = open('version.txt', 'r').read().strip() 48 | 49 | # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. 50 | # example: MII_BUILD_STR=".dev20201022" python -m build --sdist --wheel 51 | 52 | MII_BUILD_STRING = 'MII_BUILD_STRING' 53 | BUILD_FILE = 'build.txt' 54 | mii_build_string = os.environ.get(MII_BUILD_STRING) 55 | 56 | # Building wheel for distribution, update version file 57 | if mii_build_string: 58 | # Build string env specified, probably building for distribution 59 | with open(BUILD_FILE, 'w') as fd: 60 | fd.write(mii_build_string) 61 | version_str += mii_build_string 62 | elif os.path.isfile(BUILD_FILE): 63 | # build.txt exists, probably installing from distribution 64 | with open(BUILD_FILE, 'r') as fd: 65 | version_str += fd.read().strip() 66 | else: 67 | # None of the above, probably installing from source 68 | version_str += f'+{git_hash}' 69 | 70 | # write out installed version 71 | with open("mii/version.py", 'w') as fd: 72 | fd.write(f"__version__ = '{version_str}'\n") 73 | 74 | # Parse README.md to make long_description for PyPI page. 
75 | thisdir = os.path.abspath(os.path.dirname(__file__)) 76 | with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin: 77 | readme_text = fin.read() 78 | print("PACKAGES", find_packages()) 79 | setup(name="deepspeed-mii", 80 | version=version_str, 81 | long_description=readme_text, 82 | long_description_content_type='text/markdown', 83 | description='deepspeed mii', 84 | author='DeepSpeed Team', 85 | author_email='deepspeed-mii@microsoft.com', 86 | url='http://deepspeed.ai', 87 | project_urls={ 88 | 'Documentation': 'https://github.com/deepspeedai/DeepSpeed-MII', 89 | 'Source': 'https://github.com/deepspeedai/DeepSpeed-MII', 90 | }, 91 | install_requires=install_requires, 92 | extras_require=extras_require, 93 | packages=find_packages(exclude=("tests", 94 | )), 95 | classifiers=[ 96 | 'Programming Language :: Python :: 3.8', 97 | 'Programming Language :: Python :: 3.9', 98 | 'Programming Language :: Python :: 3.10', 99 | 'Programming Language :: Python :: 3.11', 100 | 'Programming Language :: Python :: 3.12' 101 | ]) 102 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import time 8 | import os 9 | import mii 10 | from types import SimpleNamespace 11 | from typing import Union 12 | from deepspeed.launcher.runner import DLTS_HOSTFILE 13 | import deepspeed.comm as dist 14 | from huggingface_hub import snapshot_download 15 | 16 | 17 | @pytest.fixture(scope="function", params=[None]) 18 | def tensor_parallel(request): 19 | if request.param is not None: 20 | return request.param 21 | return int(os.getenv("WORLD_SIZE", "1")) 22 | 23 | 24 | @pytest.fixture(scope="function", params=[50050]) 25 | def port_number(request): 26 | return request.param 27 | 28 | 29 | @pytest.fixture(scope="function", params=[1]) 30 | def replica_num(request): 31 | return request.param 32 | 33 | 34 | @pytest.fixture(scope="function", params=[mii.config.DEVICE_MAP_DEFAULT]) 35 | def device_map(request): 36 | return request.param 37 | 38 | 39 | @pytest.fixture(scope="function", params=[False]) 40 | def enable_restful_api(request): 41 | return request.param 42 | 43 | 44 | @pytest.fixture(scope="function", params=[28080]) 45 | def restful_api_port(request): 46 | return request.param 47 | 48 | 49 | @pytest.fixture(scope="function", params=[None]) 50 | def hostfile_content(request): 51 | return request.param 52 | 53 | 54 | @pytest.fixture(scope="function", params=[DLTS_HOSTFILE]) 55 | def hostfile(request, hostfile_content, tmpdir): 56 | if hostfile_content is None: 57 | return request.param 58 | hostfile_path = tmpdir.join("hostfile") 59 | with open(hostfile_path, "w") as f: 60 | for line in hostfile_content: 61 | f.write(line + "\n") 62 | return str(hostfile_path) 63 | 64 | 65 | @pytest.fixture(scope="function", params=[mii.TaskType.TEXT_GENERATION]) 66 | def task_name(request): 67 | return request.param 68 | 69 | 70 | @pytest.fixture(scope="function", params=["facebook/opt-125m"]) 71 | def model_name(request): 72 | return request.param 73 | 74 | 75 | 
@pytest.fixture(scope="function", params=[False]) 76 | def local_model(request): 77 | return request.param 78 | 79 | 80 | @pytest.fixture(scope="function") 81 | def model_path(model_name, local_model, tmpdir): 82 | if not local_model: 83 | return None 84 | 85 | base_dir = os.getenv("HF_HOME", tmpdir) 86 | download_dir = os.path.join(base_dir, "mii-ci-models", model_name) 87 | snapshot_download(model_name, local_dir=download_dir) 88 | return download_dir 89 | 90 | 91 | @pytest.fixture(scope="function") 92 | def model_name_or_path(model_name, model_path): 93 | if model_path is not None: 94 | return model_path 95 | return model_name 96 | 97 | 98 | @pytest.fixture(scope="function", params=["test-dep"]) 99 | def deployment_name(request): 100 | return request.param 101 | 102 | 103 | @pytest.fixture(scope="function", params=[mii.DeploymentType.LOCAL]) 104 | def deployment_type(request): 105 | return request.param 106 | 107 | 108 | @pytest.fixture(scope="function", params=[True]) 109 | def all_rank_output(request): 110 | return request.param 111 | 112 | 113 | @pytest.fixture(scope="function") 114 | def model_config( 115 | model_name_or_path: str, 116 | task_name: str, 117 | tensor_parallel: int, 118 | replica_num: int, 119 | device_map: Union[str, 120 | dict], 121 | ): 122 | config = SimpleNamespace( 123 | model_name_or_path=model_name_or_path, 124 | task=task_name, 125 | tensor_parallel=tensor_parallel, 126 | replica_num=replica_num, 127 | device_map=device_map, 128 | ) 129 | return config.__dict__ 130 | 131 | 132 | @pytest.fixture(scope="function") 133 | def mii_config( 134 | deployment_name: str, 135 | deployment_type: str, 136 | port_number: int, 137 | enable_restful_api: bool, 138 | restful_api_port: int, 139 | hostfile: str, 140 | model_config: dict, 141 | ): 142 | config = SimpleNamespace( 143 | deployment_name=deployment_name, 144 | deployment_type=deployment_type, 145 | port_number=port_number, 146 | enable_restful_api=enable_restful_api, 147 | restful_api_port=restful_api_port, 148 | hostfile=hostfile, 149 | model_config=model_config, 150 | ) 151 | return config.__dict__ 152 | 153 | 154 | @pytest.fixture(scope="function", params=[None], ids=["nofail"]) 155 | def expected_failure(request): 156 | return request.param 157 | 158 | 159 | @pytest.fixture(scope="function") 160 | def pipeline(model_config, all_rank_output, expected_failure): 161 | if expected_failure is not None: 162 | with pytest.raises(expected_failure) as excinfo: 163 | mii.pipeline(model_config=model_config, all_rank_output=all_rank_output) 164 | yield excinfo 165 | else: 166 | pipe = mii.pipeline(model_config=model_config, all_rank_output=all_rank_output) 167 | yield pipe 168 | pipe.destroy() 169 | dist.destroy_process_group() 170 | 171 | 172 | @pytest.fixture(scope="function") 173 | def deployment(mii_config, expected_failure): 174 | if expected_failure is not None: 175 | with pytest.raises(expected_failure) as excinfo: 176 | mii.serve(mii_config=mii_config) 177 | yield excinfo 178 | else: 179 | client = mii.serve(mii_config=mii_config) 180 | yield client 181 | client.terminate_server() 182 | time.sleep(1) # Give a second for ports to be released 183 | 184 | 185 | @pytest.fixture(scope="function", params=["DeepSpeed is the greatest"], ids=["query0"]) 186 | def query(request): 187 | return request.param 188 | -------------------------------------------------------------------------------- /tests/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft 
Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /tests/legacy/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import os 8 | import mii.legacy as mii 9 | from types import SimpleNamespace 10 | 11 | 12 | @pytest.fixture(scope="function", params=["fp16"]) 13 | def dtype(request): 14 | return request.param 15 | 16 | 17 | @pytest.fixture(scope="function", params=[1]) 18 | def tensor_parallel(request): 19 | return request.param 20 | 21 | 22 | @pytest.fixture(scope="function", params=[50050]) 23 | def port_number(request): 24 | return request.param 25 | 26 | 27 | @pytest.fixture(scope="function", params=[False]) 28 | def meta_tensor(request): 29 | return request.param 30 | 31 | 32 | @pytest.fixture(scope="function", params=[False]) 33 | def load_with_sys_mem(request): 34 | return request.param 35 | 36 | 37 | @pytest.fixture(scope="function", params=[1]) 38 | def replica_num(request): 39 | return request.param 40 | 41 | 42 | @pytest.fixture(scope="function", params=[False]) 43 | def enable_restful_api(request): 44 | return request.param 45 | 46 | 47 | @pytest.fixture(scope="function", params=[28080]) 48 | def restful_api_port(request): 49 | return request.param 50 | 51 | 52 | @pytest.fixture(scope="function", params=["text-generation"]) 53 | def task_name(request): 54 | return request.param 55 | 56 | 57 | @pytest.fixture(scope="function", params=["bigscience/bloom-560m"]) 58 | def model_name(request): 59 | return request.param 60 | 61 | 62 | @pytest.fixture(scope="function") 63 | def deployment_name(model_name): 64 | return model_name + "-deployment" 65 | 66 | 67 | @pytest.fixture(scope="function", params=[mii.DeploymentType.LOCAL]) 68 | def deployment_type(request): 69 | return request.param 70 | 71 | 72 | @pytest.fixture(scope="function", params=[True]) 73 | def enable_deepspeed(request): 74 | return request.param 75 | 76 | 77 | @pytest.fixture(scope="function", params=[False]) 78 | def enable_zero(request): 79 | return request.param 80 | 81 | 82 | @pytest.fixture(scope="function", params=[{}]) 83 | def ds_config(request): 84 | return request.param 85 | 86 | 87 | @pytest.fixture(scope="function") 88 | def replace_with_kernel_inject(model_name): 89 | if "clip-vit" in model_name: 90 | return False 91 | return True 92 | 93 | 94 | @pytest.fixture(scope="function") 95 | def model_config( 96 | task_name: str, 97 | model_name: str, 98 | dtype: str, 99 | tensor_parallel: int, 100 | meta_tensor: bool, 101 | load_with_sys_mem: bool, 102 | replica_num: int, 103 | enable_deepspeed: bool, 104 | enable_zero: bool, 105 | ds_config: dict, 106 | replace_with_kernel_inject: bool, 107 | ): 108 | config = SimpleNamespace( 109 | skip_model_check=True, # TODO: remove this once conversation task check is fixed 110 | task=task_name, 111 | model=model_name, 112 | dtype=dtype, 113 | tensor_parallel=tensor_parallel, 114 | model_path=os.getenv("TRANSFORMERS_CACHE", 115 | ""), 116 | meta_tensor=meta_tensor, 117 | load_with_sys_mem=load_with_sys_mem, 118 | replica_num=replica_num, 119 | enable_deepspeed=enable_deepspeed, 120 | enable_zero=enable_zero, 121 | ds_config=ds_config, 122 | replace_with_kernel_inject=replace_with_kernel_inject, 123 | ) 124 | return config.__dict__ 125 | 126 | 127 | @pytest.fixture(scope="function") 128 | 
def mii_config( 129 | deployment_type: str, 130 | port_number: int, 131 | enable_restful_api: bool, 132 | restful_api_port: int, 133 | ): 134 | config = SimpleNamespace( 135 | deployment_type=deployment_type, 136 | port_number=port_number, 137 | enable_restful_api=enable_restful_api, 138 | restful_api_port=restful_api_port, 139 | ) 140 | return config.__dict__ 141 | 142 | 143 | @pytest.fixture(scope="function", params=[None]) 144 | def expected_failure(request): 145 | return request.param 146 | 147 | 148 | @pytest.fixture(scope="function") 149 | def deployment(deployment_name, mii_config, model_config, expected_failure): 150 | if expected_failure is not None: 151 | with pytest.raises(expected_failure) as excinfo: 152 | mii.deploy( 153 | deployment_name=deployment_name, 154 | mii_config=mii_config, 155 | model_config=model_config, 156 | ) 157 | yield excinfo 158 | else: 159 | mii.deploy( 160 | deployment_name=deployment_name, 161 | mii_config=mii_config, 162 | model_config=model_config, 163 | ) 164 | yield deployment_name 165 | mii.terminate(deployment_name) 166 | 167 | 168 | @pytest.fixture(scope="function", params=[{"query": "DeepSpeed is the greatest"}]) 169 | def query(request): 170 | return request.param 171 | -------------------------------------------------------------------------------- /tests/legacy/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | deepspeed:Run test for deepspeed CI 4 | -------------------------------------------------------------------------------- /tests/legacy/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | import mii.legacy as mii 9 | from pydantic import ValidationError 10 | 11 | 12 | @pytest.mark.parametrize("port_number", [12345]) 13 | @pytest.mark.parametrize("tensor_parallel", [4]) 14 | def test_base_configs(deployment_name, mii_config, model_config): 15 | mii_config["deployment_name"] = deployment_name 16 | mii_config["model_conf"] = model_config 17 | mii_config = mii.config.MIIConfig(**mii_config) 18 | 19 | assert mii_config.port_number == 12345 20 | assert mii_config.model_conf.tensor_parallel == 4 21 | 22 | 23 | @pytest.mark.parametrize("port_number", ["fail"]) 24 | @pytest.mark.parametrize("tensor_parallel", [3.5]) 25 | def test_base_configs_literalfail(deployment_name, mii_config, model_config): 26 | with pytest.raises(ValidationError): 27 | mii_config["deployment_name"] = deployment_name 28 | mii_config["model_conf"] = model_config 29 | mii_config = mii.config.MIIConfig(**mii_config) 30 | -------------------------------------------------------------------------------- /tests/legacy/test_deployment_options.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import json 8 | import requests 9 | import mii.legacy as mii 10 | from pydantic import ValidationError 11 | 12 | 13 | @pytest.mark.deepspeed 14 | @pytest.mark.parametrize("meta_tensor", [True]) 15 | @pytest.mark.parametrize("tensor_parallel", [2]) 16 | def test_meta_tensor(deployment, query): 17 | generator = mii.mii_query_handle(deployment) 18 | result = generator.query(query) 19 | assert result 20 | 21 | 22 | @pytest.mark.parametrize("enable_restful_api", [True]) 23 | def test_restful_api(deployment, query, restful_api_port): 24 | generator = mii.mii_query_handle(deployment) 25 | for _ in range(2): 26 | result = generator.query(query) 27 | 28 | url = f"http://localhost:{restful_api_port}/mii/{deployment}" 29 | params = {"request": query} 30 | json_params = json.dumps(params) 31 | result = requests.post(url, 32 | data=json_params, 33 | headers={"Content-Type": "application/json"}) 34 | assert result.status_code == 200 35 | assert "response" in result.json() 36 | 37 | 38 | @pytest.mark.parametrize("load_with_sys_mem", [True]) 39 | def test_load_to_sys_mem(deployment, query): 40 | generator = mii.mii_query_handle(deployment) 41 | result = generator.query(query) 42 | assert result 43 | 44 | 45 | @pytest.mark.parametrize("replica_num", [2]) 46 | def test_replicas(deployment, query, replica_num): 47 | generator = mii.mii_query_handle(deployment) 48 | # Replicas are given queries in round-robin, so test each model is responding 49 | for _ in range(replica_num): 50 | result = generator.query(query) 51 | assert result 52 | 53 | 54 | @pytest.mark.deepspeed 55 | @pytest.mark.parametrize("enable_deepspeed", [False]) 56 | @pytest.mark.parametrize("enable_zero", [True]) 57 | @pytest.mark.parametrize( 58 | "ds_config", 59 | [ 60 | { 61 | "fp16": { 62 | "enabled": True 63 | }, 64 | "bf16": { 65 | "enabled": False 66 | }, 67 | "zero_optimization": { 68 | "stage": 3, 69 | "offload_param": { 70 | "device": "cpu", 71 | }, 72 | }, 73 | "train_micro_batch_size_per_gpu": 1, 74 | }, 75 | ], 76 | ) 77 | def test_zero_config(deployment, query): 78 | generator = mii.mii_query_handle(deployment) 79 | result = generator.query(query) 80 | assert result 81 | 82 | 83 | @pytest.mark.deepspeed 84 | @pytest.mark.parametrize("expected_failure", [ValidationError]) 85 | @pytest.mark.parametrize( 86 | "enable_deepspeed, enable_zero, dtype", 87 | [(True, 88 | True, 89 | "fp32"), 90 | (False, 91 | True, 92 | "fp16")], 93 | ) 94 | @pytest.mark.parametrize( 95 | "ds_config", 96 | [ 97 | { 98 | "fp16": { 99 | "enabled": False 100 | }, 101 | "bf16": { 102 | "enabled": False 103 | }, 104 | "zero_optimization": { 105 | "stage": 3, 106 | "offload_param": { 107 | "device": "cpu", 108 | }, 109 | }, 110 | "train_micro_batch_size_per_gpu": 1, 111 | }, 112 | ], 113 | ) 114 | def test_zero_config_fail(deployment, query): 115 | assert "assertion_error" in str(deployment.value) 116 | -------------------------------------------------------------------------------- /tests/legacy/test_local_deployment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import pytest 6 | import mii.legacy as mii 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "task_name, model_name, query", 11 | [ 12 | ( 13 | "fill-mask", 14 | "bert-base-uncased", 15 | { 16 | "query": "Hello I'm a [MASK] model." 
17 | }, 18 | ), 19 | ( 20 | "question-answering", 21 | "deepset/roberta-large-squad2", 22 | { 23 | "question": "What is the greatest?", 24 | "context": "DeepSpeed is the greatest", 25 | }, 26 | ), 27 | ( 28 | "text-generation", 29 | "distilgpt2", 30 | { 31 | "query": ["DeepSpeed is the greatest"] 32 | }, 33 | ), 34 | ( 35 | "text-generation", 36 | "bigscience/bloom-560m", 37 | { 38 | "query": ["DeepSpeed is the greatest", 39 | "Seattle is"] 40 | }, 41 | ), 42 | ( 43 | "token-classification", 44 | "Jean-Baptiste/roberta-large-ner-english", 45 | { 46 | "query": "My name is jean-baptiste and I live in montreal." 47 | }, 48 | ), 49 | ( 50 | "text-classification", 51 | "roberta-large-mnli", 52 | { 53 | "query": "DeepSpeed is the greatest" 54 | }, 55 | ), 56 | ( 57 | "zero-shot-image-classification", 58 | "openai/clip-vit-base-patch32", 59 | { 60 | "image": 61 | "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", 62 | "candidate_labels": ["animals", 63 | "humans", 64 | "landscape"] 65 | }, 66 | ), 67 | ], 68 | ) 69 | def test_single_GPU(deployment, query): 70 | generator = mii.mii_query_handle(deployment) 71 | result = generator.query(query) 72 | assert result 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "task_name, model_name, query", 77 | [ 78 | ( 79 | "text-generation", 80 | "bigscience/bloom-560m", 81 | { 82 | "query": ["DeepSpeed is the greatest", 83 | "Seattle is"] 84 | }, 85 | ), 86 | ], 87 | ) 88 | def test_multi_GPU(deployment, query): 89 | generator = mii.mii_query_handle(deployment) 90 | result = generator.query(query) 91 | assert result 92 | 93 | 94 | @pytest.mark.parametrize( 95 | "task_name, model_name, query", 96 | [ 97 | ( 98 | "text-generation", 99 | "bigscience/bloom-560m", 100 | { 101 | "query": ["DeepSpeed is the greatest", 102 | 'Seattle is'] 103 | }, 104 | ), 105 | ], 106 | ) 107 | def test_session(deployment, query): 108 | generator = mii.mii_query_handle(deployment) 109 | session_name = "test_session" 110 | generator.create_session(session_name) 111 | result = generator.query(query) 112 | generator.destroy_session(session_name) 113 | assert result 114 | -------------------------------------------------------------------------------- /tests/legacy/test_non_persistent_deployment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import mii.legacy as mii 8 | 9 | 10 | @pytest.mark.parametrize("deployment_type", [mii.DeploymentType.NON_PERSISTENT]) 11 | @pytest.mark.parametrize( 12 | "task_name, model_name, query", 13 | [ 14 | ( 15 | "fill-mask", 16 | "bert-base-uncased", 17 | { 18 | "query": "Hello I'm a [MASK] model." 19 | }, 20 | ), 21 | ( 22 | "question-answering", 23 | "deepset/roberta-large-squad2", 24 | { 25 | "question": "What is the greatest?", 26 | "context": "DeepSpeed is the greatest", 27 | }, 28 | ), 29 | ( 30 | "text-generation", 31 | "distilgpt2", 32 | { 33 | "query": ["DeepSpeed is the greatest"] 34 | }, 35 | ), 36 | ( 37 | "text-generation", 38 | "bigscience/bloom-560m", 39 | { 40 | "query": ["DeepSpeed is the greatest", 41 | "Seattle is"] 42 | }, 43 | ), 44 | ( 45 | "token-classification", 46 | "Jean-Baptiste/roberta-large-ner-english", 47 | { 48 | "query": "My name is jean-baptiste and I live in montreal." 
49 | }, 50 | ), 51 | ( 52 | "text-classification", 53 | "roberta-large-mnli", 54 | { 55 | "query": "DeepSpeed is the greatest" 56 | }, 57 | ), 58 | ( 59 | "zero-shot-image-classification", 60 | "openai/clip-vit-base-patch32", 61 | { 62 | "image": 63 | "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", 64 | "candidate_labels": ["animals", 65 | "humans", 66 | "landscape"], 67 | }, 68 | ), 69 | ], 70 | ) 71 | def test_single_GPU(deployment, query): 72 | generator = mii.mii_query_handle(deployment) 73 | result = generator.query(query) 74 | assert result 75 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --ignore=legacy 3 | -------------------------------------------------------------------------------- /tests/test_arg_parsing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | from pydantic import ValidationError 9 | 10 | from mii.api import _parse_kwargs_to_model_config, _parse_kwargs_to_mii_config 11 | from mii.errors import UnknownArgument 12 | 13 | 14 | def test_model_name_or_path(): 15 | # model_name_or_path is required 16 | with pytest.raises(ValidationError): 17 | _parse_kwargs_to_mii_config() 18 | with pytest.raises(ValidationError): 19 | _parse_kwargs_to_model_config() 20 | 21 | # passing model_name_or_path as positional arg 22 | mii_config = _parse_kwargs_to_mii_config("test") 23 | assert mii_config.model_conf.model_name_or_path == "test" 24 | model_config, _ = _parse_kwargs_to_model_config("test") 25 | assert model_config.model_name_or_path == "test" 26 | 27 | # passing model_name_or_path in model_config 28 | mii_config = _parse_kwargs_to_mii_config(model_config={"model_name_or_path": "test"}) 29 | assert mii_config.model_conf.model_name_or_path == "test" 30 | mii_config = _parse_kwargs_to_mii_config( 31 | mii_config={"model_config": { 32 | "model_name_or_path": "test" 33 | }}) 34 | assert mii_config.model_conf.model_name_or_path == "test" 35 | model_config, _ = _parse_kwargs_to_model_config( 36 | model_config={"model_name_or_path": "test"} 37 | ) 38 | assert model_config.model_name_or_path == "test" 39 | 40 | # checking that model_name_or_path in model_config matches positional arg 41 | with pytest.raises(AssertionError): 42 | _parse_kwargs_to_mii_config("test", model_config={"model_name_or_path": "test2"}) 43 | with pytest.raises(AssertionError): 44 | _parse_kwargs_to_mii_config( 45 | "test", 46 | mii_config={"model_config": { 47 | "model_name_or_path": "test2" 48 | }}) 49 | with pytest.raises(AssertionError): 50 | _parse_kwargs_to_model_config("test", 51 | model_config={"model_name_or_path": "test2"}) 52 | 53 | 54 | def test_only_kwargs(): 55 | mii_config = _parse_kwargs_to_mii_config("test", 56 | tensor_parallel=2, 57 | enable_restful_api=True) 58 | assert mii_config.model_conf.model_name_or_path == "test" 59 | assert mii_config.model_conf.tensor_parallel == 2 60 | assert mii_config.enable_restful_api is True 61 | 62 | model_config, _ = _parse_kwargs_to_model_config("test", tensor_parallel=2) 63 | assert model_config.model_name_or_path == "test" 64 | assert model_config.tensor_parallel == 2 65 | 66 | 67 | def test_only_config_dicts(): 68 | mii_config = _parse_kwargs_to_mii_config( 69 | mii_config={"enable_restful_api": True}, 
70 | model_config={ 71 | "model_name_or_path": "test", 72 | "tensor_parallel": 2 73 | }, 74 | ) 75 | assert mii_config.model_conf.model_name_or_path == "test" 76 | assert mii_config.model_conf.tensor_parallel == 2 77 | assert mii_config.enable_restful_api is True 78 | 79 | mii_config = _parse_kwargs_to_mii_config( 80 | mii_config={ 81 | "enable_restful_api": True, 82 | "model_config": { 83 | "model_name_or_path": "test", 84 | "tensor_parallel": 2 85 | }, 86 | }) 87 | assert mii_config.model_conf.model_name_or_path == "test" 88 | assert mii_config.model_conf.tensor_parallel == 2 89 | assert mii_config.enable_restful_api is True 90 | 91 | model_config, _ = _parse_kwargs_to_model_config( 92 | model_config={"model_name_or_path": "test", "tensor_parallel": 2} 93 | ) 94 | assert model_config.model_name_or_path == "test" 95 | assert model_config.tensor_parallel == 2 96 | 97 | 98 | def test_unknown_kwargs(): 99 | with pytest.raises(UnknownArgument): 100 | _parse_kwargs_to_mii_config("test", unknown_kwarg=True) 101 | 102 | _, remaining_kwargs = _parse_kwargs_to_model_config("test", unknown_kwarg=True) 103 | assert remaining_kwargs == {"unknown_kwarg": True} 104 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | import mii 9 | 10 | 11 | @pytest.mark.parametrize("replica_num", [2]) 12 | @pytest.mark.parametrize("tensor_parallel", [2]) 13 | @pytest.mark.parametrize( 14 | "device_map", 15 | [ 16 | { 17 | "host_0": [[0, 18 | 1, 19 | 2, 20 | 3]] 21 | }, 22 | { 23 | "host_0": [[0, 24 | 1]], 25 | "host_1": [[0]] 26 | }, 27 | { 28 | "host_0": [[0, 29 | 1], 30 | [2, 31 | 3], 32 | [4, 33 | 5]] 34 | }, 35 | { 36 | "host_0": [[0, 37 | 1]] 38 | }, 39 | ], 40 | ) 41 | @pytest.mark.parametrize("hostfile_content", [["host_0 slots=8", "host_1 slots=8"]]) 42 | def test_deploy_map_fail(mii_config): 43 | mii_config = mii.config.MIIConfig(**mii_config) 44 | with pytest.raises(ValueError): 45 | mii_config.generate_replica_configs() 46 | 47 | 48 | @pytest.mark.parametrize("replica_num", [2]) 49 | @pytest.mark.parametrize("tensor_parallel", [2]) 50 | @pytest.mark.parametrize( 51 | "device_map", 52 | [ 53 | { 54 | "host_0": [[0, 55 | 1], 56 | [2, 57 | 3]] 58 | }, 59 | { 60 | "host_0": [[0, 61 | 1]], 62 | "host_1": [[0, 63 | 1]] 64 | }, 65 | ], 66 | ) 67 | @pytest.mark.parametrize("hostfile_content", [["host_0 slots=4", "host_1 slots=4"]]) 68 | def test_deploy_map(mii_config): 69 | mii_config = mii.config.MIIConfig(**mii_config) 70 | mii_config.generate_replica_configs() 71 | 72 | 73 | @pytest.mark.parametrize("replica_num", [2]) 74 | @pytest.mark.parametrize("tensor_parallel", [2]) 75 | @pytest.mark.parametrize( 76 | "hostfile_content", 77 | [["host_0 slots=4"], 78 | ["host_0 slots=2", 79 | "host_1 slots=2"], 80 | ["host_0 slots=8"]], 81 | ) 82 | def test_auto_fill_deploy_map(mii_config): 83 | mii_config = mii.config.MIIConfig(**mii_config) 84 | mii_config.generate_replica_configs() 85 | 86 | 87 | @pytest.mark.parametrize("device_map", [{"host_0": [[0, 1]]}, [[0, 1]], [0, 1], 1]) 88 | def test_deploy_map_input_types(mii_config): 89 | mii_config = mii.config.MIIConfig(**mii_config) 90 | -------------------------------------------------------------------------------- /tests/test_deployment.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import pytest 6 | 7 | import json 8 | import re 9 | import requests 10 | import subprocess 11 | import time 12 | 13 | import mii 14 | 15 | 16 | def test_single_gpu(deployment, query): 17 | outputs = deployment(query) 18 | assert outputs[0], "output is empty" 19 | 20 | 21 | def test_streaming(deployment, query): 22 | outputs = [] 23 | 24 | def callback(response): 25 | outputs.append(response[0].generated_text) 26 | 27 | deployment(query, streaming_fn=callback) 28 | assert outputs, "output is empty" 29 | 30 | 31 | def test_streaming_consistency(deployment, query): 32 | expected_output = deployment(query, do_sample=False) 33 | streaming_parts = [] 34 | 35 | def callback(response): 36 | streaming_parts.append(response[0].generated_text) 37 | 38 | deployment(query, do_sample=False, streaming_fn=callback) 39 | streaming_output = "".join(streaming_parts) 40 | 41 | assert streaming_output == expected_output[0].generated_text, "outputs w and w/o streaming are not equal" 42 | 43 | 44 | def test_multi_prompt(deployment, query): 45 | outputs = deployment([query] * 4) 46 | for r in outputs: 47 | assert r, "output is empty" 48 | 49 | 50 | @pytest.mark.parametrize("tensor_parallel", [2]) 51 | def test_multi_gpu(deployment, query): 52 | outputs = deployment(query) 53 | assert outputs[0], "output is empty" 54 | 55 | 56 | @pytest.mark.parametrize("replica_num", [2]) 57 | def test_multi_replica(deployment, query): 58 | deployment_name = deployment.mii_config.deployment_name 59 | 60 | start = time.time() 61 | outputs = mii.client(deployment_name)(query, max_length=128, ignore_eos=True) 62 | end = time.time() 63 | assert outputs[0], "output is empty" 64 | single_query_time = end - start 65 | 66 | procs = [] 67 | double_query_time = [] 68 | for _ in range(2): 69 | p = subprocess.Popen( 70 | [ 71 | "python3", 72 | "-c", 73 | f"import time, mii; start=time.time(); mii.client('{deployment_name}')('{query}', max_length=128, ignore_eos=True); print('time',time.time()-start)", 74 | ], 75 | stdout=subprocess.PIPE, 76 | ) 77 | procs.append(p) 78 | for p in procs: 79 | output, error = p.communicate() 80 | m = re.search(r"time (\d+.\d+)", output.decode("utf-8").strip()) 81 | assert m, "time not found" 82 | double_query_time.append(float(m.group(1))) 83 | 84 | double_query_time = sum(double_query_time) / 2 85 | 86 | assert single_query_time == pytest.approx( 87 | double_query_time, single_query_time / 2 88 | ), "two queries should take about the same time as one query" 89 | 90 | 91 | def test_query_kwargs(deployment, query): 92 | # test ignore_eos 93 | outputs = deployment( 94 | query, 95 | max_length=128, 96 | min_new_tokens=16, 97 | ignore_eos=True, 98 | top_p=0.9, 99 | top_k=50, 100 | temperature=0.9, 101 | ) 102 | assert outputs[0], "output is empty" 103 | 104 | 105 | def test_do_sample(deployment, query): 106 | output_0 = deployment(query, do_sample=False, max_length=128) 107 | output_1 = deployment(query, do_sample=False, max_length=128) 108 | assert ( 109 | output_0[0] == output_1[0] 110 | ), "do_sample=False should always return the same output" 111 | 112 | 113 | def test_return_full_text(deployment, query): 114 | outputs = deployment(query, max_length=128, return_full_text=True) 115 | assert outputs[0].generated_text.startswith(query), "output should start with the prompt" 116 | 117 | 118 | 
@pytest.mark.parametrize("enable_restful_api", [True]) 119 | def test_restful_api(deployment, query, deployment_name, restful_api_port): 120 | # Verify deployment is running 121 | outputs = deployment(query, max_length=128) 122 | assert outputs[0], "output is empty" 123 | 124 | # Verify REST API 125 | url = f"http://localhost:{restful_api_port}/mii/{deployment_name}" 126 | params = {"prompts": query, "max_length": 128} 127 | json_params = json.dumps(params) 128 | result = requests.post(url, 129 | data=json_params, 130 | headers={"Content-Type": "application/json"}) 131 | assert result.status_code == 200 132 | assert "generated_text" in result.json()[0] 133 | -------------------------------------------------------------------------------- /tests/test_model_support.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | import deepspeed 9 | import torch 10 | from deepspeed.inference.v2.checkpoint import ( 11 | CheckpointEngineBase, 12 | HuggingFaceCheckpointEngine, 13 | ) 14 | from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig 15 | from typing import Iterable, Tuple 16 | 17 | 18 | class ZeroWeightsCheckpointEngine(CheckpointEngineBase): 19 | """ Generates weight with all zeros for a given model for testing purposes. """ 20 | def __init__(self, model_name_or_path: str, auth_token: str = None) -> None: 21 | self.model_name_or_path = model_name_or_path 22 | self.model_config = AutoConfig.from_pretrained(self.model_name_or_path, 23 | trust_remote_code=True) 24 | if hasattr(self.model_config, "max_position_embeddings"): 25 | self.model_config.max_seq_length = self.model_config.max_position_embeddings 26 | else: 27 | try: 28 | generation_config = GenerationConfig.from_pretrained( 29 | self.model_name_or_path) 30 | self.model_config.max_seq_length = generation_config.max_length 31 | except OSError: 32 | self.model_config.max_seq_length = 2048 33 | 34 | def parameters(self) -> Iterable[Tuple[str, torch.Tensor]]: 35 | # Load with meta device is faster 36 | with deepspeed.OnDevice(dtype=torch.float16, device="meta"): 37 | model = AutoModelForCausalLM.from_config(self.model_config, 38 | trust_remote_code=True) 39 | 40 | for param_name, param in model.state_dict().items(): 41 | yield param_name, torch.zeros(param.shape) 42 | 43 | 44 | @pytest.fixture(scope="module", autouse=True) 45 | def inject_checkpoint_engine(): 46 | # Inject the random weihts checkpoint engine 47 | deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = ( 48 | ZeroWeightsCheckpointEngine) 49 | yield None 50 | # Restore the original checkpoint engine 51 | deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = ( 52 | HuggingFaceCheckpointEngine) 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "model_name", 57 | [ 58 | "tiiuae/falcon-7b", 59 | "huggyllama/llama-7b", 60 | "NousResearch/Llama-2-7b-hf", 61 | "NousResearch/Hermes-2-Pro-Mistral-7B", 62 | "cloudyu/Mixtral_11Bx2_MoE_19B", 63 | "facebook/opt-125m", 64 | "microsoft/phi-2", 65 | "Qwen/Qwen-7B-Chat", 66 | "Qwen/Qwen1.5-0.5B", 67 | ], 68 | ids=[ 69 | "falcon", 70 | "llama", 71 | "llama-2", 72 | "mistral", 73 | "mixtral", 74 | "opt", 75 | "phi-2", 76 | "qwen", 77 | "qwen-2" 78 | ], 79 | ) 80 | def test_model(pipeline, query): 81 | outputs = pipeline(query, max_new_tokens=16) 82 | assert outputs[0], "output is empty" 83 | 84 | 85 | 
@pytest.mark.parametrize("local_model", [True]) 86 | def test_local_model_dir(pipeline): 87 | assert pipeline 88 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | 7 | def test_single_gpu(pipeline, query): 8 | outputs = pipeline(query) 9 | assert outputs[0], "output is empty" 10 | 11 | 12 | def test_multi_prompt(pipeline, query): 13 | outputs = pipeline([query] * 4) 14 | for r in outputs: 15 | assert r, "output is empty" 16 | 17 | 18 | def test_query_kwargs(pipeline, query): 19 | # test ignore_eos 20 | outputs = pipeline( 21 | query, 22 | max_length=128, 23 | min_new_tokens=16, 24 | ignore_eos=True, 25 | top_p=0.9, 26 | top_k=50, 27 | temperature=0.9, 28 | ) 29 | assert outputs[0], "output is empty" 30 | -------------------------------------------------------------------------------- /tests/test_ragged_batching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import pytest 6 | 7 | from mii.batching.ragged_batching import ReadableStream 8 | from mii.config import ModelConfig 9 | from mii.modeling.tokenizers import load_tokenizer 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "model_name", 14 | [ 15 | "tiiuae/falcon-7b", 16 | "NousResearch/Llama-2-7b-hf", 17 | "NousResearch/Hermes-2-Pro-Mistral-7B", 18 | "cloudyu/Mixtral_11Bx2_MoE_19B", 19 | "facebook/opt-125m", 20 | ], 21 | ids=["falcon", 22 | "llama", 23 | "mistral", 24 | "mixtral", 25 | "opt"], 26 | ) 27 | @pytest.mark.parametrize( 28 | "query", 29 | [ 30 | "It’s a region that includes Washington, Oregon, and Idaho.", 31 | "# Heading\n\ntitle redundant spaces, #id — an anchor", 32 | "例如", 33 | ], 34 | ids=[ 35 | "apostrophe", 36 | "markdown", 37 | "chinese", 38 | ]) 39 | def test_readable_stream(model_config, query): 40 | tokenizer = load_tokenizer(ModelConfig(**model_config)) 41 | thread_id = 42 42 | 43 | token_ids = tokenizer.encode(query) 44 | expected = tokenizer.decode(token_ids) 45 | decoded = [] 46 | 47 | stream = ReadableStream(tokenizer) 48 | for token_id in token_ids: 49 | decoded.append(stream.decode(thread_id, [token_id])) 50 | 51 | assert "".join(decoded) == expected 52 | 53 | 54 | @pytest.mark.parametrize( 55 | "model_name,expected_size", 56 | [ 57 | ("tiiuae/falcon-7b", 58 | 65024), 59 | ("NousResearch/Llama-2-7b-hf", 60 | 32000), 61 | ("NousResearch/Hermes-2-Pro-Mistral-7B", 62 | 32032), 63 | ("cloudyu/Mixtral_11Bx2_MoE_19B", 64 | 32000), 65 | ("facebook/opt-125m", 66 | 50265), 67 | ("nvidia/Llama3-ChatQA-1.5-8B", 68 | 128256), 69 | ], 70 | ids=["falcon", 71 | "llama", 72 | "mistral", 73 | "mixtral", 74 | "opt", 75 | "llama3"], 76 | ) 77 | def test_vocab_size(model_config, expected_size): 78 | tokenizer = load_tokenizer(ModelConfig(**model_config)) 79 | assert tokenizer.vocab_size == expected_size 80 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.3.4 2 | --------------------------------------------------------------------------------