├── .github └── workflows │ ├── formatting.yml │ ├── nv-a6000-fastgen.yml │ ├── nv-v100-legacy.yml │ ├── release.yml │ └── setup-venv │ └── action.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── .style.yapf ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── docs ├── CNAME ├── Makefile ├── images │ ├── fast-gen-overview.png │ ├── fastgen-24-01-hero-dark.png │ ├── fastgen-24-01-hero-light.png │ ├── fastgen-arch-dark.png │ ├── fastgen-arch-light.png │ ├── fastgen-hero-dark.png │ ├── fastgen-hero-light.png │ ├── fastgen-hero.png │ ├── fastgen-overview-dark.png │ ├── fastgen-overview-light.png │ ├── mii-arch-dark.png │ ├── mii-arch-light.png │ ├── mii-dark.svg │ └── mii-white.svg ├── make.bat ├── requirements.txt └── source │ ├── api.rst │ ├── conf.py │ ├── config.rst │ ├── deployment.rst │ ├── index.rst │ ├── install.rst │ ├── parallelism.rst │ ├── pipeline.rst │ ├── quick-start.rst │ ├── replicas.rst │ ├── response.rst │ └── rest.rst ├── examples ├── README.md └── chat_templates │ └── template_alpaca.jinja ├── mii ├── __init__.py ├── aml_related │ ├── __init__.py │ ├── templates.py │ └── utils.py ├── api.py ├── backend │ ├── __init__.py │ ├── client.py │ └── server.py ├── batching │ ├── __init__.py │ ├── constants.py │ ├── data_classes.py │ ├── generation │ │ ├── __init__.py │ │ ├── logit_processors.py │ │ ├── samplers.py │ │ └── stop_criterion.py │ ├── postprocess.py │ ├── ragged_batching.py │ └── utils.py ├── config.py ├── constants.py ├── entrypoints │ ├── __init__.py │ ├── api_server.py │ ├── data_models.py │ └── openai_api_server.py ├── errors.py ├── grpc_related │ ├── __init__.py │ ├── modelresponse_server.py │ ├── proto │ │ ├── __init__.py │ │ ├── build_script.sh │ │ ├── modelresponse.proto │ │ ├── modelresponse_pb2.py │ │ └── modelresponse_pb2_grpc.py │ ├── restful_gateway.py │ └── task_methods.py ├── launch │ ├── __init__.py │ └── multi_gpu_server.py ├── legacy │ ├── README.md │ ├── __init__.py │ ├── aml_related │ │ ├── __init__.py │ │ ├── templates.py │ │ └── utils.py │ ├── client.py │ ├── config.py │ ├── constants.py │ ├── deployment.py │ ├── docs │ │ ├── CNAME │ │ ├── GPT-NeoX.md │ │ └── images │ │ │ ├── azure-cost.png │ │ │ ├── bert.png │ │ │ ├── bloom.png │ │ │ ├── gpt.png │ │ │ ├── hero-dark.png │ │ │ ├── hero-transparent.png │ │ │ ├── hero.png │ │ │ ├── llm-latency-sd-latency.png │ │ │ ├── mii-arch.png │ │ │ ├── mii-dark.svg │ │ │ ├── mii-white.svg │ │ │ ├── multi-gpu-latency.png │ │ │ ├── opt-bloom.png │ │ │ ├── opt.png │ │ │ ├── roberta.png │ │ │ ├── sd-hero-dark.png │ │ │ ├── sd-hero-light.png │ │ │ ├── sd-latency.png │ │ │ └── tput-llms.png │ ├── examples │ │ ├── aml │ │ │ ├── fill-mask-example.py │ │ │ ├── text-generation-bloom.py │ │ │ └── text-generation-bloom560m-example.py │ │ ├── benchmark │ │ │ └── txt2img │ │ │ │ ├── README.md │ │ │ │ ├── baseline-sd.py │ │ │ │ ├── mii-sd.py │ │ │ │ ├── requirements.txt │ │ │ │ └── utils.py │ │ ├── local │ │ │ ├── chat │ │ │ │ ├── README.md │ │ │ │ ├── chat-client-example.py │ │ │ │ └── chat-server-example.py │ │ │ ├── conversational-example.py │ │ │ ├── conversational-query-example.py │ │ │ ├── fill-mask-example.py │ │ │ ├── question-answering-example.py │ │ │ ├── question-answering-query-example.py │ │ │ ├── text-classification-example.py │ │ │ ├── text-classification-query-example.py │ │ │ ├── text-generation-bloom-example.py │ │ │ ├── text-generation-bloom560m-example.py │ │ │ ├── text-generation-fbopt-example.py │ │ │ ├── 
text-generation-query-example.py │ │ │ ├── text-generation-zero-example.py │ │ │ ├── token-classification-example.py │ │ │ ├── token-classification-query-example.py │ │ │ └── txt2img-example.py │ │ └── non_persistent │ │ │ └── text-generation-bloom560-example.py │ ├── grpc_related │ │ ├── __init__.py │ │ ├── modelresponse_server.py │ │ ├── proto │ │ │ ├── __init__.py │ │ │ ├── build_script.sh │ │ │ ├── legacymodelresponse.proto │ │ │ ├── legacymodelresponse_pb2.py │ │ │ └── legacymodelresponse_pb2_grpc.py │ │ └── restful_gateway.py │ ├── launch │ │ ├── __init__.py │ │ └── multi_gpu_server.py │ ├── logging.py │ ├── method_table.py │ ├── models │ │ ├── __init__.py │ │ ├── load_models.py │ │ ├── providers │ │ │ ├── __init__.py │ │ │ ├── diffusers.py │ │ │ ├── eleutherai.py │ │ │ ├── huggingface.py │ │ │ └── utils.py │ │ ├── score │ │ │ ├── __init__.py │ │ │ ├── generate.py │ │ │ └── score_template.py │ │ └── utils.py │ ├── server.py │ ├── terminate.py │ └── utils.py ├── logging.py ├── modeling │ ├── __init__.py │ ├── models.py │ └── tokenizers.py ├── score │ ├── __init__.py │ ├── generate.py │ └── score_template.py └── utils.py ├── pyproject.toml ├── release ├── bump_patch_version.py ├── check_release_version.py └── release.sh ├── requirements ├── requirements-dev.txt └── requirements.txt ├── scripts ├── check-license.py └── model_download.py ├── setup.py ├── tests ├── __init__.py ├── conftest.py ├── legacy │ ├── __init__.py │ ├── conftest.py │ ├── pytest.ini │ ├── test_config.py │ ├── test_deployment_options.py │ ├── test_local_deployment.py │ └── test_non_persistent_deployment.py ├── pytest.ini ├── test_arg_parsing.py ├── test_config.py ├── test_deployment.py ├── test_model_support.py ├── test_pipeline.py └── test_ragged_batching.py └── version.txt /.github/workflows/formatting.yml: -------------------------------------------------------------------------------- 1 | name: Formatting 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | branches: 7 | '**' 8 | schedule: 9 | - cron: "0 0 * * *" 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | 17 | # formatting and basic install on cpu-only machine 18 | formatting: 19 | runs-on: ubuntu-22.04 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - name: environment 25 | run: | 26 | which python 27 | python --version 28 | 29 | - name: Install dependencies 30 | run: | 31 | grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install 32 | 33 | - name: Formatting checks 34 | run: | 35 | pre-commit run --all-files 36 | -------------------------------------------------------------------------------- /.github/workflows/nv-a6000-fastgen.yml: -------------------------------------------------------------------------------- 1 | name: nv-a6000-fastgen 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | pull_request: 8 | paths-ignore: 9 | - 'mii/legacy/**' 10 | - 'tests/legacy/**' 11 | - '.github/workflows/nv-v100-legacy.yml' 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | unit-tests: 19 | runs-on: [self-hosted, nvidia, a6000] 20 | container: 21 | image: nvcr.io/nvidia/pytorch:24.03-py3 22 | ports: 23 | - 80 24 | options: --gpus all --shm-size "8G" 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - name: Check container state 30 | run: | 31 | ldd --version 32 | nvcc --version 33 | nvidia-smi 34 | python -c "import torch; 
print('torch:', torch.__version__, torch)" 35 | python -c "import torch; print('CUDA available:', torch.cuda.is_available())" 36 | - name: Install transformers 37 | run: | 38 | git clone --depth=1 https://github.com/huggingface/transformers 39 | cd transformers 40 | git rev-parse --short HEAD 41 | python -m pip install . 42 | - name: Install deepspeed 43 | run: | 44 | git clone --depth=1 https://github.com/deepspeedai/DeepSpeed 45 | cd DeepSpeed 46 | python -m pip install . 47 | ds_report 48 | - name: Install MII 49 | run: | 50 | pip install .[dev] 51 | - name: Python environment 52 | run: | 53 | python -m pip list 54 | - name: Unit tests 55 | run: | 56 | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch 57 | cd tests 58 | python -m pytest --color=yes --durations=0 --verbose -rF ./ 59 | -------------------------------------------------------------------------------- /.github/workflows/nv-v100-legacy.yml: -------------------------------------------------------------------------------- 1 | name: nv-v100-legacy 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | - cron: "0 0 * * *" 7 | pull_request: 8 | paths: 9 | - 'mii/__init__.py' 10 | - 'mii/legacy/**' 11 | - 'tests/legacy/**' 12 | - '.github/workflows/nv-v100-legacy.yml' 13 | - 'requirements/**' 14 | - 'setup.py' 15 | 16 | concurrency: 17 | group: ${{ github.workflow }}-${{ github.ref }} 18 | cancel-in-progress: true 19 | 20 | jobs: 21 | unit-tests: 22 | runs-on: [self-hosted, nvidia, cu121, v100] 23 | 24 | steps: 25 | - uses: actions/checkout@v4 26 | 27 | - id: setup-venv 28 | uses: ./.github/workflows/setup-venv 29 | 30 | - name: Install pytorch 31 | run: | 32 | pip3 install -U --cache-dir /blob/torch_cache/ torch --index-url https://download.pytorch.org/whl/cu121 33 | python -c "import torch; print('torch:', torch.__version__, torch)" 34 | python -c "import torch; print('CUDA available:', torch.cuda.is_available())" 35 | 36 | - name: Install dependencies 37 | run: | 38 | pip install git+https://github.com/deepspeedai/DeepSpeed.git 39 | pip install git+https://github.com/huggingface/transformers.git@v4.42.4 40 | pip install -U accelerate 41 | ds_report 42 | 43 | - name: Python environment 44 | run: | 45 | pip list 46 | 47 | - name: Install MII 48 | run: | 49 | pip install .[dev] 50 | 51 | - name: Unit tests 52 | run: | 53 | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch 54 | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi 55 | cd tests/legacy 56 | TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose ./ 57 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and publish DeepSpeed-MII release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-22.04 11 | environment: release-env 12 | 13 | steps: 14 | - uses: actions/checkout@v4 15 | with: 16 | ref: "main" 17 | - name: Get release version from tag 18 | run: | 19 | echo "RELEASE_VERSION=${GITHUB_REF#refs/*/v}" >> $GITHUB_ENV 20 | - name: Check release version 21 | run: | 22 | python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }} 23 | - name: Build DeepSpeed-MII 24 | run: | 25 | pip install build 26 | MII_BUILD_STRING=" " python -m build --wheel 27 | - name: Publish to PyPI 28 | uses: pypa/gh-action-pypi-publish@release/v1 29 | with: 30 | 
password: ${{ secrets.PYPI_API_TOKEN }} 31 | repository-url: https://upload.pypi.org/legacy/ 32 | - name: Bump version 33 | run: | 34 | python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }} 35 | - name: Create Pull Request 36 | uses: peter-evans/create-pull-request@v6 37 | with: 38 | token: ${{ secrets.GH_PAT }} 39 | add-paths: | 40 | version.txt 41 | body: | 42 | **Auto-generated PR to update version.txt after a DeepSpeed release** 43 | Released version - ${{ env.RELEASE_VERSION }} 44 | Author - @${{ github.actor }} 45 | branch: AutoPR/${{ env.RELEASE_VERSION }} 46 | assignees: ${{ github.actor }} 47 | title: "Update version.txt after ${{ env.RELEASE_VERSION }} release" 48 | author: ${{ github.actor }} <${{ github.actor }}@users.noreply.github.com> 49 | -------------------------------------------------------------------------------- /.github/workflows/setup-venv/action.yml: -------------------------------------------------------------------------------- 1 | name: Create Virtual Environment 2 | 3 | runs: 4 | using: "composite" 5 | steps: 6 | - id: update-env 7 | run: | 8 | sudo apt-get update 9 | sudo apt-get install -y libaio-dev 10 | python -m pip install --user --upgrade pip 11 | python -m pip install --user --upgrade virtualenv 12 | shell: bash 13 | - id: create-venv 14 | run: | 15 | python -m venv unit-test-venv 16 | source ./unit-test-venv/bin/activate 17 | python -m pip install --upgrade pip 18 | pip install wheel # required after pip>=23.1 19 | echo PATH=$PATH >> $GITHUB_ENV # Make it so venv is inherited for other steps 20 | shell: bash 21 | - id: print-env 22 | run: | 23 | which python 24 | python --version 25 | if [[ -z "${AISC_NODE_INSTANCE_ID}" ]]; then 26 | echo "Not on self-hosted node" 27 | else 28 | echo "JobID: ${AISC_NODE_INSTANCE_ID}" 29 | fi 30 | if command -v nvidia-smi; then 31 | nvidia-smi 32 | which nvcc 33 | nvcc --version 34 | elif command -v rocm-smi; then 35 | rocm-smi --showhw 36 | which hipcc 37 | hipcc --version 38 | fi 39 | shell: bash 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | *.log 4 | *~ 5 | .idea 6 | build 7 | dist 8 | *.so 9 | *.egg-info 10 | build.txt 11 | .vscode 12 | .theia 13 | .cache 14 | __pycache__ 15 | mii/version.py 16 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: meta 3 | hooks: 4 | - id: check-hooks-apply 5 | - id: check-useless-excludes 6 | 7 | - repo: https://github.com/pre-commit/pre-commit-hooks 8 | rev: v4.0.1 9 | hooks: 10 | - id: check-case-conflict 11 | - id: check-yaml 12 | - id: destroyed-symlinks 13 | - id: end-of-file-fixer 14 | exclude: docs/CNAME 15 | - id: fix-byte-order-marker 16 | - id: fix-encoding-pragma 17 | args: [--remove] 18 | - id: mixed-line-ending 19 | args: [--fix=lf] 20 | - id: requirements-txt-fixer 21 | - id: trailing-whitespace 22 | 23 | - repo: https://github.com/pre-commit/mirrors-yapf 24 | rev: v0.31.0 25 | hooks: 26 | - id: yapf 27 | 28 | - repo: https://github.com/codespell-project/codespell 29 | rev: v2.1.0 30 | hooks: 31 | - id: codespell 32 | args: [ 33 | # Do not check files that are automatically generated 34 | '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json', 35 | '--ignore-regex=\\n', # Do not count the 'n' in an 
escaped newline as part of a word 36 | '--ignore-words-list=unsupport,aks', # Word used in error messages that need rewording 37 | --check-filenames, 38 | --check-hidden 39 | ] 40 | 41 | - repo: https://github.com/pycqa/flake8 42 | rev: 4.0.1 43 | hooks: 44 | - id: flake8 45 | args: ['--ignore=E,F403,F405,F541,F841,W', '--select=E9,F,W6', '--per-file-ignores=__init__.py:F401,mii/grpc_related/proto/modelresponse_pb2.py:F821,F401,mii/legacy/grpc_related/proto/legacymodelresponse_pb2.py:F821,F401'] 46 | 47 | - repo: local 48 | hooks: 49 | - id: check-license 50 | name: check-license 51 | entry: ./scripts/check-license.py 52 | language: script 53 | files: \.(py|c|cpp|cu|cc|h|hpp|cuh|hip|tr|sh)$ 54 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | build: 4 | os: "ubuntu-22.04" 5 | tools: 6 | python: "3.10" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements.txt 11 | 12 | sphinx: 13 | configuration: docs/source/conf.py 14 | -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | SPLIT_ALL_COMMA_SEPARATED_VALUES = true 3 | COLUMN_LIMIT = 89 4 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @tohtana @tjruwase @loadams 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | DeepSpeed-MII welcomes your contributions! 3 | 4 | ## Prerequisites 5 | We use [pre-commit](https://pre-commit.com/) to ensure that formatting is 6 | consistent across DeepSpeed. First, ensure that `pre-commit` is installed from either 7 | installing DeepSpeed or `pip install pre-commit`. Next, the pre-commit hooks must be 8 | installed once before commits can be made: 9 | ```bash 10 | pre-commit install 11 | ``` 12 | 13 | Afterwards, our suite of formatting tests run automatically before each `git commit`. You 14 | can also run these manually: 15 | ```bash 16 | pre-commit run --all-files 17 | ``` 18 | If a formatting test fails, it will fix the modified code in place and abort 19 | the `git commit`. After looking over the changes, you can `git add ` 20 | and then repeat the previous `git commit` command. 21 | 22 | ## Developer Certificate of Origin 23 | This project welcomes contributions and suggestions. 
All contributions to deepspeedai projects 24 | require commits to be signed off with a [Developer Certificate of Origin](https://en.wikipedia.org/wiki/Developer_Certificate_of_Origin) 25 | (DCO) declaring that you have the right to, and actually do, grant us the rights to use your contribution. 26 | 27 | When you submit a pull request, the DCO app will check for the presence of signed commits. 28 | Information about how this check works is here: https://github.com/dcoapp/app?tab=readme-ov-file#how-it-works 29 | 30 | ## Code of Conduct 31 | This project has adopted the [Microsoft Open Source Code of 32 | Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the 33 | [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact 34 | [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or 35 | comments. 36 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). 7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 26 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/CNAME -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/images/fast-gen-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fast-gen-overview.png -------------------------------------------------------------------------------- /docs/images/fastgen-24-01-hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-24-01-hero-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-24-01-hero-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-24-01-hero-light.png -------------------------------------------------------------------------------- /docs/images/fastgen-arch-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-arch-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-arch-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-arch-light.png -------------------------------------------------------------------------------- /docs/images/fastgen-hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-hero-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-hero-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-hero-light.png -------------------------------------------------------------------------------- /docs/images/fastgen-hero.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-hero.png -------------------------------------------------------------------------------- /docs/images/fastgen-overview-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-overview-dark.png -------------------------------------------------------------------------------- /docs/images/fastgen-overview-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/fastgen-overview-light.png -------------------------------------------------------------------------------- 
/docs/images/mii-arch-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/mii-arch-dark.png -------------------------------------------------------------------------------- /docs/images/mii-arch-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/docs/images/mii-arch-light.png -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | asyncio 2 | autodoc_pydantic>=2.0.0 3 | deepspeed>=0.15.0 4 | grpcio 5 | grpcio-tools 6 | sphinx==7.1.2 7 | sphinx-prompt 8 | sphinx-rtd-theme==1.3.0rc1 9 | sphinx_autodoc_typehints 10 | sphinx_copybutton 11 | torch 12 | transformers 13 | ujson 14 | zmq 15 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | DeepSpeed-MII provides a very simple API to deploy your LLM: 5 | 6 | .. autofunction:: mii.pipeline 7 | 8 | The :func:`mii.pipeline` API is a great way to try DeepSpeed-MII with ragged 9 | batching and dynamic splitfuse. The pipeline is non-persistent and only exists 10 | for the lifetime of the python script where it is used. For examples of how to 11 | use :func:`mii.pipeline` please see :doc:`pipeline`. 12 | 13 | .. autofunction:: mii.serve 14 | 15 | The :func:`mii.serve` API is intended for production use cases, where a 16 | persistent model deployment is necessary. The persistent deployment utilizes 17 | ragged batching and dynamic splitfuse to deliver high throughput and low latency 18 | to multiple clients in parallel. For examples of how to use :func:`mii.serve` 19 | please see :doc:`deployment`. 20 | 21 | .. autofunction:: mii.client 22 | 23 | The :func:`mii.client` API allows multiple processes to connect to a persistent 24 | deployment created with :func:`mii.serve`. For examples of how to use 25 | :func:`mii.client` please see :doc:`deployment`. 
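For orientation, the short sketch below simply combines the examples from :doc:`pipeline` and :doc:`deployment` to show how the three APIs relate. It is illustrative only (in practice you would typically use either the pipeline or the persistent deployment, not both in one script) and uses the same ``mistralai/Mistral-7B-v0.1`` model as the rest of these docs:

.. code-block:: python

   import mii

   # Non-persistent pipeline: exists only for the lifetime of this script.
   pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")
   print(pipe(["DeepSpeed is"], max_new_tokens=128))

   # Persistent deployment: starts a gRPC server and returns a client.
   client = mii.serve("mistralai/Mistral-7B-v0.1")
   print(client(["Seattle is"], max_new_tokens=128))

   # Any other process can attach to the running deployment...
   client = mii.client("mistralai/Mistral-7B-v0.1")
   # ...and shut it down when finished.
   client.terminate_server()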
26 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | # Configuration file for the Sphinx documentation builder. 6 | import os 7 | import sys 8 | 9 | sys.path.insert(0, os.path.abspath('../../')) 10 | 11 | # -- Project information 12 | 13 | project = 'DeepSpeed-MII' 14 | copyright = '2023, Microsoft' 15 | author = 'Microsoft' 16 | 17 | with open("../../version.txt", "r") as f: 18 | release = f.readline().rstrip() 19 | 20 | # -- General configuration 21 | 22 | extensions = [ 23 | 'sphinx.ext.duration', 24 | 'sphinx.ext.doctest', 25 | 'sphinx.ext.autodoc', 26 | 'sphinx.ext.autosummary', 27 | 'sphinx.ext.intersphinx', 28 | 'sphinx.ext.viewcode', 29 | 'sphinx_autodoc_typehints', 30 | 'sphinx_copybutton', 31 | 'sphinx-prompt', 32 | 'sphinxcontrib.autodoc_pydantic', 33 | ] 34 | 35 | intersphinx_mapping = { 36 | 'python': ('https://docs.python.org/3/', 37 | None), 38 | 'sphinx': ('https://www.sphinx-doc.org/en/master/', 39 | None), 40 | } 41 | intersphinx_disabled_domains = ['std'] 42 | 43 | # sphinx_autodoc_typehints config 44 | typehints_defaults = "braces" 45 | 46 | # autodoc_pyandtic config 47 | autodoc_pydantic_model_show_field_summary = False 48 | autodoc_pydantic_field_signature_prefix = ' ' 49 | autodoc_pydantic_model_signature_prefix = 'class' 50 | autodoc_pydantic_model_show_json = False 51 | autodoc_pydantic_model_show_config_summary = False 52 | autodoc_pydantic_model_show_config_member = False 53 | autodoc_pydantic_model_show_validator_summary = False 54 | autodoc_pydantic_model_show_validator_members = False 55 | autodoc_pydantic_model_summary_list_order = 'bysource' 56 | autodoc_pydantic_model_member_order = 'bysource' 57 | autodoc_pydantic_field_list_validators = False 58 | 59 | # sphinx_copybutton config 60 | copybutton_prompt_text = r">>> |\$ |\(.venv\) \$ " 61 | copybutton_prompt_is_regexp = True 62 | 63 | #autodoc_mock_imports = ["deepspeed", "torch"] 64 | autodoc_member_order = 'bysource' 65 | autosummary_generate = True 66 | 67 | templates_path = ['_templates'] 68 | 69 | # -- Options for HTML output 70 | 71 | html_theme = 'sphinx_rtd_theme' 72 | html_theme_options = { 73 | "logo_only": True, 74 | } 75 | html_logo = "../images/mii-dark.svg" 76 | logo_only = True 77 | 78 | # -- Options for EPUB output 79 | epub_show_urls = 'footnote' 80 | -------------------------------------------------------------------------------- /docs/source/config.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ============= 3 | 4 | The config classes described here are used to customize :doc:`pipeline` and :doc:`deployment`. 5 | 6 | .. _model_configuration: 7 | 8 | Model Configuration 9 | ------------------- 10 | 11 | The :class:`ModelConfig ` is used to stand up a 12 | DeepSpeed inference engine and provides a large amount of control to users. This 13 | class is automatically generated from user-provided arguments to 14 | :func:`mii.pipeline` and :func:`mii.serve`. The fields can be provided in a 15 | ``model_config`` dictionary or as keyword arguments. 16 | 17 | For example, to change the default ``max_length`` for token generation of a 18 | pipeline, the following are equivalent: 19 | 20 | As a keyword argument: 21 | 22 | .. 
code-block:: python 23 | 24 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", max_length=2048) 25 | 26 | As a ``model_config`` dictionary: 27 | 28 | .. code-block:: python 29 | 30 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", model_config={"max_length": 2048}) 31 | 32 | .. autopydantic_model:: mii.config.ModelConfig 33 | 34 | .. _mii_configuration: 35 | 36 | MII Server Configuration 37 | ------------------------ 38 | 39 | The :class:`MIIConfig ` is used to stand up a 40 | DeepSpeed-MII `gRPC `_ server and provide a large amount of 41 | control to users. This class is automatically generated from user-provided 42 | arguments to :func:`mii.serve`. The fields can be provided in a ``mii_config`` 43 | dictionary or as keyword arguments. 44 | 45 | For example, to change the base port number used to communicate with a 46 | persistent deployment and the default ``max_length`` for token generation, the 47 | following are equivalent: 48 | 49 | As keyword arguments: 50 | 51 | .. code-block:: python 52 | 53 | client = mii.serve("mistralai/Mistral-7B-v0.1", port_number=50055, max_length=2048) 54 | 55 | As ``model_config`` and ``mii_config`` dictionaries: 56 | 57 | .. code-block:: python 58 | 59 | client = mii.serve("mistralai/Mistral-7B-v0.1", mii_config={"port_number": 50055}, model_config={"max_length": 2048}) 60 | 61 | .. autopydantic_model:: mii.config.MIIConfig 62 | 63 | Text-Generation Configuration 64 | ----------------------------- 65 | 66 | The :class:`GenerateParamsConfig ` is used to 67 | process user-provided keyword arguments passed to :class:`MIIPipeline 68 | ` and :class:`MIIClient 69 | ` when doing text-generation. 70 | 71 | .. autopydantic_model:: mii.config.GenerateParamsConfig 72 | :exclude-members: prompt_length 73 | -------------------------------------------------------------------------------- /docs/source/deployment.rst: -------------------------------------------------------------------------------- 1 | Persistent Deployments 2 | ====================== 3 | 4 | A persistent model deployment can be created with the :func:`mii.serve` API. This 5 | stands up a gRPC server and returns a :class:`MIIClient 6 | ` object that can be used to send generation 7 | requests to the inference server. The inference server will persist after the 8 | python script exits and until it is explicitly terminated. 9 | 10 | To connect to an existing deployment, the :func:`mii.client` API is used. This 11 | will connect with an existing gRPC server and return a :class:`MIIClient 12 | ` object. 13 | 14 | MIIClient 15 | --------- 16 | 17 | .. autoclass:: 18 | mii.backend.client.MIIClient 19 | 20 | .. automethod:: __call__ 21 | 22 | .. automethod:: generate 23 | 24 | .. automethod:: terminate_server 25 | 26 | :class:`MIIClient ` is a callable class that 27 | provides a simplified interface for generating text for prompt inputs on a 28 | persistent model deployment. To create a persistent deployment, you must only 29 | provide the HuggingFace model name (or path to a locally stored model) to the 30 | :func:`mii.serve` API. DeepSpeed-MII will automatically load the model weights, 31 | create an inference engine, stand up a gRPC server, and return the callable 32 | client. An example is provided below: 33 | 34 | ..
code-block:: python 35 | 36 | import mii 37 | client = mii.serve("mistralai/Mistral-7B-v0.1") 38 | response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 39 | print(response) 40 | 41 | Because the deployment is persistent, this server will continue running until it 42 | is explicitly shut down. This allows users to connect to a deployment from other 43 | processes using the :func:`mii.client` API: 44 | 45 | .. code-block:: python 46 | 47 | import mii 48 | client = mii.client("mistralai/Mistral-7B-v0.1") 49 | response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 50 | print(response) 51 | 52 | When a server needs to be shut down, this can be done from any client object: 53 | 54 | .. code-block:: python 55 | 56 | import mii 57 | client = mii.client("mistralai/Mistral-7B-v0.1") 58 | client.terminate_server() 59 | 60 | Deployment Configuration 61 | ------------------------ 62 | 63 | While we prioritize offering a simple interface for loading models into 64 | production-ready persistent deployments, we also provide many configuration 65 | options for our persistent deployment. 66 | 67 | **Any of the fields in** :class:`ModelConfig ` **and** 68 | :class:`MIIConfig ` **can be passed as keyword 69 | arguments or in respective** ``model_config`` **and** ``mii_config`` 70 | **dictionaries to the** :func:`mii.serve` **API. Please see** :ref:`Model 71 | Configuration ` **and** :ref:`MII Server Configuration 72 | ` **for more information.** 73 | 74 | 75 | Generate Options 76 | ---------------- 77 | 78 | Text-generation behavior using the callable :class:`MIIClient 79 | ` class can be customized with several keyword 80 | arguments. A full list of the available options can be found in 81 | :class:`GenerateParamsConfig `. 82 | 83 | The generate options affect only the prompt(s) passed in a given call to the client. 84 | For example, the generation length can be controlled on a per-prompt basis and 85 | override the default ``max_length``: 86 | 87 | .. code-block:: python 88 | 89 | response_long = client(prompt, max_length=1024) 90 | response_short = client(prompt, max_length=128) 91 | 92 | .. _deployment_model_parallelism: 93 | 94 | Model Parallelism 95 | ----------------- 96 | 97 | Our persistent deployment supports splitting models across multiple GPUs using 98 | tensor parallelism. To enable model parallelism, pass the ``tensor_parallel`` 99 | argument to :func:`mii.serve`: 100 | 101 | .. code-block:: python 102 | 103 | client = mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=2) 104 | 105 | .. _deployment_model_replicas: 106 | 107 | Model Replicas 108 | -------------- 109 | 110 | The persistent deployment can also create multiple model replicas. Passing the 111 | ``replica_num`` argument to :func:`mii.serve` enables this feature: 112 | 113 | .. code-block:: python 114 | 115 | client = mii.serve("mistralai/Mistral-7B-v0.1", replica_num=2) 116 | 117 | With multiple model replicas, the incoming requests from clients will be 118 | forwarded to the replicas in a round-robin fashion by an intermediate 119 | load-balancer process. For example, if 4 requests with ids ``0, 1, 2, 3`` are 120 | sent to the persistent deployment, then ``replica 0`` will process requests 121 | ``0`` and ``2`` while ``replica 1`` will process requests ``1`` and ``3``. 122 | 123 | Model replicas also compose with model parallelism. For example, 2 replicas can 124 | be created, each split across 2 GPUs, on a system with 4 GPUs total: 125 | 126 | ..
code-block:: python 127 | 128 | client = mii.serve("mistralai/Mistral-7B-v0.1", replica_num=2, tensor_parallel=2) 129 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | DeepSpeed-MII 2 | ============= 3 | 4 | .. image:: ../images/mii-white.svg 5 | :width: 600 6 | 7 | .. note:: 8 | 9 | This project is under active development. 10 | 11 | 12 | Introducing MII, an open-source Python library designed by DeepSpeed to 13 | democratize powerful model inference with a focus on high-throughput, low 14 | latency, and cost-effectiveness. 15 | 16 | MII v0.1 introduced several features as part of our `DeepSpeed-FastGen release 17 | `_ 18 | such as blocked KV-caching, continuous batching, Dynamic SplitFuse, tensor 19 | parallelism, and high-performance CUDA kernels to support fast high throughput 20 | text-generation with LLMs. The latest version of MII delivers up to 2.5 times 21 | higher effective throughput compared to leading systems such as vLLM. For 22 | detailed performance results please see our `DeepSpeed-FastGen release blog 23 | `_ 24 | and the `latest DeepSpeed-FastGen blog 25 | `_. 26 | 27 | MII-Legacy 28 | ---------- 29 | 30 | We first `announced MII `_ in 31 | 2022. Since then, MII has undergone a large refactoring effort to bring support 32 | of DeepSpeed-FastGen. MII-Legacy, which covers all prior releases up to v0.0.9, 33 | provides support for running inference for a wide variety of language model 34 | tasks. We also support accelerating `text2image models like Stable Diffusion 35 | `_. 36 | For more details on our previous releases please see our `legacy APIs 37 | `_. 38 | 39 | 40 | Contents 41 | -------- 42 | 43 | .. toctree:: 44 | :maxdepth: 1 45 | 46 | quick-start 47 | install 48 | api 49 | pipeline 50 | deployment 51 | response 52 | config 53 | rest 54 | parallelism 55 | replicas 56 | -------------------------------------------------------------------------------- /docs/source/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | PyPI Install 5 | ------------ 6 | 7 | The quickest way to get started with DeepSpeed-MII is to install it from `PyPI 8 | `_ using pip: 9 | 10 | .. code-block:: console 11 | 12 | (.venv) $ pip install deepspeed-mii 13 | 14 | Source Install 15 | -------------- 16 | 17 | If you want the latest changes on the ``main`` repository branch, you can use 18 | pip to install from source: 19 | 20 | .. code-block:: console 21 | 22 | (.venv) $ pip install git+https://github.com/deepspeedai/DeepSpeed-MII.git 23 | 24 | Or you can clone the repository and install: 25 | 26 | .. code-block:: console 27 | 28 | (.venv) $ git clone https://github.com/deepspeedai/DeepSpeed-MII.git 29 | (.venv) $ pip install ./DeepSpeed-MII 30 | -------------------------------------------------------------------------------- /docs/source/parallelism.rst: -------------------------------------------------------------------------------- 1 | Model Parallelism 2 | ================= 3 | 4 | DeepSpeed-MII supports model parallelism via tensor parallelism for splitting models across multiple GPUs. 5 | 6 | For model parallelism with :doc:`pipeline`, please see :ref:`Pipeline Model 7 | Parallelism `. 8 | 9 | For model parallelism with :doc:`deployment`, please see :ref:`Persistent 10 | Deployment Model Parallelism `. 
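For quick reference, the persistent-deployment form (covered in detail in the section linked above) looks like this:

.. code-block:: python

   import mii

   # Split the model across 2 GPUs with tensor parallelism.
   client = mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=2)

The non-persistent pipeline instead relies on the ``deepspeed`` launcher (e.g., ``deepspeed --num_gpus 2 example.py``) to control the number of devices, as described in the pipeline section linked above.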
11 | -------------------------------------------------------------------------------- /docs/source/pipeline.rst: -------------------------------------------------------------------------------- 1 | Non-Persistent Pipelines 2 | ======================== 3 | 4 | A non-persistent pipeline can be created with the :func:`mii.pipeline` API. This 5 | returns a non-persistent :class:`MIIPipeline 6 | ` object that is destroyed when the 7 | python script exits. 8 | 9 | MIIPipeline 10 | ----------- 11 | 12 | .. autoclass:: 13 | mii.batching.ragged_batching.MIIPipeline 14 | 15 | .. automethod:: __call__ 16 | 17 | :class:`MIIPipeline ` is a callable 18 | class that provides a simplified interface for generating text for prompt 19 | inputs. To create a pipeline, you must only provide the HuggingFace model name 20 | (or path to a locally stored model) to the :func:`mii.pipeline` API. 21 | DeepSpeed-MII will automatically load the model weights, create an inference 22 | engine, and return the callable pipeline. A simple 4-line example is provided below: 23 | 24 | .. code-block:: python 25 | 26 | import mii 27 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 28 | response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 29 | print(response) 30 | 31 | Pipeline Configuration 32 | ---------------------- 33 | 34 | While we prioritize offering a simple interface to load models and run 35 | text-generation, we also provide many configuration options for users that want 36 | to customize the pipeline. 37 | 38 | **Any of the fields in** :class:`ModelConfig ` **can be 39 | passed as keyword arguments or in a** ``model_config`` **dictionary to the** 40 | :func:`mii.pipeline` **API. Please see** :ref:`Model Configuration 41 | ` **for more information.** 42 | 43 | Generate Options 44 | ---------------- 45 | 46 | The text-generation behavior of the callable :class:`MIIPipeline 47 | ` class can be modified with several 48 | keyword arguments. A full list of the available options can be found in 49 | :class:`GenerateParamsConfig `. 50 | 51 | The generate options affect only the prompt(s) passed in a given call to the 52 | pipeline. For example, you can control per-prompt generation length: 53 | 54 | .. code-block:: python 55 | 56 | response_long = pipeline(prompt, max_length=1024) 57 | response_short = pipeline(prompt, max_length=128) 58 | 59 | .. _pipeline_model_parallelism: 60 | 61 | Model Parallelism 62 | ----------------- 63 | 64 | Our pipeline object supports splitting models across multiple GPUs using tensor 65 | parallelism. You must use the ``deepspeed`` launcher to enable tensor parallelism 66 | with the non-persistent pipeline, where the number of devices is controlled by 67 | the ``--num_gpus `` option. 68 | 69 | As an example, consider the following ``example.py`` python script: 70 | 71 | .. code-block:: python 72 | 73 | # example.py 74 | import mii 75 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 76 | 77 | To run this pipeline on a single GPU, use ``python`` or ``deepspeed --num_gpus 1``: 78 | 79 | .. code-block:: console 80 | 81 | (.venv) $ python example.py 82 | 83 | To enable tensor parallelism across 2 GPUs, use ``deepspeed --num_gpus 2``: 84 | 85 | .. code-block:: console 86 | 87 | (.venv) $ deepspeed --num_gpus 2 example.py 88 | 89 | Because the ``deepspeed`` launcher will run multiple processes of 90 | ``example.py``, anything in the script will be executed by each process. For 91 | example, consider the following script: 92 | 93 | ..
code-block:: python 94 | 95 | # example.py 96 | import os 97 | import mii 98 | local_rank = int(os.getenv("LOCAL_RANK", 0)) 99 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 100 | response = pipe("DeepSpeed is", max_length=16) 101 | print(f"rank {local_rank} response: {response}") 102 | 103 | By default, the response is returned to only the rank 0 process. When run 104 | with ``deepspeed --num_gpus 2 example.py`` the following output is produced: 105 | 106 | .. code-block:: console 107 | 108 | (.venv) $ deepspeed --num_gpus 2 example.py 109 | rank 0 response: [a library for parallelizing and accelerating PyTorch.] 110 | rank 1 response: [] 111 | 112 | This behavior can be changed by enabling ``all_rank_output`` when creating the 113 | pipeline (i.e., ``pipe = mii.pipeline("mistralai/Mistral-7B-v0.1", 114 | all_rank_output=True)``): 115 | 116 | .. code-block:: console 117 | 118 | (.venv) $ deepspeed --num_gpus 2 example.py 119 | rank 0 response: [a library for parallelizing and accelerating PyTorch.] 120 | rank 1 response: [a library for parallelizing and accelerating PyTorch.] 121 | -------------------------------------------------------------------------------- /docs/source/quick-start.rst: -------------------------------------------------------------------------------- 1 | FastGen Quick Start Guide 2 | ========================= 3 | 4 | This guide is aimed at getting you quickly up and running with DeepSpeed-MII and DeepSpeed-FastGen. 5 | 6 | Requirements 7 | ------------ 8 | 9 | - 1 or more NVIDIA GPUs with >=sm_80 compute capability (e.g., A100, A6000) 10 | - `PyTorch `_ installed in your local Python environment 11 | 12 | Install 13 | ------- 14 | 15 | Install the latest version of DeepSpeed-MII with the following: 16 | 17 | .. code-block:: console 18 | 19 | (.venv) $ pip install -U deepspeed-mii 20 | 21 | Run a Non-Persistent Pipeline 22 | ----------------------------- 23 | 24 | A pipeline provides a non-persistent instance of the model for running 25 | inference. When the script running this code exits, the model will also be 26 | destroyed. The pipeline is ideal for doing quick tests or in cases where the 27 | best performance is not necessary. 28 | 29 | Copy the following code block into an ``example.py`` file on your local machine. 30 | Run it with ``deepspeed --num_gpus example.py``. 31 | 32 | .. code-block:: python 33 | 34 | import mii 35 | pipe = mii.pipeline("mistralai/Mistral-7B-v0.1") 36 | response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 37 | for r in response: 38 | print(r.generated_text) 39 | 40 | .. note:: 41 | 42 | Depending on your internet connection, the download of model weights could 43 | take a few minutes. If you wish to try a smaller model, replace 44 | ``mistralai/Mistral-7B-v0.1`` with ``facebook/opt-125m`` in the above code. 45 | 46 | If the code successfully runs, you should see the generated text printed in your terminal. 47 | 48 | Run a Persistent Deployment 49 | --------------------------- 50 | 51 | In contrast to the pipeline, deployments create a server process that persists 52 | beyond the execution of the python script. These deployments are intended for 53 | production use cases and allow for multiple clients to connect while providing 54 | the best performance from DeepSpeed-FastGen. 55 | 56 | Copy the following code block into a ``serve.py`` file on your local machine. 57 | Run it with ``python serve.py``. 58 | 59 | ..
code-block:: python 60 | 61 | import mii 62 | mii.serve("mistralai/Mistral-7B-v0.1") 63 | 64 | You should see logging messages indicating the server is starting and a final 65 | log message of ``server has started on ports [50051]``. 66 | 67 | Now copy the following code block into a ``client.py`` file on your local 68 | machine. Run it with ``python client.py``. 69 | 70 | .. code-block:: python 71 | 72 | import mii 73 | client = mii.client("mistralai/Mistral-7B-v0.1") 74 | response = client(["DeepSpeed is", "Seattle is"], max_new_tokens=128) 75 | for r in response: 76 | print(r.generated_text) 77 | 78 | If the code successfully runs, you should see the generated text printed in your 79 | terminal. You can run this client script as many times (and from as many 80 | different processes) as you like and the model deployment will remain active. 81 | 82 | Finally copy the following code block into a ``terminate.py`` file on your local 83 | machine. Run it with ``python terminate.py``. 84 | 85 | .. code-block:: python 86 | 87 | import mii 88 | client = mii.client("mistralai/Mistral-7B-v0.1") 89 | client.terminate_server() 90 | 91 | This will shutdown the model deployment and free GPU memory. 92 | -------------------------------------------------------------------------------- /docs/source/replicas.rst: -------------------------------------------------------------------------------- 1 | Model Replicas 2 | ============== 3 | 4 | DeepSpeed-MII supports creating multiple replicas of a model with 5 | :doc:`deployment`. Please see :ref:`Persistent Deployment Model Replicas 6 | `. 7 | -------------------------------------------------------------------------------- /docs/source/response.rst: -------------------------------------------------------------------------------- 1 | Response Objects 2 | ================ 3 | 4 | Generated text from :doc:`pipeline` and :doc:`deployment` are wrapped in the 5 | :class:`Response ` class. 6 | 7 | .. autoclass:: 8 | mii.batching.data_classes.Response 9 | :members: 10 | 11 | Printing a :class:`Response ` object will 12 | print only the ``generated_text`` attribute. Details about the generation can be 13 | accessed as python attributes of the class: 14 | 15 | .. code-block:: python 16 | 17 | responses = pipeline(["DeepSpeed is", "Seattle is"], max_length=128) 18 | for r in responses: 19 | print(f"generated length: {r.generated_length}, finish reason: {r.finish_reason}") 20 | 21 | The reason that a text-generation request completed will be one of the values 22 | found in the :class:`GenerationFinishReason 23 | ` enum: 24 | 25 | .. autoclass:: 26 | mii.constants.GenerationFinishReason 27 | :inherited-members: 28 | -------------------------------------------------------------------------------- /docs/source/rest.rst: -------------------------------------------------------------------------------- 1 | RESTful API 2 | =========== 3 | 4 | With a :doc:`deployment`, a RESTful API can be created. This allows users to 5 | send requests to the server via ``HTTP POST`` methods (e.g., using ``curl`` or 6 | the Python ``requests`` module). The RESTful API can be enabled with the 7 | ``enable_restful_api`` option using :func:`mii.serve`: 8 | 9 | .. 
code-block:: python 10 | 11 | client = mii.serve( 12 | "mistralai/Mistral-7B-v0.1", 13 | deployment_name="test_dep", 14 | enable_restful_api=True, 15 | restful_api_port=28080, 16 | ) 17 | 18 | It is useful to provide a ``deployment_name`` and ``restful_api_port`` when 19 | enabling the RESTful API as it will be used to provide an address where requests 20 | can be sent. The address for sending requests will be 21 | ``http://{HOST}:{RESTFUL_API_PORT}/mii/{DEPLOYMENT_NAME}``. In the above 22 | example, this will be ``http://localhost:28080/mii/test_dep``. 23 | 24 | To send a request to the RESTful API, use the ``HTTP POST`` method. For example, using ``curl``: 25 | 26 | .. code-block:: console 27 | 28 | (.venv) $ curl --header "Content-Type: application/json" --request POST -d '{"prompts": ["DeepSpeed is", "Seattle is"], "max_length": 128}' http://localhost:28080/mii/test_dep 29 | 30 | or using the Python ``requests`` module: 31 | 32 | .. code-block:: python 33 | 34 | import json 35 | import requests 36 | url = f"http://localhost:28080/mii/test_dep" 37 | params = {"prompts": ["DeepSpeed is", "Seattle is"], "max_length": 128} 38 | json_params = json.dumps(params) 39 | output = requests.post( 40 | url, data=json_params, headers={"Content-Type": "application/json"} 41 | ) 42 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # MII Examples 2 | Please see [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) for a few examples on using MII. 3 | -------------------------------------------------------------------------------- /examples/chat_templates/template_alpaca.jinja: -------------------------------------------------------------------------------- 1 | {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} 2 | 3 | {% for message in messages %} 4 | {% if message['role'] == 'user' %} 5 | ### Instruction: 6 | {{ message['content']|trim -}} 7 | {% if not loop.last %} 8 | 9 | 10 | {% endif %} 11 | {% elif message['role'] == 'assistant' %} 12 | ### Response: 13 | {{ message['content']|trim -}} 14 | {% if not loop.last %} 15 | 16 | 17 | {% endif %} 18 | {% elif message['role'] == 'user_context' %} 19 | ### Input: 20 | {{ message['content']|trim -}} 21 | {% if not loop.last %} 22 | 23 | 24 | {% endif %} 25 | {% endif %} 26 | {% endfor %} 27 | {% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} 28 | ### Response: 29 | {% endif %} 30 | -------------------------------------------------------------------------------- /mii/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .api import client, serve, pipeline 6 | 7 | from .legacy import MIIServer, MIIClient, mii_query_handle, deploy, terminate, DeploymentType, TaskType, aml_output_path, MIIConfig, ModelConfig, get_supported_models 8 | 9 | __version__ = "0.0.0" 10 | try: 11 | from .version import __version__ 12 | except ImportError: 13 | pass 14 | -------------------------------------------------------------------------------- /mii/aml_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .templates import * 6 | from .utils import get_acr_name, generate_aml_scripts, aml_output_path 7 | -------------------------------------------------------------------------------- /mii/backend/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .client import MIIClient 6 | from .server import MIIServer 7 | -------------------------------------------------------------------------------- /mii/backend/client.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import asyncio 6 | import grpc 7 | import requests 8 | from typing import Dict, Any, Callable, List, Union 9 | 10 | from mii.batching.data_classes import Response 11 | from mii.config import MIIConfig 12 | from mii.constants import GRPC_MAX_MSG_SIZE 13 | from mii.grpc_related.proto import modelresponse_pb2, modelresponse_pb2_grpc 14 | from mii.grpc_related.task_methods import TASK_METHODS_DICT 15 | 16 | 17 | def create_channel(host, port): 18 | return grpc.aio.insecure_channel( 19 | f"{host}:{port}", 20 | options=[ 21 | ("grpc.max_send_message_length", 22 | GRPC_MAX_MSG_SIZE), 23 | ("grpc.max_receive_message_length", 24 | GRPC_MAX_MSG_SIZE), 25 | ], 26 | ) 27 | 28 | 29 | class MIIClient: 30 | """ 31 | Client for sending generation requests to a persistent deployment created 32 | with :func:`mii.serve`. Use :func:`mii.client` to create an instance of this 33 | class. 34 | 35 | :param mii_config: MII config for the persistent deployment to connect with. 36 | :param host: hostname where the persistent deployment is running. 37 | """ 38 | def __init__(self, mii_config: MIIConfig, host: str = "localhost") -> None: 39 | self.mii_config = mii_config 40 | self.task = mii_config.model_conf.task 41 | self.port = mii_config.port_number 42 | self.asyncio_loop = asyncio.get_event_loop() 43 | channel = create_channel(host, self.port) 44 | # This stub allows interaction the client to send/receive messages with 45 | # the load balancer process 46 | self.stub = modelresponse_pb2_grpc.ModelResponseStub(channel) 47 | 48 | def __call__(self, *args, **kwargs) -> List[Response]: 49 | """ 50 | All args and kwargs get passed directly to 51 | :meth:`~mii.backend.client.MIIClient.generate`. 52 | 53 | :return: A list of :class:`Response` objects containing the generated 54 | text for all prompts. 
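        A minimal usage sketch (assumes a persistent deployment of this model
        was already started with ``mii.serve``; the model name shown is only an
        example):

        .. code-block:: python

            client = mii.client("mistralai/Mistral-7B-v0.1")
            responses = client(["DeepSpeed is"], max_new_tokens=64)
            print(responses[0].generated_text)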
55 | """ 56 | return self.generate(*args, **kwargs) 57 | 58 | async def _request_async_response(self, prompts, **query_kwargs): 59 | task_methods = TASK_METHODS_DICT[self.task] 60 | proto_request = task_methods.pack_request_to_proto(prompts, **query_kwargs) 61 | proto_response = await getattr(self.stub, task_methods.method)(proto_request) 62 | return task_methods.unpack_response_from_proto(proto_response) 63 | 64 | async def _request_async_response_stream(self, prompts, **query_kwargs): 65 | task_methods = TASK_METHODS_DICT[self.task] 66 | proto_request = task_methods.pack_request_to_proto(prompts, **query_kwargs) 67 | assert hasattr(task_methods, "method_stream_out"), f"{self.task} does not support streaming response" 68 | async for response in getattr(self.stub, 69 | task_methods.method_stream_out)(proto_request): 70 | yield task_methods.unpack_response_from_proto(response) 71 | 72 | def generate(self, 73 | prompts: Union[str, 74 | List[str]], 75 | streaming_fn: Callable = None, 76 | **generate_kwargs: Dict) -> List[Response]: 77 | """ 78 | Generates text for the given prompts. 79 | 80 | :param prompts: The string or list of strings used as prompts for generation. 81 | :param streaming_fn: Streaming support is currently a WIP. 82 | :param \\*\\*generate_kwargs: Generation keywords. A full list can be found here. 83 | 84 | :return: A list of :class:`Response` objects containing the generated 85 | text for all prompts. 86 | """ # noqa: W605 87 | if isinstance(prompts, str): 88 | prompts = [prompts] 89 | if streaming_fn is not None: 90 | if len(prompts) > 1: 91 | raise RuntimeError( 92 | "MII client streaming only supports a single prompt input.") 93 | generate_kwargs["stream"] = True 94 | return self._generate_stream(streaming_fn, prompts, **generate_kwargs) 95 | 96 | return self.asyncio_loop.run_until_complete( 97 | self._request_async_response(prompts, 98 | **generate_kwargs)) 99 | 100 | def _generate_stream(self, 101 | callback, 102 | prompts: List[str], 103 | **query_kwargs: Dict[str, 104 | Any]) -> None: 105 | async def put_result(): 106 | response_stream = self._request_async_response_stream( 107 | prompts, 108 | **query_kwargs) 109 | 110 | while True: 111 | try: 112 | response = await response_stream.__anext__() 113 | callback(response) 114 | except StopAsyncIteration: 115 | break 116 | 117 | self.asyncio_loop.run_until_complete(put_result()) 118 | 119 | async def terminate_async(self) -> None: 120 | await self.stub.Terminate( 121 | modelresponse_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) 122 | 123 | def terminate_server(self) -> None: 124 | """ 125 | Terminates the persistent deployment server. This can be called from any 126 | client. 127 | """ 128 | self.asyncio_loop.run_until_complete(self.terminate_async()) 129 | if self.mii_config.enable_restful_api: 130 | requests.get( 131 | f"http://localhost:{self.mii_config.restful_api_port}/terminate") 132 | -------------------------------------------------------------------------------- /mii/batching/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from .ragged_batching import MIIAsyncPipeline, MIIPipeline 7 | -------------------------------------------------------------------------------- /mii/batching/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # Processing method key names 7 | TOP_K_NAME = "TopK" 8 | TOP_P_NAME = "TopP" 9 | TEMP_NAME = "Temp" 10 | SAMPLER_NAME = "Sampler" 11 | STOP_NAME = "Stop" 12 | -------------------------------------------------------------------------------- /mii/batching/generation/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/batching/generation/logit_processors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import abc 6 | from typing import List, Optional 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | FLOAT_PAD = -float("inf") 12 | 13 | 14 | class BaseLogitProcessor(abc.ABC): 15 | def __call__(self, logits: torch.Tensor) -> torch.Tensor: 16 | return self.forward(logits) 17 | 18 | @abc.abstractmethod 19 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 20 | ... 21 | 22 | def get_key(self) -> str: 23 | return self.__class__.__name__ 24 | 25 | 26 | class TopKLogitProcessor(BaseLogitProcessor): 27 | def __init__(self, top_k: int) -> None: 28 | self.top_k = top_k 29 | 30 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 31 | # Remove all tokens with a probability less than the 32 | # last token of the top-k 33 | indices_to_remove = logits < torch.topk(logits, self.top_k)[0][..., -1, None] 34 | logits[indices_to_remove] = FLOAT_PAD 35 | return logits 36 | 37 | def get_key(self) -> str: 38 | return super().get_key() + f"_top_k={self.top_k}" 39 | 40 | 41 | class TopPLogitProcessor(BaseLogitProcessor): 42 | def __init__(self, top_p: float) -> None: 43 | assert 0.0 <= top_p <= 1.0 44 | self.top_p = top_p 45 | 46 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 47 | # convert to 1D 48 | sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) 49 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 50 | 51 | # Remove tokens with cumulative probability above the threshold 52 | sorted_indices_to_remove = cumulative_probs > self.top_p 53 | # Shift the indices to the right to keep also the first token 54 | # above the threshold 55 | sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() 56 | sorted_indices_to_remove[..., 0] = 0 57 | 58 | indices_to_remove = sorted_indices_to_remove.scatter(1, 59 | sorted_indices, 60 | sorted_indices_to_remove) 61 | return logits.masked_fill(indices_to_remove, FLOAT_PAD) 62 | 63 | def get_key(self) -> str: 64 | return super().get_key() + f"_top_p={self.top_p}" 65 | 66 | 67 | class TemperatureLogitProcessor(BaseLogitProcessor): 68 | def __init__(self, temperature: float) -> None: 69 | self.temperature = temperature 70 | assert self.temperature > 0.0 71 | 72 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 73 | return logits / self.temperature 74 | 75 | def get_key(self) -> str: 76 | return super().get_key() + f"_temperature={self.temperature}" 77 | 78 | 79 | class PipelineLogitProcessor(BaseLogitProcessor): 80 | def __init__(self, pipeline: List[BaseLogitProcessor]) -> None: 81 | assert all(isinstance(step, BaseLogitProcessor) for step in pipeline) 82 | self.pipeline = pipeline 83 | 84 | def 
forward(self, logits: torch.Tensor) -> torch.Tensor: 85 | for step in self.pipeline: 86 | logits = step(logits) 87 | return logits 88 | 89 | def get_key(self) -> str: 90 | return super().get_key( 91 | ) + f"_{'_'.join(step.get_key() for step in self.pipeline)}" 92 | 93 | 94 | class NucleusSamplingLogitProcessor(BaseLogitProcessor): 95 | def __init__(self, 96 | top_k: Optional[int] = None, 97 | top_p: Optional[float] = None) -> None: 98 | assert top_k is not None or top_p is not None 99 | if top_k is None: 100 | self._processor = TopPLogitProcessor(top_p) 101 | elif top_p is None: 102 | self._processor = TopKLogitProcessor(top_k) 103 | else: 104 | self._processor = PipelineLogitProcessor( 105 | [TopKLogitProcessor(top_k), 106 | TopPLogitProcessor(top_p)]) 107 | 108 | def forward(self, logits: torch.Tensor) -> torch.Tensor: 109 | return self._processor(logits) 110 | 111 | def get_key(self) -> str: 112 | return super().get_key() + f"_{self._processor.get_key()}" 113 | -------------------------------------------------------------------------------- /mii/batching/generation/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import abc 6 | from typing import Tuple 7 | 8 | import torch 9 | from torch.distributions import Categorical 10 | 11 | 12 | class BaseGenerationSampler(abc.ABC): 13 | @abc.abstractmethod 14 | def __call__( 15 | self, 16 | logits: torch.Tensor, 17 | ) -> Tuple[torch.LongTensor, 18 | torch.Tensor]: 19 | """ 20 | Given the logits, return the next token to add to the sequence, as well 21 | as the log probability of the token 22 | 23 | Args: 24 | logits (torch.Tensor): 25 | The logits from the model. Shape: (batch_size, vocab_size) 26 | 27 | Returns: 28 | Tuple[torch.LongTensor, torch.Tensor]: 29 | The next token to add to the sequence, and the log probability 30 | of the token. Shape: (batch_size,) and (batch_size,) 31 | """ 32 | ... 33 | 34 | def get_key(self) -> str: 35 | return self.__class__.__name__ 36 | 37 | 38 | class LogitsSampler(BaseGenerationSampler): 39 | def __call__( 40 | self, 41 | logits: torch.Tensor, 42 | ) -> Tuple[torch.LongTensor, 43 | torch.Tensor]: 44 | logits = logits.float() 45 | sampler = Categorical(logits=logits) 46 | next_tokens = sampler.sample() 47 | #logprobs = sampler.log_prob(next_tokens) 48 | return next_tokens #, logprobs 49 | 50 | 51 | class GreedySampler(BaseGenerationSampler): 52 | def __call__(self, logits: torch.Tensor) -> Tuple[torch.LongTensor, torch.Tensor]: 53 | logits = logits.float() 54 | #sampler = Categorical(logits=logits) 55 | next_tokens = logits.argmax(dim=-1) 56 | #logprobs = sampler.log_prob(next_tokens) 57 | return next_tokens #, logprobs 58 | -------------------------------------------------------------------------------- /mii/batching/generation/stop_criterion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import abc 6 | from typing import List, Union 7 | 8 | import torch 9 | 10 | # from megatron import get_tokenizer 11 | # from megatron.tokenizer.tokenizer import AbstractTokenizer 12 | 13 | 14 | class BaseGenerationStopCriterion(abc.ABC): 15 | def __init__(self, tokenizer): 16 | self.tokenizer = tokenizer 17 | 18 | def __call__(self, tokens: torch.LongTensor) -> torch.BoolTensor: 19 | return self.forward(tokens) 20 | 21 | @abc.abstractmethod 22 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 23 | ... 24 | 25 | def get_key(self) -> str: 26 | return self.__class__.__name__ 27 | 28 | 29 | class TokenStopCriterion(BaseGenerationStopCriterion): 30 | def __init__(self, token: Union[str, int], tokenizer) -> None: 31 | super().__init__(tokenizer=tokenizer) 32 | if isinstance(token, str): 33 | token_id = self.tokenizer.convert_tokens_to_ids(token) 34 | else: 35 | token_id = token 36 | self.stop_token_id = token_id 37 | 38 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 39 | retval = torch.zeros_like(tokens, dtype=torch.bool) 40 | retval |= tokens == self.stop_token_id 41 | return retval 42 | 43 | def get_key(self) -> str: 44 | return self.__class__.__name__ + f"_token_id={self.stop_token_id}" 45 | 46 | 47 | class EosGenerationStopCriterion(BaseGenerationStopCriterion): 48 | def __init__(self, tokenizer): 49 | super().__init__(tokenizer=tokenizer) 50 | if hasattr(self.tokenizer, "eod"): 51 | self.eos_id = self.tokenizer.eod 52 | elif hasattr(self.tokenizer, "eos_token_id"): 53 | self.eos_id = self.tokenizer.eos_token_id 54 | elif hasattr(self.tokenizer, "eos_token"): 55 | self.eos_id = self.tokenizer.eos_token 56 | else: 57 | raise ValueError( 58 | "Tokenizer must have either an `eod` or `eos_token` attribute.") 59 | 60 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 61 | return tokens == self.eos_id 62 | 63 | 64 | class NewLineDelimitedStopCriterion(BaseGenerationStopCriterion): 65 | def __init__(self, tokenizer): 66 | super().__init__(tokenizer=tokenizer) 67 | self.stop_token_ids = list( 68 | set([self.tokenizer.tokenize(x)[0] for x in ["\n", 69 | "\r\n", 70 | "\n\n", 71 | ".\n\n"]])) 72 | 73 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 74 | retval = torch.zeros_like(tokens, dtype=torch.bool) 75 | for stop_token_id in self.stop_token_ids: 76 | retval |= tokens == stop_token_id 77 | return retval 78 | 79 | 80 | class PipelinedCriterion(BaseGenerationStopCriterion): 81 | def __init__( 82 | self, 83 | criteria: List[BaseGenerationStopCriterion], 84 | tokenizer, 85 | ): 86 | super().__init__(tokenizer=tokenizer) 87 | self.criteria = criteria 88 | 89 | def forward(self, tokens: torch.LongTensor) -> torch.BoolTensor: 90 | retval = torch.zeros_like(tokens, dtype=torch.bool) 91 | for criterion in self.criteria: 92 | retval |= criterion(tokens) 93 | return retval 94 | 95 | def get_key(self) -> str: 96 | return super().get_key( 97 | ) + f"_{'_'.join(criterion.get_key() for criterion in self.criteria)}" 98 | -------------------------------------------------------------------------------- /mii/batching/postprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from typing import TYPE_CHECKING, Any, Dict, List 6 | 7 | import torch 8 | 9 | if TYPE_CHECKING: 10 | from mii.batching.ragged_batching import RaggedRequestBatch 11 | 12 | 13 | def run_batch_processing(input_tensor: torch.Tensor, 14 | requests: "RaggedRequestBatch", 15 | processor_fns: Dict[str, 16 | Any]) -> torch.Tensor: 17 | """ 18 | Runs the post-processing steps for batched requests. If we apply the 19 | post-processing one-by-one for each request performance takes a big hit. 20 | Instead, we identify all the requests that need to be processed by a given 21 | post-processor, sampler, etc. and perform the action on a batch of requests. 22 | """ 23 | idx_list: List[int] = [] 24 | output_list: List[torch.Tensor] = [] 25 | 26 | # Apply all the post-processing functions 27 | for key, process_fn in processor_fns.items(): 28 | 29 | # Get the index of tensors that need to be processed 30 | idx = [i for i, r in enumerate(requests) if key in r.post_processing] 31 | if not idx: 32 | # Short circuit if there is not work to do 33 | continue 34 | 35 | # Run post processing on the filtered inputs 36 | filtered_input = input_tensor[idx] 37 | idx_list.extend(idx) 38 | output_list.append(process_fn(filtered_input)) 39 | 40 | # If there was no work done, return the input tensor 41 | if not output_list: 42 | return input_tensor 43 | 44 | # If there are unprocessed requests, append them to the output 45 | unprocessed_idx = list(set(range(len(requests))).difference(idx_list)) 46 | if unprocessed_idx: 47 | idx_list.append(unprocessed_idx) 48 | output_list.append(input_tensor[unprocessed_idx]) 49 | 50 | # Concatenate and return the output 51 | output = torch.cat(output_list, dim=0) 52 | return output[torch.argsort(torch.tensor(idx_list))] 53 | 54 | 55 | def run_batch_logit_processing(input_logits: torch.Tensor, 56 | requests: "RaggedRequestBatch", 57 | processor_map: Dict[str, 58 | Any]) -> torch.Tensor: 59 | top_k_fns = {k: v for k, v in processor_map.items() if "TopK" in k} 60 | top_p_fns = {k: v for k, v in processor_map.items() if "TopP" in k} 61 | temp_fns = {k: v for k, v in processor_map.items() if "Temp" in k} 62 | 63 | # Apply TopK, TopP, and Temperature in sequence 64 | output_logits = input_logits 65 | for fns in (top_k_fns, top_p_fns, temp_fns): 66 | output_logits = run_batch_processing(output_logits, requests, fns) 67 | return output_logits 68 | 69 | 70 | def run_batch_sampler(input_logits: torch.Tensor, 71 | requests: "RaggedRequestBatch", 72 | processor_map: Dict[str, 73 | Any]) -> torch.Tensor: 74 | sampler_fns = {k: v for k, v in processor_map.items() if "Sampler" in k} 75 | next_tokens = run_batch_processing(input_logits, requests, sampler_fns) 76 | return next_tokens 77 | 78 | 79 | def run_batch_stop_criterion(next_tokens: torch.Tensor, 80 | requests: "RaggedRequestBatch", 81 | processor_map: Dict[str, 82 | Any]) -> torch.Tensor: 83 | stop_fns = {k: v for k, v in processor_map.items() if "Stop" in k} 84 | done_tokens = run_batch_processing(next_tokens, requests, stop_fns) 85 | done_tokens = torch.any(done_tokens.view((len(requests), -1)), dim=1) 86 | 87 | return done_tokens 88 | -------------------------------------------------------------------------------- /mii/batching/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from functools import wraps 6 | 7 | from deepspeed.accelerator import get_accelerator 8 | 9 | from mii.logging import logger 10 | 11 | 12 | def sync_debug(func): 13 | @wraps(func) 14 | def wrapper(self, *args, **kwargs): 15 | if self.sync_debug: 16 | get_accelerator().synchronize() 17 | logger.debug(f"Calling {func.__name__} with args: {args}, kwargs: {kwargs}") 18 | result = func(self, *args, **kwargs) 19 | if self.sync_debug: 20 | get_accelerator().synchronize() 21 | logger.debug(f"Finished calling {func.__name__}") 22 | return result 23 | 24 | return wrapper 25 | 26 | 27 | def profiler(func): 28 | @wraps(func) 29 | def wrapper(self, *args, **kwargs): 30 | if not self.profile_model_time: 31 | return func(self, *args, **kwargs) 32 | 33 | self._timers(func.__name__).start() 34 | result = func(self, *args, **kwargs) 35 | self._timers(func.__name__).stop() 36 | self._profiled_times[func.__name__].append( 37 | self._timers(func.__name__).elapsed(reset=True)) 38 | return result 39 | 40 | return wrapper 41 | -------------------------------------------------------------------------------- /mii/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from enum import Enum 6 | 7 | 8 | class DeploymentType(str, Enum): 9 | LOCAL = "local" 10 | AML = "aml" 11 | NON_PERSISTENT = "non-persistent" 12 | 13 | 14 | class TaskType(str, Enum): 15 | TEXT_GENERATION = "text-generation" 16 | 17 | 18 | class ModelProvider(str, Enum): 19 | HUGGING_FACE = "hugging-face" 20 | 21 | 22 | class GenerationFinishReason(str, Enum): 23 | """ Reason for text-generation to stop. """ 24 | 25 | STOP = "stop" 26 | """ Reached an EoS token. """ 27 | 28 | LENGTH = "length" 29 | """ Reached ``max_length`` or ``max_new_tokens``. """ 30 | 31 | NONE = "none" 32 | 33 | 34 | SUPPORTED_MODEL_TYPES = { 35 | 'opt': ModelProvider.HUGGING_FACE, 36 | 'llama': ModelProvider.HUGGING_FACE 37 | } 38 | 39 | REQUIRED_KEYS_PER_TASK = { 40 | TaskType.TEXT_GENERATION: ["query"], 41 | } 42 | 43 | MII_CACHE_PATH = "MII_CACHE_PATH" 44 | MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" 45 | 46 | MII_HF_CACHE_EXPIRATION = "MII_HF_CACHE_EXPIRATION" 47 | MII_HF_CACHE_EXPIRATION_DEFAULT = 60 * 60 # 1 hour 48 | 49 | MII_DEBUG_MODE = "MII_DEBUG_MODE" 50 | MII_DEBUG_MODE_DEFAULT = "0" 51 | 52 | MII_DEBUG_DEPLOY_KEY = "MII_DEBUG_DEPLOY_KEY" 53 | 54 | MII_DEBUG_BRANCH = "MII_DEBUG_BRANCH" 55 | MII_DEBUG_BRANCH_DEFAULT = "main" 56 | 57 | MII_MODEL_PATH_DEFAULT = "/tmp/mii_models" 58 | 59 | GRPC_MAX_MSG_SIZE = 2**27 # ~100MB 60 | 61 | TERMINATE_METHOD = "Terminate" 62 | 63 | LB_MAX_WORKER_THREADS = 256 64 | 65 | SERVER_SHUTDOWN_TIMEOUT = 10 66 | 67 | RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT = 1 68 | RESTFUL_API_PATH = "mii" 69 | 70 | STREAM_RESPONSE_QUEUE_TIMEOUT = 600 71 | ZMQ_RECV_TIMEOUT = 5 * 1000 72 | -------------------------------------------------------------------------------- /mii/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | 7 | class DeploymentNotFoundError(Exception): 8 | pass 9 | 10 | 11 | class UnknownArgument(Exception): 12 | pass 13 | -------------------------------------------------------------------------------- /mii/grpc_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/build_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./modelresponse.proto 7 | 8 | # update import to be global wrt mii 9 | sed -i 's/modelresponse_pb2/mii.grpc_related.proto.modelresponse_pb2/g' modelresponse_pb2_grpc.py 10 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/modelresponse.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | syntax = "proto3"; 16 | 17 | /*option java_multiple_files = true; 18 | option java_package = "io.grpc.examples.helloworld"; 19 | option java_outer_classname = "HelloWorldProto"; 20 | option objc_class_prefix = "HLW";*/ 21 | 22 | import "google/protobuf/empty.proto"; 23 | 24 | package modelresponse; 25 | 26 | service ModelResponse { 27 | rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} 28 | rpc GeneratorReply (MultiStringRequest) returns (MultiGenerationReply) {} 29 | rpc GeneratorReplyStream (MultiStringRequest) returns (stream MultiGenerationReply) {} 30 | } 31 | 32 | message Dictionary { 33 | map values = 1; 34 | } 35 | 36 | message Value { 37 | oneof oneof_values { 38 | string svalue = 1; 39 | int64 ivalue = 2; 40 | float fvalue = 3; 41 | bool bvalue = 4; 42 | Dictionary mvalue = 5; 43 | } 44 | } 45 | 46 | message SingleStringRequest { 47 | string request = 1; 48 | map query_kwargs = 2; 49 | } 50 | 51 | message MultiStringRequest { 52 | repeated string request = 1; 53 | map query_kwargs = 2; 54 | } 55 | 56 | message SingleGenerationReply { 57 | string response = 1; 58 | string finish_reason = 2; 59 | int64 prompt_tokens = 3; 60 | int64 generated_tokens = 4; 61 | float time_taken = 5; 62 | float model_time_taken = 6; 63 | } 64 | 65 | message MultiGenerationReply { 66 | repeated SingleGenerationReply response = 1; 67 | } 68 | -------------------------------------------------------------------------------- /mii/grpc_related/proto/modelresponse_pb2.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | # Generated by the protocol buffer compiler. DO NOT EDIT! 6 | # source: modelresponse.proto 7 | """Generated protocol buffer code.""" 8 | from google.protobuf import descriptor as _descriptor 9 | from google.protobuf import descriptor_pool as _descriptor_pool 10 | from google.protobuf import symbol_database as _symbol_database 11 | from google.protobuf.internal import builder as _builder 12 | # @@protoc_insertion_point(imports) 13 | 14 | _sym_db = _symbol_database.Default() 15 | 16 | from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 17 | 18 | DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( 19 | b'\n\x13modelresponse.proto\x12\rmodelresponse\x1a\x1bgoogle/protobuf/empty.proto\"\x88\x01\n\nDictionary\x12\x35\n\x06values\x18\x01 \x03(\x0b\x32%.modelresponse.Dictionary.ValuesEntry\x1a\x43\n\x0bValuesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\x8c\x01\n\x05Value\x12\x10\n\x06svalue\x18\x01 \x01(\tH\x00\x12\x10\n\x06ivalue\x18\x02 \x01(\x03H\x00\x12\x10\n\x06\x66value\x18\x03 \x01(\x02H\x00\x12\x10\n\x06\x62value\x18\x04 \x01(\x08H\x00\x12+\n\x06mvalue\x18\x05 \x01(\x0b\x32\x19.modelresponse.DictionaryH\x00\x42\x0e\n\x0coneof_values\"\xbb\x01\n\x13SingleStringRequest\x12\x0f\n\x07request\x18\x01 \x01(\t\x12I\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x33.modelresponse.SingleStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 \x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\xb9\x01\n\x12MultiStringRequest\x12\x0f\n\x07request\x18\x01 \x03(\t\x12H\n\x0cquery_kwargs\x18\x02 \x03(\x0b\x32\x32.modelresponse.MultiStringRequest.QueryKwargsEntry\x1aH\n\x10QueryKwargsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12#\n\x05value\x18\x02 
\x01(\x0b\x32\x14.modelresponse.Value:\x02\x38\x01\"\x9f\x01\n\x15SingleGenerationReply\x12\x10\n\x08response\x18\x01 \x01(\t\x12\x15\n\rfinish_reason\x18\x02 \x01(\t\x12\x15\n\rprompt_tokens\x18\x03 \x01(\x03\x12\x18\n\x10generated_tokens\x18\x04 \x01(\x03\x12\x12\n\ntime_taken\x18\x05 \x01(\x02\x12\x18\n\x10model_time_taken\x18\x06 \x01(\x02\"N\n\x14MultiGenerationReply\x12\x36\n\x08response\x18\x01 \x03(\x0b\x32$.modelresponse.SingleGenerationReply2\x8e\x02\n\rModelResponse\x12=\n\tTerminate\x12\x16.google.protobuf.Empty\x1a\x16.google.protobuf.Empty\"\x00\x12Z\n\x0eGeneratorReply\x12!.modelresponse.MultiStringRequest\x1a#.modelresponse.MultiGenerationReply\"\x00\x12\x62\n\x14GeneratorReplyStream\x12!.modelresponse.MultiStringRequest\x1a#.modelresponse.MultiGenerationReply\"\x00\x30\x01\x62\x06proto3' 20 | ) 21 | 22 | _globals = globals() 23 | _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) 24 | _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'modelresponse_pb2', _globals) 25 | if _descriptor._USE_C_DESCRIPTORS == False: 26 | DESCRIPTOR._options = None 27 | _DICTIONARY_VALUESENTRY._options = None 28 | _DICTIONARY_VALUESENTRY._serialized_options = b'8\001' 29 | _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._options = None 30 | _SINGLESTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' 31 | _MULTISTRINGREQUEST_QUERYKWARGSENTRY._options = None 32 | _MULTISTRINGREQUEST_QUERYKWARGSENTRY._serialized_options = b'8\001' 33 | _globals['_DICTIONARY']._serialized_start = 68 34 | _globals['_DICTIONARY']._serialized_end = 204 35 | _globals['_DICTIONARY_VALUESENTRY']._serialized_start = 137 36 | _globals['_DICTIONARY_VALUESENTRY']._serialized_end = 204 37 | _globals['_VALUE']._serialized_start = 207 38 | _globals['_VALUE']._serialized_end = 347 39 | _globals['_SINGLESTRINGREQUEST']._serialized_start = 350 40 | _globals['_SINGLESTRINGREQUEST']._serialized_end = 537 41 | _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 465 42 | _globals['_SINGLESTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 537 43 | _globals['_MULTISTRINGREQUEST']._serialized_start = 540 44 | _globals['_MULTISTRINGREQUEST']._serialized_end = 725 45 | _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_start = 465 46 | _globals['_MULTISTRINGREQUEST_QUERYKWARGSENTRY']._serialized_end = 537 47 | _globals['_SINGLEGENERATIONREPLY']._serialized_start = 728 48 | _globals['_SINGLEGENERATIONREPLY']._serialized_end = 887 49 | _globals['_MULTIGENERATIONREPLY']._serialized_start = 889 50 | _globals['_MULTIGENERATIONREPLY']._serialized_end = 967 51 | _globals['_MODELRESPONSE']._serialized_start = 970 52 | _globals['_MODELRESPONSE']._serialized_end = 1240 53 | # @@protoc_insertion_point(module_scope) 54 | -------------------------------------------------------------------------------- /mii/grpc_related/restful_gateway.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import asyncio 6 | import time 7 | import threading 8 | 9 | from flask import Flask, request, jsonify 10 | from flask_restful import Resource, Api 11 | from werkzeug.serving import make_server 12 | 13 | import mii 14 | from mii.constants import RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT, RESTFUL_API_PATH 15 | 16 | 17 | def shutdown(thread): 18 | time.sleep(RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT) 19 | thread.server.shutdown() 20 | 21 | 22 | def createRestfulGatewayApp(deployment_name, server_thread): 23 | class RestfulGatewayService(Resource): 24 | def __init__(self): 25 | super().__init__() 26 | loop = asyncio.new_event_loop() 27 | asyncio.set_event_loop(loop) 28 | self.client = mii.client(deployment_name) 29 | 30 | def post(self): 31 | data = request.get_json() 32 | result = self.client.generate(**data) 33 | return jsonify([r.to_msg_dict() for r in result]) 34 | 35 | app = Flask("RestfulGateway") 36 | 37 | @app.route("/terminate", methods=["GET"]) 38 | def terminate(): 39 | # Need to shutdown *after* completing the request 40 | threading.Thread(target=shutdown, args=(server_thread, )).start() 41 | return "Shutting down RESTful API gateway server" 42 | 43 | @app.route("/healthz", methods=["GET"]) 44 | def healthz(): 45 | return "ok" 46 | 47 | api = Api(app) 48 | path = "/{}/{}".format(RESTFUL_API_PATH, deployment_name) 49 | api.add_resource(RestfulGatewayService, path) 50 | 51 | return app 52 | 53 | 54 | class RestfulGatewayThread(threading.Thread): 55 | def __init__(self, deployment_name, rest_host, rest_port, rest_procs): 56 | threading.Thread.__init__(self) 57 | 58 | app = createRestfulGatewayApp(deployment_name, self) 59 | self.server = make_server(rest_host, 60 | rest_port, 61 | app, 62 | threaded=False, 63 | processes=rest_procs) 64 | self.ctx = app.app_context() 65 | self.ctx.push() 66 | 67 | self._stop_event = threading.Event() 68 | 69 | def run(self): 70 | self.server.serve_forever() 71 | self._stop_event.set() 72 | 73 | def get_stop_event(self): 74 | return self._stop_event 75 | -------------------------------------------------------------------------------- /mii/grpc_related/task_methods.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import Any, Dict, List, Tuple 8 | 9 | from google.protobuf.message import Message 10 | 11 | from mii.batching.data_classes import Response 12 | from mii.constants import TaskType 13 | from mii.grpc_related.proto import modelresponse_pb2 14 | from mii.utils import kwarg_dict_to_proto, unpack_proto_query_kwargs 15 | 16 | 17 | def single_string_request_to_proto(self, request_dict, **query_kwargs): 18 | return modelresponse_pb2.SingleStringRequest( 19 | request=request_dict["query"], 20 | query_kwargs=kwarg_dict_to_proto(query_kwargs)) 21 | 22 | 23 | def single_string_response_to_proto(self, response, time_taken, model_time_taken): 24 | return modelresponse_pb2.SingleStringReply(response=f"{response}", 25 | time_taken=time_taken, 26 | model_time_taken=model_time_taken) 27 | 28 | 29 | class TaskMethods(ABC): 30 | @property 31 | @abstractmethod 32 | def method(self): 33 | ... 34 | 35 | @abstractmethod 36 | def pack_request_to_proto(self, request, **query_kwargs): 37 | ... 38 | 39 | @abstractmethod 40 | def unpack_request_from_proto(self, proto_request): 41 | ... 
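    # Note: each pack_*/unpack_* pair defined by this interface is expected to
    # stay symmetric, so a request or response serialized on one side of the
    # gRPC channel can be reconstructed unchanged on the other side.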
42 | 43 | @abstractmethod 44 | def pack_response_to_proto(self, response): 45 | ... 46 | 47 | @abstractmethod 48 | def unpack_response_from_proto(self, proto_response): 49 | ... 50 | 51 | 52 | class TextGenerationMethods(TaskMethods): 53 | @property 54 | def method(self): 55 | return "GeneratorReply" 56 | 57 | @property 58 | def method_stream_out(self): 59 | return "GeneratorReplyStream" 60 | 61 | def pack_request_to_proto(self, 62 | prompts: List[str], 63 | **query_kwargs: Dict[str, 64 | Any]) -> Message: 65 | proto_request = modelresponse_pb2.MultiStringRequest( 66 | request=prompts, 67 | query_kwargs=kwarg_dict_to_proto(query_kwargs), 68 | ) 69 | return proto_request 70 | 71 | def unpack_request_from_proto(self, 72 | proto_request: Message) -> Tuple[List[str], 73 | Dict[str, 74 | Any]]: 75 | prompts = [r for r in proto_request.request] 76 | kwargs = unpack_proto_query_kwargs(proto_request.query_kwargs) 77 | return prompts, kwargs 78 | 79 | def pack_response_to_proto(self, responses: List[Response]) -> Message: 80 | proto_responses = [] 81 | for r in responses: 82 | proto_responses.append( 83 | modelresponse_pb2.SingleGenerationReply( 84 | response=r.generated_text, 85 | finish_reason=str(r.finish_reason.value), 86 | prompt_tokens=r.prompt_length, 87 | generated_tokens=r.generated_length, 88 | time_taken=-1, 89 | model_time_taken=-1, 90 | )) 91 | 92 | return modelresponse_pb2.MultiGenerationReply(response=proto_responses, ) 93 | 94 | def unpack_response_from_proto(self, response: Message) -> List[Response]: 95 | response_batch = [] 96 | for r in response.response: 97 | response_batch.append( 98 | Response( 99 | generated_text=r.response, 100 | prompt_length=r.prompt_tokens, 101 | generated_length=r.generated_tokens, 102 | finish_reason=r.finish_reason, 103 | )) 104 | return response_batch 105 | 106 | 107 | TASK_METHODS_DICT = { 108 | TaskType.TEXT_GENERATION: TextGenerationMethods(), 109 | } 110 | -------------------------------------------------------------------------------- /mii/launch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/launch/multi_gpu_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import argparse 6 | import base64 7 | import json 8 | import os 9 | 10 | from mii.config import ModelConfig 11 | from mii.grpc_related.modelresponse_server import serve_inference, serve_load_balancing 12 | from mii.grpc_related.restful_gateway import RestfulGatewayThread 13 | from mii.api import async_pipeline 14 | 15 | 16 | def b64_encoded_config(config_str: str) -> ModelConfig: 17 | # str -> bytes 18 | b64_bytes = config_str.encode() 19 | # decode b64 bytes -> json bytes 20 | config_bytes = base64.urlsafe_b64decode(b64_bytes) 21 | # convert json bytes -> str -> dict 22 | config_dict = json.loads(config_bytes.decode()) 23 | # return mii.ModelConfig object 24 | return ModelConfig(**config_dict) 25 | 26 | 27 | def main() -> None: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--deployment-name", type=str, help="Name of deployment") 30 | parser.add_argument( 31 | "--model-config", 32 | type=b64_encoded_config, 33 | help="base64 encoded model config", 34 | ) 35 | parser.add_argument( 36 | "--server-port", 37 | type=int, 38 | default=0, 39 | help="Port to user for DeepSpeed inference server.", 40 | ) 41 | parser.add_argument("--zmq-port", type=int, default=0, help="Port to use for ZMQ.") 42 | parser.add_argument("--load-balancer", 43 | action="store_true", 44 | help="Launch load balancer process.") 45 | parser.add_argument( 46 | "--load-balancer-port", 47 | type=int, 48 | default=0, 49 | help="Port to use for load balancer.", 50 | ) 51 | parser.add_argument( 52 | "--restful-gateway", 53 | action="store_true", 54 | help="Launches restful gateway process.", 55 | ) 56 | parser.add_argument( 57 | "--restful-gateway-port", 58 | type=int, 59 | default=0, 60 | help="Port to use for restful gateway.", 61 | ) 62 | parser.add_argument("--restful-gateway-host", 63 | type=str, 64 | default="localhost", 65 | help="Host to use for restful gateway.") 66 | parser.add_argument("--restful-gateway-procs", 67 | type=int, 68 | default=32, 69 | help="Number of processes to use for restful gateway.") 70 | args = parser.parse_args() 71 | assert not ( 72 | args.load_balancer and args.restful_gateway 73 | ), "Select only load-balancer OR restful-gateway." 74 | 75 | if args.restful_gateway: 76 | assert args.restful_gateway_port, "--restful-gateway-port must be provided." 77 | print(f"Starting RESTful API gateway on port: {args.restful_gateway_port}") 78 | gateway_thread = RestfulGatewayThread( 79 | deployment_name=args.deployment_name, 80 | rest_host=args.restful_gateway_host, 81 | rest_port=args.restful_gateway_port, 82 | rest_procs=args.restful_gateway_procs, 83 | ) 84 | stop_event = gateway_thread.get_stop_event() 85 | gateway_thread.start() 86 | stop_event.wait() 87 | 88 | elif args.load_balancer: 89 | assert args.load_balancer_port, "--load-balancer-port must be provided." 90 | print(f"Starting load balancer on port: {args.load_balancer_port}") 91 | serve_load_balancing(args.model_config, args.load_balancer_port) 92 | 93 | else: 94 | assert args.server_port, "--server-port must be provided." 
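        # Each inference rank derives its own gRPC port by offsetting the base
        # --server-port with LOCAL_RANK (computed below), so ranks launched on
        # the same host each listen on a distinct port.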
95 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 96 | port = args.server_port + local_rank 97 | args.model_config.zmq_port_number = args.zmq_port 98 | inference_pipeline = async_pipeline(args.model_config) 99 | print(f"Starting server on port: {port}") 100 | serve_inference(inference_pipeline, port) 101 | 102 | 103 | if __name__ == "__main__": 104 | # python -m mii.launch.multi_gpu_server 105 | main() 106 | -------------------------------------------------------------------------------- /mii/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import grpc 6 | from .server import MIIServer 7 | from .client import MIIClient, mii_query_handle 8 | from .deployment import deploy 9 | from .terminate import terminate 10 | from .constants import DeploymentType, TaskType 11 | from .aml_related.utils import aml_output_path 12 | from .config import MIIConfig, ModelConfig 13 | from .utils import get_supported_models 14 | from .grpc_related.proto import legacymodelresponse_pb2_grpc as modelresponse_pb2_grpc 15 | 16 | __version__ = "0.0.0" 17 | non_persistent_models = {} 18 | try: 19 | from .version import __version__ 20 | except ImportError: 21 | pass 22 | -------------------------------------------------------------------------------- /mii/legacy/aml_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .templates import * 6 | from .utils import get_acr_name, generate_aml_scripts, aml_output_path 7 | -------------------------------------------------------------------------------- /mii/legacy/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from enum import Enum 6 | 7 | 8 | class DeploymentType(str, Enum): 9 | LOCAL = "local" 10 | AML = "aml" 11 | NON_PERSISTENT = "non-persistent" 12 | 13 | 14 | class TaskType(str, Enum): 15 | TEXT_GENERATION = "text-generation" 16 | TEXT_CLASSIFICATION = "text-classification" 17 | QUESTION_ANSWERING = "question-answering" 18 | FILL_MASK = "fill-mask" 19 | TOKEN_CLASSIFICATION = "token-classification" 20 | TEXT2IMG = "text-to-image" 21 | ZERO_SHOT_IMAGE_CLASSIFICATION = "zero-shot-image-classification" 22 | INPAINTING = "text-to-image-inpainting" 23 | 24 | 25 | class ModelProvider(str, Enum): 26 | HUGGING_FACE = "hugging-face" 27 | ELEUTHER_AI = "eleuther-ai" 28 | DIFFUSERS = "diffusers" 29 | 30 | 31 | SUPPORTED_MODEL_TYPES = { 32 | 'roberta': ModelProvider.HUGGING_FACE, 33 | 'xlm-roberta': ModelProvider.HUGGING_FACE, 34 | 'gpt2': ModelProvider.HUGGING_FACE, 35 | 'distilbert': ModelProvider.HUGGING_FACE, 36 | 'bert': ModelProvider.HUGGING_FACE, 37 | 'gpt_neo': ModelProvider.HUGGING_FACE, 38 | 'gptj': ModelProvider.HUGGING_FACE, 39 | 'opt': ModelProvider.HUGGING_FACE, 40 | 'bloom': ModelProvider.HUGGING_FACE, 41 | 'gpt-neox': ModelProvider.ELEUTHER_AI, 42 | 'stable-diffusion': ModelProvider.DIFFUSERS, 43 | 'llama': ModelProvider.HUGGING_FACE, 44 | 'clip': ModelProvider.HUGGING_FACE 45 | } 46 | 47 | REQUIRED_KEYS_PER_TASK = { 48 | TaskType.TEXT_GENERATION: ["query"], 49 | TaskType.TEXT_CLASSIFICATION: ["query"], 50 | TaskType.QUESTION_ANSWERING: ["context", 51 | "question"], 52 | TaskType.FILL_MASK: ["query"], 53 | TaskType.TOKEN_CLASSIFICATION: ["query"], 54 | TaskType.TEXT2IMG: ["prompt"], 55 | TaskType.ZERO_SHOT_IMAGE_CLASSIFICATION: ["image", 56 | "candidate_labels"], 57 | TaskType.INPAINTING: [ 58 | "prompt", 59 | "image", 60 | "mask_image", 61 | ] 62 | } 63 | 64 | MII_CACHE_PATH = "MII_CACHE_PATH" 65 | MII_CACHE_PATH_DEFAULT = "/tmp/mii_cache" 66 | 67 | MII_HF_CACHE_EXPIRATION = "MII_HF_CACHE_EXPIRATION" 68 | MII_HF_CACHE_EXPIRATION_DEFAULT = 60 * 60 # 1 hour 69 | 70 | MII_DEBUG_MODE = "MII_DEBUG_MODE" 71 | MII_DEBUG_MODE_DEFAULT = "0" 72 | 73 | MII_DEBUG_DEPLOY_KEY = "MII_DEBUG_DEPLOY_KEY" 74 | 75 | MII_DEBUG_BRANCH = "MII_DEBUG_BRANCH" 76 | MII_DEBUG_BRANCH_DEFAULT = "main" 77 | 78 | MII_MODEL_PATH_DEFAULT = "/tmp/mii_models" 79 | 80 | GRPC_MAX_MSG_SIZE = 2**27 # ~100MB 81 | 82 | TERMINATE_METHOD = "Terminate" 83 | CREATE_SESSION_METHOD = "CreateSession" 84 | DESTROY_SESSION_METHOD = "DestroySession" 85 | 86 | LB_MAX_WORKER_THREADS = 32 87 | 88 | SERVER_SHUTDOWN_TIMEOUT = 10 89 | 90 | RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT = 1 91 | RESTFUL_API_PATH = "mii" 92 | -------------------------------------------------------------------------------- /mii/legacy/deployment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii.legacy as mii 7 | 8 | from .logging import logger 9 | from .models.score import create_score_file 10 | from .models import load_models 11 | from .config import MIIConfig, DeploymentType 12 | 13 | 14 | def support_legacy_api( 15 | task, 16 | model, 17 | deployment_type=DeploymentType.LOCAL, 18 | model_path="", 19 | enable_deepspeed=True, 20 | enable_zero=False, 21 | ds_config=None, 22 | mii_config=None, 23 | version=1, 24 | ): 25 | if ds_config is None: 26 | ds_config = {} 27 | if mii_config is None: 28 | mii_config = {} 29 | 30 | model_config = { 31 | "task": task, 32 | "model": model, 33 | "model_path": model_path, 34 | "enable_deepspeed": enable_deepspeed, 35 | "enable_zero": enable_zero, 36 | "ds_config": ds_config, 37 | } 38 | # TODO do this in a single for loop 39 | for key, val in mii_config.items(): 40 | if key not in MIIConfig.model_fields.keys(): 41 | model_config[key] = val 42 | mii_config = { 43 | k: v 44 | for k, 45 | v in mii_config.items() if k in MIIConfig.model_fields.keys() 46 | } 47 | mii_config["version"] = version 48 | mii_config["deployment_type"] = deployment_type 49 | 50 | return model_config, mii_config 51 | 52 | 53 | def deploy( 54 | deployment_name: str, 55 | model_config: dict = None, 56 | mii_config: dict = None, 57 | *args, 58 | **kwargs, 59 | ): 60 | if mii_config is None: 61 | mii_config = {} 62 | 63 | if args or kwargs: 64 | assert ( 65 | not model_config 66 | ), "We do not support mixture of legacy and new API options, use latest API." 67 | kwargs["mii_config"] = mii_config 68 | model_config, mii_config = support_legacy_api(*args, **kwargs) 69 | 70 | mii_config["deployment_name"] = deployment_name 71 | mii_config["model_conf"] = model_config 72 | mii_config = mii.config.MIIConfig(**mii_config) 73 | 74 | if mii_config.model_conf.enable_deepspeed: 75 | logger.info( 76 | "************* MII is using DeepSpeed Optimizations to accelerate your model *************" 77 | ) 78 | else: 79 | logger.info( 80 | "************* DeepSpeed Optimizations not enabled. 
Please use enable_deepspeed to get better performance *************" 81 | ) 82 | 83 | if mii_config.deployment_type != DeploymentType.NON_PERSISTENT: 84 | create_score_file(mii_config) 85 | 86 | if mii_config.deployment_type == DeploymentType.AML: 87 | _deploy_aml(mii_config) 88 | elif mii_config.deployment_type == DeploymentType.LOCAL: 89 | _deploy_local(mii_config) 90 | elif mii_config.deployment_type == DeploymentType.NON_PERSISTENT: 91 | _deploy_nonpersistent(mii_config) 92 | 93 | 94 | def _deploy_local(mii_config): 95 | mii.utils.import_score_file(mii_config.deployment_name, DeploymentType.LOCAL).init() 96 | 97 | 98 | def _deploy_aml(mii_config): 99 | acr_name = mii.aml_related.utils.get_acr_name() 100 | mii.aml_related.utils.generate_aml_scripts( 101 | acr_name=acr_name, 102 | deployment_name=mii_config.deployment_name, 103 | model_name=mii_config.model_conf.model, 104 | task_name=mii_config.model_conf.task, 105 | replica_num=mii_config.model_conf.replica_num, 106 | instance_type=mii_config.instance_type, 107 | version=mii_config.version, 108 | ) 109 | print( 110 | f"AML deployment assets at {mii.aml_related.utils.aml_output_path(mii_config.deployment_name)}" 111 | ) 112 | print("Please run 'deploy.sh' to bring your deployment online") 113 | 114 | 115 | def _deploy_nonpersistent(mii_config): 116 | assert ( 117 | int(os.getenv("WORLD_SIZE", "1")) 118 | == mii_config.model_conf.tensor_parallel 119 | ), "World Size does not equal number of tensors. When using non-persistent deployment type, please launch with `deepspeed --num_gpus `" 120 | deployment_name = mii_config.deployment_name 121 | mii.non_persistent_models[deployment_name] = ( 122 | load_models(mii_config.model_conf), 123 | mii_config.model_conf.task, 124 | ) 125 | -------------------------------------------------------------------------------- /mii/legacy/docs/CNAME: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/CNAME -------------------------------------------------------------------------------- /mii/legacy/docs/GPT-NeoX.md: -------------------------------------------------------------------------------- 1 | # GPT-NeoX with MII 2 | In this document, we provide the steps to setup MII for doing a local deployment of the [GPT-NeoX model](https://github.com/EleutherAI/gpt-neox). 3 | 4 | ## Setup Environment 5 | We recommend using a conda environment or virtual environment for installing all dependencies: 6 | ```bash 7 | # conda 8 | conda create --name MII-GPT-NeoX 9 | conda activate MII-GPT-NeoX 10 | # python virtualenv 11 | python3 -m venv MII-GPT-NeoX 12 | source ./MII-GPT-NeoX/bin/activate 13 | ``` 14 | --- 15 | 📌 **Note:** You should use Python3 <= 3.8. We recommend Python 3.8 16 | 17 | --- 18 | 19 | ## Install MII 20 | ```bash 21 | git clone https://github.com/deepspeedai/DeepSpeed-MII.git 22 | cd DeepSpeed-MII 23 | pip install .[local] 24 | pip install . 25 | ``` 26 | 27 | ## Install DeepSpeed-GPT-NeoX 28 | ```bash 29 | git clone -b ds-updates https://github.com/deepspeedai/DeepSpeed-gpt-neox.git 30 | cd deepspeed-gpt-neox 31 | pip install -r requirements/requirements-inference.txt 32 | pip install . 33 | python ./megatron/fused_kernels/setup.py install 34 | cd .. 
35 | ``` 36 | 37 | ## Download Checkpoint 38 | You can download the checkpoint file with: 39 | ```bash 40 | wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P 20B_checkpoints 41 | ``` 42 | or you can download with your favorite bittorrent client: [slim_weights.torrent](https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights.torrent) 43 | 44 | Remember the location where you save the checkpoint directory and we will refer to this location as `{CKPT_DIR}` 45 | 46 | --- 47 | 📌 **Note:** The checkpoint file is nearly 40GB in size and may take a long time to download 48 | 49 | --- 50 | 51 | ## Run GPT-NeoX with MII 52 | Modify the example file `examples/local/text-generation-neox-example.py`: 53 | - Change the `tensor_parallel` value in the `mii_config` dict to the number of GPUs on your system 54 | - Change the `local_model_path` in `mii.deploy()` call to `{CKPT_DIR}` 55 | 56 | To run the example: 57 | - Start the server with `python3 examples/local/text-generation-neox-example.py` 58 | - Wait for the server to initialize 59 | - Run a query with `python3 examples/local/text-generation-query-example.py` 60 | -------------------------------------------------------------------------------- /mii/legacy/docs/images/azure-cost.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/azure-cost.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/bert.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/bloom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/bloom.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/gpt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/gpt.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/hero-dark.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/hero-transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/hero-transparent.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/hero.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/hero.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/llm-latency-sd-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/llm-latency-sd-latency.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/mii-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/mii-arch.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/multi-gpu-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/multi-gpu-latency.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/opt-bloom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/opt-bloom.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/opt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/opt.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/roberta.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/roberta.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/sd-hero-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/sd-hero-dark.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/sd-hero-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/sd-hero-light.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/sd-latency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/sd-latency.png -------------------------------------------------------------------------------- /mii/legacy/docs/images/tput-llms.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deepspeedai/DeepSpeed-MII/4e99f2c7978ff5bca153dd97e7c6776d4766e0ee/mii/legacy/docs/images/tput-llms.png -------------------------------------------------------------------------------- /mii/legacy/examples/aml/fill-mask-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | name = "bert-base-uncased" 8 | print(f"Deploying {name}...") 9 | 10 | mii.deploy(task='fill-mask', 11 | model=name, 12 | deployment_name=name + "-deployment", 13 | deployment_type=mii.constants.DeploymentType.AML) 14 | -------------------------------------------------------------------------------- /mii/legacy/examples/aml/text-generation-bloom.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = { 8 | "dtype": "fp16", 9 | "tensor_parallel": 8, 10 | "meta_tensor": True, 11 | } 12 | name = "microsoft/bloom-deepspeed-inference-fp16" 13 | 14 | mii.deploy(task='text-generation', 15 | model=name, 16 | deployment_name="bloom-deployment", 17 | deployment_type=mii.constants.DeploymentType.AML, 18 | mii_config=mii_configs) 19 | -------------------------------------------------------------------------------- /mii/legacy/examples/aml/text-generation-bloom560m-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = { 8 | "tensor_parallel": 1, 9 | "dtype": "fp16", 10 | "aml_model_path": "models/bloom-560m" 11 | } 12 | mii.deploy(task='text-generation', 13 | model="bigscience/bloom-560m", 14 | deployment_name="bloom560m-deployment", 15 | deployment_type=mii.constants.DeploymentType.AML, 16 | mii_config=mii_configs) 17 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/baseline-sd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import diffusers 8 | from utils import benchmark 9 | 10 | # Get HF auth key from environment or replace with key 11 | hf_auth_key = os.environ["HF_AUTH_TOKEN"] 12 | 13 | trials = 10 14 | batch_size = 1 15 | save_path = "." 
16 | 17 | # Setup the stable diffusion pipeline via the diffusers pipeline api 18 | pipe = diffusers.StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", 19 | use_auth_token=hf_auth_key, 20 | torch_dtype=torch.float16, 21 | revision="fp16").to("cuda") 22 | 23 | # Create batch size number of prompts 24 | prompts = ["a photo of an astronaut riding a horse on mars"] * batch_size 25 | 26 | # Example usage of diffusers pipeline 27 | results = pipe(prompts) 28 | for idx, img in enumerate(results.images): 29 | img.save(os.path.join(save_path, f"baseline-img{idx}.png")) 30 | 31 | # Evaluate performance of pipeline 32 | benchmark(pipe, prompts, save_path, trials, "baseline") 33 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/mii-sd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii 7 | from utils import benchmark 8 | 9 | # Get HF auth key from environment or replace with key 10 | hf_auth_key = os.environ["HF_AUTH_TOKEN"] 11 | 12 | trials = 10 13 | batch_size = 1 14 | save_path = "." 15 | deploy_name = "sd_deploy" 16 | 17 | # Deploy Stable Diffusion w. MII 18 | mii_config = {"dtype": "fp16", "hf_auth_token": hf_auth_key} 19 | mii.deploy(task='text-to-image', 20 | model="CompVis/stable-diffusion-v1-4", 21 | deployment_name=deploy_name, 22 | mii_config=mii_config) 23 | 24 | # Example usage of MII deployment 25 | pipe = mii.mii_query_handle(deploy_name) 26 | prompts = {"query": ["a photo of an astronaut riding a horse on mars"] * batch_size} 27 | results = pipe.query(prompts) 28 | for idx, img in enumerate(results.images): 29 | img.save(os.path.join(save_path, f"mii-img{idx}.png")) 30 | 31 | # Evaluate performance of MII 32 | benchmark(pipe.query, prompts, save_path, trials, "mii") 33 | 34 | # Tear down the persistent deployment 35 | mii.terminate(deploy_name) 36 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/requirements.txt: -------------------------------------------------------------------------------- 1 | deepspeed>=0.7.4 2 | deepspeed-mii>=0.0.3 3 | diffusers>=0.6.0 4 | -------------------------------------------------------------------------------- /mii/legacy/examples/benchmark/txt2img/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | import time 8 | import deepspeed 9 | import mii 10 | import numpy 11 | import diffusers 12 | import transformers 13 | 14 | from packaging import version 15 | 16 | assert version.parse(diffusers.__version__) >= version.parse('0.7.1'), "diffusers must be 0.7.1+" 17 | assert version.parse(mii.__version__) >= version.parse("0.0.3"), "mii must be 0.0.3+" 18 | assert version.parse(deepspeed.__version__) >= version.parse("0.7.5"), "deepspeed must be 0.7.5+" 19 | assert version.parse(transformers.__version__) >= version.parse("4.24.0"), "transformers must be 4.24.0+" 20 | 21 | 22 | def benchmark(func, inputs, save_path=".", trials=5, tag="", save=True): 23 | # Turn off the tqdm progress bar 24 | if hasattr(func, "set_progress_bar_config"): 25 | func.set_progress_bar_config(disable=True) 26 | 27 | durations = [] 28 | for trial in range(trials): 29 | torch.cuda.synchronize() 30 | start = time.perf_counter() 31 | with torch.inference_mode(): 32 | results = func(inputs) 33 | torch.cuda.synchronize() 34 | duration = time.perf_counter() - start 35 | durations.append(duration) 36 | print(f"trial={trial}, time_taken={duration:.4f}") 37 | if save: 38 | for idx, img in enumerate(results.images): 39 | img.save(os.path.join(save_path, f"{tag}-trial{trial}-img{idx}.png")) 40 | print(f"median duration: {numpy.median(durations):.4f}") 41 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/chat/README.md: -------------------------------------------------------------------------------- 1 | # Multi-turn Conversation Example for Chat Applications 2 | 3 | MII can manage multi-turn conversations, enabling users to easily create their own chat applications. 4 | The scripts in this folder provide a complete example of a multi-turn conversation scenario. 5 | 6 | ## Starting the server 7 | 8 | Starting the server for your chat application requires nothing special. 9 | Just make sure that the model supports `text-generation` and is trained for conversations. 10 | 11 | The example script uses [AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed), which was trained using [DeepSpeed-Chat](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md). 12 | 13 | ```python 14 | name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" 15 | ... 16 | mii.deploy(task='text-generation', model=name, deployment_name="chat_example_deployment") 17 | ``` 18 | 19 | ## Running multi-turn conversation 20 | 21 | The client creates a *session* so that MII can keep track of the conversation. 22 | `create_session` creates a new session with a given name. 23 | 24 | ```python 25 | # You can set a session name 26 | session_id = "chat_example_session" 27 | # You need to call `create_session` before you start a multi-turn conversation session 28 | generator.create_session(session_id) 29 | ``` 30 | 31 | The session ID is given as a keyword argument as shown below. 32 | Besides this, you can call the `query` function just as you would in normal MII inference. 33 | Note that the prompt may need to be designed according to your model. 34 | 35 | ```python 36 | query_template = "Human: {}\n Assistant: " 37 | print("# Start a conversation session.
Type 'q' to exit.") 38 | 39 | while True: 40 | user_input = input("You: ") 41 | if user_input == "q": 42 | break 43 | result = generator.query({"query": query_template.format(user_input)}, 44 | session_id=session_id, 45 | max_new_tokens=128) 46 | print(f"Bot: {result.response[0].replace('', '')}") 47 | ``` 48 | 49 | Once the user finishes the session, you need to call `destroy_session` to free the internal data for the session. 50 | 51 | ```python 52 | generator.destroy_session(session_id) 53 | ``` 54 | 55 | The following is an example conversation: 56 | ```bash 57 | $ python chat-client-example.py 58 | # Start a conversation session. Type 'q' to exit. 59 | You: Can you tell me about deep learning? 60 | Bot: Yes, it is a type of artificial intelligence that learns from data. It can process large amounts of data quickly and accurately, and it can develop sophisticated models to analyze data. Deep learning techniques are being applied to a wide variety of tasks, including image recognition, speech recognition, recommendation systems, and self-driving cars. 61 | You: I want to try it. 62 | Bot: Yes, it is a very powerful technology. It has the potential to revolutionize many fields, including artificial intelligence, data analysis, and machine learning. 63 | You: Is it hard to learn? 64 | Bot: Yes, it is a challenging field of study. It requires a lot of training and practice to develop effective models. It also requires a high level of computational power and data storage. 65 | You: Where can I start? 66 | Bot: Yes, it is a good idea to start with a basic model. It will help you develop the necessary skills and knowledge to progress to more advanced models.. 67 | ``` 68 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/chat/chat-client-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # Run `chat-server-example.py` before running this script 8 | generator = mii.mii_query_handle("chat_example_deployment") 9 | 10 | # You can set a session name 11 | session_id = "chat_example_session" 12 | # You need to call `create_session` before you start a multi-turn conversation session 13 | generator.create_session(session_id) 14 | 15 | print("# Start a conversation session. Type 'q' to exit.") 16 | query_template = "Human: {}\n Assistant: " 17 | while True: 18 | user_input = input("You: ") 19 | if user_input == "q": 20 | break 21 | 22 | # A session ID is given as a keyword argument 23 | result = generator.query({"query": query_template.format(user_input)}, 24 | session_id=session_id, 25 | max_new_tokens=128) 26 | print(f"Bot: {result.response[0].replace('', '').strip()}") 27 | 28 | # You need to destroy the session after finishing the conversation 29 | generator.destroy_session(session_id) 30 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/chat/chat-server-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {'tensor_parallel': 1} 8 | 9 | # This checkpoint is created using DeepSpeed-Chat 10 | # https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/README.md 11 | name = "AdamG012/chat-opt-1.3b-rlhf-actor-deepspeed" 12 | 13 | print(f"Deploying {name}...") 14 | 15 | # Deploy as "text-generation" task 16 | mii.deploy(task='text-generation', model=name, deployment_name="chat_example_deployment") 17 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/conversational-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {'tensor_parallel': 1} 8 | 9 | # gpt2 10 | name = "microsoft/DialoGPT-large" 11 | 12 | print(f"Deploying {name}...") 13 | 14 | mii.deploy(task='text-generation', model=name, deployment_name=name + "_deployment") 15 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/conversational-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # gpt2 8 | name = "microsoft/DialoGPT-large" 9 | 10 | print(f"Querying {name}...") 11 | 12 | conv_id = 0 13 | text = "DeepSpeed is the greatest" 14 | 15 | generator = mii.mii_query_handle(name + "_deployment") 16 | result = generator.query({ 17 | 'text': text, 18 | 'conversation_id': conv_id, 19 | 'past_user_inputs': [], 20 | 'generated_responses': [] 21 | }) 22 | 23 | print(result) 24 | print(f"time_taken: {result.time_taken}") 25 | 26 | text = "How is DeepSpeed?" 27 | result = generator.query({ 28 | 'text': text, 29 | 'conversation_id': result.conversation_id, 30 | 'past_user_inputs': result.past_user_inputs, 31 | 'generated_responses': result.generated_responses 32 | }) 33 | 34 | print(result) 35 | print("time_taken:", result.time_taken) 36 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/fill-mask-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("-q", "--query", action="store_true", help="query") 10 | args = parser.parse_args() 11 | 12 | name = "bert-base-uncased" 13 | mask = "[MASK]" 14 | 15 | if not args.query: 16 | print(f"Deploying {name}...") 17 | mii.deploy(task='fill-mask', model=name, deployment_name=name + "_deployment") 18 | else: 19 | print(f"Querying {name}...") 20 | generator = mii.mii_query_handle(name + "_deployment") 21 | result = generator.query({'query': f"Hello I'm a {mask} model."}) 22 | print(result.response) 23 | print("time_taken:", result.time_taken) 24 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/question-answering-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_config = {'tensor_parallel': 1, 'port_number': 50050} 8 | 9 | name = "deepset/roberta-large-squad2" 10 | mii.deploy(task="question-answering", 11 | model=name, 12 | deployment_name=name + "-qa-deployment", 13 | mii_config=mii_config) 14 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/question-answering-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | name = "deepset/roberta-large-squad2" 8 | 9 | generator = mii.mii_query_handle(name + "-qa-deployment") 10 | results = generator.query({ 11 | 'question': "What is the greatest?", 12 | 'context': "DeepSpeed is the greatest" 13 | }) 14 | print(results.response) 15 | print(f"time_taken: {results.time_taken}") 16 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-classification-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # gpt2 8 | name = "microsoft/DialogRPT-human-vs-rand" 9 | 10 | # roberta 11 | name = "roberta-large-mnli" 12 | 13 | print(f"Deploying {name}...") 14 | 15 | mii.deploy(task='text-classification', model=name, deployment_name=name + "_deployment") 16 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-classification-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # gpt2 8 | name = "microsoft/DialogRPT-human-vs-rand" 9 | 10 | # roberta 11 | name = "roberta-large-mnli" 12 | 13 | print(f"Querying {name}...") 14 | 15 | generator = mii.mii_query_handle(name + "_deployment") 16 | result = generator.query({'query': "DeepSpeed is the greatest"}) 17 | print(result.response) 18 | print("time_taken:", result.time_taken) 19 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-bloom-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = { 8 | "dtype": "fp16", 9 | "tensor_parallel": 8, 10 | "port_number": 50950, 11 | "meta_tensor": True, 12 | } 13 | name = "microsoft/bloom-deepspeed-inference-fp16" 14 | 15 | mii.deploy(task='text-generation', 16 | model=name, 17 | deployment_name=name + "_deployment", 18 | model_path="/data/bloom-mp", 19 | mii_config=mii_configs) 20 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-bloom560m-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {"tensor_parallel": 1, "dtype": "fp16"} 8 | mii.deploy(task='text-generation', 9 | model="bigscience/bloom-560m", 10 | deployment_name="bloom560m_deployment", 11 | mii_config=mii_configs) 12 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-fbopt-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_config = {'dtype': 'fp16'} 8 | 9 | name = "facebook/opt-1.3b" 10 | 11 | ds_config = { 12 | "fp16": { 13 | "enabled": True 14 | }, 15 | "bf16": { 16 | "enabled": False 17 | }, 18 | "zero_optimization": { 19 | "stage": 3, 20 | "offload_param": { 21 | "device": "cpu", 22 | }, 23 | }, 24 | "train_micro_batch_size_per_gpu": 1, 25 | } 26 | 27 | mii.deploy(task='text-generation', 28 | model=name, 29 | deployment_name=name + "_deployment", 30 | model_path=".cache/models/" + name, 31 | mii_config=mii_config, 32 | enable_deepspeed=False, 33 | enable_zero=True, 34 | ds_config=ds_config) 35 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | import argparse 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--deployment', 10 | '-d', 11 | type=str, 12 | required=True, 13 | help="deployment_name set in the MII deployment") 14 | args = parser.parse_args() 15 | 16 | generator = mii.mii_query_handle(args.deployment) 17 | result = generator.query({'query': ["DeepSpeed is the", "Seattle is"]}) 18 | print(result.response) 19 | print("time_taken:", result.time_taken) 20 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/text-generation-zero-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | from transformers import AutoConfig 7 | 8 | mii_config = {"dtype": "fp16"} 9 | 10 | name = "gpt2-xl" 11 | 12 | config = AutoConfig.from_pretrained(name) 13 | model_hidden_size = config.n_embd 14 | 15 | ds_config = { 16 | "fp16": { 17 | "enabled": True 18 | }, 19 | "bf16": { 20 | "enabled": False 21 | }, 22 | "aio": { 23 | "block_size": 262144, 24 | "queue_depth": 32, 25 | "thread_count": 1, 26 | "single_submit": False, 27 | "overlap_events": True 28 | }, 29 | "zero_optimization": { 30 | "stage": 3, 31 | "offload_param": { 32 | "device": "cpu", 33 | }, 34 | "overlap_comm": True, 35 | "contiguous_gradients": True, 36 | "reduce_bucket_size": model_hidden_size * model_hidden_size, 37 | "stage3_prefetch_bucket_size": 0.1 * model_hidden_size * model_hidden_size, 38 | "stage3_max_live_parameters": 1e8, 39 | "stage3_max_reuse_distance": 1e8, 40 | "stage3_param_persistence_threshold": 10 * model_hidden_size 41 | }, 42 | "train_micro_batch_size_per_gpu": 1, 43 | } 44 | 45 | mii.deploy(task='text-generation', 46 | model=name, 47 | deployment_name=name + "_deployment", 48 | model_path=".cache/models/" + name, 49 | mii_config=mii_config, 50 | enable_deepspeed=False, 51 | enable_zero=True, 52 | ds_config=ds_config) 53 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/token-classification-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # roberta 8 | name = "Jean-Baptiste/roberta-large-ner-english" 9 | 10 | print(f"Deploying {name}...") 11 | 12 | mii.deploy(task='token-classification', model=name, deployment_name=name + "_deployment") 13 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/token-classification-query-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | # roberta 8 | name = "Jean-Baptiste/roberta-large-ner-english" 9 | 10 | print(f"Querying {name}...") 11 | 12 | generator = mii.mii_query_handle(name + "_deployment") 13 | result = generator.query({'query': "My name is jean-baptiste and I live in montreal."}) 14 | print(result.response) 15 | print("time_taken:", result.time_taken) 16 | -------------------------------------------------------------------------------- /mii/legacy/examples/local/txt2img-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-q", "--query", action="store_true", help="query") 11 | args = parser.parse_args() 12 | 13 | if not args.query: 14 | mii_configs = { 15 | "tensor_parallel": 16 | 1, 17 | "enable_cuda_graph": 18 | True, 19 | "replace_with_kernel_inject": 20 | True, 21 | "dtype": 22 | "fp16", 23 | "hf_auth_token": 24 | os.environ.get("HF_AUTH_TOKEN", 25 | "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"), 26 | "port_number": 27 | 50050 28 | } 29 | mii.deploy(task='text-to-image', 30 | model="runwayml/stable-diffusion-v1-5", 31 | deployment_name="sd_deploy", 32 | mii_config=mii_configs) 33 | print( 34 | "\nText to image model deployment complete! To use this deployment, run the following command: python txt2img-example.py --query\n" 35 | ) 36 | else: 37 | generator = mii.mii_query_handle("sd_deploy") 38 | result = generator.query({ 39 | 'query': 40 | ["a panda in space with a rainbow", 41 | "a soda can on top a snowy mountain"] 42 | }) 43 | for idx, img in enumerate(result.images): 44 | img.save(f"test-{idx}.png") 45 | -------------------------------------------------------------------------------- /mii/legacy/examples/non_persistent/text-generation-bloom560-example.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import mii 6 | 7 | mii_configs = {"tensor_parallel": 1, "dtype": "fp16"} 8 | name = "bloom560m" 9 | mii.deploy(task='text-generation', 10 | model="bigscience/bloom-560m", 11 | deployment_name=name + "_deployment", 12 | deployment_type=mii.constants.DeploymentType.NON_PERSISTENT, 13 | mii_config=mii_configs) 14 | generator = mii.mii_query_handle(name + "_deployment") 15 | result = generator.query({'query': ["DeepSpeed is the", "Seattle is"]}) 16 | print(result) 17 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/proto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/proto/build_script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | python3 -m grpc_tools.protoc -I./ --python_out=. --grpc_python_out=. ./legacymodelresponse.proto 7 | 8 | # update import to be global wrt mii 9 | sed -i 's/legacymodelresponse_pb2/mii.legacy.grpc_related.proto.legacymodelresponse_pb2/g' legacymodelresponse_pb2_grpc.py 10 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/proto/legacymodelresponse.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2015 gRPC authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto3"; 16 | 17 | /*option java_multiple_files = true; 18 | option java_package = "io.grpc.examples.helloworld"; 19 | option java_outer_classname = "HelloWorldProto"; 20 | option objc_class_prefix = "HLW";*/ 21 | 22 | import "google/protobuf/empty.proto"; 23 | 24 | package legacymodelresponse; 25 | 26 | service ModelResponse { 27 | rpc Terminate (google.protobuf.Empty) returns (google.protobuf.Empty) {} 28 | rpc CreateSession (SessionID) returns (google.protobuf.Empty) {} 29 | rpc DestroySession (SessionID) returns (google.protobuf.Empty) {} 30 | rpc GeneratorReply (MultiStringRequest) returns (MultiStringReply) {} 31 | rpc ClassificationReply (SingleStringRequest) returns (SingleStringReply) {} 32 | rpc QuestionAndAnswerReply(QARequest) returns (SingleStringReply) {} 33 | rpc FillMaskReply(SingleStringRequest) returns (SingleStringReply) {} 34 | rpc TokenClassificationReply(SingleStringRequest) returns (SingleStringReply) {} 35 | rpc Txt2ImgReply(Text2ImageRequest) returns (ImageReply) {} 36 | rpc ZeroShotImgClassificationReply (ZeroShotImgClassificationRequest) returns (SingleStringReply) {} 37 | rpc InpaintingReply(InpaintingRequest) returns (ImageReply) {} 38 | } 39 | 40 | message Value { 41 | oneof oneof_values { 42 | string svalue = 1; 43 | int64 ivalue = 2; 44 | float fvalue = 3; 45 | bool bvalue = 4; 46 | } 47 | } 48 | 49 | message SessionID { 50 | string session_id = 1; 51 | } 52 | 53 | message SingleStringRequest { 54 | string request = 1; 55 | map<string, Value> query_kwargs = 2; 56 | } 57 | 58 | message MultiStringRequest { 59 | repeated string request = 1; 60 | map<string, Value> query_kwargs = 2; 61 | } 62 | 63 | message SingleStringReply { 64 | string response = 1; 65 | float time_taken = 2; 66 | float model_time_taken = 3; 67 | } 68 | 69 | message MultiStringReply { 70 | repeated string response = 1; 71 | float time_taken = 2; 72 | float model_time_taken = 3; 73 | } 74 | 75 | message QARequest { 76 | string question = 1; 77 | string context = 2; 78 | map<string, Value> query_kwargs = 3; 79 | } 80 | 81 | message ImageReply { 82 | repeated bytes images = 1; 83 | repeated bool nsfw_content_detected = 2; 84 | string mode = 3; 85 | int64 size_w = 4; 86 | int64 size_h = 5; 87 | float time_taken = 6; 88 | } 89 | 90 | message Text2ImageRequest { 91 | repeated string prompt = 1; 92 | repeated string negative_prompt = 2; 93 | map<string, Value> query_kwargs = 3; 94 | } 95 | 96 | message ZeroShotImgClassificationRequest { 97 | string image = 1; 98 | repeated string candidate_labels = 2; 99 | map<string, Value> query_kwargs = 3; 100 | } 101 | 102 | message InpaintingRequest { 103 | repeated string prompt = 1; 104 | repeated bytes image = 2; 105 | repeated bytes mask_image = 3; 106 | repeated string negative_prompt = 4; 107 | map<string, Value> query_kwargs = 5; 108 | } 109 | -------------------------------------------------------------------------------- /mii/legacy/grpc_related/restful_gateway.py:
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import time 6 | import threading 7 | import mii.legacy as mii 8 | from flask import Flask, request 9 | from flask_restful import Resource, Api 10 | from werkzeug.serving import make_server 11 | from mii.legacy.constants import RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT, RESTFUL_API_PATH 12 | from google.protobuf.json_format import MessageToJson 13 | 14 | 15 | def shutdown(thread): 16 | time.sleep(RESTFUL_GATEWAY_SHUTDOWN_TIMEOUT) 17 | thread.server.shutdown() 18 | 19 | 20 | def createRestfulGatewayApp(deployment_name, task, lb_port, server_thread): 21 | # client must be thread-safe 22 | client = mii.MIIClient(task, "localhost", lb_port) 23 | 24 | class RestfulGatewayService(Resource): 25 | def __init__(self): 26 | super().__init__() 27 | 28 | def post(self): 29 | data = request.get_json() 30 | kwargs = data["kwargs"] if "kwargs" in data else {} 31 | result = client.query(data["request"], **kwargs) 32 | return MessageToJson(result) 33 | 34 | app = Flask("RestfulGateway") 35 | 36 | @app.route("/terminate", methods=["GET"]) 37 | def terminate(): 38 | # Need to shutdown *after* completing the request 39 | threading.Thread(target=shutdown, args=(server_thread, )).start() 40 | return "Shutting down RESTful API gateway server" 41 | 42 | api = Api(app) 43 | path = "/{}/{}".format(RESTFUL_API_PATH, deployment_name) 44 | api.add_resource(RestfulGatewayService, path) 45 | 46 | return app 47 | 48 | 49 | class RestfulGatewayThread(threading.Thread): 50 | def __init__(self, deployment_name, task, lb_port, rest_port): 51 | threading.Thread.__init__(self) 52 | 53 | app = createRestfulGatewayApp(deployment_name, task, lb_port, self) 54 | self.server = make_server("127.0.0.1", rest_port, app) 55 | self.ctx = app.app_context() 56 | self.ctx.push() 57 | 58 | self._stop_event = threading.Event() 59 | 60 | def run(self): 61 | self.server.serve_forever() 62 | self._stop_event.set() 63 | 64 | def get_stop_event(self): 65 | return self._stop_event 66 | -------------------------------------------------------------------------------- /mii/legacy/launch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/launch/multi_gpu_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import argparse 7 | import base64 8 | import json 9 | 10 | from mii.legacy.config import ModelConfig 11 | from mii.legacy.models.load_models import load_models 12 | from mii.legacy.grpc_related.modelresponse_server import serve_inference, serve_load_balancing 13 | from mii.legacy.grpc_related.restful_gateway import RestfulGatewayThread 14 | 15 | 16 | def b64_encoded_config(config_str): 17 | # str -> bytes 18 | b64_bytes = config_str.encode() 19 | # decode b64 bytes -> json bytes 20 | config_bytes = base64.urlsafe_b64decode(b64_bytes) 21 | # convert json bytes -> str -> dict 22 | config_dict = json.loads(config_bytes.decode()) 23 | # return mii.ModelConfig object 24 | return ModelConfig(**config_dict) 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--deployment-name", type=str, help="Name of deployment") 30 | parser.add_argument( 31 | "--model-config", 32 | type=b64_encoded_config, 33 | help="base64 encoded model config", 34 | ) 35 | parser.add_argument( 36 | "--server-port", 37 | type=int, 38 | default=0, 39 | help="Port to user for DeepSpeed inference server.", 40 | ) 41 | parser.add_argument("--load-balancer", 42 | action="store_true", 43 | help="Launch load balancer process.") 44 | parser.add_argument( 45 | "--load-balancer-port", 46 | type=int, 47 | default=0, 48 | help="Port to use for load balancer.", 49 | ) 50 | parser.add_argument( 51 | "--restful-gateway", 52 | action="store_true", 53 | help="Launches restful gateway process.", 54 | ) 55 | parser.add_argument( 56 | "--restful-gateway-port", 57 | type=int, 58 | default=0, 59 | help="Port to use for restful gateway.", 60 | ) 61 | args = parser.parse_args() 62 | assert not ( 63 | args.load_balancer and args.restful_gateway 64 | ), "Select only load-balancer OR restful-gateway." 65 | 66 | if args.restful_gateway: 67 | assert args.restful_gateway_port, "--restful-gateway-port must be provided." 68 | print(f"Starting RESTful API gateway on port: {args.restful_gateway_port}") 69 | gateway_thread = RestfulGatewayThread( 70 | deployment_name=args.deployment_name, 71 | task=args.model_config.task, 72 | lb_port=args.load_balancer_port, 73 | rest_port=args.restful_gateway_port, 74 | ) 75 | stop_event = gateway_thread.get_stop_event() 76 | gateway_thread.start() 77 | stop_event.wait() 78 | 79 | elif args.load_balancer: 80 | assert args.load_balancer_port, "--load-balancer-port must be provided." 81 | print(f"Starting load balancer on port: {args.load_balancer_port}") 82 | serve_load_balancing(args.model_config, args.load_balancer_port) 83 | 84 | else: 85 | assert args.server_port, "--server-port must be provided." 86 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 87 | port = args.server_port + local_rank 88 | 89 | inference_pipeline = load_models(args.model_config) 90 | 91 | print(f"Starting server on port: {port}") 92 | serve_inference(inference_pipeline, port) 93 | 94 | 95 | if __name__ == "__main__": 96 | # python -m mii.launch.multi_gpu_server 97 | main() 98 | -------------------------------------------------------------------------------- /mii/legacy/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import sys 6 | import logging 7 | 8 | log_levels = { 9 | "debug": logging.DEBUG, 10 | "info": logging.INFO, 11 | "warning": logging.WARNING, 12 | "error": logging.ERROR, 13 | "critical": logging.CRITICAL, 14 | } 15 | 16 | 17 | class LoggerFactory: 18 | @staticmethod 19 | def create_logger(name=None, level=logging.INFO): 20 | """create a logger 21 | Args: 22 | name (str): name of the logger 23 | level: level of logger 24 | Raises: 25 | ValueError is name is None 26 | """ 27 | 28 | if name is None: 29 | raise ValueError("name for logger cannot be None") 30 | 31 | formatter = logging.Formatter( 32 | "[%(asctime)s] [%(levelname)s] " 33 | "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") 34 | 35 | logger_ = logging.getLogger(name) 36 | logger_.setLevel(level) 37 | logger_.propagate = False 38 | ch = logging.StreamHandler(stream=sys.stdout) 39 | ch.setLevel(level) 40 | ch.setFormatter(formatter) 41 | logger_.addHandler(ch) 42 | return logger_ 43 | 44 | 45 | logger = LoggerFactory.create_logger(name="MII_legacy", level=logging.INFO) 46 | -------------------------------------------------------------------------------- /mii/legacy/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .score import create_score_file 6 | from .load_models import load_models 7 | -------------------------------------------------------------------------------- /mii/legacy/models/load_models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii.legacy as mii 7 | import torch 8 | import inspect 9 | import deepspeed 10 | from deepspeed.runtime.config import DeepSpeedConfig 11 | from deepspeed.runtime.zero.config import ZeroStageEnum 12 | 13 | 14 | def load_models(model_config): 15 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 16 | world_size = int(os.getenv("WORLD_SIZE", "1")) 17 | 18 | inf_config = { 19 | "tensor_parallel": { 20 | "tp_size": model_config.tensor_parallel, 21 | "mpu": None 22 | }, 23 | "dtype": model_config.dtype, 24 | "replace_method": "auto", 25 | "enable_cuda_graph": model_config.enable_cuda_graph, 26 | "checkpoint": None, 27 | "config": None, 28 | "training_mp_size": 1, 29 | "replace_with_kernel_inject": model_config.replace_with_kernel_inject, 30 | "max_tokens": model_config.max_tokens, 31 | "min_tokens": model_config.max_tokens, 32 | } 33 | 34 | provider = model_config.provider 35 | if provider == mii.constants.ModelProvider.HUGGING_FACE: 36 | from mii.legacy.models.providers.huggingface import hf_provider 37 | 38 | inference_pipeline = hf_provider(model_config) 39 | if model_config.meta_tensor: 40 | inf_config["checkpoint"] = inference_pipeline.checkpoint_dict 41 | if model_config.dtype == torch.int8: 42 | # Support for older DeepSpeed versions 43 | if ("enable_qkv_quantization" 44 | in inspect.signature(deepspeed.init_inference).parameters): 45 | inf_config["enable_qkv_quantization"] = True 46 | elif provider == mii.constants.ModelProvider.ELEUTHER_AI: 47 | assert False, "Eleuther AI support is currently disabled." 
48 | # TODO: Re-enable EleutherAI model support 49 | """ 50 | from mii.models.providers.eleutherai import eleutherai_provider 51 | assert mii_config.dtype == torch.half, "gpt-neox only support fp16" 52 | assert mii_config.enable_cuda_graph == False, "Provider EleutherAI not supported with Cuda Graphs" 53 | from megatron import mpu 54 | inf_config["tensor_parallel"]["mpu"] = mpu 55 | inference_pipeline = eleutherai_provider(model_path, 56 | model_name, 57 | task_name, 58 | mii_config) 59 | inf_config["training_mp_size"] = 2 60 | inf_config["config"] = inference_pipeline.neox_args 61 | """ 62 | elif provider == mii.constants.ModelProvider.DIFFUSERS: 63 | from mii.legacy.models.providers.diffusers import diffusers_provider 64 | inference_pipeline = diffusers_provider(model_config) 65 | else: 66 | raise ValueError(f"Unknown model provider {provider}") 67 | print( 68 | f"> --------- MII Settings: ds_optimize={model_config.enable_deepspeed}, replace_with_kernel_inject={model_config.replace_with_kernel_inject}, enable_cuda_graph={model_config.enable_cuda_graph} " 69 | ) 70 | if model_config.enable_deepspeed: 71 | engine = deepspeed.init_inference(getattr(inference_pipeline, 72 | "model", 73 | inference_pipeline), 74 | config=inf_config) 75 | if model_config.profile_model_time: 76 | engine.profile_model_time() 77 | if hasattr(inference_pipeline, "model"): 78 | inference_pipeline.model = engine 79 | 80 | elif model_config.enable_zero: 81 | ds_config = DeepSpeedConfig(model_config.ds_config) 82 | assert ( 83 | ds_config.zero_optimization_stage == ZeroStageEnum.weights 84 | ), "DeepSpeed ZeRO inference is only supported for ZeRO-3" 85 | 86 | # initialise Deepspeed ZeRO and store only the engine object 87 | ds_engine = deepspeed.initialize(model=inference_pipeline.model, 88 | config=model_config.ds_config)[0] 89 | ds_engine.module.eval() # inference 90 | inference_pipeline.model = ds_engine.module 91 | 92 | if model_config.load_with_sys_mem: 93 | inference_pipeline.device = torch.device(f"cuda:{local_rank}") 94 | 95 | # Free up memory used when initially loading models 96 | # so nvidia-smi reports correct amount of memory used. 97 | torch.cuda.empty_cache() 98 | 99 | return inference_pipeline 100 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/diffusers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import torch 7 | 8 | from .utils import attempt_load 9 | from mii.config import ModelConfig 10 | 11 | 12 | def diffusers_provider(model_config: ModelConfig): 13 | from diffusers import DiffusionPipeline 14 | 15 | local_rank = int(os.getenv("LOCAL_RANK", "0")) 16 | 17 | kwargs = model_config.pipeline_kwargs 18 | if model_config.dtype == torch.half: 19 | kwargs["torch_dtype"] = torch.float16 20 | kwargs["revision"] = "fp16" 21 | 22 | pipeline = attempt_load(DiffusionPipeline.from_pretrained, 23 | model_config.model, 24 | model_config.model_path, 25 | kwargs=kwargs) 26 | pipeline = pipeline.to(f"cuda:{local_rank}") 27 | pipeline.set_progress_bar_config(disable=True) 28 | return pipeline 29 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/eleutherai.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/legacy/models/providers/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from mii.utils import is_aml, mii_cache_path 7 | 8 | 9 | def attempt_load(load_fn, model_name, model_path, cache_path=None, kwargs={}): 10 | try: 11 | value = load_fn(model_name, **kwargs) 12 | except Exception as ex: 13 | if is_aml(): 14 | print( 15 | f"Attempted load but failed - {str(ex)}, retrying using model_path={model_path}" 16 | ) 17 | value = load_fn(model_path, **kwargs) 18 | else: 19 | cache_path = cache_path or mii_cache_path() 20 | print( 21 | f"Attempted load but failed - {str(ex)}, retrying using cache_dir={cache_path}" 22 | ) 23 | value = load_fn(model_name, cache_dir=cache_path, **kwargs) 24 | return value 25 | -------------------------------------------------------------------------------- /mii/legacy/models/score/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .generate import create_score_file, generated_score_path 6 | -------------------------------------------------------------------------------- /mii/legacy/models/score/generate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii.legacy as mii 7 | import pprint 8 | from mii.legacy.logging import logger 9 | from mii.legacy.constants import DeploymentType 10 | 11 | 12 | def create_score_file(mii_config): 13 | if len(mii.__path__) > 1: 14 | logger.warning( 15 | f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" 16 | ) 17 | 18 | with open(os.path.join(mii.__path__[0], 19 | "models/score/score_template.py"), 20 | "r") as fd: 21 | score_src = fd.read() 22 | 23 | # update score file w. 
global config dict 24 | config_dict = mii_config.dict() 25 | source_with_config = f"{score_src}\n" 26 | source_with_config += f"mii_config = {pprint.pformat(config_dict, indent=4)}" 27 | 28 | with open( 29 | generated_score_path(mii_config.deployment_name, 30 | mii_config.deployment_type), 31 | "w") as fd: 32 | fd.write(source_with_config) 33 | fd.write("\n") 34 | 35 | 36 | def generated_score_path(deployment_name, deployment_type): 37 | if deployment_type == DeploymentType.LOCAL: 38 | score_path = os.path.join(mii.utils.mii_cache_path(), deployment_name) 39 | elif deployment_type == DeploymentType.AML: 40 | score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_name), 41 | "code") 42 | if not os.path.isdir(score_path): 43 | os.makedirs(score_path) 44 | return os.path.join(score_path, "score.py") 45 | -------------------------------------------------------------------------------- /mii/legacy/models/score/score_template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # flake8: noqa 7 | import os 8 | import json 9 | import time 10 | import torch 11 | 12 | import mii.legacy as mii 13 | 14 | model = None 15 | 16 | 17 | def init(): 18 | global mii_config 19 | mii_config = mii.MIIConfig(**mii_config) 20 | 21 | # For AML deployments, we stand up multiple nginx server workers, one for 22 | # each replica. This is so that we can properly run multiple requests in 23 | # parallel on the different replicas. However, each worker will run this 24 | # generated score.py and try to stand up an entire MII deployment 25 | # (load-balancer, replicas, etc.). We want only one worker to spawn the 26 | # load-balancer and replicas. We take advantage of the nginx worker PIDs 27 | # being consecutive to achieve that here. 28 | start_server = True 29 | if mii.utils.is_aml() and (int(os.getpid()) % mii_config.replica_num != 0): 30 | start_server = False 31 | 32 | if start_server: 33 | mii.MIIServer(mii_config) 34 | 35 | global model 36 | model = None 37 | 38 | # In AML deployments both the GRPC client and server are used in the same process 39 | if mii.utils.is_aml(): 40 | model = mii.MIIClient(mii_config=mii_config) 41 | 42 | 43 | def run(request): 44 | global mii_config, model 45 | assert ( 46 | model is not None 47 | ), "grpc client has not been setup when this model was created" 48 | 49 | request_dict = json.loads(request) 50 | 51 | query_dict = mii.utils.extract_query_dict(mii_config.task, request_dict) 52 | 53 | response = model.query(query_dict, **request_dict) 54 | 55 | time_taken = response.time_taken 56 | if not isinstance(response.response, str): 57 | response = [r for r in response.response] 58 | return json.dumps({"responses": response, "time": time_taken}) 59 | 60 | 61 | ### Auto-generated config will be appended below at run-time 62 | -------------------------------------------------------------------------------- /mii/legacy/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import io 7 | from mii.legacy.utils import mii_cache_path 8 | 9 | 10 | def supported_models_from_huggingface(): 11 | return ["gpt2", "deepset/roberta-large-squad2"] 12 | 13 | 14 | """TODO make this more robust. 
If the pipeline has already been imported then 15 | this might not work since the cache is set by the first import""" 16 | 17 | 18 | def _download_hf_model_to_path(task, model_name, model_path): 19 | 20 | os.environ["TRANSFORMERS_CACHE"] = model_path 21 | from transformers import pipeline 22 | 23 | inference_pipeline = pipeline(task, model=model_name) 24 | 25 | 26 | """generic method that will allow downloading all models that we support. 27 | Currently only supports HF models, but will be extended to support model checkpoints 28 | from other sources""" 29 | 30 | 31 | def download_model_and_get_path(task, model_name): 32 | 33 | model_path = os.path.join(mii_cache_path(), model_name) 34 | if not os.path.isdir(model_path): 35 | os.makedirs(model_path) 36 | 37 | if model_name in supported_models_from_huggingface(): 38 | _download_hf_model_to_path(task, model_name, model_path) 39 | else: 40 | assert False, "Only models from HF supported so far" 41 | 42 | return model_path 43 | 44 | 45 | class ImageResponse: 46 | def __init__(self, response): 47 | self._response = response 48 | self.nsfw_content_detected = response.nsfw_content_detected 49 | self._deserialized_images = None 50 | 51 | @property 52 | def images(self): 53 | if self._deserialized_images is None: 54 | from PIL import Image 55 | 56 | images = [] 57 | for idx, img_bytes in enumerate(self._response.images): 58 | size = (self._response.size_w, self._response.size_h) 59 | img = Image.frombytes(self._response.mode, size, img_bytes) 60 | images.append(img) 61 | self._deserialized_images = images 62 | return self._deserialized_images 63 | 64 | 65 | def convert_bytes_to_pil_image(image_bytes: bytes): 66 | """Converts bytes to a PIL Image object.""" 67 | if not isinstance(image_bytes, bytes): 68 | return image_bytes 69 | 70 | from PIL import Image 71 | image = Image.open(io.BytesIO(image_bytes)) 72 | return image 73 | -------------------------------------------------------------------------------- /mii/legacy/terminate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import grpc 6 | 7 | import mii.legacy as mii 8 | from mii.legacy.logging import logger 9 | 10 | 11 | def terminate(deployment_name): 12 | logger.info(f"Terminating server for {deployment_name}") 13 | generator = mii.mii_query_handle(deployment_name) 14 | if deployment_name in mii.non_persistent_models: 15 | generator.terminate() 16 | return 17 | try: 18 | generator.query({"query": ""}) 19 | except grpc.aio._call.AioRpcError as error: 20 | if error._code == grpc.StatusCode.UNAVAILABLE: 21 | logger.warn(f"Server for {deployment_name} not found") 22 | else: 23 | pass 24 | except (KeyError, TypeError) as error: 25 | pass 26 | 27 | generator.terminate() 28 | mii.client.terminate_restful_gateway(deployment_name) 29 | -------------------------------------------------------------------------------- /mii/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import logging 6 | import sys 7 | 8 | log_levels = { 9 | "debug": logging.DEBUG, 10 | "info": logging.INFO, 11 | "warning": logging.WARNING, 12 | "error": logging.ERROR, 13 | "critical": logging.CRITICAL, 14 | } 15 | 16 | 17 | class LoggerFactory: 18 | @staticmethod 19 | def create_logger(name=None, level=logging.INFO): 20 | """create a logger 21 | Args: 22 | name (str): name of the logger 23 | level: level of logger 24 | Raises: 25 | ValueError is name is None 26 | """ 27 | 28 | if name is None: 29 | raise ValueError("name for logger cannot be None") 30 | 31 | formatter = logging.Formatter( 32 | "[%(asctime)s] [%(levelname)s] " 33 | "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") 34 | 35 | logger_ = logging.getLogger(name) 36 | logger_.setLevel(level) 37 | logger_.propagate = False 38 | ch = logging.StreamHandler(stream=sys.stdout) 39 | ch.setLevel(level) 40 | ch.setFormatter(formatter) 41 | logger_.addHandler(ch) 42 | return logger_ 43 | 44 | 45 | logger = LoggerFactory.create_logger(name="MII", level=logging.INFO) 46 | -------------------------------------------------------------------------------- /mii/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /mii/modeling/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from deepspeed.inference import build_hf_engine, InferenceEngineV2 7 | 8 | from mii.config import ModelConfig 9 | from mii.constants import ModelProvider 10 | from mii.utils import init_distributed 11 | 12 | 13 | def load_model(model_config: ModelConfig) -> InferenceEngineV2: 14 | init_distributed(model_config) 15 | provider = model_config.provider 16 | if provider == ModelProvider.HUGGING_FACE: 17 | inference_engine = build_hf_engine( 18 | path=model_config.model_name_or_path, 19 | engine_config=model_config.inference_engine_config) 20 | else: 21 | raise ValueError(f"Unknown model provider {provider}") 22 | 23 | return inference_engine 24 | -------------------------------------------------------------------------------- /mii/modeling/tokenizers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | from abc import ABC, abstractmethod 7 | from typing import TYPE_CHECKING, Union 8 | 9 | import torch 10 | from transformers import AutoTokenizer 11 | 12 | from mii.constants import ModelProvider 13 | 14 | if TYPE_CHECKING: 15 | from mii.config import ModelConfig 16 | 17 | 18 | class MIITokenizerWrapper(ABC): 19 | def __init__(self, tokenizer: object) -> None: 20 | self.tokenizer = tokenizer 21 | 22 | @property 23 | @abstractmethod 24 | def vocab_size(self) -> int: 25 | ... 26 | 27 | @property 28 | @abstractmethod 29 | def eos_token_id(self) -> int: 30 | ... 31 | 32 | @abstractmethod 33 | def encode(self, input: str) -> torch.Tensor: 34 | ... 35 | 36 | @abstractmethod 37 | def decode(self, tokens: torch.Tensor) -> str: 38 | ... 
39 | 40 | 41 | class HFTokenizer(MIITokenizerWrapper): 42 | def __init__(self, tokenizer: Union[str, object]) -> None: 43 | if isinstance(tokenizer, str): 44 | tokenizer = AutoTokenizer.from_pretrained(tokenizer, trust_remote_code=True) 45 | tokenizer.pad_token = tokenizer.eos_token 46 | super().__init__(tokenizer) 47 | 48 | @property 49 | def vocab_size(self) -> int: 50 | return len(self.tokenizer) 51 | 52 | @property 53 | def eos_token_id(self) -> int: 54 | eos_token_attrs = ["eod", "eos_token_id", "eos_token", "eod_id"] 55 | for attr in eos_token_attrs: 56 | if getattr(self.tokenizer, attr, None) is not None: 57 | return getattr(self.tokenizer, attr) 58 | raise ValueError(f"Tokenizer must have one of {eos_token_attrs} attributes.") 59 | 60 | def encode(self, input: str) -> torch.Tensor: 61 | return self.tokenizer.encode(input, return_tensors="pt").flatten() 62 | 63 | def convert_tokens_to_ids(self, input: str) -> int: 64 | return self.tokenizer.convert_tokens_to_ids(input) 65 | 66 | def decode(self, tokens: torch.Tensor) -> str: 67 | return self.tokenizer.decode(tokens) 68 | 69 | 70 | def load_tokenizer(model_config: "ModelConfig") -> MIITokenizerWrapper: 71 | provider = model_config.provider 72 | if provider == ModelProvider.HUGGING_FACE: 73 | tokenizer = HFTokenizer(model_config.tokenizer) 74 | else: 75 | raise ValueError(f"Unknown model provider {provider}") 76 | 77 | return tokenizer 78 | -------------------------------------------------------------------------------- /mii/score/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | from .generate import create_score_file, generated_score_path 6 | -------------------------------------------------------------------------------- /mii/score/generate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import mii 7 | import pprint 8 | from mii.logging import logger 9 | from mii.constants import DeploymentType 10 | 11 | 12 | def create_score_file(mii_config): 13 | if len(mii.__path__) > 1: 14 | logger.warning( 15 | f"Detected mii path as multiple sources: {mii.__path__}, might cause unknown behavior" 16 | ) 17 | 18 | with open(os.path.join(mii.__path__[0], "score/score_template.py"), "r") as fd: 19 | score_src = fd.read() 20 | 21 | # update score file w. 
global config dict 22 | config_dict = mii_config.model_dump() 23 | source_with_config = f"{score_src}\n" 24 | source_with_config += f"mii_config = {pprint.pformat(config_dict, indent=4)}" 25 | 26 | with open( 27 | generated_score_path(mii_config.deployment_name, 28 | mii_config.deployment_type), 29 | "w") as fd: 30 | fd.write(source_with_config) 31 | fd.write("\n") 32 | 33 | 34 | def generated_score_path(deployment_name, deployment_type): 35 | if deployment_type == DeploymentType.LOCAL: 36 | score_path = os.path.join(mii.utils.mii_cache_path(), deployment_name) 37 | elif deployment_type == DeploymentType.AML: 38 | score_path = os.path.join(mii.aml_related.utils.aml_output_path(deployment_name), 39 | "code") 40 | if not os.path.isdir(score_path): 41 | os.makedirs(score_path) 42 | return os.path.join(score_path, "score.py") 43 | -------------------------------------------------------------------------------- /mii/score/score_template.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | # flake8: noqa 7 | import os 8 | import json 9 | import time 10 | import torch 11 | 12 | import mii 13 | 14 | model = None 15 | 16 | 17 | def init(): 18 | global mii_config 19 | mii_config = mii.config.MIIConfig(**mii_config) 20 | 21 | # For AML deployments, we stand up multiple nginx server workers, one for 22 | # each replica. This is so that we can properly run multiple requests in 23 | # parallel on the different replicas. However, each worker will run this 24 | # generated score.py and try to stand up an entire MII deployment 25 | # (load-balancer, replicas, etc.). We want only one worker to spawn the 26 | # load-balancer and replicas. We take advantage of the nginx worker PIDs 27 | # being consecutive to achieve that here. 
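    # For illustration (the PID values here are hypothetical): among any
    # replica_num consecutive integers exactly one is divisible by replica_num,
    # so with replica_num == 4 and nginx workers holding PIDs 1000, 1001, 1002
    # and 1003, exactly one worker sees os.getpid() % mii_config.replica_num == 0
    # and stands up the server; the others only create the gRPC client below.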
28 | start_server = True 29 | if mii.utils.is_aml() and (int(os.getpid()) % mii_config.replica_num != 0): 30 | start_server = False 31 | 32 | if start_server: 33 | mii.backend.MIIServer(mii_config) 34 | 35 | global model 36 | model = None 37 | 38 | # In AML deployments both the GRPC client and server are used in the same process 39 | if mii.utils.is_aml(): 40 | model = mii.backend.MIIClient(mii_config=mii_config) 41 | 42 | 43 | def run(request): 44 | global mii_config, model 45 | assert ( 46 | model is not None 47 | ), "grpc client has not been setup when this model was created" 48 | 49 | request_dict = json.loads(request) 50 | 51 | query_dict = mii.utils.extract_query_dict(mii_config.task, request_dict) 52 | 53 | response = model.query(query_dict, **request_dict) 54 | 55 | time_taken = response.time_taken 56 | if not isinstance(response.response, str): 57 | response = [r for r in response.response] 58 | return json.dumps({"responses": response, "time": time_taken}) 59 | 60 | 61 | ### Auto-generated config will be appended below at run-time 62 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "build", 4 | "setuptools>=64", 5 | "wheel" 6 | ] 7 | # Use legacy backend to import local packages in setup.py 8 | build-backend = "setuptools.build_meta:__legacy__" 9 | -------------------------------------------------------------------------------- /release/bump_patch_version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import argparse 7 | from packaging import version as pkg_version 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument( 12 | "--current_version", 13 | type=str, 14 | help="The current version being published to help set the next version.") 15 | 16 | args = parser.parse_args() 17 | 18 | current_version = pkg_version.parse(args.current_version) 19 | 20 | with open('./version.txt', 'w') as fd: 21 | fd.write( 22 | f'{current_version.major}.{current_version.minor}.{current_version.micro + 1}\n') 23 | 24 | print( 25 | f'{current_version} -> {current_version.major}.{current_version.minor}.{current_version.micro + 1}' 26 | ) 27 | -------------------------------------------------------------------------------- /release/check_release_version.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import argparse 7 | from packaging import version as pkg_version 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument("--release_version", 12 | type=str, 13 | help="The new version being published.") 14 | 15 | args = parser.parse_args() 16 | 17 | release_version = pkg_version.parse(args.release_version) 18 | 19 | with open('./version.txt') as fd: 20 | repo_version = pkg_version.parse(fd.read()) 21 | 22 | assert repo_version == release_version, f"{repo_version=} does not match {release_version=}, unable to proceed" 23 | -------------------------------------------------------------------------------- /release/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | 7 | cd .. 8 | 9 | if [ ! -f ~/.pypirc ]; then 10 | echo 'create .pypirc in order to upload to PyPI' 11 | exit 1 12 | fi 13 | 14 | version=$1 15 | 16 | if [ -z $version ]; then 17 | echo "please provide version number for release" 18 | exit 1 19 | fi 20 | 21 | if [[ $version == *"v"* ]]; then 22 | echo "please only include version number without 'v' prefix" 23 | exit 1 24 | fi 25 | 26 | if [ "${version}" != `cat version.txt` ]; then 27 | echo "version=${version} does not match version.txt" 28 | cat version.txt 29 | exit 1 30 | fi 31 | 32 | python -c "import twine" 33 | if [ $? != 0 ]; then 34 | echo 'please install twine via pip' 35 | exit 1 36 | fi 37 | 38 | MII_BUILD_STRING="" python -m build --wheel 39 | WHL=deepspeed_mii-${version}-py3-none-any.whl 40 | 41 | if [ ! -f dist/${WHL} ]; then 42 | echo "prepared version does not match version given ($version), bump version first?" 43 | ls dist 44 | exit 1 45 | fi 46 | 47 | python -m twine upload --verbose dist/${WHL} --repository mii 48 | 49 | git tag v${version} 50 | git push origin v${version} 51 | 52 | echo "bumping up patch version" 53 | cd - 54 | python bump_patch_version.py 55 | -------------------------------------------------------------------------------- /requirements/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | clang-format==18.1.3 2 | einops 3 | pre-commit>=2.20.0 4 | pytest 5 | pytest-forked 6 | sentencepiece 7 | tiktoken 8 | transformers-stream-generator 9 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | asyncio 3 | deepspeed>=0.15.0 4 | deepspeed-kernels 5 | fastapi 6 | fastchat 7 | Flask-RESTful 8 | grpcio 9 | grpcio-tools 10 | Pillow 11 | pydantic>=2.0.0 12 | pyzmq 13 | safetensors 14 | shortuuid 15 | torch 16 | transformers 17 | ujson 18 | Werkzeug 19 | -------------------------------------------------------------------------------- /scripts/check-license.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Microsoft Corporation. 
3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | from __future__ import annotations 7 | '''Copyright The Microsoft DeepSpeed Team''' 8 | """ 9 | Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py 10 | """ 11 | 12 | import subprocess 13 | import sys 14 | 15 | 16 | def err(s: str) -> None: 17 | print(s, file=sys.stderr) 18 | 19 | 20 | COPYRIGHT = [ 21 | r"^\(\/\/\|#\) Copyright (c) Microsoft Corporation.$", 22 | r"^\(\/\/\|#\) SPDX-License-Identifier: Apache-2.0$", 23 | r"^\(\/\/\|#\) DeepSpeed Team$" 24 | ] 25 | 26 | success = True 27 | failures = [] 28 | for f in sys.argv[1:]: 29 | for copyright_line in COPYRIGHT: 30 | if not success: 31 | break 32 | res = subprocess.run(["git", 33 | "grep", 34 | "--quiet", 35 | "-e", 36 | copyright_line, 37 | f], 38 | capture_output=True) 39 | if res.returncode == 1: 40 | success = False 41 | failures.append(f) 42 | elif res.returncode == 2: 43 | err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") 44 | err(res.stderr.decode("utf-8")) 45 | sys.exit(2) 46 | 47 | if not success: 48 | err(f'{failures}: Missing license at top of file') 49 | err(res.stdout.decode("utf-8")) 50 | sys.exit(1) 51 | -------------------------------------------------------------------------------- /scripts/model_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) Microsoft Corporation. 3 | # SPDX-License-Identifier: Apache-2.0 4 | 5 | # DeepSpeed Team 6 | import os 7 | import argparse 8 | 9 | from huggingface_hub import HfApi 10 | from transformers import AutoConfig, AutoTokenizer, AutoModel 11 | 12 | 13 | def dir_path(path_str): 14 | if os.path.isdir(path_str): 15 | return path_str 16 | elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y": 17 | os.makedirs(path_str) 18 | return path_str 19 | else: 20 | raise NotADirectoryError(path_str) 21 | 22 | 23 | class HFModelNotFoundError(Exception): 24 | def __init__(self, model_str): 25 | super().__init__(f"HuggingFace model not found: '{model_str}'") 26 | 27 | 28 | def hf_model(model_str): 29 | api = HfApi() 30 | models = [m.id for m in api.list_models()] 31 | if model_str in models: 32 | return model_str 33 | else: 34 | raise HFModelNotFoundError(model_str) 35 | 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--model_path", 39 | '-o', 40 | type=dir_path, 41 | required=True, 42 | help="Output directory for downloaded model files") 43 | parser.add_argument("--model_name", 44 | '-m', 45 | type=hf_model, 46 | required=True, 47 | help="HuggingFace model name") 48 | args = parser.parse_args() 49 | 50 | for auto_func in [AutoConfig, AutoTokenizer, AutoModel]: 51 | auto_func.from_pretrained(args.model_name, cache_dir=args.model_path) 52 | 53 | print(f"Cached files for '{args.model_name}' downloaded to '{args.model_path}'") 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import os 6 | import sys 7 | import subprocess 8 | from setuptools import setup, find_packages 9 | 10 | 11 | def fetch_requirements(path): 12 | with open(path, 'r') as fd: 13 | return [r.strip() for r in fd.readlines()] 14 | 15 | 16 | install_requires = fetch_requirements('requirements/requirements.txt') 17 | 18 | extras_require = {"dev": fetch_requirements('requirements/requirements-dev.txt')} 19 | 20 | 21 | def command_exists(cmd): 22 | if sys.platform == "win32": 23 | result = subprocess.Popen(f'{cmd}', stdout=subprocess.PIPE, shell=True) 24 | return result.wait() == 1 25 | else: 26 | result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) 27 | return result.wait() == 0 28 | 29 | 30 | # Write out version/git info 31 | git_hash_cmd = "git rev-parse --short HEAD" 32 | git_branch_cmd = "git rev-parse --abbrev-ref HEAD" 33 | if command_exists('git') and 'DS_BUILD_STRING' not in os.environ: 34 | try: 35 | result = subprocess.check_output(git_hash_cmd, shell=True) 36 | git_hash = result.decode('utf-8').strip() 37 | result = subprocess.check_output(git_branch_cmd, shell=True) 38 | git_branch = result.decode('utf-8').strip() 39 | except subprocess.CalledProcessError: 40 | git_hash = "unknown" 41 | git_branch = "unknown" 42 | else: 43 | git_hash = "unknown" 44 | git_branch = "unknown" 45 | 46 | # Parse the MII version string from version.txt 47 | version_str = open('version.txt', 'r').read().strip() 48 | 49 | # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. 50 | # example: MII_BUILD_STR=".dev20201022" python -m build --sdist --wheel 51 | 52 | MII_BUILD_STRING = 'MII_BUILD_STRING' 53 | BUILD_FILE = 'build.txt' 54 | mii_build_string = os.environ.get(MII_BUILD_STRING) 55 | 56 | # Building wheel for distribution, update version file 57 | if mii_build_string: 58 | # Build string env specified, probably building for distribution 59 | with open(BUILD_FILE, 'w') as fd: 60 | fd.write(mii_build_string) 61 | version_str += mii_build_string 62 | elif os.path.isfile(BUILD_FILE): 63 | # build.txt exists, probably installing from distribution 64 | with open(BUILD_FILE, 'r') as fd: 65 | version_str += fd.read().strip() 66 | else: 67 | # None of the above, probably installing from source 68 | version_str += f'+{git_hash}' 69 | 70 | # write out installed version 71 | with open("mii/version.py", 'w') as fd: 72 | fd.write(f"__version__ = '{version_str}'\n") 73 | 74 | # Parse README.md to make long_description for PyPI page. 
75 | thisdir = os.path.abspath(os.path.dirname(__file__)) 76 | with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin: 77 | readme_text = fin.read() 78 | print("PACKAGES", find_packages()) 79 | setup(name="deepspeed-mii", 80 | version=version_str, 81 | long_description=readme_text, 82 | long_description_content_type='text/markdown', 83 | description='deepspeed mii', 84 | author='DeepSpeed Team', 85 | author_email='deepspeed-mii@microsoft.com', 86 | url='http://deepspeed.ai', 87 | project_urls={ 88 | 'Documentation': 'https://github.com/deepspeedai/DeepSpeed-MII', 89 | 'Source': 'https://github.com/deepspeedai/DeepSpeed-MII', 90 | }, 91 | install_requires=install_requires, 92 | extras_require=extras_require, 93 | packages=find_packages(exclude=("tests", 94 | )), 95 | classifiers=[ 96 | 'Programming Language :: Python :: 3.8', 97 | 'Programming Language :: Python :: 3.9', 98 | 'Programming Language :: Python :: 3.10', 99 | 'Programming Language :: Python :: 3.11', 100 | 'Programming Language :: Python :: 3.12' 101 | ]) 102 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import time 8 | import os 9 | import mii 10 | from types import SimpleNamespace 11 | from typing import Union 12 | from deepspeed.launcher.runner import DLTS_HOSTFILE 13 | import deepspeed.comm as dist 14 | from huggingface_hub import snapshot_download 15 | 16 | 17 | @pytest.fixture(scope="function", params=[None]) 18 | def tensor_parallel(request): 19 | if request.param is not None: 20 | return request.param 21 | return int(os.getenv("WORLD_SIZE", "1")) 22 | 23 | 24 | @pytest.fixture(scope="function", params=[50050]) 25 | def port_number(request): 26 | return request.param 27 | 28 | 29 | @pytest.fixture(scope="function", params=[1]) 30 | def replica_num(request): 31 | return request.param 32 | 33 | 34 | @pytest.fixture(scope="function", params=[mii.config.DEVICE_MAP_DEFAULT]) 35 | def device_map(request): 36 | return request.param 37 | 38 | 39 | @pytest.fixture(scope="function", params=[False]) 40 | def enable_restful_api(request): 41 | return request.param 42 | 43 | 44 | @pytest.fixture(scope="function", params=[28080]) 45 | def restful_api_port(request): 46 | return request.param 47 | 48 | 49 | @pytest.fixture(scope="function", params=[None]) 50 | def hostfile_content(request): 51 | return request.param 52 | 53 | 54 | @pytest.fixture(scope="function", params=[DLTS_HOSTFILE]) 55 | def hostfile(request, hostfile_content, tmpdir): 56 | if hostfile_content is None: 57 | return request.param 58 | hostfile_path = tmpdir.join("hostfile") 59 | with open(hostfile_path, "w") as f: 60 | for line in hostfile_content: 61 | f.write(line + "\n") 62 | return str(hostfile_path) 63 | 64 | 65 | @pytest.fixture(scope="function", params=[mii.TaskType.TEXT_GENERATION]) 66 | def task_name(request): 67 | return request.param 68 | 69 | 70 | @pytest.fixture(scope="function", params=["facebook/opt-125m"]) 71 | def model_name(request): 72 | return request.param 73 | 74 | 75 | 
@pytest.fixture(scope="function", params=[False]) 76 | def local_model(request): 77 | return request.param 78 | 79 | 80 | @pytest.fixture(scope="function") 81 | def model_path(model_name, local_model, tmpdir): 82 | if not local_model: 83 | return None 84 | 85 | base_dir = os.getenv("HF_HOME", tmpdir) 86 | download_dir = os.path.join(base_dir, "mii-ci-models", model_name) 87 | snapshot_download(model_name, local_dir=download_dir) 88 | return download_dir 89 | 90 | 91 | @pytest.fixture(scope="function") 92 | def model_name_or_path(model_name, model_path): 93 | if model_path is not None: 94 | return model_path 95 | return model_name 96 | 97 | 98 | @pytest.fixture(scope="function", params=["test-dep"]) 99 | def deployment_name(request): 100 | return request.param 101 | 102 | 103 | @pytest.fixture(scope="function", params=[mii.DeploymentType.LOCAL]) 104 | def deployment_type(request): 105 | return request.param 106 | 107 | 108 | @pytest.fixture(scope="function", params=[True]) 109 | def all_rank_output(request): 110 | return request.param 111 | 112 | 113 | @pytest.fixture(scope="function") 114 | def model_config( 115 | model_name_or_path: str, 116 | task_name: str, 117 | tensor_parallel: int, 118 | replica_num: int, 119 | device_map: Union[str, 120 | dict], 121 | ): 122 | config = SimpleNamespace( 123 | model_name_or_path=model_name_or_path, 124 | task=task_name, 125 | tensor_parallel=tensor_parallel, 126 | replica_num=replica_num, 127 | device_map=device_map, 128 | ) 129 | return config.__dict__ 130 | 131 | 132 | @pytest.fixture(scope="function") 133 | def mii_config( 134 | deployment_name: str, 135 | deployment_type: str, 136 | port_number: int, 137 | enable_restful_api: bool, 138 | restful_api_port: int, 139 | hostfile: str, 140 | model_config: dict, 141 | ): 142 | config = SimpleNamespace( 143 | deployment_name=deployment_name, 144 | deployment_type=deployment_type, 145 | port_number=port_number, 146 | enable_restful_api=enable_restful_api, 147 | restful_api_port=restful_api_port, 148 | hostfile=hostfile, 149 | model_config=model_config, 150 | ) 151 | return config.__dict__ 152 | 153 | 154 | @pytest.fixture(scope="function", params=[None], ids=["nofail"]) 155 | def expected_failure(request): 156 | return request.param 157 | 158 | 159 | @pytest.fixture(scope="function") 160 | def pipeline(model_config, all_rank_output, expected_failure): 161 | if expected_failure is not None: 162 | with pytest.raises(expected_failure) as excinfo: 163 | mii.pipeline(model_config=model_config, all_rank_output=all_rank_output) 164 | yield excinfo 165 | else: 166 | pipe = mii.pipeline(model_config=model_config, all_rank_output=all_rank_output) 167 | yield pipe 168 | pipe.destroy() 169 | dist.destroy_process_group() 170 | 171 | 172 | @pytest.fixture(scope="function") 173 | def deployment(mii_config, expected_failure): 174 | if expected_failure is not None: 175 | with pytest.raises(expected_failure) as excinfo: 176 | mii.serve(mii_config=mii_config) 177 | yield excinfo 178 | else: 179 | client = mii.serve(mii_config=mii_config) 180 | yield client 181 | client.terminate_server() 182 | time.sleep(1) # Give a second for ports to be released 183 | 184 | 185 | @pytest.fixture(scope="function", params=["DeepSpeed is the greatest"], ids=["query0"]) 186 | def query(request): 187 | return request.param 188 | -------------------------------------------------------------------------------- /tests/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft 
Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | -------------------------------------------------------------------------------- /tests/legacy/conftest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import os 8 | import mii.legacy as mii 9 | from types import SimpleNamespace 10 | 11 | 12 | @pytest.fixture(scope="function", params=["fp16"]) 13 | def dtype(request): 14 | return request.param 15 | 16 | 17 | @pytest.fixture(scope="function", params=[1]) 18 | def tensor_parallel(request): 19 | return request.param 20 | 21 | 22 | @pytest.fixture(scope="function", params=[50050]) 23 | def port_number(request): 24 | return request.param 25 | 26 | 27 | @pytest.fixture(scope="function", params=[False]) 28 | def meta_tensor(request): 29 | return request.param 30 | 31 | 32 | @pytest.fixture(scope="function", params=[False]) 33 | def load_with_sys_mem(request): 34 | return request.param 35 | 36 | 37 | @pytest.fixture(scope="function", params=[1]) 38 | def replica_num(request): 39 | return request.param 40 | 41 | 42 | @pytest.fixture(scope="function", params=[False]) 43 | def enable_restful_api(request): 44 | return request.param 45 | 46 | 47 | @pytest.fixture(scope="function", params=[28080]) 48 | def restful_api_port(request): 49 | return request.param 50 | 51 | 52 | @pytest.fixture(scope="function", params=["text-generation"]) 53 | def task_name(request): 54 | return request.param 55 | 56 | 57 | @pytest.fixture(scope="function", params=["bigscience/bloom-560m"]) 58 | def model_name(request): 59 | return request.param 60 | 61 | 62 | @pytest.fixture(scope="function") 63 | def deployment_name(model_name): 64 | return model_name + "-deployment" 65 | 66 | 67 | @pytest.fixture(scope="function", params=[mii.DeploymentType.LOCAL]) 68 | def deployment_type(request): 69 | return request.param 70 | 71 | 72 | @pytest.fixture(scope="function", params=[True]) 73 | def enable_deepspeed(request): 74 | return request.param 75 | 76 | 77 | @pytest.fixture(scope="function", params=[False]) 78 | def enable_zero(request): 79 | return request.param 80 | 81 | 82 | @pytest.fixture(scope="function", params=[{}]) 83 | def ds_config(request): 84 | return request.param 85 | 86 | 87 | @pytest.fixture(scope="function") 88 | def replace_with_kernel_inject(model_name): 89 | if "clip-vit" in model_name: 90 | return False 91 | return True 92 | 93 | 94 | @pytest.fixture(scope="function") 95 | def model_config( 96 | task_name: str, 97 | model_name: str, 98 | dtype: str, 99 | tensor_parallel: int, 100 | meta_tensor: bool, 101 | load_with_sys_mem: bool, 102 | replica_num: int, 103 | enable_deepspeed: bool, 104 | enable_zero: bool, 105 | ds_config: dict, 106 | replace_with_kernel_inject: bool, 107 | ): 108 | config = SimpleNamespace( 109 | skip_model_check=True, # TODO: remove this once conversation task check is fixed 110 | task=task_name, 111 | model=model_name, 112 | dtype=dtype, 113 | tensor_parallel=tensor_parallel, 114 | model_path=os.getenv("TRANSFORMERS_CACHE", 115 | ""), 116 | meta_tensor=meta_tensor, 117 | load_with_sys_mem=load_with_sys_mem, 118 | replica_num=replica_num, 119 | enable_deepspeed=enable_deepspeed, 120 | enable_zero=enable_zero, 121 | ds_config=ds_config, 122 | replace_with_kernel_inject=replace_with_kernel_inject, 123 | ) 124 | return config.__dict__ 125 | 126 | 127 | @pytest.fixture(scope="function") 128 | 
def mii_config( 129 | deployment_type: str, 130 | port_number: int, 131 | enable_restful_api: bool, 132 | restful_api_port: int, 133 | ): 134 | config = SimpleNamespace( 135 | deployment_type=deployment_type, 136 | port_number=port_number, 137 | enable_restful_api=enable_restful_api, 138 | restful_api_port=restful_api_port, 139 | ) 140 | return config.__dict__ 141 | 142 | 143 | @pytest.fixture(scope="function", params=[None]) 144 | def expected_failure(request): 145 | return request.param 146 | 147 | 148 | @pytest.fixture(scope="function") 149 | def deployment(deployment_name, mii_config, model_config, expected_failure): 150 | if expected_failure is not None: 151 | with pytest.raises(expected_failure) as excinfo: 152 | mii.deploy( 153 | deployment_name=deployment_name, 154 | mii_config=mii_config, 155 | model_config=model_config, 156 | ) 157 | yield excinfo 158 | else: 159 | mii.deploy( 160 | deployment_name=deployment_name, 161 | mii_config=mii_config, 162 | model_config=model_config, 163 | ) 164 | yield deployment_name 165 | mii.terminate(deployment_name) 166 | 167 | 168 | @pytest.fixture(scope="function", params=[{"query": "DeepSpeed is the greatest"}]) 169 | def query(request): 170 | return request.param 171 | -------------------------------------------------------------------------------- /tests/legacy/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | deepspeed:Run test for deepspeed CI 4 | -------------------------------------------------------------------------------- /tests/legacy/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | import mii.legacy as mii 9 | from pydantic import ValidationError 10 | 11 | 12 | @pytest.mark.parametrize("port_number", [12345]) 13 | @pytest.mark.parametrize("tensor_parallel", [4]) 14 | def test_base_configs(deployment_name, mii_config, model_config): 15 | mii_config["deployment_name"] = deployment_name 16 | mii_config["model_conf"] = model_config 17 | mii_config = mii.config.MIIConfig(**mii_config) 18 | 19 | assert mii_config.port_number == 12345 20 | assert mii_config.model_conf.tensor_parallel == 4 21 | 22 | 23 | @pytest.mark.parametrize("port_number", ["fail"]) 24 | @pytest.mark.parametrize("tensor_parallel", [3.5]) 25 | def test_base_configs_literalfail(deployment_name, mii_config, model_config): 26 | with pytest.raises(ValidationError): 27 | mii_config["deployment_name"] = deployment_name 28 | mii_config["model_conf"] = model_config 29 | mii_config = mii.config.MIIConfig(**mii_config) 30 | -------------------------------------------------------------------------------- /tests/legacy/test_deployment_options.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import json 8 | import requests 9 | import mii.legacy as mii 10 | from pydantic import ValidationError 11 | 12 | 13 | @pytest.mark.deepspeed 14 | @pytest.mark.parametrize("meta_tensor", [True]) 15 | @pytest.mark.parametrize("tensor_parallel", [2]) 16 | def test_meta_tensor(deployment, query): 17 | generator = mii.mii_query_handle(deployment) 18 | result = generator.query(query) 19 | assert result 20 | 21 | 22 | @pytest.mark.parametrize("enable_restful_api", [True]) 23 | def test_restful_api(deployment, query, restful_api_port): 24 | generator = mii.mii_query_handle(deployment) 25 | for _ in range(2): 26 | result = generator.query(query) 27 | 28 | url = f"http://localhost:{restful_api_port}/mii/{deployment}" 29 | params = {"request": query} 30 | json_params = json.dumps(params) 31 | result = requests.post(url, 32 | data=json_params, 33 | headers={"Content-Type": "application/json"}) 34 | assert result.status_code == 200 35 | assert "response" in result.json() 36 | 37 | 38 | @pytest.mark.parametrize("load_with_sys_mem", [True]) 39 | def test_load_to_sys_mem(deployment, query): 40 | generator = mii.mii_query_handle(deployment) 41 | result = generator.query(query) 42 | assert result 43 | 44 | 45 | @pytest.mark.parametrize("replica_num", [2]) 46 | def test_replicas(deployment, query, replica_num): 47 | generator = mii.mii_query_handle(deployment) 48 | # Replicas are given queries in round-robin, so test each model is responding 49 | for _ in range(replica_num): 50 | result = generator.query(query) 51 | assert result 52 | 53 | 54 | @pytest.mark.deepspeed 55 | @pytest.mark.parametrize("enable_deepspeed", [False]) 56 | @pytest.mark.parametrize("enable_zero", [True]) 57 | @pytest.mark.parametrize( 58 | "ds_config", 59 | [ 60 | { 61 | "fp16": { 62 | "enabled": True 63 | }, 64 | "bf16": { 65 | "enabled": False 66 | }, 67 | "zero_optimization": { 68 | "stage": 3, 69 | "offload_param": { 70 | "device": "cpu", 71 | }, 72 | }, 73 | "train_micro_batch_size_per_gpu": 1, 74 | }, 75 | ], 76 | ) 77 | def test_zero_config(deployment, query): 78 | generator = mii.mii_query_handle(deployment) 79 | result = generator.query(query) 80 | assert result 81 | 82 | 83 | @pytest.mark.deepspeed 84 | @pytest.mark.parametrize("expected_failure", [ValidationError]) 85 | @pytest.mark.parametrize( 86 | "enable_deepspeed, enable_zero, dtype", 87 | [(True, 88 | True, 89 | "fp32"), 90 | (False, 91 | True, 92 | "fp16")], 93 | ) 94 | @pytest.mark.parametrize( 95 | "ds_config", 96 | [ 97 | { 98 | "fp16": { 99 | "enabled": False 100 | }, 101 | "bf16": { 102 | "enabled": False 103 | }, 104 | "zero_optimization": { 105 | "stage": 3, 106 | "offload_param": { 107 | "device": "cpu", 108 | }, 109 | }, 110 | "train_micro_batch_size_per_gpu": 1, 111 | }, 112 | ], 113 | ) 114 | def test_zero_config_fail(deployment, query): 115 | assert "assertion_error" in str(deployment.value) 116 | -------------------------------------------------------------------------------- /tests/legacy/test_local_deployment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import pytest 6 | import mii.legacy as mii 7 | 8 | 9 | @pytest.mark.parametrize( 10 | "task_name, model_name, query", 11 | [ 12 | ( 13 | "fill-mask", 14 | "bert-base-uncased", 15 | { 16 | "query": "Hello I'm a [MASK] model." 
17 | }, 18 | ), 19 | ( 20 | "question-answering", 21 | "deepset/roberta-large-squad2", 22 | { 23 | "question": "What is the greatest?", 24 | "context": "DeepSpeed is the greatest", 25 | }, 26 | ), 27 | ( 28 | "text-generation", 29 | "distilgpt2", 30 | { 31 | "query": ["DeepSpeed is the greatest"] 32 | }, 33 | ), 34 | ( 35 | "text-generation", 36 | "bigscience/bloom-560m", 37 | { 38 | "query": ["DeepSpeed is the greatest", 39 | "Seattle is"] 40 | }, 41 | ), 42 | ( 43 | "token-classification", 44 | "Jean-Baptiste/roberta-large-ner-english", 45 | { 46 | "query": "My name is jean-baptiste and I live in montreal." 47 | }, 48 | ), 49 | ( 50 | "text-classification", 51 | "roberta-large-mnli", 52 | { 53 | "query": "DeepSpeed is the greatest" 54 | }, 55 | ), 56 | ( 57 | "zero-shot-image-classification", 58 | "openai/clip-vit-base-patch32", 59 | { 60 | "image": 61 | "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", 62 | "candidate_labels": ["animals", 63 | "humans", 64 | "landscape"] 65 | }, 66 | ), 67 | ], 68 | ) 69 | def test_single_GPU(deployment, query): 70 | generator = mii.mii_query_handle(deployment) 71 | result = generator.query(query) 72 | assert result 73 | 74 | 75 | @pytest.mark.parametrize( 76 | "task_name, model_name, query", 77 | [ 78 | ( 79 | "text-generation", 80 | "bigscience/bloom-560m", 81 | { 82 | "query": ["DeepSpeed is the greatest", 83 | "Seattle is"] 84 | }, 85 | ), 86 | ], 87 | ) 88 | def test_multi_GPU(deployment, query): 89 | generator = mii.mii_query_handle(deployment) 90 | result = generator.query(query) 91 | assert result 92 | 93 | 94 | @pytest.mark.parametrize( 95 | "task_name, model_name, query", 96 | [ 97 | ( 98 | "text-generation", 99 | "bigscience/bloom-560m", 100 | { 101 | "query": ["DeepSpeed is the greatest", 102 | 'Seattle is'] 103 | }, 104 | ), 105 | ], 106 | ) 107 | def test_session(deployment, query): 108 | generator = mii.mii_query_handle(deployment) 109 | session_name = "test_session" 110 | generator.create_session(session_name) 111 | result = generator.query(query) 112 | generator.destroy_session(session_name) 113 | assert result 114 | -------------------------------------------------------------------------------- /tests/legacy/test_non_persistent_deployment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | import mii.legacy as mii 8 | 9 | 10 | @pytest.mark.parametrize("deployment_type", [mii.DeploymentType.NON_PERSISTENT]) 11 | @pytest.mark.parametrize( 12 | "task_name, model_name, query", 13 | [ 14 | ( 15 | "fill-mask", 16 | "bert-base-uncased", 17 | { 18 | "query": "Hello I'm a [MASK] model." 19 | }, 20 | ), 21 | ( 22 | "question-answering", 23 | "deepset/roberta-large-squad2", 24 | { 25 | "question": "What is the greatest?", 26 | "context": "DeepSpeed is the greatest", 27 | }, 28 | ), 29 | ( 30 | "text-generation", 31 | "distilgpt2", 32 | { 33 | "query": ["DeepSpeed is the greatest"] 34 | }, 35 | ), 36 | ( 37 | "text-generation", 38 | "bigscience/bloom-560m", 39 | { 40 | "query": ["DeepSpeed is the greatest", 41 | "Seattle is"] 42 | }, 43 | ), 44 | ( 45 | "token-classification", 46 | "Jean-Baptiste/roberta-large-ner-english", 47 | { 48 | "query": "My name is jean-baptiste and I live in montreal." 
49 | }, 50 | ), 51 | ( 52 | "text-classification", 53 | "roberta-large-mnli", 54 | { 55 | "query": "DeepSpeed is the greatest" 56 | }, 57 | ), 58 | ( 59 | "zero-shot-image-classification", 60 | "openai/clip-vit-base-patch32", 61 | { 62 | "image": 63 | "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", 64 | "candidate_labels": ["animals", 65 | "humans", 66 | "landscape"], 67 | }, 68 | ), 69 | ], 70 | ) 71 | def test_single_GPU(deployment, query): 72 | generator = mii.mii_query_handle(deployment) 73 | result = generator.query(query) 74 | assert result 75 | -------------------------------------------------------------------------------- /tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --ignore=legacy 3 | -------------------------------------------------------------------------------- /tests/test_arg_parsing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | from pydantic import ValidationError 9 | 10 | from mii.api import _parse_kwargs_to_model_config, _parse_kwargs_to_mii_config 11 | from mii.errors import UnknownArgument 12 | 13 | 14 | def test_model_name_or_path(): 15 | # model_name_or_path is required 16 | with pytest.raises(ValidationError): 17 | _parse_kwargs_to_mii_config() 18 | with pytest.raises(ValidationError): 19 | _parse_kwargs_to_model_config() 20 | 21 | # passing model_name_or_path as positional arg 22 | mii_config = _parse_kwargs_to_mii_config("test") 23 | assert mii_config.model_conf.model_name_or_path == "test" 24 | model_config, _ = _parse_kwargs_to_model_config("test") 25 | assert model_config.model_name_or_path == "test" 26 | 27 | # passing model_name_or_path in model_config 28 | mii_config = _parse_kwargs_to_mii_config(model_config={"model_name_or_path": "test"}) 29 | assert mii_config.model_conf.model_name_or_path == "test" 30 | mii_config = _parse_kwargs_to_mii_config( 31 | mii_config={"model_config": { 32 | "model_name_or_path": "test" 33 | }}) 34 | assert mii_config.model_conf.model_name_or_path == "test" 35 | model_config, _ = _parse_kwargs_to_model_config( 36 | model_config={"model_name_or_path": "test"} 37 | ) 38 | assert model_config.model_name_or_path == "test" 39 | 40 | # checking that model_name_or_path in model_config matches positional arg 41 | with pytest.raises(AssertionError): 42 | _parse_kwargs_to_mii_config("test", model_config={"model_name_or_path": "test2"}) 43 | with pytest.raises(AssertionError): 44 | _parse_kwargs_to_mii_config( 45 | "test", 46 | mii_config={"model_config": { 47 | "model_name_or_path": "test2" 48 | }}) 49 | with pytest.raises(AssertionError): 50 | _parse_kwargs_to_model_config("test", 51 | model_config={"model_name_or_path": "test2"}) 52 | 53 | 54 | def test_only_kwargs(): 55 | mii_config = _parse_kwargs_to_mii_config("test", 56 | tensor_parallel=2, 57 | enable_restful_api=True) 58 | assert mii_config.model_conf.model_name_or_path == "test" 59 | assert mii_config.model_conf.tensor_parallel == 2 60 | assert mii_config.enable_restful_api is True 61 | 62 | model_config, _ = _parse_kwargs_to_model_config("test", tensor_parallel=2) 63 | assert model_config.model_name_or_path == "test" 64 | assert model_config.tensor_parallel == 2 65 | 66 | 67 | def test_only_config_dicts(): 68 | mii_config = _parse_kwargs_to_mii_config( 69 | mii_config={"enable_restful_api": True}, 
70 | model_config={ 71 | "model_name_or_path": "test", 72 | "tensor_parallel": 2 73 | }, 74 | ) 75 | assert mii_config.model_conf.model_name_or_path == "test" 76 | assert mii_config.model_conf.tensor_parallel == 2 77 | assert mii_config.enable_restful_api is True 78 | 79 | mii_config = _parse_kwargs_to_mii_config( 80 | mii_config={ 81 | "enable_restful_api": True, 82 | "model_config": { 83 | "model_name_or_path": "test", 84 | "tensor_parallel": 2 85 | }, 86 | }) 87 | assert mii_config.model_conf.model_name_or_path == "test" 88 | assert mii_config.model_conf.tensor_parallel == 2 89 | assert mii_config.enable_restful_api is True 90 | 91 | model_config, _ = _parse_kwargs_to_model_config( 92 | model_config={"model_name_or_path": "test", "tensor_parallel": 2} 93 | ) 94 | assert model_config.model_name_or_path == "test" 95 | assert model_config.tensor_parallel == 2 96 | 97 | 98 | def test_unknown_kwargs(): 99 | with pytest.raises(UnknownArgument): 100 | _parse_kwargs_to_mii_config("test", unknown_kwarg=True) 101 | 102 | _, remaining_kwargs = _parse_kwargs_to_model_config("test", unknown_kwarg=True) 103 | assert remaining_kwargs == {"unknown_kwarg": True} 104 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | import mii 9 | 10 | 11 | @pytest.mark.parametrize("replica_num", [2]) 12 | @pytest.mark.parametrize("tensor_parallel", [2]) 13 | @pytest.mark.parametrize( 14 | "device_map", 15 | [ 16 | { 17 | "host_0": [[0, 18 | 1, 19 | 2, 20 | 3]] 21 | }, 22 | { 23 | "host_0": [[0, 24 | 1]], 25 | "host_1": [[0]] 26 | }, 27 | { 28 | "host_0": [[0, 29 | 1], 30 | [2, 31 | 3], 32 | [4, 33 | 5]] 34 | }, 35 | { 36 | "host_0": [[0, 37 | 1]] 38 | }, 39 | ], 40 | ) 41 | @pytest.mark.parametrize("hostfile_content", [["host_0 slots=8", "host_1 slots=8"]]) 42 | def test_deploy_map_fail(mii_config): 43 | mii_config = mii.config.MIIConfig(**mii_config) 44 | with pytest.raises(ValueError): 45 | mii_config.generate_replica_configs() 46 | 47 | 48 | @pytest.mark.parametrize("replica_num", [2]) 49 | @pytest.mark.parametrize("tensor_parallel", [2]) 50 | @pytest.mark.parametrize( 51 | "device_map", 52 | [ 53 | { 54 | "host_0": [[0, 55 | 1], 56 | [2, 57 | 3]] 58 | }, 59 | { 60 | "host_0": [[0, 61 | 1]], 62 | "host_1": [[0, 63 | 1]] 64 | }, 65 | ], 66 | ) 67 | @pytest.mark.parametrize("hostfile_content", [["host_0 slots=4", "host_1 slots=4"]]) 68 | def test_deploy_map(mii_config): 69 | mii_config = mii.config.MIIConfig(**mii_config) 70 | mii_config.generate_replica_configs() 71 | 72 | 73 | @pytest.mark.parametrize("replica_num", [2]) 74 | @pytest.mark.parametrize("tensor_parallel", [2]) 75 | @pytest.mark.parametrize( 76 | "hostfile_content", 77 | [["host_0 slots=4"], 78 | ["host_0 slots=2", 79 | "host_1 slots=2"], 80 | ["host_0 slots=8"]], 81 | ) 82 | def test_auto_fill_deploy_map(mii_config): 83 | mii_config = mii.config.MIIConfig(**mii_config) 84 | mii_config.generate_replica_configs() 85 | 86 | 87 | @pytest.mark.parametrize("device_map", [{"host_0": [[0, 1]]}, [[0, 1]], [0, 1], 1]) 88 | def test_deploy_map_input_types(mii_config): 89 | mii_config = mii.config.MIIConfig(**mii_config) 90 | -------------------------------------------------------------------------------- /tests/test_deployment.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import pytest 6 | 7 | import json 8 | import re 9 | import requests 10 | import subprocess 11 | import time 12 | 13 | import mii 14 | 15 | 16 | def test_single_gpu(deployment, query): 17 | outputs = deployment(query) 18 | assert outputs[0], "output is empty" 19 | 20 | 21 | def test_streaming(deployment, query): 22 | outputs = [] 23 | 24 | def callback(response): 25 | outputs.append(response[0].generated_text) 26 | 27 | deployment(query, streaming_fn=callback) 28 | assert outputs, "output is empty" 29 | 30 | 31 | def test_streaming_consistency(deployment, query): 32 | expected_output = deployment(query, do_sample=False) 33 | streaming_parts = [] 34 | 35 | def callback(response): 36 | streaming_parts.append(response[0].generated_text) 37 | 38 | deployment(query, do_sample=False, streaming_fn=callback) 39 | streaming_output = "".join(streaming_parts) 40 | 41 | assert streaming_output == expected_output[0].generated_text, "outputs w and w/o streaming are not equal" 42 | 43 | 44 | def test_multi_prompt(deployment, query): 45 | outputs = deployment([query] * 4) 46 | for r in outputs: 47 | assert r, "output is empty" 48 | 49 | 50 | @pytest.mark.parametrize("tensor_parallel", [2]) 51 | def test_multi_gpu(deployment, query): 52 | outputs = deployment(query) 53 | assert outputs[0], "output is empty" 54 | 55 | 56 | @pytest.mark.parametrize("replica_num", [2]) 57 | def test_multi_replica(deployment, query): 58 | deployment_name = deployment.mii_config.deployment_name 59 | 60 | start = time.time() 61 | outputs = mii.client(deployment_name)(query, max_length=128, ignore_eos=True) 62 | end = time.time() 63 | assert outputs[0], "output is empty" 64 | single_query_time = end - start 65 | 66 | procs = [] 67 | double_query_time = [] 68 | for _ in range(2): 69 | p = subprocess.Popen( 70 | [ 71 | "python3", 72 | "-c", 73 | f"import time, mii; start=time.time(); mii.client('{deployment_name}')('{query}', max_length=128, ignore_eos=True); print('time',time.time()-start)", 74 | ], 75 | stdout=subprocess.PIPE, 76 | ) 77 | procs.append(p) 78 | for p in procs: 79 | output, error = p.communicate() 80 | m = re.search(r"time (\d+.\d+)", output.decode("utf-8").strip()) 81 | assert m, "time not found" 82 | double_query_time.append(float(m.group(1))) 83 | 84 | double_query_time = sum(double_query_time) / 2 85 | 86 | assert single_query_time == pytest.approx( 87 | double_query_time, single_query_time / 2 88 | ), "two queries should take about the same time as one query" 89 | 90 | 91 | def test_query_kwargs(deployment, query): 92 | # test ignore_eos 93 | outputs = deployment( 94 | query, 95 | max_length=128, 96 | min_new_tokens=16, 97 | ignore_eos=True, 98 | top_p=0.9, 99 | top_k=50, 100 | temperature=0.9, 101 | ) 102 | assert outputs[0], "output is empty" 103 | 104 | 105 | def test_do_sample(deployment, query): 106 | output_0 = deployment(query, do_sample=False, max_length=128) 107 | output_1 = deployment(query, do_sample=False, max_length=128) 108 | assert ( 109 | output_0[0] == output_1[0] 110 | ), "do_sample=False should always return the same output" 111 | 112 | 113 | def test_return_full_text(deployment, query): 114 | outputs = deployment(query, max_length=128, return_full_text=True) 115 | assert outputs[0].generated_text.startswith(query), "output should start with the prompt" 116 | 117 | 118 | 
@pytest.mark.parametrize("enable_restful_api", [True]) 119 | def test_restful_api(deployment, query, deployment_name, restful_api_port): 120 | # Verify deployment is running 121 | outputs = deployment(query, max_length=128) 122 | assert outputs[0], "output is empty" 123 | 124 | # Verify REST API 125 | url = f"http://localhost:{restful_api_port}/mii/{deployment_name}" 126 | params = {"prompts": query, "max_length": 128} 127 | json_params = json.dumps(params) 128 | result = requests.post(url, 129 | data=json_params, 130 | headers={"Content-Type": "application/json"}) 131 | assert result.status_code == 200 132 | assert "generated_text" in result.json()[0] 133 | -------------------------------------------------------------------------------- /tests/test_model_support.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | import pytest 7 | 8 | import deepspeed 9 | import torch 10 | from deepspeed.inference.v2.checkpoint import ( 11 | CheckpointEngineBase, 12 | HuggingFaceCheckpointEngine, 13 | ) 14 | from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig 15 | from typing import Iterable, Tuple 16 | 17 | 18 | class ZeroWeightsCheckpointEngine(CheckpointEngineBase): 19 | """ Generates weight with all zeros for a given model for testing purposes. """ 20 | def __init__(self, model_name_or_path: str, auth_token: str = None) -> None: 21 | self.model_name_or_path = model_name_or_path 22 | self.model_config = AutoConfig.from_pretrained(self.model_name_or_path, 23 | trust_remote_code=True) 24 | if hasattr(self.model_config, "max_position_embeddings"): 25 | self.model_config.max_seq_length = self.model_config.max_position_embeddings 26 | else: 27 | try: 28 | generation_config = GenerationConfig.from_pretrained( 29 | self.model_name_or_path) 30 | self.model_config.max_seq_length = generation_config.max_length 31 | except OSError: 32 | self.model_config.max_seq_length = 2048 33 | 34 | def parameters(self) -> Iterable[Tuple[str, torch.Tensor]]: 35 | # Load with meta device is faster 36 | with deepspeed.OnDevice(dtype=torch.float16, device="meta"): 37 | model = AutoModelForCausalLM.from_config(self.model_config, 38 | trust_remote_code=True) 39 | 40 | for param_name, param in model.state_dict().items(): 41 | yield param_name, torch.zeros(param.shape) 42 | 43 | 44 | @pytest.fixture(scope="module", autouse=True) 45 | def inject_checkpoint_engine(): 46 | # Inject the random weihts checkpoint engine 47 | deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = ( 48 | ZeroWeightsCheckpointEngine) 49 | yield None 50 | # Restore the original checkpoint engine 51 | deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = ( 52 | HuggingFaceCheckpointEngine) 53 | 54 | 55 | @pytest.mark.parametrize( 56 | "model_name", 57 | [ 58 | "tiiuae/falcon-7b", 59 | "huggyllama/llama-7b", 60 | "NousResearch/Llama-2-7b-hf", 61 | "NousResearch/Hermes-2-Pro-Mistral-7B", 62 | "cloudyu/Mixtral_11Bx2_MoE_19B", 63 | "facebook/opt-125m", 64 | "microsoft/phi-2", 65 | "Qwen/Qwen-7B-Chat", 66 | "Qwen/Qwen1.5-0.5B", 67 | ], 68 | ids=[ 69 | "falcon", 70 | "llama", 71 | "llama-2", 72 | "mistral", 73 | "mixtral", 74 | "opt", 75 | "phi-2", 76 | "qwen", 77 | "qwen-2" 78 | ], 79 | ) 80 | def test_model(pipeline, query): 81 | outputs = pipeline(query, max_new_tokens=16) 82 | assert outputs[0], "output is empty" 83 | 84 | 85 | 
@pytest.mark.parametrize("local_model", [True]) 86 | def test_local_model_dir(pipeline): 87 | assert pipeline 88 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | 6 | 7 | def test_single_gpu(pipeline, query): 8 | outputs = pipeline(query) 9 | assert outputs[0], "output is empty" 10 | 11 | 12 | def test_multi_prompt(pipeline, query): 13 | outputs = pipeline([query] * 4) 14 | for r in outputs: 15 | assert r, "output is empty" 16 | 17 | 18 | def test_query_kwargs(pipeline, query): 19 | # test ignore_eos 20 | outputs = pipeline( 21 | query, 22 | max_length=128, 23 | min_new_tokens=16, 24 | ignore_eos=True, 25 | top_p=0.9, 26 | top_k=50, 27 | temperature=0.9, 28 | ) 29 | assert outputs[0], "output is empty" 30 | -------------------------------------------------------------------------------- /tests/test_ragged_batching.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # DeepSpeed Team 5 | import pytest 6 | 7 | from mii.batching.ragged_batching import ReadableStream 8 | from mii.config import ModelConfig 9 | from mii.modeling.tokenizers import load_tokenizer 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "model_name", 14 | [ 15 | "tiiuae/falcon-7b", 16 | "NousResearch/Llama-2-7b-hf", 17 | "NousResearch/Hermes-2-Pro-Mistral-7B", 18 | "cloudyu/Mixtral_11Bx2_MoE_19B", 19 | "facebook/opt-125m", 20 | ], 21 | ids=["falcon", 22 | "llama", 23 | "mistral", 24 | "mixtral", 25 | "opt"], 26 | ) 27 | @pytest.mark.parametrize( 28 | "query", 29 | [ 30 | "It’s a region that includes Washington, Oregon, and Idaho.", 31 | "# Heading\n\ntitle redundant spaces, #id — an anchor", 32 | "例如", 33 | ], 34 | ids=[ 35 | "apostrophe", 36 | "markdown", 37 | "chinese", 38 | ]) 39 | def test_readable_stream(model_config, query): 40 | tokenizer = load_tokenizer(ModelConfig(**model_config)) 41 | thread_id = 42 42 | 43 | token_ids = tokenizer.encode(query) 44 | expected = tokenizer.decode(token_ids) 45 | decoded = [] 46 | 47 | stream = ReadableStream(tokenizer) 48 | for token_id in token_ids: 49 | decoded.append(stream.decode(thread_id, [token_id])) 50 | 51 | assert "".join(decoded) == expected 52 | 53 | 54 | @pytest.mark.parametrize( 55 | "model_name,expected_size", 56 | [ 57 | ("tiiuae/falcon-7b", 58 | 65024), 59 | ("NousResearch/Llama-2-7b-hf", 60 | 32000), 61 | ("NousResearch/Hermes-2-Pro-Mistral-7B", 62 | 32032), 63 | ("cloudyu/Mixtral_11Bx2_MoE_19B", 64 | 32000), 65 | ("facebook/opt-125m", 66 | 50265), 67 | ("nvidia/Llama3-ChatQA-1.5-8B", 68 | 128256), 69 | ], 70 | ids=["falcon", 71 | "llama", 72 | "mistral", 73 | "mixtral", 74 | "opt", 75 | "llama3"], 76 | ) 77 | def test_vocab_size(model_config, expected_size): 78 | tokenizer = load_tokenizer(ModelConfig(**model_config)) 79 | assert tokenizer.vocab_size == expected_size 80 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.3.4 2 | --------------------------------------------------------------------------------