├── .github
│   └── workflows
│       └── lint-check.yml
├── .gitignore
├── .redocly.lint-ignore.yaml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── LICENSE
├── OWNERS
├── README.md
├── minutes
│   └── README.md
└── specification
    └── protocol
        ├── buf.yaml
        ├── generate_rest.yaml
        ├── inference_grpc.md
        ├── inference_rest.md
        ├── open_inference_grpc.proto
        └── open_inference_rest.yaml

/.github/workflows/lint-check.yml:
--------------------------------------------------------------------------------
1 | name: "OpenApi lint"
2 | 
3 | on:
4 |   push:
5 |     branches: [ main, release* ]
6 |   pull_request:
7 |   workflow_dispatch:
8 | 
9 | concurrency:
10 |   group: ${{ github.workflow }}-${{ github.ref }}
11 |   cancel-in-progress: true
12 | 
13 | jobs:
14 |   openapi-lint:
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - name: Check out source repository
18 |         uses: actions/checkout@v3
19 | 
20 |       - name: Install redocly cli
21 |         run: |
22 |           npm install -g @redocly/cli@1.4.0
23 | 
24 |       - name: Lint OpenAPI spec
25 |         run: |
26 |           redocly lint ./specification/protocol/open_inference_rest.yaml
27 | 
28 |   grpc-proto-lint:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Check out source repository
32 |         uses: actions/checkout@v3
33 | 
34 |       - name: Setup buf
35 |         uses: bufbuild/buf-setup-action@v1
36 | 
37 |       - name: Run buf lint
38 |         uses: bufbuild/buf-lint-action@v1
39 |         with:
40 |           input: ./specification/protocol
41 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | # Binaries for programs and plugins
3 | *.exe
4 | *.exe~
5 | *.dll
6 | *.so
7 | *.dylib
8 | *.pyc
9 | *.egg-info/
10 | 
11 | # Mac file system
12 | **/.DS_Store
13 | 
14 | # editor and IDE paraphernalia
15 | .idea
16 | .editorconfig
17 | .vscode
18 | .project
19 | *.swp
20 | *.swo
21 | *~
22 | 
23 | # Python dev
24 | .mypy_cache
25 | 
26 | .gitlab-ci.yml
27 | .openapi-generator-ignore
28 | .openapi-generator/
29 | 
30 | # node js
31 | **/node_modules
32 | 
--------------------------------------------------------------------------------
/.redocly.lint-ignore.yaml:
--------------------------------------------------------------------------------
1 | # This file instructs Redocly's linter to ignore the rules contained for specific parts of your API.
2 | # See https://redoc.ly/docs/cli/ for more information.
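# Note: the entries below are JSON Pointers into open_inference_rest.yaml.
# "~1" is the JSON Pointer escape for "/" (RFC 6901), so, for example,
# '#/paths/~1v2~1health~1live/get/responses' refers to the responses of the
# GET operation on the "/v2/health/live" path.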
3 | specification/protocol/open_inference_rest.yaml: 4 | no-empty-servers: 5 | - '#/servers' 6 | operation-4xx-response: 7 | - '#/paths/~1v2~1health~1live/get/responses' 8 | - '#/paths/~1v2~1health~1ready/get/responses' 9 | - >- 10 | #/paths/~1v2~1models~1${MODEL_NAME}~1versions~1${MODEL_VERSION}~1ready/get/responses 11 | - >- 12 | #/paths/~1v2~1models~1${MODEL_NAME}~1versions~1${MODEL_VERSION}/get/responses 13 | no-path-trailing-slash: 14 | - '#/paths/~1v2~1' 15 | no-unused-components: 16 | - '#/components/schemas/metadata_model_error_response' 17 | security-defined: 18 | - '#/paths/~1v2~1health~1live/get' 19 | - '#/paths/~1v2~1health~1ready/get' 20 | - '#/paths/~1v2~1models~1${MODEL_NAME}~1versions~1${MODEL_VERSION}~1ready/get' 21 | - '#/paths/~1v2~1/get' 22 | - '#/paths/~1v2~1models~1${MODEL_NAME}~1versions~1${MODEL_VERSION}/get' 23 | - >- 24 | #/paths/~1v2~1models~1${MODEL_NAME}~1versions~1${MODEL_VERSION}~1infer/post 25 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## CNCF Community Code of Conduct v1.3 2 | 3 | Other languages available: 4 | - [Arabic/العربية](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/ar.md) 5 | - [Bengali/বাংলা](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/bn.md) 6 | - [Bulgarian/Български](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/bg.md) 7 | - [Chinese/中文](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/zh.md) 8 | - [Czech/Česky](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/cs.md) 9 | - [Farsi/فارسی](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/fa.md) 10 | - [French/Français](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/fr.md) 11 | - [German/Deutsch](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/de.md) 12 | - [Hebrew/עברית](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/he.md) 13 | - [Hindi/हिन्दी](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/hi.md) 14 | - [Hungarian/Magyar](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/hu.md) 15 | - [Indonesian/Bahasa Indonesia](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/id.md) 16 | - [Italian/Italiano](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/it.md) 17 | - [Japanese/日本語](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/ja.md) 18 | - [Korean/한국어](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/ko.md) 19 | - [Polish/Polski](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/pl.md) 20 | - [Portuguese/Português](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/pt.md) 21 | - [Russian/Русский](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/ru.md) 22 | - [Spanish/Español](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/es.md) 23 | - [Turkish/Türkçe](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/tr.md) 24 | - [Ukrainian/Українська](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/uk.md) 
25 | - [Vietnamese/Tiếng Việt](https://github.com/cncf/foundation/blob/main/code-of-conduct-languages/vi.md) 26 | 27 | ### Community Code of Conduct 28 | 29 | As contributors, maintainers, and participants in the CNCF community, and in the interest of fostering 30 | an open and welcoming community, we pledge to respect all people who participate or contribute 31 | through reporting issues, posting feature requests, updating documentation, 32 | submitting pull requests or patches, attending conferences or events, or engaging in other community or project activities. 33 | 34 | We are committed to making participation in the CNCF community a harassment-free experience for everyone, regardless of age, body size, caste, disability, ethnicity, level of experience, family status, gender, gender identity and expression, marital status, military or veteran status, nationality, personal appearance, race, religion, sexual orientation, socioeconomic status, tribe, or any other dimension of diversity. 35 | 36 | ## Scope 37 | 38 | This code of conduct applies: 39 | * within project and community spaces, 40 | * in other spaces when an individual CNCF community participant's words or actions are directed at or are about a CNCF project, the CNCF community, or another CNCF community participant in the context of a CNCF activity. 41 | 42 | ### CNCF Events 43 | 44 | CNCF events that are produced by the Linux Foundation with professional events staff are governed by the Linux Foundation [Events Code of Conduct](https://events.linuxfoundation.org/code-of-conduct/) available on the event page. This is designed to be used in conjunction with the CNCF Code of Conduct. 45 | 46 | ## Our Standards 47 | 48 | The CNCF Community is open, inclusive and respectful. Every member of our community has the right to have their identity respected. 49 | 50 | Examples of behavior that contributes to a positive environment include but are not limited to: 51 | 52 | * Demonstrating empathy and kindness toward other people 53 | * Being respectful of differing opinions, viewpoints, and experiences 54 | * Giving and gracefully accepting constructive feedback 55 | * Accepting responsibility and apologizing to those affected by our mistakes, 56 | and learning from the experience 57 | * Focusing on what is best not just for us as individuals, but for the 58 | overall community 59 | * Using welcoming and inclusive language 60 | 61 | 62 | Examples of unacceptable behavior include but are not limited to: 63 | 64 | * The use of sexualized language or imagery 65 | * Trolling, insulting or derogatory comments, and personal or political attacks 66 | * Public or private harassment in any form 67 | * Publishing others' private information, such as a physical or email 68 | address, without their explicit permission 69 | * Violence, threatening violence, or encouraging others to engage in violent behavior 70 | * Stalking or following someone without their consent 71 | * Unwelcome physical contact 72 | * Unwelcome sexual or romantic attention or advances 73 | * Using CNCF projects or community spaces for political campaigning or promotion of political causes 74 | that are unrelated to the advancement of cloud native technology. To clarify, this policy does not restrict individuals' personal attire, including attire that expresses personal beliefs or aspects of identity. 
75 | * Other conduct which could reasonably be considered inappropriate in a 76 | professional setting 77 | 78 | The following behaviors are also prohibited: 79 | * Providing knowingly false or misleading information in connection with a Code of Conduct investigation or otherwise intentionally tampering with an investigation. 80 | * Retaliating against a person because they reported an incident or provided information about an incident as a witness. 81 | 82 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct. 83 | By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect 84 | of managing a CNCF project. 85 | Project maintainers who do not follow or enforce the Code of Conduct may be temporarily or permanently removed from the project team. 86 | 87 | ## Reporting 88 | 89 | For incidents occurring in the Kubernetes community, contact the [Kubernetes Code of Conduct Committee](https://git.k8s.io/community/committee-code-of-conduct) via . You can expect a response within three business days. 90 | 91 | For other projects, or for incidents that are project-agnostic or impact multiple CNCF projects, please contact the [CNCF Code of Conduct Committee](https://www.cncf.io/conduct/committee/) via . Alternatively, you can contact any of the individual members of the [CNCF Code of Conduct Committee](https://www.cncf.io/conduct/committee/) to submit your report. For more detailed instructions on how to submit a report, including how to submit a report anonymously, please see our [Incident Resolution Procedures](https://github.com/cncf/foundation/blob/main/code-of-conduct/coc-incident-resolution-procedures.md). You can expect a response within three business days. 92 | 93 | For incidents occurring at CNCF event that is produced by the Linux Foundation, please contact . 94 | 95 | ## Frequently asked questions 96 | For more information about this Code of Conduct, please see the [CNCF Code of Conduct Frequently Asked Questions](https://www.cncf.io/conduct/faq/). 97 | 98 | ## Enforcement 99 | 100 | Upon review and investigation of a reported incident, the CoC response team that has jurisdiction will determine what action is appropriate based on this Code of Conduct and its related documentation. 101 | 102 | For information about which Code of Conduct incidents are handled by project leadership, which incidents are handled by the CNCF Code of Conduct Committee, and which incidents are handled by the Linux Foundation (including its events team), see our [Jurisdiction Policy](https://github.com/cncf/foundation/blob/main/code-of-conduct/coc-committee-jurisdiction-policy.md). 103 | 104 | ## Amendments 105 | 106 | Consistent with the CNCF Charter, any substantive changes to this Code of Conduct must be approved by the Technical Oversight Committee. 
107 | 108 | ## Acknowledgements 109 | 110 | This Code of Conduct is adapted from the Contributor Covenant 111 | (http://contributor-covenant.org), version 2.0 available at 112 | http://contributor-covenant.org/version/2/0/code_of_conduct/ 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 203 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | approvers: 2 | - yuzisun 3 | - adriangonz 4 | reviewers: 5 | - yuzisun 6 | - adriangonz 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Open Inference Protocol Specification 2 | 3 | The Open Inference Protocol(OIP) specification defines a standard protocol for performing machine learning model inference across 4 | serving runtimes for different ML frameworks. The protocol facilitates the implementation of a standardized and high performance data plane, 5 | promoting interoperability among model serving runtimes. The specification enables the creation of cohesive inference experience, 6 | empowering the development of versatile client or benchmarking tools that can work with all 7 | supported serving runtimes. 8 | 9 | - The inference REST [specification](./specification/protocol/inference_rest.md) 10 | - The inference gRPC [specification](./specification/protocol/inference_grpc.md) 11 | 12 | ## Adoptions 13 | - KServe [v2 inference protocol](https://kserve.github.io/website/master/modelserving/data_plane/v2_protocol/) 14 | - NVIDIA [Triton inference server protocol](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/customization_guide/inference_protocols.html) 15 | - Seldon [MLServer](https://mlserver.readthedocs.io/en/stable/user-guide/content-type.html) 16 | - Seldon Core [v2 inference protocol](https://docs.seldon.io/projects/seldon-core/en/v2/contents/getting-started/#api-for-inference) 17 | - OpenVino [RESTful API](https://docs.openvino.ai/latest/ovms_docs_rest_api_kfs.html) and [gRPC API](https://docs.openvino.ai/latest/ovms_docs_grpc_api_kfs.html) 18 | - AMD [Inference Server](https://xilinx.github.io/inference-server/main/kserve.html) 19 | - TorchServe [Inference API](https://github.com/pytorch/serve/tree/master/kubernetes/kserve) 20 | 21 | ## Versioning the Specification 22 | Changes to the specification are versioned according to Semantic Versioning 2.0 and described in [CHANGELOG.md](CHANGELOG.md). Layout changes are not versioned. Specific implementations of the specification should specify which version they implement. 23 | 24 | ## Community meeting 25 | We have a public monthly community meeting on Wed 10AM US/Pacific. Please [map that to your local time](https://www.google.com/search?q=1000+am+in+pst&hl=en). 26 | 27 | You can also find these meetings [on the community calendar](https://zoom-lfx.platform.linuxfoundation.org/meetings/kserve?view=month), along with other major community events. 28 | 29 | You can find the meeting minutes from the monthly work group sessions in this [Google Doc](https://docs.google.com/document/d/1f21bja1ejHPrZRmY5ke0UxKVD26j0VntJxx0qGN3fKE). 30 | 31 | You can access the meeting recordings on [the community calendar](https://zoom-lfx.platform.linuxfoundation.org/meetings/kserve?view=month) by clicking on the respective date's event details. 32 | 33 | ## Questions and issues 34 | 35 | For questions or issues, you can use: 36 | 1. `#kserve-oip-collaboration` channel in CNCF Slack, please follow the steps below: 37 | 1. Create your Slack account [here](https://slack.cncf.io/) 38 | 2. 
Search for `#kserve-oip-collaboration` channel or join directly via [this link](https://cloud-native.slack.com/archives/C06P4SYCNRX) 39 | 40 | For bug reports and features requests, please use [Open Inference Protocol issues](https://github.com/kserve/open-inference-protocol/issues). 41 | 42 | ## License 43 | By contributing to Open Inference Protocol Specification repository, you agree that your contributions will be licensed under its Apache 2.0 License. 44 | -------------------------------------------------------------------------------- /minutes/README.md: -------------------------------------------------------------------------------- 1 | # Meeting Minutes 2 | 3 | You can find the meeting minutes from the monthly work group sessions in this [Google Doc](https://docs.google.com/document/d/1f21bja1ejHPrZRmY5ke0UxKVD26j0VntJxx0qGN3fKE). Use the links below to jump to the notes from a particular meeting. 4 | 5 | 6 | ## 10/04/2023 7 | - [Notes](https://docs.google.com/document/d/1f21bja1ejHPrZRmY5ke0UxKVD26j0VntJxx0qGN3fKE/edit#heading=h.634yxromwxj6) 8 | 9 | ## 09/06/2023 10 | - [Notes](https://docs.google.com/document/d/1f21bja1ejHPrZRmY5ke0UxKVD26j0VntJxx0qGN3fKE/edit#heading=h.he6eqblp1n0y) 11 | 12 | ## 08/09/2023 13 | - [Notes](https://docs.google.com/document/d/1f21bja1ejHPrZRmY5ke0UxKVD26j0VntJxx0qGN3fKE/edit#heading=h.ms1eyd6wd6il) 14 | 15 | ## 07/12/2023 16 | - [Notes](https://docs.google.com/document/d/1f21bja1ejHPrZRmY5ke0UxKVD26j0VntJxx0qGN3fKE/edit#heading=h.6g0atnw3j3l8) 17 | 18 | ## 05/31/2023 19 | - [Notes](https://docs.google.com/document/d/1W5I3G_Kc8MNPXg8LqppQZFsDPxmbNNw4HLPskUAncYU/edit) 20 | 21 | ## 04/24/2023 22 | 23 | - [Slide Deck](https://docs.google.com/presentation/d/10p9CngDjWwsvC3FKLtnDZNSTpGrZFOXr-hOQrkm4cgU/edit?usp=sharing) 24 | covering the rename to the Open Inference Protocol. 25 | - [Zoom Recording](https://us02web.zoom.us/rec/share/jCK5Kye_J9sslz0ZF1IcU3kHnCs7bgr4ecRkRDEAX6ncdWDJLSol4UYVDr42SjgN.xIpnOcPmkaHijSKL) 26 | -------------------------------------------------------------------------------- /specification/protocol/buf.yaml: -------------------------------------------------------------------------------- 1 | version: v1 2 | breaking: 3 | use: 4 | - FILE 5 | lint: 6 | use: 7 | - DEFAULT 8 | ignore_only: 9 | PACKAGE_VERSION_SUFFIX: 10 | - open_inference_grpc.proto 11 | PACKAGE_DIRECTORY_MATCH: 12 | - open_inference_grpc.proto 13 | allow_comment_ignores: true 14 | -------------------------------------------------------------------------------- /specification/protocol/generate_rest.yaml: -------------------------------------------------------------------------------- 1 | openapi: 3.1.0 2 | info: 3 | title: Open Inference API for text generation 4 | description: Open Inference API for text generation 5 | version: 1.0.0 6 | components: 7 | schemas: 8 | Details: 9 | type: object 10 | required: 11 | - finish_reason 12 | - logprobs 13 | additionalProperties: {} 14 | properties: 15 | finish_reason: 16 | $ref: '#/components/schemas/Finish_Reason' 17 | logprobs: 18 | $ref: '#/components/schemas/Logprobs' 19 | Finish_Reason: 20 | type: string 21 | enum: 22 | - length 23 | - eos_token 24 | - stop_sequence 25 | description: The reason the model stopped generating tokens. `length` if number of generated tokens == `max_tokens`. 
`eos_token` if the model generated its end of sequence token and `stop_sequence` if the model generated a text included in `stop` array 26 | GenerateErrorResponse: 27 | type: object 28 | required: 29 | - error 30 | properties: 31 | error: 32 | type: string 33 | GenerateParameters: 34 | type: object 35 | additionalProperties: {} 36 | properties: 37 | temperature: 38 | type: number 39 | format: float 40 | default: 1 41 | minimum: 0 42 | description: What sampling temperature to use, higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. 43 | top_p: 44 | type: number 45 | format: float 46 | maximum: 1 47 | minimum: 0 48 | description: An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. 49 | max_tokens: 50 | type: integer 51 | format: int32 52 | default: 20 53 | minimum: 1 54 | description: The maximum number of tokens to generate in the completion. 55 | stop: 56 | type: array 57 | items: 58 | type: string 59 | description: Sequences where the API will stop generating further tokens. 60 | details: 61 | type: boolean 62 | description: Flag to request for detailed response body that would include finish_reason and logprobs. 63 | GenerateRequest: 64 | type: object 65 | required: 66 | - text_input 67 | properties: 68 | text_input: 69 | type: string 70 | parameters: 71 | allOf: 72 | - $ref: '#/components/schemas/GenerateParameters' 73 | GenerateResponse: 74 | type: object 75 | required: 76 | - text_output 77 | - model_name 78 | properties: 79 | text_output: 80 | type: string 81 | model_name: 82 | type: string 83 | model_version: 84 | type: string 85 | details: 86 | $ref: '#/components/schemas/Details' 87 | GenerateStreamResponse: 88 | type: object 89 | required: 90 | - text_output 91 | - model_name 92 | properties: 93 | text_output: 94 | type: string 95 | model_name: 96 | type: string 97 | model_version: 98 | type: string 99 | details: 100 | $ref: '#/components/schemas/StreamDetails' 101 | Logprobs: 102 | type: array 103 | items: 104 | $ref: '#/components/schemas/Token' 105 | description: Log probability information for the tokens. 106 | StreamDetails: 107 | type: object 108 | required: 109 | - finish_reason 110 | - token 111 | additionalProperties: {} 112 | properties: 113 | finish_reason: 114 | $ref: '#/components/schemas/Finish_Reason' 115 | token: 116 | $ref: '#/components/schemas/Token' 117 | Token: 118 | type: object 119 | required: 120 | - id 121 | - text 122 | - logprob 123 | - special 124 | properties: 125 | id: 126 | type: integer 127 | format: int32 128 | minimum: 0 129 | description: Id of the token. 130 | logprob: 131 | type: number 132 | format: float 133 | description: The log probability of this token. 134 | special: 135 | type: boolean 136 | description: Describes if the token is a special token. Can be used to ignore tokens when concatenating 137 | text: 138 | type: string 139 | description: The token text value. 
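# Illustrative, non-normative example of the schemas above: a GenerateRequest
# body such as
#   {"text_input": "Explain MLOps in one sentence.",
#    "parameters": {"temperature": 0.7, "max_tokens": 32, "details": true}}
# could produce a GenerateResponse such as
#   {"text_output": "MLOps applies DevOps practices to ...",
#    "model_name": "example-llm", "model_version": "1",
#    "details": {"finish_reason": "length", "logprobs": [
#      {"id": 42, "text": "MLOps", "logprob": -0.8, "special": false}]}}
# (the model name, version, and values are placeholders).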
140 | paths: 141 | /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate: 142 | post: 143 | parameters: 144 | - name: MODEL_NAME 145 | required: true 146 | in: path 147 | schema: 148 | type: string 149 | - name: MODEL_VERSION 150 | required: true 151 | in: path 152 | schema: 153 | type: string 154 | requestBody: 155 | content: 156 | application/json: 157 | schema: 158 | $ref: '#/components/schemas/GenerateRequest' 159 | responses: 160 | '200': 161 | description: generated text 162 | content: 163 | application/json: 164 | schema: 165 | $ref: '#/components/schemas/GenerateResponse' 166 | '422': 167 | description: Input validation error 168 | content: 169 | application/json: 170 | schema: 171 | $ref: '#/components/schemas/GenerateErrorResponse' 172 | example: 173 | error: Input validation error 174 | '424': 175 | description: Generation Error 176 | content: 177 | application/json: 178 | schema: 179 | $ref: '#/components/schemas/GenerateErrorResponse' 180 | example: 181 | error: Request failed during generation 182 | '429': 183 | description: Model is overloaded 184 | content: 185 | application/json: 186 | schema: 187 | $ref: '#/components/schemas/GenerateErrorResponse' 188 | example: 189 | error: Model is overloaded 190 | '500': 191 | description: Incomplete generation 192 | content: 193 | application/json: 194 | schema: 195 | $ref: '#/components/schemas/GenerateErrorResponse' 196 | example: 197 | error: Incomplete generation 198 | 199 | /v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/generate_stream: 200 | post: 201 | parameters: 202 | - name: MODEL_NAME 203 | required: true 204 | in: path 205 | schema: 206 | type: string 207 | - name: MODEL_VERSION 208 | required: true 209 | in: path 210 | schema: 211 | type: string 212 | requestBody: 213 | content: 214 | application/json: 215 | schema: 216 | $ref: '#/components/schemas/GenerateRequest' 217 | responses: 218 | '200': 219 | description: generated text stream 220 | content: 221 | text/event-stream: 222 | schema: 223 | $ref: '#/components/schemas/GenerateStreamResponse' 224 | '422': 225 | description: Input validation error 226 | content: 227 | text/event-stream: 228 | schema: 229 | $ref: '#/components/schemas/GenerateErrorResponse' 230 | example: 231 | error: Input validation error 232 | '424': 233 | description: Generation Error 234 | content: 235 | text/event-stream: 236 | schema: 237 | $ref: '#/components/schemas/GenerateErrorResponse' 238 | example: 239 | error: Request failed during generation 240 | '429': 241 | description: Model is overloaded 242 | content: 243 | text/event-stream: 244 | schema: 245 | $ref: '#/components/schemas/GenerateErrorResponse' 246 | example: 247 | error: Model is overloaded 248 | '500': 249 | description: Incomplete generation 250 | content: 251 | text/event-stream: 252 | schema: 253 | $ref: '#/components/schemas/GenerateErrorResponse' 254 | example: 255 | error: Incomplete generation 256 | -------------------------------------------------------------------------------- /specification/protocol/inference_grpc.md: -------------------------------------------------------------------------------- 1 | ## gRPC 2 | 3 | The GRPC API closely follows the concepts defined in the 4 | [HTTP/REST](./inference_rest.md) API. A compliant server must implement the 5 | health, metadata, and inference APIs described in this section. 
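As a non-normative illustration of how these endpoints fit together, a minimal Python client might look like the sketch below. It assumes stubs generated from [open_inference_grpc.proto](./open_inference_grpc.proto) with `grpcio-tools` (the `open_inference_grpc_pb2` / `open_inference_grpc_pb2_grpc` module names follow protoc's defaults) and a compliant server listening on `localhost:9000`; the address, model name, and tensor values are placeholders.

    import grpc

    # Stubs assumed to be generated with:
    #   python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. open_inference_grpc.proto
    import open_inference_grpc_pb2 as oip
    import open_inference_grpc_pb2_grpc as oip_grpc

    channel = grpc.insecure_channel("localhost:9000")   # placeholder address
    stub = oip_grpc.GRPCInferenceServiceStub(channel)

    # Health: liveness, then per-model readiness.
    assert stub.ServerLive(oip.ServerLiveRequest()).live
    assert stub.ModelReady(oip.ModelReadyRequest(name="example-model")).ready

    # Inference: a single FP32 input tensor of shape [1, 3] using the typed
    # InferTensorContents representation (raw_input_contents is the alternative).
    request = oip.ModelInferRequest(
        model_name="example-model",
        inputs=[
            oip.ModelInferRequest.InferInputTensor(
                name="input-0",
                datatype="FP32",
                shape=[1, 3],
                contents=oip.InferTensorContents(fp32_contents=[1.0, 2.0, 3.0]),
            )
        ],
    )
    response = stub.ModelInfer(request)
    print(response.model_name, response.outputs[0].name, response.outputs[0].shape)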
6 | 7 | 8 | | API | rpc Endpoint | Request Message | Response Message | 9 | | --- | --- | --- | ---| 10 | | Inference | [ModelInfer](#inference) | ModelInferRequest | ModelInferResponse | 11 | | Model Ready | [ModelReady](#model-ready) | ModelReadyRequest | ModelReadyResponse | 12 | | Model Metadata | [ModelMetadata](#model-metadata)| ModelMetadataRequest | ModelMetadataResponse | 13 | | Server Ready | [ServerReady](#server-ready) | ServerReadyRequest | ServerReadyResponse | 14 | | Server Live | [ServerLive](#server-live) | ServerLiveRequest | ServerLiveResponse | 15 | 16 | For more detailed information on each endpoint and its contents, see `API Definitions` and `Message Contents`. 17 | 18 | See also: The gRPC endpoints, request/response messages and contents are defined in [open_inference_grpc.proto](./open_inference_grpc.proto) 19 | 20 | 21 | ### **API Definitions** 22 | 23 | The GRPC definition of the service is: 24 | 25 | // 26 | // Inference Server GRPC endpoints. 27 | // 28 | service GRPCInferenceService 29 | { 30 | // Check liveness of the inference server. 31 | rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {} 32 | 33 | // Check readiness of the inference server. 34 | rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {} 35 | 36 | // Check readiness of a model in the inference server. 37 | rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {} 38 | 39 | // Get server metadata. 40 | rpc ServerMetadata(ServerMetadataRequest) returns (ServerMetadataResponse) {} 41 | 42 | // Get model metadata. 43 | rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {} 44 | 45 | // Perform inference using a specific model. 46 | rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {} 47 | } 48 | 49 | ### **Message Contents** 50 | 51 | ### Health 52 | 53 | A health request is made using the ServerLive, ServerReady, or 54 | ModelReady endpoint. For each of these endpoints errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. 55 | 56 | #### Server Live 57 | 58 | The ServerLive API indicates if the inference server is able to 59 | receive and respond to metadata and inference requests. The request 60 | and response messages for ServerLive are: 61 | 62 | message ServerLiveRequest {} 63 | 64 | message ServerLiveResponse 65 | { 66 | // True if the inference server is live, false if not live. 67 | bool live = 1; 68 | } 69 | 70 | #### Server Ready 71 | 72 | The ServerReady API indicates if the server is ready for 73 | inferencing. The request and response messages for ServerReady are: 74 | 75 | message ServerReadyRequest {} 76 | 77 | message ServerReadyResponse 78 | { 79 | // True if the inference server is ready, false if not ready. 80 | bool ready = 1; 81 | } 82 | 83 | #### Model Ready 84 | 85 | The ModelReady API indicates if a specific model is ready for 86 | inferencing. The request and response messages for ModelReady are: 87 | 88 | message ModelReadyRequest 89 | { 90 | // The name of the model to check for readiness. 91 | string name = 1; 92 | 93 | // The version of the model to check for readiness. If not given the 94 | // server will choose a version based on the model and internal policy. 95 | string version = 2; 96 | } 97 | 98 | message ModelReadyResponse 99 | { 100 | // True if the model is ready, false if not ready. 
101 | bool ready = 1; 102 | } 103 | 104 | --- 105 | 106 | ### Metadata 107 | 108 | #### Server Metadata 109 | 110 | The ServerMetadata API provides information about the server. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate failure. The request and response messages for ServerMetadata are: 111 | 112 | message ServerMetadataRequest {} 113 | 114 | message ServerMetadataResponse 115 | { 116 | // The server name. 117 | string name = 1; 118 | 119 | // The server version. 120 | string version = 2; 121 | 122 | // The extensions supported by the server. 123 | repeated string extensions = 3; 124 | } 125 | 126 | #### Model Metadata 127 | 128 | The per-model metadata API provides information about a model. Errors 129 | are indicated by the google.rpc.Status returned for the request. The 130 | OK code indicates success and other codes indicate failure. The 131 | request and response messages for ModelMetadata are: 132 | 133 | message ModelMetadataRequest 134 | { 135 | // The name of the model. 136 | string name = 1; 137 | 138 | // The version of the model to check for readiness. If not given the 139 | // server will choose a version based on the model and internal policy. 140 | string version = 2; 141 | } 142 | 143 | message ModelMetadataResponse 144 | { 145 | // Metadata for a tensor. 146 | message TensorMetadata 147 | { 148 | // The tensor name. 149 | string name = 1; 150 | 151 | // The tensor data type. 152 | string datatype = 2; 153 | 154 | // The tensor shape. A variable-size dimension is represented 155 | // by a -1 value. 156 | repeated int64 shape = 3; 157 | } 158 | 159 | // The model name. 160 | string name = 1; 161 | 162 | // The versions of the model available on the server. 163 | repeated string versions = 2; 164 | 165 | // The model's platform. See Platforms. 166 | string platform = 3; 167 | 168 | // The model's inputs. 169 | repeated TensorMetadata inputs = 4; 170 | 171 | // The model's outputs. 172 | repeated TensorMetadata outputs = 5; 173 | } 174 | 175 | #### Platforms 176 | 177 | A platform is a string indicating a DL/ML framework or 178 | backend. Platform is returned as part of the response to a 179 | [Model Metadata](#model_metadata) request but is information only. The 180 | proposed inference APIs are generic relative to the DL/ML framework 181 | used by a model and so a client does not need to know the platform of 182 | a given model to use the API. Platform names use the format 183 | “_”. The following platform names are allowed: 184 | 185 | * tensorrt_plan : A TensorRT model encoded as a serialized engine or “plan”. 186 | * tensorflow_graphdef : A TensorFlow model encoded as a GraphDef. 187 | * tensorflow_savedmodel : A TensorFlow model encoded as a SavedModel. 188 | * onnx_onnxv1 : A ONNX model encoded for ONNX Runtime. 189 | * pytorch_torchscript : A PyTorch model encoded as TorchScript. 190 | * mxnet_mxnet: An MXNet model 191 | * caffe2_netdef : A Caffe2 model encoded as a NetDef. 192 | 193 | --- 194 | 195 | ### Inference 196 | 197 | The ModelInfer API performs inference using the specified 198 | model. Errors are indicated by the google.rpc.Status returned for the request. The OK code indicates success and other codes indicate 199 | failure. The request and response messages for ModelInfer are: 200 | 201 | message ModelInferRequest 202 | { 203 | // An input tensor for an inference request. 204 | message InferInputTensor 205 | { 206 | // The tensor name. 
207 | string name = 1; 208 | 209 | // The tensor data type. 210 | string datatype = 2; 211 | 212 | // The tensor shape. 213 | repeated int64 shape = 3; 214 | 215 | // Optional inference input tensor parameters. 216 | map parameters = 4; 217 | 218 | // The tensor contents using a data-type format. This field must 219 | // not be specified if "raw" tensor contents are being used for 220 | // the inference request. 221 | InferTensorContents contents = 5; 222 | } 223 | 224 | // An output tensor requested for an inference request. 225 | message InferRequestedOutputTensor 226 | { 227 | // The tensor name. 228 | string name = 1; 229 | 230 | // Optional requested output tensor parameters. 231 | map parameters = 2; 232 | } 233 | 234 | // The name of the model to use for inferencing. 235 | string model_name = 1; 236 | 237 | // The version of the model to use for inference. If not given the 238 | // server will choose a version based on the model and internal policy. 239 | string model_version = 2; 240 | 241 | // Optional identifier for the request. If specified will be 242 | // returned in the response. 243 | string id = 3; 244 | 245 | // Optional inference parameters. 246 | map parameters = 4; 247 | 248 | // The input tensors for the inference. 249 | repeated InferInputTensor inputs = 5; 250 | 251 | // The requested output tensors for the inference. Optional, if not 252 | // specified all outputs produced by the model will be returned. 253 | repeated InferRequestedOutputTensor outputs = 6; 254 | 255 | // The data contained in an input tensor can be represented in "raw" 256 | // bytes form or in the repeated type that matches the tensor's data 257 | // type. To use the raw representation 'raw_input_contents' must be 258 | // initialized with data for each tensor in the same order as 259 | // 'inputs'. For each tensor, the size of this content must match 260 | // what is expected by the tensor's shape and data type. The raw 261 | // data must be the flattened, one-dimensional, row-major order of 262 | // the tensor elements without any stride or padding between the 263 | // elements. Note that the FP16 data type must be represented as raw 264 | // content as there is no specific data type for a 16-bit float 265 | // type. 266 | // 267 | // If this field is specified then InferInputTensor::contents must 268 | // not be specified for any input tensor. 269 | repeated bytes raw_input_contents = 7; 270 | } 271 | 272 | message ModelInferResponse 273 | { 274 | // An output tensor returned for an inference request. 275 | message InferOutputTensor 276 | { 277 | // The tensor name. 278 | string name = 1; 279 | 280 | // The tensor data type. 281 | string datatype = 2; 282 | 283 | // The tensor shape. 284 | repeated int64 shape = 3; 285 | 286 | // Optional output tensor parameters. 287 | map parameters = 4; 288 | 289 | // The tensor contents using a data-type format. This field must 290 | // not be specified if "raw" tensor contents are being used for 291 | // the inference response. 292 | InferTensorContents contents = 5; 293 | } 294 | 295 | // The name of the model used for inference. 296 | string model_name = 1; 297 | 298 | // The version of the model used for inference. 299 | string model_version = 2; 300 | 301 | // The id of the inference request if one was specified. 302 | string id = 3; 303 | 304 | // Optional inference response parameters. 305 | map parameters = 4; 306 | 307 | // The output tensors holding inference results. 
308 | repeated InferOutputTensor outputs = 5; 309 | 310 | // The data contained in an output tensor can be represented in 311 | // "raw" bytes form or in the repeated type that matches the 312 | // tensor's data type. To use the raw representation 'raw_output_contents' 313 | // must be initialized with data for each tensor in the same order as 314 | // 'outputs'. For each tensor, the size of this content must match 315 | // what is expected by the tensor's shape and data type. The raw 316 | // data must be the flattened, one-dimensional, row-major order of 317 | // the tensor elements without any stride or padding between the 318 | // elements. Note that the FP16 data type must be represented as raw 319 | // content as there is no specific data type for a 16-bit float 320 | // type. 321 | // 322 | // If this field is specified then InferOutputTensor::contents must 323 | // not be specified for any output tensor. 324 | repeated bytes raw_output_contents = 6; 325 | } 326 | 327 | #### Parameters 328 | 329 | The Parameters message describes a “name”/”value” pair, where the 330 | “name” is the name of the parameter and the “value” is a boolean, 331 | integer, or string corresponding to the parameter. 332 | 333 | Currently, no parameters are defined. As required a future proposal may define one or more standard parameters to allow portable functionality across different inference servers. A server can implement server-specific parameters to provide non-standard capabilities. 334 | 335 | // 336 | // An inference parameter value. 337 | // 338 | message InferParameter 339 | { 340 | // The parameter value can be a string, an int64, a boolean 341 | // or a message specific to a predefined parameter. 342 | oneof parameter_choice 343 | { 344 | // A boolean parameter value. 345 | bool bool_param = 1; 346 | 347 | // An int64 parameter value. 348 | int64 int64_param = 2; 349 | 350 | // A string parameter value. 351 | string string_param = 3; 352 | } 353 | } 354 | 355 | --- 356 | 357 | ### Tensor Data 358 | 359 | In all representations tensor data must be flattened to a 360 | one-dimensional, row-major order of the tensor elements. Element 361 | values must be given in "linear" order without any stride or padding 362 | between elements. 363 | 364 | Using a "raw" representation of tensors with 365 | ModelInferRequest::raw_input_contents and 366 | ModelInferResponse::raw_output_contents will typically allow higher 367 | performance due to the way protobuf allocation and reuse interacts 368 | with GRPC. For example, see https://github.com/grpc/grpc/issues/23231. 369 | 370 | An alternative to the "raw" representation is to use 371 | InferTensorContents to represent the tensor data in a format that 372 | matches the tensor's data type. 373 | 374 | // 375 | // The data contained in a tensor represented by the repeated type 376 | // that matches the tensor's data type. Protobuf oneof is not used 377 | // because oneofs cannot contain repeated fields. 378 | // 379 | message InferTensorContents 380 | { 381 | // Representation for BOOL data type. The size must match what is 382 | // expected by the tensor's shape. The contents must be the flattened, 383 | // one-dimensional, row-major order of the tensor elements. 384 | repeated bool bool_contents = 1; 385 | 386 | // Representation for INT8, INT16, and INT32 data types. The size 387 | // must match what is expected by the tensor's shape. The contents 388 | // must be the flattened, one-dimensional, row-major order of the 389 | // tensor elements. 
390 | repeated int32 int_contents = 2; 391 | 392 | // Representation for INT64 data types. The size must match what 393 | // is expected by the tensor's shape. The contents must be the 394 | // flattened, one-dimensional, row-major order of the tensor elements. 395 | repeated int64 int64_contents = 3; 396 | 397 | // Representation for UINT8, UINT16, and UINT32 data types. The size 398 | // must match what is expected by the tensor's shape. The contents 399 | // must be the flattened, one-dimensional, row-major order of the 400 | // tensor elements. 401 | repeated uint32 uint_contents = 4; 402 | 403 | // Representation for UINT64 data types. The size must match what 404 | // is expected by the tensor's shape. The contents must be the 405 | // flattened, one-dimensional, row-major order of the tensor elements. 406 | repeated uint64 uint64_contents = 5; 407 | 408 | // Representation for FP32 data type. The size must match what is 409 | // expected by the tensor's shape. The contents must be the flattened, 410 | // one-dimensional, row-major order of the tensor elements. 411 | repeated float fp32_contents = 6; 412 | 413 | // Representation for FP64 data type. The size must match what is 414 | // expected by the tensor's shape. The contents must be the flattened, 415 | // one-dimensional, row-major order of the tensor elements. 416 | repeated double fp64_contents = 7; 417 | 418 | // Representation for BYTES data type. The size must match what is 419 | // expected by the tensor's shape. The contents must be the flattened, 420 | // one-dimensional, row-major order of the tensor elements. 421 | repeated bytes bytes_contents = 8; 422 | } 423 | 424 | #### Tensor Data Types 425 | 426 | Tensor data types are shown in the following table along with the size 427 | of each type, in bytes. 428 | 429 | 430 | | Data Type | Size (bytes) | 431 | | --------- | ------------ | 432 | | BOOL | 1 | 433 | | UINT8 | 1 | 434 | | UINT16 | 2 | 435 | | UINT32 | 4 | 436 | | UINT64 | 8 | 437 | | INT8 | 1 | 438 | | INT16 | 2 | 439 | | INT32 | 4 | 440 | | INT64 | 8 | 441 | | FP16 | 2 | 442 | | FP32 | 4 | 443 | | FP64 | 8 | 444 | | BYTES | Variable (max 232) | 445 | 446 | --- 447 | -------------------------------------------------------------------------------- /specification/protocol/inference_rest.md: -------------------------------------------------------------------------------- 1 | ## HTTP/REST 2 | 3 | The HTTP/REST API uses JSON because it is widely supported and 4 | language independent. In all JSON schemas shown in this document 5 | $number, $string, $boolean, $object and $array refer to the 6 | fundamental JSON types. #optional indicates an optional JSON field. 
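As a non-normative example of this notation, the sketch below uses Python's `requests` package to call the metadata endpoints defined later in this document; the base URL and model name are placeholders for a real deployment.

    import requests

    BASE = "http://localhost:8000"   # placeholder server address

    # $metadata_server_response: "name", "version", and "extensions" fields
    server_meta = requests.get(f"{BASE}/v2").json()
    print(server_meta["name"], server_meta["version"], server_meta["extensions"])

    # $metadata_model_response for a model named "example-model"; the version
    # path segment may be omitted when the server is left to pick a version
    model_meta = requests.get(f"{BASE}/v2/models/example-model").json()
    for tensor in model_meta["inputs"]:          # each entry is a $metadata_tensor
        print(tensor["name"], tensor["datatype"], tensor["shape"])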
7 | Inference Request Examples
8 | 
9 | See also: The HTTP/REST endpoints are defined in [open_inference_rest.yaml](./open_inference_rest.yaml)
10 | 
11 | | API | Verb | Path | Request Payload | Response Payload |
12 | | ------------- | ------------- | ------------- | ------------- | ------------- |
13 | | Inference | POST | v2/models/\[/versions/\]/infer | [$inference_request](#inference-request-json-object) | [$inference_response](#inference-response-json-object) |
14 | | Model Metadata | GET | v2/models/\[/versions/\] | | [$metadata_model_response](#model-metadata-response-json-object) |
15 | | Server Ready | GET | v2/health/ready | | [$ready_server_response](#server-ready-response-json-object) |
16 | | Server Live | GET | v2/health/live | | [$live_server_response](#server-live-response-json-objet)|
17 | | Server Metadata | GET | v2 | | [$metadata_server_response](#server-metadata-response-json-object) |
18 | | Model Ready | GET | v2/models/\[/versions/\]/ready | | [$ready_model_response](#model-ready-response-json-object) |
19 | 
20 | ** path contents in `[]` are optional
21 | 
22 | For more information regarding payload contents, see `Payload Contents`.
23 | 
24 | The versions portion of the `Path` URLs (in `[]`) is shown as **optional** to allow implementations that don’t support versioning or for cases when the user does not want to specify a specific model version (in which case the server will choose a version based on its own policies).
25 | For example, if a model does not implement a version, the Model Metadata request path could look like `v2/models/my_model`. If the model has been configured to implement a version, the request path could look something like `v2/models/my_model/versions/v10`, where the version of the model is v10.
26 | 
27 | 
28 | 
29 | ### **API Definitions**
30 | 
31 | | API | Definition |
32 | | --- | --- |
33 | | Inference | The `/infer` endpoint performs inference on a model. The response is the prediction result.|
34 | | Model Metadata | The "model metadata" API is a per-model endpoint that returns details about the model passed in the path. |
35 | | Server Ready | The “server ready” health API indicates if all the models are ready for inferencing. The “server ready” health API can be used directly to implement the Kubernetes readinessProbe. |
36 | | Server Live | The “server live” health API indicates if the inference server is able to receive and respond to metadata and inference requests. The “server live” API can be used directly to implement the Kubernetes livenessProbe. |
37 | | Server Metadata | The "server metadata" API returns details describing the server. |
38 | | Model Ready | The “model ready” health API indicates if a specific model is ready for inferencing. The model name and (optionally) version must be available in the URL. |
39 | 
40 | ### Health/Readiness/Liveness Probes
41 | 
42 | The Model Readiness probe answers the question "Was the model successfully downloaded and loaded onto the server to be able to run inference requests?" and responds with the available model name(s). The Server Readiness/Liveness probes answer the question "Is my service and its infrastructure running, healthy, and able to receive and process requests?"
43 | 
44 | To read more about liveness and readiness probe concepts, visit the [Configure Liveness, Readiness and Startup Probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/)
45 | Kubernetes documentation.
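For illustration only, these probes can be exercised with plain HTTP GET calls; the sketch below assumes a server at `localhost:8000` and a model named `example-model`, and hits the same paths a Kubernetes `httpGet` readiness or liveness probe would typically point at.

    import requests

    BASE = "http://localhost:8000"   # placeholder server address
    MODEL = "example-model"          # placeholder model name

    live = requests.get(f"{BASE}/v2/health/live").json()                  # $live_server_response
    ready = requests.get(f"{BASE}/v2/health/ready").json()                # $ready_server_response
    model_ready = requests.get(f"{BASE}/v2/models/{MODEL}/ready").json()  # $ready_model_response

    print("server live:", live["live"])
    print("server ready:", ready)
    print("model ready:", model_ready["ready"])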
46 | 47 | ### **Payload Contents** 48 | 49 | ### **Model Ready** 50 | 51 | The model ready endpoint returns the readiness probe response for the server along with the name of the model. 52 | 53 | #### Model Ready Response JSON Object 54 | 55 | 56 | $ready_model_response = 57 | { 58 | "name" : $string, 59 | "ready": $bool 60 | } 61 | 62 | 63 | ### Server Ready 64 | 65 | The server ready endpoint returns the readiness probe response for the server. 66 | 67 | #### Server Ready Response JSON Object 68 | 69 | $ready_server_response = 70 | { 71 | "live" : $bool, 72 | } 73 | 74 | --- 75 | 76 | ### Server Live 77 | 78 | The server live endpoint returns the liveness probe response for the server. 79 | 80 | #### Server Live Response JSON Objet 81 | 82 | $live_server_response = 83 | { 84 | "live" : $bool, 85 | } 86 | 87 | --- 88 | 89 | ### Server Metadata 90 | 91 | The server metadata endpoint provides information about the server. A 92 | server metadata request is made with an HTTP GET to a server metadata 93 | endpoint. In the corresponding response the HTTP body contains the 94 | [Server Metadata Response JSON Object](#server-metadata-response-json-object) 95 | or the 96 | [Server Metadata Response JSON Error Object](#server-metadata-response-json-error-object). 97 | 98 | #### Server Metadata Response JSON Object 99 | 100 | A successful server metadata request is indicated by a 200 HTTP status 101 | code. The server metadata response object, identified as 102 | *$metadata_server_response*, is returned in the HTTP body. 103 | 104 | $metadata_server_response = 105 | { 106 | "name" : $string, 107 | "version" : $string, 108 | "extensions" : [ $string, ... ] 109 | } 110 | 111 | * “name” : A descriptive name for the server. 112 | * "version" : The server version. 113 | * “extensions” : The extensions supported by the server. Currently, no standard extensions are defined. Individual inference servers may define and document their own extensions. 114 | 115 | 116 | #### Server Metadata Response JSON Error Object 117 | 118 | A failed server metadata request must be indicated by an HTTP error 119 | status (typically 400). The HTTP body must contain the 120 | *$metadata_server_error_response* object. 121 | 122 | $metadata_server_error_response = 123 | { 124 | "error": $string 125 | } 126 | 127 | * “error” : The descriptive message for the error. 128 | 129 | --- 130 | 131 | ### Model Metadata 132 | 133 | The per-model metadata endpoint provides information about a model. A 134 | model metadata request is made with an HTTP GET to a model metadata 135 | endpoint. In the corresponding response the HTTP body contains the 136 | [Model Metadata Response JSON Object](#model-metadata-response-json-object) 137 | or the 138 | [Model Metadata Response JSON Error Object](#model-metadata-response-json-error-object). 139 | The model name and (optionally) version must be available in the 140 | URL. If a version is not provided the server may choose a version 141 | based on its own policies or return an error. 142 | 143 | #### Model Metadata Response JSON Object 144 | 145 | A successful model metadata request is indicated by a 200 HTTP status 146 | code. The metadata response object, identified as 147 | *$metadata_model_response*, is returned in the HTTP body for every 148 | successful model metadata request. 149 | 150 | $metadata_model_response = 151 | { 152 | "name" : $string, 153 | "versions" : [ $string, ... ] #optional, 154 | "platform" : $string, 155 | "inputs" : [ $metadata_tensor, ... 
], 156 | "outputs" : [ $metadata_tensor, ... ] 157 | } 158 | 159 | * “name” : The name of the model. 160 | * "versions" : The model versions that may be explicitly requested via 161 | the appropriate endpoint. Optional for servers that don’t support 162 | versions. Optional for models that don’t allow a version to be 163 | explicitly requested. 164 | * “platform” : The framework/backend for the model. See 165 | [Platforms](#platforms). 166 | * “inputs” : The inputs required by the model. 167 | * “outputs” : The outputs produced by the model. 168 | 169 | Each model input and output tensors’ metadata is described with a 170 | *$metadata_tensor object*. 171 | 172 | $metadata_tensor = 173 | { 174 | "name" : $string, 175 | "datatype" : $string, 176 | "shape" : [ $number, ... ] 177 | } 178 | 179 | * “name” : The name of the tensor. 180 | * "datatype" : The data-type of the tensor elements as defined in 181 | [Tensor Data Types](#tensor-data-types). 182 | * "shape" : The shape of the tensor. Variable-size dimensions are 183 | specified as -1. 184 | 185 | #### Model Metadata Response JSON Error Object 186 | 187 | A failed model metadata request must be indicated by an HTTP error 188 | status (typically 400). The HTTP body must contain the 189 | *$metadata_model_error_response* object. 190 | 191 | $metadata_model_error_response = 192 | { 193 | "error": $string 194 | } 195 | 196 | * “error” : The descriptive message for the error. 197 | 198 | #### Platforms 199 | 200 | A platform is a string indicating a DL/ML framework or 201 | backend. Platform is returned as part of the response to a 202 | [Model Metadata](#model-metadata) request but is information only. The 203 | proposed inference APIs are generic relative to the DL/ML framework 204 | used by a model and so a client does not need to know the platform of 205 | a given model to use the API. Platform names use the format 206 | “_”. The following platform names are allowed: 207 | 208 | * tensorrt_plan : A TensorRT model encoded as a serialized engine or “plan”. 209 | * tensorflow_graphdef : A TensorFlow model encoded as a GraphDef. 210 | * tensorflow_savedmodel : A TensorFlow model encoded as a SavedModel. 211 | * onnx_onnxv1 : A ONNX model encoded for ONNX Runtime. 212 | * pytorch_torchscript : A PyTorch model encoded as TorchScript. 213 | * mxnet_mxnet: An MXNet model 214 | * caffe2_netdef : A Caffe2 model encoded as a NetDef. 215 | --- 216 | 217 | ### Inference 218 | 219 | An inference request is made with an HTTP POST to an inference 220 | endpoint. In the request the HTTP body contains the 221 | [Inference Request JSON Object](#inference-request-json-object). In 222 | the corresponding response the HTTP body contains the 223 | [Inference Response JSON Object](#inference-response-json-object) or 224 | [Inference Response JSON Error Object](#inference-response-json-error-object). See 225 | [Inference Request Examples](#inference-request-examples) for some 226 | example HTTP/REST requests and responses. 227 | 228 | #### Inference Request JSON Object 229 | 230 | The inference request object, identified as *$inference_request*, is 231 | required in the HTTP body of the POST request. The model name and 232 | (optionally) version must be available in the URL. If a version is not 233 | provided the server may choose a version based on its own policies or 234 | return an error. 235 | 236 | $inference_request = 237 | { 238 | "id" : $string #optional, 239 | "parameters" : $parameters #optional, 240 | "inputs" : [ $request_input, ... 
], 241 | "outputs" : [ $request_output, ... ] #optional 242 | } 243 | 244 | * "id" : An identifier for this request. Optional, but if specified 245 | this identifier must be returned in the response. 246 | * "parameters" : An object containing zero or more parameters for this 247 | inference request expressed as key/value pairs. See 248 | [Parameters](#parameters) for more information. 249 | * "inputs" : The input tensors. Each input is described using the 250 | *$request_input* schema defined in [Request Input](#request-input). 251 | * "outputs" : The output tensors requested for this inference. Each 252 | requested output is described using the *$request_output* schema 253 | defined in [Request Output](#request-output). Optional, if not 254 | specified all outputs produced by the model will be returned using 255 | default *$request_output* settings. 256 | 257 | ##### Request Input 258 | 259 | The *$inference_request_input* JSON describes an input to the model. If the 260 | input is batched, the shape and data must represent the full shape and 261 | contents of the entire batch. 262 | 263 | $inference_request_input = 264 | { 265 | "name" : $string, 266 | "shape" : [ $number, ... ], 267 | "datatype" : $string, 268 | "parameters" : $parameters #optional, 269 | "data" : $tensor_data 270 | } 271 | 272 | * "name" : The name of the input tensor. 273 | * "shape" : The shape of the input tensor. Each dimension must be an 274 | integer representable as an unsigned 64-bit integer value. 275 | * "datatype" : The data-type of the input tensor elements as defined 276 | in [Tensor Data Types](#tensor-data-types). 277 | * "parameters" : An object containing zero or more parameters for this 278 | input expressed as key/value pairs. See [Parameters](#parameters) 279 | for more information. 280 | * "data": The contents of the tensor. See [Tensor Data](#tensor-data) 281 | for more information. 282 | 283 | ##### Request Output 284 | 285 | The *$request_output* JSON is used to request which output tensors 286 | should be returned from the model. 287 | 288 | $inference_request_output = 289 | { 290 | "name" : $string, 291 | "parameters" : $parameters #optional, 292 | } 293 | 294 | * "name" : The name of the output tensor. 295 | * "parameters" : An object containing zero or more parameters for this 296 | output expressed as key/value pairs. See [Parameters](#parameters) 297 | for more information. 298 | 299 | #### Inference Response JSON Object 300 | 301 | A successful inference request is indicated by a 200 HTTP status 302 | code. The inference response object, identified as 303 | *$inference_response*, is returned in the HTTP body. 304 | 305 | $inference_response = 306 | { 307 | "model_name" : $string, 308 | "model_version" : $string #optional, 309 | "id" : $string, 310 | "parameters" : $parameters #optional, 311 | "outputs" : [ $response_output, ... ] 312 | } 313 | 314 | * "model_name" : The name of the model used for inference. 315 | * "model_version" : The specific model version used for 316 | inference. Inference servers that do not implement versioning should 317 | not provide this field in the response. 318 | * "id" : The "id" identifier given in the request, if any. 319 | * "parameters" : An object containing zero or more parameters for this 320 | response expressed as key/value pairs. See [Parameters](#parameters) 321 | for more information. 322 | * "outputs" : The output tensors. Each output is described using the 323 | $response_output schema defined in 324 | [Response Output](#response-output). 
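To make the request and response objects concrete, the following non-normative sketch shows a client building an *$inference_request*, POSTing it to the inference endpoint, and reading the *$inference_response* fields described above. The server address `localhost:8000`, the model name `mymodel`, the input tensor, and the use of the Python `requests` package are assumptions for illustration only:

    import requests

    # Illustrative $inference_request: one FP32 input with flattened, row-major data.
    inference_request = {
        "id": "42",
        "inputs": [
            {
                "name": "input0",
                "shape": [2, 2],
                "datatype": "FP32",
                "data": [1.0, 2.0, 3.0, 4.0],
            }
        ],
        # "outputs" omitted: all outputs produced by the model are returned
        # using default $request_output settings.
    }

    resp = requests.post(
        "http://localhost:8000/v2/models/mymodel/infer", json=inference_request
    )
    resp.raise_for_status()            # a failed request is indicated by an HTTP error status
    inference_response = resp.json()   # the $inference_response object

    print(inference_response["model_name"], inference_response.get("model_version"))
    for output in inference_response["outputs"]:
        print(output["name"], output["datatype"], output["shape"], output["data"])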
325 | 326 | ##### Response Output 327 | 328 | The *$response_output* JSON describes an output from the model. If the 329 | output is batched, the shape and data represents the full shape of the 330 | entire batch. 331 | 332 | $response_output = 333 | { 334 | "name" : $string, 335 | "shape" : [ $number, ... ], 336 | "datatype" : $string, 337 | "parameters" : $parameters #optional, 338 | "data" : $tensor_data 339 | } 340 | 341 | * "name" : The name of the output tensor. 342 | * "shape" : The shape of the output tensor. Each dimension must be an 343 | integer representable as an unsigned 64-bit integer value. 344 | * "datatype" : The data-type of the output tensor elements as defined 345 | in [Tensor Data Types](#tensor-data-types). 346 | * "parameters" : An object containing zero or more parameters for this 347 | input expressed as key/value pairs. See [Parameters](#parameters) 348 | for more information. 349 | * “data”: The contents of the tensor. See [Tensor Data](#tensor-data) 350 | for more information. 351 | 352 | #### Inference Response JSON Error Object 353 | 354 | A failed inference request must be indicated by an HTTP error status 355 | (typically 400). The HTTP body must contain the 356 | *$inference_error_response* object. 357 | 358 | $inference_error_response = 359 | { 360 | "error": 361 | } 362 | 363 | * “error” : The descriptive message for the error. 364 | 365 | #### Parameters 366 | 367 | The *$parameters* JSON describes zero or more “name”/”value” pairs, 368 | where the “name” is the name of the parameter and the “value” is a 369 | $string, $number, or $boolean. 370 | 371 | $parameters = 372 | { 373 | $parameter, ... 374 | } 375 | 376 | $parameter = $string : $string | $number | $boolean 377 | 378 | Currently no parameters are defined. As required a future proposal may 379 | define one or more standard parameters to allow portable functionality 380 | across different inference servers. A server can implement 381 | server-specific parameters to provide non-standard capabilities. 382 | 383 | ### Tensor Data 384 | 385 | Tensor data must be presented in row-major order of the tensor 386 | elements. Element values must be given in "linear" order without any 387 | stride or padding between elements. Tensor elements may be presented 388 | in their nature multi-dimensional representation, or as a flattened 389 | one-dimensional representation. 390 | 391 | Tensor data given explicitly is provided in a JSON array. Each element 392 | of the array may be an integer, floating-point number, string or 393 | boolean value. The server can decide to coerce each element to the 394 | required type or return an error if an unexpected value is 395 | received. Note that fp16 and bf16 are problematic to communicate explicitly 396 | since there is not a standard fp16/bf16 representation across backends nor 397 | typically the programmatic support to create the fp16/bf16 representation 398 | for a JSON number. 399 | 400 | For example, the 2-dimensional matrix: 401 | 402 | [ 1 2 403 | 4 5 ] 404 | 405 | Can be represented in its natural format as: 406 | 407 | "data" : [ [ 1, 2 ], [ 4, 5 ] ] 408 | 409 | Or in a flattened one-dimensional representation: 410 | 411 | "data" : [ 1, 2, 4, 5 ] 412 | 413 | #### Tensor Data Types 414 | 415 | Tensor data types are shown in the following table along with the size 416 | of each type, in bytes. 
417 | 418 | 419 | | Data Type | Size (bytes) | 420 | | --------- | ------------ | 421 | | BOOL | 1 | 422 | | UINT8 | 1 | 423 | | UINT16 | 2 | 424 | | UINT32 | 4 | 425 | | UINT64 | 8 | 426 | | INT8 | 1 | 427 | | INT16 | 2 | 428 | | INT32 | 4 | 429 | | INT64 | 8 | 430 | | FP16 | 2 | 431 | | FP32 | 4 | 432 | | FP64 | 8 | 433 | | BYTES | Variable (max 232) | 434 | --- 435 | 436 | 437 | ### **Inference Request Examples** 438 | 439 | The following example shows an inference request to a model with two 440 | inputs and one output. The HTTP Content-Length header gives the size 441 | of the JSON object. 442 | 443 | POST /v2/models/mymodel/infer HTTP/1.1 444 | Host: localhost:8000 445 | Content-Type: application/json 446 | Content-Length: 447 | { 448 | "id" : "42", 449 | "inputs" : [ 450 | { 451 | "name" : "input0", 452 | "shape" : [ 2, 2 ], 453 | "datatype" : "UINT32", 454 | "data" : [ 1, 2, 3, 4 ] 455 | }, 456 | { 457 | "name" : "input1", 458 | "shape" : [ 3 ], 459 | "datatype" : "BOOL", 460 | "data" : [ true ] 461 | } 462 | ], 463 | "outputs" : [ 464 | { 465 | "name" : "output0" 466 | } 467 | ] 468 | } 469 | 470 | For the above request the inference server must return the “output0” 471 | output tensor. Assuming the model returns a [ 3, 2 ] tensor of data 472 | type FP32 the following response would be returned. 473 | 474 | HTTP/1.1 200 OK 475 | Content-Type: application/json 476 | Content-Length: 477 | { 478 | "id" : "42" 479 | "outputs" : [ 480 | { 481 | "name" : "output0", 482 | "shape" : [ 3, 2 ], 483 | "datatype" : "FP32", 484 | "data" : [ 1.0, 1.1, 2.0, 2.1, 3.0, 3.1 ] 485 | } 486 | ] 487 | } 488 | -------------------------------------------------------------------------------- /specification/protocol/open_inference_grpc.proto: -------------------------------------------------------------------------------- 1 | // Copyright 2023 The KServe Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | syntax = "proto3"; 16 | package inference; 17 | 18 | // Inference Server GRPC endpoints. 19 | service GRPCInferenceService 20 | { 21 | // The ServerLive API indicates if the inference server is able to receive 22 | // and respond to metadata and inference requests. 23 | rpc ServerLive(ServerLiveRequest) returns (ServerLiveResponse) {} 24 | 25 | // The ServerReady API indicates if the server is ready for inferencing. 26 | rpc ServerReady(ServerReadyRequest) returns (ServerReadyResponse) {} 27 | 28 | // The ModelReady API indicates if a specific model is ready for inferencing. 29 | rpc ModelReady(ModelReadyRequest) returns (ModelReadyResponse) {} 30 | 31 | // The ServerMetadata API provides information about the server. Errors are 32 | // indicated by the google.rpc.Status returned for the request. The OK code 33 | // indicates success and other codes indicate failure. 34 | rpc ServerMetadata(ServerMetadataRequest) returns (ServerMetadataResponse) {} 35 | 36 | // The per-model metadata API provides information about a model. 
Errors are 37 | // indicated by the google.rpc.Status returned for the request. The OK code 38 | // indicates success and other codes indicate failure. 39 | rpc ModelMetadata(ModelMetadataRequest) returns (ModelMetadataResponse) {} 40 | 41 | // The ModelInfer API performs inference using the specified model. Errors are 42 | // indicated by the google.rpc.Status returned for the request. The OK code 43 | // indicates success and other codes indicate failure. 44 | rpc ModelInfer(ModelInferRequest) returns (ModelInferResponse) {} 45 | } 46 | 47 | message ServerLiveRequest {} 48 | 49 | message ServerLiveResponse 50 | { 51 | // True if the inference server is live, false if not live. 52 | bool live = 1; 53 | } 54 | 55 | message ServerReadyRequest {} 56 | 57 | message ServerReadyResponse 58 | { 59 | // True if the inference server is ready, false if not ready. 60 | bool ready = 1; 61 | } 62 | 63 | message ModelReadyRequest 64 | { 65 | // The name of the model to check for readiness. 66 | string name = 1; 67 | 68 | // The version of the model to check for readiness. If not given the 69 | // server will choose a version based on the model and internal policy. 70 | string version = 2; 71 | } 72 | 73 | message ModelReadyResponse 74 | { 75 | // True if the model is ready, false if not ready. 76 | bool ready = 1; 77 | } 78 | 79 | message ServerMetadataRequest {} 80 | 81 | message ServerMetadataResponse 82 | { 83 | // The server name. 84 | string name = 1; 85 | 86 | // The server version. 87 | string version = 2; 88 | 89 | // The extensions supported by the server. 90 | repeated string extensions = 3; 91 | } 92 | 93 | message ModelMetadataRequest 94 | { 95 | // The name of the model. 96 | string name = 1; 97 | 98 | // The version of the model to check for readiness. If not given the 99 | // server will choose a version based on the model and internal policy. 100 | string version = 2; 101 | } 102 | 103 | message ModelMetadataResponse 104 | { 105 | // Metadata for a tensor. 106 | message TensorMetadata 107 | { 108 | // The tensor name. 109 | string name = 1; 110 | 111 | // The tensor data type. 112 | string datatype = 2; 113 | 114 | // The tensor shape. A variable-size dimension is represented 115 | // by a -1 value. 116 | repeated int64 shape = 3; 117 | } 118 | 119 | // The model name. 120 | string name = 1; 121 | 122 | // The versions of the model available on the server. 123 | repeated string versions = 2; 124 | 125 | // The model's platform. See Platforms. 126 | string platform = 3; 127 | 128 | // The model's inputs. 129 | repeated TensorMetadata inputs = 4; 130 | 131 | // The model's outputs. 132 | repeated TensorMetadata outputs = 5; 133 | 134 | // Optional Model Properties 135 | map properties = 6; 136 | } 137 | 138 | message ModelInferRequest 139 | { 140 | // An input tensor for an inference request. 141 | message InferInputTensor 142 | { 143 | // The tensor name. 144 | string name = 1; 145 | 146 | // The tensor data type. 147 | string datatype = 2; 148 | 149 | // The tensor shape. 150 | repeated int64 shape = 3; 151 | 152 | // Optional inference input tensor parameters. 153 | map parameters = 4; 154 | 155 | // The tensor contents using a data-type format. This field must 156 | // not be specified if "raw" tensor contents are being used for 157 | // the inference request. 158 | InferTensorContents contents = 5; 159 | } 160 | 161 | // An output tensor requested for an inference request. 162 | message InferRequestedOutputTensor 163 | { 164 | // The tensor name. 
165 | string name = 1; 166 | 167 | // Optional requested output tensor parameters. 168 | map parameters = 2; 169 | } 170 | 171 | // The name of the model to use for inferencing. 172 | string model_name = 1; 173 | 174 | // The version of the model to use for inference. If not given the 175 | // server will choose a version based on the model and internal policy. 176 | string model_version = 2; 177 | 178 | // Optional identifier for the request. If specified will be 179 | // returned in the response. 180 | string id = 3; 181 | 182 | // Optional inference parameters. 183 | map parameters = 4; 184 | 185 | // The input tensors for the inference. 186 | repeated InferInputTensor inputs = 5; 187 | 188 | // The requested output tensors for the inference. Optional, if not 189 | // specified all outputs produced by the model will be returned. 190 | repeated InferRequestedOutputTensor outputs = 6; 191 | 192 | // The data contained in an input tensor can be represented in "raw" 193 | // bytes form or in the repeated type that matches the tensor's data 194 | // type. To use the raw representation 'raw_input_contents' must be 195 | // initialized with data for each tensor in the same order as 196 | // 'inputs'. For each tensor, the size of this content must match 197 | // what is expected by the tensor's shape and data type. The raw 198 | // data must be the flattened, one-dimensional, row-major order of 199 | // the tensor elements without any stride or padding between the 200 | // elements. Note that the FP16 and BF16 data types must be represented as 201 | // raw content as there is no specific data type for a 16-bit float type. 202 | // 203 | // If this field is specified then InferInputTensor::contents must 204 | // not be specified for any input tensor. 205 | repeated bytes raw_input_contents = 7; 206 | } 207 | 208 | message ModelInferResponse 209 | { 210 | // An output tensor returned for an inference request. 211 | message InferOutputTensor 212 | { 213 | // The tensor name. 214 | string name = 1; 215 | 216 | // The tensor data type. 217 | string datatype = 2; 218 | 219 | // The tensor shape. 220 | repeated int64 shape = 3; 221 | 222 | // Optional output tensor parameters. 223 | map parameters = 4; 224 | 225 | // The tensor contents using a data-type format. This field must 226 | // not be specified if "raw" tensor contents are being used for 227 | // the inference response. 228 | InferTensorContents contents = 5; 229 | } 230 | 231 | // The name of the model used for inference. 232 | string model_name = 1; 233 | 234 | // The version of the model used for inference. 235 | string model_version = 2; 236 | 237 | // The id of the inference request if one was specified. 238 | string id = 3; 239 | 240 | // Optional inference response parameters. 241 | map parameters = 4; 242 | 243 | // The output tensors holding inference results. 244 | repeated InferOutputTensor outputs = 5; 245 | 246 | // The data contained in an output tensor can be represented in 247 | // "raw" bytes form or in the repeated type that matches the 248 | // tensor's data type. To use the raw representation 'raw_output_contents' 249 | // must be initialized with data for each tensor in the same order as 250 | // 'outputs'. For each tensor, the size of this content must match 251 | // what is expected by the tensor's shape and data type. The raw 252 | // data must be the flattened, one-dimensional, row-major order of 253 | // the tensor elements without any stride or padding between the 254 | // elements. 
Note that the FP16 and BF16 data types must be represented as 255 | // raw content as there is no specific data type for a 16-bit float type. 256 | // 257 | // If this field is specified then InferOutputTensor::contents must 258 | // not be specified for any output tensor. 259 | repeated bytes raw_output_contents = 6; 260 | } 261 | 262 | // An inference parameter value. The Parameters message describes a 263 | // “name”/”value” pair, where the “name” is the name of the parameter 264 | // and the “value” is a boolean, integer, or string corresponding to 265 | // the parameter. 266 | message InferParameter 267 | { 268 | // The parameter value can be a string, an int64, a boolean 269 | // or a message specific to a predefined parameter. 270 | oneof parameter_choice 271 | { 272 | // A boolean parameter value. 273 | bool bool_param = 1; 274 | 275 | // An int64 parameter value. 276 | int64 int64_param = 2; 277 | 278 | // A string parameter value. 279 | string string_param = 3; 280 | 281 | // A double parameter value. 282 | double double_param = 4; 283 | 284 | // A uint64 parameter value. 285 | uint64 uint64_param = 5; 286 | } 287 | } 288 | 289 | // The data contained in a tensor represented by the repeated type 290 | // that matches the tensor's data type. Protobuf oneof is not used 291 | // because oneofs cannot contain repeated fields. 292 | message InferTensorContents 293 | { 294 | // Representation for BOOL data type. The size must match what is 295 | // expected by the tensor's shape. The contents must be the flattened, 296 | // one-dimensional, row-major order of the tensor elements. 297 | repeated bool bool_contents = 1; 298 | 299 | // Representation for INT8, INT16, and INT32 data types. The size 300 | // must match what is expected by the tensor's shape. The contents 301 | // must be the flattened, one-dimensional, row-major order of the 302 | // tensor elements. 303 | repeated int32 int_contents = 2; 304 | 305 | // Representation for INT64 data types. The size must match what 306 | // is expected by the tensor's shape. The contents must be the 307 | // flattened, one-dimensional, row-major order of the tensor elements. 308 | repeated int64 int64_contents = 3; 309 | 310 | // Representation for UINT8, UINT16, and UINT32 data types. The size 311 | // must match what is expected by the tensor's shape. The contents 312 | // must be the flattened, one-dimensional, row-major order of the 313 | // tensor elements. 314 | repeated uint32 uint_contents = 4; 315 | 316 | // Representation for UINT64 data types. The size must match what 317 | // is expected by the tensor's shape. The contents must be the 318 | // flattened, one-dimensional, row-major order of the tensor elements. 319 | repeated uint64 uint64_contents = 5; 320 | 321 | // Representation for FP32 data type. The size must match what is 322 | // expected by the tensor's shape. The contents must be the flattened, 323 | // one-dimensional, row-major order of the tensor elements. 324 | repeated float fp32_contents = 6; 325 | 326 | // Representation for FP64 data type. The size must match what is 327 | // expected by the tensor's shape. The contents must be the flattened, 328 | // one-dimensional, row-major order of the tensor elements. 329 | repeated double fp64_contents = 7; 330 | 331 | // Representation for BYTES data type. The size must match what is 332 | // expected by the tensor's shape. The contents must be the flattened, 333 | // one-dimensional, row-major order of the tensor elements. 
334 | repeated bytes bytes_contents = 8; 335 | } 336 | -------------------------------------------------------------------------------- /specification/protocol/open_inference_rest.yaml: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The KServe Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | openapi: 3.0.0 16 | info: 17 | title: Data Plane 18 | version: '2.0' 19 | description: 'https://github.com/kserve/open-inference-protocol/blob/main/specification/protocol/inference_rest.md' 20 | license: 21 | name: Apache 2.0 22 | url: 'https://www.apache.org/licenses/LICENSE-2.0' 23 | servers: [] 24 | paths: 25 | /v2/health/live: 26 | get: 27 | summary: Server Live 28 | responses: 29 | '200': 30 | description: OK 31 | operationId: get-v2-health-live 32 | description: The “server live” API indicates if the inference server is able to receive and respond to metadata and inference requests. The “server live” API can be used directly to implement the Kubernetes livenessProbe. 33 | /v2/health/ready: 34 | get: 35 | summary: Server Ready 36 | tags: [] 37 | responses: 38 | '200': 39 | description: OK 40 | operationId: get-v2-health-ready 41 | description: The “server ready” health API indicates if all the models are ready for inferencing. The “server ready” health API can be used directly to implement the Kubernetes readinessProbe. 42 | '/v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/ready': 43 | parameters: 44 | - schema: 45 | type: string 46 | name: MODEL_NAME 47 | in: path 48 | required: true 49 | - schema: 50 | type: string 51 | name: MODEL_VERSION 52 | in: path 53 | required: true 54 | get: 55 | summary: Model Ready 56 | tags: [] 57 | responses: 58 | '200': 59 | description: OK 60 | operationId: get-v2-models-$-modelName-versions-$-modelVersion-ready 61 | description: The “model ready” health API indicates if a specific model is ready for inferencing. The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies. 62 | /v2/: 63 | get: 64 | summary: Server Metadata 65 | tags: [] 66 | responses: 67 | '200': 68 | description: OK 69 | content: 70 | application/json: 71 | schema: 72 | $ref: '#/components/schemas/metadata_server_response' 73 | '400': 74 | description: Bad Request 75 | content: 76 | application/json: 77 | schema: 78 | $ref: '#/components/schemas/metadata_server_error_response' 79 | operationId: get-v2 80 | description: 'The server metadata endpoint provides information about the server. A server metadata request is made with an HTTP GET to a server metadata endpoint. In the corresponding response the HTTP body contains the [Server Metadata Response JSON Object](#server-metadata-response-json-object) or the [Server Metadata Response JSON Error Object](#server-metadata-response-json-error-object).' 
81 | '/v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}': 82 | parameters: 83 | - schema: 84 | type: string 85 | name: MODEL_NAME 86 | in: path 87 | required: true 88 | - schema: 89 | type: string 90 | name: MODEL_VERSION 91 | in: path 92 | required: true 93 | get: 94 | summary: Model Metadata 95 | tags: [] 96 | responses: 97 | '200': 98 | description: OK 99 | content: 100 | application/json: 101 | schema: 102 | $ref: '#/components/schemas/metadata_model_response' 103 | operationId: get-v2-models-$-modelName-versions-$-modelVersion 104 | description: 'The per-model metadata endpoint provides information about a model. A model metadata request is made with an HTTP GET to a model metadata endpoint. In the corresponding response the HTTP body contains the [Model Metadata Response JSON Object](#model-metadata-response-json-object) or the [Model Metadata Response JSON Error Object](#model-metadata-response-json-error-object). The model name and (optionally) version must be available in the URL. If a version is not provided the server may choose a version based on its own policies or return an error.' 105 | '/v2/models/${MODEL_NAME}/versions/${MODEL_VERSION}/infer': 106 | parameters: 107 | - schema: 108 | type: string 109 | name: MODEL_NAME 110 | in: path 111 | required: true 112 | - schema: 113 | type: string 114 | name: MODEL_VERSION 115 | in: path 116 | required: true 117 | post: 118 | summary: Inference 119 | operationId: post-v2-models-$-MODEL_NAME-versions-$-MODEL_VERSION-infer 120 | responses: 121 | '200': 122 | description: OK 123 | content: 124 | application/json: 125 | schema: 126 | $ref: '#/components/schemas/inference_response' 127 | '400': 128 | description: Bad Request 129 | content: 130 | application/json: 131 | schema: 132 | $ref: '#/components/schemas/inference_error_response' 133 | requestBody: 134 | content: 135 | application/json: 136 | schema: 137 | $ref: '#/components/schemas/inference_request' 138 | description: 'An inference request is made with an HTTP POST to an inference endpoint. In the request the HTTP body contains the [Inference Request JSON Object](#inference-request-json-object). In the corresponding response the HTTP body contains the [Inference Response JSON Object](#inference-response-json-object) or [Inference Response JSON Error Object](#inference-response-json-error-object). See [Inference Request Examples](#inference-request-examples) for some example HTTP/REST requests and responses.' 
139 | components: 140 | schemas: 141 | metadata_server_response: 142 | title: metadata_server_response 143 | type: object 144 | description: '' 145 | x-examples: {} 146 | properties: 147 | name: 148 | type: string 149 | version: 150 | type: string 151 | extensions: 152 | type: array 153 | items: 154 | type: string 155 | required: 156 | - name 157 | - version 158 | - extensions 159 | metadata_server_error_response: 160 | title: metadata_server_error_response 161 | type: object 162 | properties: 163 | error: 164 | type: string 165 | required: 166 | - error 167 | metadata_model_response: 168 | title: metadata_model_response 169 | type: object 170 | properties: 171 | name: 172 | type: string 173 | versions: 174 | type: array 175 | items: 176 | type: string 177 | platform: 178 | type: string 179 | inputs: 180 | type: array 181 | items: 182 | $ref: '#/components/schemas/metadata_tensor' 183 | outputs: 184 | type: array 185 | items: 186 | $ref: '#/components/schemas/metadata_tensor' 187 | properties: 188 | type: object 189 | additionalProperties: 190 | type: string 191 | required: 192 | - name 193 | - platform 194 | metadata_tensor: 195 | title: metadata_tensor 196 | type: object 197 | properties: 198 | name: 199 | type: string 200 | datatype: 201 | type: string 202 | shape: 203 | type: array 204 | items: 205 | type: integer 206 | required: 207 | - name 208 | - datatype 209 | - shape 210 | metadata_model_error_response: 211 | title: metadata_model_error_response 212 | type: object 213 | properties: 214 | error: 215 | type: string 216 | required: 217 | - error 218 | inference_request: 219 | title: inference_request 220 | type: object 221 | x-examples: 222 | Example 1: 223 | id: '42' 224 | inputs: 225 | - name: input0 226 | shape: 227 | - 2 228 | - 2 229 | datatype: UINT32 230 | data: 231 | - 1 232 | - 2 233 | - 3 234 | - 4 235 | - name: input1 236 | shape: 237 | - 3 238 | datatype: BOOL 239 | data: 240 | - true 241 | outputs: 242 | - name: output0 243 | Example 2: 244 | id: '42' 245 | outputs: 246 | - name: output0 247 | shape: 248 | - 3 249 | - 2 250 | datatype: FP32 251 | data: 252 | - 1 253 | - 1.1 254 | - 2 255 | - 2.1 256 | - 3 257 | - 3.1 258 | properties: 259 | id: 260 | type: string 261 | parameters: 262 | $ref: '#/components/schemas/parameters' 263 | inputs: 264 | type: array 265 | items: 266 | $ref: '#/components/schemas/request_input' 267 | outputs: 268 | type: array 269 | items: 270 | $ref: '#/components/schemas/request_output' 271 | required: 272 | - inputs 273 | parameters: 274 | title: parameters 275 | x-examples: {} 276 | type: object 277 | request_input: 278 | title: request_input 279 | type: object 280 | properties: 281 | name: 282 | type: string 283 | shape: 284 | type: array 285 | items: 286 | type: integer 287 | datatype: 288 | type: string 289 | parameters: 290 | $ref: '#/components/schemas/parameters' 291 | data: 292 | $ref: '#/components/schemas/tensor_data' 293 | required: 294 | - name 295 | - shape 296 | - datatype 297 | - data 298 | tensor_data: 299 | title: tensor_data 300 | type: array 301 | items: 302 | anyOf: 303 | - $ref: '#/components/schemas/tensor_data' 304 | - type: number 305 | - type: string 306 | - type: boolean 307 | request_output: 308 | title: request_output 309 | type: object 310 | properties: 311 | name: 312 | type: string 313 | parameters: 314 | $ref: '#/components/schemas/parameters' 315 | required: 316 | - name 317 | response_output: 318 | title: response_output 319 | type: object 320 | properties: 321 | name: 322 | type: string 323 | shape: 324 | 
type: array 325 | items: 326 | type: integer 327 | datatype: 328 | type: string 329 | parameters: 330 | $ref: '#/components/schemas/parameters' 331 | data: 332 | $ref: '#/components/schemas/tensor_data' 333 | required: 334 | - name 335 | - shape 336 | - datatype 337 | - data 338 | inference_response: 339 | title: inference_response 340 | type: object 341 | properties: 342 | model_name: 343 | type: string 344 | model_version: 345 | type: string 346 | id: 347 | type: string 348 | parameters: 349 | $ref: '#/components/schemas/parameters' 350 | outputs: 351 | type: array 352 | items: 353 | $ref: '#/components/schemas/response_output' 354 | required: 355 | - model_name 356 | - outputs 357 | inference_error_response: 358 | title: inference_error_response 359 | type: object 360 | properties: 361 | error: 362 | type: string 363 | --------------------------------------------------------------------------------