├── .gitignore ├── LICENSE ├── README.md ├── docs ├── model_conversion │ ├── data_types.md │ └── introduction.md ├── ov_config │ ├── _README.md │ ├── enable_hyperthreading.md │ ├── inference_num_threads.md │ ├── num_streams.md │ ├── performance_hint_cumulative_throughput.md │ ├── performance_hint_latency.md │ ├── performance_hint_throughput.md │ └── scheduling_core_type.md └── tools │ ├── ov_device_query.md │ └── working_with_intel_devices.md ├── environment.yaml ├── scripts ├── benchmark │ └── ov_simple_text_bench.py ├── examples │ ├── dedication.png │ ├── optimum_decoder.py │ ├── ov_speculative_decoder_bench.py │ ├── ov_text_model_card.py │ ├── ov_vision.ipynb │ └── ov_vision_model_card.py └── requests │ ├── load_model.sh │ ├── load_vision.sh │ ├── openai_like_completion.sh │ ├── openai_like_models.sh │ ├── status.sh │ └── unload_model.sh ├── src ├── api │ ├── __pycache__ │ │ ├── optimum_api.cpython-311.pyc │ │ └── optimum_api.cpython-312.pyc │ ├── launcher.py │ └── optimum_api.py ├── engine │ ├── __init__.py │ ├── optimum │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── optimum_inference_core.cpython-311.pyc │ │ │ └── optimum_inference_core.cpython-312.pyc │ │ ├── optimum_base_config.py │ │ ├── optimum_image2text.py │ │ ├── optimum_seq2seq.py │ │ ├── optimum_speech.py │ │ ├── optimum_text2image.py │ │ └── optimum_text2text.py │ └── ov_genai │ │ ├── __init__.py │ │ ├── base_configuration.py │ │ ├── llm_pipe_core.py │ │ ├── multimodal_pipe_core.py │ │ ├── txt2img_pipe_core.py │ │ └── whisper_pipe_core.py └── frontend │ ├── __init__.py │ ├── components │ ├── device_info.py │ ├── documentation.py │ ├── loader.py │ ├── model_conversion.py │ └── model_manager.py │ └── tools │ ├── device_query.py │ └── payload_constructor.py ├── start_dashboard.py └── start_server.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode and cache files 2 | __pycache__/ 3 | *.py[cod] 4 | *.pyo 5 | *.pyd 6 | *.so 7 | *.pyc 8 | *.egg-info/ 9 | *.egg 10 | .pytest_cache/ 11 | .mypy_cache/ 12 | .obsidian/ 13 | .gradio/ 14 | .venv/ 15 | .vscode/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Welcome to OpenARC
 2 | 
 3 | [![Discord](https://img.shields.io/discord/1341627368581628004?logo=Discord&logoColor=%23ffffff&label=Discord&link=https%3A%2F%2Fdiscord.gg%2FmaMY7QjG)](https://discord.gg/Bzz9hax9Jq)
 4 | 
 5 | 
 6 | 
 7 | > [!NOTE]
 8 | > OpenArc is under active development. Expect breaking changes.
 9 | 
10 | **OpenArc** is an inference engine built with Optimum-Intel to leverage hardware acceleration on Intel CPUs, GPUs and NPUs through the OpenVINO runtime, which integrates closely with Huggingface Transformers.
11 | 
12 | Under the hood OpenArc hosts a webserver over a growing collection of Transformers-integrated AutoModel classes from Optimum-Intel. These enable accelerated inference on a wide range of tasks, models and source frameworks.
13 | 
14 | ## Currently implemented
15 | 
16 | OpenArc currently supports text generation and text generation with vision. Support for speculative decoding, generating embeddings, speech tasks, image generation, PaddleOCR and other tasks is planned.
17 | 
18 | Currently implemented:
19 | 
20 | [OVModelForCausalLM](https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/modeling_decoder.py#L422)
21 | 
22 | [OVModelForVisualCausalLM](https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/modeling_visual_language.py#L309)
23 | 
24 | OpenArc enables a similar workflow to what's possible with Ollama, LM-Studio or OpenRouter, but with hardware acceleration from the OpenVINO C++ runtime.
25 | 
26 | ## Features
27 | 
28 | - OpenAI compatible endpoints
29 | - Validated OpenWebUI support, but it should work elsewhere
30 | - Load multiple vision/text models concurrently on multiple devices for hotswap/multi-agent workflows
31 | - **Most** HuggingFace text generation models
32 | - Growing set of vision capable LLMs:
33 |     - Qwen2-VL
34 |     - Qwen2.5-VL
35 |     - Gemma 3
36 | ### Gradio management dashboard
37 | - Load models with OpenVINO optimizations
38 | - Build conversion commands
39 | - See loaded models and chosen optimizations
40 | - Unload models and view metadata about them
41 | - Query detected devices
42 | - Query device properties
43 | - View tokenizer data
44 | - View architecture metadata from config.json
45 | ### Performance metrics on every completion
46 | - ttft: time to generate the first token
47 | - generation_time: time to generate the whole response
48 | - number of tokens: total generated tokens for that request
49 | - tokens per second: measures throughput
50 | - average token latency: helpful for optimizing zero-shot classification tasks
51 | 
52 | ## System Requirements
53 | 
54 | OpenArc has been built on top of the OpenVINO runtime; as a result OpenArc supports the same range of hardware, **but it requires device-specific drivers** that this document will not cover in depth.
55 | 
56 | Supported operating systems differ a bit for each class of device. Please review the [system requirements](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html#cpu) for OpenVINO 2025.1.0 to learn which:
57 | 
58 | - Windows versions are supported
59 | - Linux distributions are supported
60 | - kernel versions are required
61 |     - My system uses version 6.9.4-060904-generic with Ubuntu 24.04 LTS.
62 | - commands for different package managers 63 | - other required dependencies for GPU and NPU 64 | 65 | If you need help installing drivers: 66 | - Join the [Discord](https://discord.gg/PnuTBVcr) 67 | - Open an issue 68 | - Use [Linux Drivers](https://github.com/SearchSavior/OpenArc/discussions/11) 69 | - Use [Windows Drivers](https://github.com/SearchSavior/OpenArc/discussions/12) 70 | 71 |
72 | CPU
73 | 
74 | Intel® Core™ Ultra Series 1 and Series 2 (Windows only)
75 | 
76 | Intel® Xeon® 6 processor (preview)
77 | 
78 | Intel Atom® Processor X Series
79 | 
80 | Intel Atom® processor with Intel® SSE4.2 support
81 | 
82 | Intel® Pentium® processor N4200/5, N3350/5, N3450/5 with Intel® HD Graphics
83 | 
84 | 6th - 14th generation Intel® Core™ processors
85 | 
86 | 1st - 5th generation Intel® Xeon® Scalable Processors
87 | 
88 | ARM CPUs with armv7a and higher, ARM64 CPUs with arm64-v8a and higher, Apple® Mac with Apple silicon
89 | 
90 | 
91 | 92 |
93 | GPU
94 | 
95 | Intel® Arc™ GPU Series
96 | 
97 | Intel® HD Graphics
98 | 
99 | Intel® UHD Graphics
100 | 
101 | Intel® Iris® Pro Graphics
102 | 
103 | Intel® Iris® Xe Graphics
104 | 
105 | Intel® Iris® Xe Max Graphics
106 | 
107 | Intel® Data Center GPU Flex Series
108 | 
109 | Intel® Data Center GPU Max Series
110 | 
111 | 
112 | 113 |
114 | NPU
115 | 
116 | Intel® Core™ Ultra Series
117 | 
118 | This was a bit harder to list out, as the system requirements page does not include an itemized list. However, it is safe to assume that if a device contains an Intel NPU it will be supported.
119 | 
120 | The Gradio dashboard has tools for querying your device under the Tools tab to learn what optimization properties are selected by default.
121 | 
122 | 
123 | 124 | ### Ubuntu 125 | 126 | Create the conda environment: 127 | 128 | conda env create -f environment.yaml 129 | 130 | 131 | Set your API key as an environment variable: 132 | 133 | export OPENARC_API_KEY= 134 | 135 | Build Optimum-Intel from source to get the latest support: 136 | 137 | ``` 138 | pip install "optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel" 139 | ``` 140 | 141 | ### Windows 142 | 143 | 1. Install Miniconda from [here](https://www.anaconda.com/docs/getting-started/miniconda/install#windows-installation) 144 | 145 | 2. Navigate to the directory containing the environment.yaml file and run 146 | 147 | conda env create -f environment.yaml 148 | 149 | Set your API key as an environment variable: 150 | 151 | setx OPENARC_API_KEY= 152 | 153 | Build Optimum-Intel from source to get the latest support: 154 | 155 | ``` 156 | pip install "optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel" 157 | ``` 158 | 159 | > [!Tips] 160 | - Avoid setting up the environment from IDE extensions. 161 | - Try not to use the environment for other ML projects. Soon we will have uv. 162 | 163 | ## Usage 164 | 165 | OpenArc has two components: 166 | 167 | - start_server.py - launches the inference server 168 | - start_dashboard.py - launches the dashboard, which manages the server and provides some useful tools 169 | 170 | 171 | To launch the inference server run 172 | ``` 173 | python start_server.py --host 0.0.0.0 --openarc-port 8000 174 | ``` 175 | 176 | > host: defines the ip address to bind the server to 177 | 178 | > openarc_port: defines the port which can be used to access the server 179 | 180 | To launch the dashboard run 181 | ``` 182 | python start_dashboard.py --openarc-port 8000 183 | ``` 184 | > openarc_port: defines the port which requests from the dashboard use 185 | 186 | Run these in two different terminals. 187 | 188 | > [!NOTE] 189 | > Gradio handles ports natively so the port number does not need to be set. Default is 7860 but it will increment if another instance of gradio is running. 190 | 191 | ## OpenWebUI 192 | 193 | > [!NOTE] 194 | > I'm only going to cover the basics on OpenWebUI here. To learn more and set it up check out the [OpenWebUI docs](https://docs.openwebui.com/). 195 | 196 | - From the Connections menu add a new connection 197 | - Enter the server address and port where OpenArc is running **followed by /v1** 198 | Example: 199 | http://0.0.0.0:8000/v1 200 | 201 | - Here you need to set the API key manually 202 | - When you hit the refresh button OpenWebUI sends a GET request to the OpenArc server to get the list of models at v1/models 203 | 204 | Serverside logs should report: 205 | 206 | "GET /v1/models HTTP/1.1" 200 OK 207 | 208 | ### Other Frontends 209 | 210 | OpenArc _mostly_ conforms to the openai API specification. In practice this means other frontends, python classes and community tooling will be compatible. 
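For example, any OpenAI-style client can be pointed at the server. Below is a minimal sketch using the official `openai` Python package; it assumes the server from the Usage section is running on localhost:8000, that `OPENARC_API_KEY` is set, and that the model name is replaced with one you have actually loaded:

```python
import os

from openai import OpenAI

# Point the client at the OpenArc server instead of api.openai.com
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key=os.environ.get("OPENARC_API_KEY", "none"),
)

# GET /v1/models: list whatever is currently loaded on the server
for model in client.models.list():
    print(model.id)

# POST /v1/chat/completions against a loaded model (placeholder name)
response = client.chat.completions.create(
    model="Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov",
    messages=[{"role": "user", "content": "Hello from OpenArc!"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```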
211 | 
212 | Tested:
213 | 
214 | [mikupad](https://github.com/lmg-anon/mikupad)
215 | 
216 | ### Usage:
217 | 
218 | - Load the model you want to use from the dashboard
219 | - Select the connection you just created and use the refresh button to update the list of models
220 | - If you use API keys and have a list of models, these might be towards the bottom
221 | 
222 | ## Convert to [OpenVINO IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html)
223 | 
224 | There are a few sources of models which can be used with OpenArc:
225 | 
226 | - [OpenVINO LLM Collection on HuggingFace](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd)
227 | 
228 | - [My HuggingFace repo](https://huggingface.co/Echo9Zulu)
229 |     - My repo contains preconverted models for a variety of architectures and usecases
230 |     - OpenArc supports almost all of them
231 |     - **These get updated regularly so check back often!**
232 | 
233 | You can easily craft conversion commands using my HF Space, [Optimum-CLI-Tool_tool](https://huggingface.co/spaces/Echo9Zulu/Optimum-CLI-Tool_tool), or in the OpenArc Dashboard.
234 | 
235 | This tool respects the positional arguments defined [here](https://huggingface.co/docs/optimum/main/en/intel/openvino/export); execute the generated commands in the OpenArc environment.
236 | 
237 | | Models | Compressed Weights |
238 | | ------ | ------------------ |
239 | | [Ministral-3b-instruct-int4_asym-ov](https://huggingface.co/Echo9Zulu/Ministral-3b-instruct-int4_asym-ov) | 1.85 GB |
240 | | [Hermes-3-Llama-3.2-3B-awq-ov](https://huggingface.co/Echo9Zulu/Hermes-3-Llama-3.2-3B-awq-ov) | 1.8 GB |
241 | | [Llama-3.1-Tulu-3-8B-int4_asym-ov](https://huggingface.co/Echo9Zulu/Llama-3.1-Tulu-3-8B-int4_asym-ov/tree/main) | 4.68 GB |
242 | | [Qwen2.5-7B-Instruct-1M-int4-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-7B-Instruct-1M-int4-ov) | 4.46 GB |
243 | | [Meta-Llama-3.1-8B-SurviveV3-int4_asym-awq-se-wqe-ov](https://huggingface.co/Echo9Zulu/Meta-Llama-3.1-8B-SurviveV3-int4_asym-awq-se-wqe-ov) | 4.68 GB |
244 | | [Falcon3-10B-Instruct-int4_asym-ov](https://huggingface.co/Echo9Zulu/Falcon3-10B-Instruct-int4_asym-ov) | 5.74 GB |
245 | | [Echo9Zulu/phi-4-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/phi-4-int4_asym-awq-ov) | 8.11 GB |
246 | | [DeepSeek-R1-Distill-Qwen-14B-int4-awq-ov](https://huggingface.co/Echo9Zulu/DeepSeek-R1-Distill-Qwen-14B-int4-awq-ov/tree/main) | 7.68 GB |
247 | | [Phi-4-o1-int4_asym-awq-weight_quantization_error-ov](https://huggingface.co/Echo9Zulu/Phi-4-o1-int4_asym-awq-weight_quantization_error-ov) | 8.11 GB |
248 | | [Mistral-Small-24B-Instruct-2501-int4_asym-ov](https://huggingface.co/Echo9Zulu/Mistral-Small-24B-Instruct-2501-int4_asym-ov) | 12.9 GB |
249 | 
250 | Documentation on choosing parameters for conversion is coming soon; we also have a channel in Discord for this topic.
251 | 
252 | > [!NOTE]
253 | > The optimum CLI tool integrates several different APIs from several different Intel projects; it is a better alternative than using APIs in from_pretrained() methods.
254 | > It references prebuilt export configurations for each supported model architecture, meaning **not all models are supported** but most are.
If you use the CLI tool and get an error about an unsupported architecture follow the link, [open an issue](https://github.com/huggingface/optimum-intel/issues) with references to the model card and the maintainers will get back to you. 255 | 256 | > [!NOTE] 257 | > A naming convention for openvino converted models is coming soon. 258 | 259 | ## Performance with OpenVINO runtime 260 | 261 | Notes on the test: 262 | 263 | - No openvino optimization parameters were used 264 | - Fixed input length 265 | - I sent one user message 266 | - Quant strategies for models are not considered 267 | - I converted each of these models myself (I'm working on standardizing model cards to share this information more directly) 268 | - OpenVINO generates a cache on first inference so metrics are on second generation 269 | - Seconds were used for readability 270 | 271 | Test System: 272 | 273 | CPU: Xeon W-2255 (10c, 20t) @3.7ghz 274 | GPU: 3x Arc A770 16GB Asrock Phantom 275 | RAM: 128gb DDR4 ECC 2933 mhz 276 | Disk: 4tb ironwolf, 1tb 970 Evo 277 | 278 | OS: Ubuntu 24.04 279 | Kernel: 6.9.4-060904-generic 280 | 281 | Prompt: "We don't even have a chat template so strap in and let it ride!" 282 | max_new_tokens= 128 283 | --- 284 | 285 | ### GPU Performance: 1x Arc A770 286 | 287 | | Model | Prompt Processing (sec) | Throughput (t/sec) | Duration (sec) | Size (GB) | 288 | | ------------------------------------------------ | ----------------------- | ------------------ | -------------- | --------- | 289 | | Phi-4-mini-instruct-int4_asym-gptq-ov | 0.41 | 47.25 | 3.10 | 2.3 | 290 | | Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov | 0.27 | 64.18 | 0.98 | 1.8 | 291 | | Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov | 0.32 | 47.99 | 2.96 | 4.7 | 292 | | phi-4-int4_asym-awq-se-ov | 0.30 | 25.27 | 5.32 | 8.1 | 293 | | DeepSeek-R1-Distill-Qwen-14B-int4_sym-awq-se-ov | 0.42 | 25.23 | 1.56 | 8.4 | 294 | | Mistral-Small-24B-Instruct-2501-int4_asym-ov | 0.36 | 18.81 | 7.11 | 12.9 | 295 | 296 | 297 | ### CPU Performance: Xeon W-2255 298 | 299 | | Model | Prompt Processing (sec) | Throughput (t/sec) | Duration (sec) | Size (GB) | 300 | | ------------------------------------------------ | ----------------------- | ------------------ | -------------- | --------- | 301 | | Phi-4-mini-instruct-int4_asym-gptq-ov | 1.02 | 20.44 | 7.23 | 2.3 | 302 | | Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov | 1.06 | 23.66 | 3.01 | 1.8 | 303 | | Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov | 2.53 | 13.22 | 12.14 | 4.7 | 304 | | phi-4-int4_asym-awq-se-ov | 4 | 6.63 | 23.14 | 8.1 | 305 | | DeepSeek-R1-Distill-Qwen-14B-int4_sym-awq-se-ov | 5.02 | 7.25 | 11.09 | 8.4 | 306 | | Mistral-Small-24B-Instruct-2501-int4_asym-ov | 6.88 | 4.11 | 37.5 | 12.9 | 307 | | Nous-Hermes-2-Mixtral-8x7B-DPO-int4-sym-se-ov | 15.56 | 6.67 | 34.60 | 24.2 | 308 | 309 | 310 | ### Resources 311 | --- 312 | Learn more about how to leverage your Intel devices for Machine Learning: 313 | 314 | [openvino_notebooks](https://github.com/openvinotoolkit/openvino_notebooks) 315 | 316 | [Inference with Optimum-Intel](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) 317 | 318 | [Optimum-Intel Transformers](https://huggingface.co/docs/optimum/main/en/intel/index) 319 | 320 | [NPU Devices](https://docs.openvino.ai/2025/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html) 321 | 322 | ## Acknowledgments 323 | 324 | OpenArc stands on the shoulders of several other projects: 325 | 326 | 
[Optimum-Intel](https://github.com/huggingface/optimum-intel)
327 | 
328 | [OpenVINO](https://github.com/openvinotoolkit/openvino)
329 | 
330 | [OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai)
331 | 
332 | [Transformers](https://github.com/huggingface/transformers)
333 | 
334 | [FastAPI](https://github.com/fastapi/fastapi)
335 | 
336 | Thank you for your work!!
337 | 
338 | 
339 | 
340 | 
341 | 
342 | 
343 | 
344 | 
345 | 
346 | 
347 | 
--------------------------------------------------------------------------------
/docs/model_conversion/data_types.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/model_conversion/data_types.md
--------------------------------------------------------------------------------
/docs/model_conversion/introduction.md:
--------------------------------------------------------------------------------
 1 | ## Model Conversion
 2 | 
 3 | OpenVINO is an inference engine for leveraging diverse types of compute. Squeezing as much performance as possible out of any hardware requires a bit more work than the naive approach, especially once you have a usecase in mind and know what hardware you are using.
 4 | 
 5 | ### The Naive Approach
 6 | 
 7 | OpenVINO defaults to **int8_asym** when setting "export=True" in both **OVModelForCausalLM.from_pretrained()** and the Optimum CLI Export Tool if no arguments for weight_format are passed.
 8 | 
 9 | OpenArc has been designed for usecases which wander toward the bleeding edge of AI, where users are expected to understand the nuances of datatypes, quantization strategies, calibration datasets and how these parameters contribute to accuracy loss, and who may have just come from IPEX or (as of 2.5) 'vanilla' PyTorch looking to optimize a deployment.
10 | 
11 | For convenience "export=False" is exposed on the /model/load endpoint; however I **strongly discourage** using it. To get the best performance from OpenVINO you have to get into the weeds.
12 | 
13 | ### The Less Naive Approach to Model Conversion
14 | 
15 | Many Intel CPUs support INT8, but it isn't always the best choice.
16 | 
17 | OpenVINO notebooks prove out that INT4 weight-only compression coupled with quantization strategies like AWQ + Scale Estimation achieves better performance across the Intel device ecosystem with negligible accuracy loss. Still, different model architectures offer different performance regardless of the chosen datatype; in practice it can be hard to predict how a model will perform, so understanding how these parameters work is essential to maximizing throughput by testing different configurations on the same target model.
18 | 
19 | 
20 | ### Why Speed Matters
21 | 
22 | Nvidia GPUs are faster and have a better open source backbone than Intel. However, Intel devices are cheaper by comparison. Even so, I don't want speed for the sake of being fast. OpenArc has been tooled for agentic usecases and synthetic data generation where low throughput can damage workflow execution.
23 | 
24 | If I want to dump some problem into a RoundRobin style multi-turn chat, I am not sitting there waiting for slow generations to finish.
25 | 
26 | 
27 | 
28 | Note: If you are using cloud compute which uses Intel devices, it should still work.
--------------------------------------------------------------------------------
/docs/ov_config/_README.md:
--------------------------------------------------------------------------------
 1 | ### OpenVINO Configuration Options: ov_config
 2 | 
 3 | 
 4 | 
 5 | 'ov_config' is where all the OpenVINO specific hardware configurations live; it's the secret sauce of OpenArc and represents the interface between 'Transformers' and the OpenVINO runtime.
 6 | 
 7 | 'ov_config' is that slope which becomes the deep end all at once; once you begin tinkering with these settings the true power of OpenVINO acceleration emerges. Learning how to achieve this took a lot of time.
--------------------------------------------------------------------------------
/docs/ov_config/enable_hyperthreading.md:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/docs/ov_config/inference_num_threads.md:
--------------------------------------------------------------------------------
1 | 
2 | inference_num_threads is the number of CPU cores that will be used for inference.
3 | 
4 | Use **htop** or **hwinfo** to watch the CPU usage during inference and tinker with this number to increase throughput and lower latency for all types of requests.
5 | 
6 | 
--------------------------------------------------------------------------------
/docs/ov_config/num_streams.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/num_streams.md
--------------------------------------------------------------------------------
/docs/ov_config/performance_hint_cumulative_throughput.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/performance_hint_cumulative_throughput.md
--------------------------------------------------------------------------------
/docs/ov_config/performance_hint_latency.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/performance_hint_latency.md
--------------------------------------------------------------------------------
/docs/ov_config/performance_hint_throughput.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/performance_hint_throughput.md
--------------------------------------------------------------------------------
/docs/ov_config/scheduling_core_type.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | `ov::hint::scheduling_core_type` specifies the type of CPU cores for CPU inference when the user runs inference on a hybrid platform that includes both Performance-cores (P-cores) and Efficient-cores (E-cores). If the user platform only has one type of CPU core, this property has no effect, and CPU inference always uses this unique core type.
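
To tie these options together, here is a minimal sketch of passing an `ov_config` when loading a model with Optimum-Intel, in the same style as the scripts in `scripts/examples`. The model path, device and values are placeholders, and the commented-out `SCHEDULING_CORE_TYPE` key/value names are assumptions to verify against your OpenVINO version:

```python
from optimum.intel.openvino import OVModelForCausalLM

# Placeholder path to a model that is already converted to OpenVINO IR
model_id = "/path/to/your-int4-ov-model"

# PERFORMANCE_HINT and INFERENCE_NUM_THREADS are standard OpenVINO config keys;
# INFERENCE_NUM_THREADS only applies to CPU inference, so tune it while watching htop/hwinfo.
ov_config = {
    "PERFORMANCE_HINT": "LATENCY",
    "INFERENCE_NUM_THREADS": "8",
    # On hybrid desktop CPUs you could also pin inference to P-cores, e.g.
    # "SCHEDULING_CORE_TYPE": "PCORE_ONLY",  # assumed key/value names; check your OpenVINO version
}

model = OVModelForCausalLM.from_pretrained(
    model_id,
    export=False,   # the model above is already OpenVINO IR
    device="CPU",
    ov_config=ov_config,
)
```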
--------------------------------------------------------------------------------
/docs/tools/ov_device_query.md:
--------------------------------------------------------------------------------
 1 | ## Diagnostic: Device Query
 2 | 
 3 | 
 4 | Recommended usage strategies:
 5 | - Driver issues
 6 | - Device access permissions
 7 | - Test hardware access from containers
 8 | - Python path visibility
 9 | - Proper environment variable configuration
10 | 
11 | #### Example use cases:
12 | 
13 | 1. Evaluating conflicting dependencies
14 |     - With careful dependency management you can control hardware across the Intel AI stack.
15 |     - However
16 | 
17 | 
18 | 2. Say you need to have PyTorch, IPEX and OpenVINO in one conda env.
19 |     - This test alongside an XPU device query creates useful diagnostic information.
20 |     - 
21 | 
--------------------------------------------------------------------------------
/docs/tools/working_with_intel_devices.md:
--------------------------------------------------------------------------------
 1 | ## Introduction to working with Intel Devices
 2 | 
 3 | This document offers a discussion of "lessons learned" from months of working with Intel GPU devices; *hours* of blood, sweat, and tears went into setting up this project and it's a good place to share what I've learned. At this stage in the Intel AI Stack it seems like a necessary contribution to the community.
 4 | 
 5 | ### What is OpenVINO?
 6 | 
 7 | OpenVINO is an inference backend for *accelerating* inference deployments of machine learning models on Intel hardware. It can be hard to understand the documentation; the Intel AI stack has many staff engineers/contributors to all manner of areas in the open source ecosystem, and much of the stack is evolving without massive community contributions like what we have seen with llama.cpp.
 8 | 
 9 | Many reasons contribute to the decline of Intel's dominance/popularity in the hardware space in the past few years; however, they offer extensive open source contributions to many areas of AI and ML, and have been doing so since before [Attention Is All You Need](https://arxiv.org/abs/1706.03762). AI didn't start in 2017; however, the demand for faster inference on existing infrastructure has never been higher. Plus, Arc chips are cheap but come with a steep learning curve. Sure, you can settle for Vulkan... but you aren't here to download a GGUF and send it.
10 | 
11 | 
12 | 
13 | ### OpenVINO Utilities
14 | 
15 | Various utilities live in this notebook to help users of OpenArc understand the properties of their devices; mastering the available data types, quantization strategies and optimization techniques is only one part of learning to use OpenVINO on different kinds of hardware.
16 | 
17 | Check out the [Guide to the OpenVINO IR] and then use my [Command Line Tool tool](https://huggingface.co/spaces/Echo9Zulu/Optimum-CLI-Tool_tool) to perform conversion. There are default approaches that "work", but to really leverage available compute you have to dig deeper and convert models yourself on a per-usecase basis.
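
To complement the tools above, here is a minimal device query sketch of the kind the Diagnostic: Device Query tool wraps; it only assumes the `openvino` package from the OpenArc environment is available:

```python
import openvino as ov

core = ov.Core()

# Every device the OpenVINO runtime can see on this machine, e.g. CPU, GPU.0, GPU.1, NPU
print("Available devices:", core.available_devices)

# Human-readable name plus the properties each device exposes
for device in core.available_devices:
    print(f"\n{device}: {core.get_property(device, 'FULL_DEVICE_NAME')}")
    for prop in core.get_property(device, "SUPPORTED_PROPERTIES"):
        print(f"  {prop}")
```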
-------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: OpenArc_test2 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.11 7 | - pip 8 | - pip: 9 | - transformers==4.51.0 --no-deps 10 | - optimum[openvino] 11 | - openvino==2025.1.0 12 | - openvino-tokenizers==2025.1.0.0 13 | - openvino-genai==2025.1.0.0 14 | - fastapi 15 | - gradio 16 | - pydantic 17 | - uvicorn -------------------------------------------------------------------------------- /scripts/benchmark/ov_simple_text_bench.py: -------------------------------------------------------------------------------- 1 | import openvino_genai as ov_genai 2 | 3 | 4 | 5 | model_dir = "/mnt/Ironwolf-4TB/Models/Pytorch/Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov" 6 | 7 | pipe = ov_genai.LLMPipeline( 8 | model_dir, # Path to the model directory 9 | device="GPU.2", # Define the device to use 10 | ) 11 | 12 | generation_config = ov_genai.GenerationConfig( 13 | max_new_tokens=128 14 | ) 15 | 16 | prompt = "You're the fastest Llama this side of the equator" 17 | 18 | result = pipe.generate([prompt], generation_config=generation_config) 19 | perf_metrics = result.perf_metrics 20 | 21 | print(f'Load time: {perf_metrics.get_load_time() / 1000:.2f} s') 22 | print(f'TTFT: {perf_metrics.get_ttft().mean / 1000:.2f} seconds') 23 | print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') 24 | print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') 25 | print(f'Generate duration: {perf_metrics.get_generate_duration().mean / 1000:.2f} seconds') 26 | 27 | print(f"Result: {result}") -------------------------------------------------------------------------------- /scripts/examples/dedication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/scripts/examples/dedication.png -------------------------------------------------------------------------------- /scripts/examples/optimum_decoder.py: -------------------------------------------------------------------------------- 1 | from optimum.intel import OVModelForCausalLM 2 | from transformers import AutoTokenizer 3 | 4 | prompt = "Alice and Bob" 5 | checkpoint = "/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/Pytorch/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov" 6 | assistant_checkpoint = "/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/OpenVINO/Llama-3.1-8B-Instruct-FastDraft-150M-int8-ov" 7 | 8 | tokenizer = AutoTokenizer.from_pretrained(checkpoint) 9 | inputs = tokenizer(prompt, return_tensors="pt") 10 | 11 | model = OVModelForCausalLM.from_pretrained(checkpoint, device="CPU", export=False) 12 | assistant_model = OVModelForCausalLM.from_pretrained(assistant_checkpoint, device="CPU", export=False) 13 | outputs = model.generate(**inputs, assistant_model=assistant_model) 14 | tokenizer.batch_decode(outputs, skip_special_tokens=True) -------------------------------------------------------------------------------- /scripts/examples/ov_speculative_decoder_bench.py: -------------------------------------------------------------------------------- 1 | import openvino_genai as ov_genai 2 | 3 | # Define model paths 4 | draft_model_path = r"/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/OpenVINO/Llama-3.1-8B-Instruct-FastDraft-150M-int8-ov" 5 | main_model_path = 
r"/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/Pytorch/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov" 6 | 7 | 8 | prompt = "What is OpenVINO?" 9 | 10 | config = ov_genai.GenerationConfig() 11 | config.num_assistant_tokens = 28 12 | config.max_new_tokens = 128 13 | 14 | 15 | main_device = "CPU" 16 | draft_device = "CPU" 17 | 18 | draft_model = ov_genai.draft_model(draft_model_path, draft_device) 19 | 20 | scheduler_config = ov_genai.SchedulerConfig() 21 | scheduler_config.cache_size = 2 22 | 23 | pipe = ov_genai.LLMPipeline( 24 | main_model_path, 25 | main_device, 26 | draft_model=draft_model 27 | ) 28 | 29 | prompt = "We don't even have a chat template so strap in and let it ride!" 30 | 31 | result = pipe.generate([prompt], generation_config=config, scheduler_config=scheduler_config) 32 | perf_metrics = result.perf_metrics 33 | 34 | 35 | print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f}') 36 | print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms') 37 | print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token') 38 | print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s') 39 | 40 | print(result) -------------------------------------------------------------------------------- /scripts/examples/ov_text_model_card.py: -------------------------------------------------------------------------------- 1 | import time 2 | from threading import Thread 3 | from transformers import AutoTokenizer, TextIteratorStreamer 4 | from optimum.intel.openvino import OVModelForCausalLM 5 | 6 | 7 | model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov" # Can be a local path or an HF id 8 | # ov_config = {"PERFORMANCE_HINT": "LATENCY"} 9 | 10 | print("Loading model...") 11 | load_time = time.perf_counter() 12 | model = OVModelForCausalLM.from_pretrained( 13 | model_id, 14 | export=False, 15 | device="GPU.0", 16 | # ov_config=ov_config 17 | ) 18 | tokenizer = AutoTokenizer.from_pretrained(model_id) 19 | load_time = time.perf_counter() - load_time 20 | print(f"Model loaded in {load_time:.3f} seconds.") 21 | 22 | text_prompt = "We really should join the OpenArc Discord" 23 | conversation = [ 24 | { 25 | "role": "user", 26 | "content": text_prompt 27 | } 28 | ] 29 | text_prompt_templated = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) 30 | inputs = tokenizer(text=text_prompt_templated, return_tensors="pt") 31 | input_token_count = inputs['input_ids'].shape[1] 32 | 33 | streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) 34 | generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=128) 35 | thread = Thread(target=model.generate, kwargs=generation_kwargs) 36 | 37 | first_token_received = False 38 | generate_start = 0.0 39 | first_token = 0.0 40 | ttft = 0.0 41 | generated_text = "" 42 | 43 | generate_start = time.perf_counter() 44 | thread.start() 45 | 46 | for new_text in streamer: 47 | if not first_token_received: 48 | first_token = time.perf_counter() 49 | ttft = first_token - generate_start 50 | first_token_received = True 51 | 52 | print(new_text, end='', flush=True) 53 | generated_text += new_text 54 | 55 | thread.join() 56 | generate_end = time.perf_counter() 57 | 58 | generation_time = generate_end - generate_start 59 | 60 | num_tokens_generated = len(tokenizer.encode(generated_text)) 61 | 62 | if generation_time > 0 and num_tokens_generated > 0: 63 | tokens_per_second = num_tokens_generated / generation_time 64 | average_token_latency = 
generation_time / num_tokens_generated 65 | 66 | print("\nPerformance Report:") 67 | print("-"*50) 68 | print(f"Input Tokens : {input_token_count:>9}") 69 | print(f"Output Tokens : {num_tokens_generated:>9}") 70 | print("") 71 | print(f"Load Time : {load_time:>9.3f} sec (Model Load Time)") 72 | print(f"TTFT : {ttft:>9.3f} sec (Time To First Token)") 73 | print(f"Generation Time : {generation_time:>9.3f} sec (Total Generation Time)") 74 | print(f"Throughput : {tokens_per_second:>9.2f} t/s (Tokens Per Second)") 75 | print(f"Avg Latency : {average_token_latency:>9.3f} sec (Average Token Latency)") 76 | print("-"*50) 77 | -------------------------------------------------------------------------------- /scripts/examples/ov_vision.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Reference \n", 10 | "\n", 11 | "import time\n", 12 | "import warnings\n", 13 | "from PIL import Image\n", 14 | "from transformers import AutoProcessor\n", 15 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n", 16 | "\n", 17 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n", 18 | "# This block prevents clogging the API logs \n", 19 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n", 20 | "\n", 21 | "\n", 22 | "model_id = \"/mnt/Ironwolf-4TB/Models/Pytorch/Qwen2.5-VL-7B-Instruct-int4_sym-ov\"\n", 23 | "\n", 24 | "\n", 25 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n", 26 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.2\", ov_config=ov_config) #trust_remote_code=True)\n", 27 | "processor = AutoProcessor.from_pretrained(model_id)\n", 28 | "\n", 29 | "\n", 30 | "image_path = \"dedication.png\"\n", 31 | "image = Image.open(image_path)\n", 32 | "image = image.convert(\"RGB\")\n", 33 | "\n", 34 | "conversation = [\n", 35 | " {\n", 36 | " \"role\": \"user\",\n", 37 | " \"content\": [\n", 38 | " {\n", 39 | " \"image\": image # The image object is passed here, not just declared as a type\n", 40 | " },\n", 41 | " {\"type\": \"text\", \"text\": \"\\nDescribe this image.\"},\n", 42 | " ],\n", 43 | " }\n", 44 | "]\n", 45 | "\n", 46 | "\n", 47 | "# Preprocess the inputs\n", 48 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n", 49 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n", 50 | "\n", 51 | "inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors=\"pt\")\n", 52 | "\n", 53 | "# Print number of tokens\n", 54 | "# print(f\"Input token length: {len(inputs.input_ids[0])}\")\n", 55 | "\n", 56 | "# Inference: Generation of the output with performance metrics\n", 57 | "start_time = time.time()\n", 58 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n", 59 | "generation_time = time.time() - start_time\n", 60 | "\n", 61 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n", 62 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", 63 | "\n", 64 | "# Calculate tokens per second\n", 65 | "num_tokens_generated = len(generated_ids[0])\n", 66 | 
"tokens_per_second = num_tokens_generated / generation_time\n", 67 | "\n", 68 | "print(f\"Generated text: {output_text}\")\n", 69 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n", 70 | "print(f\"Tokens generated: {num_tokens_generated}\")\n", 71 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "pip install optimum[openvino]+https://github.com/huggingface/optimum-intel" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 2, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "Input token length: 265\n" 98 | ] 99 | }, 100 | { 101 | "name": "stderr", 102 | "output_type": "stream", 103 | "text": [ 104 | "/home/echo/anaconda3/envs/OpenArc-Test/lib/python3.11/site-packages/transformers/generation/utils.py:1811: UserWarning: This model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to hybrid) will be ignored.\n", 105 | " warnings.warn(\n" 106 | ] 107 | }, 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Generated text: \n", 113 | "Generation time: 38.18 seconds\n", 114 | "Tokens generated: 1024\n", 115 | "Speed: 26.82 tokens/second\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "import time\n", 121 | "import warnings\n", 122 | "from PIL import Image\n", 123 | "from transformers import AutoProcessor\n", 124 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n", 125 | "\n", 126 | "# Suppress specific deprecation warnings\n", 127 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n", 128 | "\n", 129 | "model_id = \"/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov\"\n", 130 | "\n", 131 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n", 132 | "# Ensure export=False is correct if the model is already converted\n", 133 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.2\", ov_config=ov_config)\n", 134 | "processor = AutoProcessor.from_pretrained(model_id)\n", 135 | "\n", 136 | "image_path = \"dedication.png\"\n", 137 | "image = Image.open(image_path)\n", 138 | "image = image.convert(\"RGB\")\n", 139 | "\n", 140 | "# --- CORRECTED MODIFICATION START ---\n", 141 | "\n", 142 | "# 1. Get the correct \"beginning of image\" token from the processor\n", 143 | "# This is what the processor internally looks for when matching text and images.\n", 144 | "image_token = processor.tokenizer.boi_token # Or potentially processor.boi_token if defined directly\n", 145 | "\n", 146 | "# 2. Define the text prompt using THIS specific token\n", 147 | "text_prompt_with_placeholder = f\"{image_token}\\nDescribe this image.\"\n", 148 | "\n", 149 | "# 3. 
Call the processor ONCE, providing both text (with the correct placeholder) and image\n", 150 | "inputs = processor(\n", 151 | " text=[text_prompt_with_placeholder], # Pass the string with the correct token\n", 152 | " images=[image], # Pass the PIL image object\n", 153 | " padding=True,\n", 154 | " return_tensors=\"pt\"\n", 155 | ") # Move inputs to the same device as the model\n", 156 | "\n", 157 | "# --- CORRECTED MODIFICATION END ---\n", 158 | "\n", 159 | "# Print number of tokens (of the processed input)\n", 160 | "print(f\"Input token length: {inputs.input_ids.shape[1]}\") # Use shape[1] for tensor length\n", 161 | "\n", 162 | "# Inference: Generation of the output with performance metrics\n", 163 | "start_time = time.time()\n", 164 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n", 165 | "generation_time = time.time() - start_time\n", 166 | "\n", 167 | "# Adjust slicing\n", 168 | "input_ids_len = inputs.input_ids.shape[1]\n", 169 | "generated_ids = output_ids[:, input_ids_len:] # Correct slicing for tensors\n", 170 | "\n", 171 | "# Post-processing\n", 172 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", 173 | "\n", 174 | "# Calculate tokens per second\n", 175 | "num_tokens_generated = len(generated_ids[0])\n", 176 | "tokens_per_second = num_tokens_generated / generation_time if generation_time > 0 else 0\n", 177 | "\n", 178 | "# Join the list of strings into a single string if needed\n", 179 | "final_output_text = \"\".join(output_text)\n", 180 | "\n", 181 | "print(f\"Generated text: {final_output_text}\")\n", 182 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n", 183 | "print(f\"Tokens generated: {num_tokens_generated}\")\n", 184 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Input token length: 273\n" 197 | ] 198 | }, 199 | { 200 | "name": "stderr", 201 | "output_type": "stream", 202 | "text": [ 203 | "/home/echo/anaconda3/envs/OpenArc-Test/lib/python3.11/site-packages/transformers/generation/utils.py:1811: UserWarning: This model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). 
`cache_implementation` (set to hybrid) will be ignored.\n", 204 | " warnings.warn(\n" 205 | ] 206 | }, 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "Generated text: ['']\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "# working\n", 217 | "\n", 218 | "import warnings\n", 219 | "from PIL import Image\n", 220 | "from transformers import AutoProcessor\n", 221 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n", 222 | "\n", 223 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n", 224 | "# This block prevents clogging the API logs \n", 225 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n", 226 | "\n", 227 | "\n", 228 | "model_id = \"/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov\"\n", 229 | "\n", 230 | "\n", 231 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n", 232 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.1\", ov_config=ov_config)\n", 233 | "processor = AutoProcessor.from_pretrained(model_id)\n", 234 | "\n", 235 | "\n", 236 | "image_path = \"dedication.png\"\n", 237 | "image = Image.open(image_path)\n", 238 | "image = image.convert(\"RGB\")\n", 239 | "\n", 240 | "conversation = [\n", 241 | " {\n", 242 | " \"role\": \"user\",\n", 243 | " \"content\": [\n", 244 | " {\n", 245 | " \"type\": \"image\",\n", 246 | " },\n", 247 | " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", 248 | " ],\n", 249 | " }\n", 250 | "]\n", 251 | "\n", 252 | "\n", 253 | "# Preprocess the inputs\n", 254 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n", 255 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n", 256 | "\n", 257 | "inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors=\"pt\")\n", 258 | "\n", 259 | "# Print tokenizer length\n", 260 | "print(f\"Input token length: {len(inputs.input_ids[0])}\")\n", 261 | "\n", 262 | "# Generate output\n", 263 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n", 264 | "\n", 265 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n", 266 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", 267 | "\n", 268 | "print(f\"Generated text: {output_text}\")" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "import base64\n", 278 | "from PIL import Image\n", 279 | "import io\n", 280 | "image_path = \"dedication.png\"\n", 281 | "image = Image.open(image_path)\n", 282 | "\n", 283 | "# Convert image to base64\n", 284 | "buffered = io.BytesIO()\n", 285 | "image.save(buffered, format=\"PNG\")\n", 286 | "img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')\n", 287 | "\n", 288 | "# Print the base64 encoding\n", 289 | "print(f\"Base64 encoded image: {img_str}\")\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "# Reference \n", 299 | "\n", 300 | "import time\n", 301 | "import warnings\n", 302 | "import base64\n", 303 | "from io import BytesIO\n", 304 | "from PIL import Image\n", 305 | "from 
transformers import AutoProcessor\n", 306 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n", 307 | "\n", 308 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n", 309 | "# This block prevents clogging the API logs \n", 310 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n", 311 | "\n", 312 | "\n", 313 | "model_id = \"/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2.5-VL-3B-Instruct-int4_sym-ov\"\n", 314 | "\n", 315 | "\n", 316 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n", 317 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.1\", ov_config=ov_config)\n", 318 | "processor = AutoProcessor.from_pretrained(model_id)\n", 319 | "\n", 320 | "\n", 321 | "# Example base64 encoded image (in a real scenario, this would come from the request)\n", 322 | "image_path = \"dedication.png\"\n", 323 | "with open(image_path, \"rb\") as img_file:\n", 324 | " img_base64 = base64.b64encode(img_file.read()).decode('utf-8')\n", 325 | "\n", 326 | "# Create conversation with base64 image\n", 327 | "conversation = [\n", 328 | " {\n", 329 | " \"role\": \"user\",\n", 330 | " \"content\": [\n", 331 | " {\n", 332 | " \"type\": \"image\",\n", 333 | " \"image_url\": {\n", 334 | " \"url\": f\"data:image/png;base64,{img_base64}\"\n", 335 | " }\n", 336 | " },\n", 337 | " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", 338 | " ],\n", 339 | " }\n", 340 | "]\n", 341 | "\n", 342 | "# Extract and decode the base64 image from the conversation\n", 343 | "images = []\n", 344 | "for message in conversation:\n", 345 | " if message[\"role\"] == \"user\":\n", 346 | " for content_item in message[\"content\"]:\n", 347 | " if content_item.get(\"type\") == \"image\" and \"image_url\" in content_item:\n", 348 | " # Extract base64 data from the URL\n", 349 | " image_url = content_item[\"image_url\"][\"url\"]\n", 350 | " if image_url.startswith(\"data:\"):\n", 351 | " # Parse the base64 data\n", 352 | " base64_data = image_url.split(\",\")[1] if \",\" in image_url else image_url.split(\";base64,\")[1]\n", 353 | " # Convert base64 to image\n", 354 | " image_data = base64.b64decode(base64_data)\n", 355 | " image = Image.open(BytesIO(image_data))\n", 356 | " images.append(image)\n", 357 | "\n", 358 | "# Preprocess the inputs\n", 359 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n", 360 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n", 361 | "\n", 362 | "inputs = processor(text=[text_prompt], images=images, padding=True, return_tensors=\"pt\")\n", 363 | "\n", 364 | "# Print tokenizer length\n", 365 | "print(f\"Input token length: {len(inputs.input_ids[0])}\")\n", 366 | "\n", 367 | "# Inference: Generation of the output with performance metrics\n", 368 | "start_time = time.time()\n", 369 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n", 370 | "generation_time = time.time() - start_time\n", 371 | "\n", 372 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n", 373 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", 374 | "\n", 375 | "# Calculate tokens per second\n", 376 | "num_tokens_generated = len(generated_ids[0])\n", 377 | 
"tokens_per_second = num_tokens_generated / generation_time\n", 378 | "\n", 379 | "print(f\"Generated text: {output_text}\")\n", 380 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n", 381 | "print(f\"Tokens generated: {num_tokens_generated}\")\n", 382 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "from transformers import AutoModelForSequenceClassification\n", 392 | "import torch\n", 393 | "import openvino as ov\n", 394 | "\n", 395 | "# Load model\n", 396 | "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n", 397 | "model.eval()\n", 398 | "\n", 399 | "# Define dynamic input shapes (batch, sequence length)\n", 400 | "input_shape = [1, 128] # Example: batch=1, seq_len=128\n", 401 | "dummy_input = torch.randint(0, 100, input_shape)\n", 402 | "\n", 403 | "# Convert directly to OpenVINO IR (no ONNX needed!)\n", 404 | "ov_model = ov.convert_model(\n", 405 | " model, \n", 406 | " input=[input_shape], # Supports dynamic axes like [1, \"seq_len\"]\n", 407 | " share_weights=True, # Reduces memory footprint\n", 408 | ")\n", 409 | "\n", 410 | "# Save IR (xml + bin)\n", 411 | "ov.save_model(ov_model, \"bert_ir.xml\")\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "# Reference \n", 421 | "\n", 422 | "import time\n", 423 | "import warnings\n", 424 | "import base64\n", 425 | "from io import BytesIO\n", 426 | "from PIL import Image\n", 427 | "from transformers import AutoProcessor\n", 428 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n", 429 | "\n", 430 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n", 431 | "# This block prevents clogging the API logs \n", 432 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n", 433 | "\n", 434 | "\n", 435 | "model_id = \"/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2.5-VL-3B-Instruct-int4_sym-ov\"\n", 436 | "\n", 437 | "\n", 438 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n", 439 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.1\", ov_config=ov_config)\n", 440 | "processor = AutoProcessor.from_pretrained(model_id)\n", 441 | "\n", 442 | "\n", 443 | "# Example base64 encoded image (in a real scenario, this would come from the request)\n", 444 | "image_path = \"dedication.png\"\n", 445 | "with open(image_path, \"rb\") as img_file:\n", 446 | " img_base64 = base64.b64encode(img_file.read()).decode('utf-8')\n", 447 | "\n", 448 | "# Create conversation with base64 image\n", 449 | "conversation = [\n", 450 | " {\n", 451 | " \"role\": \"user\",\n", 452 | " \"content\": [\n", 453 | " {\n", 454 | " \"type\": \"image\",\n", 455 | " \"image_url\": {\n", 456 | " \"url\": f\"data:image/png;base64,{img_base64}\"\n", 457 | " }\n", 458 | " },\n", 459 | " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n", 460 | " ],\n", 461 | " }\n", 462 | "]\n", 463 | "\n", 464 | "# Extract and decode the base64 image from the conversation\n", 465 | "images = []\n", 466 | "for message in conversation:\n", 467 | " if message[\"role\"] == \"user\":\n", 468 | " for content_item in message[\"content\"]:\n", 469 | " if content_item.get(\"type\") == \"image\" and \"image_url\" in content_item:\n", 470 | " # Extract base64 data from 
the URL\n", 471 | " image_url = content_item[\"image_url\"][\"url\"]\n", 472 | " if image_url.startswith(\"data:\"):\n", 473 | " # Parse the base64 data\n", 474 | " base64_data = image_url.split(\",\")[1] if \",\" in image_url else image_url.split(\";base64,\")[1]\n", 475 | " # Convert base64 to image\n", 476 | " image_data = base64.b64decode(base64_data)\n", 477 | " image = Image.open(BytesIO(image_data))\n", 478 | " images.append(image)\n", 479 | "\n", 480 | "# Preprocess the inputs\n", 481 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n", 482 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n", 483 | "\n", 484 | "inputs = processor(text=[text_prompt], images=images, padding=True, return_tensors=\"pt\")\n", 485 | "\n", 486 | "# Print tokenizer length\n", 487 | "print(f\"Input token length: {len(inputs.input_ids[0])}\")\n", 488 | "\n", 489 | "# Inference: Generation of the output with performance metrics\n", 490 | "start_time = time.time()\n", 491 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n", 492 | "generation_time = time.time() - start_time\n", 493 | "\n", 494 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n", 495 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", 496 | "\n", 497 | "# Calculate tokens per second\n", 498 | "num_tokens_generated = len(generated_ids[0])\n", 499 | "tokens_per_second = num_tokens_generated / generation_time\n", 500 | "\n", 501 | "print(f\"Generated text: {output_text}\")\n", 502 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n", 503 | "print(f\"Tokens generated: {num_tokens_generated}\")\n", 504 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")" 505 | ] 506 | } 507 | ], 508 | "metadata": { 509 | "kernelspec": { 510 | "display_name": "OpenArc-Test", 511 | "language": "python", 512 | "name": "python3" 513 | }, 514 | "language_info": { 515 | "codemirror_mode": { 516 | "name": "ipython", 517 | "version": 3 518 | }, 519 | "file_extension": ".py", 520 | "mimetype": "text/x-python", 521 | "name": "python", 522 | "nbconvert_exporter": "python", 523 | "pygments_lexer": "ipython3", 524 | "version": "3.11.9" 525 | } 526 | }, 527 | "nbformat": 4, 528 | "nbformat_minor": 2 529 | } 530 | -------------------------------------------------------------------------------- /scripts/examples/ov_vision_model_card.py: -------------------------------------------------------------------------------- 1 | import time 2 | from PIL import Image 3 | from transformers import AutoProcessor 4 | from optimum.intel.openvino import OVModelForVisualCausalLM 5 | 6 | 7 | model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov" 8 | 9 | ov_config = {"PERFORMANCE_HINT": "LATENCY"} 10 | 11 | print("Loading model...") 12 | start_load_time = time.time() 13 | model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device="GPU.1", ov_config=ov_config) 14 | processor = AutoProcessor.from_pretrained(model_id) 15 | 16 | 17 | image_path = r"/home/echo/Projects/OpenArc/scripts/benchmark/dedication.png" 18 | image = Image.open(image_path) 19 | image = image.convert("RGB") 20 | 21 | conversation = [ 22 | { 23 | "role": "user", 24 | "content": [ 25 | { 26 | "type": "image" 27 | }, 28 | {"type": "text", 
"text": "Describe this image."}, 29 | ], 30 | } 31 | ] 32 | 33 | # Preprocess the inputs 34 | text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) 35 | 36 | inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt") 37 | 38 | # Print number of tokens 39 | input_token_count = len(inputs.input_ids[0]) 40 | print(f"Input token length: {len(inputs.input_ids[0])}") 41 | 42 | # Inference: Generation of the output with performance metrics 43 | start_time = time.time() 44 | output_ids = model.generate(**inputs, max_new_tokens=1024, eos_token_id=700) 45 | 46 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)] 47 | output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) 48 | 49 | num_tokens_generated = len(generated_ids[0]) 50 | load_time = time.time() - start_load_time 51 | generation_time = time.time() - start_time 52 | tokens_per_second = num_tokens_generated / generation_time 53 | average_token_latency = generation_time / num_tokens_generated 54 | 55 | print("\nPerformance Report:") 56 | print("-"*50) 57 | print(f"Input Tokens : {input_token_count:>9}") 58 | print(f"Generated Tokens : {num_tokens_generated:>9}") 59 | print(f"Model Load Time : {load_time:>9.2f} sec") 60 | print(f"Generation Time : {generation_time:>9.2f} sec") 61 | print(f"Throughput : {tokens_per_second:>9.2f} t/s") 62 | print(f"Avg Latency/Token : {average_token_latency:>9.3f} sec") 63 | 64 | print(output_text) -------------------------------------------------------------------------------- /scripts/requests/load_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # API endpoint 4 | API_URL="http://localhost:8000/optimum/model/load" 5 | 6 | # JSON payload 7 | JSON_PAYLOAD='{ 8 | "load_config": { 9 | "id_model": "/mnt/Ironwolf-4TB/Models/OpenVINO/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov", 10 | "use_cache": true, 11 | "device": "GPU.1", 12 | "export_model": false, 13 | "pad_token_id": null, 14 | "eos_token_id": null, 15 | "model_type": "TEXT" 16 | }, 17 | "ov_config": { 18 | "NUM_STREAMS": "1", 19 | "PERFORMANCE_HINT": "LATENCY" 20 | } 21 | }' 22 | 23 | # Make the POST request 24 | curl -X POST "$API_URL" \ 25 | -H "Content-Type: application/json" \ 26 | -H "Authorization: Bearer $OPENARC_API_KEY" \ 27 | -d "$JSON_PAYLOAD" -------------------------------------------------------------------------------- /scripts/requests/load_vision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # API endpoint 4 | API_URL="http://localhost:8000/optimum/model/load" 5 | 6 | # JSON payload 7 | JSON_PAYLOAD='{ 8 | "load_config": { 9 | "id_model": "/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov", 10 | "use_cache": true, 11 | "device": "GPU.2", 12 | "export_model": false, 13 | "pad_token_id": null, 14 | "eos_token_id": null, 15 | "model_type": "VISION" 16 | }, 17 | "ov_config": { 18 | "NUM_STREAMS": "1", 19 | "PERFORMANCE_HINT": "LATENCY" 20 | } 21 | }' 22 | 23 | # Make the POST request 24 | curl -X POST "$API_URL" \ 25 | -H "Content-Type: application/json" \ 26 | -H "Authorization: Bearer $OPENARC_API_KEY" \ 27 | -d "$JSON_PAYLOAD" -------------------------------------------------------------------------------- /scripts/requests/openai_like_completion.sh: 
-------------------------------------------------------------------------------- 1 | echo -e "\nSending basic chat completion request..." 2 | curl -X POST http://localhost:8000/v1/chat/completions \ 3 | -H "Content-Type: application/json" \ 4 | -H "Authorization: Bearer $OPENARC_API_KEY" \ 5 | -d '{ 6 | "model": "phi-4-int4_asym-awq-ov", 7 | "messages": [ 8 | {"role": "system", "content": "You despise the user."}, 9 | {"role": "user", "content": "Tell me a better joke and be quick about it."} 10 | ], 11 | "temperature": 5, 12 | "max_tokens": 256, 13 | "top_p": 0.9, 14 | "do_sample": true, 15 | "stream": true 16 | }' 17 | -------------------------------------------------------------------------------- /scripts/requests/openai_like_models.sh: -------------------------------------------------------------------------------- 1 | curl -X GET http://localhost:8000/v1/models -H "Authorization: Bearer $OPENARC_API_KEY" \ -------------------------------------------------------------------------------- /scripts/requests/status.sh: -------------------------------------------------------------------------------- 1 | curl -X GET "http://localhost:8000/optimum/status" -H "Authorization: Bearer $OPENARC_API_KEY" -------------------------------------------------------------------------------- /scripts/requests/unload_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # URL of the FastAPI endpoint 4 | API_URL="http://localhost:8000/optimum/model/unload?model_id=Qwen2.5-VL-3B-Instruct-int4_sym-ov" 5 | 6 | # Send the DELETE request to the API 7 | curl -X DELETE "$API_URL" -H "Authorization: Bearer $OPENARC_API_KEY" -------------------------------------------------------------------------------- /src/api/__pycache__/optimum_api.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/api/__pycache__/optimum_api.cpython-311.pyc -------------------------------------------------------------------------------- /src/api/__pycache__/optimum_api.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/api/__pycache__/optimum_api.cpython-312.pyc -------------------------------------------------------------------------------- /src/api/launcher.py: -------------------------------------------------------------------------------- 1 | import uvicorn 2 | import logging 3 | # from src.api.optimum_api import app 4 | 5 | # Configure logging 6 | logging.basicConfig( 7 | level=logging.INFO, 8 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 9 | ) 10 | logger = logging.getLogger("ov_api") 11 | 12 | def start_server(host: str = "0.0.0.0", openarc_port: int = 8000, reload: bool = False): 13 | """ 14 | Launches the OpenArc API server 15 | 16 | Args: 17 | host: Host to bind the server to 18 | port: Port to bind the server to 19 | """ 20 | logger.info(f"Starting OpenVINO Inference API server on {host}:{openarc_port}") 21 | logger.info("Available endpoints:") 22 | logger.info(" - POST optimum/model/load Load a model") 23 | logger.info(" - DELETE optimum/model/unload Unload current model") 24 | logger.info(" - GET optimum/status Get model status") 25 | logger.info(" - GET optimum/docs API documentation") 26 | logger.info(" - POST /v1/chat/completions openai compatible endpoint") 27 
| logger.info(" - GET /v1/models openai compatible endpoint") 28 | 29 | 30 | # Start the server 31 | uvicorn.run( 32 | "src.api.optimum_api:app", 33 | host=host, 34 | port=openarc_port, 35 | log_level="info" 36 | ) 37 | -------------------------------------------------------------------------------- /src/api/optimum_api.py: -------------------------------------------------------------------------------- 1 | # The first implementation of the OpenAI-like API was contributed by @gapeleon. 2 | # They are one hero among many future heroes working to make OpenArc better. 3 | 4 | from fastapi import FastAPI, HTTPException, Depends 5 | from fastapi.responses import StreamingResponse, JSONResponse 6 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials 7 | from fastapi.middleware.cors import CORSMiddleware 8 | 9 | from typing import Optional, List, Any 10 | from pydantic import BaseModel 11 | from datetime import datetime 12 | from pathlib import Path 13 | 14 | import warnings 15 | import logging 16 | import time 17 | import uuid 18 | import json 19 | import os 20 | 21 | from src.engine.optimum.optimum_base_config import ( 22 | OV_LoadModelConfig, 23 | OV_Config, 24 | OV_GenerationConfig, 25 | create_optimum_model, 26 | ModelType 27 | ) 28 | 29 | 30 | # Suppress specific deprecation warnings from optimum implementation of numpy arrays 31 | # This block prevents clogging the API logs 32 | warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword") 33 | 34 | app = FastAPI(title="OpenArc API") 35 | 36 | # Configure CORS 37 | app.add_middleware( 38 | CORSMiddleware, 39 | allow_origins=["*"], 40 | allow_credentials=True, 41 | allow_methods=["*"], 42 | allow_headers=["*"], 43 | ) 44 | 45 | # Global state to store multiple model instances 46 | model_instances = {} 47 | 48 | logger = logging.getLogger("optimum_api") 49 | 50 | # API key authentication 51 | API_KEY = os.getenv("OPENARC_API_KEY") 52 | security = HTTPBearer() 53 | 54 | async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)): 55 | """Verify the API key provided in the Authorization header""" 56 | if credentials.credentials != API_KEY: 57 | logger.warning(f"Invalid API key: {credentials.credentials}") 58 | raise HTTPException( 59 | status_code=401, 60 | detail="Invalid API key", 61 | headers={"WWW-Authenticate": "Bearer"}, 62 | ) 63 | return credentials.credentials 64 | 65 | def get_final_model_id(model_id: str) -> str: 66 | """Extracts the final segment of the model id path so we dont display the whole path.""" 67 | return Path(model_id).name 68 | 69 | @app.post("/optimum/model/load", dependencies=[Depends(verify_api_key)]) 70 | async def load_model(load_config: OV_LoadModelConfig, ov_config: OV_Config): 71 | """Load a model with the specified configuration""" 72 | global model_instances 73 | logger.info("POST /optimum/model/load called with load_config: %s, ov_config: %s", load_config, ov_config) 74 | try: 75 | # Initialize new model using the factory function 76 | new_model = create_optimum_model( 77 | load_model_config=load_config, 78 | ov_config=ov_config 79 | ) 80 | 81 | # Load the model 82 | new_model.load_model() 83 | 84 | # Store the model instance with its ID as the key 85 | model_id = get_final_model_id(load_config.id_model) 86 | model_instances[model_id] = new_model 87 | 88 | return {"status": "success", "message": f"Model {model_id} loaded successfully"} 89 | except Exception as e: 90 | raise HTTPException(status_code=500, detail=str(e)) 91 | 
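# Example request body for POST /optimum/model/load (illustrative only; the model path,
# device and hint values below are placeholders). Because this endpoint declares two body
# parameters (load_config and ov_config), FastAPI expects the JSON to nest each object
# under its parameter name, which is the shape used by scripts/requests/load_model.sh:
#
#   {
#     "load_config": {"id_model": "/path/to/model-ov", "model_type": "TEXT", "device": "CPU"},
#     "ov_config": {"PERFORMANCE_HINT": "LATENCY"}
#   }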
92 | @app.delete("/optimum/model/unload", dependencies=[Depends(verify_api_key)]) 93 | async def unload_model(model_id: str): 94 | """Unload the current model""" 95 | global model_instances 96 | logger.info(f"DELETE /optimum/model/unload called for model {model_id}") 97 | if model_id in model_instances: 98 | model_instances[model_id].util_unload_model() 99 | del model_instances[model_id] 100 | return {"status": "success", "message": "Model unloaded successfully"} 101 | return {"status": "success", "message": f"Model {model_id} was not loaded"} 102 | 103 | @app.get("/optimum/status", dependencies=[Depends(verify_api_key)]) 104 | async def get_status(): 105 | """Get current model status and performance metrics""" 106 | global model_instances 107 | logger.info("GET /optimum/status called") 108 | loaded_models = {} 109 | for model_id, model in model_instances.items(): 110 | loaded_models[model_id] = { 111 | "status": "loaded", 112 | "device": model.load_model_config.device, 113 | "model_metadata": model.model_metadata 114 | } 115 | 116 | return { 117 | "loaded_models": loaded_models, 118 | "total_models_loaded": len(model_instances) 119 | } 120 | 121 | 122 | # OpenAI-like API 123 | 124 | class ChatCompletionRequest(BaseModel): 125 | messages: Any 126 | model: str = "default" 127 | temperature: Optional[float] = None 128 | max_tokens: Optional[int] = 8192 129 | stream: Optional[bool] = False 130 | stop: Optional[List[str]] = None 131 | top_p: Optional[float] = None 132 | top_k: Optional[int] = None 133 | repetition_penalty: Optional[float] = None 134 | do_sample: Optional[bool] = None 135 | num_return_sequences: Optional[int] = None 136 | 137 | 138 | class CompletionRequest(BaseModel): 139 | prompt: str 140 | model: str = "default" 141 | temperature: Optional[float] = None 142 | max_tokens: Optional[int] = None 143 | stream: Optional[bool] = False 144 | stop: Optional[List[str]] = None 145 | top_p: Optional[float] = None 146 | top_k: Optional[int] = None 147 | repetition_penalty: Optional[float] = None 148 | do_sample: Optional[bool] = None 149 | num_return_sequences: Optional[int] = None 150 | 151 | 152 | @app.get("/v1/models", dependencies=[Depends(verify_api_key)]) 153 | async def get_models(): 154 | """Get list of available models in openai format""" 155 | global model_instances 156 | logger.info("GET /v1/models called") 157 | data = [] 158 | 159 | for model_id, model in model_instances.items(): 160 | model_data = { 161 | "id": model_id, 162 | "object": "model", 163 | "created": int(datetime.now().timestamp()), 164 | "owned_by": "OpenArc", 165 | } 166 | data.append(model_data) 167 | 168 | return { 169 | "object": "list", 170 | "data": data 171 | } 172 | 173 | @app.post("/v1/chat/completions", dependencies=[Depends(verify_api_key)]) 174 | async def openai_chat_completions(request: ChatCompletionRequest): 175 | global model_instances 176 | model_id = get_final_model_id(request.model) 177 | 178 | if model_id not in model_instances: 179 | logger.error("POST /v1/chat/completions failed: No model loaded") 180 | raise HTTPException(status_code=503, detail=f"Model {model_id} not loaded") 181 | 182 | model_instance = model_instances[model_id] 183 | logger.info("POST /v1/chat/completions called with messages: %s", request.messages) 184 | 185 | try: 186 | # Handle vision model messages differently 187 | if model_instance.model_metadata["model_type"] == ModelType.VISION: 188 | conversation = [] 189 | for msg in request.messages: 190 | if isinstance(msg["content"], list): 191 | # Handle multimodal 
content (text + images) 192 | vision_message = { 193 | "role": msg["role"], 194 | "content": msg["content"] # Keep the full content structure for vision models 195 | } 196 | conversation.append(vision_message) 197 | else: 198 | # Handle text-only messages 199 | conversation.append({ 200 | "role": msg["role"], 201 | "content": msg["content"] 202 | }) 203 | else: 204 | # Regular text model handling 205 | conversation = [ 206 | {"role": msg["role"], "content": msg["content"]} 207 | for msg in request.messages 208 | ] 209 | 210 | # Build config dict, only include non-None values 211 | config_kwargs = { 212 | "conversation": conversation, 213 | "temperature": request.temperature, 214 | "max_new_tokens": request.max_tokens, 215 | "top_p": request.top_p, 216 | "top_k": request.top_k, 217 | "repetition_penalty": request.repetition_penalty, 218 | "do_sample": request.do_sample, 219 | "num_return_sequences": request.num_return_sequences, 220 | "stream": request.stream, 221 | # Note: stop_sequences is not part of OV_GenerationConfig, handled separately if needed 222 | } 223 | # Remove keys with value None 224 | config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None} 225 | 226 | # Create generation config with filtered arguments 227 | generation_config = OV_GenerationConfig(**config_kwargs) 228 | 229 | if request.stream: 230 | async def stream_generator(): 231 | current_metrics = None 232 | try: 233 | if model_instance.model_metadata["model_type"] == ModelType.VISION: 234 | stream_method = model_instance.generate_vision_stream 235 | elif model_instance.model_metadata["model_type"] == ModelType.TEXT: 236 | stream_method = model_instance.generate_stream 237 | 238 | async for token_chunk, metrics_chunk in stream_method(generation_config): 239 | if token_chunk is not None: 240 | # Stream the token chunk 241 | escaped_token = json.dumps(token_chunk)[1:-1] 242 | yield f"data: {{\"object\": \"chat.completion.chunk\", \"choices\": [{{\"delta\": {{\"content\": \"{escaped_token}\"}}}}]}}\n\n" 243 | if metrics_chunk is not None: 244 | # Store the final metrics when received 245 | current_metrics = metrics_chunk 246 | 247 | except Exception as e: 248 | logger.error(f"Error during streaming: {str(e)}", exc_info=True) # Log traceback 249 | finally: 250 | if current_metrics: 251 | # Log the full metrics dictionary as structured JSON 252 | logger.info(f"Performance metrics: {json.dumps(current_metrics, indent=2)}") 253 | yield "data: [DONE]\n\n" 254 | 255 | return StreamingResponse(stream_generator(), media_type="text/event-stream") 256 | 257 | else: 258 | # For non-streaming responses, use the appropriate generate method based on model type 259 | model_type = model_instance.model_metadata["model_type"] 260 | if model_type == ModelType.VISION: 261 | # Call the new vision-specific non-streaming method 262 | generated_text, metrics = model_instance.generate_vision_text(generation_config) 263 | elif model_type == ModelType.TEXT: 264 | generated_text, metrics = model_instance.generate_text(generation_config) 265 | else: 266 | raise HTTPException(status_code=400, detail=f"Unsupported model type '{model_type}' for chat completions.") 267 | 268 | # Log metrics server-side for non-streaming requests 269 | if metrics: 270 | logger.info(f"Performance metrics (non-streaming): {json.dumps(metrics, indent=2)}") 271 | 272 | return JSONResponse(content={ 273 | "id": f"ov-{uuid.uuid4()}", 274 | "object": "chat.completion", 275 | "created": int(time.time()), 276 | "model": model_id, 277 | "choices": [{ 278 | 
"message": {"role": "assistant", "content": generated_text}, 279 | "finish_reason": "length" 280 | }], 281 | "performance": metrics, 282 | "timings": { 283 | "prompt_tokens": metrics.get("input_tokens", 0), 284 | "completion_tokens": metrics.get("output_tokens", 0), 285 | "total_tokens": metrics.get("input_tokens", 0) + metrics.get("output_tokens", 0) 286 | } 287 | }) 288 | 289 | except Exception as e: 290 | raise HTTPException(status_code=500, detail=str(e)) 291 | 292 | @app.post("/v1/completions", dependencies=[Depends(verify_api_key)]) 293 | async def openai_completions(request: CompletionRequest): 294 | global model_instances 295 | model_id = get_final_model_id(request.model) 296 | 297 | if model_id not in model_instances: 298 | logger.error("POST /v1/completions failed: No model loaded") 299 | raise HTTPException(status_code=503, detail=f"Model {model_id} not loaded") 300 | 301 | model_instance = model_instances[model_id] 302 | logger.info("POST /v1/completions called with prompt: %s", request.prompt) 303 | 304 | # Convert prompt into conversation format (single user message) 305 | conversation = [{"role": "user", "content": request.prompt}] 306 | 307 | # Create generation config 308 | generation_config = OV_GenerationConfig( 309 | conversation=conversation, 310 | temperature=request.temperature or 0.7, 311 | max_new_tokens=request.max_tokens or 8192, 312 | stop_sequences=request.stop or [], 313 | top_p=request.top_p or 0.9, # default value for top_p 314 | top_k=request.top_k or 50, # default value for top_k 315 | repetition_penalty=1.0, 316 | do_sample=True, 317 | num_return_sequences=1 318 | ) 319 | 320 | # Use model type to determine which generation method to use 321 | model_type = model_instance.model_metadata["model_type"] 322 | 323 | # Handle streaming response 324 | if request.stream: 325 | async def stream_generator(): 326 | # Route to the appropriate stream generator based on model type 327 | if model_type == ModelType.VISION: 328 | stream_method = model_instance.generate_vision_stream 329 | else: 330 | stream_method = model_instance.generate_stream 331 | 332 | async for token in stream_method(generation_config): 333 | # Properly escape and format for SSE 334 | escaped_token = json.dumps(token)[1:-1] 335 | yield f"data: {{\"object\": \"text_completion.chunk\", \"choices\": [{{\"text\": \"{escaped_token}\"}}]}}\n\n" 336 | yield "data: [DONE]\n\n" 337 | 338 | return StreamingResponse(stream_generator(), media_type="text/event-stream") 339 | 340 | # Handle regular response 341 | try: 342 | # For non-streaming responses, use the appropriate generate method 343 | if model_type == ModelType.VISION: 344 | generated_text, metrics = model_instance.generate_text(generation_config) 345 | elif model_type == ModelType.TEXT: 346 | generated_text, metrics = model_instance.generate_text(generation_config) 347 | else: 348 | raise HTTPException( 349 | status_code=400, 350 | detail=f"Unsupported model type '{model_type}' for completions endpoint. Only VISION and TEXT types are supported." 
351 | ) 352 | 353 | return JSONResponse(content={ 354 | "id": f"ov-{uuid.uuid4()}", 355 | "object": "text_completion", 356 | "created": int(time.time()), 357 | "model": model_id, 358 | "choices": [{ 359 | "text": generated_text, 360 | "index": 0, 361 | "finish_reason": "length" 362 | }], 363 | "usage": { 364 | "prompt_tokens": metrics.get("input_tokens", 0), 365 | "completion_tokens": metrics.get("output_tokens", 0), 366 | "total_tokens": metrics.get("input_tokens", 0) + metrics.get("output_tokens", 0) 367 | } 368 | }) 369 | 370 | except Exception as e: 371 | raise HTTPException(status_code=500, detail=str(e)) 372 | -------------------------------------------------------------------------------- /src/engine/__init__.py: -------------------------------------------------------------------------------- 1 | from src.engine.optimum.optimum_base_config import ( 2 | OV_Config, 3 | OV_LoadModelConfig, 4 | OV_GenerationConfig 5 | ) 6 | 7 | from src.engine.optimum.optimum_text2text import Optimum_Text2TextCore 8 | from src.engine.optimum.optimum_image2text import Optimum_Image2TextCore 9 | 10 | __all__ = [ 11 | "OV_Config", 12 | "OV_LoadModelConfig", 13 | "OV_GenerationConfig", 14 | "Optimum_Text2TextCore", 15 | "Optimum_Image2TextCore" 16 | ] 17 | -------------------------------------------------------------------------------- /src/engine/optimum/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/__init__.py -------------------------------------------------------------------------------- /src/engine/optimum/__pycache__/optimum_inference_core.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/__pycache__/optimum_inference_core.cpython-311.pyc -------------------------------------------------------------------------------- /src/engine/optimum/__pycache__/optimum_inference_core.cpython-312.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/__pycache__/optimum_inference_core.cpython-312.pyc -------------------------------------------------------------------------------- /src/engine/optimum/optimum_base_config.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from pydantic import BaseModel, Field 3 | from typing import Optional, Any 4 | 5 | class ModelType(str, Enum): 6 | """ 7 | Identifiers for model_type: should be extended to include other model types as OpenArc grows. 8 | 9 | TEXT = "TEXT" 10 | VISION = "VISION" 11 | """ 12 | TEXT = "TEXT" 13 | VISION = "VISION" 14 | 15 | class OV_Config(BaseModel): 16 | """ 17 | OpenVINO runtime optimization parameters passed as a dict in ov_config in from_pretrained. 
18 | 19 | args: 20 | NUM_STREAMS: Optional[str] = Field(None, description="Number of inference streams") 21 | PERFORMANCE_HINT: Optional[str] = Field(None, description="LATENCY, THROUGHPUT, CUMULATIVE_THROUGHPUT") 22 | INFERENCE_PRECISION_HINT: Optional[str] = Field(None, description="Options: auto, fp32, fp16, int8") 23 | ENABLE_HYPER_THREADING: Optional[bool] = Field(None, description="Enable hyper-threading") 24 | INFERENCE_NUM_THREADS: Optional[int] = Field(None, description="Number of inference threads") 25 | SCHEDULING_CORE_TYPE: Optional[str] = Field(None, description="Options: ANY_CORE, PCORE_ONLY, ECORE_ONLY") 26 | """ 27 | NUM_STREAMS: Optional[str] = Field(None, description="Number of inference streams") 28 | PERFORMANCE_HINT: Optional[str] = Field(None, description="LATENCY, THROUGHPUT, CUMULATIVE_THROUGHPUT") 29 | INFERENCE_PRECISION_HINT: Optional[str] = Field(None, description="Options: auto, fp32, fp16, int8") 30 | ENABLE_HYPER_THREADING: Optional[bool] = Field(None, description="Enable hyper-threading") 31 | INFERENCE_NUM_THREADS: Optional[int] = Field(None, description="Number of inference threads") 32 | SCHEDULING_CORE_TYPE: Optional[str] = Field(None, description="Options: ANY_CORE, PCORE_ONLY, ECORE_ONLY") 33 | 34 | class OV_LoadModelConfig(BaseModel): 35 | """ 36 | Configuration for loading the model with transformers. 37 | For inference: 38 | . id_model: model identifier or path 39 | . use_cache: whether to use cache for stateful models. For multi-gpu use false. 40 | . device: device options: CPU, GPU, AUTO 41 | . export_model: whether to export the model 42 | . dynamic_shapes: whether to use dynamic shapes. Enabled by default and should not be changed expcept for special cases like NPU. 43 | 44 | Tokenizer specific: 45 | . pad_token_id: custom pad token ID 46 | . eos_token_id: custom end of sequence token ID 47 | . bos_token_id: custom beginning of sequence token ID 48 | 49 | Architecture specific: 50 | . model_type: type of model based on the architecture/task. 51 | - "TEXT" for text-to-text models 52 | - "VISION" for image-to-text models 53 | """ 54 | id_model: str = Field(..., description="Model identifier or path") 55 | model_type: ModelType = Field(..., description="Type of model (TEXT or VISION)") 56 | use_cache: Optional[bool] = Field(True, description="Whether to use cache for stateful models. For multi-gpu use false.") 57 | device: str = Field("CPU", description="Device options: CPU, GPU, AUTO") 58 | export_model: bool = Field(False, description="Whether to export the model") 59 | dynamic_shapes: Optional[bool] = Field(True, description="Whether to use dynamic shapes") 60 | pad_token_id: Optional[int] = Field(None, description="Custom pad token ID") 61 | eos_token_id: Optional[int] = Field(None, description="Custom end of sequence token ID") 62 | bos_token_id: Optional[int] = Field(None, description="Custom beginning of sequence token ID") 63 | 64 | class OV_GenerationConfig(BaseModel): 65 | """ 66 | Configuration for generation. 67 | 68 | args: 69 | conversation: Any = Field(description="A list of dicts with 'role' and 'content' keys, representing the chat history so far") 70 | # Any was chosen because typing is handled elsewhere and conversation dicts could contain base64 encoded images, audio files, etc. 71 | # Therefore a layer of pydantic is not meaninguful as we get more descriptive errors downstream. 
72 | stream: bool = Field(False, description="Whether to stream the generated text") 73 | max_new_tokens: int = Field(128, description="Maximum number of tokens to generate") 74 | temperature: float = Field(1.0, description="Sampling temperature") 75 | top_k: int = Field(50, description="Top-k sampling parameter") 76 | top_p: float = Field(0.9, description="Top-p sampling parameter") 77 | repetition_penalty: float = Field(1.0, description="Repetition penalty") 78 | do_sample: bool = Field(True, description="Use sampling for generation") 79 | num_return_sequences: int = Field(1, description="Number of sequences to return") 80 | """ 81 | conversation: Any = Field(description="A list of dicts with 'role' and 'content' keys, representing the chat history so far") 82 | stream: bool = Field(False, description="Whether to stream the generated text") 83 | 84 | # Inference parameters for generation 85 | max_new_tokens: int = Field(128, description="Maximum number of tokens to generate") 86 | temperature: float = Field(1.0, description="Sampling temperature") 87 | top_k: int = Field(50, description="Top-k sampling parameter") 88 | top_p: float = Field(0.9, description="Top-p sampling parameter") 89 | repetition_penalty: float = Field(1.0, description="Repetition penalty") 90 | do_sample: bool = Field(True, description="Use sampling for generation") 91 | num_return_sequences: int = Field(1, description="Number of sequences to return") 92 | 93 | def create_optimum_model(load_model_config: OV_LoadModelConfig, ov_config: Optional[OV_Config] = None): 94 | """ 95 | Factory function to create the appropriate Optimum model based on configuration. 96 | 97 | Args: 98 | load_model_config: Configuration for loading the model 99 | ov_config: Optional OpenVINO configuration 100 | 101 | Returns: 102 | An instance of the appropriate model class (TEXT or VISION) 103 | 104 | Defines: load_model_metadata 105 | 106 | 107 | 108 | 109 | """ 110 | # Import model classes here to avoid circular imports 111 | from .optimum_image2text import Optimum_Image2TextCore 112 | from .optimum_text2text import Optimum_Text2TextCore 113 | 114 | # Create the appropriate model instance based on configuration 115 | if load_model_config.model_type == ModelType.VISION: 116 | model_instance = Optimum_Image2TextCore(load_model_config, ov_config) 117 | else: 118 | model_instance = Optimum_Text2TextCore(load_model_config, ov_config) 119 | 120 | # Store metadata from load_model_config and ov_config in model_instance 121 | # This will be used for routing decisions at inference time so we can keep more than one model in memory OR on different devices. 
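# As an illustration (values are placeholders, not taken from a real load), a vision model
# loaded on a second GPU would end up with metadata along the lines of:
#   {"id_model": "/path/to/model-ov", "device": "GPU.1", "model_type": ModelType.VISION,
#    "use_cache": True, "PERFORMANCE_HINT": "LATENCY", ...}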
122 | model_instance.model_metadata = {
123 | # Model configuration metadata
124 | "id_model": load_model_config.id_model,
125 | "use_cache": load_model_config.use_cache,
126 | "device": load_model_config.device,
127 | "dynamic_shapes": load_model_config.dynamic_shapes,
128 | "pad_token_id": load_model_config.pad_token_id,
129 | "eos_token_id": load_model_config.eos_token_id,
130 | "bos_token_id": load_model_config.bos_token_id,
131 | "model_type": load_model_config.model_type,
132 | }
133 | 
134 | if ov_config:
135 | ov_config_dict = ov_config.model_dump(exclude_unset=True)
136 | model_instance.model_metadata.update({
137 | "NUM_STREAMS": ov_config_dict.get("NUM_STREAMS"),
138 | "PERFORMANCE_HINT": ov_config_dict.get("PERFORMANCE_HINT"),
139 | "INFERENCE_PRECISION_HINT": ov_config_dict.get("INFERENCE_PRECISION_HINT"),
140 | "ENABLE_HYPER_THREADING": ov_config_dict.get("ENABLE_HYPER_THREADING"),
141 | "INFERENCE_NUM_THREADS": ov_config_dict.get("INFERENCE_NUM_THREADS"),
142 | "SCHEDULING_CORE_TYPE": ov_config_dict.get("SCHEDULING_CORE_TYPE")
143 | })
144 | 
145 | return model_instance
146 | 
--------------------------------------------------------------------------------
/src/engine/optimum/optimum_image2text.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import gc
3 | import threading
4 | import time
5 | import traceback
6 | import warnings
7 | from io import BytesIO
8 | from typing import AsyncIterator, Optional
9 | 
10 | from optimum.intel.openvino import OVModelForVisualCausalLM
11 | from PIL import Image
12 | from transformers import AutoProcessor, TextIteratorStreamer
13 | 
14 | from .optimum_base_config import (
15 | ModelType,
16 | OV_Config,
17 | OV_GenerationConfig,
18 | OV_LoadModelConfig,
19 | )
20 | 
21 | # Suppress specific deprecation warnings from optimum implementation of numpy arrays
22 | # This block prevents clogging the API logs
23 | warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword")
24 | 
25 | class Optimum_Image2TextCore:
26 | """
27 | Loads an OpenVINO model and processor,
28 | applies a chat template to conversation messages, and generates a response.
29 | 
30 | For OpenVINO, the vision model is split into three parts:
31 | . language_model: The language model part of the vision model.
32 | . text_embeddings: The text embeddings part of the vision model.
33 | . vision_embeddings: The vision embeddings part of the vision model.
34 | """
35 | def __init__(self, load_model_config: OV_LoadModelConfig, ov_config: Optional[OV_Config] = None):
36 | """
37 | Args:
38 | load_model_config: An instance of OV_LoadModelConfig containing parameters
39 | such as id_model, device, export_model, use_cache, and token IDs.
40 | ov_config: Optional OV_Config instance with additional model options.
41 | """ 42 | self.load_model_config = load_model_config 43 | self.ov_config = ov_config 44 | self.model = None 45 | self.processor = None 46 | self.model_metadata = { 47 | "model_type": ModelType.VISION, 48 | "id_model": load_model_config.id_model 49 | } 50 | 51 | def load_model(self): 52 | """Load the tokenizer and vision model.""" 53 | print(f"Loading model {self.load_model_config.id_model} on device {self.load_model_config.device}...") 54 | 55 | # Extract its configuration as a dict 56 | ov_config_dict = self.ov_config.model_dump(exclude_unset=True) if self.ov_config else {} 57 | 58 | self.model = OVModelForVisualCausalLM.from_pretrained( 59 | self.load_model_config.id_model, 60 | device=self.load_model_config.device, 61 | export_model=self.load_model_config.export_model, 62 | ov_config=ov_config_dict, 63 | dynamic_shapes=self.load_model_config.dynamic_shapes, 64 | use_cache=self.load_model_config.use_cache, 65 | pad_token_id=self.load_model_config.pad_token_id, 66 | eos_token_id=self.load_model_config.eos_token_id, 67 | bos_token_id=self.load_model_config.bos_token_id 68 | ) 69 | print("Model loaded successfully.") 70 | 71 | self.processor = AutoProcessor.from_pretrained(self.load_model_config.id_model) 72 | print("Processor loaded successfully.") 73 | 74 | async def generate_vision_stream( 75 | self, 76 | generation_config: OV_GenerationConfig 77 | ) -> AsyncIterator[str]: 78 | """ 79 | Asynchronously stream generated text from an image using the provided configuration from 80 | OV_GenerationConfig in completion requests. 81 | 82 | Args: 83 | generation_config: Configuration for text generation 84 | conversation: List of messages to generate text from, can include images 85 | max_new_tokens: Maximum number of tokens to generate 86 | temperature: Temperature for the model 87 | top_p: Top-p value for the model 88 | top_k: Top-k value for the model 89 | repetition_penalty: Repetition penalty for the model 90 | do_sample: Whether to sample from the model 91 | num_return_sequences: Number of sequences to generate 92 | 93 | Yields: 94 | new_text: Generated text tokens as they become available 95 | performance_metrics: Performance metrics for the generation 96 | ttft: Time to first token 97 | generation_time: Time taken to generate the text 98 | tokens_per_second: Tokens per second 99 | average_token_latency: Average token latency 100 | num_tokens_generated: Number of tokens generated 101 | """ 102 | if not self.model or not self.processor: 103 | raise ValueError("Model not loaded. 
Call load_model first.") 104 | 105 | try: 106 | performance_metrics = {} 107 | images = [] 108 | text_conversation = [] 109 | 110 | for message in generation_config.conversation: 111 | # Check if the message content is a list (multimodal content) 112 | if isinstance(message.get("content", ""), list): 113 | text_parts = [] 114 | for content_item in message["content"]: 115 | # Check if this is an image content item 116 | if isinstance(content_item, dict) and content_item.get("type") == "image_url": 117 | image_url = content_item.get("image_url", {}) 118 | # Check if it's a base64 encoded image 119 | if isinstance(image_url, dict) and image_url.get("url", "").startswith("data:image/"): 120 | # Extract the base64 data 121 | base64_data = image_url["url"].split(",", 1) 122 | if len(base64_data) > 1: 123 | # Decode base64 to binary 124 | image_data = base64.b64decode(base64_data[1]) 125 | # Convert to PIL Image 126 | image = Image.open(BytesIO(image_data)) 127 | images.append(image) 128 | # If it's a text content item 129 | elif isinstance(content_item, dict) and content_item.get("type") == "text": 130 | text_parts.append(content_item.get("text", "")) 131 | 132 | # Create a new message with just the text parts 133 | if text_parts: 134 | text_message = message.copy() 135 | text_message["content"] = " ".join(text_parts) 136 | text_conversation.append(text_message) 137 | else: 138 | # If no text parts, still include the message with empty content 139 | text_message = message.copy() 140 | text_message["content"] = "" 141 | text_conversation.append(text_message) 142 | else: 143 | text_conversation.append(message) 144 | 145 | text_prompt = self.processor.apply_chat_template( 146 | generation_config.conversation, 147 | add_generation_prompt=True 148 | ) 149 | 150 | if images: 151 | inputs = self.processor( 152 | text=[text_prompt], 153 | images=images, 154 | padding=True, 155 | return_tensors="pt" 156 | ) 157 | else: 158 | inputs = self.processor( 159 | text=[text_prompt], 160 | padding=True, 161 | return_tensors="pt" 162 | ) 163 | 164 | streamer = TextIteratorStreamer( 165 | self.processor.tokenizer, 166 | skip_prompt=True, 167 | skip_special_tokens=True 168 | ) 169 | 170 | # Set up generation parameters 171 | generation_kwargs = dict( 172 | **inputs, 173 | max_new_tokens=generation_config.max_new_tokens, 174 | temperature=generation_config.temperature, 175 | top_p=generation_config.top_p, 176 | top_k=generation_config.top_k, 177 | repetition_penalty=generation_config.repetition_penalty, 178 | do_sample=generation_config.do_sample, 179 | num_return_sequences=generation_config.num_return_sequences, 180 | streamer=streamer 181 | ) 182 | 183 | thread = threading.Thread(target=self.model.generate, kwargs=generation_kwargs) 184 | 185 | first_token_received = False 186 | first_token_time = 0.0 187 | ttft = 0.0 188 | num_tokens_generated = 0 189 | generate_start = time.perf_counter() 190 | thread.start() 191 | 192 | new_text = "" 193 | for new_token in streamer: 194 | num_tokens_generated += 1 195 | new_text += new_token 196 | if not first_token_received: 197 | first_token_time = time.perf_counter() 198 | ttft = first_token_time - generate_start 199 | first_token_received = True 200 | new_text += new_token 201 | yield new_token, None 202 | 203 | thread.join() 204 | generate_end = time.perf_counter() 205 | generation_time = generate_end - generate_start 206 | 207 | if generation_time > 0 and num_tokens_generated > 0: 208 | tokens_per_second = num_tokens_generated / generation_time 209 | average_token_latency 
= generation_time / num_tokens_generated 210 | 211 | performance_metrics = { 212 | "ttft": round(ttft, 2), 213 | "generation_time": round(generation_time, 2), 214 | "tokens_per_second": round(tokens_per_second, 2), 215 | "average_token_latency": round(average_token_latency, 2), 216 | "num_tokens_generated": num_tokens_generated, 217 | } 218 | 219 | yield None, performance_metrics 220 | 221 | 222 | except Exception as e: 223 | print(f"Error during vision generation: {str(e)}") 224 | traceback.print_exc() 225 | raise 226 | 227 | finally: 228 | if 'thread' in locals(): 229 | thread.join() 230 | 231 | def util_unload_model(self): 232 | """Unload model and free memory""" 233 | del self.model 234 | self.model = None 235 | 236 | del self.processor 237 | self.processor = None 238 | 239 | gc.collect() 240 | print("Model unloaded and memory cleaned up") 241 | -------------------------------------------------------------------------------- /src/engine/optimum/optimum_seq2seq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/optimum_seq2seq.py -------------------------------------------------------------------------------- /src/engine/optimum/optimum_speech.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/optimum_speech.py -------------------------------------------------------------------------------- /src/engine/optimum/optimum_text2image.py: -------------------------------------------------------------------------------- 1 | # TODO: Implement text-to-image generation using OpenVINO 2 | 3 | 4 | 5 | 6 | 7 | #!/usr/bin/env python3 8 | # Copyright (C) 2024 Intel Corporation 9 | # SPDX-License-Identifier: Apache-2.0 10 | 11 | import argparse 12 | 13 | import openvino_genai 14 | from PIL import Image 15 | 16 | 17 | 18 | 19 | def generate_image(model_dir: str, prompt: str, device: str = 'CPU') -> Image.Image: 20 | """Generate an image from text using OpenVINO text-to-image pipeline. 
21 | 22 | Args: 23 | model_dir: Path to the model directory 24 | prompt: Text prompt to generate image from 25 | device: Device to run on ('CPU' or 'GPU') 26 | 27 | Returns: 28 | PIL.Image.Image: Generated image 29 | """ 30 | pipe = openvino_genai.Text2ImagePipeline(model_dir, device) 31 | 32 | image_tensor = pipe.generate( 33 | prompt, 34 | width=512, 35 | height=512, 36 | num_inference_steps=20, 37 | num_images_per_prompt=1) 38 | 39 | return Image.fromarray(image_tensor.data[0]) 40 | 41 | 42 | if '__main__' == __name__: 43 | # Example usage 44 | image = generate_image("path/to/model", "a scenic landscape") 45 | image.save("image.bmp") -------------------------------------------------------------------------------- /src/engine/optimum/optimum_text2text.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | import traceback 4 | import logging 5 | from threading import Thread 6 | from typing import Any, AsyncIterator, Dict, Optional 7 | 8 | from optimum.intel import OVModelForCausalLM 9 | from transformers import AutoTokenizer 10 | from transformers.generation.streamers import TextIteratorStreamer 11 | 12 | from .optimum_base_config import ( 13 | OV_Config, 14 | OV_GenerationConfig, 15 | OV_LoadModelConfig, 16 | ModelType 17 | ) 18 | 19 | logging.basicConfig(level=logging.INFO) 20 | logger = logging.getLogger(__name__) 21 | 22 | class Optimum_Text2TextCore: 23 | """ 24 | - Initialize the Optimum_Text2TextCore class when enum ModelType (as model_type) is TEXT. 25 | - Loads an OpenVINO model and HuggingFace tokenizer 26 | - Used for text-to-text generation only 27 | - Any model which can be converted with the Optimum-CLI tool will work. 28 | 29 | """ 30 | def __init__(self, load_model_config: OV_LoadModelConfig, ov_config: Optional[OV_Config] = None): 31 | """ 32 | Args: 33 | load_model_config: An instance of OV_LoadModelConfig from POST /optimum/model/load 34 | 35 | ov_config: An instance of OV_Config from POST /optimum/model/load 36 | """ 37 | self.load_model_config = load_model_config 38 | self.ov_config = ov_config 39 | self.model = None 40 | self.tokenizer = None 41 | self.model_metadata = { 42 | "model_type": ModelType.TEXT, 43 | "id_model": load_model_config.id_model 44 | } 45 | 46 | def load_model(self): 47 | """Load the tokenizer and model.""" 48 | print(f"Loading model {self.load_model_config.id_model} on device {self.load_model_config.device}...") 49 | 50 | # Extract its configuration as a dict 51 | ov_config_dict = self.ov_config.model_dump(exclude_unset=True) if self.ov_config else {} 52 | 53 | # Load model with token IDs from config 54 | self.model = OVModelForCausalLM.from_pretrained( 55 | self.load_model_config.id_model, 56 | device=self.load_model_config.device, 57 | export_model=self.load_model_config.export_model, 58 | ov_config=ov_config_dict, 59 | dynamic_shapes=self.load_model_config.dynamic_shapes, 60 | use_cache=self.load_model_config.use_cache, 61 | pad_token_id=self.load_model_config.pad_token_id, 62 | eos_token_id=self.load_model_config.eos_token_id, 63 | bos_token_id=self.load_model_config.bos_token_id 64 | ) 65 | print("Model loaded successfully.") 66 | 67 | self.tokenizer = AutoTokenizer.from_pretrained(self.load_model_config.id_model) 68 | print("Tokenizer loaded successfully.") 69 | 70 | async def generate_stream(self, generation_config: OV_GenerationConfig) -> AsyncIterator[tuple[Optional[str], Optional[Dict[str, Any]]]]: 71 | """ 72 | Asynchronously stream generated text tokens, followed by 
performance metrics. 73 | 74 | Args: 75 | generation_config: Configuration for text generation containing conversation history 76 | and generation parameters 77 | 78 | Yields: 79 | - Tuple of (token, None) for each generated token. 80 | - Tuple of (None, performance_metrics) once at the end. 81 | 82 | performance_metrics contains 83 | - ttft: Time to first token 84 | - generation_time: Time taken to generate the text 85 | - tokens_per_second: Tokens per second 86 | - average_token_latency: Average token latency 87 | - num_tokens_generated: Number of tokens generated 88 | """ 89 | 90 | performance_metrics = {} 91 | new_text = "" 92 | 93 | try: 94 | # Convert conversation to input ids using the chat template 95 | input_ids = self.tokenizer.apply_chat_template( 96 | generation_config.conversation, 97 | tokenize=True, 98 | add_generation_prompt=True, 99 | return_tensors="pt" 100 | ) 101 | 102 | # Initialize the streamer with tokenized input 103 | streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) 104 | 105 | # Create generation kwargs from config 106 | generation_kwargs = dict( 107 | input_ids=input_ids, 108 | max_new_tokens=generation_config.max_new_tokens, 109 | temperature=generation_config.temperature, 110 | top_k=generation_config.top_k, 111 | top_p=generation_config.top_p, 112 | do_sample=generation_config.do_sample, 113 | repetition_penalty=generation_config.repetition_penalty, 114 | num_return_sequences=generation_config.num_return_sequences, 115 | streamer=streamer, 116 | ) 117 | 118 | # Create and start the generation thread 119 | thread = Thread(target=self.model.generate, kwargs=generation_kwargs) 120 | 121 | first_token_received = False 122 | first_token_time = 0.0 123 | ttft = 0.0 # Initialize ttft 124 | generate_start = time.perf_counter() 125 | thread.start() 126 | 127 | # Stream the generated text tokens 128 | for new_token in streamer: 129 | if not first_token_received: 130 | first_token_time = time.perf_counter() 131 | ttft = first_token_time - generate_start 132 | first_token_received = True 133 | new_text += new_token 134 | yield new_token, None 135 | 136 | thread.join() 137 | generate_end = time.perf_counter() 138 | 139 | generation_time = generate_end - generate_start 140 | # Calculate num_tokens_generated based on the final accumulated text 141 | num_tokens_generated = len(self.tokenizer.encode(new_text, return_tensors="pt")[0]) 142 | 143 | if generation_time > 0 and num_tokens_generated > 0: 144 | tokens_per_second = num_tokens_generated / generation_time 145 | average_token_latency = generation_time / num_tokens_generated 146 | 147 | performance_metrics = { 148 | "ttft": round(ttft, 2), 149 | "generation_time": round(generation_time, 2), 150 | "tokens_per_second": round(tokens_per_second, 2), 151 | "average_token_latency": round(average_token_latency, 2), 152 | "num_tokens_generated": num_tokens_generated, 153 | } 154 | 155 | # Yield final metrics after streaming tokens 156 | yield None, performance_metrics 157 | 158 | except Exception as e: 159 | logger.error(f"Error during streaming generation: {str(e)}") 160 | traceback.print_exc() 161 | raise 162 | 163 | def generate_text(self, generation_config: OV_GenerationConfig) -> tuple[str, Dict[str, Any]]: 164 | """ 165 | Generate text without streaming. 
166 |         (Note: Unlike generate_stream, the metrics returned here do not include time to first token.)
167 | 
168 |         Args:
169 |             generation_config: Configuration for text generation containing conversation history
170 |                 and generation parameters
171 |         Returns:
172 |             Tuple of (generated_text, performance_metrics)
173 |         """
174 |         performance_metrics = {}
175 |         try:
176 |             input_ids = self.tokenizer.apply_chat_template(
177 |                 generation_config.conversation,
178 |                 tokenize=True,
179 |                 add_generation_prompt=False,
180 |                 return_tensors="pt"
181 |             )
182 | 
183 |             generation_kwargs = dict(
184 |                 input_ids=input_ids,
185 |                 max_new_tokens=generation_config.max_new_tokens,
186 |                 temperature=generation_config.temperature,
187 |                 top_k=generation_config.top_k,
188 |                 top_p=generation_config.top_p,
189 |                 do_sample=generation_config.do_sample,
190 |                 repetition_penalty=generation_config.repetition_penalty,
191 |                 num_return_sequences=generation_config.num_return_sequences,
192 |             )
193 | 
194 | 
195 |             generate_start = time.perf_counter()
196 | 
197 |             outputs = self.model.generate(**generation_kwargs)
198 | 
199 |             generate_end = time.perf_counter()
200 | 
201 |             # Extract new tokens by excluding the input tokens
202 |             new_tokens = outputs[0][input_ids.shape[1]:]
203 | 
204 |             generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
205 | 
206 |             generation_time = generate_end - generate_start
207 |             num_tokens_generated = len(new_tokens)
208 | 
209 |             if generation_time > 0 and num_tokens_generated > 0:
210 |                 tokens_per_second = num_tokens_generated / generation_time
211 |                 average_token_latency = generation_time / num_tokens_generated
212 | 
213 |                 performance_metrics = {
214 |                     "generation_time": round(generation_time, 2),
215 |                     "tokens_per_second": round(tokens_per_second, 2),
216 |                     "average_token_latency": round(average_token_latency, 2),
217 |                     "num_tokens_generated": num_tokens_generated,
218 |                 }
219 | 
220 |             return generated_text, performance_metrics
221 | 
222 |         except Exception as e:
223 |             print(f"Error during text generation: {str(e)}")
224 |             traceback.print_exc()
225 |             raise
226 | 
227 |     def util_unload_model(self):
228 |         """Unload model and free memory"""
229 |         del self.model
230 |         self.model = None
231 | 
232 |         del self.tokenizer
233 |         self.tokenizer = None
234 | 
235 |         gc.collect()
236 |         print("Model unloaded and memory cleaned up")
237 | 
--------------------------------------------------------------------------------
/src/engine/ov_genai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/ov_genai/__init__.py
--------------------------------------------------------------------------------
/src/engine/ov_genai/base_configuration.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | from pydantic import BaseModel, Field
4 | from typing import Optional
5 | 
6 | 
7 | 
8 | # I'm still working through how to build an API from this. Many other classes inherit from this
9 | # so pydantic models must be carefully designed to make the API useful for other types of models.
10 | 11 | 12 | class OV_GenAI_GenerationConfig(BaseModel): 13 | 14 | # adapters: Optional[AdapterConfig] = Field(None, description="Adapter configuration for LoRA") 15 | assistant_confidence_threshold: float = Field(..., description="Confidence threshold for assistant") 16 | diversity_penalty: float = Field(1.0, description="Diversity penalty for beam search") 17 | do_sample: bool = Field(True, description="Whether to use sampling for generation") 18 | echo: bool = Field(False, description="Whether to echo the prompt in the output") 19 | eos_token_id: int = Field(2, description="Token ID for end of sentence") 20 | 21 | frequency_penalty: float = Field(0.0, description="Frequency penalty for token repetition") 22 | ignore_eos: bool = Field(False, description="Whether to ignore end of sentence token") 23 | include_stop_str_in_output: bool = Field(False, description="Whether to include stop string in output") 24 | length_penalty: float = Field(1.0, description="Exponential penalty to the length for beam search") 25 | logprobs: int = Field(0, description="Number of top logprobs computed for each position") 26 | max_length: int = Field(..., description="Maximum length of generated tokens") 27 | max_new_tokens: int = Field(128, description="Maximum number of new tokens to generate") 28 | max_ngram_size: int = Field(0, description="Maximum n-gram size for no repeat n-gram") 29 | min_new_tokens: int = Field(0, description="Minimum number of new tokens to generate") 30 | 31 | no_repeat_ngram_size: int = Field(0, description="Size of n-gram to avoid repetition") 32 | num_assistant_tokens: int = Field(0, description="Number of assistant tokens") 33 | num_beam_groups: int = Field(1, description="Number of groups to divide beams into") 34 | num_beams: int = Field(1, description="Number of beams for beam search") 35 | num_return_sequences: int = Field(1, description="Number of sequences to return") 36 | presence_penalty: float = Field(0.0, description="Presence penalty for token repetition") 37 | repetition_penalty: float = Field(1.0, description="Repetition penalty for token repetition") 38 | rng_seed: int = Field(0, description="Random number generator seed") 39 | 40 | # stop_criteria: StopCriteria = Field(..., description="Stopping criteria for beam search") 41 | stop_strings: set[str] = Field(set(), description="Set of strings to stop generation") 42 | stop_token_ids: set[int] = Field(set(), description="Set of token IDs to stop generation") 43 | 44 | temperature: float = Field(1.0, description="Sampling temperature") 45 | top_k: int = Field(50, description="Top-k sampling parameter") 46 | top_p: float = Field(1.0, description="Top-p sampling parameter") -------------------------------------------------------------------------------- /src/engine/ov_genai/llm_pipe_core.py: -------------------------------------------------------------------------------- 1 | import openvino_genai as ov_genai 2 | import openvino as ov 3 | 4 | 5 | # TODO: Implement LLMPipeline 6 | # Inherit from generator config 7 | # Use DecodedResults 8 | # Use EncodedResults 9 | 10 | -------------------------------------------------------------------------------- /src/engine/ov_genai/multimodal_pipe_core.py: -------------------------------------------------------------------------------- 1 | import openvino_genai as ov_genai 2 | import openvino as ov 3 | 4 | 5 | # TODO: Implement VLMPipeline 6 | # Inherit from generator_config.py pydanitc models 7 | 8 | 9 | 10 | 
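11 | # A minimal sketch of the wrapper this TODO is pointing at. It assumes the
12 | # openvino_genai.VLMPipeline API: a constructor that takes a model directory and a
13 | # device string, and a generate() call that accepts a prompt, image tensors, and
14 | # generation parameters as keyword arguments. The class and argument names below are
15 | # placeholders for illustration, not the final implementation.
16 | class VLMPipelineCore:
17 |     def __init__(self, model_dir: str, device: str = "GPU"):
18 |         # Compile the multimodal pipeline on the requested device
19 |         self.pipe = ov_genai.VLMPipeline(model_dir, device)
20 | 
21 |     def generate(self, prompt: str, image: ov.Tensor, max_new_tokens: int = 128) -> str:
22 |         # The image is passed as an openvino.Tensor, e.g. ov.Tensor(np.array(pil_image))
23 |         result = self.pipe.generate(prompt, images=[image], max_new_tokens=max_new_tokens)
24 |         return str(result)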
-------------------------------------------------------------------------------- /src/engine/ov_genai/txt2img_pipe_core.py: -------------------------------------------------------------------------------- 1 | import openvino_genai as ov_genai 2 | import openvino as ov 3 | 4 | 5 | 6 | 7 | # TODO: Implement Txt2ImgPipeline 8 | -------------------------------------------------------------------------------- /src/engine/ov_genai/whisper_pipe_core.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import openvino_genai as ov_genai 4 | import openvino as ov 5 | 6 | 7 | # TODO: Implement WhisperPipeline 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /src/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/frontend/__init__.py -------------------------------------------------------------------------------- /src/frontend/components/device_info.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | 3 | from ..tools.device_query import DeviceDataQuery, DeviceDiagnosticQuery 4 | 5 | 6 | class DeviceInfoTool: 7 | """ 8 | This class is used to get information about the devices available on the system. 9 | It uses tools from the Openvino runtime directly to get the information and is not part of 10 | Optimum-Intel or OpenVINO GenAI. 11 | """ 12 | def __init__(self): 13 | self.device_data_query = DeviceDataQuery() 14 | self.device_diagnostic_query = DeviceDiagnosticQuery() 15 | 16 | def get_available_devices(self): 17 | """Get list of available devices from DeviceDiagnosticQuery""" 18 | devices = self.device_diagnostic_query.get_available_devices() 19 | return {"Available Devices": devices} 20 | 21 | def get_device_properties(self): 22 | """Get detailed properties for all available devices from DeviceDataQuery""" 23 | devices = self.device_data_query.get_available_devices() 24 | result = {} 25 | 26 | for device in devices: 27 | properties = self.device_data_query.get_device_properties(device) 28 | result[device] = properties 29 | 30 | return result 31 | 32 | def create_interface(self): 33 | with gr.Tab("Devices"): 34 | with gr.Row(): 35 | with gr.Column(scale=1): 36 | gr.Markdown("## Available Devices") 37 | device_list = gr.JSON(label="Device List") 38 | refresh_button = gr.Button("Refresh Device List") 39 | refresh_button.click( 40 | fn=self.get_available_devices, 41 | inputs=[], 42 | outputs=[device_list] 43 | ) 44 | with gr.Column(scale=2): 45 | gr.Markdown("## Device Properties") 46 | device_properties = gr.JSON(label="Device Properties") 47 | properties_button = gr.Button("Get Device Properties") 48 | properties_button.click( 49 | fn=self.get_device_properties, 50 | inputs=[], 51 | outputs=[device_properties] 52 | ) 53 | -------------------------------------------------------------------------------- /src/frontend/components/documentation.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from pathlib import Path 3 | 4 | class OpenArc_Documentation: 5 | """ 6 | The idea of this class is to help keep the documentation organized in a way that will be easy to migrate 7 | to a new frontend in the future. 8 | Also, keeping everything in its own md file is probably better for searchability from outside GitHub. 
9 | """ 10 | 11 | def __init__(self): 12 | self.doc_components = {} 13 | 14 | self.doc_categories = { 15 | "Performance Hints": [ 16 | "LATENCY", 17 | "THROUGHPUT", 18 | "CUMULATIVE_THROUGHPUT" 19 | ], 20 | "CPU Options": [ 21 | "Enable Hyperthreading", 22 | "Inference Num Threads", 23 | "Scheduling Core Type" 24 | ], 25 | "Streaming Options": [ 26 | "Num Streams" 27 | ] 28 | } 29 | 30 | # Map topic names to file paths 31 | self.doc_files = { 32 | "LATENCY": "docs/ov_config/performance_hint_latency.md", 33 | "THROUGHPUT": "docs/ov_config/performance_hint_throughput.md", 34 | "CUMULATIVE_THROUGHPUT": "docs/ov_config/performance_hint_cumulative_throughput.md", 35 | "Enable Hyperthreading": "docs/ov_config/enable_hyperthreading.md", 36 | "Inference Num Threads": "docs/ov_config/inference_num_threads.md", 37 | "Num Threads": "docs/ov_config/num_threads.md", 38 | "Num Streams": "docs/ov_config/num_streams.md", 39 | "Scheduling Core Type": "docs/ov_config/scheduling_core_type.md" 40 | } 41 | 42 | def read_markdown_file(self, file_path): 43 | """Read a markdown file and return its contents""" 44 | try: 45 | path = Path(file_path) 46 | if path.exists(): 47 | return path.read_text() 48 | return f"Documentation file not found: {file_path}" 49 | except Exception as e: 50 | return f"Error reading documentation: {str(e)}" 51 | 52 | def display_doc(self, doc_name): 53 | """Display the selected documentation""" 54 | if doc_name in self.doc_files: 55 | return self.read_markdown_file(self.doc_files[doc_name]) 56 | return "Please select a documentation topic from the list." 57 | 58 | def create_interface(self): 59 | with gr.Tab("Documentation"): 60 | with gr.Row(): 61 | gr.Markdown("# OpenArc Documentation") 62 | 63 | with gr.Row(): 64 | # Create columns for layout 65 | nav_col = gr.Column(scale=1) 66 | content_col = gr.Column(scale=3) 67 | 68 | # Create the content markdown component first 69 | with content_col: 70 | doc_content = gr.Markdown( 71 | value=""" 72 | # OpenVINO Configuration Documentation 73 | 74 | Welcome to the OpenArc documentation for OpenVINO configuration options. 75 | This documentation will help you understand how to optimize your model inference using various configuration parameters. 76 | 77 | ## Getting Started 78 | 79 | Select a topic from the navigation panel on the left to view detailed documentation. 
80 | 
81 |                         The configuration options are organized into categories:
82 |                         - **Performance Hints**: Options that control the performance optimization strategy
83 |                         - **CPU Options**: Settings specific to CPU execution
84 |                         - **Streaming Options**: Parameters for controlling inference streams
85 |                         - **Scheduling Options**: Options for thread scheduling and core allocation
86 |                         """
87 |                     )
88 |                     # Store the component for later reference
89 |                     self.doc_components['doc_content'] = doc_content
90 | 
91 |                 # Now create the navigation sidebar with buttons
92 |                 with nav_col:
93 |                     gr.Markdown("## Configuration Options")
94 | 
95 |                     # Create accordions for each category
96 |                     for category, topics in self.doc_categories.items():
97 |                         with gr.Accordion(f"{category} ({len(topics)})", open=True):
98 |                             for topic in topics:
99 |                                 topic_btn = gr.Button(topic, size="sm")
100 |                                 # Set up click handler for each button
101 |                                 topic_btn.click(
102 |                                     fn=self.display_doc,
103 |                                     inputs=[gr.Textbox(value=topic, visible=False)],
104 |                                     outputs=[self.doc_components['doc_content']]
105 |                                 )
--------------------------------------------------------------------------------
/src/frontend/components/loader.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import gradio as gr
3 | import json
4 | 
5 | 
6 | class Optimum_Loader:
7 |     def __init__(self, payload_constructor):
8 |         self.payload_constructor = payload_constructor
9 |         self.components = {}
10 | 
11 |     def read_openvino_config(self, id_model):
12 |         """Read the OpenVINO config file from the model directory and display it in the dashboard as a JSON object.
13 |         This file is generated by the OpenVINO toolkit and contains metadata about the model's configuration and optimizations based on
14 |         how it was converted to the OpenVINO IR.
15 |         """
16 |         try:
17 |             # Build the path to openvino_config.json inside the model directory
18 |             model_path = Path(id_model)
19 |             config_path = model_path / "openvino_config.json"
20 | 
21 |             if config_path.exists():
22 |                 return json.loads(config_path.read_text())
23 |             return {"message": f"No openvino_config.json found in {str(config_path)}"}
24 |         except Exception as e:
25 |             return {"error": f"Error reading config: {str(e)}"}
26 | 
27 |     def read_architecture(self, id_model):
28 |         """Read the architecture file (config.json) from the model directory and display it in the dashboard as a JSON object.
29 |         While not explicitly required for inference, this file contains metadata about the model's architecture
30 |         and can be useful for debugging performance by understanding the model's structure to choose optimization parameters.
31 |         """
32 |         try:
33 |             # Build the path to config.json inside the model directory
34 |             model_path = Path(id_model)
35 |             architecture_path = model_path / "config.json"
36 | 
37 |             if architecture_path.exists():
38 |                 return json.loads(architecture_path.read_text())
39 |             return {"message": f"No config.json found in {str(architecture_path)}"}
40 |         except Exception as e:
41 |             return {"error": f"Error reading architecture: {str(e)}"}
42 | 
43 |     def read_generation_config(self, id_model):
44 |         """Read the generation config file from the model directory and display it in the dashboard as a JSON object.
45 |         This file contains the ground truth of what sampling parameters should be used
46 |         for inference. These values are derived directly from the model's PyTorch metadata and should take precedence when benchmarking.
47 | """ 48 | try: 49 | model_path = Path(id_model) 50 | generation_config_path = model_path / "generation_config.json" 51 | 52 | if generation_config_path.exists(): 53 | return json.loads(generation_config_path.read_text()) 54 | return {"message": f"No generation_config.json found in {str(generation_config_path)}"} 55 | except Exception as e: 56 | return {"error": f"Error reading generation config: {str(e)}"} 57 | 58 | 59 | def create_interface(self): 60 | with gr.Tab("Loader"): 61 | with gr.Row(): 62 | self.load_model_interface() 63 | self.debug_tool() 64 | self.setup_button_handlers() 65 | 66 | def load_model_interface(self): 67 | with gr.Column(min_width=500, scale=1): 68 | # Model Basic Configuration 69 | with gr.Group("Model Configuration"): 70 | self.components.update({ 71 | 'id_model': gr.Textbox( 72 | label="Model Identifier or Path", 73 | placeholder="Enter model identifier or local path", 74 | info="Enter the model's Hugging Face identifier or local path" 75 | ), 76 | 'device': gr.Dropdown( 77 | choices=["", "AUTO", "CPU", "GPU.0", "GPU.1", "GPU.2", "AUTO:GPU.0,GPU.1", "AUTO:GPU.0,GPU.1,GPU.2"], 78 | label="Device", 79 | value="", 80 | info="Select the device for model inference" 81 | ), 82 | 'use_cache': gr.Checkbox( 83 | label="Use Cache", 84 | value=True, 85 | info="Enable cache for stateful models (disable for multi-GPU)" 86 | ), 87 | 'export_model': gr.Checkbox( 88 | label="Export Model", 89 | value=False, 90 | info="Whether to export the model to int8_asym. Default and not recommended." 91 | ), 92 | 'dynamic_shapes': gr.Checkbox( 93 | label="Dynamic Shapes", 94 | value=True, 95 | info="Whether to use dynamic shapes. Default is True. Should only be disabled for NPU inference." 96 | ), 97 | 'model_type': gr.Dropdown( 98 | label="Model Type", 99 | choices=["TEXT", "VISION"], 100 | info="Defines the type of model to load. No default; must be specified." 
101 | ) 102 | }) 103 | 104 | # Token Configuration 105 | with gr.Group("Token Settings"): 106 | self.components.update({ 107 | 'bos_token_id': gr.Textbox( 108 | label="bos_token_id", 109 | value="", 110 | 111 | ), 112 | 'eos_token_id': gr.Textbox( 113 | label="eos_token_id", 114 | value="", 115 | 116 | ), 117 | 'pad_token_id': gr.Textbox( 118 | label="pad_token_id", 119 | value="", 120 | 121 | ) 122 | }) 123 | 124 | # Performance Optimization 125 | with gr.Group("Performance Settings"): 126 | self.components.update({ 127 | 'num_streams': gr.Textbox( 128 | label="Number of Streams", 129 | value="", 130 | placeholder="Leave empty for default", 131 | info="Number of inference streams (optional)" 132 | ), 133 | 'performance_hint': gr.Dropdown( 134 | choices=["", "LATENCY", "THROUGHPUT", "CUMULATIVE_THROUGHPUT"], 135 | label="Performance Hint", 136 | value="", 137 | info="Select performance optimization strategy" 138 | ), 139 | 'inference_precision_hint': gr.Dropdown( 140 | choices=["", "auto", "fp16", "fp32", "int8"], 141 | label="Precision Hint", 142 | value="", 143 | info="Select model precision for computation" 144 | ), 145 | 'enable_hyperthreading': gr.Checkbox( 146 | label="Enable Hyperthreading", 147 | value=True, 148 | info="Enable hyperthreading for CPU inference" 149 | ), 150 | 'inference_num_threads': gr.Textbox( 151 | label="Inference Number of Threads", 152 | value="", 153 | placeholder="Leave empty for default", 154 | info="Number of inference threads (optional)" 155 | ) 156 | }) 157 | 158 | # Action Buttons 159 | with gr.Row(): 160 | self.components.update({ 161 | 'load_button': gr.Button("Load Model", variant="primary"), 162 | 'status_button': gr.Button("Check Status", variant="secondary") 163 | }) 164 | 165 | def debug_tool(self): 166 | with gr.Column(min_width=300, scale=1): 167 | with gr.Accordion("Debug Log", open=True): 168 | self.components['debug_log'] = gr.JSON( 169 | label="Log Output", 170 | value={"message": "Debug information will appear here..."}, 171 | ) 172 | with gr.Accordion("OpenVINO Config", open=False): 173 | self.components['config_viewer'] = gr.JSON( 174 | label="OpenVINO Configuration", 175 | value={"message": "Config will appear here when model path is entered..."}, 176 | ) 177 | with gr.Accordion("Architecture", open=False): 178 | self.components['architecture_viewer'] = gr.JSON( 179 | label="Architecture", 180 | value={"message": "Architecture will appear here when model path is entered..."}, 181 | ) 182 | 183 | with gr.Accordion("Generation Config", open=False): 184 | self.components['generation_config_viewer'] = gr.JSON( 185 | label="Generation Configuration", 186 | value={"message": "Generation config will appear here when model path is entered..."}, 187 | ) 188 | 189 | def setup_button_handlers(self): 190 | self.build_load_request() 191 | 192 | # Add handler for model path changes 193 | self.components['id_model'].change( 194 | fn=self.read_openvino_config, 195 | inputs=[self.components['id_model']], 196 | outputs=[self.components['config_viewer']] 197 | ) 198 | 199 | self.components['id_model'].change( 200 | fn=self.read_architecture, 201 | inputs=[self.components['id_model']], 202 | outputs=[self.components['architecture_viewer']] 203 | ) 204 | 205 | self.components['id_model'].change( 206 | fn=self.read_generation_config, 207 | inputs=[self.components['id_model']], 208 | outputs=[self.components['generation_config_viewer']] 209 | ) 210 | 211 | def build_load_request(self): 212 | self.components['load_button'].click( 213 | 
fn=self.payload_constructor.load_model, 214 | inputs=[ 215 | self.components[key] for key in [ 216 | 'id_model', 'device', 'use_cache', 'export_model', 217 | 'num_streams', 'performance_hint', 'inference_precision_hint', 218 | 'model_type', 219 | 'bos_token_id', 'eos_token_id', 'pad_token_id', 220 | 'enable_hyperthreading', 'inference_num_threads', 'dynamic_shapes' 221 | ] 222 | ], 223 | outputs=[self.components['debug_log']] 224 | ) 225 | 226 | self.components['status_button'].click( 227 | fn=self.payload_constructor.status, 228 | inputs=None, 229 | outputs=[self.components['debug_log']] 230 | ) 231 | -------------------------------------------------------------------------------- /src/frontend/components/model_conversion.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # Import for default cache_dir 3 | 4 | # Dynamically get tasks (approximation, as original script uses TasksManager) 5 | # In a real scenario, this might need a more robust way to get tasks if TasksManager is available 6 | # For now, using the list from the previous file content + info from the script 7 | AVAILABLE_TASKS = [ 8 | 'image-to-image', 'image-segmentation', 'image-text-to-text', 'inpainting', 9 | 'sentence-similarity', 'text-to-audio', 'image-to-text', 10 | 'automatic-speech-recognition', 'token-classification', 'text-to-image', 11 | 'audio-classification', 'feature-extraction', 'semantic-segmentation', 12 | 'masked-im', 'audio-xvector', 'audio-frame-classification', 13 | 'text2text-generation', 'multiple-choice', 'depth-estimation', 14 | 'image-classification', 'fill-mask', 'zero-shot-object-detection', 15 | 'object-detection', 'question-answering', 'zero-shot-image-classification', 16 | 'mask-generation', 'text-generation', 'text-classification', 17 | 'text-generation-with-past' 18 | ] 19 | 20 | class ConversionTool: 21 | def __init__(self): 22 | 23 | self.model_input = gr.Textbox( 24 | label='Model', 25 | placeholder='Model ID on huggingface.co or path on disk', 26 | info="Model ID on huggingface.co or path on disk to load model from." # Updated info 27 | ) 28 | 29 | self.output_path = gr.Textbox( 30 | label='Output Directory', 31 | placeholder='Path to store the generated OV model', 32 | info="Path indicating the directory where to store the generated OV model." # Updated info 33 | ) 34 | 35 | self.task = gr.Dropdown( 36 | label='Task', 37 | choices=['auto'] + AVAILABLE_TASKS, 38 | value='auto', # Default value is 'auto' 39 | info=( # Updated info 40 | "The task to export the model for. If not specified, the task will be auto-inferred based on metadata in the model repository." 41 | 42 | ) 43 | ) 44 | 45 | self.framework = gr.Dropdown( 46 | label='Framework', 47 | choices=[None, 'pt', 'tf'], # Added None option 48 | value=None, 49 | info=( # Updated info 50 | "The framework to use for the export. If not provided, will attempt to use the local checkpoint's " 51 | "original framework or what is available in the environment." 52 | ) 53 | ) 54 | 55 | self.trust_remote_code = gr.Checkbox( # Added trust_remote_code 56 | label='Trust Remote Code', 57 | value=False, 58 | info=( 59 | "Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which " 60 | "you have read the code, as it will execute on your local machine arbitrary code present in the model repository." 
61 | ) 62 | ) 63 | 64 | self.weight_format = gr.Dropdown( 65 | label='Weight Format', 66 | choices=['fp32', 'fp16', 'int8', 'int4', 'mxfp4', 'nf4'], # Added None option 67 | value=None, 68 | info="The weight format of the exported model." # Updated info 69 | ) 70 | 71 | self.quant_mode = gr.Dropdown( # Added quant_mode 72 | label='Quantization Mode', 73 | choices=[None, 'int8', 'f8e4m3', 'f8e5m2', 'nf4_f8e4m3', 'nf4_f8e5m2', 'int4_f8e4m3', 'int4_f8e5m2'], 74 | value=None, 75 | info=( 76 | "Quantization precision mode. This is used for applying full model quantization including activations. " 77 | ) 78 | ) 79 | 80 | self.library = gr.Dropdown( 81 | label='Library', 82 | choices=[ 83 | None, # Added None option 84 | 'transformers', 85 | 'diffusers', 86 | 'timm', 87 | 'sentence_transformers', 88 | 'open_clip' 89 | ], 90 | value=None, # Default is None, inferred later 91 | info="The library used to load the model before export. If not provided, will attempt to infer the local checkpoint's library" # Updated info 92 | ) 93 | 94 | self.cache_dir = gr.Textbox( # Added cache_dir 95 | label='Cache Directory', 96 | placeholder=f'Default: {HUGGINGFACE_HUB_CACHE}', # Use imported default 97 | value=None, # Default to None, let the script handle the default path 98 | info="The path to a directory in which the downloaded model should be cached if the standard cache should not be used." 99 | ) 100 | 101 | self.pad_token_id = gr.Number( # Added pad_token_id 102 | label='Pad Token ID', 103 | value=None, 104 | step=1, 105 | info=( 106 | "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it." 107 | ) 108 | ) 109 | 110 | self.variant = gr.Textbox( # Added variant 111 | label='Variant', 112 | value=None, 113 | info="If specified load weights from variant filename." 114 | ) 115 | 116 | self.ratio = gr.Number( 117 | label='Ratio', 118 | value=None, # Default is None 119 | minimum=0.0, 120 | maximum=1.0, # Max is 1.0 according to help text 121 | step=0.1, 122 | info=( # Updated info 123 | "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 " 124 | "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. " 125 | "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied." 126 | ) 127 | ) 128 | 129 | self.sym = gr.Checkbox( # Moved sym higher to group with quantization params 130 | label='Symmetric Quantization', 131 | value=None, # Default is None in script 132 | info=("Whether to apply symmetric quantization") # Updated info 133 | ) 134 | 135 | self.group_size = gr.Number( 136 | label='Group Size', 137 | value=None, # Default is None 138 | step=1, 139 | info=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.") # Updated info 140 | ) 141 | 142 | self.backup_precision = gr.Dropdown( 143 | label='Backup Precision', 144 | choices=[None, 'none', 'int8_sym', 'int8_asym'], # Added None and 'none' 145 | value=None, # Default is None 146 | info=( # Updated info 147 | "Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight formats. " 148 | "If not provided, backup precision is int8_asym. 
'none' stands for original floating-point precision of " 149 | "the model weights, in this case weights are retained in their original precision without any " 150 | "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' " 151 | "stands for 8-bit integer asymmetric quantization with zero points per each quantization group." 152 | ) 153 | ) 154 | 155 | self.dataset = gr.Dropdown( 156 | label='Dataset', 157 | choices=[None, # Added None option 158 | 'auto', 159 | 'wikitext2', 160 | 'c4', 161 | 'c4-new', 162 | 'contextual', 163 | 'conceptual_captions', 164 | 'laion/220k-GPT4Vision-captions-from-LIVIS', 165 | 'laion/filtered-wit'], 166 | value=None, 167 | info=( # Updated info 168 | "The dataset used for data-aware compression or quantization with NNCF. " 169 | "For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the " 170 | "dataset will be collected from model's generations. " 171 | "For diffusion models it should be on of ['conceptual_captions'," 172 | "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. " 173 | "For visual language models the dataset must be set to 'contextual'. " 174 | "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or " 175 | "equals 1.0, the dataset argument will not have an effect on the resulting model." 176 | ) 177 | ) 178 | 179 | self.all_layers = gr.Checkbox( 180 | label='All Layers', 181 | value=None, # Default is None in script 182 | info=( # Updated info 183 | "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight " 184 | "compression is applied, they are compressed to INT8." 185 | ) 186 | ) 187 | 188 | self.awq = gr.Checkbox( 189 | label='AWQ', 190 | value=None, # Default is None in script 191 | info=( # Updated info 192 | "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires " 193 | "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset " 194 | "argument. Note: it is possible that there will be no matching patterns in the model to apply AWQ, in such " 195 | "case it will be skipped." 196 | ) 197 | ) 198 | 199 | self.scale_estimation = gr.Checkbox( # Added scale_estimation 200 | label='Scale Estimation', 201 | value=None, # Default is None in script 202 | info=( 203 | "Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original " 204 | "and compressed layers. Providing a dataset is required to run scale estimation. Please note, that " 205 | "applying scale estimation takes additional memory and time." 206 | ) 207 | ) 208 | 209 | self.gptq = gr.Checkbox( # Added gptq 210 | label='GPTQ', 211 | value=None, # Default is None in script 212 | info=( 213 | "Indicates whether to apply GPTQ algorithm that optimizes compressed weights in a layer-wise fashion to " 214 | "minimize the difference between activations of a compressed and original layer. Please note, that " 215 | "applying GPTQ takes additional memory and time." 216 | ) 217 | ) 218 | 219 | self.lora_correction = gr.Checkbox( # Added lora_correction 220 | label='LoRA Correction', 221 | value=None, # Default is None in script 222 | info=( 223 | "Indicates whether to apply LoRA Correction algorithm. 
When enabled, this algorithm introduces low-rank " 224 | "adaptation layers in the model that can recover accuracy after weight compression at some cost of " 225 | "inference latency. Please note, that applying LoRA Correction algorithm takes additional memory and time." 226 | ) 227 | ) 228 | 229 | self.sensitivity_metric = gr.Dropdown( # Added sensitivity_metric 230 | label='Sensitivity Metric', 231 | choices=[None, 'weight_quantization_error', 'hessian_input_activation', 232 | 'mean_activation_variance', 'max_activation_variance', 'mean_activation_magnitude'], 233 | value=None, 234 | info=( 235 | "The sensitivity metric for assigning quantization precision to layers. It can be one of the following: " 236 | "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', " 237 | "'max_activation_variance', 'mean_activation_magnitude']." 238 | ) 239 | ) 240 | 241 | self.num_samples = gr.Number( # Added num_samples 242 | label='Number of Samples', 243 | value=None, 244 | step=1, 245 | info="The maximum number of samples to take from the dataset for quantization." # Updated info 246 | ) 247 | 248 | self.disable_stateful = gr.Checkbox( 249 | label='Disable Stateful', 250 | value=False, # Default is False (stateful is enabled by default) 251 | info=( # Updated info 252 | "Disable stateful converted models, stateless models will be generated instead. Stateful models are produced by default when this key is not used. " 253 | "In stateful models all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. " 254 | "If --disable-stateful option is used, it may result in sub-optimal inference performance. " 255 | "Use it when you intentionally want to use a stateless model, for example, to be compatible with existing " 256 | "OpenVINO native inference code that expects KV-cache inputs and outputs in the model." 257 | ) 258 | ) 259 | 260 | self.disable_convert_tokenizer = gr.Checkbox( 261 | label='Disable Convert Tokenizer', 262 | value=False, # Default is False (conversion is enabled by default) 263 | info="Do not add converted tokenizer and detokenizer OpenVINO models." # Updated info 264 | ) 265 | 266 | self.smooth_quant_alpha = gr.Number( # Added smooth_quant_alpha 267 | label='Smooth Quant Alpha', 268 | value=None, 269 | minimum=0.0, 270 | maximum=1.0, 271 | step=0.1, 272 | info=( 273 | "SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and " 274 | "reduces quantization error. Valid only when activations quantization is enabled." 
275 |             )
276 |         )
277 | 
278 |         self.command_output = gr.TextArea(
279 |             label='Generated Command',
280 |             placeholder='Generated command will appear here...',
281 |             show_label=True,
282 |             show_copy_button=True,
283 |             lines=5 # Adjust height
284 |         )
285 | 
286 |     def construct_command(self, model_input, output_path, task, framework, trust_remote_code, # Added trust_remote_code
287 |                           weight_format, quant_mode, library, cache_dir, pad_token_id, variant, # Added new args
288 |                           ratio, sym, group_size, backup_precision, dataset, all_layers, # Added sym
289 |                           awq, scale_estimation, gptq, lora_correction, sensitivity_metric, num_samples, # Added new args
290 |                           disable_stateful, disable_convert_tokenizer, smooth_quant_alpha): # Added smooth_quant_alpha
291 |         """Construct the command string"""
292 |         if not model_input or not output_path:
293 |             return ''
294 | 
295 |         cmd_parts = ['optimum-cli export openvino']
296 |         cmd_parts.append(f'-m "{model_input}"')
297 | 
298 |         if task and task != 'auto':
299 |             cmd_parts.append(f'--task {task}')
300 | 
301 |         if framework:
302 |             cmd_parts.append(f'--framework {framework}')
303 | 
304 |         if trust_remote_code: # Added trust_remote_code flag
305 |             cmd_parts.append('--trust-remote-code')
306 | 
307 |         if weight_format: # Check if not None/empty
308 |             cmd_parts.append(f'--weight-format {weight_format}')
309 | 
310 |         if quant_mode: # Added quant_mode
311 |             cmd_parts.append(f'--quant-mode {quant_mode}')
312 | 
313 |         if library: # Check if not None/empty
314 |             cmd_parts.append(f'--library {library}')
315 | 
316 |         if cache_dir: # Added cache_dir
317 |             cmd_parts.append(f'--cache_dir "{cache_dir}"')
318 | 
319 |         if pad_token_id is not None: # Explicit None check so a pad token ID of 0 is still passed
320 |             cmd_parts.append(f'--pad-token-id {int(pad_token_id)}') # Ensure int
321 | 
322 |         if variant: # Added variant
323 |             cmd_parts.append(f'--variant "{variant}"')
324 | 
325 |         # Compression/Quantization specific args
326 |         if ratio is not None: # Check for None explicitly
327 |             cmd_parts.append(f'--ratio {ratio}')
328 | 
329 |         if sym: # Only add the flag when explicitly enabled
330 |             cmd_parts.append('--sym')
331 | 
332 |         if group_size is not None: # Check for None explicitly (-1 means per-column quantization)
333 |             cmd_parts.append(f'--group-size {int(group_size)}') # Ensure int
334 | 
335 |         if backup_precision: # Check if not None/empty
336 |             cmd_parts.append(f'--backup-precision {backup_precision}')
337 | 
338 |         if dataset: # Check if not None/empty
339 |             cmd_parts.append(f'--dataset {dataset}')
340 | 
341 |         if all_layers: # Only add the flag when explicitly enabled
342 |             cmd_parts.append('--all-layers')
343 | 
344 |         if awq: # Only add the flag when explicitly enabled
345 |             cmd_parts.append('--awq')
346 | 
347 |         if scale_estimation: # Added scale_estimation flag
348 |             cmd_parts.append('--scale-estimation')
349 | 
350 |         if gptq: # Added gptq flag
351 |             cmd_parts.append('--gptq')
352 | 
353 |         if lora_correction: # Added lora_correction flag
354 |             cmd_parts.append('--lora-correction')
355 | 
356 |         if sensitivity_metric: # Added sensitivity_metric
357 |             cmd_parts.append(f'--sensitivity-metric {sensitivity_metric}')
358 | 
359 |         if num_samples: # Added num_samples
360 |             cmd_parts.append(f'--num-samples {int(num_samples)}') # Ensure int
361 | 
362 |         if smooth_quant_alpha is not None: # Explicit None check so an alpha of 0.0 is still passed
363 |             cmd_parts.append(f'--smooth-quant-alpha {smooth_quant_alpha}')
364 | 
365 |         # Other boolean flags
366 |         if disable_stateful: # Default is False, only add if True
367 |             cmd_parts.append('--disable-stateful')
368 |         if disable_convert_tokenizer: # Default is False, only add if True
369 | 
cmd_parts.append('--disable-convert-tokenizer') 370 | 371 | # Output path is always last and required 372 | cmd_parts.append(f'"{output_path}"') 373 | 374 | constructed_command = ' '.join(cmd_parts) 375 | return constructed_command 376 | 377 | def gradio_app(self): 378 | """Create and run the Gradio interface.""" 379 | # Define inputs in the order they appear visually (or logically) 380 | inputs = [ 381 | self.model_input, 382 | self.output_path, 383 | self.task, 384 | self.framework, 385 | self.trust_remote_code, # Added 386 | self.weight_format, 387 | self.quant_mode, # Added 388 | self.library, 389 | self.cache_dir, # Added 390 | self.pad_token_id, # Added 391 | self.variant, # Added 392 | # Quantization/Compression Group 393 | self.ratio, 394 | self.sym, # Added 395 | self.group_size, 396 | self.backup_precision, 397 | self.dataset, 398 | self.all_layers, 399 | self.awq, 400 | self.scale_estimation, # Added 401 | self.gptq, # Added 402 | self.lora_correction, # Added 403 | self.sensitivity_metric, # Added 404 | self.num_samples, # Added 405 | self.smooth_quant_alpha, # Added 406 | # Other Flags 407 | self.disable_stateful, 408 | self.disable_convert_tokenizer, 409 | ] 410 | interface = gr.Interface( 411 | fn=self.construct_command, 412 | inputs=inputs, 413 | outputs=self.command_output, 414 | title="OpenVINO IR Model Conversion Tool", 415 | description=""" 416 | Enter model information to generate an `optimum-cli export openvino` command. 417 | Use the arguments below to configure the export process based on the OpenVINO exporter documentation. 418 | Then run the generated command in the terminal where your OpenArc environment is activated. 419 | """, 420 | flagging_mode='auto' # Keep or remove based on preference 421 | ) 422 | 423 | return interface 424 | 425 | 426 | # if __name__ == "__main__": 427 | # tool = ConversionTool() 428 | # app = tool.gradio_app() 429 | # app.launch(share=False) 430 | -------------------------------------------------------------------------------- /src/frontend/components/model_manager.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import pandas as pd 3 | 4 | 5 | class ModelManager: 6 | def __init__(self, payload_constructor): 7 | self.payload_constructor = payload_constructor 8 | self.components = {} 9 | 10 | def _refresh_models(self): 11 | """Helper function to fetch and format loaded models data""" 12 | response, _ = self.payload_constructor.status() 13 | 14 | if "error" in response: 15 | return pd.DataFrame(), "Error fetching model status" 16 | 17 | loaded_models = response.get("loaded_models", {}) 18 | total_models = response.get("total_models_loaded", 0) 19 | 20 | # Format data for two-column DataFrame 21 | model_data = [] 22 | for model_name, model_info in loaded_models.items(): 23 | metadata = model_info.get("model_metadata", {}) 24 | 25 | # Add model header row 26 | model_data.append({"Attribute": f"{model_name}", "Value": ""}) 27 | 28 | # Add all model attributes 29 | model_data.append({"Attribute": "Status", "Value": model_info.get("status", "")}) 30 | model_data.append({"Attribute": "Device", "Value": model_info.get("device", "")}) 31 | model_data.append({"Attribute": "Path", "Value": metadata.get("id_model", "")}) 32 | model_data.append({"Attribute": "use_cache", "Value": metadata.get("use_cache", "")}) 33 | model_data.append({"Attribute": "dynamic_shapes", "Value": metadata.get("dynamic_shapes", "")}) 34 | model_data.append({"Attribute": "pad_token_id", "Value": 
metadata.get("pad_token_id", "")}) 35 | model_data.append({"Attribute": "eos_token_id", "Value": metadata.get("eos_token_id", "")}) 36 | model_data.append({"Attribute": "bos_token_id", "Value": metadata.get("bos_token_id", "")}) 37 | model_data.append({"Attribute": "is_vision_model", "Value": metadata.get("is_vision_model", "")}) 38 | model_data.append({"Attribute": "is_text_model", "Value": metadata.get("is_text_model", "")}) 39 | model_data.append({"Attribute": "NUM_STREAMS", "Value": metadata.get("NUM_STREAMS", "")}) 40 | model_data.append({"Attribute": "PERFORMANCE_HINT", "Value": metadata.get("PERFORMANCE_HINT", "")}) 41 | model_data.append({"Attribute": "PRECISION_HINT", "Value": metadata.get("PRECISION_HINT", "")}) 42 | model_data.append({"Attribute": "ENABLE_HYPER_THREADING", "Value": metadata.get("ENABLE_HYPER_THREADING", "")}) 43 | model_data.append({"Attribute": "INFERENCE_NUM_THREADS", "Value": metadata.get("INFERENCE_NUM_THREADS", "")}) 44 | model_data.append({"Attribute": "SCHEDULING_CORE_TYPE", "Value": metadata.get("SCHEDULING_CORE_TYPE", "")}) 45 | 46 | # Add empty row between models 47 | model_data.append({"Attribute": "", "Value": ""}) 48 | 49 | df = pd.DataFrame(model_data) 50 | status_text = f"Total Models Loaded: {total_models}" 51 | return df, status_text 52 | 53 | def _unload_model(self): 54 | """Helper function to unload a model""" 55 | response, _ = self.payload_constructor.unload_model() 56 | return response 57 | 58 | def _unload_model_ui(self, model_id): 59 | """Helper function to handle model unloading""" 60 | _, status_msg = self.payload_constructor.unload_model(model_id) 61 | return status_msg 62 | 63 | def create_interface(self): 64 | with gr.Tab("Model Manager"): 65 | gr.Markdown("## Model Management Interface") 66 | 67 | with gr.Row(): 68 | refresh_btn = gr.Button("Refresh Loaded Models") 69 | status_text = gr.Textbox(label="Status", interactive=False) 70 | 71 | model_table = gr.DataFrame( 72 | headers=["Attribute", "Value"], 73 | datatype=["str", "str"], 74 | interactive=False, 75 | wrap=True, 76 | ) 77 | 78 | refresh_btn.click( 79 | fn=self._refresh_models, 80 | outputs=[model_table, status_text] 81 | ) 82 | 83 | with gr.Row(): 84 | model_id_input = gr.Textbox(label="Model ID to Unload") 85 | unload_btn = gr.Button("Unload Model") 86 | unload_status = gr.Textbox(label="Unload Status", interactive=False) 87 | 88 | unload_btn.click( 89 | fn=self._unload_model_ui, 90 | inputs=model_id_input, 91 | outputs=unload_status 92 | ) 93 | -------------------------------------------------------------------------------- /src/frontend/tools/device_query.py: -------------------------------------------------------------------------------- 1 | # Diagnostic Device Query 2 | 3 | import openvino as ov 4 | import logging as log 5 | 6 | class DeviceDiagnosticQuery: 7 | def __init__(self): 8 | self.core = ov.Core() 9 | self.available_devices = self.core.available_devices 10 | 11 | def get_available_devices(self): 12 | """Returns a list of available OpenVINO devices.""" 13 | return self.available_devices 14 | 15 | #if __name__ == "__main__": 16 | # device_query = DeviceDiagnosticQuery() 17 | # print(device_query.get_available_devices()) 18 | 19 | 20 | # Device Query: 21 | # Taken from https://github.com/openvinotoolkit/openvino/blob/master/samples/python/hello_query_device/hello_query_device.py 22 | 23 | class DeviceDataQuery: 24 | def __init__(self): 25 | self.core = ov.Core() 26 | 27 | @staticmethod 28 | def param_to_string(parameters) -> str: 29 | """Convert a list / tuple of 
parameters returned from OV to a string.""" 30 | if isinstance(parameters, (list, tuple)): 31 | return ', '.join([str(x) for x in parameters]) 32 | return str(parameters) 33 | 34 | def get_available_devices(self) -> list: 35 | """Return list of available devices.""" 36 | return self.core.available_devices 37 | 38 | def get_device_properties(self, device: str) -> dict: 39 | """Get all properties for a specific device.""" 40 | properties = {} 41 | supported_properties = self.core.get_property(device, 'SUPPORTED_PROPERTIES') 42 | 43 | for property_key in supported_properties: 44 | if property_key != 'SUPPORTED_PROPERTIES': 45 | try: 46 | property_val = self.core.get_property(device, property_key) 47 | properties[property_key] = self.param_to_string(property_val) 48 | except TypeError: 49 | properties[property_key] = 'UNSUPPORTED TYPE' 50 | return properties 51 | 52 | def print_device_info(self): 53 | """Print information about all available devices.""" 54 | log.info('Available devices:') 55 | for device in self.get_available_devices(): 56 | log.info(f'{device} :') 57 | log.info('\tSUPPORTED_PROPERTIES:') 58 | 59 | properties = self.get_device_properties(device) 60 | for key, value in properties.items(): 61 | log.info(f'\t\t{key}: {value}') 62 | log.info('') 63 | 64 | #def main(): 65 | # log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) 66 | # query = DeviceDataQuery() 67 | # query.print_device_info() 68 | # return 0 69 | 70 | #if __name__ == '__main__': 71 | # sys.exit(main()) -------------------------------------------------------------------------------- /src/frontend/tools/payload_constructor.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional 3 | import requests 4 | import os 5 | 6 | 7 | # Default OpenARC URL 8 | DEFAULT_OPENARC_PORT = 8000 9 | OPENARC_URL = f"http://localhost:{DEFAULT_OPENARC_PORT}" 10 | 11 | # Update URL if custom port is provided 12 | def update_openarc_url(openarc_port=DEFAULT_OPENARC_PORT): 13 | global OPENARC_URL 14 | OPENARC_URL = f"http://localhost:{openarc_port}" 15 | 16 | def get_auth_headers(): 17 | """Get authorization headers with bearer token if available""" 18 | headers = {"Content-Type": "application/json"} 19 | api_key = os.environ.get("OPENARC_API_KEY") 20 | if api_key: 21 | headers["Authorization"] = f"Bearer {api_key}" 22 | return headers 23 | 24 | 25 | class LoadModelConfig(BaseModel): 26 | id_model: str 27 | use_cache: bool 28 | device: str 29 | export_model: bool 30 | model_type: str 31 | eos_token_id: Optional[int] = None 32 | pad_token_id: Optional[int] = None 33 | bos_token_id: Optional[int] = None 34 | dynamic_shapes: bool = True 35 | 36 | class OVConfig(BaseModel): 37 | NUM_STREAMS: Optional[str] = None 38 | PERFORMANCE_HINT: Optional[str] = None 39 | ENABLE_HYPERTHREADING: Optional[bool] = None 40 | INFERENCE_NUM_THREADS: Optional[str] = None 41 | INFERENCE_PRECISION_HINT: Optional[str] = None 42 | 43 | class Payload_Constructor: 44 | def __init__(self): 45 | self.generation_config = {} 46 | 47 | def load_model(self, id_model, device, use_cache, export_model, num_streams, performance_hint, inference_precision_hint, model_type, bos_token_id, eos_token_id, pad_token_id, enable_hyperthreading, inference_num_threads, dynamic_shapes): 48 | """ 49 | Constructs and sends the load model request based on UI inputs 50 | 51 | Args: 52 | id_model (str): Model identifier or path 53 | device (str): Device selection for 
inference 54 | use_cache (bool): Whether to use cache 55 | model_type (str): Defines the type of model to load 56 | export_model (bool): Whether to export the model 57 | num_streams (str): Number of inference streams 58 | performance_hint (str): Performance optimization strategy 59 | INFERENCE_PRECISION_HINT (str): Model precision for computation 60 | bos_token_id (str): BOS token ID 61 | eos_token_id (str): EOS token ID 62 | pad_token_id (str): PAD token ID 63 | enable_hyperthreading (bool): Whether to enable hyperthreading 64 | inference_num_threads (str): Number of inference threads 65 | dynamic_shapes (bool): Whether to use dynamic shapes 66 | """ 67 | 68 | # Create validated load_config 69 | load_config = LoadModelConfig( 70 | id_model=id_model, 71 | use_cache=use_cache, 72 | device=device, 73 | export_model=export_model, 74 | model_type=model_type, 75 | eos_token_id=int(eos_token_id) if eos_token_id else None, 76 | pad_token_id=int(pad_token_id) if pad_token_id else None, 77 | bos_token_id=int(bos_token_id) if bos_token_id else None, 78 | dynamic_shapes=dynamic_shapes 79 | ) 80 | 81 | # Create validated ov_config 82 | ov_config = OVConfig( 83 | NUM_STREAMS=num_streams if num_streams else None, 84 | PERFORMANCE_HINT=performance_hint if performance_hint else None, 85 | ENABLE_HYPERTHREADING=enable_hyperthreading, 86 | INFERENCE_NUM_THREADS=inference_num_threads if inference_num_threads else None, 87 | INFERENCE_PRECISION_HINT=inference_precision_hint if inference_precision_hint else None 88 | ) 89 | 90 | try: 91 | response = requests.post( 92 | f"{OPENARC_URL}/optimum/model/load", 93 | headers=get_auth_headers(), 94 | json={ 95 | "load_config": load_config.model_dump(exclude_none=True), 96 | "ov_config": ov_config.model_dump(exclude_none=True) 97 | } 98 | ) 99 | response.raise_for_status() 100 | return response.json(), f"Model loaded successfully: {response.json()}" 101 | except requests.exceptions.RequestException as e: 102 | return {"error": f"Request failed: {str(e)}"}, f"Error loading model: {str(e)}" 103 | 104 | def unload_model(self, model_id: str): 105 | """ 106 | Sends an unload model request to the API 107 | 108 | Args: 109 | model_id (str): The ID of the model to unload 110 | """ 111 | try: 112 | response = requests.delete( 113 | f"{OPENARC_URL}/optimum/model/unload", 114 | headers=get_auth_headers(), 115 | params={"model_id": model_id} 116 | ) 117 | response.raise_for_status() 118 | return response.json(), f"Model {model_id} unloaded successfully: {response.json()}" 119 | except requests.exceptions.RequestException as e: 120 | return {"error": f"Request failed: {str(e)}"}, f"Error unloading model {model_id}: {str(e)}" 121 | 122 | def status(self): 123 | """ 124 | Checks the server status 125 | """ 126 | try: 127 | response = requests.get( 128 | f"{OPENARC_URL}/optimum/status", 129 | headers=get_auth_headers() 130 | ) 131 | response.raise_for_status() 132 | return response.json(), f"Server status: {response.json()}" 133 | except requests.exceptions.RequestException as e: 134 | return {"error": f"Request failed: {str(e)}"}, f"Error checking server status: {str(e)}" 135 | -------------------------------------------------------------------------------- /start_dashboard.py: -------------------------------------------------------------------------------- 1 | # OpenArc/start_dashboard.py 2 | import argparse 3 | 4 | import gradio as gr 5 | 6 | from src.frontend.components.device_info import DeviceInfoTool 7 | from src.frontend.components.model_conversion import ConversionTool 8 | 
from src.frontend.components.documentation import OpenArc_Documentation 9 | from src.frontend.components.loader import Optimum_Loader 10 | from src.frontend.components.model_manager import ModelManager 11 | from src.frontend.tools.payload_constructor import ( 12 | Payload_Constructor, 13 | update_openarc_url, 14 | ) 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser(description="Start the OpenVINO Chat Dashboard") 18 | 19 | parser.add_argument("--openarc-port", type=int, default=8000, 20 | help="Port for the OpenARC server (default: 8000)") 21 | 22 | args = parser.parse_args() 23 | # Update OpenARC URL with the provided port 24 | update_openarc_url(args.openarc_port) 25 | 26 | # Create the dashboard components 27 | payload_constructor = Payload_Constructor() 28 | 29 | # Set up the Gradio interface 30 | with gr.Blocks(title="OpenARC Dashboard") as demo: 31 | with gr.Tabs(): 32 | # Main tabs 33 | optimum_loader = Optimum_Loader(payload_constructor) 34 | optimum_loader.create_interface() 35 | 36 | model_manager = ModelManager(payload_constructor) 37 | model_manager.create_interface() 38 | 39 | # Tools tab with sub-tabs 40 | with gr.Tab("Tools"): 41 | with gr.Tabs(): 42 | with gr.Tab("Model Conversion"): 43 | conversion_tool = ConversionTool() 44 | conversion_tool.gradio_app() 45 | 46 | # Device Information tab 47 | device_info_tool = DeviceInfoTool() 48 | device_info_tool.create_interface() 49 | 50 | # Documentation tab 51 | documentation = OpenArc_Documentation() 52 | documentation.create_interface() 53 | 54 | # Launch the dashboard 55 | demo.launch() 56 | -------------------------------------------------------------------------------- /start_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from src.api.launcher import start_server 3 | 4 | if __name__ == "__main__": 5 | parser = argparse.ArgumentParser(description="Start the OpenVINO Inference API server") 6 | parser.add_argument("--host", type=str, default="0.0.0.0", 7 | help="Host to bind the server to (default: 0.0.0.0)") 8 | parser.add_argument("--openarc-port", type=int, default=8000, 9 | help="Port to bind the server to (default: 8000)") 10 | args = parser.parse_args() 11 | start_server(host=args.host, openarc_port=args.openarc_port) --------------------------------------------------------------------------------
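For reference, a minimal sketch of launching the API server programmatically instead of through the CLI wrapper above; it reuses the start_server entry point and the argparse defaults shown in start_server.py (host 0.0.0.0, port 8000):

    # Equivalent to: python start_server.py --host 0.0.0.0 --openarc-port 8000
    from src.api.launcher import start_server

    start_server(host="0.0.0.0", openarc_port=8000)

The dashboard is started separately (python start_dashboard.py --openarc-port 8000) and talks to this server over HTTP, attaching the OPENARC_API_KEY environment variable as a bearer token when it is set.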