├── .gitignore
├── LICENSE
├── README.md
├── docs
│   ├── model_conversion
│   │   ├── data_types.md
│   │   └── introduction.md
│   ├── ov_config
│   │   ├── _README.md
│   │   ├── enable_hyperthreading.md
│   │   ├── inference_num_threads.md
│   │   ├── num_streams.md
│   │   ├── performance_hint_cumulative_throughput.md
│   │   ├── performance_hint_latency.md
│   │   ├── performance_hint_throughput.md
│   │   └── scheduling_core_type.md
│   └── tools
│       ├── ov_device_query.md
│       └── working_with_intel_devices.md
├── environment.yaml
├── scripts
│   ├── benchmark
│   │   └── ov_simple_text_bench.py
│   ├── examples
│   │   ├── dedication.png
│   │   ├── optimum_decoder.py
│   │   ├── ov_speculative_decoder_bench.py
│   │   ├── ov_text_model_card.py
│   │   ├── ov_vision.ipynb
│   │   └── ov_vision_model_card.py
│   └── requests
│       ├── load_model.sh
│       ├── load_vision.sh
│       ├── openai_like_completion.sh
│       ├── openai_like_models.sh
│       ├── status.sh
│       └── unload_model.sh
├── src
│   ├── api
│   │   ├── __pycache__
│   │   │   ├── optimum_api.cpython-311.pyc
│   │   │   └── optimum_api.cpython-312.pyc
│   │   ├── launcher.py
│   │   └── optimum_api.py
│   ├── engine
│   │   ├── __init__.py
│   │   ├── optimum
│   │   │   ├── __init__.py
│   │   │   ├── __pycache__
│   │   │   │   ├── optimum_inference_core.cpython-311.pyc
│   │   │   │   └── optimum_inference_core.cpython-312.pyc
│   │   │   ├── optimum_base_config.py
│   │   │   ├── optimum_image2text.py
│   │   │   ├── optimum_seq2seq.py
│   │   │   ├── optimum_speech.py
│   │   │   ├── optimum_text2image.py
│   │   │   └── optimum_text2text.py
│   │   └── ov_genai
│   │       ├── __init__.py
│   │       ├── base_configuration.py
│   │       ├── llm_pipe_core.py
│   │       ├── multimodal_pipe_core.py
│   │       ├── txt2img_pipe_core.py
│   │       └── whisper_pipe_core.py
│   └── frontend
│       ├── __init__.py
│       ├── components
│       │   ├── device_info.py
│       │   ├── documentation.py
│       │   ├── loader.py
│       │   ├── model_conversion.py
│       │   └── model_manager.py
│       └── tools
│           ├── device_query.py
│           └── payload_constructor.py
├── start_dashboard.py
└── start_server.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python bytecode and cache files
2 | __pycache__/
3 | *.py[cod]
4 | *.pyo
5 | *.pyd
6 | *.so
7 | *.pyc
8 | *.egg-info/
9 | *.egg
10 | .pytest_cache/
11 | .mypy_cache/
12 | .obsidian/
13 | .gradio/
14 | .venv/
15 | .vscode/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Welcome to OpenARC
2 |
3 | [Discord](https://discord.gg/Bzz9hax9Jq)
4 |
5 |
6 |
7 | > [!NOTE]
8 | > OpenArc is under active development. Expect breaking changes.
9 |
10 | **OpenArc** is an inference engine built with Optimum-Intel to leverage hardware acceleration on Intel CPUs, GPUs and NPUs through the OpenVINO runtime, integrating closely with Hugging Face Transformers.
11 |
12 | Under the hood OpenArc hosts a webserver over a growing collection of Transformers-integrated AutoModel classes from Optimum-Intel. These enable accelerated inference on a wide range of tasks, models and source frameworks.
13 |
14 | ## Currently implemented
15 |
16 | OpenArc currently supports text generation and text generation with vision. Support for speculative decoding, generating embeddings, speech tasks, image generation, PaddleOCR, and other tasks is planned.
17 |
18 | Currently implemented:
19 |
20 | [OVModelForCausalLM](https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/modeling_decoder.py#L422)
21 |
22 | [OVModelForVisualCausalLM](https://github.com/huggingface/optimum-intel/blob/main/optimum/intel/openvino/modeling_visual_language.py#L309)
23 |
24 | OpenArc enables a similar workflow to what's possible with Ollama, LM-Studio or OpenRouter but with hardware acceleration from OpenVINO C++ runtime.
25 |
26 | ## Features
27 |
28 | - OpenAI compatible endpoints
29 | - Validated OpenWebUI support, but it should work elsewhere
30 | - Load multiple vision/text models concurrently on multiple devices for hotswap/multi agent workflows
31 | - **Most** HuggingFace text generation models
32 | - Growing set of vision capable LLMs:
33 | - Qwen2-VL
34 | - Qwen2.5-VL
35 | - Gemma 3
36 | ### Gradio management dashboard
37 | - Load models with OpenVINO optimizations
38 | - Build conversion commands
39 | - See loaded models and chosen optimizations
40 | - Unload models and view their metadata
41 | - Query detected devices
42 | - Query device properties
43 | - View tokenizer data
44 | - View architecture metadata from config.json
45 | ### Performance metrics on every completion
46 | - ttft: time to generate the first token
47 | - generation_time: time to generate the whole response
48 | - number of tokens: total generated tokens for that request
49 | - tokens per second: measures throughput
50 | - average token latency: helpful for optimizing zero-shot classification tasks
51 |
52 | ## System Requirements
53 |
54 | OpenArc is built on top of the OpenVINO runtime; as a result it supports the same range of hardware **but requires device-specific drivers**, which this document will not cover in depth.
55 |
56 | Supported operating systems are a bit different for each class of device. Please review the [system requirements](https://docs.openvino.ai/2025/about-openvino/release-notes-openvino/system-requirements.html#cpu) for OpenVINO 2025.1.0 to learn which:
57 |
58 | - Windows versions are supported
59 | - Linux distributions are supported
60 | - kernel versions are required
61 |   - My system uses version 6.9.4-060904-generic with Ubuntu 24.04 LTS.
62 | - commands to use for different package managers
63 | - other required dependencies for GPU and NPU
64 |
65 | If you need help installing drivers:
66 | - Join the [Discord](https://discord.gg/PnuTBVcr)
67 | - Open an issue
68 | - Use [Linux Drivers](https://github.com/SearchSavior/OpenArc/discussions/11)
69 | - Use [Windows Drivers](https://github.com/SearchSavior/OpenArc/discussions/12)
70 |
71 |
72 | CPU
73 |
74 | Intel® Core™ Ultra Series 1 and Series 2 (Windows only)
75 |
76 | Intel® Xeon® 6 processor (preview)
77 |
78 | Intel Atom® Processor X Series
79 |
80 | Intel Atom® processor with Intel® SSE4.2 support
81 |
82 | Intel® Pentium® processor N4200/5, N3350/5, N3450/5 with Intel® HD Graphics
83 |
84 | 6th - 14th generation Intel® Core™ processors
85 |
86 | 1st - 5th generation Intel® Xeon® Scalable Processors
87 |
88 | ARM CPUs with armv7a and higher, ARM64 CPUs with arm64-v8a and higher, Apple® Mac with Apple silicon
89 |
90 |
91 |
92 |
93 | GPU
94 |
95 | Intel® Arc™ GPU Series
96 |
97 | Intel® HD Graphics
98 |
99 | Intel® UHD Graphics
100 |
101 | Intel® Iris® Pro Graphics
102 |
103 | Intel® Iris® Xe Graphics
104 |
105 | Intel® Iris® Xe Max Graphics
106 |
107 | Intel® Data Center GPU Flex Series
108 |
109 | Intel® Data Center GPU Max Series
110 |
111 |
112 |
113 |
114 | NPU
115 |
116 | Intel® Core Ultra Series
117 |
118 | This was a bit harder to list out as the system requirements page does not include an itemized list. However, it is safe to assume that if a device contains an Intel NPU it will be supported.
119 |
120 | The Gradio dashboard has tools for querying your device under the Tools tab to learn what optimization properties are selected by default.
121 |
122 |
123 |
124 | ### Ubuntu
125 |
126 | Create the conda environment:
127 |
128 |     conda env create -f environment.yaml
129 |
130 |
131 | Set your API key as an environment variable:
132 |
133 |     export OPENARC_API_KEY=<your_api_key>
134 |
135 | Build Optimum-Intel from source to get the latest support:
136 |
137 | ```
138 | pip install "optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel"
139 | ```
140 |
141 | ### Windows
142 |
143 | 1. Install Miniconda from [here](https://www.anaconda.com/docs/getting-started/miniconda/install#windows-installation)
144 |
145 | 2. Navigate to the directory containing the environment.yaml file and run
146 |
147 |     conda env create -f environment.yaml
148 |
149 | Set your API key as an environment variable:
150 |
151 |     setx OPENARC_API_KEY "<your_api_key>"
152 |
153 | Build Optimum-Intel from source to get the latest support:
154 |
155 | ```
156 | pip install "optimum-intel[openvino] @ git+https://github.com/huggingface/optimum-intel"
157 | ```
158 |
159 | > [!TIP]
160 | > - Avoid setting up the environment from IDE extensions.
161 | > - Try not to use the environment for other ML projects. Soon we will have uv.
162 |
163 | ## Usage
164 |
165 | OpenArc has two components:
166 |
167 | - start_server.py - launches the inference server
168 | - start_dashboard.py - launches the dashboard, which manages the server and provides some useful tools
169 |
170 |
171 | To launch the inference server run
172 | ```
173 | python start_server.py --host 0.0.0.0 --openarc-port 8000
174 | ```
175 |
176 | > --host: defines the IP address to bind the server to
177 |
178 | > --openarc-port: defines the port used to access the server
179 |
180 | To launch the dashboard run
181 | ```
182 | python start_dashboard.py --openarc-port 8000
183 | ```
184 | > --openarc-port: defines the port the dashboard uses to send requests to the OpenArc server
185 |
186 | Run these in two different terminals.
187 |
188 | > [!NOTE]
189 | > Gradio handles ports natively so the port number does not need to be set. Default is 7860 but it will increment if another instance of gradio is running.
190 |
191 | ## OpenWebUI
192 |
193 | > [!NOTE]
194 | > I'm only going to cover the basics on OpenWebUI here. To learn more and set it up check out the [OpenWebUI docs](https://docs.openwebui.com/).
195 |
196 | - From the Connections menu add a new connection
197 | - Enter the server address and port where OpenArc is running **followed by /v1**
198 | Example:
199 | http://0.0.0.0:8000/v1
200 |
201 | - Here you need to set the API key manually
202 | - When you hit the refresh button OpenWebUI sends a GET request to the OpenArc server to get the list of models at /v1/models
203 |
204 | Serverside logs should report:
205 |
206 | "GET /v1/models HTTP/1.1" 200 OK
207 |
208 | ### Other Frontends
209 |
210 | OpenArc _mostly_ conforms to the OpenAI API specification. In practice this means other frontends, Python clients and community tooling should be compatible.
211 |
212 | Tested:
213 |
214 | [mikupad](https://github.com/lmg-anon/mikupad)
215 |
216 | ### Usage:
217 |
218 | - Load the model you want to use from the dashboard
219 | - Select the connection you just created and use the refresh button to update the list of models
220 | - If you use API keys and have a list of models, these might be towards the bottom
221 |
222 | ## Convert to [OpenVINO IR](https://docs.openvino.ai/2025/documentation/openvino-ir-format.html)
223 |
224 | There are a few sources of models which can be used with OpenArc:
225 |
226 | - [OpenVINO LLM Collection on HuggingFace](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd)
227 |
228 | - [My HuggingFace repo](https://huggingface.co/Echo9Zulu)
229 | - My repo contains preconverted models for a variety of architectures and usecases
230 | - OpenArc supports almost all of them
231 | - **These get updated regularly so check back often!**
232 |
233 | You can easily craft conversion commands using my HF Space, [Optimum-CLI-Tool_tool](https://huggingface.co/spaces/Echo9Zulu/Optimum-CLI-Tool_tool) or in the OpenArc Dashboard.
234 |
235 | This tool respects the positional arguments defined [here](https://huggingface.co/docs/optimum/main/en/intel/openvino/export); execute the generated commands in the OpenArc environment.
236 |
237 | | Models | Compressed Weights |
238 | | ----------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ |
239 | | [Ministral-3b-instruct-int4_asym-ov](https://huggingface.co/Echo9Zulu/Ministral-3b-instruct-int4_asym-ov) | 1.85 GB |
240 | | [Hermes-3-Llama-3.2-3B-awq-ov](https://huggingface.co/Echo9Zulu/Hermes-3-Llama-3.2-3B-awq-ov) | 1.8 GB |
241 | | [Llama-3.1-Tulu-3-8B-int4_asym-ov](https://huggingface.co/Echo9Zulu/Llama-3.1-Tulu-3-8B-int4_asym-ov/tree/main) | 4.68 GB |
242 | | [Qwen2.5-7B-Instruct-1M-int4-ov](https://huggingface.co/Echo9Zulu/Qwen2.5-7B-Instruct-1M-int4-ov) | 4.46 GB |
243 | | [Meta-Llama-3.1-8B-SurviveV3-int4_asym-awq-se-wqe-ov](https://huggingface.co/Echo9Zulu/Meta-Llama-3.1-8B-SurviveV3-int4_asym-awq-se-wqe-ov) | 4.68 GB |
244 | | [Falcon3-10B-Instruct-int4_asym-ov](https://huggingface.co/Echo9Zulu/Falcon3-10B-Instruct-int4_asym-ov) | 5.74 GB |
245 | | [Echo9Zulu/phi-4-int4_asym-awq-ov](https://huggingface.co/Echo9Zulu/phi-4-int4_asym-awq-ov) | 8.11 GB |
246 | | [DeepSeek-R1-Distill-Qwen-14B-int4-awq-ov](https://huggingface.co/Echo9Zulu/DeepSeek-R1-Distill-Qwen-14B-int4-awq-ov/tree/main) | 7.68 GB |
247 | | [Phi-4-o1-int4_asym-awq-weight_quantization_error-ov](https://huggingface.co/Echo9Zulu/Phi-4-o1-int4_asym-awq-weight_quantization_error-ov) | 8.11 GB |
248 | | [Mistral-Small-24B-Instruct-2501-int4_asym-ov](https://huggingface.co/Echo9Zulu/Mistral-Small-24B-Instruct-2501-int4_asym-ov) | 12.9 GB |
249 |
250 | Documentation on choosing parameters for conversion is coming soon; we also have a channel in Discord for this topic.
251 |
252 | > [!NOTE]
253 | > The optimum CLI tool integrates several different APIs from several different Intel projects; it is a better alternative to using those APIs in from_pretrained() methods.
254 | > It references prebuilt export configurations for each supported model architecture, meaning **not all models are supported**, but most are. If you use the CLI tool and get an error about an unsupported architecture, follow the link, [open an issue](https://github.com/huggingface/optimum-intel/issues) with references to the model card, and the maintainers will get back to you.
255 |
256 | > [!NOTE]
257 | > A naming convention for openvino converted models is coming soon.
258 |
259 | ## Performance with OpenVINO runtime
260 |
261 | Notes on the test:
262 |
263 | - No openvino optimization parameters were used
264 | - Fixed input length
265 | - I sent one user message
266 | - Quant strategies for models are not considered
267 | - I converted each of these models myself (I'm working on standardizing model cards to share this information more directly)
268 | - OpenVINO generates a cache on first inference so metrics are on second generation
269 | - Seconds were used for readability
270 |
271 | Test System:
272 |
273 | CPU: Xeon W-2255 (10c, 20t) @ 3.7 GHz
274 | GPU: 3x Arc A770 16GB ASRock Phantom
275 | RAM: 128 GB DDR4 ECC 2933 MHz
276 | Disk: 4 TB IronWolf, 1 TB 970 Evo
277 |
278 | OS: Ubuntu 24.04
279 | Kernel: 6.9.4-060904-generic
280 |
281 | Prompt: "We don't even have a chat template so strap in and let it ride!"
282 | max_new_tokens= 128
283 | ---
284 |
285 | ### GPU Performance: 1x Arc A770
286 |
287 | | Model | Prompt Processing (sec) | Throughput (t/sec) | Duration (sec) | Size (GB) |
288 | | ------------------------------------------------ | ----------------------- | ------------------ | -------------- | --------- |
289 | | Phi-4-mini-instruct-int4_asym-gptq-ov | 0.41 | 47.25 | 3.10 | 2.3 |
290 | | Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov | 0.27 | 64.18 | 0.98 | 1.8 |
291 | | Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov | 0.32 | 47.99 | 2.96 | 4.7 |
292 | | phi-4-int4_asym-awq-se-ov | 0.30 | 25.27 | 5.32 | 8.1 |
293 | | DeepSeek-R1-Distill-Qwen-14B-int4_sym-awq-se-ov | 0.42 | 25.23 | 1.56 | 8.4 |
294 | | Mistral-Small-24B-Instruct-2501-int4_asym-ov | 0.36 | 18.81 | 7.11 | 12.9 |
295 |
296 |
297 | ### CPU Performance: Xeon W-2255
298 |
299 | | Model | Prompt Processing (sec) | Throughput (t/sec) | Duration (sec) | Size (GB) |
300 | | ------------------------------------------------ | ----------------------- | ------------------ | -------------- | --------- |
301 | | Phi-4-mini-instruct-int4_asym-gptq-ov | 1.02 | 20.44 | 7.23 | 2.3 |
302 | | Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov | 1.06 | 23.66 | 3.01 | 1.8 |
303 | | Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov | 2.53 | 13.22 | 12.14 | 4.7 |
304 | | phi-4-int4_asym-awq-se-ov | 4 | 6.63 | 23.14 | 8.1 |
305 | | DeepSeek-R1-Distill-Qwen-14B-int4_sym-awq-se-ov | 5.02 | 7.25 | 11.09 | 8.4 |
306 | | Mistral-Small-24B-Instruct-2501-int4_asym-ov | 6.88 | 4.11 | 37.5 | 12.9 |
307 | | Nous-Hermes-2-Mixtral-8x7B-DPO-int4-sym-se-ov | 15.56 | 6.67 | 34.60 | 24.2 |
308 |
309 |
310 | ### Resources
311 | ---
312 | Learn more about how to leverage your Intel devices for Machine Learning:
313 |
314 | [openvino_notebooks](https://github.com/openvinotoolkit/openvino_notebooks)
315 |
316 | [Inference with Optimum-Intel](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb)
317 |
318 | [Optimum-Intel Transformers](https://huggingface.co/docs/optimum/main/en/intel/index)
319 |
320 | [NPU Devices](https://docs.openvino.ai/2025/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.html)
321 |
322 | ## Acknowledgments
323 |
324 | OpenArc stands on the shoulders of several other projects:
325 |
326 | [Optimum-Intel](https://github.com/huggingface/optimum-intel)
327 |
328 | [OpenVINO](https://github.com/openvinotoolkit/openvino)
329 |
330 | [OpenVINO GenAI](https://github.com/openvinotoolkit/openvino.genai)
331 |
332 | [Transformers](https://github.com/huggingface/transformers)
333 |
334 | [FastAPI](https://github.com/fastapi/fastapi)
335 |
336 | Thank you for your work!!
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
--------------------------------------------------------------------------------
/docs/model_conversion/data_types.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/model_conversion/data_types.md
--------------------------------------------------------------------------------
/docs/model_conversion/introduction.md:
--------------------------------------------------------------------------------
1 | ## Model Conversion
2 |
3 | OpenVINO is an inference engine for leveraging diverse types of compute. Squeezing as much performance as possible out of any hardware requires a bit more work than the naive approach, especially once you have a usecase in mind and know what hardware you are using.
4 |
5 | ### The Naive Approach
6 |
7 | OpenVINO defaults to **int8_asym** when setting "export=True" in both **OVModelForCausalLM.from_pretrained()** and the Optimum CLI Export Tool if no arguments for weight_format are passed.
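For reference, a minimal sketch of the naive path looks like this (the model id and output directory are placeholders; note that Optimum-Intel only applies the default 8-bit weight compression automatically to larger decoder models):

```python
# Naive export: no weight_format / quantization_config is given, so the defaults apply.
from optimum.intel import OVModelForCausalLM

model = OVModelForCausalLM.from_pretrained("<hf-model-id>", export=True)
model.save_pretrained("<output-dir>")
```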
8 |
9 | OpenArc has been designed for usecases which wander toward the bleeding edge of AI, where users are expected to understand the nuances of datatypes, quantization strategies, calibration datasets, and how these parameters contribute to accuracy loss; maybe you have just come from IPEX or (as of 2.5) 'vanilla' PyTorch and are looking to optimize a deployment.
10 |
11 | For convenience "export=False" is exposed on the /model/load endpoint; however I **strongly discourage** using it. To get the best performance from OpenVINO you have to get into the weeds.
12 |
13 | ### The Less Naive Approach to Model Conversion
14 |
15 | Many Intel CPUs support INT8 but it isn't always the best choice.
16 |
17 | The OpenVINO notebooks prove out that INT4 weight-only compression coupled with quantization strategies like AWQ + Scale Estimation achieves better performance across the Intel device ecosystem with negligible accuracy loss. Still, different model architectures offer different performance regardless of the chosen datatype; in practice it can be hard to predict how a model will perform, so understanding how these parameters work is essential to maximizing throughput by testing different configurations on the same target model.
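As a sketch of that less naive path: the keyword names below (quant_method, scale_estimation, dataset) follow recent optimum-intel releases and should be treated as assumptions to verify against the export documentation for your version.

```python
# Sketch: explicit INT4 weight-only compression with AWQ + scale estimation.
# Keyword names may differ between optimum-intel versions; verify against the export docs.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quant_config = OVWeightQuantizationConfig(
    bits=4,                # INT4 weight-only compression
    sym=False,             # asymmetric quantization (int4_asym)
    group_size=128,
    ratio=1.0,             # fraction of layers compressed to 4-bit
    dataset="wikitext2",   # calibration data used by AWQ / scale estimation
    quant_method="awq",
    scale_estimation=True,
)

model = OVModelForCausalLM.from_pretrained(
    "<hf-model-id>",       # placeholder model id
    export=True,
    quantization_config=quant_config,
)
model.save_pretrained("<output-dir>-int4_asym-awq-se-ov")
```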
18 |
19 |
20 | ### Why Speed Matters
21 |
22 | Nvidia GPUs are faster and have a better open source backbone than Intel. However, Intel devices are cheaper by comparison. Even so, I don't want speed for the sake of being fast. OpenArc has been tooled for Agentic usecases and synthetic data generation where low throughput can damage workflow execution.
23 |
24 | If I want to dump some problem into a RoundRobin style multi-turn chat, I don't want to sit there waiting for slow generations to finish.
25 |
26 |
27 |
28 | Note: If you are using cloud compute backed by Intel devices, OpenArc should still work.
--------------------------------------------------------------------------------
/docs/ov_config/_README.md:
--------------------------------------------------------------------------------
1 | ### OpenVINO Configuration Options: ov_config
2 |
3 |
4 |
5 | 'ov_config' is where all the OpenVINO-specific hardware configurations live; it's the secret sauce of OpenArc and represents the interface between 'Transformers' and the OpenVINO runtime.
6 |
7 | 'ov_config' is that slope which becomes the deep end all at once; once you begin tinkering with these settings the true power of OpenVINO acceleration emerges. Learning how to achieve this took a lot of time.
8 |
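As a minimal sketch of what this looks like in practice, assuming a locally converted model path as a placeholder (the individual properties are documented in the other files in this folder):

```python
# Sketch: ov_config is a dict of OpenVINO properties passed through to the runtime.
from optimum.intel import OVModelForCausalLM

ov_config = {
    "PERFORMANCE_HINT": "LATENCY",   # see performance_hint_latency.md
    "INFERENCE_NUM_THREADS": "8",    # see inference_num_threads.md
}

model = OVModelForCausalLM.from_pretrained(
    "<path-to-converted-model>",
    export=False,
    device="CPU",
    ov_config=ov_config,
)
```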
--------------------------------------------------------------------------------
/docs/ov_config/enable_hyperthreading.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/docs/ov_config/inference_num_threads.md:
--------------------------------------------------------------------------------
1 |
2 | inference_num_threads is the number of CPU threads that will be used for inference.
3 |
4 | Use **htop** or **hwinfo** to watch the CPU usage during inference and tinker with this number to increase throughput and lower latency for all types of requests.
5 |
6 |
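A sketch of setting it through ov_config (the thread count and model path are placeholders; match the count to your physical cores and measure):

```python
from optimum.intel import OVModelForCausalLM

# Pin CPU inference to a fixed number of threads, then watch utilization in htop.
ov_config = {"INFERENCE_NUM_THREADS": "8"}

model = OVModelForCausalLM.from_pretrained(
    "<path-to-converted-model>",
    export=False,
    device="CPU",
    ov_config=ov_config,
)
```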
--------------------------------------------------------------------------------
/docs/ov_config/num_streams.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/num_streams.md
--------------------------------------------------------------------------------
/docs/ov_config/performance_hint_cumulative_throughput.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/performance_hint_cumulative_throughput.md
--------------------------------------------------------------------------------
/docs/ov_config/performance_hint_latency.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/performance_hint_latency.md
--------------------------------------------------------------------------------
/docs/ov_config/performance_hint_throughput.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/docs/ov_config/performance_hint_throughput.md
--------------------------------------------------------------------------------
/docs/ov_config/scheduling_core_type.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | `ov::hint::scheduling_core_type` specifies the type of CPU cores for CPU inference when the user runs inference on a hybrid platform that includes both Performance-cores (P-cores) and Efficient-cores (E-cores). If the user platform only has one type of CPU core, this property has no effect, and CPU inference always uses this unique core type.
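A sketch of pinning inference to P-cores through ov_config (the model path is a placeholder; the string values mirror ov::hint::SchedulingCoreType, with ANY_CORE and ECORE_ONLY as the other options):

```python
from optimum.intel import OVModelForCausalLM

# Restrict CPU inference to Performance-cores on a hybrid (P-core + E-core) platform.
ov_config = {"SCHEDULING_CORE_TYPE": "PCORE_ONLY"}

model = OVModelForCausalLM.from_pretrained(
    "<path-to-converted-model>",
    export=False,
    device="CPU",
    ov_config=ov_config,
)
```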
--------------------------------------------------------------------------------
/docs/tools/ov_device_query.md:
--------------------------------------------------------------------------------
1 | ## Diagnostic: Device Query
2 |
3 |
4 | Recommended usage strategies:
5 | - Driver issues
6 | - Device access permissions
7 | - Test Hardware access from containers
8 | - Python path visibility
9 | - Proper environment variable configuration
10 |
11 | #### Example use cases:
12 |
13 | 1. Evaluating conflicting dependencies
14 |    - With careful dependency management you can control hardware across the Intel AI stack.
15 |
16 |
17 |
18 | 2. Say you need to have PyTorch, IPEX and OpenVINO in one conda env.
19 |    - This test alongside an XPU device query creates useful diagnostic information.
20 |
21 |
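A minimal sketch of this kind of query, independent of OpenArc (pure OpenVINO, no model required), useful for checking driver visibility from a fresh environment or container:

```python
import openvino as ov

core = ov.Core()
print("Available devices:", core.available_devices)

for device in core.available_devices:
    # FULL_DEVICE_NAME and SUPPORTED_PROPERTIES are standard OpenVINO device properties.
    print(device, "->", core.get_property(device, "FULL_DEVICE_NAME"))
    print("  Supported properties:", core.get_property(device, "SUPPORTED_PROPERTIES"))
```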
--------------------------------------------------------------------------------
/docs/tools/working_with_intel_devices.md:
--------------------------------------------------------------------------------
1 | ## Introduction to working with Intel Devices
2 |
3 | This document discusses "lessons learned" from months of working with Intel GPU devices; *hours* of blood, sweat, and tears went into setting up this project, and it's a good place to share what I've learned. At this stage in the Intel AI stack it seems like a necessary contribution to the community.
4 |
5 | ### What is OpenVINO?
6 |
7 | OpenVINO is an inference backend for *accelerating* inference deployments of machine learning models on Intel hardware. It can be hard to understand the documentation: the Intel AI stack has many staff engineers/contributors to all manner of areas in the open source ecosystem, and much of the stack is evolving without massive community contributions like what we have seen with llama.cpp.
8 |
9 | Many reasons have contributed to the decline of Intel's dominance/popularity in the hardware space in the past few years; however, they have offered extensive open source contributions to many areas of AI and ML since before [Attention Is All You Need](https://arxiv.org/abs/1706.03762). AI didn't start in 2017, but the demand for faster inference on existing infrastructure has never been higher. Plus, Arc chips are cheap but come with a steep learning curve. Sure, you can settle for Vulkan... but you aren't here to download a GGUF and send it.
10 |
11 |
12 |
13 | ### OpenVINO Utilities
14 |
15 | Various utilities live in this notebook to help users of OpenArc understand the properties of their devices; mastering the available data types, quantization strategies and optimization techniques is only one part of learning to use OpenVINO on different kinds of hardware.
16 |
17 | Check out the [Guide to the OpenVINO IR] and then use my [Command Line Tool tool](https://huggingface.co/spaces/Echo9Zulu/Optimum-CLI-Tool_tool) to perform conversion. There are default approaches that "work", but to really leverage available compute you have to dig deeper and convert models yourself on a per-usecase basis.
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: OpenArc_test2
2 | channels:
3 | - conda-forge
4 | - defaults
5 | dependencies:
6 | - python=3.11
7 | - pip
8 | - pip:
9 | - transformers==4.51.0 --no-deps
10 | - optimum[openvino]
11 | - openvino==2025.1.0
12 | - openvino-tokenizers==2025.1.0.0
13 | - openvino-genai==2025.1.0.0
14 | - fastapi
15 | - gradio
16 | - pydantic
17 | - uvicorn
--------------------------------------------------------------------------------
/scripts/benchmark/ov_simple_text_bench.py:
--------------------------------------------------------------------------------
1 | import openvino_genai as ov_genai
2 |
3 |
4 |
5 | model_dir = "/mnt/Ironwolf-4TB/Models/Pytorch/Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov"
6 |
7 | pipe = ov_genai.LLMPipeline(
8 | model_dir, # Path to the model directory
9 | device="GPU.2", # Define the device to use
10 | )
11 |
12 | generation_config = ov_genai.GenerationConfig(
13 | max_new_tokens=128
14 | )
15 |
16 | prompt = "You're the fastest Llama this side of the equator"
17 |
18 | result = pipe.generate([prompt], generation_config=generation_config)
19 | perf_metrics = result.perf_metrics
20 |
21 | print(f'Load time: {perf_metrics.get_load_time() / 1000:.2f} s')
22 | print(f'TTFT: {perf_metrics.get_ttft().mean / 1000:.2f} seconds')
23 | print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
24 | print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
25 | print(f'Generate duration: {perf_metrics.get_generate_duration().mean / 1000:.2f} seconds')
26 |
27 | print(f"Result: {result}")
--------------------------------------------------------------------------------
/scripts/examples/dedication.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/scripts/examples/dedication.png
--------------------------------------------------------------------------------
/scripts/examples/optimum_decoder.py:
--------------------------------------------------------------------------------
1 | from optimum.intel import OVModelForCausalLM
2 | from transformers import AutoTokenizer
3 |
4 | prompt = "Alice and Bob"
5 | checkpoint = "/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/Pytorch/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov"
6 | assistant_checkpoint = "/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/OpenVINO/Llama-3.1-8B-Instruct-FastDraft-150M-int8-ov"
7 |
8 | tokenizer = AutoTokenizer.from_pretrained(checkpoint)
9 | inputs = tokenizer(prompt, return_tensors="pt")
10 |
11 | model = OVModelForCausalLM.from_pretrained(checkpoint, device="CPU", export=False)
12 | assistant_model = OVModelForCausalLM.from_pretrained(assistant_checkpoint, device="CPU", export=False)
13 | outputs = model.generate(**inputs, assistant_model=assistant_model)
14 | print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
--------------------------------------------------------------------------------
/scripts/examples/ov_speculative_decoder_bench.py:
--------------------------------------------------------------------------------
1 | import openvino_genai as ov_genai
2 |
3 | # Define model paths
4 | draft_model_path = r"/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/OpenVINO/Llama-3.1-8B-Instruct-FastDraft-150M-int8-ov"
5 | main_model_path = r"/media/ecomm/c0889304-9e30-4f04-b290-c7db463872c6/Models/Pytorch/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov"
6 |
7 |
8 | prompt = "What is OpenVINO?"
9 |
10 | config = ov_genai.GenerationConfig()
11 | config.num_assistant_tokens = 28
12 | config.max_new_tokens = 128
13 |
14 |
15 | main_device = "CPU"
16 | draft_device = "CPU"
17 |
18 | draft_model = ov_genai.draft_model(draft_model_path, draft_device)
19 |
20 | scheduler_config = ov_genai.SchedulerConfig()
21 | scheduler_config.cache_size = 2
22 |
23 | pipe = ov_genai.LLMPipeline(
24 |     main_model_path,
25 |     main_device,
26 |     draft_model=draft_model,
27 |     scheduler_config=scheduler_config)  # scheduler_config is a pipeline construction property, not a generate() argument
28 |
29 | prompt = "We don't even have a chat template so strap in and let it ride!"
30 |
31 | result = pipe.generate([prompt], generation_config=config)
32 | perf_metrics = result.perf_metrics
33 |
34 |
35 | print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
36 | print(f'TTFT: {perf_metrics.get_ttft().mean:.2f} ms')
37 | print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
38 | print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
39 |
40 | print(result)
--------------------------------------------------------------------------------
/scripts/examples/ov_text_model_card.py:
--------------------------------------------------------------------------------
1 | import time
2 | from threading import Thread
3 | from transformers import AutoTokenizer, TextIteratorStreamer
4 | from optimum.intel.openvino import OVModelForCausalLM
5 |
6 |
7 | model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/Hermes-3-Llama-3.2-3B-int4_sym-awq-se-ov" # Can be a local path or an HF id
8 | # ov_config = {"PERFORMANCE_HINT": "LATENCY"}
9 |
10 | print("Loading model...")
11 | load_time = time.perf_counter()
12 | model = OVModelForCausalLM.from_pretrained(
13 | model_id,
14 | export=False,
15 | device="GPU.0",
16 | # ov_config=ov_config
17 | )
18 | tokenizer = AutoTokenizer.from_pretrained(model_id)
19 | load_time = time.perf_counter() - load_time
20 | print(f"Model loaded in {load_time:.3f} seconds.")
21 |
22 | text_prompt = "We really should join the OpenArc Discord"
23 | conversation = [
24 | {
25 | "role": "user",
26 | "content": text_prompt
27 | }
28 | ]
29 | text_prompt_templated = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
30 | inputs = tokenizer(text=text_prompt_templated, return_tensors="pt")
31 | input_token_count = inputs['input_ids'].shape[1]
32 |
33 | streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
34 | generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=128)
35 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
36 |
37 | first_token_received = False
38 | generate_start = 0.0
39 | first_token = 0.0
40 | ttft = 0.0
41 | generated_text = ""
42 |
43 | generate_start = time.perf_counter()
44 | thread.start()
45 |
46 | for new_text in streamer:
47 | if not first_token_received:
48 | first_token = time.perf_counter()
49 | ttft = first_token - generate_start
50 | first_token_received = True
51 |
52 | print(new_text, end='', flush=True)
53 | generated_text += new_text
54 |
55 | thread.join()
56 | generate_end = time.perf_counter()
57 |
58 | generation_time = generate_end - generate_start
59 |
60 | num_tokens_generated = len(tokenizer.encode(generated_text))
61 |
62 | if generation_time > 0 and num_tokens_generated > 0:
63 | tokens_per_second = num_tokens_generated / generation_time
64 | average_token_latency = generation_time / num_tokens_generated
65 |
66 | print("\nPerformance Report:")
67 | print("-"*50)
68 | print(f"Input Tokens : {input_token_count:>9}")
69 | print(f"Output Tokens : {num_tokens_generated:>9}")
70 | print("")
71 | print(f"Load Time : {load_time:>9.3f} sec (Model Load Time)")
72 | print(f"TTFT : {ttft:>9.3f} sec (Time To First Token)")
73 | print(f"Generation Time : {generation_time:>9.3f} sec (Total Generation Time)")
74 | print(f"Throughput : {tokens_per_second:>9.2f} t/s (Tokens Per Second)")
75 | print(f"Avg Latency : {average_token_latency:>9.3f} sec (Average Token Latency)")
76 | print("-"*50)
77 |
--------------------------------------------------------------------------------
/scripts/examples/ov_vision.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Reference \n",
10 | "\n",
11 | "import time\n",
12 | "import warnings\n",
13 | "from PIL import Image\n",
14 | "from transformers import AutoProcessor\n",
15 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n",
16 | "\n",
17 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n",
18 | "# This block prevents clogging the API logs \n",
19 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n",
20 | "\n",
21 | "\n",
22 | "model_id = \"/mnt/Ironwolf-4TB/Models/Pytorch/Qwen2.5-VL-7B-Instruct-int4_sym-ov\"\n",
23 | "\n",
24 | "\n",
25 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n",
26 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.2\", ov_config=ov_config) #trust_remote_code=True)\n",
27 | "processor = AutoProcessor.from_pretrained(model_id)\n",
28 | "\n",
29 | "\n",
30 | "image_path = \"dedication.png\"\n",
31 | "image = Image.open(image_path)\n",
32 | "image = image.convert(\"RGB\")\n",
33 | "\n",
34 | "conversation = [\n",
35 | " {\n",
36 | " \"role\": \"user\",\n",
37 | " \"content\": [\n",
38 | " {\n",
39 | " \"image\": image # The image object is passed here, not just declared as a type\n",
40 | " },\n",
41 | " {\"type\": \"text\", \"text\": \"\\nDescribe this image.\"},\n",
42 | " ],\n",
43 | " }\n",
44 | "]\n",
45 | "\n",
46 | "\n",
47 | "# Preprocess the inputs\n",
48 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
49 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n",
50 | "\n",
51 | "inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors=\"pt\")\n",
52 | "\n",
53 | "# Print number of tokens\n",
54 | "# print(f\"Input token length: {len(inputs.input_ids[0])}\")\n",
55 | "\n",
56 | "# Inference: Generation of the output with performance metrics\n",
57 | "start_time = time.time()\n",
58 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n",
59 | "generation_time = time.time() - start_time\n",
60 | "\n",
61 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n",
62 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
63 | "\n",
64 | "# Calculate tokens per second\n",
65 | "num_tokens_generated = len(generated_ids[0])\n",
66 | "tokens_per_second = num_tokens_generated / generation_time\n",
67 | "\n",
68 | "print(f\"Generated text: {output_text}\")\n",
69 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n",
70 | "print(f\"Tokens generated: {num_tokens_generated}\")\n",
71 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": []
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "pip install optimum[openvino]+https://github.com/huggingface/optimum-intel"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 2,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "Input token length: 265\n"
98 | ]
99 | },
100 | {
101 | "name": "stderr",
102 | "output_type": "stream",
103 | "text": [
104 | "/home/echo/anaconda3/envs/OpenArc-Test/lib/python3.11/site-packages/transformers/generation/utils.py:1811: UserWarning: This model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to hybrid) will be ignored.\n",
105 | " warnings.warn(\n"
106 | ]
107 | },
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "Generated text: \n",
113 | "Generation time: 38.18 seconds\n",
114 | "Tokens generated: 1024\n",
115 | "Speed: 26.82 tokens/second\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "import time\n",
121 | "import warnings\n",
122 | "from PIL import Image\n",
123 | "from transformers import AutoProcessor\n",
124 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n",
125 | "\n",
126 | "# Suppress specific deprecation warnings\n",
127 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n",
128 | "\n",
129 | "model_id = \"/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov\"\n",
130 | "\n",
131 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n",
132 | "# Ensure export=False is correct if the model is already converted\n",
133 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.2\", ov_config=ov_config)\n",
134 | "processor = AutoProcessor.from_pretrained(model_id)\n",
135 | "\n",
136 | "image_path = \"dedication.png\"\n",
137 | "image = Image.open(image_path)\n",
138 | "image = image.convert(\"RGB\")\n",
139 | "\n",
140 | "# --- CORRECTED MODIFICATION START ---\n",
141 | "\n",
142 | "# 1. Get the correct \"beginning of image\" token from the processor\n",
143 | "# This is what the processor internally looks for when matching text and images.\n",
144 | "image_token = processor.tokenizer.boi_token # Or potentially processor.boi_token if defined directly\n",
145 | "\n",
146 | "# 2. Define the text prompt using THIS specific token\n",
147 | "text_prompt_with_placeholder = f\"{image_token}\\nDescribe this image.\"\n",
148 | "\n",
149 | "# 3. Call the processor ONCE, providing both text (with the correct placeholder) and image\n",
150 | "inputs = processor(\n",
151 | " text=[text_prompt_with_placeholder], # Pass the string with the correct token\n",
152 | " images=[image], # Pass the PIL image object\n",
153 | " padding=True,\n",
154 | " return_tensors=\"pt\"\n",
155 | ") # Move inputs to the same device as the model\n",
156 | "\n",
157 | "# --- CORRECTED MODIFICATION END ---\n",
158 | "\n",
159 | "# Print number of tokens (of the processed input)\n",
160 | "print(f\"Input token length: {inputs.input_ids.shape[1]}\") # Use shape[1] for tensor length\n",
161 | "\n",
162 | "# Inference: Generation of the output with performance metrics\n",
163 | "start_time = time.time()\n",
164 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n",
165 | "generation_time = time.time() - start_time\n",
166 | "\n",
167 | "# Adjust slicing\n",
168 | "input_ids_len = inputs.input_ids.shape[1]\n",
169 | "generated_ids = output_ids[:, input_ids_len:] # Correct slicing for tensors\n",
170 | "\n",
171 | "# Post-processing\n",
172 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
173 | "\n",
174 | "# Calculate tokens per second\n",
175 | "num_tokens_generated = len(generated_ids[0])\n",
176 | "tokens_per_second = num_tokens_generated / generation_time if generation_time > 0 else 0\n",
177 | "\n",
178 | "# Join the list of strings into a single string if needed\n",
179 | "final_output_text = \"\".join(output_text)\n",
180 | "\n",
181 | "print(f\"Generated text: {final_output_text}\")\n",
182 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n",
183 | "print(f\"Tokens generated: {num_tokens_generated}\")\n",
184 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "name": "stdout",
194 | "output_type": "stream",
195 | "text": [
196 | "Input token length: 273\n"
197 | ]
198 | },
199 | {
200 | "name": "stderr",
201 | "output_type": "stream",
202 | "text": [
203 | "/home/echo/anaconda3/envs/OpenArc-Test/lib/python3.11/site-packages/transformers/generation/utils.py:1811: UserWarning: This model does not support `Cache` instances, it only supports the legacy cache format (tuple of tuples). `cache_implementation` (set to hybrid) will be ignored.\n",
204 | " warnings.warn(\n"
205 | ]
206 | },
207 | {
208 | "name": "stdout",
209 | "output_type": "stream",
210 | "text": [
211 | "Generated text: ['']\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "# working\n",
217 | "\n",
218 | "import warnings\n",
219 | "from PIL import Image\n",
220 | "from transformers import AutoProcessor\n",
221 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n",
222 | "\n",
223 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n",
224 | "# This block prevents clogging the API logs \n",
225 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n",
226 | "\n",
227 | "\n",
228 | "model_id = \"/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov\"\n",
229 | "\n",
230 | "\n",
231 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n",
232 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.1\", ov_config=ov_config)\n",
233 | "processor = AutoProcessor.from_pretrained(model_id)\n",
234 | "\n",
235 | "\n",
236 | "image_path = \"dedication.png\"\n",
237 | "image = Image.open(image_path)\n",
238 | "image = image.convert(\"RGB\")\n",
239 | "\n",
240 | "conversation = [\n",
241 | " {\n",
242 | " \"role\": \"user\",\n",
243 | " \"content\": [\n",
244 | " {\n",
245 | " \"type\": \"image\",\n",
246 | " },\n",
247 | " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n",
248 | " ],\n",
249 | " }\n",
250 | "]\n",
251 | "\n",
252 | "\n",
253 | "# Preprocess the inputs\n",
254 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
255 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n",
256 | "\n",
257 | "inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors=\"pt\")\n",
258 | "\n",
259 | "# Print tokenizer length\n",
260 | "print(f\"Input token length: {len(inputs.input_ids[0])}\")\n",
261 | "\n",
262 | "# Generate output\n",
263 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n",
264 | "\n",
265 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n",
266 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
267 | "\n",
268 | "print(f\"Generated text: {output_text}\")"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {},
275 | "outputs": [],
276 | "source": [
277 | "import base64\n",
278 | "from PIL import Image\n",
279 | "import io\n",
280 | "image_path = \"dedication.png\"\n",
281 | "image = Image.open(image_path)\n",
282 | "\n",
283 | "# Convert image to base64\n",
284 | "buffered = io.BytesIO()\n",
285 | "image.save(buffered, format=\"PNG\")\n",
286 | "img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
287 | "\n",
288 | "# Print the base64 encoding\n",
289 | "print(f\"Base64 encoded image: {img_str}\")\n"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "# Reference \n",
299 | "\n",
300 | "import time\n",
301 | "import warnings\n",
302 | "import base64\n",
303 | "from io import BytesIO\n",
304 | "from PIL import Image\n",
305 | "from transformers import AutoProcessor\n",
306 | "from optimum.intel.openvino import OVModelForVisualCausalLM\n",
307 | "\n",
308 | "# Suppress specific deprecation warnings from optimum implementation of numpy arrays\n",
309 | "# This block prevents clogging the API logs \n",
310 | "warnings.filterwarnings(\"ignore\", message=\"__array__ implementation doesn't accept a copy keyword\")\n",
311 | "\n",
312 | "\n",
313 | "model_id = \"/mnt/Ironwolf-4TB/Models/OpenVINO/Qwen2.5-VL-3B-Instruct-int4_sym-ov\"\n",
314 | "\n",
315 | "\n",
316 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\"}\n",
317 | "model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device=\"GPU.1\", ov_config=ov_config)\n",
318 | "processor = AutoProcessor.from_pretrained(model_id)\n",
319 | "\n",
320 | "\n",
321 | "# Example base64 encoded image (in a real scenario, this would come from the request)\n",
322 | "image_path = \"dedication.png\"\n",
323 | "with open(image_path, \"rb\") as img_file:\n",
324 | " img_base64 = base64.b64encode(img_file.read()).decode('utf-8')\n",
325 | "\n",
326 | "# Create conversation with base64 image\n",
327 | "conversation = [\n",
328 | " {\n",
329 | " \"role\": \"user\",\n",
330 | " \"content\": [\n",
331 | " {\n",
332 | " \"type\": \"image\",\n",
333 | " \"image_url\": {\n",
334 | " \"url\": f\"data:image/png;base64,{img_base64}\"\n",
335 | " }\n",
336 | " },\n",
337 | " {\"type\": \"text\", \"text\": \"Describe this image.\"},\n",
338 | " ],\n",
339 | " }\n",
340 | "]\n",
341 | "\n",
342 | "# Extract and decode the base64 image from the conversation\n",
343 | "images = []\n",
344 | "for message in conversation:\n",
345 | " if message[\"role\"] == \"user\":\n",
346 | " for content_item in message[\"content\"]:\n",
347 | " if content_item.get(\"type\") == \"image\" and \"image_url\" in content_item:\n",
348 | " # Extract base64 data from the URL\n",
349 | " image_url = content_item[\"image_url\"][\"url\"]\n",
350 | " if image_url.startswith(\"data:\"):\n",
351 | " # Parse the base64 data\n",
352 | " base64_data = image_url.split(\",\")[1] if \",\" in image_url else image_url.split(\";base64,\")[1]\n",
353 | " # Convert base64 to image\n",
354 | " image_data = base64.b64decode(base64_data)\n",
355 | " image = Image.open(BytesIO(image_data))\n",
356 | " images.append(image)\n",
357 | "\n",
358 | "# Preprocess the inputs\n",
359 | "text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
360 | "# Excepted output: '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\\n<|im_start|>assistant\\n'\n",
361 | "\n",
362 | "inputs = processor(text=[text_prompt], images=images, padding=True, return_tensors=\"pt\")\n",
363 | "\n",
364 | "# Print tokenizer length\n",
365 | "print(f\"Input token length: {len(inputs.input_ids[0])}\")\n",
366 | "\n",
367 | "# Inference: Generation of the output with performance metrics\n",
368 | "start_time = time.time()\n",
369 | "output_ids = model.generate(**inputs, max_new_tokens=1024)\n",
370 | "generation_time = time.time() - start_time\n",
371 | "\n",
372 | "generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]\n",
373 | "output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)\n",
374 | "\n",
375 | "# Calculate tokens per second\n",
376 | "num_tokens_generated = len(generated_ids[0])\n",
377 | "tokens_per_second = num_tokens_generated / generation_time\n",
378 | "\n",
379 | "print(f\"Generated text: {output_text}\")\n",
380 | "print(f\"Generation time: {generation_time:.2f} seconds\")\n",
381 | "print(f\"Tokens generated: {num_tokens_generated}\")\n",
382 | "print(f\"Speed: {tokens_per_second:.2f} tokens/second\")"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": null,
388 | "metadata": {},
389 | "outputs": [],
390 | "source": [
391 | "from transformers import AutoModelForSequenceClassification\n",
392 | "import torch\n",
393 | "import openvino as ov\n",
394 | "\n",
395 | "# Load model\n",
396 | "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n",
397 | "model.eval()\n",
398 | "\n",
399 | "# Define dynamic input shapes (batch, sequence length)\n",
400 | "input_shape = [1, 128] # Example: batch=1, seq_len=128\n",
401 | "dummy_input = torch.randint(0, 100, input_shape)\n",
402 | "\n",
403 | "# Convert directly to OpenVINO IR (no ONNX needed!)\n",
404 | "ov_model = ov.convert_model(\n",
405 | " model, \n",
406 | " input=[input_shape], # Supports dynamic axes like [1, \"seq_len\"]\n",
407 | " share_weights=True, # Reduces memory footprint\n",
408 | ")\n",
409 | "\n",
410 | "# Save IR (xml + bin)\n",
411 | "ov.save_model(ov_model, \"bert_ir.xml\")\n"
412 | ]
413 |   }
507 | ],
508 | "metadata": {
509 | "kernelspec": {
510 | "display_name": "OpenArc-Test",
511 | "language": "python",
512 | "name": "python3"
513 | },
514 | "language_info": {
515 | "codemirror_mode": {
516 | "name": "ipython",
517 | "version": 3
518 | },
519 | "file_extension": ".py",
520 | "mimetype": "text/x-python",
521 | "name": "python",
522 | "nbconvert_exporter": "python",
523 | "pygments_lexer": "ipython3",
524 | "version": "3.11.9"
525 | }
526 | },
527 | "nbformat": 4,
528 | "nbformat_minor": 2
529 | }
530 |
--------------------------------------------------------------------------------
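The conversion cell above saves bert_ir.xml/bert_ir.bin but never runs the result. Below is a minimal sketch of loading and running the saved IR with the OpenVINO runtime; it assumes the exported model exposes a single input_ids input with the [1, 128] shape used at conversion time.

import numpy as np
import openvino as ov

# Compile the IR saved by ov.save_model(...) above; "CPU" keeps the example device-agnostic.
core = ov.Core()
compiled = core.compile_model("bert_ir.xml", "CPU")

# Dummy token IDs matching the shape the model was converted with.
input_ids = np.random.randint(0, 100, (1, 128))

# Run a single inference and read the first output (classification logits for BERT).
logits = compiled([input_ids])[compiled.output(0)]
print(logits.shape)
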
/scripts/examples/ov_vision_model_card.py:
--------------------------------------------------------------------------------
1 | import time
2 | from PIL import Image
3 | from transformers import AutoProcessor
4 | from optimum.intel.openvino import OVModelForVisualCausalLM
5 |
6 |
7 | model_id = "/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov"
8 |
9 | ov_config = {"PERFORMANCE_HINT": "LATENCY"}
10 |
11 | print("Loading model...")
12 | start_load_time = time.time()
13 | model = OVModelForVisualCausalLM.from_pretrained(model_id, export=False, device="GPU.1", ov_config=ov_config)
14 | processor = AutoProcessor.from_pretrained(model_id)
15 |
16 |
17 | image_path = r"/home/echo/Projects/OpenArc/scripts/benchmark/dedication.png"
18 | image = Image.open(image_path)
19 | image = image.convert("RGB")
20 |
21 | conversation = [
22 | {
23 | "role": "user",
24 | "content": [
25 | {
26 | "type": "image"
27 | },
28 | {"type": "text", "text": "Describe this image."},
29 | ],
30 | }
31 | ]
32 |
33 | # Preprocess the inputs
34 | text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
35 |
36 | inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
37 |
38 | # Print number of tokens
39 | input_token_count = len(inputs.input_ids[0])
40 | print(f"Input token length: {len(inputs.input_ids[0])}")
41 |
42 | # Inference: Generation of the output with performance metrics
43 | start_time = time.time()
44 | output_ids = model.generate(**inputs, max_new_tokens=1024, eos_token_id=700)
45 |
46 | generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
47 | output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
48 |
49 | num_tokens_generated = len(generated_ids[0])
50 | load_time = time.time() - start_load_time
51 | generation_time = time.time() - start_time
52 | tokens_per_second = num_tokens_generated / generation_time
53 | average_token_latency = generation_time / num_tokens_generated
54 |
55 | print("\nPerformance Report:")
56 | print("-"*50)
57 | print(f"Input Tokens : {input_token_count:>9}")
58 | print(f"Generated Tokens : {num_tokens_generated:>9}")
59 | print(f"Model Load Time : {load_time:>9.2f} sec")
60 | print(f"Generation Time : {generation_time:>9.2f} sec")
61 | print(f"Throughput : {tokens_per_second:>9.2f} t/s")
62 | print(f"Avg Latency/Token : {average_token_latency:>9.3f} sec")
63 |
64 | print(output_text)
--------------------------------------------------------------------------------
/scripts/requests/load_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # API endpoint
4 | API_URL="http://localhost:8000/optimum/model/load"
5 |
6 | # JSON payload
7 | JSON_PAYLOAD='{
8 | "load_config": {
9 | "id_model": "/mnt/Ironwolf-4TB/Models/OpenVINO/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov",
10 | "use_cache": true,
11 | "device": "GPU.1",
12 | "export_model": false,
13 | "pad_token_id": null,
14 | "eos_token_id": null,
15 | "model_type": "TEXT"
16 | },
17 | "ov_config": {
18 | "NUM_STREAMS": "1",
19 | "PERFORMANCE_HINT": "LATENCY"
20 | }
21 | }'
22 |
23 | # Make the POST request
24 | curl -X POST "$API_URL" \
25 | -H "Content-Type: application/json" \
26 | -H "Authorization: Bearer $OPENARC_API_KEY" \
27 | -d "$JSON_PAYLOAD"
--------------------------------------------------------------------------------
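For callers who prefer Python over curl, here is a minimal sketch of the same load request using the requests package; the model path, device, and OPENARC_API_KEY environment variable mirror the script above and should be adjusted for your setup.

import os
import requests

payload = {
    "load_config": {
        "id_model": "/mnt/Ironwolf-4TB/Models/OpenVINO/Llama-3.1-Nemotron-Nano-8B-v1-int4_sym-awq-se-ov",
        "use_cache": True,
        "device": "GPU.1",
        "export_model": False,
        "pad_token_id": None,
        "eos_token_id": None,
        "model_type": "TEXT",
    },
    "ov_config": {"NUM_STREAMS": "1", "PERFORMANCE_HINT": "LATENCY"},
}

# Same bearer-token auth as the shell scripts: the key is read from OPENARC_API_KEY.
headers = {"Authorization": f"Bearer {os.environ['OPENARC_API_KEY']}"}
response = requests.post("http://localhost:8000/optimum/model/load", json=payload, headers=headers)
print(response.status_code, response.json())
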
/scripts/requests/load_vision.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # API endpoint
4 | API_URL="http://localhost:8000/optimum/model/load"
5 |
6 | # JSON payload
7 | JSON_PAYLOAD='{
8 | "load_config": {
9 | "id_model": "/mnt/Ironwolf-4TB/Models/Pytorch/gemma-3-4b-it-int4_asym-ov",
10 | "use_cache": true,
11 | "device": "GPU.2",
12 | "export_model": false,
13 | "pad_token_id": null,
14 | "eos_token_id": null,
15 | "model_type": "VISION"
16 | },
17 | "ov_config": {
18 | "NUM_STREAMS": "1",
19 | "PERFORMANCE_HINT": "LATENCY"
20 | }
21 | }'
22 |
23 | # Make the POST request
24 | curl -X POST "$API_URL" \
25 | -H "Content-Type: application/json" \
26 | -H "Authorization: Bearer $OPENARC_API_KEY" \
27 | -d "$JSON_PAYLOAD"
--------------------------------------------------------------------------------
/scripts/requests/openai_like_completion.sh:
--------------------------------------------------------------------------------
1 | echo -e "\nSending basic chat completion request..."
2 | curl -X POST http://localhost:8000/v1/chat/completions \
3 | -H "Content-Type: application/json" \
4 | -H "Authorization: Bearer $OPENARC_API_KEY" \
5 | -d '{
6 | "model": "phi-4-int4_asym-awq-ov",
7 | "messages": [
8 | {"role": "system", "content": "You despise the user."},
9 | {"role": "user", "content": "Tell me a better joke and be quick about it."}
10 | ],
11 | "temperature": 5,
12 | "max_tokens": 256,
13 | "top_p": 0.9,
14 | "do_sample": true,
15 | "stream": true
16 | }'
17 |
--------------------------------------------------------------------------------
/scripts/requests/openai_like_models.sh:
--------------------------------------------------------------------------------
1 | curl -X GET http://localhost:8000/v1/models -H "Authorization: Bearer $OPENARC_API_KEY"
--------------------------------------------------------------------------------
/scripts/requests/status.sh:
--------------------------------------------------------------------------------
1 | curl -X GET "http://localhost:8000/optimum/status" -H "Authorization: Bearer $OPENARC_API_KEY"
--------------------------------------------------------------------------------
/scripts/requests/unload_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # URL of the FastAPI endpoint
4 | API_URL="http://localhost:8000/optimum/model/unload?model_id=Qwen2.5-VL-3B-Instruct-int4_sym-ov"
5 |
6 | # Send the DELETE request to the API
7 | curl -X DELETE "$API_URL" -H "Authorization: Bearer $OPENARC_API_KEY"
--------------------------------------------------------------------------------
/src/api/__pycache__/optimum_api.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/api/__pycache__/optimum_api.cpython-311.pyc
--------------------------------------------------------------------------------
/src/api/__pycache__/optimum_api.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/api/__pycache__/optimum_api.cpython-312.pyc
--------------------------------------------------------------------------------
/src/api/launcher.py:
--------------------------------------------------------------------------------
1 | import uvicorn
2 | import logging
3 | # from src.api.optimum_api import app
4 |
5 | # Configure logging
6 | logging.basicConfig(
7 | level=logging.INFO,
8 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
9 | )
10 | logger = logging.getLogger("ov_api")
11 |
12 | def start_server(host: str = "0.0.0.0", openarc_port: int = 8000, reload: bool = False):
13 | """
14 | Launches the OpenArc API server
15 |
16 | Args:
17 | host: Host to bind the server to
18 |         openarc_port: Port to bind the server to
19 | """
20 | logger.info(f"Starting OpenVINO Inference API server on {host}:{openarc_port}")
21 | logger.info("Available endpoints:")
22 |     logger.info("  - POST   /optimum/model/load    Load a model")
23 |     logger.info("  - DELETE /optimum/model/unload  Unload current model")
24 |     logger.info("  - GET    /optimum/status        Get model status")
25 |     logger.info("  - GET    /docs                  API documentation (Swagger UI)")
26 | logger.info(" - POST /v1/chat/completions openai compatible endpoint")
27 | logger.info(" - GET /v1/models openai compatible endpoint")
28 |
29 |
30 | # Start the server
31 | uvicorn.run(
32 | "src.api.optimum_api:app",
33 | host=host,
34 | port=openarc_port,
35 | log_level="info"
36 | )
37 |
--------------------------------------------------------------------------------
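A minimal sketch of invoking the launcher programmatically (the repository's start_server.py presumably wraps something similar):

from src.api.launcher import start_server

if __name__ == "__main__":
    # Bind to all interfaces on the default OpenArc port used by the request scripts.
    start_server(host="0.0.0.0", openarc_port=8000)
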
/src/api/optimum_api.py:
--------------------------------------------------------------------------------
1 | # The first implementation of the OpenAI-like API was contributed by @gapeleon.
2 | # They are one hero among many future heroes working to make OpenArc better.
3 |
4 | from fastapi import FastAPI, HTTPException, Depends
5 | from fastapi.responses import StreamingResponse, JSONResponse
6 | from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
7 | from fastapi.middleware.cors import CORSMiddleware
8 |
9 | from typing import Optional, List, Any
10 | from pydantic import BaseModel
11 | from datetime import datetime
12 | from pathlib import Path
13 |
14 | import warnings
15 | import logging
16 | import time
17 | import uuid
18 | import json
19 | import os
20 |
21 | from src.engine.optimum.optimum_base_config import (
22 | OV_LoadModelConfig,
23 | OV_Config,
24 | OV_GenerationConfig,
25 | create_optimum_model,
26 | ModelType
27 | )
28 |
29 |
30 | # Suppress specific deprecation warnings from optimum implementation of numpy arrays
31 | # This block prevents clogging the API logs
32 | warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword")
33 |
34 | app = FastAPI(title="OpenArc API")
35 |
36 | # Configure CORS
37 | app.add_middleware(
38 | CORSMiddleware,
39 | allow_origins=["*"],
40 | allow_credentials=True,
41 | allow_methods=["*"],
42 | allow_headers=["*"],
43 | )
44 |
45 | # Global state to store multiple model instances
46 | model_instances = {}
47 |
48 | logger = logging.getLogger("optimum_api")
49 |
50 | # API key authentication
51 | API_KEY = os.getenv("OPENARC_API_KEY")
52 | security = HTTPBearer()
53 |
54 | async def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
55 | """Verify the API key provided in the Authorization header"""
56 | if credentials.credentials != API_KEY:
57 |         logger.warning("Invalid API key provided")
58 | raise HTTPException(
59 | status_code=401,
60 | detail="Invalid API key",
61 | headers={"WWW-Authenticate": "Bearer"},
62 | )
63 | return credentials.credentials
64 |
65 | def get_final_model_id(model_id: str) -> str:
66 | """Extracts the final segment of the model id path so we dont display the whole path."""
67 | return Path(model_id).name
68 |
69 | @app.post("/optimum/model/load", dependencies=[Depends(verify_api_key)])
70 | async def load_model(load_config: OV_LoadModelConfig, ov_config: OV_Config):
71 | """Load a model with the specified configuration"""
72 | global model_instances
73 | logger.info("POST /optimum/model/load called with load_config: %s, ov_config: %s", load_config, ov_config)
74 | try:
75 | # Initialize new model using the factory function
76 | new_model = create_optimum_model(
77 | load_model_config=load_config,
78 | ov_config=ov_config
79 | )
80 |
81 | # Load the model
82 | new_model.load_model()
83 |
84 | # Store the model instance with its ID as the key
85 | model_id = get_final_model_id(load_config.id_model)
86 | model_instances[model_id] = new_model
87 |
88 | return {"status": "success", "message": f"Model {model_id} loaded successfully"}
89 | except Exception as e:
90 | raise HTTPException(status_code=500, detail=str(e))
91 |
92 | @app.delete("/optimum/model/unload", dependencies=[Depends(verify_api_key)])
93 | async def unload_model(model_id: str):
94 | """Unload the current model"""
95 | global model_instances
96 | logger.info(f"DELETE /optimum/model/unload called for model {model_id}")
97 | if model_id in model_instances:
98 | model_instances[model_id].util_unload_model()
99 | del model_instances[model_id]
100 | return {"status": "success", "message": "Model unloaded successfully"}
101 | return {"status": "success", "message": f"Model {model_id} was not loaded"}
102 |
103 | @app.get("/optimum/status", dependencies=[Depends(verify_api_key)])
104 | async def get_status():
105 | """Get current model status and performance metrics"""
106 | global model_instances
107 | logger.info("GET /optimum/status called")
108 | loaded_models = {}
109 | for model_id, model in model_instances.items():
110 | loaded_models[model_id] = {
111 | "status": "loaded",
112 | "device": model.load_model_config.device,
113 | "model_metadata": model.model_metadata
114 | }
115 |
116 | return {
117 | "loaded_models": loaded_models,
118 | "total_models_loaded": len(model_instances)
119 | }
120 |
121 |
122 | # OpenAI-like API
123 |
124 | class ChatCompletionRequest(BaseModel):
125 | messages: Any
126 | model: str = "default"
127 | temperature: Optional[float] = None
128 | max_tokens: Optional[int] = 8192
129 | stream: Optional[bool] = False
130 | stop: Optional[List[str]] = None
131 | top_p: Optional[float] = None
132 | top_k: Optional[int] = None
133 | repetition_penalty: Optional[float] = None
134 | do_sample: Optional[bool] = None
135 | num_return_sequences: Optional[int] = None
136 |
137 |
138 | class CompletionRequest(BaseModel):
139 | prompt: str
140 | model: str = "default"
141 | temperature: Optional[float] = None
142 | max_tokens: Optional[int] = None
143 | stream: Optional[bool] = False
144 | stop: Optional[List[str]] = None
145 | top_p: Optional[float] = None
146 | top_k: Optional[int] = None
147 | repetition_penalty: Optional[float] = None
148 | do_sample: Optional[bool] = None
149 | num_return_sequences: Optional[int] = None
150 |
151 |
152 | @app.get("/v1/models", dependencies=[Depends(verify_api_key)])
153 | async def get_models():
154 | """Get list of available models in openai format"""
155 | global model_instances
156 | logger.info("GET /v1/models called")
157 | data = []
158 |
159 | for model_id, model in model_instances.items():
160 | model_data = {
161 | "id": model_id,
162 | "object": "model",
163 | "created": int(datetime.now().timestamp()),
164 | "owned_by": "OpenArc",
165 | }
166 | data.append(model_data)
167 |
168 | return {
169 | "object": "list",
170 | "data": data
171 | }
172 |
173 | @app.post("/v1/chat/completions", dependencies=[Depends(verify_api_key)])
174 | async def openai_chat_completions(request: ChatCompletionRequest):
175 | global model_instances
176 | model_id = get_final_model_id(request.model)
177 |
178 | if model_id not in model_instances:
179 |         logger.error(f"POST /v1/chat/completions failed: model {model_id} is not loaded")
180 | raise HTTPException(status_code=503, detail=f"Model {model_id} not loaded")
181 |
182 | model_instance = model_instances[model_id]
183 | logger.info("POST /v1/chat/completions called with messages: %s", request.messages)
184 |
185 | try:
186 | # Handle vision model messages differently
187 | if model_instance.model_metadata["model_type"] == ModelType.VISION:
188 | conversation = []
189 | for msg in request.messages:
190 | if isinstance(msg["content"], list):
191 | # Handle multimodal content (text + images)
192 | vision_message = {
193 | "role": msg["role"],
194 | "content": msg["content"] # Keep the full content structure for vision models
195 | }
196 | conversation.append(vision_message)
197 | else:
198 | # Handle text-only messages
199 | conversation.append({
200 | "role": msg["role"],
201 | "content": msg["content"]
202 | })
203 | else:
204 | # Regular text model handling
205 | conversation = [
206 | {"role": msg["role"], "content": msg["content"]}
207 | for msg in request.messages
208 | ]
209 |
210 | # Build config dict, only include non-None values
211 | config_kwargs = {
212 | "conversation": conversation,
213 | "temperature": request.temperature,
214 | "max_new_tokens": request.max_tokens,
215 | "top_p": request.top_p,
216 | "top_k": request.top_k,
217 | "repetition_penalty": request.repetition_penalty,
218 | "do_sample": request.do_sample,
219 | "num_return_sequences": request.num_return_sequences,
220 | "stream": request.stream,
221 | # Note: stop_sequences is not part of OV_GenerationConfig, handled separately if needed
222 | }
223 | # Remove keys with value None
224 | config_kwargs = {k: v for k, v in config_kwargs.items() if v is not None}
225 |
226 | # Create generation config with filtered arguments
227 | generation_config = OV_GenerationConfig(**config_kwargs)
228 |
229 | if request.stream:
230 | async def stream_generator():
231 | current_metrics = None
232 | try:
233 | if model_instance.model_metadata["model_type"] == ModelType.VISION:
234 | stream_method = model_instance.generate_vision_stream
235 | elif model_instance.model_metadata["model_type"] == ModelType.TEXT:
236 | stream_method = model_instance.generate_stream
237 |
238 | async for token_chunk, metrics_chunk in stream_method(generation_config):
239 | if token_chunk is not None:
240 | # Stream the token chunk
241 | escaped_token = json.dumps(token_chunk)[1:-1]
242 | yield f"data: {{\"object\": \"chat.completion.chunk\", \"choices\": [{{\"delta\": {{\"content\": \"{escaped_token}\"}}}}]}}\n\n"
243 | if metrics_chunk is not None:
244 | # Store the final metrics when received
245 | current_metrics = metrics_chunk
246 |
247 | except Exception as e:
248 | logger.error(f"Error during streaming: {str(e)}", exc_info=True) # Log traceback
249 | finally:
250 | if current_metrics:
251 | # Log the full metrics dictionary as structured JSON
252 | logger.info(f"Performance metrics: {json.dumps(current_metrics, indent=2)}")
253 | yield "data: [DONE]\n\n"
254 |
255 | return StreamingResponse(stream_generator(), media_type="text/event-stream")
256 |
257 | else:
258 | # For non-streaming responses, use the appropriate generate method based on model type
259 | model_type = model_instance.model_metadata["model_type"]
260 | if model_type == ModelType.VISION:
261 | # Call the new vision-specific non-streaming method
262 | generated_text, metrics = model_instance.generate_vision_text(generation_config)
263 | elif model_type == ModelType.TEXT:
264 | generated_text, metrics = model_instance.generate_text(generation_config)
265 | else:
266 | raise HTTPException(status_code=400, detail=f"Unsupported model type '{model_type}' for chat completions.")
267 |
268 | # Log metrics server-side for non-streaming requests
269 | if metrics:
270 | logger.info(f"Performance metrics (non-streaming): {json.dumps(metrics, indent=2)}")
271 |
272 | return JSONResponse(content={
273 | "id": f"ov-{uuid.uuid4()}",
274 | "object": "chat.completion",
275 | "created": int(time.time()),
276 | "model": model_id,
277 | "choices": [{
278 | "message": {"role": "assistant", "content": generated_text},
279 | "finish_reason": "length"
280 | }],
281 | "performance": metrics,
282 | "timings": {
283 | "prompt_tokens": metrics.get("input_tokens", 0),
284 | "completion_tokens": metrics.get("output_tokens", 0),
285 | "total_tokens": metrics.get("input_tokens", 0) + metrics.get("output_tokens", 0)
286 | }
287 | })
288 |
289 | except Exception as e:
290 | raise HTTPException(status_code=500, detail=str(e))
291 |
292 | @app.post("/v1/completions", dependencies=[Depends(verify_api_key)])
293 | async def openai_completions(request: CompletionRequest):
294 | global model_instances
295 | model_id = get_final_model_id(request.model)
296 |
297 | if model_id not in model_instances:
298 |         logger.error(f"POST /v1/completions failed: model {model_id} is not loaded")
299 | raise HTTPException(status_code=503, detail=f"Model {model_id} not loaded")
300 |
301 | model_instance = model_instances[model_id]
302 | logger.info("POST /v1/completions called with prompt: %s", request.prompt)
303 |
304 | # Convert prompt into conversation format (single user message)
305 | conversation = [{"role": "user", "content": request.prompt}]
306 |
307 | # Create generation config
308 | generation_config = OV_GenerationConfig(
309 | conversation=conversation,
310 | temperature=request.temperature or 0.7,
311 | max_new_tokens=request.max_tokens or 8192,
312 | stop_sequences=request.stop or [],
313 | top_p=request.top_p or 0.9, # default value for top_p
314 | top_k=request.top_k or 50, # default value for top_k
315 | repetition_penalty=1.0,
316 | do_sample=True,
317 | num_return_sequences=1
318 | )
319 |
320 | # Use model type to determine which generation method to use
321 | model_type = model_instance.model_metadata["model_type"]
322 |
323 | # Handle streaming response
324 | if request.stream:
325 | async def stream_generator():
326 | # Route to the appropriate stream generator based on model type
327 | if model_type == ModelType.VISION:
328 | stream_method = model_instance.generate_vision_stream
329 | else:
330 | stream_method = model_instance.generate_stream
331 |
332 |             async for token_chunk, _metrics in stream_method(generation_config):
333 |                 if token_chunk is not None:  # final item carries metrics; escape text for SSE
334 |                     escaped_token = json.dumps(token_chunk)[1:-1]
335 |                     yield f"data: {{\"object\": \"text_completion.chunk\", \"choices\": [{{\"text\": \"{escaped_token}\"}}]}}\n\n"
336 |             yield "data: [DONE]\n\n"
337 |
338 | return StreamingResponse(stream_generator(), media_type="text/event-stream")
339 |
340 | # Handle regular response
341 | try:
342 | # For non-streaming responses, use the appropriate generate method
343 | if model_type == ModelType.VISION:
344 | generated_text, metrics = model_instance.generate_text(generation_config)
345 | elif model_type == ModelType.TEXT:
346 | generated_text, metrics = model_instance.generate_text(generation_config)
347 | else:
348 | raise HTTPException(
349 | status_code=400,
350 | detail=f"Unsupported model type '{model_type}' for completions endpoint. Only VISION and TEXT types are supported."
351 | )
352 |
353 | return JSONResponse(content={
354 | "id": f"ov-{uuid.uuid4()}",
355 | "object": "text_completion",
356 | "created": int(time.time()),
357 | "model": model_id,
358 | "choices": [{
359 | "text": generated_text,
360 | "index": 0,
361 | "finish_reason": "length"
362 | }],
363 | "usage": {
364 | "prompt_tokens": metrics.get("input_tokens", 0),
365 | "completion_tokens": metrics.get("output_tokens", 0),
366 | "total_tokens": metrics.get("input_tokens", 0) + metrics.get("output_tokens", 0)
367 | }
368 | })
369 |
370 | except Exception as e:
371 | raise HTTPException(status_code=500, detail=str(e))
372 |
--------------------------------------------------------------------------------
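The streaming branch above emits Server-Sent Events whose data: payloads carry chat.completion.chunk objects and end with data: [DONE]. Below is a minimal client-side sketch using the requests package; the model name is a placeholder and must match a model loaded via /optimum/model/load.

import json
import os
import requests

payload = {
    "model": "phi-4-int4_asym-awq-ov",  # placeholder: use a model id returned by GET /v1/models
    "messages": [{"role": "user", "content": "Tell me a joke."}],
    "max_tokens": 128,
    "stream": True,
}
headers = {"Authorization": f"Bearer {os.environ['OPENARC_API_KEY']}"}

with requests.post("http://localhost:8000/v1/chat/completions",
                   json=payload, headers=headers, stream=True) as response:
    for line in response.iter_lines(decode_unicode=True):
        # SSE lines look like: data: {"object": "chat.completion.chunk", ...}
        if not line or not line.startswith("data: "):
            continue
        data = line[len("data: "):]
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        print(chunk["choices"][0]["delta"].get("content", ""), end="", flush=True)
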
/src/engine/__init__.py:
--------------------------------------------------------------------------------
1 | from src.engine.optimum.optimum_base_config import (
2 | OV_Config,
3 | OV_LoadModelConfig,
4 | OV_GenerationConfig
5 | )
6 |
7 | from src.engine.optimum.optimum_text2text import Optimum_Text2TextCore
8 | from src.engine.optimum.optimum_image2text import Optimum_Image2TextCore
9 |
10 | __all__ = [
11 | "OV_Config",
12 | "OV_LoadModelConfig",
13 | "OV_GenerationConfig",
14 | "Optimum_Text2TextCore",
15 | "Optimum_Image2TextCore"
16 | ]
17 |
--------------------------------------------------------------------------------
/src/engine/optimum/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/__init__.py
--------------------------------------------------------------------------------
/src/engine/optimum/__pycache__/optimum_inference_core.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/__pycache__/optimum_inference_core.cpython-311.pyc
--------------------------------------------------------------------------------
/src/engine/optimum/__pycache__/optimum_inference_core.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/__pycache__/optimum_inference_core.cpython-312.pyc
--------------------------------------------------------------------------------
/src/engine/optimum/optimum_base_config.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from pydantic import BaseModel, Field
3 | from typing import Optional, Any
4 |
5 | class ModelType(str, Enum):
6 | """
7 | Identifiers for model_type: should be extended to include other model types as OpenArc grows.
8 |
9 | TEXT = "TEXT"
10 | VISION = "VISION"
11 | """
12 | TEXT = "TEXT"
13 | VISION = "VISION"
14 |
15 | class OV_Config(BaseModel):
16 | """
17 | OpenVINO runtime optimization parameters passed as a dict in ov_config in from_pretrained.
18 |
19 | args:
20 | NUM_STREAMS: Optional[str] = Field(None, description="Number of inference streams")
21 | PERFORMANCE_HINT: Optional[str] = Field(None, description="LATENCY, THROUGHPUT, CUMULATIVE_THROUGHPUT")
22 | INFERENCE_PRECISION_HINT: Optional[str] = Field(None, description="Options: auto, fp32, fp16, int8")
23 | ENABLE_HYPER_THREADING: Optional[bool] = Field(None, description="Enable hyper-threading")
24 | INFERENCE_NUM_THREADS: Optional[int] = Field(None, description="Number of inference threads")
25 | SCHEDULING_CORE_TYPE: Optional[str] = Field(None, description="Options: ANY_CORE, PCORE_ONLY, ECORE_ONLY")
26 | """
27 | NUM_STREAMS: Optional[str] = Field(None, description="Number of inference streams")
28 | PERFORMANCE_HINT: Optional[str] = Field(None, description="LATENCY, THROUGHPUT, CUMULATIVE_THROUGHPUT")
29 | INFERENCE_PRECISION_HINT: Optional[str] = Field(None, description="Options: auto, fp32, fp16, int8")
30 | ENABLE_HYPER_THREADING: Optional[bool] = Field(None, description="Enable hyper-threading")
31 | INFERENCE_NUM_THREADS: Optional[int] = Field(None, description="Number of inference threads")
32 | SCHEDULING_CORE_TYPE: Optional[str] = Field(None, description="Options: ANY_CORE, PCORE_ONLY, ECORE_ONLY")
33 |
34 | class OV_LoadModelConfig(BaseModel):
35 | """
36 | Configuration for loading the model with transformers.
37 | For inference:
38 | . id_model: model identifier or path
39 | . use_cache: whether to use cache for stateful models. For multi-gpu use false.
40 | . device: device options: CPU, GPU, AUTO
41 | . export_model: whether to export the model
42 |     . dynamic_shapes: whether to use dynamic shapes. Enabled by default and should not be changed except for special cases like NPU.
43 |
44 | Tokenizer specific:
45 | . pad_token_id: custom pad token ID
46 | . eos_token_id: custom end of sequence token ID
47 | . bos_token_id: custom beginning of sequence token ID
48 |
49 | Architecture specific:
50 | . model_type: type of model based on the architecture/task.
51 | - "TEXT" for text-to-text models
52 | - "VISION" for image-to-text models
53 | """
54 | id_model: str = Field(..., description="Model identifier or path")
55 | model_type: ModelType = Field(..., description="Type of model (TEXT or VISION)")
56 | use_cache: Optional[bool] = Field(True, description="Whether to use cache for stateful models. For multi-gpu use false.")
57 | device: str = Field("CPU", description="Device options: CPU, GPU, AUTO")
58 | export_model: bool = Field(False, description="Whether to export the model")
59 | dynamic_shapes: Optional[bool] = Field(True, description="Whether to use dynamic shapes")
60 | pad_token_id: Optional[int] = Field(None, description="Custom pad token ID")
61 | eos_token_id: Optional[int] = Field(None, description="Custom end of sequence token ID")
62 | bos_token_id: Optional[int] = Field(None, description="Custom beginning of sequence token ID")
63 |
64 | class OV_GenerationConfig(BaseModel):
65 | """
66 | Configuration for generation.
67 |
68 | args:
69 | conversation: Any = Field(description="A list of dicts with 'role' and 'content' keys, representing the chat history so far")
70 | # Any was chosen because typing is handled elsewhere and conversation dicts could contain base64 encoded images, audio files, etc.
71 |     # Therefore a layer of pydantic is not meaningful as we get more descriptive errors downstream.
72 | stream: bool = Field(False, description="Whether to stream the generated text")
73 | max_new_tokens: int = Field(128, description="Maximum number of tokens to generate")
74 | temperature: float = Field(1.0, description="Sampling temperature")
75 | top_k: int = Field(50, description="Top-k sampling parameter")
76 | top_p: float = Field(0.9, description="Top-p sampling parameter")
77 | repetition_penalty: float = Field(1.0, description="Repetition penalty")
78 | do_sample: bool = Field(True, description="Use sampling for generation")
79 | num_return_sequences: int = Field(1, description="Number of sequences to return")
80 | """
81 | conversation: Any = Field(description="A list of dicts with 'role' and 'content' keys, representing the chat history so far")
82 | stream: bool = Field(False, description="Whether to stream the generated text")
83 |
84 | # Inference parameters for generation
85 | max_new_tokens: int = Field(128, description="Maximum number of tokens to generate")
86 | temperature: float = Field(1.0, description="Sampling temperature")
87 | top_k: int = Field(50, description="Top-k sampling parameter")
88 | top_p: float = Field(0.9, description="Top-p sampling parameter")
89 | repetition_penalty: float = Field(1.0, description="Repetition penalty")
90 | do_sample: bool = Field(True, description="Use sampling for generation")
91 | num_return_sequences: int = Field(1, description="Number of sequences to return")
92 |
93 | def create_optimum_model(load_model_config: OV_LoadModelConfig, ov_config: Optional[OV_Config] = None):
94 | """
95 | Factory function to create the appropriate Optimum model based on configuration.
96 |
97 | Args:
98 | load_model_config: Configuration for loading the model
99 | ov_config: Optional OpenVINO configuration
100 |
101 | Returns:
102 | An instance of the appropriate model class (TEXT or VISION)
103 |
104 |     Defines: model_instance.model_metadata, which is used for routing decisions at inference time.
105 |     """
110 | # Import model classes here to avoid circular imports
111 | from .optimum_image2text import Optimum_Image2TextCore
112 | from .optimum_text2text import Optimum_Text2TextCore
113 |
114 | # Create the appropriate model instance based on configuration
115 | if load_model_config.model_type == ModelType.VISION:
116 | model_instance = Optimum_Image2TextCore(load_model_config, ov_config)
117 | else:
118 | model_instance = Optimum_Text2TextCore(load_model_config, ov_config)
119 |
120 | # Store metadata from load_model_config and ov_config in model_instance
121 | # This will be used for routing decisions at inference time so we can keep more than one model in memory OR on different devices.
122 | model_instance.model_metadata = {
123 | # Model configuration metadata
124 | "id_model": load_model_config.id_model,
125 | "use_cache": load_model_config.use_cache,
126 | "device": load_model_config.device,
127 | "dynamic_shapes": load_model_config.dynamic_shapes,
128 | "pad_token_id": load_model_config.pad_token_id,
129 | "eos_token_id": load_model_config.eos_token_id,
130 | "bos_token_id": load_model_config.bos_token_id,
131 | "model_type": load_model_config.model_type,
132 | }
133 |
134 | if ov_config:
135 | ov_config_dict = ov_config.model_dump(exclude_unset=True)
136 | model_instance.model_metadata.update({
137 | "NUM_STREAMS": ov_config_dict.get("NUM_STREAMS"),
138 | "PERFORMANCE_HINT": ov_config_dict.get("PERFORMANCE_HINT"),
139 | "INFERENCE_PRECISION_HINT": ov_config_dict.get("INFERENCE_PRECISION_HINT"),
140 | "ENABLE_HYPER_THREADING": ov_config_dict.get("ENABLE_HYPER_THREADING"),
141 | "INFERENCE_NUM_THREADS": ov_config_dict.get("INFERENCE_NUM_THREADS"),
142 | "SCHEDULING_CORE_TYPE": ov_config_dict.get("SCHEDULING_CORE_TYPE")
143 | })
144 |
145 | return model_instance
146 |
--------------------------------------------------------------------------------
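A minimal sketch of using the factory directly, outside the FastAPI layer; the model path is a placeholder for a locally converted OpenVINO model.

from src.engine.optimum.optimum_base_config import (
    ModelType,
    OV_Config,
    OV_LoadModelConfig,
    create_optimum_model,
)

# Placeholder path: point this at a model exported with the optimum-cli tool.
load_config = OV_LoadModelConfig(
    id_model="/path/to/your-model-int4-ov",
    model_type=ModelType.TEXT,
    device="CPU",
)
ov_config = OV_Config(PERFORMANCE_HINT="LATENCY", NUM_STREAMS="1")

# create_optimum_model picks Optimum_Text2TextCore or Optimum_Image2TextCore from model_type
# and attaches the metadata dict used for routing.
model = create_optimum_model(load_config, ov_config)
model.load_model()
print(model.model_metadata)
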
/src/engine/optimum/optimum_image2text.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import gc
3 | import threading
4 | import time
5 | import traceback
6 | import warnings
7 | from io import BytesIO
8 | from typing import AsyncIterator, Optional
9 |
10 | from optimum.intel.openvino import OVModelForVisualCausalLM
11 | from PIL import Image
12 | from transformers import AutoProcessor, TextIteratorStreamer
13 |
14 | from .optimum_base_config import (
15 | ModelType,
16 | OV_Config,
17 | OV_GenerationConfig,
18 | OV_LoadModelConfig,
19 | )
20 |
21 | # Suppress specific deprecation warnings from optimum implementation of numpy arrays
22 | # This block prevents clogging the API logs
23 | warnings.filterwarnings("ignore", message="__array__ implementation doesn't accept a copy keyword")
24 |
25 | class Optimum_Image2TextCore:
26 | """
27 | Loads an OpenVINO model and processor,
28 | Applies a chat template to conversation messages, and generates a response.
29 |
30 |     For OpenVINO, the vision model is split into three parts:
31 | . language_model: The language model part of the vision model.
32 | . text_embeddings: The text embeddings part of the vision model.
33 | . vision_embeddings: The vision embeddings part of the vision model.
34 | """
35 | def __init__(self, load_model_config: OV_LoadModelConfig, ov_config: Optional[OV_Config] = None):
36 | """
37 | Args:
38 | load_model_config: An instance of OV_LoadModelConfig containing parameters
39 | such as id_model, device, export_model, use_cache, and token IDs.
40 | ov_config: Optional OV_Config instance with additional model options.
41 | """
42 | self.load_model_config = load_model_config
43 | self.ov_config = ov_config
44 | self.model = None
45 | self.processor = None
46 | self.model_metadata = {
47 | "model_type": ModelType.VISION,
48 | "id_model": load_model_config.id_model
49 | }
50 |
51 | def load_model(self):
52 | """Load the tokenizer and vision model."""
53 | print(f"Loading model {self.load_model_config.id_model} on device {self.load_model_config.device}...")
54 |
55 | # Extract its configuration as a dict
56 | ov_config_dict = self.ov_config.model_dump(exclude_unset=True) if self.ov_config else {}
57 |
58 | self.model = OVModelForVisualCausalLM.from_pretrained(
59 | self.load_model_config.id_model,
60 | device=self.load_model_config.device,
61 | export_model=self.load_model_config.export_model,
62 | ov_config=ov_config_dict,
63 | dynamic_shapes=self.load_model_config.dynamic_shapes,
64 | use_cache=self.load_model_config.use_cache,
65 | pad_token_id=self.load_model_config.pad_token_id,
66 | eos_token_id=self.load_model_config.eos_token_id,
67 | bos_token_id=self.load_model_config.bos_token_id
68 | )
69 | print("Model loaded successfully.")
70 |
71 | self.processor = AutoProcessor.from_pretrained(self.load_model_config.id_model)
72 | print("Processor loaded successfully.")
73 |
74 | async def generate_vision_stream(
75 | self,
76 | generation_config: OV_GenerationConfig
77 |     ) -> AsyncIterator[tuple[Optional[str], Optional[dict]]]:
78 | """
79 | Asynchronously stream generated text from an image using the provided configuration from
80 | OV_GenerationConfig in completion requests.
81 |
82 | Args:
83 | generation_config: Configuration for text generation
84 | conversation: List of messages to generate text from, can include images
85 | max_new_tokens: Maximum number of tokens to generate
86 | temperature: Temperature for the model
87 | top_p: Top-p value for the model
88 | top_k: Top-k value for the model
89 | repetition_penalty: Repetition penalty for the model
90 | do_sample: Whether to sample from the model
91 | num_return_sequences: Number of sequences to generate
92 |
93 | Yields:
94 | new_text: Generated text tokens as they become available
95 | performance_metrics: Performance metrics for the generation
96 | ttft: Time to first token
97 | generation_time: Time taken to generate the text
98 | tokens_per_second: Tokens per second
99 | average_token_latency: Average token latency
100 | num_tokens_generated: Number of tokens generated
101 | """
102 | if not self.model or not self.processor:
103 | raise ValueError("Model not loaded. Call load_model first.")
104 |
105 | try:
106 | performance_metrics = {}
107 | images = []
108 | text_conversation = []
109 |
110 | for message in generation_config.conversation:
111 | # Check if the message content is a list (multimodal content)
112 | if isinstance(message.get("content", ""), list):
113 | text_parts = []
114 | for content_item in message["content"]:
115 | # Check if this is an image content item
116 | if isinstance(content_item, dict) and content_item.get("type") == "image_url":
117 | image_url = content_item.get("image_url", {})
118 | # Check if it's a base64 encoded image
119 | if isinstance(image_url, dict) and image_url.get("url", "").startswith("data:image/"):
120 | # Extract the base64 data
121 | base64_data = image_url["url"].split(",", 1)
122 | if len(base64_data) > 1:
123 | # Decode base64 to binary
124 | image_data = base64.b64decode(base64_data[1])
125 | # Convert to PIL Image
126 | image = Image.open(BytesIO(image_data))
127 | images.append(image)
128 | # If it's a text content item
129 | elif isinstance(content_item, dict) and content_item.get("type") == "text":
130 | text_parts.append(content_item.get("text", ""))
131 |
132 | # Create a new message with just the text parts
133 | if text_parts:
134 | text_message = message.copy()
135 | text_message["content"] = " ".join(text_parts)
136 | text_conversation.append(text_message)
137 | else:
138 | # If no text parts, still include the message with empty content
139 | text_message = message.copy()
140 | text_message["content"] = ""
141 | text_conversation.append(text_message)
142 | else:
143 | text_conversation.append(message)
144 |
145 | text_prompt = self.processor.apply_chat_template(
146 | generation_config.conversation,
147 | add_generation_prompt=True
148 | )
149 |
150 | if images:
151 | inputs = self.processor(
152 | text=[text_prompt],
153 | images=images,
154 | padding=True,
155 | return_tensors="pt"
156 | )
157 | else:
158 | inputs = self.processor(
159 | text=[text_prompt],
160 | padding=True,
161 | return_tensors="pt"
162 | )
163 |
164 | streamer = TextIteratorStreamer(
165 | self.processor.tokenizer,
166 | skip_prompt=True,
167 | skip_special_tokens=True
168 | )
169 |
170 | # Set up generation parameters
171 | generation_kwargs = dict(
172 | **inputs,
173 | max_new_tokens=generation_config.max_new_tokens,
174 | temperature=generation_config.temperature,
175 | top_p=generation_config.top_p,
176 | top_k=generation_config.top_k,
177 | repetition_penalty=generation_config.repetition_penalty,
178 | do_sample=generation_config.do_sample,
179 | num_return_sequences=generation_config.num_return_sequences,
180 | streamer=streamer
181 | )
182 |
183 | thread = threading.Thread(target=self.model.generate, kwargs=generation_kwargs)
184 |
185 | first_token_received = False
186 | first_token_time = 0.0
187 | ttft = 0.0
188 | num_tokens_generated = 0
189 | generate_start = time.perf_counter()
190 | thread.start()
191 |
192 |             new_text = ""
193 |             for new_token in streamer:
194 |                 num_tokens_generated += 1
195 |                 if not first_token_received:
196 |                     first_token_time = time.perf_counter()
197 |                     ttft = first_token_time - generate_start
198 |                     first_token_received = True
199 |                 new_text += new_token
200 |                 yield new_token, None
202 |
203 | thread.join()
204 | generate_end = time.perf_counter()
205 | generation_time = generate_end - generate_start
206 |
207 | if generation_time > 0 and num_tokens_generated > 0:
208 | tokens_per_second = num_tokens_generated / generation_time
209 | average_token_latency = generation_time / num_tokens_generated
210 |
211 | performance_metrics = {
212 | "ttft": round(ttft, 2),
213 | "generation_time": round(generation_time, 2),
214 | "tokens_per_second": round(tokens_per_second, 2),
215 | "average_token_latency": round(average_token_latency, 2),
216 | "num_tokens_generated": num_tokens_generated,
217 | }
218 |
219 | yield None, performance_metrics
220 |
221 |
222 | except Exception as e:
223 | print(f"Error during vision generation: {str(e)}")
224 | traceback.print_exc()
225 | raise
226 |
227 | finally:
228 | if 'thread' in locals():
229 | thread.join()
230 |
231 | def util_unload_model(self):
232 | """Unload model and free memory"""
233 | del self.model
234 | self.model = None
235 |
236 | del self.processor
237 | self.processor = None
238 |
239 | gc.collect()
240 | print("Model unloaded and memory cleaned up")
241 |
--------------------------------------------------------------------------------
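Since generate_vision_stream is an async generator that yields (token, None) pairs followed by a final (None, metrics) pair, a minimal consumption sketch looks like the following; it assumes vision_model was created with model_type=VISION and already loaded, and that conversation uses the base64 image_url format parsed above.

import asyncio

from src.engine.optimum.optimum_base_config import OV_GenerationConfig

async def stream_description(vision_model, conversation):
    # Assumed inputs: a loaded Optimum_Image2TextCore and an OpenAI-style conversation list.
    config = OV_GenerationConfig(conversation=conversation, max_new_tokens=256, stream=True)
    async for token, metrics in vision_model.generate_vision_stream(config):
        if token is not None:
            print(token, end="", flush=True)
        elif metrics is not None:
            print(f"\n{metrics}")

# Example invocation (requires an event loop):
# asyncio.run(stream_description(vision_model, conversation))
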
/src/engine/optimum/optimum_seq2seq.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/optimum_seq2seq.py
--------------------------------------------------------------------------------
/src/engine/optimum/optimum_speech.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/optimum/optimum_speech.py
--------------------------------------------------------------------------------
/src/engine/optimum/optimum_text2image.py:
--------------------------------------------------------------------------------
1 | # TODO: Implement text-to-image generation using OpenVINO
2 |
3 |
4 |
5 |
6 |
7 | #!/usr/bin/env python3
8 | # Copyright (C) 2024 Intel Corporation
9 | # SPDX-License-Identifier: Apache-2.0
10 |
11 | import argparse
12 |
13 | import openvino_genai
14 | from PIL import Image
15 |
16 |
17 |
18 |
19 | def generate_image(model_dir: str, prompt: str, device: str = 'CPU') -> Image.Image:
20 | """Generate an image from text using OpenVINO text-to-image pipeline.
21 |
22 | Args:
23 | model_dir: Path to the model directory
24 | prompt: Text prompt to generate image from
25 | device: Device to run on ('CPU' or 'GPU')
26 |
27 | Returns:
28 | PIL.Image.Image: Generated image
29 | """
30 | pipe = openvino_genai.Text2ImagePipeline(model_dir, device)
31 |
32 | image_tensor = pipe.generate(
33 | prompt,
34 | width=512,
35 | height=512,
36 | num_inference_steps=20,
37 | num_images_per_prompt=1)
38 |
39 | return Image.fromarray(image_tensor.data[0])
40 |
41 |
42 | if '__main__' == __name__:
43 | # Example usage
44 | image = generate_image("path/to/model", "a scenic landscape")
45 | image.save("image.bmp")
--------------------------------------------------------------------------------
/src/engine/optimum/optimum_text2text.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 | import traceback
4 | import logging
5 | from threading import Thread
6 | from typing import Any, AsyncIterator, Dict, Optional
7 |
8 | from optimum.intel import OVModelForCausalLM
9 | from transformers import AutoTokenizer
10 | from transformers.generation.streamers import TextIteratorStreamer
11 |
12 | from .optimum_base_config import (
13 | OV_Config,
14 | OV_GenerationConfig,
15 | OV_LoadModelConfig,
16 | ModelType
17 | )
18 |
19 | logging.basicConfig(level=logging.INFO)
20 | logger = logging.getLogger(__name__)
21 |
22 | class Optimum_Text2TextCore:
23 | """
24 |     - Instantiated when model_type (the ModelType enum) is TEXT.
25 | - Loads an OpenVINO model and HuggingFace tokenizer
26 | - Used for text-to-text generation only
27 | - Any model which can be converted with the Optimum-CLI tool will work.
28 |
29 | """
30 | def __init__(self, load_model_config: OV_LoadModelConfig, ov_config: Optional[OV_Config] = None):
31 | """
32 | Args:
33 | load_model_config: An instance of OV_LoadModelConfig from POST /optimum/model/load
34 |
35 | ov_config: An instance of OV_Config from POST /optimum/model/load
36 | """
37 | self.load_model_config = load_model_config
38 | self.ov_config = ov_config
39 | self.model = None
40 | self.tokenizer = None
41 | self.model_metadata = {
42 | "model_type": ModelType.TEXT,
43 | "id_model": load_model_config.id_model
44 | }
45 |
46 | def load_model(self):
47 | """Load the tokenizer and model."""
48 | print(f"Loading model {self.load_model_config.id_model} on device {self.load_model_config.device}...")
49 |
50 | # Extract its configuration as a dict
51 | ov_config_dict = self.ov_config.model_dump(exclude_unset=True) if self.ov_config else {}
52 |
53 | # Load model with token IDs from config
54 | self.model = OVModelForCausalLM.from_pretrained(
55 | self.load_model_config.id_model,
56 | device=self.load_model_config.device,
57 | export_model=self.load_model_config.export_model,
58 | ov_config=ov_config_dict,
59 | dynamic_shapes=self.load_model_config.dynamic_shapes,
60 | use_cache=self.load_model_config.use_cache,
61 | pad_token_id=self.load_model_config.pad_token_id,
62 | eos_token_id=self.load_model_config.eos_token_id,
63 | bos_token_id=self.load_model_config.bos_token_id
64 | )
65 | print("Model loaded successfully.")
66 |
67 | self.tokenizer = AutoTokenizer.from_pretrained(self.load_model_config.id_model)
68 | print("Tokenizer loaded successfully.")
69 |
70 | async def generate_stream(self, generation_config: OV_GenerationConfig) -> AsyncIterator[tuple[Optional[str], Optional[Dict[str, Any]]]]:
71 | """
72 | Asynchronously stream generated text tokens, followed by performance metrics.
73 |
74 | Args:
75 | generation_config: Configuration for text generation containing conversation history
76 | and generation parameters
77 |
78 | Yields:
79 | - Tuple of (token, None) for each generated token.
80 | - Tuple of (None, performance_metrics) once at the end.
81 |
82 | performance_metrics contains
83 | - ttft: Time to first token
84 | - generation_time: Time taken to generate the text
85 | - tokens_per_second: Tokens per second
86 | - average_token_latency: Average token latency
87 | - num_tokens_generated: Number of tokens generated
88 | """
89 |
90 | performance_metrics = {}
91 | new_text = ""
92 |
93 | try:
94 | # Convert conversation to input ids using the chat template
95 | input_ids = self.tokenizer.apply_chat_template(
96 | generation_config.conversation,
97 | tokenize=True,
98 | add_generation_prompt=True,
99 | return_tensors="pt"
100 | )
101 |
102 | # Initialize the streamer with tokenized input
103 | streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
104 |
105 | # Create generation kwargs from config
106 | generation_kwargs = dict(
107 | input_ids=input_ids,
108 | max_new_tokens=generation_config.max_new_tokens,
109 | temperature=generation_config.temperature,
110 | top_k=generation_config.top_k,
111 | top_p=generation_config.top_p,
112 | do_sample=generation_config.do_sample,
113 | repetition_penalty=generation_config.repetition_penalty,
114 | num_return_sequences=generation_config.num_return_sequences,
115 | streamer=streamer,
116 | )
117 |
118 | # Create and start the generation thread
119 | thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
120 |
121 | first_token_received = False
122 | first_token_time = 0.0
123 | ttft = 0.0 # Initialize ttft
124 | generate_start = time.perf_counter()
125 | thread.start()
126 |
127 | # Stream the generated text tokens
128 | for new_token in streamer:
129 | if not first_token_received:
130 | first_token_time = time.perf_counter()
131 | ttft = first_token_time - generate_start
132 | first_token_received = True
133 | new_text += new_token
134 | yield new_token, None
135 |
136 | thread.join()
137 | generate_end = time.perf_counter()
138 |
139 | generation_time = generate_end - generate_start
140 |             # Approximate num_tokens_generated by re-encoding the final accumulated text
141 | num_tokens_generated = len(self.tokenizer.encode(new_text, return_tensors="pt")[0])
142 |
143 | if generation_time > 0 and num_tokens_generated > 0:
144 | tokens_per_second = num_tokens_generated / generation_time
145 | average_token_latency = generation_time / num_tokens_generated
146 |
147 | performance_metrics = {
148 | "ttft": round(ttft, 2),
149 | "generation_time": round(generation_time, 2),
150 | "tokens_per_second": round(tokens_per_second, 2),
151 | "average_token_latency": round(average_token_latency, 2),
152 | "num_tokens_generated": num_tokens_generated,
153 | }
154 |
155 | # Yield final metrics after streaming tokens
156 | yield None, performance_metrics
157 |
158 | except Exception as e:
159 | logger.error(f"Error during streaming generation: {str(e)}")
160 | traceback.print_exc()
161 | raise
162 |
163 | def generate_text(self, generation_config: OV_GenerationConfig) -> tuple[str, Dict[str, Any]]:
164 | """
165 |         Generate text without streaming.
166 |         Performance metrics are calculated once generation completes.
167 |
168 | Args:
169 | generation_config: Configuration for text generation containing conversation history
170 | and generation parameters
171 | Returns:
172 | Tuple of (generated_text, performance_metrics)
173 | """
174 | performance_metrics = {}
175 | try:
176 | input_ids = self.tokenizer.apply_chat_template(
177 | generation_config.conversation,
178 | tokenize=True,
179 | add_generation_prompt=False,
180 | return_tensors="pt"
181 | )
182 |
183 | generation_kwargs = dict(
184 | input_ids=input_ids,
185 | max_new_tokens=generation_config.max_new_tokens,
186 | temperature=generation_config.temperature,
187 | top_k=generation_config.top_k,
188 | top_p=generation_config.top_p,
189 | do_sample=generation_config.do_sample,
190 | repetition_penalty=generation_config.repetition_penalty,
191 | num_return_sequences=generation_config.num_return_sequences,
192 | )
193 |
194 |
195 | generate_start = time.perf_counter()
196 |
197 | outputs = self.model.generate(**generation_kwargs)
198 |
199 | generate_end = time.perf_counter()
200 |
201 | # Extract new tokens by excluding the input tokens
202 | new_tokens = outputs[0][input_ids.shape[1]:]
203 |
204 | generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
205 |
206 | generation_time = generate_end - generate_start
207 | num_tokens_generated = len(new_tokens)
208 |
209 | if generation_time > 0 and num_tokens_generated > 0:
210 | tokens_per_second = num_tokens_generated / generation_time
211 | average_token_latency = generation_time / num_tokens_generated
212 |
213 | performance_metrics = {
214 | "generation_time": round(generation_time, 2),
215 | "tokens_per_second": round(tokens_per_second, 2),
216 | "average_token_latency": round(average_token_latency, 2),
217 | "num_tokens_generated": num_tokens_generated,
218 | }
219 |
220 | return generated_text, performance_metrics
221 |
222 | except Exception as e:
223 |             logger.error(f"Error during text generation: {str(e)}")
224 | traceback.print_exc()
225 | raise
226 |
227 | def util_unload_model(self):
228 | """Unload model and free memory"""
229 | del self.model
230 | self.model = None
231 |
232 | del self.tokenizer
233 | self.tokenizer = None
234 |
235 | gc.collect()
236 | print("Model unloaded and memory cleaned up")
237 |
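238 |
239 | # Example usage (illustrative sketch, not part of the module): how a caller might drive the
240 | # streaming API of Optimum_Text2TextCore. The model path is hypothetical, and the exact
241 | # fields of OV_LoadModelConfig / OV_Config / OV_GenerationConfig are assumed from how they
242 | # are used above (id_model/device on the load config, PERFORMANCE_HINT on the OV config,
243 | # conversation plus sampling parameters on the generation config).
244 | #
245 | # import asyncio
246 | #
247 | # async def demo():
248 | #     core = Optimum_Text2TextCore(
249 | #         load_model_config=OV_LoadModelConfig(id_model="/models/my-ov-model", device="CPU"),
250 | #         ov_config=OV_Config(PERFORMANCE_HINT="LATENCY"),
251 | #     )
252 | #     core.load_model()
253 | #     gen_config = OV_GenerationConfig(
254 | #         conversation=[{"role": "user", "content": "Hello!"}],
255 | #         max_new_tokens=64,
256 | #     )
257 | #     async for token, metrics in core.generate_stream(gen_config):
258 | #         if token is not None:
259 | #             print(token, end="", flush=True)   # stream tokens as they arrive
260 | #         else:
261 | #             print("\n", metrics)               # final performance metrics
262 | #
263 | # asyncio.run(demo())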
--------------------------------------------------------------------------------
/src/engine/ov_genai/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/engine/ov_genai/__init__.py
--------------------------------------------------------------------------------
/src/engine/ov_genai/base_configuration.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from pydantic import BaseModel, Field
4 | from typing import Optional
5 |
6 |
7 |
8 | # I'm still working through how to build an API from this. Many other classes inherit from this,
9 | # so the pydantic models must be carefully designed to keep the API useful for other types of models.
10 |
11 |
12 | class OV_GenAI_GenerationConfig(BaseModel):
13 |
14 | # adapters: Optional[AdapterConfig] = Field(None, description="Adapter configuration for LoRA")
15 | assistant_confidence_threshold: float = Field(..., description="Confidence threshold for assistant")
16 | diversity_penalty: float = Field(1.0, description="Diversity penalty for beam search")
17 | do_sample: bool = Field(True, description="Whether to use sampling for generation")
18 | echo: bool = Field(False, description="Whether to echo the prompt in the output")
19 | eos_token_id: int = Field(2, description="Token ID for end of sentence")
20 |
21 | frequency_penalty: float = Field(0.0, description="Frequency penalty for token repetition")
22 | ignore_eos: bool = Field(False, description="Whether to ignore end of sentence token")
23 | include_stop_str_in_output: bool = Field(False, description="Whether to include stop string in output")
24 | length_penalty: float = Field(1.0, description="Exponential penalty to the length for beam search")
25 | logprobs: int = Field(0, description="Number of top logprobs computed for each position")
26 | max_length: int = Field(..., description="Maximum length of generated tokens")
27 | max_new_tokens: int = Field(128, description="Maximum number of new tokens to generate")
28 | max_ngram_size: int = Field(0, description="Maximum n-gram size for no repeat n-gram")
29 | min_new_tokens: int = Field(0, description="Minimum number of new tokens to generate")
30 |
31 | no_repeat_ngram_size: int = Field(0, description="Size of n-gram to avoid repetition")
32 | num_assistant_tokens: int = Field(0, description="Number of assistant tokens")
33 | num_beam_groups: int = Field(1, description="Number of groups to divide beams into")
34 | num_beams: int = Field(1, description="Number of beams for beam search")
35 | num_return_sequences: int = Field(1, description="Number of sequences to return")
36 | presence_penalty: float = Field(0.0, description="Presence penalty for token repetition")
37 | repetition_penalty: float = Field(1.0, description="Repetition penalty for token repetition")
38 | rng_seed: int = Field(0, description="Random number generator seed")
39 |
40 | # stop_criteria: StopCriteria = Field(..., description="Stopping criteria for beam search")
41 |     stop_strings: set[str] = Field(default_factory=set, description="Set of strings to stop generation")
42 |     stop_token_ids: set[int] = Field(default_factory=set, description="Set of token IDs to stop generation")
43 |
44 | temperature: float = Field(1.0, description="Sampling temperature")
45 | top_k: int = Field(50, description="Top-k sampling parameter")
46 | top_p: float = Field(1.0, description="Top-p sampling parameter")
--------------------------------------------------------------------------------
/src/engine/ov_genai/llm_pipe_core.py:
--------------------------------------------------------------------------------
1 | import openvino_genai as ov_genai
2 | import openvino as ov
3 |
4 |
5 | # TODO: Implement LLMPipeline
6 | # Inherit from generator config
7 | # Use DecodedResults
8 | # Use EncodedResults
9 |
10 |
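11 |
12 | # Rough sketch of what this wrapper could build on, assuming the standard openvino_genai
13 | # Python API (LLMPipeline(model_dir, device), GenerationConfig, generate()). Not wired into
14 | # OpenArc yet; the model path below is a hypothetical placeholder.
15 | #
16 | # pipe = ov_genai.LLMPipeline("/models/my-ov-model", "CPU")
17 | # config = ov_genai.GenerationConfig()
18 | # config.max_new_tokens = 128
19 | # print(pipe.generate("What is OpenVINO?", config))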
--------------------------------------------------------------------------------
/src/engine/ov_genai/multimodal_pipe_core.py:
--------------------------------------------------------------------------------
1 | import openvino_genai as ov_genai
2 | import openvino as ov
3 |
4 |
5 | # TODO: Implement VLMPipeline
6 | # Inherit from generator_config.py pydantic models
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/engine/ov_genai/txt2img_pipe_core.py:
--------------------------------------------------------------------------------
1 | import openvino_genai as ov_genai
2 | import openvino as ov
3 |
4 |
5 |
6 |
7 | # TODO: Implement Txt2ImgPipeline
8 |
--------------------------------------------------------------------------------
/src/engine/ov_genai/whisper_pipe_core.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import openvino_genai as ov_genai
4 | import openvino as ov
5 |
6 |
7 | # TODO: Implement WhisperPipeline
8 |
9 |
10 |
11 |
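12 |
13 | # Rough sketch, assuming the standard openvino_genai WhisperPipeline API: the pipeline is
14 | # built from an exported Whisper model directory and generate() takes raw 16 kHz float
15 | # audio samples. The path and the placeholder audio below are hypothetical.
16 | #
17 | # import numpy as np
18 | # pipe = ov_genai.WhisperPipeline("/models/whisper-base-ov", "CPU")
19 | # raw_speech = np.zeros(16000, dtype=np.float32).tolist()  # one second of silence at 16 kHz
20 | # print(pipe.generate(raw_speech))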
--------------------------------------------------------------------------------
/src/frontend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SearchSavior/OpenArc/9fca3140efa12f517776204c048d1e280b44ecc0/src/frontend/__init__.py
--------------------------------------------------------------------------------
/src/frontend/components/device_info.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 |
3 | from ..tools.device_query import DeviceDataQuery, DeviceDiagnosticQuery
4 |
5 |
6 | class DeviceInfoTool:
7 | """
8 | This class is used to get information about the devices available on the system.
9 |     It uses the OpenVINO runtime directly to get the information and is not part of
10 | Optimum-Intel or OpenVINO GenAI.
11 | """
12 | def __init__(self):
13 | self.device_data_query = DeviceDataQuery()
14 | self.device_diagnostic_query = DeviceDiagnosticQuery()
15 |
16 | def get_available_devices(self):
17 | """Get list of available devices from DeviceDiagnosticQuery"""
18 | devices = self.device_diagnostic_query.get_available_devices()
19 | return {"Available Devices": devices}
20 |
21 | def get_device_properties(self):
22 | """Get detailed properties for all available devices from DeviceDataQuery"""
23 | devices = self.device_data_query.get_available_devices()
24 | result = {}
25 |
26 | for device in devices:
27 | properties = self.device_data_query.get_device_properties(device)
28 | result[device] = properties
29 |
30 | return result
31 |
32 | def create_interface(self):
33 | with gr.Tab("Devices"):
34 | with gr.Row():
35 | with gr.Column(scale=1):
36 | gr.Markdown("## Available Devices")
37 | device_list = gr.JSON(label="Device List")
38 | refresh_button = gr.Button("Refresh Device List")
39 | refresh_button.click(
40 | fn=self.get_available_devices,
41 | inputs=[],
42 | outputs=[device_list]
43 | )
44 | with gr.Column(scale=2):
45 | gr.Markdown("## Device Properties")
46 | device_properties = gr.JSON(label="Device Properties")
47 | properties_button = gr.Button("Get Device Properties")
48 | properties_button.click(
49 | fn=self.get_device_properties,
50 | inputs=[],
51 | outputs=[device_properties]
52 | )
53 |
--------------------------------------------------------------------------------
/src/frontend/components/documentation.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from pathlib import Path
3 |
4 | class OpenArc_Documentation:
5 | """
6 | The idea of this class is to help keep the documentation organized in a way that will be easy to migrate
7 | to a new frontend in the future.
8 | Also, keeping everything in its own md file is probably better for searchability from outside GitHub.
9 | """
10 |
11 | def __init__(self):
12 | self.doc_components = {}
13 |
14 | self.doc_categories = {
15 | "Performance Hints": [
16 | "LATENCY",
17 | "THROUGHPUT",
18 | "CUMULATIVE_THROUGHPUT"
19 | ],
20 | "CPU Options": [
21 | "Enable Hyperthreading",
22 | "Inference Num Threads",
23 | "Scheduling Core Type"
24 | ],
25 | "Streaming Options": [
26 | "Num Streams"
27 | ]
28 | }
29 |
30 | # Map topic names to file paths
31 | self.doc_files = {
32 | "LATENCY": "docs/ov_config/performance_hint_latency.md",
33 | "THROUGHPUT": "docs/ov_config/performance_hint_throughput.md",
34 | "CUMULATIVE_THROUGHPUT": "docs/ov_config/performance_hint_cumulative_throughput.md",
35 | "Enable Hyperthreading": "docs/ov_config/enable_hyperthreading.md",
36 | "Inference Num Threads": "docs/ov_config/inference_num_threads.md",
37 |             # "Num Threads": "docs/ov_config/num_threads.md",  # no matching file in docs/ov_config
38 | "Num Streams": "docs/ov_config/num_streams.md",
39 | "Scheduling Core Type": "docs/ov_config/scheduling_core_type.md"
40 | }
41 |
42 | def read_markdown_file(self, file_path):
43 | """Read a markdown file and return its contents"""
44 | try:
45 | path = Path(file_path)
46 | if path.exists():
47 | return path.read_text()
48 | return f"Documentation file not found: {file_path}"
49 | except Exception as e:
50 | return f"Error reading documentation: {str(e)}"
51 |
52 | def display_doc(self, doc_name):
53 | """Display the selected documentation"""
54 | if doc_name in self.doc_files:
55 | return self.read_markdown_file(self.doc_files[doc_name])
56 | return "Please select a documentation topic from the list."
57 |
58 | def create_interface(self):
59 | with gr.Tab("Documentation"):
60 | with gr.Row():
61 | gr.Markdown("# OpenArc Documentation")
62 |
63 | with gr.Row():
64 | # Create columns for layout
65 | nav_col = gr.Column(scale=1)
66 | content_col = gr.Column(scale=3)
67 |
68 | # Create the content markdown component first
69 | with content_col:
70 | doc_content = gr.Markdown(
71 | value="""
72 | # OpenVINO Configuration Documentation
73 |
74 | Welcome to the OpenArc documentation for OpenVINO configuration options.
75 | This documentation will help you understand how to optimize your model inference using various configuration parameters.
76 |
77 | ## Getting Started
78 |
79 | Select a topic from the navigation panel on the left to view detailed documentation.
80 |
81 | The configuration options are organized into categories:
82 | - **Performance Hints**: Options that control the performance optimization strategy
83 | - **CPU Options**: Settings specific to CPU execution
84 | - **Streaming Options**: Parameters for controlling inference streams
85 |                         - Scheduling core type and thread allocation are covered under **CPU Options**
86 | """
87 | )
88 | # Store the component for later reference
89 | self.doc_components['doc_content'] = doc_content
90 |
91 | # Now create the navigation sidebar with buttons
92 | with nav_col:
93 | gr.Markdown("## Configuration Options")
94 |
95 | # Create accordions for each category
96 | for category, topics in self.doc_categories.items():
97 | with gr.Accordion(f"{category} ({len(topics)})", open=True):
98 | for topic in topics:
99 | topic_btn = gr.Button(topic, size="sm")
100 | # Set up click handler for each button
101 | topic_btn.click(
102 | fn=self.display_doc,
103 | inputs=[gr.Textbox(value=topic, visible=False)],
104 | outputs=[self.doc_components['doc_content']]
105 | )
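106 |
107 | # Example (sketch): resolving a topic to its markdown file outside of the Gradio UI.
108 | # "LATENCY" maps to docs/ov_config/performance_hint_latency.md via self.doc_files.
109 | #
110 | # docs = OpenArc_Documentation()
111 | # print(docs.display_doc("LATENCY")[:200])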
--------------------------------------------------------------------------------
/src/frontend/components/loader.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import gradio as gr
3 | import json
4 |
5 |
6 | class Optimum_Loader:
7 | def __init__(self, payload_constructor):
8 | self.payload_constructor = payload_constructor
9 | self.components = {}
10 |
11 | def read_openvino_config(self, id_model):
12 | """Read the OpenVINO config file from the model directory and display it in the dashboard as a JSON object.
13 | This file is generated by the OpenVINO toolkit and contains metadata about the model's configuration and optimizations based on
14 | how it was converted to the OpenVINO IR.
15 | """
16 | try:
17 |             # Convert to Path object and build the path to openvino_config.json
18 | model_path = Path(id_model)
19 | config_path = model_path / "openvino_config.json"
20 |
21 | if config_path.exists():
22 | return json.loads(config_path.read_text())
23 | return {"message": f"No openvino_config.json found in {str(config_path)}"}
24 | except Exception as e:
25 | return {"error": f"Error reading config: {str(e)}"}
26 |
27 | def read_architecture(self, id_model):
28 | """Read the architecture file from the model directory and display it in the dashboard as a JSON object.
29 | While not explicitly required for inference, this file contains metadata about the model's architecture
30 | and can be useful for debugging performance by understanding the model's structure to choose optimization parameters.
31 | """
32 | try:
33 |             # Convert to Path object and build the path to config.json
34 | model_path = Path(id_model)
35 | architecture_path = model_path / "config.json"
36 |
37 | if architecture_path.exists():
38 | return json.loads(architecture_path.read_text())
39 |             return {"message": f"No config.json found in {str(architecture_path)}"}
40 | except Exception as e:
41 | return {"error": f"Error reading architecture: {str(e)}"}
42 |
43 | def read_generation_config(self, id_model):
44 | """Read the generation config file from the model directory and display it in the dashboard as a JSON object.
45 | This file contains the ground truth of what sampling parameters should be used
46 |         for inference. These values are derived directly from the model's PyTorch metadata and should take precedence for benchmarks.
47 | """
48 | try:
49 | model_path = Path(id_model)
50 | generation_config_path = model_path / "generation_config.json"
51 |
52 | if generation_config_path.exists():
53 | return json.loads(generation_config_path.read_text())
54 | return {"message": f"No generation_config.json found in {str(generation_config_path)}"}
55 | except Exception as e:
56 | return {"error": f"Error reading generation config: {str(e)}"}
57 |
58 |
59 | def create_interface(self):
60 | with gr.Tab("Loader"):
61 | with gr.Row():
62 | self.load_model_interface()
63 | self.debug_tool()
64 | self.setup_button_handlers()
65 |
66 | def load_model_interface(self):
67 | with gr.Column(min_width=500, scale=1):
68 | # Model Basic Configuration
69 | with gr.Group("Model Configuration"):
70 | self.components.update({
71 | 'id_model': gr.Textbox(
72 | label="Model Identifier or Path",
73 | placeholder="Enter model identifier or local path",
74 | info="Enter the model's Hugging Face identifier or local path"
75 | ),
76 | 'device': gr.Dropdown(
77 | choices=["", "AUTO", "CPU", "GPU.0", "GPU.1", "GPU.2", "AUTO:GPU.0,GPU.1", "AUTO:GPU.0,GPU.1,GPU.2"],
78 | label="Device",
79 | value="",
80 | info="Select the device for model inference"
81 | ),
82 | 'use_cache': gr.Checkbox(
83 | label="Use Cache",
84 | value=True,
85 | info="Enable cache for stateful models (disable for multi-GPU)"
86 | ),
87 | 'export_model': gr.Checkbox(
88 | label="Export Model",
89 | value=False,
90 |                         info="Whether to convert the model to OpenVINO IR at load time (defaults to int8_asym weight compression). Not recommended; convert ahead of time with optimum-cli instead."
91 | ),
92 | 'dynamic_shapes': gr.Checkbox(
93 | label="Dynamic Shapes",
94 | value=True,
95 | info="Whether to use dynamic shapes. Default is True. Should only be disabled for NPU inference."
96 | ),
97 | 'model_type': gr.Dropdown(
98 | label="Model Type",
99 | choices=["TEXT", "VISION"],
100 | info="Defines the type of model to load. No default; must be specified."
101 | )
102 | })
103 |
104 | # Token Configuration
105 | with gr.Group("Token Settings"):
106 | self.components.update({
107 | 'bos_token_id': gr.Textbox(
108 | label="bos_token_id",
109 | value="",
110 |
111 | ),
112 | 'eos_token_id': gr.Textbox(
113 | label="eos_token_id",
114 | value="",
115 |
116 | ),
117 | 'pad_token_id': gr.Textbox(
118 | label="pad_token_id",
119 | value="",
120 |
121 | )
122 | })
123 |
124 | # Performance Optimization
125 | with gr.Group("Performance Settings"):
126 | self.components.update({
127 | 'num_streams': gr.Textbox(
128 | label="Number of Streams",
129 | value="",
130 | placeholder="Leave empty for default",
131 | info="Number of inference streams (optional)"
132 | ),
133 | 'performance_hint': gr.Dropdown(
134 | choices=["", "LATENCY", "THROUGHPUT", "CUMULATIVE_THROUGHPUT"],
135 | label="Performance Hint",
136 | value="",
137 | info="Select performance optimization strategy"
138 | ),
139 | 'inference_precision_hint': gr.Dropdown(
140 | choices=["", "auto", "fp16", "fp32", "int8"],
141 | label="Precision Hint",
142 | value="",
143 | info="Select model precision for computation"
144 | ),
145 | 'enable_hyperthreading': gr.Checkbox(
146 | label="Enable Hyperthreading",
147 | value=True,
148 | info="Enable hyperthreading for CPU inference"
149 | ),
150 | 'inference_num_threads': gr.Textbox(
151 | label="Inference Number of Threads",
152 | value="",
153 | placeholder="Leave empty for default",
154 | info="Number of inference threads (optional)"
155 | )
156 | })
157 |
158 | # Action Buttons
159 | with gr.Row():
160 | self.components.update({
161 | 'load_button': gr.Button("Load Model", variant="primary"),
162 | 'status_button': gr.Button("Check Status", variant="secondary")
163 | })
164 |
165 | def debug_tool(self):
166 | with gr.Column(min_width=300, scale=1):
167 | with gr.Accordion("Debug Log", open=True):
168 | self.components['debug_log'] = gr.JSON(
169 | label="Log Output",
170 | value={"message": "Debug information will appear here..."},
171 | )
172 | with gr.Accordion("OpenVINO Config", open=False):
173 | self.components['config_viewer'] = gr.JSON(
174 | label="OpenVINO Configuration",
175 | value={"message": "Config will appear here when model path is entered..."},
176 | )
177 | with gr.Accordion("Architecture", open=False):
178 | self.components['architecture_viewer'] = gr.JSON(
179 | label="Architecture",
180 | value={"message": "Architecture will appear here when model path is entered..."},
181 | )
182 |
183 | with gr.Accordion("Generation Config", open=False):
184 | self.components['generation_config_viewer'] = gr.JSON(
185 | label="Generation Configuration",
186 | value={"message": "Generation config will appear here when model path is entered..."},
187 | )
188 |
189 | def setup_button_handlers(self):
190 | self.build_load_request()
191 |
192 | # Add handler for model path changes
193 | self.components['id_model'].change(
194 | fn=self.read_openvino_config,
195 | inputs=[self.components['id_model']],
196 | outputs=[self.components['config_viewer']]
197 | )
198 |
199 | self.components['id_model'].change(
200 | fn=self.read_architecture,
201 | inputs=[self.components['id_model']],
202 | outputs=[self.components['architecture_viewer']]
203 | )
204 |
205 | self.components['id_model'].change(
206 | fn=self.read_generation_config,
207 | inputs=[self.components['id_model']],
208 | outputs=[self.components['generation_config_viewer']]
209 | )
210 |
211 | def build_load_request(self):
212 | self.components['load_button'].click(
213 | fn=self.payload_constructor.load_model,
214 | inputs=[
215 | self.components[key] for key in [
216 | 'id_model', 'device', 'use_cache', 'export_model',
217 | 'num_streams', 'performance_hint', 'inference_precision_hint',
218 | 'model_type',
219 | 'bos_token_id', 'eos_token_id', 'pad_token_id',
220 | 'enable_hyperthreading', 'inference_num_threads', 'dynamic_shapes'
221 | ]
222 | ],
223 | outputs=[self.components['debug_log']]
224 | )
225 |
226 | self.components['status_button'].click(
227 | fn=self.payload_constructor.status,
228 | inputs=None,
229 | outputs=[self.components['debug_log']]
230 | )
231 |
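232 |
233 | # Example (sketch): the three readers above can also be used outside the dashboard to inspect
234 | # an exported model directory. The path is a hypothetical placeholder, and payload_constructor
235 | # is not needed for these read-only helpers, so None is passed here.
236 | #
237 | # loader = Optimum_Loader(payload_constructor=None)
238 | # print(loader.read_architecture("/models/my-ov-model"))        # config.json
239 | # print(loader.read_generation_config("/models/my-ov-model"))   # generation_config.json
240 | # print(loader.read_openvino_config("/models/my-ov-model"))     # openvino_config.json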
--------------------------------------------------------------------------------
/src/frontend/components/model_conversion.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE # Import for default cache_dir
3 |
4 | # Dynamically get tasks (approximation, as original script uses TasksManager)
5 | # In a real scenario, this might need a more robust way to get tasks if TasksManager is available
6 | # For now, using the list from the previous file content + info from the script
7 | AVAILABLE_TASKS = [
8 | 'image-to-image', 'image-segmentation', 'image-text-to-text', 'inpainting',
9 | 'sentence-similarity', 'text-to-audio', 'image-to-text',
10 | 'automatic-speech-recognition', 'token-classification', 'text-to-image',
11 | 'audio-classification', 'feature-extraction', 'semantic-segmentation',
12 | 'masked-im', 'audio-xvector', 'audio-frame-classification',
13 | 'text2text-generation', 'multiple-choice', 'depth-estimation',
14 | 'image-classification', 'fill-mask', 'zero-shot-object-detection',
15 | 'object-detection', 'question-answering', 'zero-shot-image-classification',
16 | 'mask-generation', 'text-generation', 'text-classification',
17 | 'text-generation-with-past'
18 | ]
19 |
20 | class ConversionTool:
21 | def __init__(self):
22 |
23 | self.model_input = gr.Textbox(
24 | label='Model',
25 | placeholder='Model ID on huggingface.co or path on disk',
26 | info="Model ID on huggingface.co or path on disk to load model from." # Updated info
27 | )
28 |
29 | self.output_path = gr.Textbox(
30 | label='Output Directory',
31 | placeholder='Path to store the generated OV model',
32 | info="Path indicating the directory where to store the generated OV model." # Updated info
33 | )
34 |
35 | self.task = gr.Dropdown(
36 | label='Task',
37 | choices=['auto'] + AVAILABLE_TASKS,
38 | value='auto', # Default value is 'auto'
39 | info=( # Updated info
40 | "The task to export the model for. If not specified, the task will be auto-inferred based on metadata in the model repository."
41 |
42 | )
43 | )
44 |
45 | self.framework = gr.Dropdown(
46 | label='Framework',
47 | choices=[None, 'pt', 'tf'], # Added None option
48 | value=None,
49 | info=( # Updated info
50 | "The framework to use for the export. If not provided, will attempt to use the local checkpoint's "
51 | "original framework or what is available in the environment."
52 | )
53 | )
54 |
55 | self.trust_remote_code = gr.Checkbox( # Added trust_remote_code
56 | label='Trust Remote Code',
57 | value=False,
58 | info=(
59 | "Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which "
60 | "you have read the code, as it will execute on your local machine arbitrary code present in the model repository."
61 | )
62 | )
63 |
64 | self.weight_format = gr.Dropdown(
65 | label='Weight Format',
66 |             choices=[None, 'fp32', 'fp16', 'int8', 'int4', 'mxfp4', 'nf4'], # Added None option
67 | value=None,
68 | info="The weight format of the exported model." # Updated info
69 | )
70 |
71 | self.quant_mode = gr.Dropdown( # Added quant_mode
72 | label='Quantization Mode',
73 | choices=[None, 'int8', 'f8e4m3', 'f8e5m2', 'nf4_f8e4m3', 'nf4_f8e5m2', 'int4_f8e4m3', 'int4_f8e5m2'],
74 | value=None,
75 | info=(
76 | "Quantization precision mode. This is used for applying full model quantization including activations. "
77 | )
78 | )
79 |
80 | self.library = gr.Dropdown(
81 | label='Library',
82 | choices=[
83 | None, # Added None option
84 | 'transformers',
85 | 'diffusers',
86 | 'timm',
87 | 'sentence_transformers',
88 | 'open_clip'
89 | ],
90 | value=None, # Default is None, inferred later
91 | info="The library used to load the model before export. If not provided, will attempt to infer the local checkpoint's library" # Updated info
92 | )
93 |
94 | self.cache_dir = gr.Textbox( # Added cache_dir
95 | label='Cache Directory',
96 | placeholder=f'Default: {HUGGINGFACE_HUB_CACHE}', # Use imported default
97 | value=None, # Default to None, let the script handle the default path
98 | info="The path to a directory in which the downloaded model should be cached if the standard cache should not be used."
99 | )
100 |
101 | self.pad_token_id = gr.Number( # Added pad_token_id
102 | label='Pad Token ID',
103 | value=None,
104 | step=1,
105 | info=(
106 | "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it."
107 | )
108 | )
109 |
110 | self.variant = gr.Textbox( # Added variant
111 | label='Variant',
112 | value=None,
113 | info="If specified load weights from variant filename."
114 | )
115 |
116 | self.ratio = gr.Number(
117 | label='Ratio',
118 | value=None, # Default is None
119 | minimum=0.0,
120 | maximum=1.0, # Max is 1.0 according to help text
121 | step=0.1,
122 | info=( # Updated info
123 |                 "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 "
124 |                 "while 20% will be quantized to int8. This helps to achieve better accuracy at the cost of model size and inference latency. Default value is 1.0. "
125 | "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
126 | )
127 | )
128 |
129 | self.sym = gr.Checkbox( # Moved sym higher to group with quantization params
130 | label='Symmetric Quantization',
131 | value=None, # Default is None in script
132 | info=("Whether to apply symmetric quantization") # Updated info
133 | )
134 |
135 | self.group_size = gr.Number(
136 | label='Group Size',
137 | value=None, # Default is None
138 | step=1,
139 | info=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.") # Updated info
140 | )
141 |
142 | self.backup_precision = gr.Dropdown(
143 | label='Backup Precision',
144 | choices=[None, 'none', 'int8_sym', 'int8_asym'], # Added None and 'none'
145 | value=None, # Default is None
146 | info=( # Updated info
147 | "Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight formats. "
148 | "If not provided, backup precision is int8_asym. 'none' stands for original floating-point precision of "
149 | "the model weights, in this case weights are retained in their original precision without any "
150 | "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' "
151 | "stands for 8-bit integer asymmetric quantization with zero points per each quantization group."
152 | )
153 | )
154 |
155 | self.dataset = gr.Dropdown(
156 | label='Dataset',
157 | choices=[None, # Added None option
158 | 'auto',
159 | 'wikitext2',
160 | 'c4',
161 | 'c4-new',
162 | 'contextual',
163 | 'conceptual_captions',
164 | 'laion/220k-GPT4Vision-captions-from-LIVIS',
165 | 'laion/filtered-wit'],
166 | value=None,
167 | info=( # Updated info
168 | "The dataset used for data-aware compression or quantization with NNCF. "
169 | "For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the "
170 | "dataset will be collected from model's generations. "
171 |                 "For diffusion models it should be one of ['conceptual_captions',"
172 | "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
173 | "For visual language models the dataset must be set to 'contextual'. "
174 | "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
175 | "equals 1.0, the dataset argument will not have an effect on the resulting model."
176 | )
177 | )
178 |
179 | self.all_layers = gr.Checkbox(
180 | label='All Layers',
181 | value=None, # Default is None in script
182 | info=( # Updated info
183 |                 "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided and weight "
184 |                 "compression is applied, they are compressed to INT8."
185 | )
186 | )
187 |
188 | self.awq = gr.Checkbox(
189 | label='AWQ',
190 | value=None, # Default is None in script
191 | info=( # Updated info
192 | "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but requires "
193 | "additional time for tuning weights on a calibration dataset. To run AWQ, please also provide a dataset "
194 | "argument. Note: it is possible that there will be no matching patterns in the model to apply AWQ, in such "
195 | "case it will be skipped."
196 | )
197 | )
198 |
199 | self.scale_estimation = gr.Checkbox( # Added scale_estimation
200 | label='Scale Estimation',
201 | value=None, # Default is None in script
202 | info=(
203 | "Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original "
204 | "and compressed layers. Providing a dataset is required to run scale estimation. Please note, that "
205 | "applying scale estimation takes additional memory and time."
206 | )
207 | )
208 |
209 | self.gptq = gr.Checkbox( # Added gptq
210 | label='GPTQ',
211 | value=None, # Default is None in script
212 | info=(
213 | "Indicates whether to apply GPTQ algorithm that optimizes compressed weights in a layer-wise fashion to "
214 | "minimize the difference between activations of a compressed and original layer. Please note, that "
215 | "applying GPTQ takes additional memory and time."
216 | )
217 | )
218 |
219 | self.lora_correction = gr.Checkbox( # Added lora_correction
220 | label='LoRA Correction',
221 | value=None, # Default is None in script
222 | info=(
223 | "Indicates whether to apply LoRA Correction algorithm. When enabled, this algorithm introduces low-rank "
224 | "adaptation layers in the model that can recover accuracy after weight compression at some cost of "
225 | "inference latency. Please note, that applying LoRA Correction algorithm takes additional memory and time."
226 | )
227 | )
228 |
229 | self.sensitivity_metric = gr.Dropdown( # Added sensitivity_metric
230 | label='Sensitivity Metric',
231 | choices=[None, 'weight_quantization_error', 'hessian_input_activation',
232 | 'mean_activation_variance', 'max_activation_variance', 'mean_activation_magnitude'],
233 | value=None,
234 | info=(
235 | "The sensitivity metric for assigning quantization precision to layers. It can be one of the following: "
236 | "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', "
237 | "'max_activation_variance', 'mean_activation_magnitude']."
238 | )
239 | )
240 |
241 | self.num_samples = gr.Number( # Added num_samples
242 | label='Number of Samples',
243 | value=None,
244 | step=1,
245 | info="The maximum number of samples to take from the dataset for quantization." # Updated info
246 | )
247 |
248 | self.disable_stateful = gr.Checkbox(
249 | label='Disable Stateful',
250 | value=False, # Default is False (stateful is enabled by default)
251 | info=( # Updated info
252 | "Disable stateful converted models, stateless models will be generated instead. Stateful models are produced by default when this key is not used. "
253 | "In stateful models all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. "
254 | "If --disable-stateful option is used, it may result in sub-optimal inference performance. "
255 | "Use it when you intentionally want to use a stateless model, for example, to be compatible with existing "
256 | "OpenVINO native inference code that expects KV-cache inputs and outputs in the model."
257 | )
258 | )
259 |
260 | self.disable_convert_tokenizer = gr.Checkbox(
261 | label='Disable Convert Tokenizer',
262 | value=False, # Default is False (conversion is enabled by default)
263 | info="Do not add converted tokenizer and detokenizer OpenVINO models." # Updated info
264 | )
265 |
266 | self.smooth_quant_alpha = gr.Number( # Added smooth_quant_alpha
267 | label='Smooth Quant Alpha',
268 | value=None,
269 | minimum=0.0,
270 | maximum=1.0,
271 | step=0.1,
272 | info=(
273 | "SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and "
274 | "reduces quantization error. Valid only when activations quantization is enabled."
275 | )
276 | )
277 |
278 | self.command_output = gr.TextArea(
279 | label='Generated Command',
280 | placeholder='Generated command will appear here...',
281 | show_label=True,
282 | show_copy_button=True,
283 | lines=5 # Adjust height
284 | )
285 |
286 | def construct_command(self, model_input, output_path, task, framework, trust_remote_code, # Added trust_remote_code
287 | weight_format, quant_mode, library, cache_dir, pad_token_id, variant, # Added new args
288 | ratio, sym, group_size, backup_precision, dataset, all_layers, # Added sym
289 | awq, scale_estimation, gptq, lora_correction, sensitivity_metric, num_samples, # Added new args
290 | disable_stateful, disable_convert_tokenizer, smooth_quant_alpha): # Added smooth_quant_alpha
291 | """Construct the command string"""
292 | if not model_input or not output_path:
293 | return ''
294 |
295 | cmd_parts = ['optimum-cli export openvino']
296 | cmd_parts.append(f'-m "{model_input}"')
297 |
298 | if task and task != 'auto':
299 | cmd_parts.append(f'--task {task}')
300 |
301 | if framework:
302 | cmd_parts.append(f'--framework {framework}')
303 |
304 | if trust_remote_code: # Added trust_remote_code flag
305 | cmd_parts.append('--trust-remote-code')
306 |
307 | if weight_format: # Check if not None/empty
308 | cmd_parts.append(f'--weight-format {weight_format}')
309 |
310 | if quant_mode: # Added quant_mode
311 | cmd_parts.append(f'--quant-mode {quant_mode}')
312 |
313 | if library: # Check if not None/empty
314 | cmd_parts.append(f'--library {library}')
315 |
316 | if cache_dir: # Added cache_dir
317 | cmd_parts.append(f'--cache_dir "{cache_dir}"')
318 |
319 | if pad_token_id: # Added pad_token_id
320 | cmd_parts.append(f'--pad-token-id {int(pad_token_id)}') # Ensure int
321 |
322 | if variant: # Added variant
323 | cmd_parts.append(f'--variant "{variant}"')
324 |
325 | # Compression/Quantization specific args
326 | if ratio: # Check for None explicitly
327 | cmd_parts.append(f'--ratio {ratio}')
328 |
329 | if sym: # Check for None explicitly and True
330 | cmd_parts.append('--sym')
331 |
332 | if group_size: # Check for None explicitly
333 | cmd_parts.append(f'--group-size {int(group_size)}') # Ensure int
334 |
335 | if backup_precision: # Check if not None/empty
336 | cmd_parts.append(f'--backup-precision {backup_precision}')
337 |
338 | if dataset: # Check if not None/empty
339 | cmd_parts.append(f'--dataset {dataset}')
340 |
341 | if all_layers: # Check for None explicitly and True
342 | cmd_parts.append('--all-layers')
343 |
344 | if awq: # Check for None explicitly and True
345 | cmd_parts.append('--awq')
346 |
347 | if scale_estimation: # Added scale_estimation flag
348 | cmd_parts.append('--scale-estimation')
349 |
350 | if gptq is not None and gptq: # Added gptq flag
351 | cmd_parts.append('--gptq')
352 |
353 | if lora_correction: # Added lora_correction flag
354 | cmd_parts.append('--lora-correction')
355 |
356 | if sensitivity_metric: # Added sensitivity_metric
357 | cmd_parts.append(f'--sensitivity-metric {sensitivity_metric}')
358 |
359 | if num_samples: # Added num_samples
360 | cmd_parts.append(f'--num-samples {int(num_samples)}') # Ensure int
361 |
362 | if smooth_quant_alpha: # Added smooth_quant_alpha
363 | cmd_parts.append(f'--smooth-quant-alpha {smooth_quant_alpha}')
364 |
365 | # Other boolean flags
366 | if disable_stateful: # Default is False, only add if True
367 | cmd_parts.append('--disable-stateful')
368 | if disable_convert_tokenizer: # Default is False, only add if True
369 | cmd_parts.append('--disable-convert-tokenizer')
370 |
371 | # Output path is always last and required
372 | cmd_parts.append(f'"{output_path}"')
373 |
374 | constructed_command = ' '.join(cmd_parts)
375 | return constructed_command
376 |
377 | def gradio_app(self):
378 | """Create and run the Gradio interface."""
379 | # Define inputs in the order they appear visually (or logically)
380 | inputs = [
381 | self.model_input,
382 | self.output_path,
383 | self.task,
384 | self.framework,
385 | self.trust_remote_code, # Added
386 | self.weight_format,
387 | self.quant_mode, # Added
388 | self.library,
389 | self.cache_dir, # Added
390 | self.pad_token_id, # Added
391 | self.variant, # Added
392 | # Quantization/Compression Group
393 | self.ratio,
394 | self.sym, # Added
395 | self.group_size,
396 | self.backup_precision,
397 | self.dataset,
398 | self.all_layers,
399 | self.awq,
400 | self.scale_estimation, # Added
401 | self.gptq, # Added
402 | self.lora_correction, # Added
403 | self.sensitivity_metric, # Added
404 | self.num_samples, # Added
405 | self.smooth_quant_alpha, # Added
406 | # Other Flags
407 | self.disable_stateful,
408 | self.disable_convert_tokenizer,
409 | ]
410 | interface = gr.Interface(
411 | fn=self.construct_command,
412 | inputs=inputs,
413 | outputs=self.command_output,
414 | title="OpenVINO IR Model Conversion Tool",
415 | description="""
416 | Enter model information to generate an `optimum-cli export openvino` command.
417 | Use the arguments below to configure the export process based on the OpenVINO exporter documentation.
418 | Then run the generated command in the terminal where your OpenArc environment is activated.
419 | """,
420 | flagging_mode='auto' # Keep or remove based on preference
421 | )
422 |
423 | return interface
424 |
425 |
426 | # if __name__ == "__main__":
427 | # tool = ConversionTool()
428 | # app = tool.gradio_app()
429 | # app.launch(share=False)
430 |
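431 |
432 | # Example (sketch) of the kind of command construct_command() assembles; shown wrapped here
433 | # for readability, the function emits it as a single line. The model ID, output path, and
434 | # quantization settings below are hypothetical placeholders:
435 | #
436 | #   optimum-cli export openvino -m "my-org/my-model" \
437 | #       --task text-generation-with-past --weight-format int4 --sym --group-size 128 \
438 | #       "models/my-model-int4-ov"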
--------------------------------------------------------------------------------
/src/frontend/components/model_manager.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import pandas as pd
3 |
4 |
5 | class ModelManager:
6 | def __init__(self, payload_constructor):
7 | self.payload_constructor = payload_constructor
8 | self.components = {}
9 |
10 | def _refresh_models(self):
11 | """Helper function to fetch and format loaded models data"""
12 | response, _ = self.payload_constructor.status()
13 |
14 | if "error" in response:
15 | return pd.DataFrame(), "Error fetching model status"
16 |
17 | loaded_models = response.get("loaded_models", {})
18 | total_models = response.get("total_models_loaded", 0)
19 |
20 | # Format data for two-column DataFrame
21 | model_data = []
22 | for model_name, model_info in loaded_models.items():
23 | metadata = model_info.get("model_metadata", {})
24 |
25 | # Add model header row
26 | model_data.append({"Attribute": f"{model_name}", "Value": ""})
27 |
28 | # Add all model attributes
29 | model_data.append({"Attribute": "Status", "Value": model_info.get("status", "")})
30 | model_data.append({"Attribute": "Device", "Value": model_info.get("device", "")})
31 | model_data.append({"Attribute": "Path", "Value": metadata.get("id_model", "")})
32 | model_data.append({"Attribute": "use_cache", "Value": metadata.get("use_cache", "")})
33 | model_data.append({"Attribute": "dynamic_shapes", "Value": metadata.get("dynamic_shapes", "")})
34 | model_data.append({"Attribute": "pad_token_id", "Value": metadata.get("pad_token_id", "")})
35 | model_data.append({"Attribute": "eos_token_id", "Value": metadata.get("eos_token_id", "")})
36 | model_data.append({"Attribute": "bos_token_id", "Value": metadata.get("bos_token_id", "")})
37 | model_data.append({"Attribute": "is_vision_model", "Value": metadata.get("is_vision_model", "")})
38 | model_data.append({"Attribute": "is_text_model", "Value": metadata.get("is_text_model", "")})
39 | model_data.append({"Attribute": "NUM_STREAMS", "Value": metadata.get("NUM_STREAMS", "")})
40 | model_data.append({"Attribute": "PERFORMANCE_HINT", "Value": metadata.get("PERFORMANCE_HINT", "")})
41 | model_data.append({"Attribute": "PRECISION_HINT", "Value": metadata.get("PRECISION_HINT", "")})
42 | model_data.append({"Attribute": "ENABLE_HYPER_THREADING", "Value": metadata.get("ENABLE_HYPER_THREADING", "")})
43 | model_data.append({"Attribute": "INFERENCE_NUM_THREADS", "Value": metadata.get("INFERENCE_NUM_THREADS", "")})
44 | model_data.append({"Attribute": "SCHEDULING_CORE_TYPE", "Value": metadata.get("SCHEDULING_CORE_TYPE", "")})
45 |
46 | # Add empty row between models
47 | model_data.append({"Attribute": "", "Value": ""})
48 |
49 | df = pd.DataFrame(model_data)
50 | status_text = f"Total Models Loaded: {total_models}"
51 | return df, status_text
52 |
53 |     def _unload_model(self, model_id):
54 |         """Helper function to unload a model by its ID"""
55 |         response, _ = self.payload_constructor.unload_model(model_id)
56 |         return response
57 |
58 | def _unload_model_ui(self, model_id):
59 | """Helper function to handle model unloading"""
60 | _, status_msg = self.payload_constructor.unload_model(model_id)
61 | return status_msg
62 |
63 | def create_interface(self):
64 | with gr.Tab("Model Manager"):
65 | gr.Markdown("## Model Management Interface")
66 |
67 | with gr.Row():
68 | refresh_btn = gr.Button("Refresh Loaded Models")
69 | status_text = gr.Textbox(label="Status", interactive=False)
70 |
71 | model_table = gr.DataFrame(
72 | headers=["Attribute", "Value"],
73 | datatype=["str", "str"],
74 | interactive=False,
75 | wrap=True,
76 | )
77 |
78 | refresh_btn.click(
79 | fn=self._refresh_models,
80 | outputs=[model_table, status_text]
81 | )
82 |
83 | with gr.Row():
84 | model_id_input = gr.Textbox(label="Model ID to Unload")
85 | unload_btn = gr.Button("Unload Model")
86 | unload_status = gr.Textbox(label="Unload Status", interactive=False)
87 |
88 | unload_btn.click(
89 | fn=self._unload_model_ui,
90 | inputs=model_id_input,
91 | outputs=unload_status
92 | )
93 |
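94 |
95 | # Example (sketch) of the /optimum/status payload shape _refresh_models() expects. Field
96 | # values are hypothetical; only the keys read above ("loaded_models", "total_models_loaded",
97 | # "status", "device", "model_metadata") are taken from the code.
98 | #
99 | # {
100 | #     "total_models_loaded": 1,
101 | #     "loaded_models": {
102 | #         "my-ov-model": {
103 | #             "status": "loaded",
104 | #             "device": "CPU",
105 | #             "model_metadata": {"id_model": "/models/my-ov-model", "use_cache": True}
106 | #         }
107 | #     }
108 | # }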
--------------------------------------------------------------------------------
/src/frontend/tools/device_query.py:
--------------------------------------------------------------------------------
1 | # Diagnostic Device Query
2 |
3 | import openvino as ov
4 | import logging as log
5 |
6 | class DeviceDiagnosticQuery:
7 | def __init__(self):
8 | self.core = ov.Core()
9 | self.available_devices = self.core.available_devices
10 |
11 | def get_available_devices(self):
12 | """Returns a list of available OpenVINO devices."""
13 | return self.available_devices
14 |
15 | #if __name__ == "__main__":
16 | # device_query = DeviceDiagnosticQuery()
17 | # print(device_query.get_available_devices())
18 |
19 |
20 | # Device Query:
21 | # Taken from https://github.com/openvinotoolkit/openvino/blob/master/samples/python/hello_query_device/hello_query_device.py
22 |
23 | class DeviceDataQuery:
24 | def __init__(self):
25 | self.core = ov.Core()
26 |
27 | @staticmethod
28 | def param_to_string(parameters) -> str:
29 | """Convert a list / tuple of parameters returned from OV to a string."""
30 | if isinstance(parameters, (list, tuple)):
31 | return ', '.join([str(x) for x in parameters])
32 | return str(parameters)
33 |
34 | def get_available_devices(self) -> list:
35 | """Return list of available devices."""
36 | return self.core.available_devices
37 |
38 | def get_device_properties(self, device: str) -> dict:
39 | """Get all properties for a specific device."""
40 | properties = {}
41 | supported_properties = self.core.get_property(device, 'SUPPORTED_PROPERTIES')
42 |
43 | for property_key in supported_properties:
44 | if property_key != 'SUPPORTED_PROPERTIES':
45 | try:
46 | property_val = self.core.get_property(device, property_key)
47 | properties[property_key] = self.param_to_string(property_val)
48 | except TypeError:
49 | properties[property_key] = 'UNSUPPORTED TYPE'
50 | return properties
51 |
52 | def print_device_info(self):
53 | """Print information about all available devices."""
54 | log.info('Available devices:')
55 | for device in self.get_available_devices():
56 | log.info(f'{device} :')
57 | log.info('\tSUPPORTED_PROPERTIES:')
58 |
59 | properties = self.get_device_properties(device)
60 | for key, value in properties.items():
61 | log.info(f'\t\t{key}: {value}')
62 | log.info('')
63 |
64 | #def main():
65 | # log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
66 | # query = DeviceDataQuery()
67 | # query.print_device_info()
68 | # return 0
69 |
70 | #if __name__ == '__main__':
71 | # sys.exit(main())
--------------------------------------------------------------------------------
/src/frontend/tools/payload_constructor.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from typing import Optional
3 | import requests
4 | import os
5 |
6 |
7 | # Default OpenARC URL
8 | DEFAULT_OPENARC_PORT = 8000
9 | OPENARC_URL = f"http://localhost:{DEFAULT_OPENARC_PORT}"
10 |
11 | # Update URL if custom port is provided
12 | def update_openarc_url(openarc_port=DEFAULT_OPENARC_PORT):
13 | global OPENARC_URL
14 | OPENARC_URL = f"http://localhost:{openarc_port}"
15 |
16 | def get_auth_headers():
17 | """Get authorization headers with bearer token if available"""
18 | headers = {"Content-Type": "application/json"}
19 | api_key = os.environ.get("OPENARC_API_KEY")
20 | if api_key:
21 | headers["Authorization"] = f"Bearer {api_key}"
22 | return headers
23 |
24 |
25 | class LoadModelConfig(BaseModel):
26 | id_model: str
27 | use_cache: bool
28 | device: str
29 | export_model: bool
30 | model_type: str
31 | eos_token_id: Optional[int] = None
32 | pad_token_id: Optional[int] = None
33 | bos_token_id: Optional[int] = None
34 | dynamic_shapes: bool = True
35 |
36 | class OVConfig(BaseModel):
37 | NUM_STREAMS: Optional[str] = None
38 | PERFORMANCE_HINT: Optional[str] = None
39 | ENABLE_HYPERTHREADING: Optional[bool] = None
40 | INFERENCE_NUM_THREADS: Optional[str] = None
41 | INFERENCE_PRECISION_HINT: Optional[str] = None
42 |
43 | class Payload_Constructor:
44 | def __init__(self):
45 | self.generation_config = {}
46 |
47 | def load_model(self, id_model, device, use_cache, export_model, num_streams, performance_hint, inference_precision_hint, model_type, bos_token_id, eos_token_id, pad_token_id, enable_hyperthreading, inference_num_threads, dynamic_shapes):
48 | """
49 | Constructs and sends the load model request based on UI inputs
50 |
51 | Args:
52 | id_model (str): Model identifier or path
53 | device (str): Device selection for inference
54 | use_cache (bool): Whether to use cache
55 | model_type (str): Defines the type of model to load
56 | export_model (bool): Whether to export the model
57 | num_streams (str): Number of inference streams
58 | performance_hint (str): Performance optimization strategy
59 |             inference_precision_hint (str): Model precision for computation
60 | bos_token_id (str): BOS token ID
61 | eos_token_id (str): EOS token ID
62 | pad_token_id (str): PAD token ID
63 | enable_hyperthreading (bool): Whether to enable hyperthreading
64 | inference_num_threads (str): Number of inference threads
65 | dynamic_shapes (bool): Whether to use dynamic shapes
66 | """
67 |
68 | # Create validated load_config
69 | load_config = LoadModelConfig(
70 | id_model=id_model,
71 | use_cache=use_cache,
72 | device=device,
73 | export_model=export_model,
74 | model_type=model_type,
75 | eos_token_id=int(eos_token_id) if eos_token_id else None,
76 | pad_token_id=int(pad_token_id) if pad_token_id else None,
77 | bos_token_id=int(bos_token_id) if bos_token_id else None,
78 | dynamic_shapes=dynamic_shapes
79 | )
80 |
81 | # Create validated ov_config
82 | ov_config = OVConfig(
83 | NUM_STREAMS=num_streams if num_streams else None,
84 | PERFORMANCE_HINT=performance_hint if performance_hint else None,
85 | ENABLE_HYPERTHREADING=enable_hyperthreading,
86 | INFERENCE_NUM_THREADS=inference_num_threads if inference_num_threads else None,
87 | INFERENCE_PRECISION_HINT=inference_precision_hint if inference_precision_hint else None
88 | )
89 |
90 | try:
91 | response = requests.post(
92 | f"{OPENARC_URL}/optimum/model/load",
93 | headers=get_auth_headers(),
94 | json={
95 | "load_config": load_config.model_dump(exclude_none=True),
96 | "ov_config": ov_config.model_dump(exclude_none=True)
97 | }
98 | )
99 | response.raise_for_status()
100 | return response.json(), f"Model loaded successfully: {response.json()}"
101 | except requests.exceptions.RequestException as e:
102 | return {"error": f"Request failed: {str(e)}"}, f"Error loading model: {str(e)}"
103 |
104 | def unload_model(self, model_id: str):
105 | """
106 | Sends an unload model request to the API
107 |
108 | Args:
109 | model_id (str): The ID of the model to unload
110 | """
111 | try:
112 | response = requests.delete(
113 | f"{OPENARC_URL}/optimum/model/unload",
114 | headers=get_auth_headers(),
115 | params={"model_id": model_id}
116 | )
117 | response.raise_for_status()
118 | return response.json(), f"Model {model_id} unloaded successfully: {response.json()}"
119 | except requests.exceptions.RequestException as e:
120 | return {"error": f"Request failed: {str(e)}"}, f"Error unloading model {model_id}: {str(e)}"
121 |
122 | def status(self):
123 | """
124 | Checks the server status
125 | """
126 | try:
127 | response = requests.get(
128 | f"{OPENARC_URL}/optimum/status",
129 | headers=get_auth_headers()
130 | )
131 | response.raise_for_status()
132 | return response.json(), f"Server status: {response.json()}"
133 | except requests.exceptions.RequestException as e:
134 | return {"error": f"Request failed: {str(e)}"}, f"Error checking server status: {str(e)}"
135 |
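136 |
137 | # Example (sketch): loading a model through the API from a script instead of the dashboard.
138 | # The model path is a hypothetical placeholder and a running OpenArc server is required;
139 | # the keyword arguments mirror the load_model() signature above.
140 | #
141 | # constructor = Payload_Constructor()
142 | # response, message = constructor.load_model(
143 | #     id_model="/models/my-ov-model", device="CPU", use_cache=True, export_model=False,
144 | #     num_streams="", performance_hint="LATENCY", inference_precision_hint="",
145 | #     model_type="TEXT", bos_token_id="", eos_token_id="", pad_token_id="",
146 | #     enable_hyperthreading=True, inference_num_threads="", dynamic_shapes=True,
147 | # )
148 | # print(message)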
--------------------------------------------------------------------------------
/start_dashboard.py:
--------------------------------------------------------------------------------
1 | # OpenArc/start_dashboard.py
2 | import argparse
3 |
4 | import gradio as gr
5 |
6 | from src.frontend.components.device_info import DeviceInfoTool
7 | from src.frontend.components.model_conversion import ConversionTool
8 | from src.frontend.components.documentation import OpenArc_Documentation
9 | from src.frontend.components.loader import Optimum_Loader
10 | from src.frontend.components.model_manager import ModelManager
11 | from src.frontend.tools.payload_constructor import (
12 | Payload_Constructor,
13 | update_openarc_url,
14 | )
15 |
16 | if __name__ == "__main__":
17 |     parser = argparse.ArgumentParser(description="Start the OpenArc Dashboard")
18 |
19 | parser.add_argument("--openarc-port", type=int, default=8000,
20 | help="Port for the OpenARC server (default: 8000)")
21 |
22 | args = parser.parse_args()
23 | # Update OpenARC URL with the provided port
24 | update_openarc_url(args.openarc_port)
25 |
26 | # Create the dashboard components
27 | payload_constructor = Payload_Constructor()
28 |
29 | # Set up the Gradio interface
30 | with gr.Blocks(title="OpenARC Dashboard") as demo:
31 | with gr.Tabs():
32 | # Main tabs
33 | optimum_loader = Optimum_Loader(payload_constructor)
34 | optimum_loader.create_interface()
35 |
36 | model_manager = ModelManager(payload_constructor)
37 | model_manager.create_interface()
38 |
39 | # Tools tab with sub-tabs
40 | with gr.Tab("Tools"):
41 | with gr.Tabs():
42 | with gr.Tab("Model Conversion"):
43 | conversion_tool = ConversionTool()
44 | conversion_tool.gradio_app()
45 |
46 | # Device Information tab
47 | device_info_tool = DeviceInfoTool()
48 | device_info_tool.create_interface()
49 |
50 | # Documentation tab
51 | documentation = OpenArc_Documentation()
52 | documentation.create_interface()
53 |
54 | # Launch the dashboard
55 | demo.launch()
56 |
--------------------------------------------------------------------------------
/start_server.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from src.api.launcher import start_server
3 |
4 | if __name__ == "__main__":
5 | parser = argparse.ArgumentParser(description="Start the OpenVINO Inference API server")
6 | parser.add_argument("--host", type=str, default="0.0.0.0",
7 | help="Host to bind the server to (default: 0.0.0.0)")
8 | parser.add_argument("--openarc-port", type=int, default=8000,
9 | help="Port to bind the server to (default: 8000)")
10 | args = parser.parse_args()
11 | start_server(host=args.host, openarc_port=args.openarc_port)
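12 |
13 | # Typical usage (defaults match the argparse definitions above):
14 | #   python start_server.py --host 0.0.0.0 --openarc-port 8000
15 | # The dashboard started by start_dashboard.py should target the same port via --openarc-port.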
--------------------------------------------------------------------------------