├── requirements.txt ├── LICENSE ├── README.md └── highCompute.py /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==4.44.1 2 | pydantic==2.10.6 3 | pydantic_core==2.27.2 4 | requests 5 | python-dotenv 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # highCompute.py 2 | 3 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | 5 | A single Python file that connects via the OpenAI Chat Completions API, giving you something akin to OpenAI High Compute at home. **Any** models are compatible. 
By applying dynamic programming methods, it scales the amount of computation spent on a query by tens or even hundreds of times for both reasoning and non-reasoning models, significantly improving answer quality and the ability to solve tasks that are extremely hard for LLMs. 6 | 7 | This is a simple Gradio-based web application providing an interface for interacting with a locally hosted Large Language Model (LLM). The key feature is the ability to select a "Computation Level," which determines the strategy for processing user queries—ranging from direct responses to multi-level task decomposition for obtaining more structured and comprehensive answers to complex queries. 8 | 9 | 10 | 11 | https://github.com/user-attachments/assets/8cc0fa3d-69fa-4183-8d6b-004ea934fd78 12 | 13 | ![Снимок экрана_20250427_174119](https://github.com/user-attachments/assets/b70d8c5d-911a-4c49-ab37-b4ec9925086e) 14 | 15 | 16 | 17 | 18 | The application connects to your specified LLM API endpoint, compatible with the OpenAI Chat Completions API format. 19 | 20 | ## 🌟 Key Features 21 | 22 | * **Local LLM Integration:** Works with your own LLM server (e.g., llama.cpp, Ollama, LM Studio, vLLM with an OpenAI-compatible endpoint). 23 | * **Compute Levels:** 24 | * **Low:** Direct query to the LLM for a quick response. This is a standard chat mode. Generates N tokens — for example, solving a task may only consume 700 tokens. 25 | * **Medium:** Single-level task decomposition into subtasks, solving them, and synthesizing the final answer. Suitable for moderately complex queries. The number of generated tokens is approximately 10-15x higher compared to Low Compute (average value, depends on the task): if solving a task in Low Compute took 700 tokens, Medium level would require around 7,000 tokens. 26 | * **High:** Two-level task decomposition (stages → steps), solving individual steps, synthesizing stage results, and generating the final answer. Designed for highly complex and multi-component tasks. The number of generated tokens is approximately 100-150x higher compared to Low Compute: if solving a task in Low Compute took 700 tokens, High level would require around 70,000 tokens. 27 | * **Flexible Compute Adjustment:** You can freely adjust the Compute Level for each query individually. For example, initiate the first query in High Compute, then switch to Low mode, and later use Medium Compute to solve a specific problem mid-chat. 28 | 29 | ## ⚙️ How It Works: Computation Levels 30 | 31 | The core idea is that for complex tasks, a simple direct query to the LLM may not yield optimal results. Decomposition allows breaking down a complex problem into smaller, manageable parts, solving them individually, and then combining the results. 32 | 33 | 1. **Low:** 34 | * `User Query` → `LLM (single call)` → `Response` 35 | * The fastest mode, suitable for simple questions or when a quick response is needed. Essentially, this is the standard chat mode. 36 | 37 | 2. **Medium:** 38 | * `User Query` → `LLM (decomposition request)` → `List of subtasks` 39 | * *For each subtask:* `Subtask + Context` → `LLM (subtask solution)` → `Subtask result` 40 | * `All subtask results + Original query` → `LLM (final synthesis)` → `Final answer` 41 | * Uses multiple LLM calls. Decomposition and synthesis requests use a lower `temperature` for greater predictability. 42 | 43 | 3.
**High:** 44 | * `User Query` → `LLM (Level 1 decomposition)` → `List of stages (L1)` 45 | * *For each L1 stage:* 46 | * `L1 Stage + Context` → `LLM (Level 2 decomposition)` → `List of steps (L2)` 47 | * *If L2 decomposition is not needed:* `L1 Stage + Context` → `LLM (direct L1 stage solution)` → `L1 Stage result` 48 | * *If L2 decomposition succeeds:* 49 | * *For each L2 step:* `L2 Step + L1 Context` → `LLM (L2 step solution)` → `L2 Step result` 50 | * `All L2 Step results + L1 Context` → `LLM (L1 stage synthesis)` → `L1 Stage result` 51 | * `All L1 Stage results + Original query` → `LLM (final synthesis)` → `Final answer` 52 | * The most resource-intensive mode, using multiple LLM calls. Designed for highly complex tasks requiring multi-stage planning and solving. Uses a lower `temperature` for all decomposition and synthesis steps. If L1 decomposition fails, it automatically switches to `Medium` mode. WARNING! This can increase the number of generated tokens by hundreds of times! If you're using a paid API, consider this carefully! 53 | 54 | ## 📋 Prerequisites 55 | 56 | * **Python 3.11** 57 | * **pip** (Python package manager) 58 | * **A working LLM server:** You need an accessible HTTP server with an LLM that provides an API compatible with [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create). 59 | * Examples of such servers: 60 | * [Ollama](https://ollama.ai/) (with the `--api` flag or via a separate proxy for OpenAI compatibility) 61 | * [LM Studio](https://lmstudio.ai/) (provides an OpenAI-compatible endpoint) 62 | * [vLLM](https://github.com/vllm-project/vllm) (with an OpenAI-compatible server) 63 | * [OpenRouter](https://openrouter.ai/). 64 | * **Important:** The server must accept POST requests at the path specified in `LLM_API_ENDPOINT` (default: `/v1/chat/completions`) and process JSON data in OpenAI format (fields: `model`, `messages`, `temperature`, `top_p`, `top_k`). The response must also follow the OpenAI format (expected field: `choices[0].message.content`). 65 | 66 | ## 🚀 Installation 67 | 68 | 1. **Clone the repository:** 69 | ```bash 70 | git clone https://github.com/AlexBefest/highCompute.py.git 71 | cd highCompute.py 72 | ``` 73 | 74 | 2. **Create and activate a virtual environment (recommended):** 75 | * On Linux/macOS: 76 | ```bash 77 | python3 -m venv venv 78 | source venv/bin/activate 79 | ``` 80 | * On Windows: 81 | ```bash 82 | python -m venv venv 83 | .\venv\Scripts\activate 84 | ``` 85 | 86 | 3. **Install dependencies:** 87 | Install Python dependencies: 88 | ```bash 89 | pip install -r requirements.txt 90 | ``` 91 | 92 | ## ⚙️ Configuration 93 | 94 | 1. **Create a `.env` file** in the project root folder. 95 | 2. **Add `LLM_API_ENDPOINT`, `LLM_MODEL`, and `LLM_API_KEY` to `.env`**, specifying the full URL of your local LLM API endpoint compatible with OpenAI Chat Completions API, your LLM model name, and API key. 96 | 97 | **Example `.env` file content:** 98 | ```dotenv 99 | LLM_API_ENDPOINT=http://192.168.2.33:8000/v1/chat/completions 100 | LLM_API_KEY="token-abc123" 101 | LLM_MODEL="AlexBefest/Gemma3-27B" 102 | ``` 103 | * Ensure your LLM server is actually listening at this address and path. 104 | 105 | ## ▶️ Running the Application 106 | 107 | 1. **Ensure your local LLM server is running** and accessible at the URL specified in `.env` (or the default address). 108 | 2. **Run the Python script:** 109 | ```bash 110 | python highCompute.py 111 | ``` 112 | 3. 
**Open the web interface:** The console will display a Gradio message with the local URL, typically `http://127.0.0.1:7860`. Open this URL in your web browser. 113 | 114 | ## 💬 Using the Interface 115 | 116 | 1. **Select Computation Level:** Low, Medium, or High, depending on query complexity. 117 | 2. **(Optional) Adjust parameters:** Modify the `Temperature`, `Top-P`, and `Top-K` sliders if you want to change the LLM's response style. 118 | * `Temperature`: Controls randomness. Lower values (closer to 0) make responses more deterministic and focused. Higher values (closer to 2.0) make responses more creative and diverse but may lead to "hallucinations." 119 | * `Top-P`: Nucleus sampling. The model only considers tokens whose cumulative probability is ≥ `top_p`. A value of `1.0` disables this parameter. 120 | * `Top-K`: Only the top `k` most probable tokens are considered. A value of `0` disables this parameter. 121 | 3. **Enter your query:** Type your message in the "Your message" text field at the bottom. 122 | 4. **Submit the query:** Press Enter or click the "Submit" button. 123 | 5. **View the response:** The LLM's answer will appear in the chat window. 124 | 6. **Continue the conversation:** Enter follow-up messages. Chat history is preserved and passed to the LLM for context. 125 | 7. **Clear chat:** Click the "Clear Chat" button to reset history and start a new conversation. 126 | 127 | ## ⚠️ Important Notes & Troubleshooting 128 | 129 | * **LLM API Compatibility:** Ensure your LLM endpoint *strictly* follows the OpenAI Chat Completions API format for requests and responses. Incompatibility will cause errors. 130 | * **Performance:** `Medium` and especially `High` modes perform multiple sequential LLM calls, significantly increasing response time compared to `Low` mode. 131 | * **Decomposition Quality:** The success of `Medium` and `High` modes heavily depends on the LLM's ability to understand and execute decomposition and synthesis instructions. Quality may vary based on the LLM model and task complexity. Sometimes, the LLM may fail to decompose the task or return a response not in a numbered list format. 132 | * **Method Efficiency:** Note that this method may be inefficient with smaller models. 133 | * **Network Errors:** If you see "Network error," check if your LLM server is running and accessible at the `.env`-specified address. Verify network and firewall settings. 134 | * **JSON Errors:** If you see "Error: Failed to decode JSON response" or "Invalid format," this means the LLM server returned a response that is not valid JSON or does not match the expected OpenAI structure. Check your LLM server logs. 135 | 136 | *** 137 | 138 | # highCompute.py 139 | 140 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 141 | 142 | Всего один python-файл, подключение через OpenAI Chat Completions API, и вы получаете что-то вроде OpenAI High Compute at home. Совместимы **любые** модели. Используя метод динамического программирования, увеличение количества вычислений в десятки и сотни раз для reasoning и no-reasoning моделей, что значительно повышает качество ответов и способность решать чрезвычайно сложные для LLM задачи. 143 | 144 | Это простое веб-приложение на базе Gradio, предоставляющее интерфейс для взаимодействия с локально запущенной Большой Языковой Моделью (LLM). 
Ключевой особенностью является возможность выбора "Уровня Вычислений" (Computation Level), который определяет стратегию обработки запроса пользователя: от прямого ответа до многоуровневой декомпозиции задачи для получения более структурированных и полных ответов на сложные запросы. 145 | 146 | ![image](https://github.com/user-attachments/assets/8886405d-9a49-41ca-89d1-900fdc136d8d) 147 | 148 | 149 | Приложение подключается к указанному вами API-эндпоинту LLM, совместимому с форматом OpenAI Chat Completions API. 150 | 151 | ## 🌟 Ключевые возможности 152 | 153 | * **Подключение к локальной LLM:** Работает с вашим собственным LLM-сервером (например, llama.cpp, Ollama, LM Studio, vLLM с OpenAI-совместимым эндпоинтом). 154 | * **Уровни Вычислений:** 155 | * **Low (Низкий):** Прямой запрос к LLM для быстрого ответа. Это совершенно обычный режим чата. Генерируется N-токенов: допустим, на решение задачи ушло всего 7000 токенов. 156 | * **Medium (Средний):** Одноуровневая декомпозиция задачи на подзадачи, их решение и последующий синтез ответа. Подходит для умеренно сложных запросов. Количество генерируемых токенов примерно в 10-15 раз больше по отношению к Low Compute (среднее значение, всё зависит от задачи): если бы на low compute решение задачи заняло 700 токенов, то на Medium уровне примерно 7000 токенов. 157 | * **High (Высокий):** Двухуровневая декомпозиция задачи (этапы -> шаги), решение шагов, синтез результатов этапов и финальный синтез общего ответа. Предназначен для наиболее сложных и многокомпонентных задач. Количество генерируемых токенов примерно в 100-150 раз больше по отношению к уровню Low: если бы на low compute решение задачи заняло 700 токенов, то на High уровне это заняло бы 70000 токенов. 158 | * **Свободная регулировка Compute:** Вы можете свободно регулировать Compute Level для каждого вашего запроса отдельно. Например, первый запрос инициировать на High Compute, затем поработать в режиме Low, и в середине чата решить сделать Medium Compute для решения определённой проблемы. 159 | 160 | ## ⚙️ Как это работает: Уровни Вычислений 161 | 162 | Основная идея заключается в том, что для сложных задач простой прямой запрос к LLM может не дать оптимального результата. Декомпозиция позволяет разбить сложную проблему на более мелкие, управляемые части, решить их по отдельности, а затем объединить результаты. 163 | 164 | 1. **Low (Низкий):** 165 | * `Пользовательский запрос` -> `LLM (один вызов)` -> `Ответ` 166 | * Самый быстрый режим, подходит для простых вопросов или когда требуется максимально быстрая реакция. Фактически, это стандартный режим чата. 167 | 168 | 2. **Medium (Средний):** 169 | * `Пользовательский запрос` -> `LLM (запрос на декомпозицию)` -> `Список подзадач` 170 | * *Для каждой подзадачи:* `Подзадача + Контекст` -> `LLM (решение подзадачи)` -> `Результат подзадачи` 171 | * `Все результаты подзадач + Исходный запрос` -> `LLM (синтез финального ответа)` -> `Финальный ответ` 172 | * Использует несколько вызовов LLM. Запросы на декомпозицию и синтез используют пониженную `temperature` для большей предсказуемости. 173 | 174 | 3. 
**High (Высокий):** 175 | * `Пользовательский запрос` -> `LLM (декомпозиция Уровня 1)` -> `Список этапов (L1)` 176 | * *Для каждого этапа L1:* 177 | * `Этап L1 + Контекст` -> `LLM (декомпозиция Уровня 2)` -> `Список шагов (L2)` 178 | * *Если декомпозиция L2 не требуется:* `Этап L1 + Контекст` -> `LLM (прямое решение этапа L1)` -> `Результат этапа L1` 179 | * *Если декомпозиция L2 удалась:* 180 | * *Для каждого шага L2:* `Шаг L2 + Контекст L1` -> `LLM (решение шага L2)` -> `Результат шага L2` 181 | * `Все результаты шагов L2 + Контекст L1` -> `LLM (синтез результата этапа L1)` -> `Результат этапа L1` 182 | * `Все результаты этапов L1 + Исходный запрос` -> `LLM (финальный синтез)` -> `Финальный ответ` 183 | * Самый ресурсоемкий режим, использующий множество вызовов LLM. Предназначен для очень сложных задач, требующих многоэтапного планирования и решения. Использует пониженную `temperature` для всех шагов декомпозиции и синтеза. Если декомпозиция L1 не удается, автоматически переключается на режим `Medium`. ВНИМАНИЕ! Может увеличить количество генерируемых токенов в сотни раз! Если вы используете платный API, вам стоит это учитывать! 184 | 185 | ## 📋 Предварительные требования 186 | 187 | * **Python 3.11+** 188 | * **pip** (менеджер пакетов Python) 189 | * **Работающий LLM сервер:** Вам необходим доступный по HTTP сервер с LLM, который предоставляет API, совместимое с [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create). 190 | * Примеры таких серверов: 191 | * [Ollama](https://ollama.ai/) (с флагом `--api` или через отдельный прокси для OpenAI совместимости) 192 | * [LM Studio](https://lmstudio.ai/) (предоставляет OpenAI-совместимый эндпоинт) 193 | * [vLLM](https://github.com/vllm-project/vllm) (с OpenAI-совместимым сервером) 194 | * [OpenRouter](https://openrouter.ai/). 195 | * **Важно:** Сервер должен принимать POST-запросы на путь, указанный в `LLM_API_ENDPOINT` (по умолчанию `/v1/chat/completions`), и обрабатывать JSON-данные в формате OpenAI (поля `model`, `messages`, `temperature`, `top_p`, `top_k`). Ответ также должен соответствовать формату OpenAI (ожидается поле `choices[0].message.content`). 196 | 197 | ## 🚀 Установка 198 | 199 | 1. **Клонируйте репозиторий:** 200 | ```bash 201 | git clone https://github.com/AlexBefest/highCompute.py.git 202 | cd highCompute.py 203 | ``` 204 | 205 | 2. **Создайте и активируйте виртуальное окружение (рекомендуется):** 206 | * На Linux/macOS: 207 | ```bash 208 | python3 -m venv venv 209 | source venv/bin/activate 210 | ``` 211 | * На Windows: 212 | ```bash 213 | python -m venv venv 214 | .\venv\Scripts\activate 215 | ``` 216 | 217 | 3. **Установите зависимости:** 218 | Выполните установку python-зависимостей: 219 | ```bash 220 | pip install -r requirements.txt 221 | ``` 222 | 223 | ## ⚙️ Конфигурация 224 | 225 | 1. **Создайте файл `.env`** в корневой папке проекта. 226 | 2. **Добавьте в `.env` переменную `LLM_API_ENDPOINT`, `LLM_MODEL` и `LLM_API_KEY`**, указав полный URL вашего локального LLM API эндпоинта, который совместим с OpenAI Chat Completions API. А также имя вашей LLM-модели и API-ключ. 227 | 228 | **Пример содержимого файла `.env`:** 229 | ```dotenv 230 | LLM_API_ENDPOINT=http://192.168.2.33:8000/v1/chat/completions 231 | LLM_API_KEY="token-abc123" 232 | LLM_MODEL ="AlexBefest/Gemma3-27B" 233 | ``` 234 | * Убедитесь, что ваш LLM сервер действительно слушает этот адрес и путь. 235 | 236 | ## ▶️ Запуск приложения 237 | 238 | 1. 
**Убедитесь, что ваш локальный LLM сервер запущен** и доступен по URL, указанному в файле `.env` (или по адресу по умолчанию). 239 | 2. **Запустите Python скрипт:** 240 | ```bash 241 | python highCompute.py 242 | ``` 243 | 3. **Откройте веб-интерфейс:** В консоли вы увидите сообщение от Gradio с локальным URL, обычно `http://127.0.0.1:7860`. Откройте этот URL в вашем веб-браузере. 244 | 245 | ## 💬 Использование интерфейса 246 | 247 | 1. **Выберите Уровень Вычислений (Computation Level):** Low, Medium или High, в зависимости от сложности вашего запроса. 248 | 2. **(Опционально) Настройте параметры:** Отрегулируйте ползунки `Temperature`, `Top-P`, `Top-K`, если хотите изменить стиль генерации ответа LLM. 249 | * `Temperature`: Контролирует случайность. Низкие значения (ближе к 0) делают ответы более детерминированными и сфокусированными. Высокие значения (ближе к 2.0) делают ответы более креативными и разнообразными, но могут привести к "галлюцинациям". 250 | * `Top-P`: Нуклеусное сэмплирование. Модель рассматривает только токены, чья суммарная вероятность больше или равна `top_p`. Значение `1.0` отключает этот параметр. 251 | * `Top-K`: Рассматриваются только `k` наиболее вероятных токенов. Значение `0` отключает этот параметр. 252 | 3. **Введите ваш запрос:** Напишите сообщение в текстовое поле "Your message" внизу. 253 | 4. **Отправьте запрос:** Нажмите Enter или кнопку "Submit". 254 | 5. **Просмотрите ответ:** Ответ LLM появится в окне чата. 255 | 6. **Продолжайте диалог:** Вводите следующие сообщения. История чата будет сохраняться и передаваться LLM для контекста. 256 | 7. **Очистить чат:** Нажмите кнопку "Clear Chat", чтобы сбросить историю и начать новый диалог. 257 | 258 | ## ⚠️ Важные замечания и устранение неисправностей 259 | 260 | * **Совместимость LLM API:** Убедитесь, что ваш LLM эндпоинт *строго* следует формату OpenAI Chat Completions API для запросов и ответов. Несовместимость формата приведет к ошибкам. 261 | * **Производительность:** Режимы `Medium` и особенно `High` выполняют несколько последовательных вызовов LLM, что значительно увеличивает время ожидания ответа по сравнению с режимом `Low`. 262 | * **Качество декомпозиции:** Успех режимов `Medium` и `High` сильно зависит от способности LLM понимать и выполнять инструкции по декомпозиции и синтезу. Качество может варьироваться в зависимости от используемой модели LLM и сложности исходной задачи. Иногда LLM может не суметь разбить задачу или вернуть ответ не в виде нумерованного списка. 263 | * **Эффективность метода:** Нужно понимать, что данный метод может быть неэффективен с небольшими моделями 264 | * **Сетевые ошибки:** Если вы видите "Network error", проверьте, запущен ли ваш LLM сервер и доступен ли он по указанному в `.env` адресу. Проверьте настройки сети и файрвола. 265 | * **Ошибки JSON:** Если вы видите "Error: Failed to decode JSON response" или "Invalid format", это означает, что LLM сервер вернул ответ, который не является валидным JSON или не соответствует ожидаемой структуре OpenAI. Проверьте логи вашего LLM сервера. 
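
***

## 🔌 Quick API Compatibility Check

The prerequisites and troubleshooting sections above boil down to one requirement: the endpoint in `LLM_API_ENDPOINT` must accept an OpenAI-style POST with `model` and `messages` (plus optional `temperature`, `top_p`, `top_k`) and return JSON containing `choices[0].message.content`. The script below is a minimal sketch for verifying this outside the app; it reads the same `.env` variables and uses the same fallback endpoint and model name as `highCompute.py`. The file name and test prompt are arbitrary placeholders.

```python
# check_endpoint.py: minimal sketch to confirm the configured server speaks the
# OpenAI Chat Completions format that highCompute.py expects.
import os

import requests
from dotenv import load_dotenv

load_dotenv()

# Same environment variables and fallbacks as highCompute.py.
endpoint = os.getenv("LLM_API_ENDPOINT", "http://127.0.0.1:8080/v1/chat/completions")
model = os.getenv("LLM_MODEL", "local-model")
api_key = os.getenv("LLM_API_KEY")

headers = {"Content-Type": "application/json"}
if api_key:
    headers["Authorization"] = f"Bearer {api_key}"  # same Authorization header the app sends

payload = {
    "model": model,
    "messages": [{"role": "user", "content": "Reply with the single word: pong"}],
    "temperature": 0.0,
}

resp = requests.post(endpoint, json=payload, headers=headers, timeout=60)
resp.raise_for_status()          # failure here corresponds to the app's "Network error"
data = resp.json()               # failure here corresponds to "Failed to decode JSON response"
print(data["choices"][0]["message"]["content"])  # KeyError here corresponds to "Invalid format"
```

If the script prints a short reply, the same endpoint, key, and model name should work in the web interface; if it raises an exception, it points at the same root cause as the corresponding error message in the chat.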
266 | -------------------------------------------------------------------------------- /highCompute.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import requests 3 | import json 4 | import os 5 | import re 6 | from dotenv import load_dotenv 7 | import time 8 | 9 | load_dotenv() 10 | 11 | DEFAULT_ENDPOINT = "http://127.0.0.1:8080/v1/chat/completions" 12 | DEFAULT_LLM_MODEL = "local-model" 13 | DEFAULT_API_KEY = None 14 | 15 | LOCAL_API_ENDPOINT = os.getenv("LLM_API_ENDPOINT", DEFAULT_ENDPOINT) 16 | LLM_MODEL = os.getenv("LLM_MODEL", DEFAULT_LLM_MODEL) 17 | LLM_API_KEY = os.getenv("LLM_API_KEY", DEFAULT_API_KEY) 18 | 19 | def call_llm(prompt, chat_history_gradio=None, temperature=0.7, top_p=None, top_k=None, stream=False): 20 | messages = [] 21 | if chat_history_gradio: 22 | for user_msg, assistant_msg in chat_history_gradio: 23 | if user_msg: 24 | messages.append({"role": "user", "content": user_msg}) 25 | if assistant_msg: 26 | messages.append({"role": "assistant", "content": assistant_msg}) 27 | messages.append({"role": "user", "content": prompt}) 28 | 29 | payload_dict = { 30 | "model": LLM_MODEL, 31 | "messages": messages, 32 | "temperature": temperature, 33 | "stream": stream 34 | } 35 | if top_p is not None and top_p < 1.0: 36 | payload_dict["top_p"] = top_p 37 | if top_k is not None and top_k > 0: 38 | payload_dict["top_k"] = top_k 39 | 40 | payload = json.dumps(payload_dict) 41 | headers = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'text/event-stream' if stream else 'application/json'} 42 | 43 | if LLM_API_KEY: 44 | headers['Authorization'] = f'Bearer {LLM_API_KEY}' 45 | print(f"Sending request to {LOCAL_API_ENDPOINT} using API Key.") 46 | else: 47 | print(f"Sending request to {LOCAL_API_ENDPOINT} without API Key.") 48 | 49 | print(f"Model: '{LLM_MODEL}', Stream: {stream}, Payload: {payload[:200]}...") 50 | 51 | try: 52 | response = requests.post(LOCAL_API_ENDPOINT, headers=headers, data=payload.encode('utf-8'), timeout=36000, stream=stream) 53 | response.raise_for_status() 54 | 55 | if stream: 56 | print("Processing stream...") 57 | for chunk_bytes in response.iter_content(chunk_size=None): 58 | if not chunk_bytes: 59 | continue 60 | try: 61 | lines = chunk_bytes.decode('utf-8').splitlines() 62 | for line in lines: 63 | if line.startswith("data:"): 64 | line_data = line[len("data:"):].strip() 65 | if line_data == "[DONE]": 66 | print("Stream finished.") 67 | break 68 | try: 69 | chunk = json.loads(line_data) 70 | if chunk.get("choices") and len(chunk["choices"]) > 0: 71 | delta = chunk["choices"][0].get("delta", {}) 72 | content_chunk = delta.get("content") 73 | if content_chunk: 74 | yield content_chunk 75 | except json.JSONDecodeError: 76 | print(f"Warning: Could not decode stream line JSON: {line_data}") 77 | continue 78 | except Exception as e: 79 | print(f"Error processing stream chunk: {e}, Line: {line_data}") 80 | yield f"\n[Error processing stream chunk: {e}]" 81 | break 82 | else: 83 | continue 84 | break 85 | except UnicodeDecodeError: 86 | print(f"Warning: Could not decode chunk as UTF-8: {chunk_bytes[:100]}...") 87 | continue 88 | print("Stream processing complete.") 89 | 90 | else: 91 | response.encoding = response.apparent_encoding if response.encoding is None else response.encoding 92 | data = response.json() 93 | print(f"Received non-stream response: {json.dumps(data, ensure_ascii=False, indent=2)}") 94 | 95 | if data.get("choices") and len(data["choices"]) > 0: 96 | 
message_content = data["choices"][0].get("message", {}).get("content") 97 | if message_content: 98 | yield message_content.strip() 99 | else: 100 | print("Error: 'content' key not found in LLM response choice.") 101 | yield "Error: 'content' not found in LLM response." 102 | else: 103 | print("Error: 'choices' array is missing, empty, or invalid in LLM response.") 104 | yield "Error: Invalid format in LLM response (missing 'choices')." 105 | 106 | except requests.exceptions.Timeout: 107 | print(f"Network error: Request timed out after 36000 seconds.") 108 | yield "Network error: Request timed out." 109 | except requests.exceptions.RequestException as e: 110 | print(f"Network error: {e}") 111 | yield f"Network error: {e}" 112 | except json.JSONDecodeError as e: 113 | print(f"Error: Failed to decode JSON response from server. Response text: {response.text}") 114 | yield f"Error: Failed to read server response (JSONDecodeError: {e}). Check server logs." 115 | except Exception as e: 116 | print(f"An unexpected error occurred: {e}") 117 | yield f"An unexpected error occurred: {e}" 118 | 119 | 120 | def low_compute(user_input, history, temperature, top_p, top_k): 121 | yield "[Status] Sending request directly to LLM..." 122 | print("[Low Mode] Sending LLM request (streaming)...") 123 | full_response = "" 124 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 125 | full_response += chunk 126 | yield full_response 127 | print("[Low Mode] Response stream finished.") 128 | 129 | 130 | def medium_compute(user_input, history, temperature, top_p, top_k): 131 | yield "[Status] Starting task decomposition (1 level)..." 132 | print("[Medium Mode] Starting task decomposition...") 133 | control_temp = max(0.1, temperature * 0.5) 134 | decompose_prompt = f'Original task: "{user_input}". Break it down into logical subtasks needed to solve it (numbered list). Be concise.' 135 | 136 | subtasks_text_gen = call_llm(decompose_prompt, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 137 | subtasks_text = next(subtasks_text_gen, "Error: No response from decomposition.") 138 | if subtasks_text.startswith("Error:") or subtasks_text.startswith("Network error:"): 139 | yield "[Status] Decomposition failed. Answering directly..." 140 | print(f"[Medium Mode] Decomposition failed: {subtasks_text}. Responding directly (streaming)...") 141 | full_response = "" 142 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 143 | full_response += chunk 144 | yield full_response 145 | print("[Medium Mode] Direct response stream finished after decomposition failure.") 146 | return 147 | 148 | subtasks = re.findall(r"^\s*\d+\.\s*(.*)", subtasks_text, re.MULTILINE) 149 | 150 | if not subtasks: 151 | yield "[Status] Decomposition returned no subtasks. Answering directly..." 152 | print("[Medium Mode] Decomposition returned no numbered points. Responding directly (streaming)...") 153 | full_response = "" 154 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 155 | full_response += chunk 156 | yield full_response 157 | print("[Medium Mode] Direct response stream finished after no subtasks found.") 158 | return 159 | 160 | yield f"[Status] Task divided into {len(subtasks)} subtasks. Solving them one by one..." 
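# Solve each subtask sequentially with non-streaming calls, passing the original chat history as context;
# if any subtask call returns an error string, fall back to a direct streamed answer to the original query.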
161 | print(f"[Medium Mode] Task divided into {len(subtasks)} subtasks.") 162 | subtask_results = [] 163 | temp_history_medium = history.copy() if history else [] 164 | 165 | for i, subtask in enumerate(subtasks): 166 | subtask = subtask.strip() 167 | if not subtask: continue 168 | yield f"[Status] Solving subtask {i+1}/{len(subtasks)}: \"{subtask}...\"" 169 | print(f"[Medium Mode] Solving subtask {i+1}/{len(subtasks)}: \"{subtask}\"...") 170 | solve_prompt = f'Original overall task: "{user_input}". Current subtask: "{subtask}". Provide a detailed solution or answer for this specific subtask.' 171 | 172 | subtask_result_gen = call_llm(solve_prompt, chat_history_gradio=temp_history_medium, temperature=temperature, top_p=top_p, top_k=top_k, stream=False) 173 | subtask_result = next(subtask_result_gen, f"Error: No response for subtask {i+1}.") 174 | subtask_results.append({"subtask": subtask, "result": subtask_result}) 175 | print(f"[Medium Mode] Subtask {i+1} result: Received.") 176 | if subtask_result.startswith("Error:") or subtask_result.startswith("Network error:"): 177 | yield f"[Status] Error solving subtask {i+1}. Aborting and attempting direct answer..." 178 | print(f"[Medium Mode] Error solving subtask {i+1}: {subtask_result}. Responding directly (streaming)...") 179 | full_response = "" 180 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 181 | full_response += chunk 182 | yield full_response 183 | print("[Medium Mode] Direct response stream finished after subtask error.") 184 | return 185 | 186 | yield "[Status] All subtasks solved. Synthesizing final response..." 187 | print("[Medium Mode] Synthesizing final response (streaming)...") 188 | synthesis_prompt = f'Original task: "{user_input}". The task was broken down and the results for each subtask are:\n---\n' 189 | for i, res in enumerate(subtask_results): 190 | synthesis_prompt += f"{i+1}. Subtask: {res['subtask']}\n Result: {res['result']}\n---\n" 191 | synthesis_prompt += "Combine these results into a single, coherent, well-formatted final response that directly addresses the original task. Do not just list the subtasks and results; synthesize them." 192 | 193 | full_response = "" 194 | for chunk in call_llm(synthesis_prompt, temperature=control_temp, top_p=top_p, top_k=top_k, stream=True): 195 | full_response += chunk 196 | yield full_response 197 | print("[Medium Mode] Final response stream synthesized.") 198 | 199 | 200 | def high_compute(user_input, history, temperature, top_p, top_k): 201 | yield "[Status] Starting task decomposition (Level 1)..." 202 | print("[High Mode] Starting task decomposition (Level 1)...") 203 | control_temp = max(0.1, temperature * 0.5) 204 | decompose_prompt_l1 = f'Original complex task: "{user_input}". Break this down into major high-level stages or components (Level 1 - numbered list). Keep items distinct and logical.' 205 | 206 | subtasks_l1_text_gen = call_llm(decompose_prompt_l1, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 207 | subtasks_l1_text = next(subtasks_l1_text_gen, "Error: No response from L1 decomposition.") 208 | 209 | if subtasks_l1_text.startswith("Error:") or subtasks_l1_text.startswith("Network error:"): 210 | yield "[Status] Level 1 decomposition failed. Falling back to Medium compute mode..." 211 | print(f"[High Mode] Decomposition failed (Level 1): {subtasks_l1_text}. 
Falling back to Medium Mode...") 212 | yield from medium_compute(user_input, history, temperature, top_p, top_k) 213 | return 214 | 215 | subtasks_l1 = re.findall(r"^\s*\d+\.\s*(.*)", subtasks_l1_text, re.MULTILINE) 216 | 217 | if not subtasks_l1: 218 | yield "[Status] Level 1 decomposition returned no subtasks. Falling back to Medium compute mode..." 219 | print("[High Mode] Decomposition returned no subtasks (Level 1). Falling back to Medium Mode...") 220 | yield from medium_compute(user_input, history, temperature, top_p, top_k) 221 | return 222 | 223 | yield f"[Status] Task divided into {len(subtasks_l1)} Level 1 stages. Processing stages..." 224 | print(f"[High Mode] Task divided into {len(subtasks_l1)} Level 1 subtasks.") 225 | subtasks_l1_results = [] 226 | temp_history_high = history.copy() if history else [] 227 | 228 | for i, subtask_l1 in enumerate(subtasks_l1): 229 | subtask_l1 = subtask_l1.strip() 230 | if not subtask_l1: continue 231 | yield f"[Status] Processing Level 1 stage {i+1}/{len(subtasks_l1)}: \"{subtask_l1}...\". Starting mandatory Level 2 decomposition..." 232 | print(f"[High Mode] Working on Level 1 subtask ({i+1}/{len(subtasks_l1)}): \"{subtask_l1}\"") 233 | print(f"[High Mode] Attempting MANDATORY Level 2 decomposition for: \"{subtask_l1}\"...") 234 | 235 | decompose_prompt_l2 = f'Current high-level stage (Level 1): "{subtask_l1}". Break THIS stage down into smaller, actionable steps (Level 2 - numbered list). You MUST provide the steps as a numbered list starting with "1.". Even if there is only one step, write "1. {subtask_l1}". Do not use phrases like "No further decomposition needed". Just provide the list.' 236 | 237 | subtasks_l2_text_gen = call_llm(decompose_prompt_l2, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 238 | subtasks_l2_text = next(subtasks_l2_text_gen, f"Error: No response for L2 decomposition of stage {i+1}.") 239 | print(f"[DEBUG High Mode] Raw L2 decomposition text for '{subtask_l1}':\n>>>\n{subtasks_l2_text}\n<<<") 240 | 241 | if subtasks_l2_text.startswith("Error:") or subtasks_l2_text.startswith("Network error:"): 242 | yield f"[Status] Stage {i+1}: L2 decomposition failed ({subtasks_l2_text}). Forcing L1 task as single L2 step." 243 | print(f"[High Mode] L2 decomposition failed for \"{subtask_l1}\": {subtasks_l2_text}. Forcing it as a single L2 step.") 244 | subtasks_l2 = [subtask_l1.strip()] 245 | else: 246 | subtasks_l2 = re.findall(r"^\s*\d+\.\s*(.*)", subtasks_l2_text, re.MULTILINE) 247 | if not subtasks_l2: 248 | yield f"[Status] Stage {i+1}: L2 decomposition format issue or LLM refusal. Forcing L1 task as single L2 step." 249 | print(f"[High Mode] L2 decomposition failed/refused for \"{subtask_l1}\". Forcing it as a single L2 step.") 250 | subtasks_l2 = [subtask_l1.strip()] 251 | 252 | 253 | yield f"[Status] Stage {i+1} processing {len(subtasks_l2)} Level 2 step(s)..." 
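# Solve each Level 2 step with non-streaming calls; if a step fails, record the error text as this
# stage's result and continue with the next Level 1 stage rather than aborting the whole run.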
254 | print(f"[High Mode] Processing {len(subtasks_l2)} Level 2 step(s) for L1 subtask \"{subtask_l1}\".") 255 | subtasks_l2_results = [] 256 | abort_stage = False 257 | for j, subtask_l2 in enumerate(subtasks_l2): 258 | subtask_l2 = subtask_l2.strip() 259 | if not subtask_l2: continue 260 | yield f"[Status] Stage {i+1}/{len(subtasks_l1)}, Solving L2 step {j+1}/{len(subtasks_l2)}: \"{subtask_l2}...\"" 261 | print(f"[High Mode] Solving Level 2 step ({j+1}/{len(subtasks_l2)}): \"{subtask_l2}\"...") 262 | 263 | if len(subtasks_l2) == 1 and subtask_l2 == subtask_l1: 264 | solve_prompt_l2 = f'Original task: "{user_input}".\nCurrent Level 1 stage: "{subtask_l1}".\nThis stage could not be broken down further. Solve this specific stage in detail.' 265 | else: 266 | solve_prompt_l2 = f'Original task: "{user_input}".\nCurrent Level 1 stage: "{subtask_l1}".\nCurrent Level 2 step: "{subtask_l2}".\nSolve this specific Level 2 step in detail.' 267 | 268 | result_l2_gen = call_llm(solve_prompt_l2, chat_history_gradio=temp_history_high, temperature=temperature, top_p=top_p, top_k=top_k, stream=False) 269 | result_l2 = next(result_l2_gen, f"Error: No response for L2 step {j+1}.") 270 | subtasks_l2_results.append({"subtask": subtask_l2, "result": result_l2}) 271 | print(f"[High Mode] Level 2 step result ({j+1}): Received.") 272 | if result_l2.startswith("Error:") or result_l2.startswith("Network error:"): 273 | yield f"[Status] Error solving L2 step {j+1} in stage {i+1}. Aborting stage..." 274 | print(f"[High Mode] Error solving L2 step {j+1}: {result_l2}. Aborting stage {i+1}.") 275 | subtasks_l1_results.append({"subtask": subtask_l1, "result": f"[Error processing stage {i+1}: {result_l2}]"}) 276 | abort_stage = True 277 | break 278 | 279 | if abort_stage: 280 | continue 281 | 282 | 283 | yield f"[Status] Stage {i+1}: Synthesizing results from {len(subtasks_l2)} Level 2 step(s)..." 284 | print(f"[High Mode] Synthesizing Level 2 results for L1 subtask \"{subtask_l1}\"...") 285 | result_l1_final = "" 286 | if len(subtasks_l2_results) == 1: 287 | result_l1_final = subtasks_l2_results[0]['result'] 288 | print(f"[High Mode] Result for \"{subtask_l1}\" (from single L2 step): Received.") 289 | else: 290 | synthesis_prompt_l2 = f'The goal for this stage was: "{subtask_l1}". The results for the Level 2 steps taken are:\n---\n' 291 | for j, res_l2 in enumerate(subtasks_l2_results): 292 | synthesis_prompt_l2 += f"{j+1}. Step: {res_l2['subtask']}\n Result: {res_l2['result']}\n---\n" 293 | synthesis_prompt_l2 += f'Synthesize these results into a single, coherent answer for the Level 1 stage: "{subtask_l1}". Focus on fulfilling the goal of this stage.' 294 | 295 | result_l1_final_gen = call_llm(synthesis_prompt_l2, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 296 | result_l1_final = next(result_l1_final_gen, f"Error: No response for L1 synthesis stage {i+1}.") 297 | print(f"[High Mode] Result for \"{subtask_l1}\" (synthesized from L2): Received.") 298 | if result_l1_final.startswith("Error:") or result_l1_final.startswith("Network error:"): 299 | yield f"[Status] Error synthesizing L2 results for stage {i+1}. Using raw results..." 300 | print(f"[High Mode] Error synthesizing L2 results for stage {i+1}: {result_l1_final}. 
Using raw results.") 301 | result_l1_final = "\n".join([f"Step {j+1}: {res['subtask']}\nResult: {res['result']}" for j, res in enumerate(subtasks_l2_results)]) 302 | 303 | 304 | subtasks_l1_results.append({"subtask": subtask_l1, "result": result_l1_final}) 305 | 306 | yield "[Status] All Level 1 stages processed. Synthesizing final response..." 307 | print("[High Mode] Synthesizing final response from Level 1 results (streaming)...") 308 | final_synthesis_prompt = f'Original complex task: "{user_input}". The task was addressed in the following major stages, with these results:\n---\n' 309 | for i, res_l1 in enumerate(subtasks_l1_results): 310 | final_synthesis_prompt += f"{i+1}. Stage: {res_l1['subtask']}\n Overall Result for Stage: {res_l1['result']}\n---\n" 311 | final_synthesis_prompt += "Synthesize all these stage results into a comprehensive, well-structured final answer that directly addresses the original complex task. Ensure coherence and clarity." 312 | 313 | full_response = "" 314 | for chunk in call_llm(final_synthesis_prompt, temperature=control_temp, top_p=top_p, top_k=top_k, stream=True): 315 | full_response += chunk 316 | yield full_response 317 | print("[High Mode] Final response stream synthesized.") 318 | 319 | 320 | def chat_interface_logic(message, history, compute_level, temperature, top_p, top_k): 321 | if history is None: 322 | history = [] 323 | 324 | history.append([message, ""]) 325 | yield history, "", "[Status] Processing request..." 326 | 327 | compute_function = None 328 | if compute_level == "Low": 329 | compute_function = low_compute 330 | elif compute_level == "Medium": 331 | compute_function = medium_compute 332 | elif compute_level == "High": 333 | compute_function = high_compute 334 | else: 335 | error_msg = "Error: Unknown computation level selected." 336 | history[-1][1] = error_msg 337 | yield history, "", "[Status] Error" 338 | return 339 | 340 | response_generator = compute_function(message, history[:-1], temperature, top_p, top_k) 341 | 342 | final_assistant_response = "" 343 | current_status = "[Status] Processing request..." 344 | 345 | try: 346 | for response_part in response_generator: 347 | if isinstance(response_part, str) and response_part.startswith("[Status]"): 348 | current_status = response_part 349 | yield history, "", current_status 350 | elif isinstance(response_part, str): 351 | final_assistant_response = response_part 352 | history[-1][1] = final_assistant_response 353 | yield history, "", current_status 354 | else: 355 | print(f"Warning: Unexpected type yielded from compute function: {type(response_part)}") 356 | error_fragment = f"\n[Warning: Unexpected data type in response stream: {type(response_part)}]" 357 | final_assistant_response += error_fragment 358 | history[-1][1] = final_assistant_response 359 | yield history, "", current_status 360 | 361 | except Exception as e: 362 | print(f"Error during response generation: {e}") 363 | error_msg = f"An error occurred during processing: {e}" 364 | history[-1][1] = error_msg 365 | yield history, "", "[Status] Error Encountered" 366 | return 367 | 368 | yield history, "", "" 369 | 370 | 371 | def regenerate_last(history, compute_level, temperature, top_p, top_k): 372 | if not history: 373 | yield history, "", "[Status] Cannot regenerate: Chat history is empty." 374 | return 375 | 376 | if history[-1][0] is None or history[-1][0] == "": 377 | yield history, "", "[Status] Cannot regenerate: Last entry is not a user message." 
378 | return 379 | 380 | last_user_message = history[-1][0] 381 | history_context = history[:-1] 382 | 383 | history[-1][1] = "" 384 | yield history, "", f"[Status] Regenerating response for: \"{last_user_message[:50]}...\"" 385 | 386 | compute_function = None 387 | if compute_level == "Low": 388 | compute_function = low_compute 389 | elif compute_level == "Medium": 390 | compute_function = medium_compute 391 | elif compute_level == "High": 392 | compute_function = high_compute 393 | else: 394 | error_msg = "Error: Unknown computation level selected." 395 | history[-1][1] = error_msg 396 | yield history, "", "[Status] Error" 397 | return 398 | 399 | response_generator = compute_function(last_user_message, history_context, temperature, top_p, top_k) 400 | 401 | final_assistant_response = "" 402 | current_status = f"[Status] Regenerating response for: \"{last_user_message[:50]}...\"" 403 | 404 | try: 405 | for response_part in response_generator: 406 | if isinstance(response_part, str) and response_part.startswith("[Status]"): 407 | current_status = response_part 408 | yield history, "", current_status 409 | elif isinstance(response_part, str): 410 | final_assistant_response = response_part 411 | history[-1][1] = final_assistant_response 412 | yield history, "", current_status 413 | else: 414 | print(f"Warning: Unexpected type yielded during regeneration: {type(response_part)}") 415 | error_fragment = f"\n[Warning: Unexpected data type in response stream: {type(response_part)}]" 416 | final_assistant_response += error_fragment 417 | history[-1][1] = final_assistant_response 418 | yield history, "", current_status 419 | 420 | except Exception as e: 421 | print(f"Error during response regeneration: {e}") 422 | error_msg = f"An error occurred during regeneration: {e}" 423 | history[-1][1] = error_msg 424 | yield history, "", "[Status] Error Encountered during Regeneration" 425 | return 426 | 427 | yield history, "", "" 428 | 429 | 430 | with gr.Blocks(theme=gr.themes.Soft()) as demo: 431 | gr.Markdown("# Advanced Chat Agent with Computation Levels (Local LLM)") 432 | gr.Markdown(f"Using endpoint: `{LOCAL_API_ENDPOINT}` with model `{LLM_MODEL}`") 433 | if LLM_API_KEY: 434 | gr.Markdown("API Key: Loaded from environment variable.") 435 | else: 436 | gr.Markdown("API Key: Not configured (using endpoint without Authorization header).") 437 | 438 | with gr.Row(): 439 | with gr.Column(scale=1): 440 | compute_level_selector = gr.Radio( 441 | ["Low", "Medium", "High"], 442 | label="Computation Level", 443 | value="Low", 444 | info="Low: Direct response. Medium: 1-level decomposition. High: 2-level decomposition." 445 | ) 446 | temp_slider = gr.Slider( 447 | minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature", 448 | info="Controls randomness. Lower values make the model more deterministic." 449 | ) 450 | top_p_slider = gr.Slider( 451 | minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-P (Nucleus Sampling)", 452 | info="Considers only tokens with cumulative probability >= top_p. 1.0 disables it." 453 | ) 454 | top_k_slider = gr.Slider( 455 | minimum=0, maximum=100, value=0, step=1, label="Top-K", 456 | info="Considers only the top k most likely tokens. 0 disables it." 
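# Note: call_llm() only forwards top_k to the backend when it is greater than 0,
# and top_p only when it is below 1.0, so the default slider values leave both disabled.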
457 | ) 458 | with gr.Row(): 459 | regenerate_btn = gr.Button("Regenerate") 460 | clear_btn = gr.ClearButton(value="Clear Chat") 461 | 462 | 463 | with gr.Column(scale=4): 464 | status_display = gr.Markdown("", label="Current Status") 465 | chatbot = gr.Chatbot(label="Chat", height=700, show_copy_button=True, likeable=True, show_share_button=True) 466 | with gr.Row(): 467 | chat_input = gr.Textbox( 468 | label="Your message", 469 | placeholder="Enter your query here...", 470 | scale=4, 471 | show_label=False, 472 | container=False 473 | ) 474 | submit_btn = gr.Button("Submit", variant="primary", scale=1, min_width=120) 475 | 476 | clear_btn.add(components=[chat_input, chatbot, status_display]) 477 | 478 | submit_inputs = [chat_input, chatbot, compute_level_selector, temp_slider, top_p_slider, top_k_slider] 479 | submit_outputs = [chatbot, chat_input, status_display] 480 | 481 | regenerate_inputs = [chatbot, compute_level_selector, temp_slider, top_p_slider, top_k_slider] 482 | regenerate_outputs = [chatbot, chat_input, status_display] 483 | 484 | submit_btn.click( 485 | fn=chat_interface_logic, 486 | inputs=submit_inputs, 487 | outputs=submit_outputs, 488 | queue=True 489 | ) 490 | chat_input.submit( 491 | fn=chat_interface_logic, 492 | inputs=submit_inputs, 493 | outputs=submit_outputs, 494 | queue=True 495 | ) 496 | regenerate_btn.click( 497 | fn=regenerate_last, 498 | inputs=regenerate_inputs, 499 | outputs=regenerate_outputs, 500 | queue=True 501 | ) 502 | 503 | 504 | if __name__ == "__main__": 505 | print(f"Launching Gradio interface for local LLM...") 506 | print(f"Connecting to: {LOCAL_API_ENDPOINT}") 507 | print(f"Model name used in requests: {LLM_MODEL}") 508 | if LLM_API_KEY: 509 | print("API Key detected in environment variables.") 510 | else: 511 | print("API Key not found in environment variables.") 512 | try: 513 | base_url = '/'.join(LOCAL_API_ENDPOINT.split('/')[:3]) 514 | response = requests.get(base_url, timeout=5) 515 | print(f"Base URL {base_url} is accessible (Status: {response.status_code}).") 516 | except Exception as e: 517 | print(f"Warning: Could not check endpoint base URL accessibility ({base_url}): {e}") 518 | 519 | demo.launch() 520 | --------------------------------------------------------------------------------