├── requirements.txt ├── LICENSE ├── README.md └── highCompute.py /requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==4.44.1 2 | pydantic==2.10.6 3 | pydantic_core==2.27.2 4 | requests 5 | python-dotenv 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # highCompute.py 2 | 3 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | 5 | A single Python file that connects via the OpenAI Chat Completions API, giving you something akin to OpenAI High Compute at home. **Any** models are compatible. 
By applying dynamic programming methods, it scales the amount of computation spent on a query by tens or even hundreds of times for both reasoning and non-reasoning models, significantly improving answer quality and the ability to solve tasks that are extremely hard for LLMs. 6 | 7 | This is a simple Gradio-based web application providing an interface for interacting with a locally hosted Large Language Model (LLM). The key feature is the ability to select a "Computation Level," which determines the strategy for processing user queries—ranging from direct responses to multi-level task decomposition for obtaining more structured and comprehensive answers to complex queries. 8 | 9 | 10 | 11 | https://github.com/user-attachments/assets/8cc0fa3d-69fa-4183-8d6b-004ea934fd78 12 | 13 | ![Снимок экрана_20250427_174119](https://github.com/user-attachments/assets/b70d8c5d-911a-4c49-ab37-b4ec9925086e) 14 | 15 | 16 | 17 | 18 | The application connects to your specified LLM API endpoint, compatible with the OpenAI Chat Completions API format. 19 | 20 | ## 🌟 Key Features 21 | 22 | * **Local LLM Integration:** Works with your own LLM server (e.g., llama.cpp, Ollama, LM Studio, vLLM with an OpenAI-compatible endpoint). 23 | * **Compute Levels:** 24 | * **Low:** Direct query to the LLM for a quick response. This is a standard chat mode. Generates N tokens — for example, solving a task may only consume 700 tokens. 25 | * **Medium:** Single-level task decomposition into subtasks, solving them, and synthesizing the final answer. Suitable for moderately complex queries. The number of generated tokens is approximately 10-15x higher compared to Low Compute (average value, depends on the task): if solving a task in Low Compute took 700 tokens, Medium level would require around 7,000 tokens. 26 | * **High:** Two-level task decomposition (stages → steps), solving individual steps, synthesizing stage results, and generating the final answer. Designed for highly complex and multi-component tasks. The number of generated tokens is approximately 100-150x higher compared to Low Compute: if solving a task in Low Compute took 700 tokens, High level would require around 70,000 tokens. 27 | * **Flexible Compute Adjustment:** You can freely adjust the Compute Level for each query individually. For example, initiate the first query in High Compute, then switch to Low mode, and later use Medium Compute to solve a specific problem mid-chat. 28 | 29 | ## ⚙️ How It Works: Computation Levels 30 | 31 | The core idea is that for complex tasks, a simple direct query to the LLM may not yield optimal results. Decomposition allows breaking down a complex problem into smaller, manageable parts, solving them individually, and then combining the results. 32 | 33 | 1. **Low:** 34 | * `User Query` → `LLM (single call)` → `Response` 35 | * The fastest mode, suitable for simple questions or when a quick response is needed. Essentially, this is the standard chat mode. 36 | 37 | 2. **Medium:** 38 | * `User Query` → `LLM (decomposition request)` → `List of subtasks` 39 | * *For each subtask:* `Subtask + Context` → `LLM (subtask solution)` → `Subtask result` 40 | * `All subtask results + Original query` → `LLM (final synthesis)` → `Final answer` 41 | * Uses multiple LLM calls. Decomposition and synthesis requests use a lower `temperature` for greater predictability. 42 | 43 | 3.
**High:** 44 | * `User Query` → `LLM (Level 1 decomposition)` → `List of stages (L1)` 45 | * *For each L1 stage:* 46 | * `L1 Stage + Context` → `LLM (Level 2 decomposition)` → `List of steps (L2)` 47 | * *If L2 decomposition is not needed:* `L1 Stage + Context` → `LLM (direct L1 stage solution)` → `L1 Stage result` 48 | * *If L2 decomposition succeeds:* 49 | * *For each L2 step:* `L2 Step + L1 Context` → `LLM (L2 step solution)` → `L2 Step result` 50 | * `All L2 Step results + L1 Context` → `LLM (L1 stage synthesis)` → `L1 Stage result` 51 | * `All L1 Stage results + Original query` → `LLM (final synthesis)` → `Final answer` 52 | * The most resource-intensive mode, using multiple LLM calls. Designed for highly complex tasks requiring multi-stage planning and solving. Uses a lower `temperature` for all decomposition and synthesis steps. If L1 decomposition fails, it automatically switches to `Medium` mode. WARNING! This can increase the number of generated tokens by hundreds of times! If you're using a paid API, consider this carefully! 53 | 54 | ## 📋 Prerequisites 55 | 56 | * **Python 3.11** 57 | * **pip** (Python package manager) 58 | * **A working LLM server:** You need an accessible HTTP server with an LLM that provides an API compatible with [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create). 59 | * Examples of such servers: 60 | * [Ollama](https://ollama.ai/) (with the `--api` flag or via a separate proxy for OpenAI compatibility) 61 | * [LM Studio](https://lmstudio.ai/) (provides an OpenAI-compatible endpoint) 62 | * [vLLM](https://github.com/vllm-project/vllm) (with an OpenAI-compatible server) 63 | * [OpenRouter](https://openrouter.ai/). 64 | * **Important:** The server must accept POST requests at the path specified in `LLM_API_ENDPOINT` (default: `/v1/chat/completions`) and process JSON data in OpenAI format (fields: `model`, `messages`, `temperature`, `top_p`, `top_k`). The response must also follow the OpenAI format (expected field: `choices[0].message.content`). 65 | 66 | ## 🚀 Installation 67 | 68 | 1. **Clone the repository:** 69 | ```bash 70 | git clone https://github.com/AlexBefest/highCompute.py.git 71 | cd highCompute.py 72 | ``` 73 | 74 | 2. **Create and activate a virtual environment (recommended):** 75 | * On Linux/macOS: 76 | ```bash 77 | python3 -m venv venv 78 | source venv/bin/activate 79 | ``` 80 | * On Windows: 81 | ```bash 82 | python -m venv venv 83 | .\venv\Scripts\activate 84 | ``` 85 | 86 | 3. **Install dependencies:** 87 | Install Python dependencies: 88 | ```bash 89 | pip install -r requirements.txt 90 | ``` 91 | 92 | ## ⚙️ Configuration 93 | 94 | 1. **Create a `.env` file** in the project root folder. 95 | 2. **Add `LLM_API_ENDPOINT`, `LLM_MODEL`, and `LLM_API_KEY` to `.env`**, specifying the full URL of your local LLM API endpoint compatible with OpenAI Chat Completions API, your LLM model name, and API key. 96 | 97 | **Example `.env` file content:** 98 | ```dotenv 99 | LLM_API_ENDPOINT=http://192.168.2.33:8000/v1/chat/completions 100 | LLM_API_KEY="token-abc123" 101 | LLM_MODEL="AlexBefest/Gemma3-27B" 102 | ``` 103 | * Ensure your LLM server is actually listening at this address and path. 104 | 105 | ## ▶️ Running the Application 106 | 107 | 1. **Ensure your local LLM server is running** and accessible at the URL specified in `.env` (or the default address). 108 | 2. **Run the Python script:** 109 | ```bash 110 | python highCompute.py 111 | ``` 112 | 3. 
**Open the web interface:** The console will display a Gradio message with the local URL, typically `http://127.0.0.1:7860`. Open this URL in your web browser. 113 | 114 | ## 💬 Using the Interface 115 | 116 | 1. **Select Computation Level:** Low, Medium, or High, depending on query complexity. 117 | 2. **(Optional) Adjust parameters:** Modify the `Temperature`, `Top-P`, and `Top-K` sliders if you want to change the LLM's response style. 118 | * `Temperature`: Controls randomness. Lower values (closer to 0) make responses more deterministic and focused. Higher values (closer to 2.0) make responses more creative and diverse but may lead to "hallucinations." 119 | * `Top-P`: Nucleus sampling. The model only considers tokens whose cumulative probability is ≥ `top_p`. A value of `1.0` disables this parameter. 120 | * `Top-K`: Only the top `k` most probable tokens are considered. A value of `0` disables this parameter. 121 | 3. **Enter your query:** Type your message in the "Your message" text field at the bottom. 122 | 4. **Submit the query:** Press Enter or click the "Submit" button. 123 | 5. **View the response:** The LLM's answer will appear in the chat window. 124 | 6. **Continue the conversation:** Enter follow-up messages. Chat history is preserved and passed to the LLM for context. 125 | 7. **Clear chat:** Click the "Clear Chat" button to reset history and start a new conversation. 126 | 127 | ## ⚠️ Important Notes & Troubleshooting 128 | 129 | * **LLM API Compatibility:** Ensure your LLM endpoint *strictly* follows the OpenAI Chat Completions API format for requests and responses. Incompatibility will cause errors. 130 | * **Performance:** `Medium` and especially `High` modes perform multiple sequential LLM calls, significantly increasing response time compared to `Low` mode. 131 | * **Decomposition Quality:** The success of `Medium` and `High` modes heavily depends on the LLM's ability to understand and execute decomposition and synthesis instructions. Quality may vary based on the LLM model and task complexity. Sometimes, the LLM may fail to decompose the task or return a response not in a numbered list format. 132 | * **Method Efficiency:** Note that this method may be inefficient with smaller models. 133 | * **Network Errors:** If you see "Network error," check if your LLM server is running and accessible at the `.env`-specified address. Verify network and firewall settings. 134 | * **JSON Errors:** If you see "Error: Failed to decode JSON response" or "Invalid format," this means the LLM server returned a response that is not valid JSON or does not match the expected OpenAI structure. Check your LLM server logs. 135 | 136 | *** 137 | 138 | # highCompute.py 139 | 140 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 141 | 142 | Всего один python-файл, подключение через OpenAI Chat Completions API, и вы получаете что-то вроде OpenAI High Compute at home. Совместимы **любые** модели. Используя метод динамического программирования, увеличение количества вычислений в десятки и сотни раз для reasoning и no-reasoning моделей, что значительно повышает качество ответов и способность решать чрезвычайно сложные для LLM задачи. 143 | 144 | Это простое веб-приложение на базе Gradio, предоставляющее интерфейс для взаимодействия с локально запущенной Большой Языковой Моделью (LLM). 
Ключевой особенностью является возможность выбора "Уровня Вычислений" (Computation Level), который определяет стратегию обработки запроса пользователя: от прямого ответа до многоуровневой декомпозиции задачи для получения более структурированных и полных ответов на сложные запросы. 145 | 146 | ![image](https://github.com/user-attachments/assets/8886405d-9a49-41ca-89d1-900fdc136d8d) 147 | 148 | 149 | Приложение подключается к указанному вами API-эндпоинту LLM, совместимому с форматом OpenAI Chat Completions API. 150 | 151 | ## 🌟 Ключевые возможности 152 | 153 | * **Подключение к локальной LLM:** Работает с вашим собственным LLM-сервером (например, llama.cpp, Ollama, LM Studio, vLLM с OpenAI-совместимым эндпоинтом). 154 | * **Уровни Вычислений:** 155 | * **Low (Низкий):** Прямой запрос к LLM для быстрого ответа. Это совершенно обычный режим чата. Генерируется N-токенов: допустим, на решение задачи ушло всего 7000 токенов. 156 | * **Medium (Средний):** Одноуровневая декомпозиция задачи на подзадачи, их решение и последующий синтез ответа. Подходит для умеренно сложных запросов. Количество генерируемых токенов примерно в 10-15 раз больше по отношению к Low Compute (среднее значение, всё зависит от задачи): если бы на low compute решение задачи заняло 700 токенов, то на Medium уровне примерно 7000 токенов. 157 | * **High (Высокий):** Двухуровневая декомпозиция задачи (этапы -> шаги), решение шагов, синтез результатов этапов и финальный синтез общего ответа. Предназначен для наиболее сложных и многокомпонентных задач. Количество генерируемых токенов примерно в 100-150 раз больше по отношению к уровню Low: если бы на low compute решение задачи заняло 700 токенов, то на High уровне это заняло бы 70000 токенов. 158 | * **Свободная регулировка Compute:** Вы можете свободно регулировать Compute Level для каждого вашего запроса отдельно. Например, первый запрос инициировать на High Compute, затем поработать в режиме Low, и в середине чата решить сделать Medium Compute для решения определённой проблемы. 159 | 160 | ## ⚙️ Как это работает: Уровни Вычислений 161 | 162 | Основная идея заключается в том, что для сложных задач простой прямой запрос к LLM может не дать оптимального результата. Декомпозиция позволяет разбить сложную проблему на более мелкие, управляемые части, решить их по отдельности, а затем объединить результаты. 163 | 164 | 1. **Low (Низкий):** 165 | * `Пользовательский запрос` -> `LLM (один вызов)` -> `Ответ` 166 | * Самый быстрый режим, подходит для простых вопросов или когда требуется максимально быстрая реакция. Фактически, это стандартный режим чата. 167 | 168 | 2. **Medium (Средний):** 169 | * `Пользовательский запрос` -> `LLM (запрос на декомпозицию)` -> `Список подзадач` 170 | * *Для каждой подзадачи:* `Подзадача + Контекст` -> `LLM (решение подзадачи)` -> `Результат подзадачи` 171 | * `Все результаты подзадач + Исходный запрос` -> `LLM (синтез финального ответа)` -> `Финальный ответ` 172 | * Использует несколько вызовов LLM. Запросы на декомпозицию и синтез используют пониженную `temperature` для большей предсказуемости. 173 | 174 | 3. 
**High (Высокий):** 175 | * `Пользовательский запрос` -> `LLM (декомпозиция Уровня 1)` -> `Список этапов (L1)` 176 | * *Для каждого этапа L1:* 177 | * `Этап L1 + Контекст` -> `LLM (декомпозиция Уровня 2)` -> `Список шагов (L2)` 178 | * *Если декомпозиция L2 не требуется:* `Этап L1 + Контекст` -> `LLM (прямое решение этапа L1)` -> `Результат этапа L1` 179 | * *Если декомпозиция L2 удалась:* 180 | * *Для каждого шага L2:* `Шаг L2 + Контекст L1` -> `LLM (решение шага L2)` -> `Результат шага L2` 181 | * `Все результаты шагов L2 + Контекст L1` -> `LLM (синтез результата этапа L1)` -> `Результат этапа L1` 182 | * `Все результаты этапов L1 + Исходный запрос` -> `LLM (финальный синтез)` -> `Финальный ответ` 183 | * Самый ресурсоемкий режим, использующий множество вызовов LLM. Предназначен для очень сложных задач, требующих многоэтапного планирования и решения. Использует пониженную `temperature` для всех шагов декомпозиции и синтеза. Если декомпозиция L1 не удается, автоматически переключается на режим `Medium`. ВНИМАНИЕ! Может увеличить количество генерируемых токенов в сотни раз! Если вы используете платный API, вам стоит это учитывать! 184 | 185 | ## 📋 Предварительные требования 186 | 187 | * **Python 3.11+** 188 | * **pip** (менеджер пакетов Python) 189 | * **Работающий LLM сервер:** Вам необходим доступный по HTTP сервер с LLM, который предоставляет API, совместимое с [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat/create). 190 | * Примеры таких серверов: 191 | * [Ollama](https://ollama.ai/) (с флагом `--api` или через отдельный прокси для OpenAI совместимости) 192 | * [LM Studio](https://lmstudio.ai/) (предоставляет OpenAI-совместимый эндпоинт) 193 | * [vLLM](https://github.com/vllm-project/vllm) (с OpenAI-совместимым сервером) 194 | * [OpenRouter](https://openrouter.ai/). 195 | * **Важно:** Сервер должен принимать POST-запросы на путь, указанный в `LLM_API_ENDPOINT` (по умолчанию `/v1/chat/completions`), и обрабатывать JSON-данные в формате OpenAI (поля `model`, `messages`, `temperature`, `top_p`, `top_k`). Ответ также должен соответствовать формату OpenAI (ожидается поле `choices[0].message.content`). 196 | 197 | ## 🚀 Установка 198 | 199 | 1. **Клонируйте репозиторий:** 200 | ```bash 201 | git clone https://github.com/AlexBefest/highCompute.py.git 202 | cd highCompute.py 203 | ``` 204 | 205 | 2. **Создайте и активируйте виртуальное окружение (рекомендуется):** 206 | * На Linux/macOS: 207 | ```bash 208 | python3 -m venv venv 209 | source venv/bin/activate 210 | ``` 211 | * На Windows: 212 | ```bash 213 | python -m venv venv 214 | .\venv\Scripts\activate 215 | ``` 216 | 217 | 3. **Установите зависимости:** 218 | Выполните установку python-зависимостей: 219 | ```bash 220 | pip install -r requirements.txt 221 | ``` 222 | 223 | ## ⚙️ Конфигурация 224 | 225 | 1. **Создайте файл `.env`** в корневой папке проекта. 226 | 2. **Добавьте в `.env` переменную `LLM_API_ENDPOINT`, `LLM_MODEL` и `LLM_API_KEY`**, указав полный URL вашего локального LLM API эндпоинта, который совместим с OpenAI Chat Completions API. А также имя вашей LLM-модели и API-ключ. 227 | 228 | **Пример содержимого файла `.env`:** 229 | ```dotenv 230 | LLM_API_ENDPOINT=http://192.168.2.33:8000/v1/chat/completions 231 | LLM_API_KEY="token-abc123" 232 | LLM_MODEL ="AlexBefest/Gemma3-27B" 233 | ``` 234 | * Убедитесь, что ваш LLM сервер действительно слушает этот адрес и путь. 235 | 236 | ## ▶️ Запуск приложения 237 | 238 | 1. 
**Убедитесь, что ваш локальный LLM сервер запущен** и доступен по URL, указанному в файле `.env` (или по адресу по умолчанию). 239 | 2. **Запустите Python скрипт:** 240 | ```bash 241 | python highCompute.py 242 | ``` 243 | 3. **Откройте веб-интерфейс:** В консоли вы увидите сообщение от Gradio с локальным URL, обычно `http://127.0.0.1:7860`. Откройте этот URL в вашем веб-браузере. 244 | 245 | ## 💬 Использование интерфейса 246 | 247 | 1. **Выберите Уровень Вычислений (Computation Level):** Low, Medium или High, в зависимости от сложности вашего запроса. 248 | 2. **(Опционально) Настройте параметры:** Отрегулируйте ползунки `Temperature`, `Top-P`, `Top-K`, если хотите изменить стиль генерации ответа LLM. 249 | * `Temperature`: Контролирует случайность. Низкие значения (ближе к 0) делают ответы более детерминированными и сфокусированными. Высокие значения (ближе к 2.0) делают ответы более креативными и разнообразными, но могут привести к "галлюцинациям". 250 | * `Top-P`: Нуклеусное сэмплирование. Модель рассматривает только токены, чья суммарная вероятность больше или равна `top_p`. Значение `1.0` отключает этот параметр. 251 | * `Top-K`: Рассматриваются только `k` наиболее вероятных токенов. Значение `0` отключает этот параметр. 252 | 3. **Введите ваш запрос:** Напишите сообщение в текстовое поле "Your message" внизу. 253 | 4. **Отправьте запрос:** Нажмите Enter или кнопку "Submit". 254 | 5. **Просмотрите ответ:** Ответ LLM появится в окне чата. 255 | 6. **Продолжайте диалог:** Вводите следующие сообщения. История чата будет сохраняться и передаваться LLM для контекста. 256 | 7. **Очистить чат:** Нажмите кнопку "Clear Chat", чтобы сбросить историю и начать новый диалог. 257 | 258 | ## ⚠️ Важные замечания и устранение неисправностей 259 | 260 | * **Совместимость LLM API:** Убедитесь, что ваш LLM эндпоинт *строго* следует формату OpenAI Chat Completions API для запросов и ответов. Несовместимость формата приведет к ошибкам. 261 | * **Производительность:** Режимы `Medium` и особенно `High` выполняют несколько последовательных вызовов LLM, что значительно увеличивает время ожидания ответа по сравнению с режимом `Low`. 262 | * **Качество декомпозиции:** Успех режимов `Medium` и `High` сильно зависит от способности LLM понимать и выполнять инструкции по декомпозиции и синтезу. Качество может варьироваться в зависимости от используемой модели LLM и сложности исходной задачи. Иногда LLM может не суметь разбить задачу или вернуть ответ не в виде нумерованного списка. 263 | * **Эффективность метода:** Нужно понимать, что данный метод может быть неэффективен с небольшими моделями 264 | * **Сетевые ошибки:** Если вы видите "Network error", проверьте, запущен ли ваш LLM сервер и доступен ли он по указанному в `.env` адресу. Проверьте настройки сети и файрвола. 265 | * **Ошибки JSON:** Если вы видите "Error: Failed to decode JSON response" или "Invalid format", это означает, что LLM сервер вернул ответ, который не является валидным JSON или не соответствует ожидаемой структуре OpenAI. Проверьте логи вашего LLM сервера. 
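
***

## 🔌 Quick API Compatibility Check

The prerequisites and troubleshooting sections above boil down to one requirement: the endpoint in `LLM_API_ENDPOINT` must accept an OpenAI-style POST with `model` and `messages` (plus optional `temperature`, `top_p`, `top_k`) and return JSON containing `choices[0].message.content`. The script below is a minimal sketch for verifying this outside the app; it reads the same `.env` variables and uses the same fallback endpoint and model name as `highCompute.py`. The file name and test prompt are arbitrary placeholders.

```python
# check_endpoint.py: minimal sketch to confirm the configured server speaks the
# OpenAI Chat Completions format that highCompute.py expects.
import os

import requests
from dotenv import load_dotenv

load_dotenv()

# Same environment variables and fallbacks as highCompute.py.
endpoint = os.getenv("LLM_API_ENDPOINT", "http://127.0.0.1:8080/v1/chat/completions")
model = os.getenv("LLM_MODEL", "local-model")
api_key = os.getenv("LLM_API_KEY")

headers = {"Content-Type": "application/json"}
if api_key:
    headers["Authorization"] = f"Bearer {api_key}"  # same Authorization header the app sends

payload = {
    "model": model,
    "messages": [{"role": "user", "content": "Reply with the single word: pong"}],
    "temperature": 0.0,
}

resp = requests.post(endpoint, json=payload, headers=headers, timeout=60)
resp.raise_for_status()          # failure here corresponds to the app's "Network error"
data = resp.json()               # failure here corresponds to "Failed to decode JSON response"
print(data["choices"][0]["message"]["content"])  # KeyError here corresponds to "Invalid format"
```

If the script prints a short reply, the same endpoint, key, and model name should work in the web interface; if it raises an exception, it points at the same root cause as the corresponding error message in the chat.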
266 | -------------------------------------------------------------------------------- /highCompute.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import requests 3 | import json 4 | import os 5 | import re 6 | from dotenv import load_dotenv 7 | import time 8 | 9 | load_dotenv() 10 | 11 | DEFAULT_ENDPOINT = "http://127.0.0.1:8080/v1/chat/completions" 12 | DEFAULT_LLM_MODEL = "local-model" 13 | DEFAULT_API_KEY = None 14 | 15 | LOCAL_API_ENDPOINT = os.getenv("LLM_API_ENDPOINT", DEFAULT_ENDPOINT) 16 | LLM_MODEL = os.getenv("LLM_MODEL", DEFAULT_LLM_MODEL) 17 | LLM_API_KEY = os.getenv("LLM_API_KEY", DEFAULT_API_KEY) 18 | 19 | def call_llm(prompt, chat_history_gradio=None, temperature=0.7, top_p=None, top_k=None, stream=False): 20 | messages = [] 21 | if chat_history_gradio: 22 | for user_msg, assistant_msg in chat_history_gradio: 23 | if user_msg: 24 | messages.append({"role": "user", "content": user_msg}) 25 | if assistant_msg: 26 | messages.append({"role": "assistant", "content": assistant_msg}) 27 | messages.append({"role": "user", "content": prompt}) 28 | 29 | payload_dict = { 30 | "model": LLM_MODEL, 31 | "messages": messages, 32 | "temperature": temperature, 33 | "stream": stream 34 | } 35 | if top_p is not None and top_p < 1.0: 36 | payload_dict["top_p"] = top_p 37 | if top_k is not None and top_k > 0: 38 | payload_dict["top_k"] = top_k 39 | 40 | payload = json.dumps(payload_dict) 41 | headers = {'Content-Type': 'application/json; charset=utf-8', 'Accept': 'text/event-stream' if stream else 'application/json'} 42 | 43 | if LLM_API_KEY: 44 | headers['Authorization'] = f'Bearer {LLM_API_KEY}' 45 | print(f"Sending request to {LOCAL_API_ENDPOINT} using API Key.") 46 | else: 47 | print(f"Sending request to {LOCAL_API_ENDPOINT} without API Key.") 48 | 49 | print(f"Model: '{LLM_MODEL}', Stream: {stream}, Payload: {payload[:200]}...") 50 | 51 | try: 52 | response = requests.post(LOCAL_API_ENDPOINT, headers=headers, data=payload.encode('utf-8'), timeout=36000, stream=stream) 53 | response.raise_for_status() 54 | 55 | if stream: 56 | print("Processing stream...") 57 | for chunk_bytes in response.iter_content(chunk_size=None): 58 | if not chunk_bytes: 59 | continue 60 | try: 61 | lines = chunk_bytes.decode('utf-8').splitlines() 62 | for line in lines: 63 | if line.startswith("data:"): 64 | line_data = line[len("data:"):].strip() 65 | if line_data == "[DONE]": 66 | print("Stream finished.") 67 | break 68 | try: 69 | chunk = json.loads(line_data) 70 | if chunk.get("choices") and len(chunk["choices"]) > 0: 71 | delta = chunk["choices"][0].get("delta", {}) 72 | content_chunk = delta.get("content") 73 | if content_chunk: 74 | yield content_chunk 75 | except json.JSONDecodeError: 76 | print(f"Warning: Could not decode stream line JSON: {line_data}") 77 | continue 78 | except Exception as e: 79 | print(f"Error processing stream chunk: {e}, Line: {line_data}") 80 | yield f"\n[Error processing stream chunk: {e}]" 81 | break 82 | else: 83 | continue 84 | break 85 | except UnicodeDecodeError: 86 | print(f"Warning: Could not decode chunk as UTF-8: {chunk_bytes[:100]}...") 87 | continue 88 | print("Stream processing complete.") 89 | 90 | else: 91 | response.encoding = response.apparent_encoding if response.encoding is None else response.encoding 92 | data = response.json() 93 | print(f"Received non-stream response: {json.dumps(data, ensure_ascii=False, indent=2)}") 94 | 95 | if data.get("choices") and len(data["choices"]) > 0: 96 | 
message_content = data["choices"][0].get("message", {}).get("content") 97 | if message_content: 98 | yield message_content.strip() 99 | else: 100 | print("Error: 'content' key not found in LLM response choice.") 101 | yield "Error: 'content' not found in LLM response." 102 | else: 103 | print("Error: 'choices' array is missing, empty, or invalid in LLM response.") 104 | yield "Error: Invalid format in LLM response (missing 'choices')." 105 | 106 | except requests.exceptions.Timeout: 107 | print(f"Network error: Request timed out after 36000 seconds.") 108 | yield "Network error: Request timed out." 109 | except requests.exceptions.RequestException as e: 110 | print(f"Network error: {e}") 111 | yield f"Network error: {e}" 112 | except json.JSONDecodeError as e: 113 | print(f"Error: Failed to decode JSON response from server. Response text: {response.text}") 114 | yield f"Error: Failed to read server response (JSONDecodeError: {e}). Check server logs." 115 | except Exception as e: 116 | print(f"An unexpected error occurred: {e}") 117 | yield f"An unexpected error occurred: {e}" 118 | 119 | 120 | def low_compute(user_input, history, temperature, top_p, top_k): 121 | yield "[Status] Sending request directly to LLM..." 122 | print("[Low Mode] Sending LLM request (streaming)...") 123 | full_response = "" 124 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 125 | full_response += chunk 126 | yield full_response 127 | print("[Low Mode] Response stream finished.") 128 | 129 | 130 | def medium_compute(user_input, history, temperature, top_p, top_k): 131 | yield "[Status] Starting task decomposition (1 level)..." 132 | print("[Medium Mode] Starting task decomposition...") 133 | control_temp = max(0.1, temperature * 0.5) 134 | decompose_prompt = f'Original task: "{user_input}". Break it down into logical subtasks needed to solve it (numbered list). Be concise.' 135 | 136 | subtasks_text_gen = call_llm(decompose_prompt, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 137 | subtasks_text = next(subtasks_text_gen, "Error: No response from decomposition.") 138 | if subtasks_text.startswith("Error:") or subtasks_text.startswith("Network error:"): 139 | yield "[Status] Decomposition failed. Answering directly..." 140 | print(f"[Medium Mode] Decomposition failed: {subtasks_text}. Responding directly (streaming)...") 141 | full_response = "" 142 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 143 | full_response += chunk 144 | yield full_response 145 | print("[Medium Mode] Direct response stream finished after decomposition failure.") 146 | return 147 | 148 | subtasks = re.findall(r"^\s*\d+\.\s*(.*)", subtasks_text, re.MULTILINE) 149 | 150 | if not subtasks: 151 | yield "[Status] Decomposition returned no subtasks. Answering directly..." 152 | print("[Medium Mode] Decomposition returned no numbered points. Responding directly (streaming)...") 153 | full_response = "" 154 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 155 | full_response += chunk 156 | yield full_response 157 | print("[Medium Mode] Direct response stream finished after no subtasks found.") 158 | return 159 | 160 | yield f"[Status] Task divided into {len(subtasks)} subtasks. Solving them one by one..." 
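# Solve each subtask sequentially with non-streaming calls, passing the original chat history as context;
# if any subtask call returns an error string, fall back to a direct streamed answer to the original query.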
161 | print(f"[Medium Mode] Task divided into {len(subtasks)} subtasks.") 162 | subtask_results = [] 163 | temp_history_medium = history.copy() if history else [] 164 | 165 | for i, subtask in enumerate(subtasks): 166 | subtask = subtask.strip() 167 | if not subtask: continue 168 | yield f"[Status] Solving subtask {i+1}/{len(subtasks)}: \"{subtask}...\"" 169 | print(f"[Medium Mode] Solving subtask {i+1}/{len(subtasks)}: \"{subtask}\"...") 170 | solve_prompt = f'Original overall task: "{user_input}". Current subtask: "{subtask}". Provide a detailed solution or answer for this specific subtask.' 171 | 172 | subtask_result_gen = call_llm(solve_prompt, chat_history_gradio=temp_history_medium, temperature=temperature, top_p=top_p, top_k=top_k, stream=False) 173 | subtask_result = next(subtask_result_gen, f"Error: No response for subtask {i+1}.") 174 | subtask_results.append({"subtask": subtask, "result": subtask_result}) 175 | print(f"[Medium Mode] Subtask {i+1} result: Received.") 176 | if subtask_result.startswith("Error:") or subtask_result.startswith("Network error:"): 177 | yield f"[Status] Error solving subtask {i+1}. Aborting and attempting direct answer..." 178 | print(f"[Medium Mode] Error solving subtask {i+1}: {subtask_result}. Responding directly (streaming)...") 179 | full_response = "" 180 | for chunk in call_llm(user_input, chat_history_gradio=history, temperature=temperature, top_p=top_p, top_k=top_k, stream=True): 181 | full_response += chunk 182 | yield full_response 183 | print("[Medium Mode] Direct response stream finished after subtask error.") 184 | return 185 | 186 | yield "[Status] All subtasks solved. Synthesizing final response..." 187 | print("[Medium Mode] Synthesizing final response (streaming)...") 188 | synthesis_prompt = f'Original task: "{user_input}". The task was broken down and the results for each subtask are:\n---\n' 189 | for i, res in enumerate(subtask_results): 190 | synthesis_prompt += f"{i+1}. Subtask: {res['subtask']}\n Result: {res['result']}\n---\n" 191 | synthesis_prompt += "Combine these results into a single, coherent, well-formatted final response that directly addresses the original task. Do not just list the subtasks and results; synthesize them." 192 | 193 | full_response = "" 194 | for chunk in call_llm(synthesis_prompt, temperature=control_temp, top_p=top_p, top_k=top_k, stream=True): 195 | full_response += chunk 196 | yield full_response 197 | print("[Medium Mode] Final response stream synthesized.") 198 | 199 | 200 | def high_compute(user_input, history, temperature, top_p, top_k): 201 | yield "[Status] Starting task decomposition (Level 1)..." 202 | print("[High Mode] Starting task decomposition (Level 1)...") 203 | control_temp = max(0.1, temperature * 0.5) 204 | decompose_prompt_l1 = f'Original complex task: "{user_input}". Break this down into major high-level stages or components (Level 1 - numbered list). Keep items distinct and logical.' 205 | 206 | subtasks_l1_text_gen = call_llm(decompose_prompt_l1, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 207 | subtasks_l1_text = next(subtasks_l1_text_gen, "Error: No response from L1 decomposition.") 208 | 209 | if subtasks_l1_text.startswith("Error:") or subtasks_l1_text.startswith("Network error:"): 210 | yield "[Status] Level 1 decomposition failed. Falling back to Medium compute mode..." 211 | print(f"[High Mode] Decomposition failed (Level 1): {subtasks_l1_text}. 
Falling back to Medium Mode...") 212 | yield from medium_compute(user_input, history, temperature, top_p, top_k) 213 | return 214 | 215 | subtasks_l1 = re.findall(r"^\s*\d+\.\s*(.*)", subtasks_l1_text, re.MULTILINE) 216 | 217 | if not subtasks_l1: 218 | yield "[Status] Level 1 decomposition returned no subtasks. Falling back to Medium compute mode..." 219 | print("[High Mode] Decomposition returned no subtasks (Level 1). Falling back to Medium Mode...") 220 | yield from medium_compute(user_input, history, temperature, top_p, top_k) 221 | return 222 | 223 | yield f"[Status] Task divided into {len(subtasks_l1)} Level 1 stages. Processing stages..." 224 | print(f"[High Mode] Task divided into {len(subtasks_l1)} Level 1 subtasks.") 225 | subtasks_l1_results = [] 226 | temp_history_high = history.copy() if history else [] 227 | 228 | for i, subtask_l1 in enumerate(subtasks_l1): 229 | subtask_l1 = subtask_l1.strip() 230 | if not subtask_l1: continue 231 | yield f"[Status] Processing Level 1 stage {i+1}/{len(subtasks_l1)}: \"{subtask_l1}...\". Starting mandatory Level 2 decomposition..." 232 | print(f"[High Mode] Working on Level 1 subtask ({i+1}/{len(subtasks_l1)}): \"{subtask_l1}\"") 233 | print(f"[High Mode] Attempting MANDATORY Level 2 decomposition for: \"{subtask_l1}\"...") 234 | 235 | decompose_prompt_l2 = f'Current high-level stage (Level 1): "{subtask_l1}". Break THIS stage down into smaller, actionable steps (Level 2 - numbered list). You MUST provide the steps as a numbered list starting with "1.". Even if there is only one step, write "1. {subtask_l1}". Do not use phrases like "No further decomposition needed". Just provide the list.' 236 | 237 | subtasks_l2_text_gen = call_llm(decompose_prompt_l2, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 238 | subtasks_l2_text = next(subtasks_l2_text_gen, f"Error: No response for L2 decomposition of stage {i+1}.") 239 | print(f"[DEBUG High Mode] Raw L2 decomposition text for '{subtask_l1}':\n>>>\n{subtasks_l2_text}\n<<<") 240 | 241 | if subtasks_l2_text.startswith("Error:") or subtasks_l2_text.startswith("Network error:"): 242 | yield f"[Status] Stage {i+1}: L2 decomposition failed ({subtasks_l2_text}). Forcing L1 task as single L2 step." 243 | print(f"[High Mode] L2 decomposition failed for \"{subtask_l1}\": {subtasks_l2_text}. Forcing it as a single L2 step.") 244 | subtasks_l2 = [subtask_l1.strip()] 245 | else: 246 | subtasks_l2 = re.findall(r"^\s*\d+\.\s*(.*)", subtasks_l2_text, re.MULTILINE) 247 | if not subtasks_l2: 248 | yield f"[Status] Stage {i+1}: L2 decomposition format issue or LLM refusal. Forcing L1 task as single L2 step." 249 | print(f"[High Mode] L2 decomposition failed/refused for \"{subtask_l1}\". Forcing it as a single L2 step.") 250 | subtasks_l2 = [subtask_l1.strip()] 251 | 252 | 253 | yield f"[Status] Stage {i+1} processing {len(subtasks_l2)} Level 2 step(s)..." 
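# Solve each Level 2 step with non-streaming calls; if a step fails, record the error text as this
# stage's result and continue with the next Level 1 stage rather than aborting the whole run.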
254 | print(f"[High Mode] Processing {len(subtasks_l2)} Level 2 step(s) for L1 subtask \"{subtask_l1}\".") 255 | subtasks_l2_results = [] 256 | abort_stage = False 257 | for j, subtask_l2 in enumerate(subtasks_l2): 258 | subtask_l2 = subtask_l2.strip() 259 | if not subtask_l2: continue 260 | yield f"[Status] Stage {i+1}/{len(subtasks_l1)}, Solving L2 step {j+1}/{len(subtasks_l2)}: \"{subtask_l2}...\"" 261 | print(f"[High Mode] Solving Level 2 step ({j+1}/{len(subtasks_l2)}): \"{subtask_l2}\"...") 262 | 263 | if len(subtasks_l2) == 1 and subtask_l2 == subtask_l1: 264 | solve_prompt_l2 = f'Original task: "{user_input}".\nCurrent Level 1 stage: "{subtask_l1}".\nThis stage could not be broken down further. Solve this specific stage in detail.' 265 | else: 266 | solve_prompt_l2 = f'Original task: "{user_input}".\nCurrent Level 1 stage: "{subtask_l1}".\nCurrent Level 2 step: "{subtask_l2}".\nSolve this specific Level 2 step in detail.' 267 | 268 | result_l2_gen = call_llm(solve_prompt_l2, chat_history_gradio=temp_history_high, temperature=temperature, top_p=top_p, top_k=top_k, stream=False) 269 | result_l2 = next(result_l2_gen, f"Error: No response for L2 step {j+1}.") 270 | subtasks_l2_results.append({"subtask": subtask_l2, "result": result_l2}) 271 | print(f"[High Mode] Level 2 step result ({j+1}): Received.") 272 | if result_l2.startswith("Error:") or result_l2.startswith("Network error:"): 273 | yield f"[Status] Error solving L2 step {j+1} in stage {i+1}. Aborting stage..." 274 | print(f"[High Mode] Error solving L2 step {j+1}: {result_l2}. Aborting stage {i+1}.") 275 | subtasks_l1_results.append({"subtask": subtask_l1, "result": f"[Error processing stage {i+1}: {result_l2}]"}) 276 | abort_stage = True 277 | break 278 | 279 | if abort_stage: 280 | continue 281 | 282 | 283 | yield f"[Status] Stage {i+1}: Synthesizing results from {len(subtasks_l2)} Level 2 step(s)..." 284 | print(f"[High Mode] Synthesizing Level 2 results for L1 subtask \"{subtask_l1}\"...") 285 | result_l1_final = "" 286 | if len(subtasks_l2_results) == 1: 287 | result_l1_final = subtasks_l2_results[0]['result'] 288 | print(f"[High Mode] Result for \"{subtask_l1}\" (from single L2 step): Received.") 289 | else: 290 | synthesis_prompt_l2 = f'The goal for this stage was: "{subtask_l1}". The results for the Level 2 steps taken are:\n---\n' 291 | for j, res_l2 in enumerate(subtasks_l2_results): 292 | synthesis_prompt_l2 += f"{j+1}. Step: {res_l2['subtask']}\n Result: {res_l2['result']}\n---\n" 293 | synthesis_prompt_l2 += f'Synthesize these results into a single, coherent answer for the Level 1 stage: "{subtask_l1}". Focus on fulfilling the goal of this stage.' 294 | 295 | result_l1_final_gen = call_llm(synthesis_prompt_l2, temperature=control_temp, top_p=top_p, top_k=top_k, stream=False) 296 | result_l1_final = next(result_l1_final_gen, f"Error: No response for L1 synthesis stage {i+1}.") 297 | print(f"[High Mode] Result for \"{subtask_l1}\" (synthesized from L2): Received.") 298 | if result_l1_final.startswith("Error:") or result_l1_final.startswith("Network error:"): 299 | yield f"[Status] Error synthesizing L2 results for stage {i+1}. Using raw results..." 300 | print(f"[High Mode] Error synthesizing L2 results for stage {i+1}: {result_l1_final}. 
Using raw results.") 301 | result_l1_final = "\n".join([f"Step {j+1}: {res['subtask']}\nResult: {res['result']}" for j, res in enumerate(subtasks_l2_results)]) 302 | 303 | 304 | subtasks_l1_results.append({"subtask": subtask_l1, "result": result_l1_final}) 305 | 306 | yield "[Status] All Level 1 stages processed. Synthesizing final response..." 307 | print("[High Mode] Synthesizing final response from Level 1 results (streaming)...") 308 | final_synthesis_prompt = f'Original complex task: "{user_input}". The task was addressed in the following major stages, with these results:\n---\n' 309 | for i, res_l1 in enumerate(subtasks_l1_results): 310 | final_synthesis_prompt += f"{i+1}. Stage: {res_l1['subtask']}\n Overall Result for Stage: {res_l1['result']}\n---\n" 311 | final_synthesis_prompt += "Synthesize all these stage results into a comprehensive, well-structured final answer that directly addresses the original complex task. Ensure coherence and clarity." 312 | 313 | full_response = "" 314 | for chunk in call_llm(final_synthesis_prompt, temperature=control_temp, top_p=top_p, top_k=top_k, stream=True): 315 | full_response += chunk 316 | yield full_response 317 | print("[High Mode] Final response stream synthesized.") 318 | 319 | 320 | def chat_interface_logic(message, history, compute_level, temperature, top_p, top_k): 321 | if history is None: 322 | history = [] 323 | 324 | history.append([message, ""]) 325 | yield history, "", "[Status] Processing request..." 326 | 327 | compute_function = None 328 | if compute_level == "Low": 329 | compute_function = low_compute 330 | elif compute_level == "Medium": 331 | compute_function = medium_compute 332 | elif compute_level == "High": 333 | compute_function = high_compute 334 | else: 335 | error_msg = "Error: Unknown computation level selected." 336 | history[-1][1] = error_msg 337 | yield history, "", "[Status] Error" 338 | return 339 | 340 | response_generator = compute_function(message, history[:-1], temperature, top_p, top_k) 341 | 342 | final_assistant_response = "" 343 | current_status = "[Status] Processing request..." 344 | 345 | try: 346 | for response_part in response_generator: 347 | if isinstance(response_part, str) and response_part.startswith("[Status]"): 348 | current_status = response_part 349 | yield history, "", current_status 350 | elif isinstance(response_part, str): 351 | final_assistant_response = response_part 352 | history[-1][1] = final_assistant_response 353 | yield history, "", current_status 354 | else: 355 | print(f"Warning: Unexpected type yielded from compute function: {type(response_part)}") 356 | error_fragment = f"\n[Warning: Unexpected data type in response stream: {type(response_part)}]" 357 | final_assistant_response += error_fragment 358 | history[-1][1] = final_assistant_response 359 | yield history, "", current_status 360 | 361 | except Exception as e: 362 | print(f"Error during response generation: {e}") 363 | error_msg = f"An error occurred during processing: {e}" 364 | history[-1][1] = error_msg 365 | yield history, "", "[Status] Error Encountered" 366 | return 367 | 368 | yield history, "", "" 369 | 370 | 371 | def regenerate_last(history, compute_level, temperature, top_p, top_k): 372 | if not history: 373 | yield history, "", "[Status] Cannot regenerate: Chat history is empty." 374 | return 375 | 376 | if history[-1][0] is None or history[-1][0] == "": 377 | yield history, "", "[Status] Cannot regenerate: Last entry is not a user message." 
378 | return 379 | 380 | last_user_message = history[-1][0] 381 | history_context = history[:-1] 382 | 383 | history[-1][1] = "" 384 | yield history, "", f"[Status] Regenerating response for: \"{last_user_message[:50]}...\"" 385 | 386 | compute_function = None 387 | if compute_level == "Low": 388 | compute_function = low_compute 389 | elif compute_level == "Medium": 390 | compute_function = medium_compute 391 | elif compute_level == "High": 392 | compute_function = high_compute 393 | else: 394 | error_msg = "Error: Unknown computation level selected." 395 | history[-1][1] = error_msg 396 | yield history, "", "[Status] Error" 397 | return 398 | 399 | response_generator = compute_function(last_user_message, history_context, temperature, top_p, top_k) 400 | 401 | final_assistant_response = "" 402 | current_status = f"[Status] Regenerating response for: \"{last_user_message[:50]}...\"" 403 | 404 | try: 405 | for response_part in response_generator: 406 | if isinstance(response_part, str) and response_part.startswith("[Status]"): 407 | current_status = response_part 408 | yield history, "", current_status 409 | elif isinstance(response_part, str): 410 | final_assistant_response = response_part 411 | history[-1][1] = final_assistant_response 412 | yield history, "", current_status 413 | else: 414 | print(f"Warning: Unexpected type yielded during regeneration: {type(response_part)}") 415 | error_fragment = f"\n[Warning: Unexpected data type in response stream: {type(response_part)}]" 416 | final_assistant_response += error_fragment 417 | history[-1][1] = final_assistant_response 418 | yield history, "", current_status 419 | 420 | except Exception as e: 421 | print(f"Error during response regeneration: {e}") 422 | error_msg = f"An error occurred during regeneration: {e}" 423 | history[-1][1] = error_msg 424 | yield history, "", "[Status] Error Encountered during Regeneration" 425 | return 426 | 427 | yield history, "", "" 428 | 429 | 430 | with gr.Blocks(theme=gr.themes.Soft()) as demo: 431 | gr.Markdown("# Advanced Chat Agent with Computation Levels (Local LLM)") 432 | gr.Markdown(f"Using endpoint: `{LOCAL_API_ENDPOINT}` with model `{LLM_MODEL}`") 433 | if LLM_API_KEY: 434 | gr.Markdown("API Key: Loaded from environment variable.") 435 | else: 436 | gr.Markdown("API Key: Not configured (using endpoint without Authorization header).") 437 | 438 | with gr.Row(): 439 | with gr.Column(scale=1): 440 | compute_level_selector = gr.Radio( 441 | ["Low", "Medium", "High"], 442 | label="Computation Level", 443 | value="Low", 444 | info="Low: Direct response. Medium: 1-level decomposition. High: 2-level decomposition." 445 | ) 446 | temp_slider = gr.Slider( 447 | minimum=0.0, maximum=2.0, value=0.7, step=0.1, label="Temperature", 448 | info="Controls randomness. Lower values make the model more deterministic." 449 | ) 450 | top_p_slider = gr.Slider( 451 | minimum=0.0, maximum=1.0, value=1.0, step=0.05, label="Top-P (Nucleus Sampling)", 452 | info="Considers only tokens with cumulative probability >= top_p. 1.0 disables it." 453 | ) 454 | top_k_slider = gr.Slider( 455 | minimum=0, maximum=100, value=0, step=1, label="Top-K", 456 | info="Considers only the top k most likely tokens. 0 disables it." 
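# Note: call_llm() only forwards top_k to the backend when it is greater than 0,
# and top_p only when it is below 1.0, so the default slider values leave both disabled.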
457 | ) 458 | with gr.Row(): 459 | regenerate_btn = gr.Button("Regenerate") 460 | clear_btn = gr.ClearButton(value="Clear Chat") 461 | 462 | 463 | with gr.Column(scale=4): 464 | status_display = gr.Markdown("", label="Current Status") 465 | chatbot = gr.Chatbot(label="Chat", height=700, show_copy_button=True, likeable=True, show_share_button=True) 466 | with gr.Row(): 467 | chat_input = gr.Textbox( 468 | label="Your message", 469 | placeholder="Enter your query here...", 470 | scale=4, 471 | show_label=False, 472 | container=False 473 | ) 474 | submit_btn = gr.Button("Submit", variant="primary", scale=1, min_width=120) 475 | 476 | clear_btn.add(components=[chat_input, chatbot, status_display]) 477 | 478 | submit_inputs = [chat_input, chatbot, compute_level_selector, temp_slider, top_p_slider, top_k_slider] 479 | submit_outputs = [chatbot, chat_input, status_display] 480 | 481 | regenerate_inputs = [chatbot, compute_level_selector, temp_slider, top_p_slider, top_k_slider] 482 | regenerate_outputs = [chatbot, chat_input, status_display] 483 | 484 | submit_btn.click( 485 | fn=chat_interface_logic, 486 | inputs=submit_inputs, 487 | outputs=submit_outputs, 488 | queue=True 489 | ) 490 | chat_input.submit( 491 | fn=chat_interface_logic, 492 | inputs=submit_inputs, 493 | outputs=submit_outputs, 494 | queue=True 495 | ) 496 | regenerate_btn.click( 497 | fn=regenerate_last, 498 | inputs=regenerate_inputs, 499 | outputs=regenerate_outputs, 500 | queue=True 501 | ) 502 | 503 | 504 | if __name__ == "__main__": 505 | print(f"Launching Gradio interface for local LLM...") 506 | print(f"Connecting to: {LOCAL_API_ENDPOINT}") 507 | print(f"Model name used in requests: {LLM_MODEL}") 508 | if LLM_API_KEY: 509 | print("API Key detected in environment variables.") 510 | else: 511 | print("API Key not found in environment variables.") 512 | try: 513 | base_url = '/'.join(LOCAL_API_ENDPOINT.split('/')[:3]) 514 | response = requests.get(base_url, timeout=5) 515 | print(f"Base URL {base_url} is accessible (Status: {response.status_code}).") 516 | except Exception as e: 517 | print(f"Warning: Could not check endpoint base URL accessibility ({base_url}): {e}") 518 | 519 | demo.launch() 520 | --------------------------------------------------------------------------------