├── Dockerfile
├── LICENSE
├── README.md
├── agents
├── base.py
├── code_exec_agent.py
├── default_agent.py
├── pdf_file_agent.py
├── pdf_link_agent.py
└── selector.py
├── config.py
├── llm
└── phi_wrapper.py
├── main.py
├── requirements.txt
├── sanitizer
└── prompt_sanitizer.py
├── society_mind
└── autogen_society.py
├── templates
├── code_instruction.txt
├── critic_instruction.txt
├── default_instruction.txt
├── finalizer_instruction.txt
├── generator_instruction.txt
└── pdf_instruction.txt
└── utils
├── cache.py
├── docker_sandbox.py
├── exceptions.py
├── io.py
├── logger.py
└── pdf_utils.py
/Dockerfile:
--------------------------------------------------------------------------------
1 | #
2 | # NOTE: THIS DOCKERFILE IS GENERATED VIA "apply-templates.sh"
3 | #
4 | # PLEASE DO NOT EDIT IT DIRECTLY.
5 | #
6 |
7 | FROM debian:bookworm-slim
8 |
9 | # ensure local python is preferred over distribution python
10 | ENV PATH /usr/local/bin:$PATH
11 |
12 | # runtime dependencies
13 | RUN set -eux; \
14 | apt-get update; \
15 | apt-get install -y --no-install-recommends \
16 | ca-certificates \
17 | netbase \
18 | tzdata \
19 | ; \
20 | rm -rf /var/lib/apt/lists/*
21 |
22 | ENV PYTHON_VERSION 3.14.0a7
23 | ENV PYTHON_SHA256 71adbcec3ac9edf93308e55cfb4184f2eb4b16fda2bb0a5a382929ed29c8386d
24 |
25 | RUN set -eux; \
26 | \
27 | savedAptMark="$(apt-mark showmanual)"; \
28 | apt-get update; \
29 | apt-get install -y --no-install-recommends \
30 | dpkg-dev \
31 | gcc \
32 | gnupg \
33 | libbluetooth-dev \
34 | libbz2-dev \
35 | libc6-dev \
36 | libdb-dev \
37 | libffi-dev \
38 | libgdbm-dev \
39 | liblzma-dev \
40 | libncursesw5-dev \
41 | libreadline-dev \
42 | libsqlite3-dev \
43 | libssl-dev \
44 | make \
45 | tk-dev \
46 | uuid-dev \
47 | wget \
48 | xz-utils \
49 | zlib1g-dev \
50 | ; \
51 | \
52 | wget -O python.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz"; \
53 | echo "$PYTHON_SHA256 *python.tar.xz" | sha256sum -c -; \
54 | mkdir -p /usr/src/python; \
55 | tar --extract --directory /usr/src/python --strip-components=1 --file python.tar.xz; \
56 | rm python.tar.xz; \
57 | \
58 | cd /usr/src/python; \
59 | gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \
60 | ./configure \
61 | --build="$gnuArch" \
62 | --enable-loadable-sqlite-extensions \
63 | --enable-optimizations \
64 | --enable-option-checking=fatal \
65 | --enable-shared \
66 | --with-lto \
67 | --with-ensurepip \
68 | ; \
69 | nproc="$(nproc)"; \
70 | EXTRA_CFLAGS="$(dpkg-buildflags --get CFLAGS)"; \
71 | LDFLAGS="$(dpkg-buildflags --get LDFLAGS)"; \
72 | LDFLAGS="${LDFLAGS:--Wl},--strip-all"; \
73 | arch="$(dpkg --print-architecture)"; arch="${arch##*-}"; \
74 | # https://docs.python.org/3.12/howto/perf_profiling.html
75 | # https://github.com/docker-library/python/pull/1000#issuecomment-2597021615
76 | case "$arch" in \
77 | amd64|arm64) \
78 | # only add "-mno-omit-leaf" on arches that support it
79 | # https://gcc.gnu.org/onlinedocs/gcc-14.2.0/gcc/x86-Options.html#index-momit-leaf-frame-pointer-2
80 | # https://gcc.gnu.org/onlinedocs/gcc-14.2.0/gcc/AArch64-Options.html#index-momit-leaf-frame-pointer
81 | EXTRA_CFLAGS="${EXTRA_CFLAGS:-} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"; \
82 | ;; \
83 | i386) \
84 | # don't enable frame-pointers on 32bit x86 due to performance drop.
85 | ;; \
86 | *) \
87 | # other arches don't support "-mno-omit-leaf"
88 | EXTRA_CFLAGS="${EXTRA_CFLAGS:-} -fno-omit-frame-pointer"; \
89 | ;; \
90 | esac; \
91 | make -j "$nproc" \
92 | "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \
93 | "LDFLAGS=${LDFLAGS:-}" \
94 | ; \
95 | # https://github.com/docker-library/python/issues/784
96 | # prevent accidental usage of a system installed libpython of the same version
97 | rm python; \
98 | make -j "$nproc" \
99 | "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \
100 | "LDFLAGS=${LDFLAGS:--Wl},-rpath='\$\$ORIGIN/../lib'" \
101 | python \
102 | ; \
103 | make install; \
104 | \
105 | cd /; \
106 | rm -rf /usr/src/python; \
107 | \
108 | find /usr/local -depth \
109 | \( \
110 | \( -type d -a \( -name test -o -name tests -o -name idle_test \) \) \
111 | -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' -o -name 'libpython*.a' \) \) \
112 | \) -exec rm -rf '{}' + \
113 | ; \
114 | \
115 | ldconfig; \
116 | \
117 | apt-mark auto '.*' > /dev/null; \
118 | apt-mark manual $savedAptMark; \
119 | find /usr/local -type f -executable -not \( -name '*tkinter*' \) -exec ldd '{}' ';' \
120 | | awk '/=>/ { so = $(NF-1); if (index(so, "/usr/local/") == 1) { next }; gsub("^/(usr/)?", "", so); printf "*%s\n", so }' \
121 | | sort -u \
122 | | xargs -r dpkg-query --search \
123 | | cut -d: -f1 \
124 | | sort -u \
125 | | xargs -r apt-mark manual \
126 | ; \
127 | apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false; \
128 | rm -rf /var/lib/apt/lists/*; \
129 | \
130 | export PYTHONDONTWRITEBYTECODE=1; \
131 | python3 --version; \
132 | pip3 --version
133 |
134 | # make some useful symlinks that are expected to exist ("/usr/local/bin/python" and friends)
135 | RUN set -eux; \
136 | for src in idle3 pip3 pydoc3 python3 python3-config; do \
137 | dst="$(echo "$src" | tr -d 3)"; \
138 | [ -s "/usr/local/bin/$src" ]; \
139 | [ ! -e "/usr/local/bin/$dst" ]; \
140 | ln -svT "$src" "/usr/local/bin/$dst"; \
141 | done
142 |
143 | CMD ["python3"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2025 by Enigma
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🧠 MindForce: Intelligent Context-Aware Assistant [beta-version]
2 |
3 | **Умный ассистент с поддержкой контекста, анализом документов и безопасным выполнением кода**
4 |
5 | ---
6 |
7 | ## 🌟 Особенности
8 |
9 | - 📄 **Анализ PDF** (по ссылкам и локальным файлам)
10 | - ⌨️ **Безопасное выполнение кода** в Docker-песочнице
11 | - 🤖 **Итеративное улучшение ответов** (Society of Mind)
12 | - 🔒 **Защита от инъекций** с помощью BERT-модели
13 | - 🧩 **Модульная архитектура** с переиспользуемыми агентами
14 | - ⚡ **Умное кэширование** с учетом версий и данных
15 |
16 | ---
17 |
18 | ## 🏗 Архитектура системы
19 |
20 | ```mermaid
21 | graph TD
22 | A[Пользовательский запрос] --> B{Тип контента}
23 | B -->|PDF-ссылка| C[PDFLinkAgent]
24 | B -->|Код| D[CodeExecutionAgent]
25 | B -->|Локальный PDF| E[PDFFileAgent]
26 | B -->|Текст| F[DefaultAgent]
27 | C --> G[Извлечение текста]
28 | D --> H[Запуск в Docker]
29 | E --> G
30 | G --> I[Семантический поиск]
31 | H --> J[Сбор результатов]
32 | I --> K[Society of Mind]
33 | J --> K
34 | K --> L[Генерация ответа]
35 | L --> M[Кэширование]
36 |     M --> N[Пользователь]
37 | ```
--------------------------------------------------------------------------------
/agents/base.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional

from utils.exceptions import ProcessingError
4 |
class Agent(ABC):
    """Abstract base class for all task agents.

    Subclasses declare their mandatory configuration keys via
    ``required_params`` and implement the actual work in ``execute``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the configuration and validate it immediately.

        Args:
            config: Agent configuration mapping; ``None`` means empty.

        Raises:
            ProcessingError: If a required configuration key is missing.
        """
        self.config = config or {}
        self._validate_config()

    def _validate_config(self) -> None:
        """Check that every key from ``required_params`` is present in the config."""
        for param in self.required_params():
            if param not in self.config:
                raise ProcessingError(f"Missing required parameter: {param}")

    @staticmethod
    @abstractmethod
    def required_params() -> list:
        """Return the list of mandatory configuration keys for this agent."""
        return []

    @abstractmethod
    async def execute(self, input_data: str) -> str:
        """Run the agent's task on *input_data* and return the result."""
        pass

    def __repr__(self):
        return f"<{self.__class__.__name__} config={self.config}>"
--------------------------------------------------------------------------------
/agents/code_exec_agent.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Dict, Any
3 | from .base import Agent
4 | from utils.docker_sandbox import DockerSandbox
5 | from utils.exceptions import (CodeExecutionError, ResourceLimitExceeded,
6 | DockerSecurityException)
7 |
class CodeExecutionAgent(Agent):
    """Agent that runs user-supplied code inside a Docker sandbox.

    Code is screened against a pattern blacklist before execution, and
    the sandbox output is size-limited and scrubbed of obvious secrets.
    """

    MAX_OUTPUT_LENGTH = 10000
    # Patterns that must never appear in user code (sandbox-escape vectors).
    BLACKLIST_PATTERNS = [
        r"os\.system",
        r"subprocess\.",
        r"open\(",
        r"import\s+(os|sys|subprocess)",
        r"__import__",
        r"eval\(",
        r"exec\(",
        r"pickle\.",
        r"shutil\.",
        r"socket\."
    ]

    @staticmethod
    def required_params():
        return ["docker_config"]

    def __init__(self, config: Dict[str, Any]):
        super().__init__(config)
        self.sandbox = DockerSandbox(config["docker_config"])

    async def execute(self, input_data: str) -> str:
        """Validate *input_data*, run it in the sandbox, sanitize the output.

        Raises:
            CodeExecutionError: On security violations or execution failure.
        """
        try:
            self._validate_code(input_data)
            result = await self.sandbox.execute(input_data)
            return self._sanitize_output(result)
        except DockerSecurityException as e:
            raise CodeExecutionError(f"Security violation: {str(e)}") from e
        except Exception as e:
            raise CodeExecutionError(str(e)) from e

    def _validate_code(self, code: str):
        """Reject code containing any blacklisted dangerous pattern."""
        for pattern in self.BLACKLIST_PATTERNS:
            if re.search(pattern, code):
                raise DockerSecurityException(f"Blocked pattern: {pattern}")

    def _sanitize_output(self, output: str) -> str:
        """Enforce the output size limit and redact credential assignments.

        Raises:
            ResourceLimitExceeded: If the output exceeds MAX_OUTPUT_LENGTH.
        """
        if len(output) > self.MAX_OUTPUT_LENGTH:
            raise ResourceLimitExceeded("Output too large")

        # Redact credential-looking assignments. Fixed: the original regex
        # only matched single-quoted values, letting "KEY = \"...\"" through.
        cleaned = re.sub(r"(API_KEY|SECRET|PASSWORD)\s*=\s*(['\"]).*?\2",
                         "[REDACTED]", output)
        return cleaned[:self.MAX_OUTPUT_LENGTH]
--------------------------------------------------------------------------------
/agents/default_agent.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Any
2 | from .base import Agent
3 | from utils.exceptions import ProcessingError
4 |
class DefaultAgent(Agent):
    """Fallback agent: passes the prompt through unchanged as context."""

    @staticmethod
    def required_params():
        # No mandatory configuration keys. This override is required:
        # the base class declares required_params as @abstractmethod, so
        # without it DefaultAgent() (as called by AgentSelector with no
        # arguments) raises TypeError for being abstract.
        return []

    async def execute(self, input_data: str) -> str:
        """Return *input_data* unchanged; plain-text requests need no preprocessing."""
        # The previous try/except around this bare return was dead code:
        # returning a reference cannot raise.
        return input_data
--------------------------------------------------------------------------------
/agents/pdf_file_agent.py:
--------------------------------------------------------------------------------
import os
import re
from pathlib import Path
from typing import Any, Dict

import fitz
from sentence_transformers import util

from .base import Agent
from utils.exceptions import PDFProcessingError, ResourceLimitExceeded
7 |
class PDFFileAgent(Agent):
    """Agent that parses an uploaded PDF and returns the sections most
    relevant to the user's query (via embedding cosine similarity)."""

    MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB upload cap
    ALLOWED_MIME_TYPES = ["application/pdf"]

    @staticmethod
    def required_params():
        return ["upload_dir", "embedding_model"]

    def __init__(self, config: Dict[str, Any]):
        super().__init__(config)
        self.upload_dir = Path(config["upload_dir"])
        self.embedding_model = config["embedding_model"]
        self._validate_upload_dir()

    def _validate_upload_dir(self):
        """Create the upload directory if needed and ensure it is writable.

        Raises:
            PDFProcessingError: If the directory cannot be written to.
        """
        # exist_ok avoids the check-then-create race of the original code.
        self.upload_dir.mkdir(parents=True, exist_ok=True)
        if not os.access(self.upload_dir, os.W_OK):
            raise PDFProcessingError("Upload directory not writable")

    async def execute(self, input_data: str) -> str:
        """Full pipeline: locate the uploaded file, parse it, rank sections.

        Raises:
            PDFProcessingError: On any validation, parsing or search failure.
        """
        try:
            file_path = self._validate_file(input_data)
            text = self._parse_pdf(file_path)
            return self._find_relevant_sections(text, input_data)
        except Exception as e:
            raise PDFProcessingError(str(e)) from e

    def _validate_file(self, input_data: str) -> Path:
        """Extract the uploaded file name from the prompt and validate it.

        Raises:
            PDFProcessingError: If the name is missing, escapes the upload
                directory, or the file does not exist.
            ResourceLimitExceeded: If the file is larger than MAX_FILE_SIZE.
        """
        # NOTE(review): this pattern matches only the first character — the
        # original upload-marker tags around the file name appear to have
        # been lost (cf. the same stripped markers in AgentSelector);
        # restore the real delimiters and verify against the caller.
        file_match = re.search(r"(.+?)", input_data)
        if not file_match:
            raise PDFProcessingError("Invalid file format")

        file_path = (self.upload_dir / file_match.group(1)).resolve()
        # Reject path traversal ("../../etc/passwd"): the resolved path must
        # stay inside the upload directory, since the name is user-supplied.
        if self.upload_dir.resolve() not in file_path.parents:
            raise PDFProcessingError("File outside upload directory")
        if not file_path.exists():
            raise PDFProcessingError("File not found")

        if file_path.stat().st_size > self.MAX_FILE_SIZE:
            raise ResourceLimitExceeded("File size exceeds limit")

        return file_path

    def _parse_pdf(self, file_path: Path) -> str:
        """Extract plain text from every page of the PDF.

        Raises:
            PDFProcessingError: If the file is not a structurally valid PDF.
        """
        try:
            doc = fitz.open(file_path)
            try:
                return "\n".join(page.get_text() for page in doc)
            finally:
                doc.close()  # release file handle; original leaked it
        except fitz.FileDataError:
            raise PDFProcessingError("Invalid PDF file structure")
        except Exception as e:
            raise PDFProcessingError(f"PDF parsing error: {str(e)}") from e

    def _find_relevant_sections(self, text: str, query: str) -> str:
        """Rank paragraph chunks by cosine similarity to *query*; return top 5.

        Raises:
            PDFProcessingError: If embedding or scoring fails.
        """
        try:
            chunks = text.split("\n\n")
            query_embedding = self.embedding_model.encode(query)
            doc_embeddings = self.embedding_model.encode(chunks)

            scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
            top_indices = scores.argsort(descending=True)[:5]

            return "\n".join([chunks[i] for i in top_indices])
        except Exception as e:
            raise PDFProcessingError(f"Relevance search failed: {str(e)}") from e
--------------------------------------------------------------------------------
/agents/pdf_link_agent.py:
--------------------------------------------------------------------------------
import re
from io import BytesIO
from typing import Any, Dict, Optional

import fitz
import requests
from sentence_transformers import util

from .base import Agent
from utils.exceptions import (PDFProcessingError, NetworkError,
                              ResourceLimitExceeded, SecurityException)
10 |
class PDFLinkAgent(Agent):
    """Agent that downloads a PDF from a URL found in the prompt and
    returns the sections most relevant to the query."""

    MAX_PDF_SIZE = 10 * 1024 * 1024  # 10MB download cap
    TIMEOUT = 15  # seconds per HTTP request

    @staticmethod
    def required_params():
        return ["embedding_model"]

    def __init__(self, config: Dict[str, Any]):
        super().__init__(config)
        self.embedding_model = config["embedding_model"]

    async def execute(self, input_data: str) -> str:
        """Pipeline: extract URL -> download -> parse -> rank sections.

        Raises:
            PDFProcessingError: On any failure in the pipeline.
        """
        try:
            url = self._extract_url(input_data)
            content = await self._download_pdf(url)
            text = self._parse_pdf(content)
            return self._find_relevant_sections(text, input_data)
        except Exception as e:
            raise PDFProcessingError(str(e)) from e

    def _extract_url(self, text: str) -> str:
        """Return the first ``http(s)://...pdf`` URL found in *text*.

        Raises:
            PDFProcessingError: If no PDF URL is present.
        """
        match = re.search(r'(https?://\S+\.pdf)', text)
        if not match:
            raise PDFProcessingError("No valid PDF URL found")
        return match.group(1)

    async def _download_pdf(self, url: str) -> bytes:
        """Download the PDF, enforcing the size limit while streaming.

        Fixed: the original awaited ``requests`` as if it were aiohttp
        (``async with requests.Session()`` / ``await session.get`` /
        ``await response.content.read()``), which fails at runtime —
        requests is synchronous. The blocking call is kept here for
        simplicity; switch to aiohttp if true concurrency is required.

        Raises:
            ResourceLimitExceeded: If the download exceeds MAX_PDF_SIZE.
            NetworkError: On any HTTP/transport failure.
        """
        try:
            response = requests.get(
                url,
                stream=True,
                timeout=self.TIMEOUT,
                headers={"User-Agent": "Mozilla/5.0"},
            )
            try:
                response.raise_for_status()

                if int(response.headers.get('Content-Length', 0)) > self.MAX_PDF_SIZE:
                    raise ResourceLimitExceeded("PDF file size exceeds limit")

                chunks = []
                received = 0
                for chunk in response.iter_content(chunk_size=65536):
                    received += len(chunk)
                    # Content-Length may be absent or wrong; re-check as we read.
                    if received > self.MAX_PDF_SIZE:
                        raise ResourceLimitExceeded("PDF file size exceeds limit")
                    chunks.append(chunk)
                return b"".join(chunks)
            finally:
                response.close()  # release the streamed connection

        except requests.RequestException as e:
            raise NetworkError(f"Failed to download PDF: {str(e)}") from e

    def _parse_pdf(self, content: bytes) -> str:
        """Extract plain text from raw PDF bytes.

        Raises:
            PDFProcessingError: If the bytes are not a valid PDF.
        """
        try:
            doc = fitz.open("pdf", BytesIO(content))
            try:
                return "\n".join(page.get_text() for page in doc)
            finally:
                doc.close()  # release parser resources; original leaked them
        except fitz.FileDataError:
            raise PDFProcessingError("Invalid PDF file structure")
        except Exception as e:
            raise PDFProcessingError(f"PDF parsing error: {str(e)}") from e

    def _find_relevant_sections(self, text: str, query: str) -> str:
        """Rank paragraph chunks by cosine similarity to *query*; return top 5.

        Raises:
            PDFProcessingError: If embedding or scoring fails.
        """
        try:
            chunks = text.split("\n\n")
            query_embedding = self.embedding_model.encode(query)
            doc_embeddings = self.embedding_model.encode(chunks)

            scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
            top_indices = scores.argsort(descending=True)[:5]

            return "\n".join([chunks[i] for i in top_indices])
        except Exception as e:
            raise PDFProcessingError(f"Relevance search failed: {str(e)}") from e
--------------------------------------------------------------------------------
/agents/selector.py:
--------------------------------------------------------------------------------
1 | import re
2 | import mimetypes
3 | from typing import Optional
4 | from .base import Agent
5 | from .pdf_link_agent import PDFLinkAgent
6 | from .code_exec_agent import CodeExecutionAgent
7 | from .pdf_file_agent import PDFFileAgent
8 | from .default_agent import DefaultAgent
9 | from utils.exceptions import AgentSelectionError, SecurityException
10 |
class AgentSelector:
    """Routes a prompt to the agent best suited to handle it."""

    def __init__(self):
        # Heuristics for recognising source code in a prompt.
        self.code_patterns = [
            r'(def\s+\w+\s*\(.*\):)',
            r'(class\s+\w+)',
            r'(import\s+\w+)',
            r'(print\(.*\))',
            r'(\#\!.*python)'
        ]
        # Direct link to a .pdf document.
        self.url_pattern = r'(https?:\/\/(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.pdf)'

    def select_agent(self, prompt: str) -> "Agent":
        """Return an agent instance appropriate for *prompt*.

        Raises:
            SecurityException: If the prompt matches a forbidden pattern
                (re-raised unwrapped so the orchestrator's dedicated
                SecurityException handler can log a SECURITY_BLOCK).
            AgentSelectionError: If selection fails for any other reason.
        """
        try:
            # Safety screening before any routing decision.
            self._check_prompt_safety(prompt)

            # NOTE(review): the agents below are constructed without a
            # config, but PDFLinkAgent / CodeExecutionAgent / PDFFileAgent
            # declare required config keys, so instantiation will raise
            # until configs are plumbed through — confirm intended wiring.
            if self._is_pdf_url(prompt):
                return PDFLinkAgent()

            if self._is_code(prompt):
                return CodeExecutionAgent()

            if self._has_uploaded_file(prompt):
                return self._handle_file_upload(prompt)

            return DefaultAgent()

        except SecurityException:
            # Fixed: previously wrapped into AgentSelectionError, which
            # hid security blocks from main.py's SecurityException handler.
            raise
        except Exception as e:
            raise AgentSelectionError(f"Agent selection failed: {str(e)}") from e

    def _check_prompt_safety(self, prompt: str):
        """Reject prompts containing known-dangerous access patterns."""
        forbidden_patterns = [
            r'(\/etc\/passwd)',
            r'(file:\/\/)',
            r'(localhost:\d+)'
        ]
        for pattern in forbidden_patterns:
            if re.search(pattern, prompt):
                raise SecurityException(f"Dangerous pattern detected: {pattern}")

    def _is_pdf_url(self, text: str) -> bool:
        """True when the prompt contains a direct .pdf link."""
        return bool(re.search(self.url_pattern, text))

    def _is_code(self, text: str) -> bool:
        """True when any code heuristic matches the prompt."""
        return any(re.search(pattern, text) for pattern in self.code_patterns)

    def _has_uploaded_file(self, text: str) -> bool:
        # NOTE(review): '' is a substring of every string, so this is
        # always True once reached — the upload-marker tag this checked
        # for appears to have been stripped; restore it.
        return '' in text

    def _handle_file_upload(self, prompt: str) -> "Agent":
        """Dispatch an uploaded file to the agent matching its MIME type.

        Raises:
            AgentSelectionError: For unsupported or unparseable uploads.
        """
        file_info = self._parse_upload(prompt)
        mime_type, _ = mimetypes.guess_type(file_info['name'])

        if mime_type == 'application/pdf':
            return PDFFileAgent()
        elif mime_type in ['text/plain', 'text/x-python']:
            return CodeExecutionAgent()

        raise AgentSelectionError(f"Unsupported file type: {mime_type}")

    def _parse_upload(self, prompt: str) -> dict:
        """Extract the uploaded file's name from the prompt.

        Fixed: the original pattern ``(?P.+?)`` is invalid regex ("unknown
        extension ?P.") and raised re.error on every upload; the named
        group is restored so ``group('name')`` works.
        NOTE(review): the marker tags around the name appear stripped —
        restore the real delimiters (cf. PDFFileAgent._validate_file).
        """
        match = re.search(r'(?P<name>.+?)', prompt)
        if not match:
            raise AgentSelectionError("Invalid file upload format")
        return {'name': match.group('name')}
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LastGuardian89/Mind-Force/6fa3ed29f17b451cdb2f76d925a9efc4ec2f72c9/config.py
--------------------------------------------------------------------------------
/llm/phi_wrapper.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoTokenizer, AutoModelForCausalLM
2 | import torch
3 | import os
4 |
class PhiLLM:
    """Thin wrapper around the microsoft/phi-2 causal LM with
    template-driven prompting (templates live in ``templates/``)."""

    def __init__(self, model_id="microsoft/phi-2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Fixed: the original unconditionally called .cuda(), crashing on
        # CPU-only hosts. Use fp16 on GPU, full precision on CPU (fp16 is
        # poorly supported there).
        if torch.cuda.is_available():
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id, torch_dtype=torch.float16).cuda()
        else:
            self.model = AutoModelForCausalLM.from_pretrained(model_id)

    def _load_template(self, template_name):
        """Read a prompt template from the templates/ directory."""
        path = os.path.join("templates", template_name)
        with open(path, encoding="utf-8") as f:  # explicit encoding: templates contain non-ASCII
            return f.read()

    def generate(self, prompt, context="", mode="auto"):
        """Fill the template for *mode* and generate up to 300 new tokens.

        Args:
            prompt: User question (or code, in "code" mode).
            context: Supporting document text, used in "pdf" mode.
            mode: "pdf", "code", or anything else for the default template.

        Returns:
            The decoded model output (includes the prompt text).
        """
        if mode == "pdf":
            template = self._load_template("pdf_instruction.txt")
            filled = template.format(context=context, question=prompt)
        elif mode == "code":
            template = self._load_template("code_instruction.txt")
            filled = template.format(code=prompt, question="What does this code do?")
        else:
            template = self._load_template("default_instruction.txt")
            filled = template.format(question=prompt)

        inputs = self.tokenizer(filled, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=300)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    async def generate_async(self, prompt, context="", mode="auto"):
        """Async facade over ``generate``.

        Added because AIOrchestrator.process_request awaits
        ``llm.generate_async(...)``, which did not exist. Runs the
        blocking generation in a worker thread to keep the event loop free.
        """
        import asyncio  # local import: keep module import-light
        return await asyncio.to_thread(self.generate, prompt, context, mode)
29 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import asyncio
from typing import Optional

from agents.selector import AgentSelector
from llm.phi_wrapper import PhiLLM
from sanitizer.prompt_sanitizer import SanitizationPipeline
from society_mind.autogen_society import SocietyMind
from utils.cache import check_cache, save_cache
# utils.io defines send_response_to_user; alias it to the name used below.
from utils.io import get_input_data, log_request, send_response_to_user as send_response
from utils.exceptions import (SecurityException, ProcessingError,
                              NetworkError, ResourceLimitExceeded)
from utils.logger import setup_logging, RequestLogger
12 |
class AIOrchestrator:
    """Top-level request pipeline: sanitize -> cache -> agent -> LLM -> refine."""

    def __init__(self):
        setup_logging()
        self.logger = RequestLogger()
        self.selector = AgentSelector()
        self.llm = PhiLLM()
        self.society = SocietyMind(self.llm)
        self.cache_enabled = True

    async def process_request(self, user_input: str) -> str:
        """Run the full pipeline for one request.

        Returns the final answer, or a generic error message when a stage
        fails. Every request is logged via log_request in `finally`
        (response is None on every failure path, as before).
        """
        final_response: Optional[str] = None
        try:
            # Step 1: sanitize input.
            clean_input = await SanitizationPipeline.process(user_input)

            # Step 2: cache lookup.
            if self.cache_enabled:
                cached = check_cache(clean_input)
                if cached:
                    self.logger.log("CACHE_HIT", {"input": clean_input})
                    return cached

            # Step 3: select and execute the agent.
            agent = self.selector.select_agent(clean_input)
            context = await agent.execute(clean_input)

            # Step 4: generate the initial answer. PhiLLM.generate is
            # synchronous, so run it in a worker thread rather than calling
            # the nonexistent generate_async.
            raw_response = await asyncio.to_thread(
                self.llm.generate, clean_input, context
            )

            # Step 5: SocietyMind refinement. The method is refine_response
            # and its first parameter is `query` (the original called
            # self.society.refine(prompt=...), which doesn't exist).
            final_response = await self.society.refine_response(
                query=clean_input,
                context=context,
                initial_response=raw_response,
            )

            # Step 6: persist and return.
            save_cache(clean_input, final_response)
            return final_response

        except SecurityException as e:
            self.logger.log("SECURITY_BLOCK", {
                "input": user_input,
                "reason": str(e)
            })
            return "Request blocked for security reasons"

        except ProcessingError as e:
            self.logger.log("PROCESSING_ERROR", {
                "input": user_input,
                "error": str(e)
            })
            return "Error processing your request"

        except Exception as e:
            self.logger.log("INTERNAL_ERROR", {
                "input": user_input,
                "error": str(e)
            })
            return "Internal server error"

        finally:
            # Replaces the fragile `'final_response' in locals()` check.
            log_request(user_input, final_response)
75 |
async def main_flow():
    """Interactive loop: read a prompt, process it, print the response."""
    orchestrator = AIOrchestrator()
    while True:
        try:
            user_input = get_input_data()
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C or closed stdin ends the session cleanly; the original
            # crashed with an unhandled EOFError when input() hit EOF.
            break
        try:
            response = await orchestrator.process_request(user_input)
            send_response(response)
        except KeyboardInterrupt:
            break

if __name__ == "__main__":
    asyncio.run(main_flow())
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.30
2 | torch>=2.0
3 | python-dotenv>=0.19
4 | sentence-transformers>=2.2
5 | pymupdf>=1.22
6 | docker>=6.0
aiohttp>=3.8
requests>=2.28
--------------------------------------------------------------------------------
/sanitizer/prompt_sanitizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | import torch
3 | from transformers import BertTokenizer, BertForSequenceClassification
4 | from typing import Optional
5 | from utils.exceptions import InjectionAttemptError, SecurityException
6 |
class PromptSanitizer:
    """Two-stage prompt screen: regex pattern rules plus a BERT classifier."""

    # (pattern, description) pairs. Class-level so rule checks work without
    # loading the ML model.
    patterns = [
        (r'(?i)(delete|drop|truncate)', "SQL injection"),
        # NOTE(review): the original entry here was an EMPTY pattern r''
        # (it matched every prompt, flagging all input as HTML injection;
        # the tag was probably lost in a copy/paste). Restored as a
        # script-tag check -- confirm the intended pattern.
        (r'(?i)<\s*script', "HTML injection"),
        (r'\{%|%\}', "Template injection"),
        (r'__import__|eval\(|exec\(', "Code injection"),
        (r'(ftp|ssh|sftp)://', "Dangerous protocol"),
        (r'/etc/passwd', "Sensitive file access"),
    ]

    def __init__(self, model_path: str = "bert-prompt-sanitizer"):
        try:
            self.tokenizer = BertTokenizer.from_pretrained(model_path)
            self.model = BertForSequenceClassification.from_pretrained(model_path).eval()
        except Exception as e:
            raise RuntimeError(f"Failed to load security model: {str(e)}") from e

    def sanitize(self, prompt: str) -> str:
        """Return `prompt` unchanged if it passes both checks; raise otherwise."""
        self._check_patterns(prompt)
        self._check_ml(prompt)
        return prompt

    def _check_patterns(self, text: str):
        """Raise InjectionAttemptError on the first matching rule."""
        for pattern, description in self.patterns:
            if re.search(pattern, text):
                raise InjectionAttemptError(f"Pattern detected: {description} - {pattern}")

    def _check_ml(self, text: str):
        """Raise SecurityException when the classifier flags the text (p > 0.85)."""
        try:
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                max_length=512,
                truncation=True
            )

            with torch.no_grad():
                outputs = self.model(**inputs)

            probs = torch.softmax(outputs.logits, dim=1)
            if probs[0][1].item() > 0.85:
                raise SecurityException("ML model detected malicious intent")

        except SecurityException:
            # Don't re-wrap our own verdict as "Security check failed".
            raise
        except Exception as e:
            raise SecurityException(f"Security check failed: {str(e)}") from e
52 |
class SanitizationPipeline:
    """Facade used by main: runs PromptSanitizer on each prompt."""

    # Lazily-built shared instance: the original constructed a new
    # PromptSanitizer (loading the BERT model from disk) on EVERY request.
    _sanitizer = None

    @classmethod
    def _get_sanitizer(cls):
        if cls._sanitizer is None:
            cls._sanitizer = PromptSanitizer()
        return cls._sanitizer

    @staticmethod
    async def process(prompt: str) -> str:
        try:
            return SanitizationPipeline._get_sanitizer().sanitize(prompt)
        except SecurityException:
            # Already the right family; preserve subclass and message
            # (the original re-wrapped, flattening InjectionAttemptError).
            raise
        except Exception as e:
            raise SecurityException(str(e)) from e
--------------------------------------------------------------------------------
/society_mind/autogen_society.py:
--------------------------------------------------------------------------------
1 | import os
2 | import asyncio
3 | import torch
4 | import re
5 | from typing import Optional, Tuple
6 | from sentence_transformers import SentenceTransformer, util
7 | from llm.phi_wrapper import PhiLLM
8 | from utils.exceptions import QualityThresholdReached
9 |
class SocietyMind:
    """Iterative critique-and-improve ("society of mind") loop over PhiLLM.

    Each round: check stopping conditions, critique the current answer,
    regenerate from the critique; a finalizer/editor pass runs at the end.
    """

    def __init__(
        self,
        model: "PhiLLM",  # string annotation: evaluated lazily, no import needed here
        max_rounds: int = 3,
        similarity_threshold: float = 0.85,
        quality_threshold: float = 0.7
    ):
        self.model = model
        self.max_rounds = max_rounds
        self.similarity_threshold = similarity_threshold
        self.quality_threshold = quality_threshold
        self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

        self.templates = {
            'generator': self._load_template("generator_instruction.txt"),
            'critic': self._load_template("critic_instruction.txt"),
            'finalizer': self._load_template("finalizer_instruction.txt")
        }

    async def refine(self, prompt: str, context: str, initial_response: str) -> str:
        """Backward-compatible alias (main.AIOrchestrator calls `refine(prompt=...)`)."""
        return await self.refine_response(prompt, context, initial_response)

    async def refine_response(
        self,
        query: str,
        context: str,
        initial_response: str
    ) -> str:
        """Run up to max_rounds critique/improve cycles, then finalize."""
        current_response = initial_response
        previous_response = ""

        for _ in range(self.max_rounds):
            # Check stopping conditions BEFORE spending a generation on a
            # critique (the original generated the critique first and threw
            # it away whenever it then decided to stop).
            stop_reason = self._check_stopping_conditions(
                current_response,
                previous_response,
                context
            )
            if stop_reason:
                print(f"Stopping iteration: {stop_reason}")
                break

            critique = await self._generate_critique(query, current_response, context)
            previous_response = current_response
            current_response = await self._generate_improved(query, context, critique)

        return await self._finalize_response(current_response, context)

    def _check_stopping_conditions(
        self,
        current: str,
        previous: str,
        context: str
    ) -> Optional[str]:
        """Return a human-readable stop reason, or None to keep iterating."""
        # 1. Successive iterations converged.
        iteration_similarity = self._calculate_similarity(current, previous)
        if iteration_similarity > self.similarity_threshold:
            return f"Iteration similarity {iteration_similarity:.2f}"

        # 2. Heuristic quality score is good enough.
        quality_score = self._calculate_quality_score(current, context)
        if quality_score >= self.quality_threshold:
            return f"Quality threshold {quality_score:.2f}"

        return None

    def _calculate_quality_score(self, response: str, context: str) -> float:
        """Heuristic score: 60% context similarity, 30% key-term coverage, 10% length."""
        context_sim = self._calculate_similarity(response, context)

        # The original compared (word, count) TUPLES against the response
        # string ("term in response" with a tuple raises TypeError); key
        # terms are now plain words. Also guard the empty-context division.
        key_terms = self._extract_key_terms(context)
        if key_terms:
            coverage = sum(1 for term in key_terms if term in response) / len(key_terms)
        else:
            coverage = 0.0

        length_factor = min(max(len(response) / 500, 0.5), 1.0)

        return 0.6 * context_sim + 0.3 * coverage + 0.1 * length_factor

    def _extract_key_terms(self, text: str, top_n: int = 10) -> list:
        """Return the top_n most frequent words of `text` (words only)."""
        words = re.findall(r'\w+', text.lower())
        freq = {}
        for word in words:
            freq[word] = freq.get(word, 0) + 1
        ranked = sorted(freq.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _ in ranked[:top_n]]

    async def _generate_critique(
        self,
        query: str,
        response: str,
        context: str
    ) -> str:
        """Ask the critic template for feedback on `response`."""
        prompt = self.templates['critic'].format(
            query=query,
            response=response,
            context=context
        )
        return await self._safe_generate(prompt)

    async def _generate_improved(
        self,
        query: str,
        context: str,
        critique: str
    ) -> str:
        """Regenerate the answer incorporating the critique feedback."""
        prompt = self.templates['generator'].format(
            query=query,
            context=context,
            feedback=critique
        )
        return await self._safe_generate(prompt)

    def _calculate_similarity(self, text1: str, text2: str) -> float:
        """Cosine similarity of sentence embeddings; 0.0 when either text is empty."""
        if not text1 or not text2:
            return 0.0
        embeddings = self.similarity_model.encode([text1, text2])
        return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

    async def _safe_generate(self, prompt: str) -> str:
        """Tokenize, generate in a worker thread, decode; wrap failures."""
        try:
            inputs = self.model.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True
            ).to(self.model.model.device)  # PhiLLM itself has no .device attr; the HF model does

            outputs = await asyncio.to_thread(
                self.model.model.generate,
                **inputs,
                max_new_tokens=500,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1
            )

            return self.model.tokenizer.decode(
                outputs[0],
                skip_special_tokens=True
            ).strip()
        except Exception as e:
            raise RuntimeError(f"Generation failed: {str(e)}") from e

    def _load_template(self, filename: str) -> str:
        """Read an instruction template from the local templates/ dir."""
        template_path = os.path.join("templates", filename)
        with open(template_path, "r", encoding="utf-8") as f:
            return f.read()

    async def _finalize_response(self, response: str, context: str) -> str:
        """Run the editor/finalizer pass over the last draft."""
        prompt = self.templates['finalizer'].format(
            response=response,
            context=context
        )
        return await self._safe_generate(prompt)
--------------------------------------------------------------------------------
/templates/code_instruction.txt:
--------------------------------------------------------------------------------
1 | Instruction: You are an AI engineer. The user will send a code snippet and possibly a related question. You must simulate the code in a secure Python environment, return the result, and optionally explain it.
2 | Input (Code):
3 | {code}
4 |
5 | Question: {question}
6 | Output:
--------------------------------------------------------------------------------
/templates/critic_instruction.txt:
--------------------------------------------------------------------------------
1 | [ROLE]
2 | You are a quality assurance expert. Analyze this response considering the context from user documents/code.
3 |
4 | [USER QUESTION]
5 | {query}
6 |
7 | [RESPONSE TO CRITIQUE]
8 | {response}
9 |
10 | [CONTEXT FROM USER DATA]
11 | {context}
12 |
13 | [INSTRUCTIONS]
14 | 1. Identify factual inconsistencies with context
15 | 2. Check technical accuracy
16 | 3. Verify source citations
17 | 4. Assess clarity for non-experts
18 | 5. Rate 1-5 with justification
--------------------------------------------------------------------------------
/templates/default_instruction.txt:
--------------------------------------------------------------------------------
1 | Instruction: You are a general-purpose assistant. Provide clear, accurate, and helpful answers to the user's questions.
2 | Question: {question}
3 | Output:
--------------------------------------------------------------------------------
/templates/finalizer_instruction.txt:
--------------------------------------------------------------------------------
1 | [ROLE]
2 | You are an editor. Refine this response to meet quality standards.
3 |
4 | [RESPONSE DRAFT]
5 | {response}
6 |
7 | [CONTEXT]
8 | {context}
9 |
10 | [INSTRUCTIONS]
11 | - Fix grammar and style issues
12 | - Ensure proper formatting
13 | - Add disclaimer if needed
14 | - Keep under 500 words
--------------------------------------------------------------------------------
/templates/generator_instruction.txt:
--------------------------------------------------------------------------------
1 | [ROLE]
2 | You are an AI assistant. Improve this response using feedback and context.
3 |
4 | [ORIGINAL QUESTION]
5 | {query}
6 |
7 | [CRITIQUE FEEDBACK]
8 | {feedback}
9 |
10 | [USER PROVIDED CONTEXT]
11 | {context}
12 |
13 | [INSTRUCTIONS]
14 | 1. Address all feedback points
15 | 2. Cite relevant context sections
16 | 3. Maintain technical accuracy
17 | 4. Use markdown formatting
--------------------------------------------------------------------------------
/templates/pdf_instruction.txt:
--------------------------------------------------------------------------------
1 | Instruction: You are a helpful assistant. The user will provide a question and a PDF document (either via link or upload). Your task is to answer the question using only the information relevant from the PDF document.
2 | Input: {context}
3 | Question: {question}
4 | Output:
5 |
--------------------------------------------------------------------------------
/utils/cache.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import json
3 | import os
4 | from datetime import datetime
5 | from pathlib import Path
6 | from typing import Optional, Dict, Any
7 |
class SmartCache:
    """File-backed response cache with TTL expiry (one JSON file per key)."""

    def __init__(self, cache_dir: str = "cache", ttl: int = 86400):
        self.cache_dir = Path(cache_dir)
        self.ttl = ttl  # entry lifetime in seconds (default: 24 hours)
        self._init_cache_dir()

    def _init_cache_dir(self):
        self.cache_dir.mkdir(exist_ok=True, parents=True)

    def _get_key_path(self, key: str) -> Path:
        return self.cache_dir / f"{key}.json"

    def generate_key(
        self,
        prompt: str,
        context: str,
        model_version: str,
        data_hash: str
    ) -> str:
        """Generate a unique cache key from everything that affects the answer."""
        key_data = f"{prompt}-{context}-{model_version}-{data_hash}"
        return hashlib.sha256(key_data.encode()).hexdigest()

    def check_cache(self, key: str) -> Optional[str]:
        """Return the cached response for `key`, or None if absent/expired/corrupt.

        Return annotation fixed: the method returns entry['response'] (a
        string), not the whole entry dict.
        """
        key_path = self._get_key_path(key)

        if not key_path.exists():
            return None

        try:
            with open(key_path, 'r') as f:
                entry = json.load(f)
        except (json.JSONDecodeError, OSError, KeyError):
            # A truncated/corrupt entry should behave like a miss, not crash.
            key_path.unlink(missing_ok=True)
            return None

        if self._is_expired(entry['timestamp']):
            key_path.unlink()
            return None

        return entry['response']

    def save_cache(
        self,
        key: str,
        response: str,
        metadata: Optional[Dict] = None
    ):
        """Store `response` (plus optional metadata) under `key` with a timestamp."""
        entry = {
            'timestamp': datetime.now().isoformat(),
            'response': response,
            'metadata': metadata or {}
        }

        with open(self._get_key_path(key), 'w') as f:
            json.dump(entry, f)

    def _is_expired(self, timestamp: str) -> bool:
        """True when the entry is older than the configured TTL."""
        entry_time = datetime.fromisoformat(timestamp)
        return (datetime.now() - entry_time).total_seconds() > self.ttl
67 |
class DataHasher:
    """SHA-256 helpers for raw bytes, files, and source-code strings."""

    @staticmethod
    def hash_content(content: bytes) -> str:
        """Hex digest of raw bytes."""
        digest = hashlib.sha256(content)
        return digest.hexdigest()

    @classmethod
    def hash_file(cls, file_path: Path) -> str:
        """Hex digest of a file's full contents."""
        data = Path(file_path).read_bytes()
        return cls.hash_content(data)

    @classmethod
    def hash_code(cls, code: str) -> str:
        """Hex digest of a string (encoded as UTF-8)."""
        return cls.hash_content(code.encode())
81 |
class CacheManager:
    """Cache front-end combining SmartCache keys with data-source hashing."""

    def __init__(self, model: "PhiLLM"):
        # "PhiLLM" is a string annotation on purpose: this module never
        # imports PhiLLM, so the original bare annotation raised NameError
        # the moment the module was imported (the `#?` TODO flagged this).
        # Only `model.version` is read from it.
        self.cache = SmartCache()
        self.model = model
        self.hasher = DataHasher()

    async def process_request(
        self,
        prompt: str,
        context: str,
        data_source: Optional[Path] = None,
        code: Optional[str] = None
    ) -> Optional[str]:
        """Return the cached response for this request, or None on a miss."""
        data_hash = self._get_data_hash(data_source, code)

        cache_key = self.cache.generate_key(
            prompt=prompt,
            context=context,
            model_version=self.model.version,
            data_hash=data_hash
        )

        if cached := self.cache.check_cache(cache_key):
            return cached

        return None

    def _get_data_hash(
        self,
        data_source: Optional[Path],
        code: Optional[str]
    ) -> str:
        """Hash whichever data source is present; sentinel when there is none."""
        if data_source:
            return self.hasher.hash_file(data_source)
        if code:
            return self.hasher.hash_code(code)
        return "no_data"
119 |
async def handle_user_request(prompt: str, context: str, file_path: Path):
    """End-to-end helper: return a cached answer or compute and cache a new one.

    NOTE(review): `PhiLLM` and `process_request` are not defined or imported
    anywhere in this module -- as written, calling this function raises
    NameError. Wiring to llm.phi_wrapper / main appears unfinished (see the
    module's own TODO on CacheManager).
    """
    model = PhiLLM()
    cache_manager = CacheManager(model)

    cached_response = await cache_manager.process_request(
        prompt=prompt,
        context=context,
        data_source=file_path
    )

    if cached_response:
        return cached_response

    # Cache miss: generate fresh, then store under a key rebuilt from the
    # same inputs (the file is hashed a second time here).
    response = await process_request(prompt, context, file_path)

    cache_manager.cache.save_cache(
        key=cache_manager.cache.generate_key(
            prompt=prompt,
            context=context,
            model_version=model.version,
            data_hash=cache_manager.hasher.hash_file(file_path)
        ),
        response=response,
        metadata={
            'source': str(file_path),
            'model_version': model.version
        }
    )

    return response
--------------------------------------------------------------------------------
/utils/docker_sandbox.py:
--------------------------------------------------------------------------------
1 | import docker
2 | from docker.errors import DockerException
3 | from .exceptions import DockerSecurityException, ResourceLimitExceeded, CodeExecutionError
4 |
class DockerSandbox:
    """Runs untrusted Python snippets inside a locked-down Docker container."""

    def __init__(self):
        self.client = docker.from_env()
        self._validate_docker()

    def _validate_docker(self):
        """Fail fast when the Docker daemon is unreachable."""
        try:
            self.client.ping()
        except DockerException:
            raise RuntimeError("Docker daemon not available")

    async def execute(self, code: str, timeout=10, mem_limit='100m') -> str:
        """Execute `code` in the sandbox image and return its stdout.

        Raises DockerSecurityException / ResourceLimitExceeded from the
        safety checks and CodeExecutionError for any execution failure.
        """
        self._check_code_safety(code)

        try:
            # SECURITY FIX: the original interpolated `code` into a
            # single-quoted shell string (python -c '{code}'), so any
            # snippet containing a quote escaped the quoting entirely.
            # Passing an argv list avoids shell interpretation altogether.
            container = self.client.containers.run(
                image="python-sandbox:secure",
                command=["timeout", "-s", "KILL", str(timeout), "python", "-c", code],
                mem_limit=mem_limit,
                network_mode="none",
                pids_limit=100,
                read_only=True,
                detach=True
            )

            try:
                result = container.wait(timeout=timeout + 2)
                if result['StatusCode'] != 0:
                    raise CodeExecutionError(f"Exit code {result['StatusCode']}")

                logs = container.logs().decode()
                self._check_output_safety(logs)
                return logs

            except docker.errors.ContainerError as e:
                raise CodeExecutionError(str(e))
            finally:
                container.remove(force=True)

        except docker.errors.ImageNotFound:
            raise CodeExecutionError("Sandbox image not found")
        except (CodeExecutionError, ResourceLimitExceeded):
            # Preserve the specific error instead of re-wrapping it below.
            raise
        except Exception as e:
            raise CodeExecutionError(str(e))

    def _check_code_safety(self, code: str):
        """Reject code containing obviously dangerous substrings (pre-execution)."""
        dangerous_patterns = [
            'os.system', 'subprocess', 'open(',
            'import socket', 'import shutil',
            '__import__', 'eval(', 'exec('
        ]

        if any(pattern in code for pattern in dangerous_patterns):
            raise DockerSecurityException(code)

    def _check_output_safety(self, output: str):
        """Cap captured output at 10k characters."""
        if len(output) > 10_000:
            raise ResourceLimitExceeded("Output size")
--------------------------------------------------------------------------------
/utils/exceptions.py:
--------------------------------------------------------------------------------
class SecurityException(Exception):
    """Base class for security-related rejections."""
    def __init__(self, message="Security violation detected"):  # typo fix: "detectted"
        super().__init__(message)

class InjectionAttemptError(SecurityException):
    """Raised when a prompt matches a known injection pattern."""
    def __init__(self, pattern):
        # typo fix: "attempt detect" -> "attempt detected"
        super().__init__(f"Potential injection attempt detected: {pattern}")

class DockerSecurityException(SecurityException):
    """Raised when submitted code trips the sandbox pre-execution checks."""
    def __init__(self, code_snippet):
        super().__init__(f"Dangerous code blocked: {code_snippet}")

class ProcessingError(Exception):
    """Base class for processing errors"""
    def __init__(self, message="Processing failed"):
        super().__init__(message)

class PDFProcessingError(ProcessingError):
    """PDF-related errors"""
    def __init__(self, reason):
        super().__init__(f"PDF processing failed: {reason}")

class CodeExecutionError(ProcessingError):
    """Code execution errors"""
    def __init__(self, reason):
        super().__init__(f"Code execution failed: {reason}")

class ResourceLimitExceeded(ProcessingError):
    """Resource limitation errors"""
    def __init__(self, resource_type):
        super().__init__(f"{resource_type} limit exceeded")

class NetworkError(ProcessingError):
    """Network-related errors"""
    def __init__(self, url):
        super().__init__(f"Network operation failed for: {url}")
--------------------------------------------------------------------------------
/utils/io.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import os
3 |
def get_input_data():
    """Prompt the user on stdin and return the raw line entered."""
    prompt_text = "Enter your question/code/link or upload: "
    return input(prompt_text)
6 |
def send_response_to_user(response):
    """Print the final response to stdout under the standard banner."""
    banner = "\n\n[Final Response]:\n"
    print(banner, response)
9 |
def log_request(prompt, response):
    """Append one prompt/response pair to logs/request_log.txt.

    Creates the logs/ directory on first use -- the original raised
    FileNotFoundError whenever the directory did not already exist
    (and main.py calls this in a `finally` on every request).
    """
    os.makedirs("logs", exist_ok=True)
    log_line = (
        f"{datetime.datetime.now().isoformat()} | PROMPT: {prompt}\n"
        f"RESPONSE: {response}\n{'=' * 80}\n"
    )
    with open("logs/request_log.txt", "a", encoding="utf-8") as log_file:
        log_file.write(log_line)
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
import json
import logging
from datetime import datetime, timezone
4 |
def setup_logging():
    """Configure root logging to both app.log and the console at INFO level."""
    handlers = [logging.FileHandler('app.log'), logging.StreamHandler()]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=handlers,
    )
14 |
class RequestLogger:
    """Emits structured (JSON-line) audit events to the 'security' logger."""

    def __init__(self):
        self.logger = logging.getLogger('security')

    def log(self, event_type: str, details: dict):
        """Log one event as a single JSON line with a UTC timestamp."""
        log_entry = {
            # datetime.utcnow() is deprecated (3.12+) and returns a naive
            # datetime; use an explicit aware UTC timestamp instead.
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'type': event_type,
            'details': details
        }
        self.logger.info(json.dumps(log_entry))
--------------------------------------------------------------------------------
/utils/pdf_utils.py:
--------------------------------------------------------------------------------
1 | import fitz # PyMuPDF
2 | import requests
3 | from sentence_transformers import SentenceTransformer, util
4 |
5 | model = SentenceTransformer('all-MiniLM-L6-v2')
6 |
def extract_text_from_url_pdf(url):
    """Download a PDF from `url` to temp.pdf and return its extracted text.

    Raises requests.HTTPError on a non-2xx response. The original had no
    timeout (could hang forever) and no status check (it happily wrote an
    HTML error page into temp.pdf).
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open("temp.pdf", 'wb') as f:
        f.write(response.content)
    return extract_text_from_uploaded_pdf("temp.pdf")
12 |
def extract_text_from_uploaded_pdf(path='temp.pdf'):
    """Return the plain text of every page of the PDF at `path`, joined by newlines.

    Uses a context manager so the document handle is closed even on error
    (the original leaked the open fitz document).
    """
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)
16 |
def find_relevant_passages(text, question, k=5):
    """Return the k paragraphs of `text` most semantically similar to `question`.

    Paragraphs are naive double-newline splits. Blank chunks are dropped
    (the original encoded them too), and an entirely empty text returns ""
    instead of crashing the encoder.
    """
    chunks = [chunk for chunk in text.split("\n\n") if chunk.strip()]
    if not chunks:
        return ""
    embeddings = model.encode(chunks, convert_to_tensor=True)
    question_emb = model.encode(question, convert_to_tensor=True)
    top_k = util.semantic_search(question_emb, embeddings, top_k=k)[0]
    return "\n".join(chunks[hit['corpus_id']] for hit in top_k)
--------------------------------------------------------------------------------