├── hub
│   └── .gitkeep
├── requirements.txt
├── utils
│   ├── modules
│   │   ├── exit.py
│   │   ├── delete_storage_models.py
│   │   ├── delete_models.py
│   │   ├── download_models.py
│   │   ├── delete_built_pipeline.py
│   │   ├── test_model.py
│   │   └── build_pipeline.py
│   ├── utils.py
│   └── const.py
├── data
│   └── example.json
├── LICENSE
├── main.py
├── ai
│   ├── pipeline.py
│   ├── tools.py
│   ├── download.py
│   ├── pipeline_tester.py
│   └── education.py
├── .gitignore
└── README.md

/hub/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence-transformers==3.4.1
2 | numpy==2.2.4
--------------------------------------------------------------------------------
/utils/modules/exit.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | def exit():
4 |     """Exit the program"""
5 |     sys.exit(0)
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | 
3 | def clear():
4 |     """Clear the terminal"""
5 |     os.system('cls' if sys.platform == 'win32' else 'clear')
--------------------------------------------------------------------------------
/utils/modules/delete_storage_models.py:
--------------------------------------------------------------------------------
 1 | from ai.tools import delete_downloaded_sentence_transformers_models
 2 | 
 3 | def DelStorage():
 4 |     print("\nConfirm deletion\n")
 5 |     print("[1] Delete all cached models")
 6 |     print("[0] Cancel")
 7 |     print()
 8 |     choice = input("Select an action: ")
 9 | 
10 |     if choice == "1":
11 |         delete_downloaded_sentence_transformers_models()
12 |         print("\nPress Enter to continue...")
13 |         input()
14 | 
15 |     return
--------------------------------------------------------------------------------
/data/example.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "questions": ["How to reset password?", "Forgot password"],
 4 |         "answers": [
 5 |             "Go to Settings → 'Recover access'",
 6 |             "You can reset your password here: [link]"
 7 |         ]
 8 |     },
 9 |     {
10 |         "questions": ["How to change username?", "Update account name", "Modify my username"],
11 |         "answers": [
12 |             "Username changes are available in Profile → 'Edit Account Details'",
13 |             "To update your username: 1) Go to Profile 2) Click 'Edit' 3) Enter new username",
14 |             "Note: You can only change your username once every 30 days"
15 |         ]
16 |     }
17 | ]
--------------------------------------------------------------------------------
/utils/modules/delete_models.py:
--------------------------------------------------------------------------------
 1 | from ai.download import delete_model
 2 | 
 3 | def Delete(get_download_models):
 4 |     print()
 5 |     print("[0] - Exit\n")
 6 |     print()
 7 |     print("Downloaded models:")
 8 |     models: list = get_download_models()
 9 |     for i, model in enumerate(models):
10 |         print(f"[{i+1}] {model['name']}")
11 |     print()
12 |     choice = input("Select models to delete (space-separated numbers): ")
13 | 
14 |     if choice in ["0", ""]:
15 |         return
16 | 
17 |     for index in choice.split():
18 |         try:
19 |             model_name = models[int(index) - 1]["name"]
20 |         except (ValueError, IndexError):
21 |             print(f"Invalid selection: {index}")
22 |             continue
23 |         delete_model(model_name)
24 |         print(f"Model {model_name} deleted")
25 |     print("\nPress Enter to continue...")
26 |     input()
27 | 
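A note on the training data: `data/example.json` above shows the expected layout — a JSON array of objects, each with parallel `questions` and `answers` lists. A minimal standalone loader sketch (the `load_faq` helper is hypothetical, not part of the repo; it mirrors the validation that `ai/education.py` performs before training):

    import json
    from pathlib import Path

    def load_faq(path: str) -> list:
        """Load and sanity-check a FAQ data file (sketch only)."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        if not isinstance(data, list):
            raise ValueError("Data should be an array of objects")
        for item in data:
            if not all(k in item for k in ("questions", "answers")):
                raise ValueError("Each item must contain 'questions' and 'answers'")
            if not item["answers"]:
                raise ValueError("Answer list cannot be empty")
        return data

    faq = load_faq("data/example.json")
    print(sum(len(item["questions"]) for item in faq), "questions loaded")
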
--------------------------------------------------------------------------------
/utils/modules/download_models.py:
--------------------------------------------------------------------------------
 1 | from ai.download import download_model
 2 | from utils.const import models
 3 | 
 4 | def Download():
 5 |     print("\n[0] - Exit\n")
 6 |     for i, model in models.items():
 7 |         name = model["name"].split("/")[-1]
 8 |         desc = model["desc"]
 9 |         print(f"[{i}] {name} ({desc})")
10 | 
11 |     choice = input("\nSelect a model: ")
12 |     if not choice.isdigit() or int(choice) not in models:
13 |         return
14 |     choice = int(choice)
15 | 
16 |     local_name = input("Enter a name to save the model (press Enter to skip): ")
17 | 
18 |     download_model(models[choice]["name"], custom_save_name=local_name)
19 | 
20 |     print(f"\nModel {models[choice]['name']} successfully downloaded")
21 |     print("\nPress Enter to continue...")
22 |     input()
23 |     return
--------------------------------------------------------------------------------
/utils/modules/delete_built_pipeline.py:
--------------------------------------------------------------------------------
 1 | from ai.tools import get_built_pipelines, delete_built_pipeline
 2 | 
 3 | 
 4 | def DeleteBuiltPipeline():
 5 |     pipelines = get_built_pipelines()
 6 |     print("\n[0] - Exit\n")
 7 |     for i, pipeline in enumerate(pipelines):
 8 |         print(f"[{i + 1}] {pipeline['name']}")
 9 |     print()
10 |     answer = input("Select pipelines to delete (space-separated numbers): ")
11 |     if answer in ["0", ""]:
12 |         return
13 | 
14 |     for pipe in answer.split():
15 |         try:
16 |             delete_built_pipeline(pipelines[int(pipe) - 1]["name"])
17 |             print(f"Pipeline {pipelines[int(pipe) - 1]['name']} deleted")
18 |         except (FileNotFoundError, ValueError, IndexError) as e:
19 |             print(e)
20 |     print("\nPress Enter to continue...")
21 |     input()
22 |     return
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 rizza
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from utils.modules import build_pipeline, exit, test_model, download_models, delete_models, delete_storage_models, delete_built_pipeline
 2 | from utils.utils import clear
 3 | from ai.tools import get_download_models
 4 | 
 5 | __version__ = "0.2.0"
 6 | 
 7 | def main():
 8 |     while True:
 9 |         clear()
10 | 
11 |         print()
12 |         print("[1] - Build pipeline")
13 |         print("[2] - Test pipeline")
14 |         print("[3] - Download model")
15 |         print("[4] - Remove downloaded model")
16 |         print("[5] - Clear model cache")
17 |         print("[6] - Delete the built pipeline")
18 |         print("[0] - Exit")
19 | 
20 |         print()
21 |         choice = input("Select an action: ")
22 | 
23 |         if choice in ["0", ""]:
24 |             exit.exit()
25 |         elif choice == "1":
26 |             build_pipeline.Build(get_download_models)
27 |         elif choice == "2":
28 |             test_model.Test()
29 |         elif choice == "3":
30 |             download_models.Download()
31 |         elif choice == "4":
32 |             delete_models.Delete(get_download_models)
33 |         elif choice == "5":
34 |             delete_storage_models.DelStorage()
35 |         elif choice == "6":
36 |             delete_built_pipeline.DeleteBuiltPipeline()
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     main()
--------------------------------------------------------------------------------
/utils/modules/test_model.py:
--------------------------------------------------------------------------------
 1 | from ai.pipeline_tester import PipelineTester
 2 | from ai.tools import get_built_pipelines
 3 | import json
 4 | 
 5 | def Test():
 6 |     print("\n\n[0] - Exit\n")
 7 |     print("Available pipelines:\n")
 8 |     pipelines = get_built_pipelines()
 9 |     for i, pipeline in enumerate(pipelines):
10 |         print(f"[{i + 1}] {pipeline['name']} ({pipeline['questions']} questions, created on {pipeline['created_at']})")
11 |     if not pipelines:
12 |         print("None")
13 |     model_name = input("\nSelect a pipeline (number or name): ")
14 | 
15 |     if model_name in ["0", ""]:
16 |         return
17 | 
18 |     try:
19 |         model_name = pipelines[int(model_name) - 1]["name"]
20 |     except (ValueError, IndexError):
21 |         pass  # input was not a valid number; treat it as a pipeline name
22 | 
23 |     tester = PipelineTester(model_name)
24 | 
25 |     print("\n[stats] - Show statistics")
26 |     print("[0] - Exit\n")
27 |     while True:
28 |         question = input("\nEnter your question: ")
29 |         if question in ["0", ""]:
30 |             break
31 |         elif question == "stats":
32 |             print(json.dumps(tester.get_stats(), indent=4, ensure_ascii=False))
33 |             continue
34 | 
35 |         result = tester.query(question)
36 |         print(f"Answer: {result['answer']} (similarity: {result['score']:.2f})")
37 | 
38 |     return
--------------------------------------------------------------------------------
/utils/modules/build_pipeline.py:
--------------------------------------------------------------------------------
 1 | from ai.education import Education
 2 | from utils.const import strategies, models
 3 | 
 4 | def Build(get_downloaded_models):
 5 |     print("\n[0] - Exit\n")
 6 |     data_file = input("Enter the name of the data file: ")
 7 |     pipeline_name = input("Enter a name for the pipeline: ")
 8 |     print()
 9 |     print("[1] - Cyclic strategy")
10 |     print("[2] - Random strategy")
11 |     print("[3] - Last strategy")
12 |     print("[4] - Most similar strategy")
13 |     print()
14 |     answer_strategy = input("Select a strategy: ")
15 | 
16 |     # Return if the user cancelled or left any field empty
17 |     if any(x in ["0", ""] for x in [data_file, pipeline_name, answer_strategy]):
18 |         return
19 | 
20 |     print()
21 |     print("Models from sentence-transformers:")
22 |     for i, model in models.items():
23 |         name = model["name"].split("/")[-1]
24 |         desc = model["desc"]
25 |         print(f"[{i}] {name} ({desc})")
26 |     print()
27 |     print("Models from hub:")
28 |     hub_models: list = get_downloaded_models()
29 |     for i, model in enumerate(hub_models, start=len(models) + 1):
30 |         print(f'[{i}] {model["name"]}')
31 |     if len(hub_models) == 0:
32 |         print("None")
33 |     print()
34 |     choice = int(input("Select a model: "))
35 | 
36 |     # Resolve the selection: built-in catalogue IDs first, then hub models
37 |     if choice in models:
38 |         model_name = models[choice]["name"]
39 |     else:
40 |         model_name = hub_models[choice - len(models) - 1]["name"]
41 | 
42 |     # Training
43 |     print("Training the pipeline...")
44 |     edu = Education(model_name=model_name)
45 |     result = edu.train_on_file(data_file, pipeline_name, answer_strategy=strategies.get(answer_strategy, "cycle"))
46 |     print("Pipeline saved at", result['model_dir'])
47 |     print("\nPress Enter to continue...")
48 |     input()
49 |     return
--------------------------------------------------------------------------------
/ai/pipeline.py:
--------------------------------------------------------------------------------
 1 | # pipeline.py
 2 | import json
 3 | import numpy as np
 4 | from pathlib import Path
 5 | from sentence_transformers import SentenceTransformer
 6 | from sklearn.metrics.pairwise import cosine_similarity
 7 | 
 8 | class Pipeline:
 9 |     def __init__(self):
10 |         """
11 |         Standalone pipeline class that works with files in its directory.
12 |         """
13 |         self.base_path = Path(__file__).parent
14 |         self._load_components()
15 | 
16 |     def _load_components(self):
17 |         """Loads all components from the current directory."""
18 |         # Checking for required files
19 |         required_files = [
20 |             'model_files',
21 |             'question_embeddings.npy',
22 |             'answers.json',
23 |             'meta.json'
24 |         ]
25 | 
26 |         for file in required_files:
27 |             if not (self.base_path / file).exists():
28 |                 raise FileNotFoundError(f"Required file missing: {file}")
29 | 
30 |         # Load the model
31 |         self.model = SentenceTransformer(str(self.base_path / 'model_files'))
32 | 
33 |         # Load embeddings
34 |         self.embeddings = np.load(self.base_path / 'question_embeddings.npy')
35 | 
36 |         # Load answers
37 |         with open(self.base_path / 'answers.json', 'r', encoding='utf-8') as f:
38 |             self.answers = json.load(f)
39 | 
40 |         # Load metadata
41 |         with open(self.base_path / 'meta.json', 'r', encoding='utf-8') as f:
42 |             self.meta = json.load(f)
43 | 
44 |     def query(self, question: str, threshold: float = 0.7) -> dict:
45 |         """
46 |         Main method to process a query.
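        :param question: Free-form user question to match against the trained questions.
        :param threshold: Minimum cosine similarity required to return an answer;
            below it, 'answer' is None and 'is_match' is False.
        :return: Dict with 'answer', 'score', 'is_match' and 'strategy'.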
47 |         """
48 |         # Encode the question
49 |         question_embedding = self.model.encode([question])
50 | 
51 |         # Find the closest match
52 |         sim_scores = cosine_similarity(question_embedding, self.embeddings)[0]
53 |         best_idx = np.argmax(sim_scores)
54 |         best_score = float(sim_scores[best_idx])
55 | 
56 |         return {
57 |             'answer': self.answers[best_idx] if best_score > threshold else None,
58 |             'score': best_score,
59 |             'is_match': best_score > threshold,
60 |             'strategy': self.meta['training_params']['answer_strategy']
61 |         }
62 | 
--------------------------------------------------------------------------------
/ai/tools.py:
--------------------------------------------------------------------------------
 1 | import os, json
 2 | import shutil
 3 | from pathlib import Path
 4 | 
 5 | def get_built_pipelines(target_dir: str = "build"):
 6 |     """
 7 |     Returns a list of built pipelines.
 8 | 
 9 |     :param target_dir: Directory with built pipelines.
10 |     :return: List of dictionaries with pipeline information.
11 |     """
12 |     pipelines = []
13 |     path = Path(target_dir)
14 |     if not path.exists():  # no pipelines built yet
15 |         return pipelines
16 | 
17 |     for item in path.iterdir():
18 |         if item.is_dir() and (item / "meta.json").exists():
19 |             with open(item / "meta.json", "r", encoding="utf-8") as f:
20 |                 meta = json.load(f)
21 |             data = {
22 |                 "name": item.name,
23 |                 "questions": meta["questions_count"],
24 |                 "created_at": meta["training_params"]["created_at"]
25 |             }
26 |             pipelines.append(data)
27 | 
28 |     return pipelines
29 | 
30 | def delete_built_pipeline(pipeline_name: str, target_dir: str = "build"):
31 |     """
32 |     Deletes a built pipeline.
33 | 
34 |     :param pipeline_name: Name of the pipeline.
35 |     :param target_dir: Directory with built pipelines.
36 |     """
37 |     pipeline_dir = Path(target_dir) / pipeline_name
38 |     if pipeline_dir.exists():
39 |         shutil.rmtree(pipeline_dir)
40 |     else:
41 |         raise FileNotFoundError(f"Pipeline {pipeline_name} not found")
42 | 
43 | def get_download_models(target_dir: str = "hub"):
44 |     """
45 |     Returns a list of downloaded models.
46 | 
47 |     :param target_dir: Directory with downloaded models.
48 |     :return: List of dictionaries with model information.
49 |     """
50 |     models = []
51 |     path = Path(target_dir)
52 |     if not path.exists():  # hub folder missing on a fresh checkout
53 |         return models
54 | 
55 |     for item in path.iterdir():
56 |         if item.is_dir():
57 |             data = {
58 |                 "name": item.name,
59 |                 "source": target_dir + "/" + item.name,
60 |             }
61 |             models.append(data)
62 | 
63 |     return models
64 | 
65 | def delete_downloaded_sentence_transformers_models():
66 |     """
67 |     Deletes all downloaded models from known cache directories.
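    Note: this removes the shared Hugging Face / sentence-transformers caches
    for the whole user account, so any affected model will be re-downloaded
    on its next use.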
68 |     """
69 |     deleted = False
70 |     cache_paths = [
71 |         Path.home() / ".cache" / "huggingface" / "hub",
72 |         Path.home() / ".cache" / "torch" / "sentence_transformers",
73 |         Path.home() / ".cache" / "huggingface" / "transformers"
74 |     ]
75 | 
76 |     for cache_path in cache_paths:
77 |         if cache_path.exists():
78 |             print(f"Found model directory: {cache_path}")
79 |             try:
80 |                 shutil.rmtree(cache_path)
81 |                 print(f"✅ Successfully deleted: {cache_path}")
82 |                 deleted = True
83 |             except Exception as e:
84 |                 print(f"⚠️ Error deleting {cache_path}: {str(e)}")
85 | 
86 |     if not deleted:
87 |         print("❌ No cache directories found")
--------------------------------------------------------------------------------
/utils/const.py:
--------------------------------------------------------------------------------
 1 | models = {
 2 |     1: {
 3 |         "name": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
 4 |         "desc": "High quality multilingual embeddings",
 5 |         "details": {
 6 |             "languages": "50+",
 7 |             "embedding_size": 768,
 8 |             "speed": "medium",
 9 |             "best_for": "Semantic search, clustering",
10 |             "pros": "Excellent quality for multilingual tasks",
11 |             "cons": "Larger memory footprint",
12 |             "release_year": 2020
13 |         }
14 |     },
15 |     2: {
16 |         "name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
17 |         "desc": "Optimal balance of speed and quality",
18 |         "details": {
19 |             "languages": "50+",
20 |             "embedding_size": 384,
21 |             "speed": "fast",
22 |             "best_for": "Real-time applications, production use",
23 |             "pros": "4x faster than mpnet with good accuracy",
24 |             "cons": "Lower dimensionality than mpnet",
25 |             "release_year": 2021
26 |         }
27 |     },
28 |     3: {
29 |         "name": "sentence-transformers/distiluse-base-multilingual-cased-v2",
30 |         "desc": "Lightweight multilingual model",
31 |         "details": {
32 |             "languages": "50+",
33 |             "embedding_size": 512,
34 |             "speed": "very fast",
35 |             "best_for": "Mobile/edge devices, low-resource environments",
36 |             "pros": "Small size, decent performance",
37 |             "cons": "Lower accuracy than full-size models",
38 |             "release_year": 2020
39 |         }
40 |     },
41 |     4: {
42 |         "name": "sentence-transformers/LaBSE",
43 |         "desc": "Google's universal language encoder",
44 |         "details": {
45 |             "languages": 109,
46 |             "embedding_size": 768,
47 |             "speed": "medium",
48 |             "best_for": "Cross-lingual tasks, language detection",
49 |             "pros": "Widest language coverage",
50 |             "cons": "Outdated architecture",
51 |             "release_year": 2019
52 |         }
53 |     },
54 |     5: {
55 |         "name": "intfloat/multilingual-e5-large",
56 |         "desc": "Microsoft's efficient multilingual encoder",
57 |         "details": {
58 |             "languages": "100+",
59 |             "embedding_size": 1024,
60 |             "speed": "medium-fast",
61 |             "best_for": "Large-scale production systems",
62 |             "pros": "Excellent speed/accuracy balance",
63 |             "cons": "Slightly less precise than BGE-M3",
64 |             "release_year": 2023,
65 |             "benchmarks": {
66 |                 "MTEB": 72.1,
67 |                 "RAG": 89.7
68 |             }
69 |         }
70 |     }
71 | }
72 | 
73 | strategies = {
74 |     "1": "cycle",
75 |     "2": "random",
76 |     "3": "last",
77 |     "4": "most_similar"
78 | }
--------------------------------------------------------------------------------
/ai/download.py:
--------------------------------------------------------------------------------
 1 | from huggingface_hub import snapshot_download
 2 | from pathlib import Path
 3 | from datetime import datetime
 4 | import json
 5 | import shutil
 6 | 
 7 | def delete_model(model_name: str, target_dir: str = "hub"):
 8 |     """
 9 |     Deletes the model (folder with files).
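    Raises FileNotFoundError if the model folder does not exist.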
10 | 
11 |     :param model_name: Name of the model
12 |     :param target_dir: Directory with models
13 |     """
14 | 
15 |     model_dir = Path(target_dir) / model_name
16 |     if model_dir.exists():
17 |         shutil.rmtree(model_dir)
18 |     else:
19 |         raise FileNotFoundError(f"Model {model_name} not found")
20 | 
21 | def download_model(model_name: str, custom_save_name="", target_dir: str = "hub",
22 |                    ignore_patterns: list = None):
23 |     """
24 |     Downloads a model from Hugging Face Hub with selective files
25 | 
26 |     :param model_name: Name of the model (with or without the prefix)
27 |     :param custom_save_name: Custom name for the saved model
28 |     :param target_dir: Directory to save the model
29 |     :param ignore_patterns: List of file patterns to ignore
30 |     :return: Path to the saved model
31 |     """
32 |     # if not model_name.startswith('sentence-transformers/'):
33 |     #     model_name = f'sentence-transformers/{model_name}'
34 | 
35 |     if custom_save_name and ('/' in custom_save_name or '\\' in custom_save_name):
36 |         raise ValueError("Custom save name cannot contain path separators (/, \\)")
37 | 
38 |     if custom_save_name:
39 |         model_dir = Path(target_dir) / custom_save_name
40 |     else:
41 |         model_dir = Path(target_dir) / model_name.split('/')[-1]
42 | 
43 |     model_dir.mkdir(parents=True, exist_ok=True)
44 | 
45 |     # Default ignore patterns
46 |     default_ignore = [
47 |         "*.h5",       # TensorFlow
48 |         "*.msgpack",  # Flax/JAX
49 |         "*.onnx",     # ONNX
50 |         "*.ot",       # Other
51 |         "*.tflite",   # TensorFlow Lite
52 |         "*.mlmodel",  # Core ML
53 |         "*.bin",      # PyTorch
54 |     ]
55 | 
56 |     final_ignore = ignore_patterns if ignore_patterns is not None else default_ignore
57 | 
58 |     try:
59 |         snapshot_download(
60 |             repo_id=model_name,
61 |             local_dir=model_dir,
62 |             local_dir_use_symlinks=False,
63 |             ignore_patterns=final_ignore,
64 |             allow_patterns=["*.json", "*.txt", "*.safetensors", "tokenizer.model"]  # Only the ones we need
65 |         )
66 | 
67 |         # Deleting stray framework artifacts (files or directories) that may remain
68 |         for leftover in ["tf_model.h5", "flax_model.msgpack", "onnx"]:
69 |             leftover_path = model_dir / leftover
70 |             if leftover_path.is_dir():
71 |                 shutil.rmtree(leftover_path)
72 |             elif leftover_path.exists():
73 |                 leftover_path.unlink()
74 | 
75 |         # Saving metadata
76 |         with open(model_dir / "meta.json", "w") as f:
77 |             json.dump({
78 |                 "source": model_name,
79 |                 "downloaded_at": datetime.now().isoformat(),
80 |                 "downloaded_files": [f.name for f in model_dir.glob("*") if f.is_file()]
81 |             }, f, indent=2)
82 | 
83 |         return str(model_dir)
84 | 
85 |     except Exception as e:
86 |         raise RuntimeError(f"Error downloading model: {str(e)}") from e
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
176 | test.py
177 | 
178 | # models
179 | hub/*
180 | !hub/.gitkeep
181 | 
182 | # pipelines
183 | build/*
184 | !build/.gitkeep
185 | 
186 | # data
187 | data/*
188 | !data/example.json
--------------------------------------------------------------------------------
/ai/pipeline_tester.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import json
 3 | from datetime import datetime
 4 | from sentence_transformers import SentenceTransformer
 5 | from sklearn.metrics.pairwise import cosine_similarity
 6 | from pathlib import Path
 7 | 
 8 | class PipelineTester:
 9 |     def __init__(self, model_name, models_path="build"):
10 |         """
11 |         :param model_name: Name of the trained model (e.g. 'faq_model')
12 |         :param models_path: Path to the folder with trained models (default is 'build')
13 |         """
14 |         self.models_path = Path(models_path)
15 |         self.model_path = self.models_path / model_name
16 |         self.model = None
17 |         self.embeddings = None
18 |         self.answers = None
19 |         self.meta = None
20 |         self.stats = {
21 |             'total_queries': 0,
22 |             'matches': 0,
23 |             'threshold': 0.7,
24 |             'queries': []
25 |         }
26 |         self._load_model()
27 | 
28 |     def _load_model(self):
29 |         """Load the model and data from the folder of the trained model"""
30 |         if not self.model_path.exists():
31 |             raise FileNotFoundError(f"Model directory not found: {self.model_path}")
32 | 
33 |         # Load the model from saved files
34 |         model_files_path = self.model_path / 'model_files'
35 |         if not model_files_path.exists():
36 |             raise FileNotFoundError(f"Model files not found in {model_files_path}")
37 | 
38 |         self.model = SentenceTransformer(str(model_files_path))
39 | 
40 |         # Load the other components
41 |         self.embeddings = np.load(self.model_path / 'question_embeddings.npy')
42 | 
43 |         with open(self.model_path / 'answers.json', 'r', encoding='utf-8') as f:
44 |             self.answers = json.load(f)
45 | 
46 |         with open(self.model_path / 'meta.json', 'r', encoding='utf-8') as f:
47 |             self.meta = json.load(f)
48 | 
49 |     def get_trained_models(self):
50 |         """Returns a list of trained models (similar to Education.get_trained_models)"""
51 |         models = []
52 |         for model_dir in self.models_path.iterdir():
53 |             if model_dir.is_dir():
54 |                 meta_path = model_dir / 'meta.json'
55 |                 if meta_path.exists():
56 |                     with open(meta_path, 'r', encoding='utf-8') as f:
57 |                         try:
58 |                             meta = json.load(f)
59 |                             models.append({
60 |                                 'name': model_dir.name,
61 |                                 'source': meta['source_data'],
62 |                                 'questions': meta['questions_count'],
63 |                                 'created_at': meta['training_params']['created_at'],
64 |                                 'path': str(model_dir),
65 |                                 'strategy': meta['training_params']['answer_strategy'],
66 |                                 'model_info': meta.get('model_info', {})
67 |                             })
68 |                         except (json.JSONDecodeError, KeyError) as e:
69 |                             print(f"Error reading metadata from {meta_path}: {str(e)}")
70 |                             continue
71 |         return models
72 | 
73 |     def query(self, question, threshold=None):
74 |         """
75 |         Query the model
76 |         :param question: The question text
77 |         :param threshold: Similarity threshold (None for the default value)
78 |         :return: {
79 |             'answer': str|None,
80 |             'score': float,
81 |             'is_match': bool
82 |         }
83 |         """
84 |         threshold = self.stats['threshold'] if threshold is None else threshold
85 |         self.stats['total_queries'] += 1
86 | 
87 |         # Encode the question
88 | 
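        # encode() on a list returns a (1, dim) matrix; cosine_similarity then
        # compares that single row against every stored question embedding,
        # yielding one similarity score per trained question.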
question_embedding = self.model.encode([question]) 89 | 90 | # Find the closest match 91 | sim_scores = cosine_similarity(question_embedding, self.embeddings)[0] 92 | best_idx = np.argmax(sim_scores) 93 | best_score = float(sim_scores[best_idx]) 94 | is_match = best_score > threshold 95 | 96 | # Record the statistics 97 | result = { 98 | 'question': question, 99 | 'answer': self.answers[best_idx] if is_match else None, 100 | 'score': best_score, 101 | 'is_match': is_match, 102 | 'timestamp': datetime.now().isoformat() 103 | } 104 | 105 | if is_match: 106 | self.stats['matches'] += 1 107 | 108 | self.stats['queries'].append(result) 109 | return result 110 | 111 | def get_stats(self, reset=False): 112 | """ 113 | Get statistics 114 | :param reset: Reset statistics after fetching 115 | :return: { 116 | 'total_queries': int, 117 | 'matches': int, 118 | 'match_rate': float, 119 | 'threshold': float, 120 | 'last_query': dict|None 121 | } 122 | """ 123 | stats = { 124 | 'total_queries': self.stats['total_queries'], 125 | 'matches': self.stats['matches'], 126 | 'match_rate': self.stats['matches'] / self.stats['total_queries'] if self.stats['total_queries'] > 0 else 0, 127 | 'threshold': self.stats['threshold'], 128 | 'last_query': self.stats['queries'][-1] if self.stats['queries'] else None 129 | } 130 | 131 | if reset: 132 | self.reset_stats() 133 | 134 | return stats 135 | 136 | def reset_stats(self): 137 | """Reset statistics""" 138 | self.stats = { 139 | 'total_queries': 0, 140 | 'matches': 0, 141 | 'threshold': self.stats['threshold'], 142 | 'queries': [] 143 | } 144 | 145 | def set_threshold(self, threshold): 146 | """Set the similarity threshold""" 147 | self.stats['threshold'] = float(threshold) 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ReplyCore 2 | 3 | ### _Fast QA pipeline creation using your data with [sentence-transformers](https://pypi.org/project/sentence-transformers/): model training and production-ready integration_ 4 | 5 | ## ❓Why is this needed? 6 | 7 | _I personally use it to automate responses to frequent repetitive questions in tech support, but there are many possible use cases._ 8 | 9 | ## ⚙️How does it work? 10 | 11 | Your questions and answers are converted into numerical vectors using a neural network model. 12 | `"How do I reset my password?"` → `[0.24, -0.12, 0.76, ...]` 13 | 14 | The model does not look for exact word matches but calculates **semantic similarity** based on the angle between vectors. 
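A back-of-the-envelope sketch of that idea, with made-up 3-dimensional vectors (real models use hundreds of dimensions):

    import numpy as np

    a = np.array([0.24, -0.12, 0.76])  # "How do I reset my password?"
    b = np.array([0.22, -0.10, 0.80])  # "Forgot password"

    # Cosine similarity = cosine of the angle between the two vectors
    score = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(round(float(score), 3))  # ~0.999 -> semantically very close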
15 | 
16 | The system understands **rephrased questions** thanks to:
17 | 
18 | - Considering word order
19 | - Recognizing synonyms (`"reset password" ≈ "recover access"`)
20 | - Multi-task model training
21 | 
22 | ## 🤖📊Available Models in the Interactive Program
23 | 
24 | > You can add more models to `utils/const.py`
25 | > by picking from [this list](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)
26 | 
27 | | Model ID | Name | Dimensions | Speed | Languages | Best For | Size | Benchmark (MTEB) |
28 | | -------- | --------------------------------------- | ---------- | ----- | --------- | ------------------------- | ----- | ---------------- |
29 | | 1 | `paraphrase-multilingual-mpnet-base-v2` | 768 | 🐢 | 50+ | Highest accuracy tasks | 1.2GB | 65.3 |
30 | | 2 | `paraphrase-multilingual-MiniLM-L12-v2` | 384 | 🚗 | 50+ | Balanced speed/quality | 470MB | 63.7 |
31 | | 3 | `distiluse-base-multilingual-cased-v2` | 512 | 🚄 | 50+ | Low-resource environments | 480MB | 61.2 |
32 | | 4 | `LaBSE` | 768 | 🐢 | 109 | Multilingual applications | 1.8GB | 58.2 |
33 | | 5 | `multilingual-e5-large` | 1024 | 🚗 | 100+ | Large-scale production | 2.1GB | 72.1 |
34 | 
35 | ## 💡✨Why is the Interactive Program Beneficial?
36 | 
37 | 1. _Easily train a pipeline without writing custom code_
38 | 2. _Assemble a ready-to-use pipeline with your model and a built-in module for operation_
39 | 3. _Download any models directly in the program for offline training_
40 | 4. _Test your pipelines immediately after training—no need to constantly move folders into your project. Validate on the spot and check statistics_
41 | 
42 | ## 🧠🔄Training Strategies
43 | 
44 | ### `last` (_Default_)
45 | 
46 | **How it works:**
47 | 
48 | - Takes the answer with the same index as the question (`answers[i]`).
49 | - If there are fewer answers than questions, it uses the last answer (`answers[-1]`).
50 | 
51 | **Example:**
52 | 
53 |     questions = ["Q1", "Q2", "Q3"]
54 |     answers = ["A1", "A2"]
55 | 
56 |     Result:
57 |     Q1 → A1, Q2 → A2, Q3 → A2 (last answer)
58 | 
59 | **When to use:**
60 | 
61 | - For "one question → one answer" pairs.
62 | - When answers are ordered correctly for the questions.
63 | 
64 | ##
65 | 
66 | ### `cycle` (_Cyclic_)
67 | 
68 | **How it works:**
69 | 
70 | - Reuses answers cyclically: `answers[i % len(answers)]`.
71 | 
72 | **Example:**
73 | 
74 |     questions = ["Q1", "Q2", "Q3", "Q4"]
75 |     answers = ["A1", "A2"]
76 | 
77 |     Result:
78 |     Q1 → A1, Q2 → A2, Q3 → A1, Q4 → A2
79 | 
80 | **When to use:**
81 | 
82 | - When there are more questions than answers.
83 | - When answers are general-purpose (e.g., common hints).
84 | 
85 | ##
86 | 
87 | ### `random` (_Random_)
88 | 
89 | **How it works:**
90 | 
91 | - Selects a random answer from the list using `random.choice(answers)`.
92 | 
93 | **Example:**
94 | 
95 |     questions = ["Q1", "Q2", "Q3"]
96 |     answers = ["A1", "A2", "A3"]
97 | 
98 |     Possible result:
99 |     Q1 → A3, Q2 → A1, Q3 → A3
100 | 
101 | **When to use:**
102 | 
103 | - To add variety to responses.
104 | 
105 | ##
106 | 
107 | ### `most_similar`
108 | 
109 | **How it works:**
110 | 
111 | 1. For each question, its **embedding** (vector representation) is calculated.
112 | 2. The **embeddings** of all answers are **pre-cached** (for speed).
113 | 3. The answer **most semantically similar** to the question is selected (via cosine similarity).
114 | 
115 | **Example**
116 | 
117 |     questions = ["How to reset password?", "Payment failed", "Contact support"]
118 |     answers = ["Click 'Forgot password'", "Check balance", "Email us at help@site.com"]
119 | 
120 |     # Embeddings:
121 |     q_embeddings = model.encode(questions)  # Vector for each question
122 |     a_embeddings = model.encode(answers)    # Vector for each answer
123 | 
124 |     # For the question "Payment failed":
125 |     question_idx = 1
126 |     question_embedding = q_embeddings[1]
127 | 
128 |     # Compare with answer embeddings:
129 |     similarities = cosine_similarity([question_embedding], a_embeddings)[0]
130 |     best_answer_idx = similarities.argmax()  # Index of the most similar answer
131 | 
132 |     Result:
133 |     "Payment failed" → "Check balance" (as their embeddings are the closest)
134 | 
135 | **When to use:**
136 | 
137 | - When **answers are not tied** to specific questions (e.g., a general knowledge base).
138 | - For complex questions, where **direct matching** (`last`, `cycle`) produces poor results.
139 | - In **RAG systems**, where finding semantic matches is important.
140 | 
141 | ## ⬇️🚀Installation and Launch
142 | 
143 | **Requirements: Python 3.9+**
144 | 
145 | **Install dependencies:**
146 | 
147 |     pip install -r requirements.txt
148 | 
149 | **Add your training data to the `data/` directory**
150 | 
151 | > An example is provided in the `data/example.json` file.
152 | 
153 | **Launch the interactive program:**
154 | 
155 |     python main.py
156 | 
157 | ## 🔗🧩Integration with the Project
158 | 
159 | _The assembled pipelines with models are saved in the `build/your_pipeline` directory. This folder contains the `pipeline.py` module for working with the pipeline._
160 | 
161 | **Working with the assembled pipeline**
162 | 
163 |     from your_pipeline.pipeline import Pipeline
164 | 
165 |     pipe = Pipeline()
166 |     result = pipe.query("Shall we have a cup of coffee?")
167 | 
168 |     print(result)
169 | 
170 | **Result:**
171 | 
172 |     {
173 |         "answer": "I suggest having a freshly squeezed juice",
174 |         "score": 0.8474252223968506,
175 |         "is_match": True,
176 |         "strategy": "cycle"
177 |     }
178 | 
179 | **Where:**
180 | 
181 | - `answer` - _The answer_
182 | - `score` - _Cosine-similarity score of the best match (higher means closer)_
183 | - `is_match` - _Whether the predefined similarity threshold was exceeded_
184 | - `strategy` - _Training strategy of the pipeline_
185 | 
186 | ## 🌟In conclusion
187 | 
188 | _This program **will not create a real artificial intelligence**. It will only train a pipeline on existing data. It is not self-learning, it doesn't think, and it can't come up with answers.
It simply helps to automate responses._
--------------------------------------------------------------------------------
/ai/education.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | import shutil
 4 | import os
 5 | from datetime import datetime
 6 | import time
 7 | import numpy as np
 8 | import torch
 9 | from sentence_transformers import SentenceTransformer, util
10 | from typing import Literal, Optional, Dict, List
11 | from pathlib import Path
12 | from tqdm import tqdm
13 | 
14 | class Education:
15 |     def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
16 |         """
17 |         Initialization of model training
18 |         :param model_name: model name (with or without the org prefix);
19 |             a matching folder in the local hub directory is tried first
20 |         """
21 |         self.model_name = model_name
22 |         self.hub_dir = 'hub'
23 |         self.data_dir = 'data'
24 |         self.pipeline_dir = 'build'
25 |         self._answer_embeddings_cache = {}
26 |         self._current_answers_hash = None
27 |         self._ensure_dirs_exist()
28 |         self.model = self._init_model()
29 | 
30 |     def _init_model(self):
31 |         """Initializes the model, first trying the local hub, then loading directly"""
32 |         # Trying to load from hub if the folder is specified
33 |         if self.hub_dir:
34 |             local_path = os.path.join(self.hub_dir, self.model_name)
35 |             if os.path.exists(local_path):
36 |                 try:
37 |                     model = SentenceTransformer(local_path)
38 |                     print("Model loaded from hub")
39 |                     return model
40 |                 except Exception as e:
41 |                     print(f"Failed to load the model from hub: {str(e)}")
42 | 
43 |         # If not found in the hub, load directly
44 |         full_model_name = self.model_name
45 |         model = SentenceTransformer(full_model_name)
46 |         print("Model loaded directly")
47 |         return model
48 | 
49 |     def _load_model_from_hub(self, model_name):
50 |         """Loads the model from the local hub folder"""
51 |         model_path = os.path.join(self.hub_dir, model_name)
52 | 
53 |         if not os.path.exists(model_path):
54 |             raise FileNotFoundError(
55 |                 f"Model {model_name} not found in folder {self.hub_dir}. 
" 56 | f"Available models: {os.listdir(self.hub_dir)}" 57 | ) 58 | 59 | return SentenceTransformer(model_path) 60 | 61 | def _ensure_dirs_exist(self): 62 | """Creates necessary directories if they do not exist""" 63 | Path(self.data_dir).mkdir(parents=True, exist_ok=True) 64 | Path(self.pipeline_dir).mkdir(parents=True, exist_ok=True) 65 | if self.hub_dir: 66 | Path(self.hub_dir).mkdir(parents=True, exist_ok=True) 67 | 68 | def _copy_pipeline_files(self, model_dir: str): 69 | """Copies the necessary files for the pipeline to work""" 70 | dest_path = Path(model_dir) 71 | 72 | # Copying the self-contained pipeline.py 73 | current_dir = Path(__file__).parent 74 | shutil.copy(current_dir / 'pipeline.py', dest_path) 75 | 76 | # Creating the __init__.py file 77 | with open(dest_path / '__init__.py', 'w') as f: 78 | f.write('# Auto-generated pipeline package\n') 79 | 80 | # Creating requirements.txt 81 | possible_req_paths = [ 82 | current_dir.parent / 'requirements.txt', # In the root of the project 83 | Path.cwd() / 'requirements.txt' # In the working directory 84 | ] 85 | 86 | for req_path in possible_req_paths: 87 | if req_path.exists(): 88 | shutil.copy(req_path, dest_path) 89 | break 90 | 91 | def _get_embeddings(self, answers: List[str], force_update: bool = False) -> torch.Tensor: 92 | """Smart caching of answer embeddings""" 93 | answers_tuple = tuple(answers) 94 | current_hash = hash(answers_tuple) 95 | 96 | if force_update or current_hash != self._current_answers_hash: 97 | self._clear_embeddings_cache() 98 | self._current_answers_hash = current_hash 99 | 100 | if current_hash not in self._answer_embeddings_cache: 101 | with torch.no_grad(): 102 | self._answer_embeddings_cache[current_hash] = { 103 | 'embeddings': self.model.encode(answers, convert_to_tensor=True), 104 | 'timestamp': time.time() 105 | } 106 | 107 | return self._answer_embeddings_cache[current_hash]['embeddings'] 108 | 109 | def _clear_embeddings_cache(self, max_items: int = 3, max_age_hours: int = 24): 110 | """Clearing old caches""" 111 | now = time.time() 112 | to_delete = [] 113 | 114 | if len(self._answer_embeddings_cache) > max_items: 115 | oldest = sorted(self._answer_embeddings_cache.items(), 116 | key=lambda x: x[1]['timestamp'])[0][0] 117 | to_delete.append(oldest) 118 | 119 | for h, data in self._answer_embeddings_cache.items(): 120 | if (now - data['timestamp']) > max_age_hours * 3600: 121 | to_delete.append(h) 122 | 123 | for h in set(to_delete): 124 | del self._answer_embeddings_cache[h] 125 | if h == self._current_answers_hash: 126 | self._current_answers_hash = None 127 | 128 | def train_on_file(self, data_file: str, model_name: str, 129 | answer_strategy: Literal['last', 'cycle', 'random', 'most_similar'] = 'last', 130 | show_progress: bool = True, chunk_size: int = 100): 131 | """ 132 | Train the model on the specified data file 133 | :param data_file: name of the data file (e.g. 
'faq.json') 134 | :param model_name: name for saving the model 135 | :param answer_strategy: answer selection strategy 136 | ('last' - last, 'cycle' - cyclic, 'random' - random, 'most_similar' - most similar) 137 | :param show_progress: whether to show progress bars 138 | :param chunk_size: batch size for question encoding 139 | :return: dictionary with training results 140 | """ 141 | # Data validation 142 | if not data_file.endswith('.json'): 143 | data_file += '.json' 144 | 145 | data_path = os.path.join(self.data_dir, data_file) 146 | if not os.path.exists(data_path): 147 | raise FileNotFoundError(f"Data file {data_path} not found") 148 | 149 | # Load data 150 | with open(data_path, 'r', encoding='utf-8') as f: 151 | try: 152 | faq = json.load(f) 153 | except json.JSONDecodeError as e: 154 | raise ValueError(f"JSON format error: {str(e)}") 155 | 156 | # Validate data structure 157 | if not isinstance(faq, list): 158 | raise ValueError("Data should be an array of objects") 159 | 160 | # Prepare data with progress bars 161 | all_questions = [] 162 | all_answers = [] 163 | 164 | # Main progress bar for FAQ items 165 | faq_iter = tqdm(faq, desc="Processing FAQ items", disable=not show_progress) 166 | for item in faq_iter: 167 | if not all(k in item for k in ['questions', 'answers']): 168 | raise ValueError("Each item must contain 'questions' and 'answers'") 169 | 170 | answers = item['answers'] 171 | questions = item['questions'] 172 | 173 | if not answers: 174 | raise ValueError("Answer list cannot be empty") 175 | 176 | # Nested progress bar for questions 177 | questions_iter = tqdm(questions, desc=" Processing questions", 178 | leave=False, disable=not show_progress) 179 | for i, question in enumerate(questions_iter): 180 | all_questions.append(question) 181 | 182 | # Select answer by strategy 183 | if answer_strategy == 'last': 184 | answer = answers[i] if i < len(answers) else answers[-1] 185 | elif answer_strategy == 'cycle': 186 | answer = answers[i % len(answers)] 187 | elif answer_strategy == 'random': 188 | answer = random.choice(answers) 189 | elif answer_strategy == 'most_similar': 190 | answer_embeddings = self._get_embeddings(answers) 191 | question_embedding = self.model.encode(question, convert_to_tensor=True) 192 | similarities = util.cos_sim(question_embedding, answer_embeddings)[0] 193 | answer = answers[similarities.argmax().item()] 194 | else: 195 | raise ValueError(f"Invalid strategy: {answer_strategy}") 196 | 197 | all_answers.append(answer) 198 | 199 | if not all_questions: 200 | raise ValueError("No questions found for training") 201 | 202 | # Encode questions with chunked progress bar 203 | question_embeddings = [] 204 | chunks = [all_questions[i:i + chunk_size] for i in range(0, len(all_questions), chunk_size)] 205 | 206 | encoding_iter = tqdm(chunks, desc="Encoding questions", disable=not show_progress) 207 | for chunk in encoding_iter: 208 | question_embeddings.extend(self.model.encode(chunk)) 209 | 210 | question_embeddings = np.array(question_embeddings) 211 | 212 | # Create model folder 213 | model_dir = os.path.join(self.pipeline_dir, model_name) 214 | Path(model_dir).mkdir(parents=True, exist_ok=True) 215 | 216 | # Save results 217 | print(f"Saving pipeline...") 218 | np.save(os.path.join(model_dir, 'question_embeddings.npy'), question_embeddings) 219 | with open(os.path.join(model_dir, 'answers.json'), 'w', encoding='utf-8') as f: 220 | json.dump(all_answers, f, ensure_ascii=False, indent=2) 221 | 222 | # Save the model 223 | model_files_path = 
os.path.join(model_dir, 'model_files')
224 |         self.model.save(model_files_path)
225 | 
226 |         # Get model name safely
227 |         try:
228 |             model_name_attr = getattr(self.model, 'model_name', None)
229 |             model_name_str = model_name_attr if model_name_attr else str(self.model[0].auto_model.config._name_or_path)
230 |             base_model_name = os.path.basename(model_name_str)
231 |         except Exception:
232 |             base_model_name = "unknown_model"
233 | 
234 |         # Model metadata
235 |         meta = {
236 |             'source_data': data_file,
237 |             'questions_count': len(all_questions),
238 |             'answers_count': len(all_answers),
239 |             'model_info': {
240 |                 'name': base_model_name,
241 |                 'source': 'local_hub',
242 |                 'embedding_dim': question_embeddings.shape[1],
243 |                 'max_seq_length': self.model.max_seq_length,
244 |                 'model_files_path': 'model_files'
245 |             },
246 |             'training_params': {
247 |                 'answer_strategy': answer_strategy,
248 |                 'created_at': datetime.now().isoformat(),
249 |                 'chunk_size': chunk_size
250 |             }
251 |         }
252 | 
253 |         with open(os.path.join(model_dir, 'meta.json'), 'w', encoding='utf-8') as f:
254 |             json.dump(meta, f, indent=2, ensure_ascii=False)
255 | 
256 |         self._copy_pipeline_files(model_dir)
257 | 
258 |         return {
259 |             'status': 'success',
260 |             'model_name': model_name,
261 |             'model_dir': model_dir,
262 |             'model_files_path': model_files_path,
263 |             'questions_processed': len(all_questions),
264 |             'answers_processed': len(all_answers),
265 |             'embedding_shape': question_embeddings.shape
266 |         }
267 | 
268 |     def update_answers(self, new_answers: List[str]):
269 |         """Force-refresh the cached answer embeddings"""
270 |         self._clear_embeddings_cache(max_items=0)
271 |         self._get_embeddings(new_answers, force_update=True)
272 | 
273 |     def get_trained_models(self):
274 |         """Returns a list of trained models"""
275 |         models = []
276 |         for model_dir in Path(self.pipeline_dir).iterdir():
277 |             if model_dir.is_dir():
278 |                 meta_path = model_dir / 'meta.json'
279 |                 if meta_path.exists():
280 |                     with open(meta_path, 'r', encoding='utf-8') as f:
281 |                         try:
282 |                             meta = json.load(f)
283 |                             models.append({
284 |                                 'name': model_dir.name,
285 |                                 'source': meta['source_data'],
286 |                                 'questions': meta['questions_count'],
287 |                                 'created_at': meta['training_params']['created_at'],
288 |                                 'path': str(model_dir),
289 |                                 'strategy': meta['training_params']['answer_strategy'],
290 |                                 'model_info': meta.get('model_info', {})
291 |                             })
292 |                         except (json.JSONDecodeError, KeyError) as e:
293 |                             print(f"Error reading metadata {meta_path}: {str(e)}")
294 |                             continue
295 |         return models
296 | 
--------------------------------------------------------------------------------
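Putting the pieces together, a minimal end-to-end run of the library code above might look like this (a sketch: it assumes `data/example.json` exists, and `faq_demo` is a made-up pipeline name):

    from ai.education import Education
    from ai.pipeline_tester import PipelineTester

    # Train: encode all questions from data/example.json and save the pipeline
    edu = Education(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    result = edu.train_on_file("example.json", "faq_demo", answer_strategy="cycle")
    print("Saved to", result["model_dir"])

    # Query: load the built pipeline from build/faq_demo and ask a question
    tester = PipelineTester("faq_demo")
    reply = tester.query("I forgot my password")
    print(reply["answer"], reply["score"])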