├── hub
│   └── .gitkeep
├── requirements.txt
├── utils
│   ├── modules
│   │   ├── exit.py
│   │   ├── delete_storage_models.py
│   │   ├── delete_models.py
│   │   ├── download_models.py
│   │   ├── delete_built_pipeline.py
│   │   ├── test_model.py
│   │   └── build_pipeline.py
│   ├── utils.py
│   └── const.py
├── data
│   └── example.json
├── LICENSE
├── main.py
├── ai
│   ├── pipeline.py
│   ├── tools.py
│   ├── download.py
│   ├── pipeline_tester.py
│   └── education.py
├── .gitignore
└── README.md

/hub/.gitkeep:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | sentence-transformers==3.4.1
2 | numpy==2.2.4
--------------------------------------------------------------------------------
/utils/modules/exit.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | def exit():
4 |     """Exit the program"""
5 |     sys.exit(0)
--------------------------------------------------------------------------------
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | 
3 | def clear():
4 |     """Clear the terminal"""
5 |     os.system('cls' if sys.platform == 'win32' else 'clear')
--------------------------------------------------------------------------------
/utils/modules/delete_storage_models.py:
--------------------------------------------------------------------------------
 1 | from ai.tools import delete_downloaded_sentence_transformers_models
 2 | 
 3 | def DelStorage():
 4 |     print("\nConfirm deletion\n")
 5 |     print("[1] Delete all cached models")
 6 |     print("[0] Cancel")
 7 |     print()
 8 |     choice = input("Select an action: ")
 9 | 
10 |     if choice == "1":
11 |         delete_downloaded_sentence_transformers_models()
12 |         print("\nPress Enter to continue...")
13 |         input()
14 | 
15 |     return
--------------------------------------------------------------------------------
/data/example.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "questions": ["How to reset password?", "Forgot password"],
 4 |         "answers": [
 5 |             "Go to Settings → 'Recover access'",
 6 |             "You can reset your password here: [link]"
 7 |         ]
 8 |     },
 9 |     {
10 |         "questions": ["How to change username?", "Update account name", "Modify my username"],
11 |         "answers": [
12 |             "Username changes are available in Profile → 'Edit Account Details'",
13 |             "To update your username: 1) Go to Profile 2) Click 'Edit' 3) Enter new username",
14 |             "Note: You can only change your username once every 30 days"
15 |         ]
16 |     }
17 | ]
--------------------------------------------------------------------------------
/utils/modules/delete_models.py:
--------------------------------------------------------------------------------
 1 | from ai.download import delete_model
 2 | 
 3 | def Delete(get_download_models):
 4 |     print()
 5 |     print("[0] - Exit\n")
 6 |     print()
 7 |     print("Downloaded models:")
 8 |     models: list = get_download_models()
 9 |     for i, model in enumerate(models):
10 |         print(f"[{i+1}] {model['name']}")
11 |     print()
12 |     choice = input("Select models to delete (space-separated numbers): ")
13 | 
14 |     if choice in ["0", ""]:
15 |         return
16 | 
17 |     for index in choice.split():
18 |         try:
19 |             model_name = models[int(index) - 1]["name"]
20 |         except (ValueError, IndexError):
21 |             print(f"Invalid selection: {index}")
22 |             continue
23 |         delete_model(model_name)
24 |         print(f"Model {model_name} deleted")
25 |     print("\nPress Enter to continue...")
26 |     input()
27 | 
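A note on the training data: `data/example.json` above shows the expected layout — a JSON array of objects, each with parallel `questions` and `answers` lists. A minimal standalone loader sketch (the `load_faq` helper is hypothetical, not part of the repo; it mirrors the validation that `ai/education.py` performs before training):

    import json
    from pathlib import Path

    def load_faq(path: str) -> list:
        """Load and sanity-check a FAQ data file (sketch only)."""
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        if not isinstance(data, list):
            raise ValueError("Data should be an array of objects")
        for item in data:
            if not all(k in item for k in ("questions", "answers")):
                raise ValueError("Each item must contain 'questions' and 'answers'")
            if not item["answers"]:
                raise ValueError("Answer list cannot be empty")
        return data

    faq = load_faq("data/example.json")
    print(sum(len(item["questions"]) for item in faq), "questions loaded")
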
--------------------------------------------------------------------------------
/utils/modules/download_models.py:
--------------------------------------------------------------------------------
 1 | from ai.download import download_model
 2 | from utils.const import models
 3 | 
 4 | def Download():
 5 |     print("\n[0] - Exit\n")
 6 |     for i, model in models.items():
 7 |         name = model["name"].split("/")[-1]
 8 |         desc = model["desc"]
 9 |         print(f"[{i}] {name} ({desc})")
10 | 
11 |     choice = input("\nSelect a model: ")
12 |     if not choice.isdigit() or int(choice) not in models:
13 |         return
14 |     choice = int(choice)
15 | 
16 |     local_name = input("Enter a name to save the model (press Enter to skip): ")
17 | 
18 |     download_model(models[choice]["name"], custom_save_name=local_name)
19 | 
20 |     print(f"\nModel {models[choice]['name']} successfully downloaded")
21 |     print("\nPress Enter to continue...")
22 |     input()
23 |     return
--------------------------------------------------------------------------------
/utils/modules/delete_built_pipeline.py:
--------------------------------------------------------------------------------
 1 | from ai.tools import get_built_pipelines, delete_built_pipeline
 2 | 
 3 | 
 4 | def DeleteBuiltPipeline():
 5 |     pipelines = get_built_pipelines()
 6 |     print("\n[0] - Exit\n")
 7 |     for i, pipeline in enumerate(pipelines):
 8 |         print(f"[{i + 1}] {pipeline['name']}")
 9 |     print()
10 |     answer = input("Select pipelines to delete (space-separated numbers): ")
11 |     if answer in ["0", ""]:
12 |         return
13 | 
14 |     for pipe in answer.split():
15 |         try:
16 |             delete_built_pipeline(pipelines[int(pipe) - 1]["name"])
17 |             print(f"Pipeline {pipelines[int(pipe) - 1]['name']} deleted")
18 |         except (FileNotFoundError, ValueError, IndexError) as e:
19 |             print(e)
20 |     print("\nPress Enter to continue...")
21 |     input()
22 |     return
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 rizza
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from utils.modules import build_pipeline, exit, test_model, download_models, delete_models, delete_storage_models, delete_built_pipeline
 2 | from utils.utils import clear
 3 | from ai.tools import get_download_models
 4 | 
 5 | __version__ = "0.2.0"
 6 | 
 7 | def main():
 8 |     while True:
 9 |         clear()
10 | 
11 |         print()
12 |         print("[1] - Build pipeline")
13 |         print("[2] - Test pipeline")
14 |         print("[3] - Download model")
15 |         print("[4] - Remove downloaded model")
16 |         print("[5] - Clear model cache")
17 |         print("[6] - Delete the built pipeline")
18 |         print("[0] - Exit")
19 | 
20 |         print()
21 |         choice = input("Select an action: ")
22 | 
23 |         if choice in ["0", ""]:
24 |             exit.exit()
25 |         elif choice == "1":
26 |             build_pipeline.Build(get_download_models)
27 |         elif choice == "2":
28 |             test_model.Test()
29 |         elif choice == "3":
30 |             download_models.Download()
31 |         elif choice == "4":
32 |             delete_models.Delete(get_download_models)
33 |         elif choice == "5":
34 |             delete_storage_models.DelStorage()
35 |         elif choice == "6":
36 |             delete_built_pipeline.DeleteBuiltPipeline()
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     main()
--------------------------------------------------------------------------------
/utils/modules/test_model.py:
--------------------------------------------------------------------------------
 1 | from ai.pipeline_tester import PipelineTester
 2 | from ai.tools import get_built_pipelines
 3 | import json
 4 | 
 5 | def Test():
 6 |     print("\n\n[0] - Exit\n")
 7 |     print("Available pipelines:\n")
 8 |     pipelines = get_built_pipelines()
 9 |     for i, pipeline in enumerate(pipelines):
10 |         print(f"[{i + 1}] {pipeline['name']} ({pipeline['questions']} questions, created on {pipeline['created_at']})")
11 |     if not pipelines:
12 |         print("None")
13 |     model_name = input("\nSelect a pipeline (number or name): ")
14 | 
15 |     if model_name in ["0", ""]:
16 |         return
17 | 
18 |     try:
19 |         model_name = pipelines[int(model_name) - 1]["name"]
20 |     except (ValueError, IndexError):
21 |         pass  # input was not a valid number; treat it as a pipeline name
22 | 
23 |     tester = PipelineTester(model_name)
24 | 
25 |     print("\n[stats] - Show statistics")
26 |     print("[0] - Exit\n")
27 |     while True:
28 |         question = input("\nEnter your question: ")
29 |         if question in ["0", ""]:
30 |             break
31 |         elif question == "stats":
32 |             print(json.dumps(tester.get_stats(), indent=4, ensure_ascii=False))
33 |             continue
34 | 
35 |         result = tester.query(question)
36 |         print(f"Answer: {result['answer']} (similarity: {result['score']:.2f})")
37 | 
38 |     return
--------------------------------------------------------------------------------
/utils/modules/build_pipeline.py:
--------------------------------------------------------------------------------
 1 | from ai.education import Education
 2 | from utils.const import strategies, models
 3 | 
 4 | def Build(get_downloaded_models):
 5 |     print("\n[0] - Exit\n")
 6 |     data_file = input("Enter the name of the data file: ")
 7 |     pipeline_name = input("Enter a name for the pipeline: ")
 8 |     print()
 9 |     print("[1] - Cyclic strategy")
10 |     print("[2] - Random strategy")
11 |     print("[3] - Last strategy")
12 |     print("[4] - Most similar strategy")
13 |     print()
14 |     answer_strategy = input("Select a strategy: ")
15 | 
16 |     # Return if the user cancelled or left any field empty
17 |     if any(x in ["0", ""] for x in [data_file, pipeline_name, answer_strategy]):
18 |         return
19 | 
20 |     print()
21 |     print("Models from sentence-transformers:")
22 |     for i, model in models.items():
23 |         name = model["name"].split("/")[-1]
24 |         desc = model["desc"]
25 |         print(f"[{i}] {name} ({desc})")
26 |     print()
27 |     print("Models from hub:")
28 |     hub_models: list = get_downloaded_models()
29 |     for i, model in enumerate(hub_models, start=len(models) + 1):
30 |         print(f'[{i}] {model["name"]}')
31 |     if len(hub_models) == 0:
32 |         print("None")
33 |     print()
34 |     choice = int(input("Select a model: "))
35 | 
36 |     # Resolve the selection: built-in catalogue IDs first, then hub models
37 |     if choice in models:
38 |         model_name = models[choice]["name"]
39 |     else:
40 |         model_name = hub_models[choice - len(models) - 1]["name"]
41 | 
42 |     # Training
43 |     print("Training the pipeline...")
44 |     edu = Education(model_name=model_name)
45 |     result = edu.train_on_file(data_file, pipeline_name, answer_strategy=strategies.get(answer_strategy, "cycle"))
46 |     print("Pipeline saved at", result['model_dir'])
47 |     print("\nPress Enter to continue...")
48 |     input()
49 |     return
--------------------------------------------------------------------------------
/ai/pipeline.py:
--------------------------------------------------------------------------------
 1 | # pipeline.py
 2 | import json
 3 | import numpy as np
 4 | from pathlib import Path
 5 | from sentence_transformers import SentenceTransformer
 6 | from sklearn.metrics.pairwise import cosine_similarity
 7 | 
 8 | class Pipeline:
 9 |     def __init__(self):
10 |         """
11 |         Standalone pipeline class that works with files in its directory.
12 |         """
13 |         self.base_path = Path(__file__).parent
14 |         self._load_components()
15 | 
16 |     def _load_components(self):
17 |         """Loads all components from the current directory."""
18 |         # Checking for required files
19 |         required_files = [
20 |             'model_files',
21 |             'question_embeddings.npy',
22 |             'answers.json',
23 |             'meta.json'
24 |         ]
25 | 
26 |         for file in required_files:
27 |             if not (self.base_path / file).exists():
28 |                 raise FileNotFoundError(f"Required file missing: {file}")
29 | 
30 |         # Load the model
31 |         self.model = SentenceTransformer(str(self.base_path / 'model_files'))
32 | 
33 |         # Load embeddings
34 |         self.embeddings = np.load(self.base_path / 'question_embeddings.npy')
35 | 
36 |         # Load answers
37 |         with open(self.base_path / 'answers.json', 'r', encoding='utf-8') as f:
38 |             self.answers = json.load(f)
39 | 
40 |         # Load metadata
41 |         with open(self.base_path / 'meta.json', 'r', encoding='utf-8') as f:
42 |             self.meta = json.load(f)
43 | 
44 |     def query(self, question: str, threshold: float = 0.7) -> dict:
45 |         """
46 |         Main method to process a query.
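        :param question: Free-form user question to match against the trained questions.
        :param threshold: Minimum cosine similarity required to return an answer;
            below it, 'answer' is None and 'is_match' is False.
        :return: Dict with 'answer', 'score', 'is_match' and 'strategy'.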
47 |         """
48 |         # Encode the question
49 |         question_embedding = self.model.encode([question])
50 | 
51 |         # Find the closest match
52 |         sim_scores = cosine_similarity(question_embedding, self.embeddings)[0]
53 |         best_idx = np.argmax(sim_scores)
54 |         best_score = float(sim_scores[best_idx])
55 | 
56 |         return {
57 |             'answer': self.answers[best_idx] if best_score > threshold else None,
58 |             'score': best_score,
59 |             'is_match': best_score > threshold,
60 |             'strategy': self.meta['training_params']['answer_strategy']
61 |         }
62 | 
--------------------------------------------------------------------------------
/ai/tools.py:
--------------------------------------------------------------------------------
 1 | import os, json
 2 | import shutil
 3 | from pathlib import Path
 4 | 
 5 | def get_built_pipelines(target_dir: str = "build"):
 6 |     """
 7 |     Returns a list of built pipelines.
 8 | 
 9 |     :param target_dir: Directory with built pipelines.
10 |     :return: List of dictionaries with pipeline information.
11 |     """
12 |     pipelines = []
13 |     path = Path(target_dir)
14 |     if not path.exists():  # no pipelines built yet
15 |         return pipelines
16 | 
17 |     for item in path.iterdir():
18 |         if item.is_dir() and (item / "meta.json").exists():
19 |             with open(item / "meta.json", "r", encoding="utf-8") as f:
20 |                 meta = json.load(f)
21 |             data = {
22 |                 "name": item.name,
23 |                 "questions": meta["questions_count"],
24 |                 "created_at": meta["training_params"]["created_at"]
25 |             }
26 |             pipelines.append(data)
27 | 
28 |     return pipelines
29 | 
30 | def delete_built_pipeline(pipeline_name: str, target_dir: str = "build"):
31 |     """
32 |     Deletes a built pipeline.
33 | 
34 |     :param pipeline_name: Name of the pipeline.
35 |     :param target_dir: Directory with built pipelines.
36 |     """
37 |     pipeline_dir = Path(target_dir) / pipeline_name
38 |     if pipeline_dir.exists():
39 |         shutil.rmtree(pipeline_dir)
40 |     else:
41 |         raise FileNotFoundError(f"Pipeline {pipeline_name} not found")
42 | 
43 | def get_download_models(target_dir: str = "hub"):
44 |     """
45 |     Returns a list of downloaded models.
46 | 
47 |     :param target_dir: Directory with downloaded models.
48 |     :return: List of dictionaries with model information.
49 |     """
50 |     models = []
51 |     path = Path(target_dir)
52 |     if not path.exists():  # hub folder missing on a fresh checkout
53 |         return models
54 | 
55 |     for item in path.iterdir():
56 |         if item.is_dir():
57 |             data = {
58 |                 "name": item.name,
59 |                 "source": target_dir + "/" + item.name,
60 |             }
61 |             models.append(data)
62 | 
63 |     return models
64 | 
65 | def delete_downloaded_sentence_transformers_models():
66 |     """
67 |     Deletes all downloaded models from known cache directories.
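    Note: this removes the shared Hugging Face / sentence-transformers caches
    for the whole user account, so any affected model will be re-downloaded
    on its next use.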
68 |     """
69 |     deleted = False
70 |     cache_paths = [
71 |         Path.home() / ".cache" / "huggingface" / "hub",
72 |         Path.home() / ".cache" / "torch" / "sentence_transformers",
73 |         Path.home() / ".cache" / "huggingface" / "transformers"
74 |     ]
75 | 
76 |     for cache_path in cache_paths:
77 |         if cache_path.exists():
78 |             print(f"Found model directory: {cache_path}")
79 |             try:
80 |                 shutil.rmtree(cache_path)
81 |                 print(f"✅ Successfully deleted: {cache_path}")
82 |                 deleted = True
83 |             except Exception as e:
84 |                 print(f"⚠️ Error deleting {cache_path}: {str(e)}")
85 | 
86 |     if not deleted:
87 |         print("❌ No cache directories found")
--------------------------------------------------------------------------------
/utils/const.py:
--------------------------------------------------------------------------------
 1 | models = {
 2 |     1: {
 3 |         "name": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
 4 |         "desc": "High quality multilingual embeddings",
 5 |         "details": {
 6 |             "languages": "50+",
 7 |             "embedding_size": 768,
 8 |             "speed": "medium",
 9 |             "best_for": "Semantic search, clustering",
10 |             "pros": "Excellent quality for multilingual tasks",
11 |             "cons": "Larger memory footprint",
12 |             "release_year": 2020
13 |         }
14 |     },
15 |     2: {
16 |         "name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
17 |         "desc": "Optimal balance of speed and quality",
18 |         "details": {
19 |             "languages": "50+",
20 |             "embedding_size": 384,
21 |             "speed": "fast",
22 |             "best_for": "Real-time applications, production use",
23 |             "pros": "4x faster than mpnet with good accuracy",
24 |             "cons": "Lower dimensionality than mpnet",
25 |             "release_year": 2021
26 |         }
27 |     },
28 |     3: {
29 |         "name": "sentence-transformers/distiluse-base-multilingual-cased-v2",
30 |         "desc": "Lightweight multilingual model",
31 |         "details": {
32 |             "languages": "50+",
33 |             "embedding_size": 512,
34 |             "speed": "very fast",
35 |             "best_for": "Mobile/edge devices, low-resource environments",
36 |             "pros": "Small size, decent performance",
37 |             "cons": "Lower accuracy than full-size models",
38 |             "release_year": 2020
39 |         }
40 |     },
41 |     4: {
42 |         "name": "sentence-transformers/LaBSE",
43 |         "desc": "Google's universal language encoder",
44 |         "details": {
45 |             "languages": 109,
46 |             "embedding_size": 768,
47 |             "speed": "medium",
48 |             "best_for": "Cross-lingual tasks, language detection",
49 |             "pros": "Widest language coverage",
50 |             "cons": "Outdated architecture",
51 |             "release_year": 2019
52 |         }
53 |     },
54 |     5: {
55 |         "name": "intfloat/multilingual-e5-large",
56 |         "desc": "Microsoft's efficient multilingual encoder",
57 |         "details": {
58 |             "languages": "100+",
59 |             "embedding_size": 1024,
60 |             "speed": "medium-fast",
61 |             "best_for": "Large-scale production systems",
62 |             "pros": "Excellent speed/accuracy balance",
63 |             "cons": "Slightly less precise than BGE-M3",
64 |             "release_year": 2023,
65 |             "benchmarks": {
66 |                 "MTEB": 72.1,
67 |                 "RAG": 89.7
68 |             }
69 |         }
70 |     }
71 | }
72 | 
73 | strategies = {
74 |     "1": "cycle",
75 |     "2": "random",
76 |     "3": "last",
77 |     "4": "most_similar"
78 | }
--------------------------------------------------------------------------------
/ai/download.py:
--------------------------------------------------------------------------------
 1 | from huggingface_hub import snapshot_download
 2 | from pathlib import Path
 3 | from datetime import datetime
 4 | import json
 5 | import shutil
 6 | 
 7 | def delete_model(model_name: str, target_dir: str = "hub"):
 8 |     """
 9 |     Deletes the model (folder with files).
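    Raises FileNotFoundError if the model folder does not exist.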
10 | 
11 |     :param model_name: Name of the model
12 |     :param target_dir: Directory with models
13 |     """
14 | 
15 |     model_dir = Path(target_dir) / model_name
16 |     if model_dir.exists():
17 |         shutil.rmtree(model_dir)
18 |     else:
19 |         raise FileNotFoundError(f"Model {model_name} not found")
20 | 
21 | def download_model(model_name: str, custom_save_name="", target_dir: str = "hub",
22 |                    ignore_patterns: list = None):
23 |     """
24 |     Downloads a model from Hugging Face Hub with selective files
25 | 
26 |     :param model_name: Name of the model (with or without the prefix)
27 |     :param custom_save_name: Custom name for the saved model
28 |     :param target_dir: Directory to save the model
29 |     :param ignore_patterns: List of file patterns to ignore
30 |     :return: Path to the saved model
31 |     """
32 |     # if not model_name.startswith('sentence-transformers/'):
33 |     #     model_name = f'sentence-transformers/{model_name}'
34 | 
35 |     if custom_save_name and ('/' in custom_save_name or '\\' in custom_save_name):
36 |         raise ValueError("Custom save name cannot contain path separators (/, \\)")
37 | 
38 |     if custom_save_name:
39 |         model_dir = Path(target_dir) / custom_save_name
40 |     else:
41 |         model_dir = Path(target_dir) / model_name.split('/')[-1]
42 | 
43 |     model_dir.mkdir(parents=True, exist_ok=True)
44 | 
45 |     # Default ignore patterns
46 |     default_ignore = [
47 |         "*.h5",       # TensorFlow
48 |         "*.msgpack",  # Flax/JAX
49 |         "*.onnx",     # ONNX
50 |         "*.ot",       # Other
51 |         "*.tflite",   # TensorFlow Lite
52 |         "*.mlmodel",  # Core ML
53 |         "*.bin",      # PyTorch
54 |     ]
55 | 
56 |     final_ignore = ignore_patterns if ignore_patterns is not None else default_ignore
57 | 
58 |     try:
59 |         snapshot_download(
60 |             repo_id=model_name,
61 |             local_dir=model_dir,
62 |             local_dir_use_symlinks=False,
63 |             ignore_patterns=final_ignore,
64 |             allow_patterns=["*.json", "*.txt", "*.safetensors", "tokenizer.model"]  # Only the ones we need
65 |         )
66 | 
67 |         # Deleting stray framework artifacts (files or directories) that may remain
68 |         for leftover in ["tf_model.h5", "flax_model.msgpack", "onnx"]:
69 |             leftover_path = model_dir / leftover
70 |             if leftover_path.is_dir():
71 |                 shutil.rmtree(leftover_path)
72 |             elif leftover_path.exists():
73 |                 leftover_path.unlink()
74 | 
75 |         # Saving metadata
76 |         with open(model_dir / "meta.json", "w") as f:
77 |             json.dump({
78 |                 "source": model_name,
79 |                 "downloaded_at": datetime.now().isoformat(),
80 |                 "downloaded_files": [f.name for f in model_dir.glob("*") if f.is_file()]
81 |             }, f, indent=2)
82 | 
83 |         return str(model_dir)
84 | 
85 |     except Exception as e:
86 |         raise RuntimeError(f"Error downloading model: {str(e)}") from e
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | 
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
176 | test.py
177 | 
178 | # models
179 | hub/*
180 | !hub/.gitkeep
181 | 
182 | # pipelines
183 | build/*
184 | !build/.gitkeep
185 | 
186 | # data
187 | data/*
188 | !data/example.json
--------------------------------------------------------------------------------
/ai/pipeline_tester.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import json
 3 | from datetime import datetime
 4 | from sentence_transformers import SentenceTransformer
 5 | from sklearn.metrics.pairwise import cosine_similarity
 6 | from pathlib import Path
 7 | 
 8 | class PipelineTester:
 9 |     def __init__(self, model_name, models_path="build"):
10 |         """
11 |         :param model_name: Name of the trained model (e.g. 'faq_model')
12 |         :param models_path: Path to the folder with trained models (default is 'build')
13 |         """
14 |         self.models_path = Path(models_path)
15 |         self.model_path = self.models_path / model_name
16 |         self.model = None
17 |         self.embeddings = None
18 |         self.answers = None
19 |         self.meta = None
20 |         self.stats = {
21 |             'total_queries': 0,
22 |             'matches': 0,
23 |             'threshold': 0.7,
24 |             'queries': []
25 |         }
26 |         self._load_model()
27 | 
28 |     def _load_model(self):
29 |         """Load the model and data from the folder of the trained model"""
30 |         if not self.model_path.exists():
31 |             raise FileNotFoundError(f"Model directory not found: {self.model_path}")
32 | 
33 |         # Load the model from saved files
34 |         model_files_path = self.model_path / 'model_files'
35 |         if not model_files_path.exists():
36 |             raise FileNotFoundError(f"Model files not found in {model_files_path}")
37 | 
38 |         self.model = SentenceTransformer(str(model_files_path))
39 | 
40 |         # Load the other components
41 |         self.embeddings = np.load(self.model_path / 'question_embeddings.npy')
42 | 
43 |         with open(self.model_path / 'answers.json', 'r', encoding='utf-8') as f:
44 |             self.answers = json.load(f)
45 | 
46 |         with open(self.model_path / 'meta.json', 'r', encoding='utf-8') as f:
47 |             self.meta = json.load(f)
48 | 
49 |     def get_trained_models(self):
50 |         """Returns a list of trained models (similar to Education.get_trained_models)"""
51 |         models = []
52 |         for model_dir in self.models_path.iterdir():
53 |             if model_dir.is_dir():
54 |                 meta_path = model_dir / 'meta.json'
55 |                 if meta_path.exists():
56 |                     with open(meta_path, 'r', encoding='utf-8') as f:
57 |                         try:
58 |                             meta = json.load(f)
59 |                             models.append({
60 |                                 'name': model_dir.name,
61 |                                 'source': meta['source_data'],
62 |                                 'questions': meta['questions_count'],
63 |                                 'created_at': meta['training_params']['created_at'],
64 |                                 'path': str(model_dir),
65 |                                 'strategy': meta['training_params']['answer_strategy'],
66 |                                 'model_info': meta.get('model_info', {})
67 |                             })
68 |                         except (json.JSONDecodeError, KeyError) as e:
69 |                             print(f"Error reading metadata from {meta_path}: {str(e)}")
70 |                             continue
71 |         return models
72 | 
73 |     def query(self, question, threshold=None):
74 |         """
75 |         Query the model
76 |         :param question: The question text
77 |         :param threshold: Similarity threshold (None for the default value)
78 |         :return: {
79 |             'answer': str|None,
80 |             'score': float,
81 |             'is_match': bool
82 |         }
83 |         """
84 |         threshold = self.stats['threshold'] if threshold is None else threshold
85 |         self.stats['total_queries'] += 1
86 | 
87 |         # Encode the question
88 | 
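        # encode() on a list returns a (1, dim) matrix; cosine_similarity then
        # compares that single row against every stored question embedding,
        # yielding one similarity score per trained question.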
question_embedding = self.model.encode([question]) 89 | 90 | # Find the closest match 91 | sim_scores = cosine_similarity(question_embedding, self.embeddings)[0] 92 | best_idx = np.argmax(sim_scores) 93 | best_score = float(sim_scores[best_idx]) 94 | is_match = best_score > threshold 95 | 96 | # Record the statistics 97 | result = { 98 | 'question': question, 99 | 'answer': self.answers[best_idx] if is_match else None, 100 | 'score': best_score, 101 | 'is_match': is_match, 102 | 'timestamp': datetime.now().isoformat() 103 | } 104 | 105 | if is_match: 106 | self.stats['matches'] += 1 107 | 108 | self.stats['queries'].append(result) 109 | return result 110 | 111 | def get_stats(self, reset=False): 112 | """ 113 | Get statistics 114 | :param reset: Reset statistics after fetching 115 | :return: { 116 | 'total_queries': int, 117 | 'matches': int, 118 | 'match_rate': float, 119 | 'threshold': float, 120 | 'last_query': dict|None 121 | } 122 | """ 123 | stats = { 124 | 'total_queries': self.stats['total_queries'], 125 | 'matches': self.stats['matches'], 126 | 'match_rate': self.stats['matches'] / self.stats['total_queries'] if self.stats['total_queries'] > 0 else 0, 127 | 'threshold': self.stats['threshold'], 128 | 'last_query': self.stats['queries'][-1] if self.stats['queries'] else None 129 | } 130 | 131 | if reset: 132 | self.reset_stats() 133 | 134 | return stats 135 | 136 | def reset_stats(self): 137 | """Reset statistics""" 138 | self.stats = { 139 | 'total_queries': 0, 140 | 'matches': 0, 141 | 'threshold': self.stats['threshold'], 142 | 'queries': [] 143 | } 144 | 145 | def set_threshold(self, threshold): 146 | """Set the similarity threshold""" 147 | self.stats['threshold'] = float(threshold) 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ReplyCore 2 | 3 | ### _Fast QA pipeline creation using your data with [sentence-transformers](https://pypi.org/project/sentence-transformers/): model training and production-ready integration_ 4 | 5 | ## ❓Why is this needed? 6 | 7 | _I personally use it to automate responses to frequent repetitive questions in tech support, but there are many possible use cases._ 8 | 9 | ## ⚙️How does it work? 10 | 11 | Your questions and answers are converted into numerical vectors using a neural network model. 12 | `"How do I reset my password?"` → `[0.24, -0.12, 0.76, ...]` 13 | 14 | The model does not look for exact word matches but calculates **semantic similarity** based on the angle between vectors. 
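A back-of-the-envelope sketch of that idea, with made-up 3-dimensional vectors (real models use hundreds of dimensions):

    import numpy as np

    a = np.array([0.24, -0.12, 0.76])  # "How do I reset my password?"
    b = np.array([0.22, -0.10, 0.80])  # "Forgot password"

    # Cosine similarity = cosine of the angle between the two vectors
    score = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(round(float(score), 3))  # ~0.999 -> semantically very close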
15 | 
16 | The system understands **rephrased questions** thanks to:
17 | 
18 | - Considering word order
19 | - Recognizing synonyms (`"reset password" ≈ "recover access"`)
20 | - Multi-task model training
21 | 
22 | ## 🤖📊Available Models in the Interactive Program
23 | 
24 | > You can add more models to `utils/const.py`
25 | > by picking from [this list](https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)
26 | 
27 | | Model ID | Name | Dimensions | Speed | Languages | Best For | Size | Benchmark (MTEB) |
28 | | -------- | --------------------------------------- | ---------- | ----- | --------- | ------------------------- | ----- | ---------------- |
29 | | 1 | `paraphrase-multilingual-mpnet-base-v2` | 768 | 🐢 | 50+ | Highest accuracy tasks | 1.2GB | 65.3 |
30 | | 2 | `paraphrase-multilingual-MiniLM-L12-v2` | 384 | 🚗 | 50+ | Balanced speed/quality | 470MB | 63.7 |
31 | | 3 | `distiluse-base-multilingual-cased-v2` | 512 | 🚄 | 50+ | Low-resource environments | 480MB | 61.2 |
32 | | 4 | `LaBSE` | 768 | 🐢 | 109 | Multilingual applications | 1.8GB | 58.2 |
33 | | 5 | `multilingual-e5-large` | 1024 | 🚗 | 100+ | Large-scale production | 2.1GB | 72.1 |
34 | 
35 | ## 💡✨Why is the Interactive Program Beneficial?
36 | 
37 | 1. _Easily train a pipeline without writing custom code_
38 | 2. _Assemble a ready-to-use pipeline with your model and a built-in module for operation_
39 | 3. _Download any models directly in the program for offline training_
40 | 4. _Test your pipelines immediately after training—no need to constantly move folders into your project. Validate on the spot and check statistics_
41 | 
42 | ## 🧠🔄Training Strategies
43 | 
44 | ### `last` (_Default_)
45 | 
46 | **How it works:**
47 | 
48 | - Takes the answer with the same index as the question (`answers[i]`).
49 | - If there are fewer answers than questions, it uses the last answer (`answers[-1]`).
50 | 
51 | **Example:**
52 | 
53 |     questions = ["Q1", "Q2", "Q3"]
54 |     answers = ["A1", "A2"]
55 | 
56 |     Result:
57 |     Q1 → A1, Q2 → A2, Q3 → A2 (last answer)
58 | 
59 | **When to use:**
60 | 
61 | - For "one question → one answer" pairs.
62 | - When answers are ordered correctly for the questions.
63 | 
64 | ##
65 | 
66 | ### `cycle` (_Cyclic_)
67 | 
68 | **How it works:**
69 | 
70 | - Reuses answers cyclically: `answers[i % len(answers)]`.
71 | 
72 | **Example:**
73 | 
74 |     questions = ["Q1", "Q2", "Q3", "Q4"]
75 |     answers = ["A1", "A2"]
76 | 
77 |     Result:
78 |     Q1 → A1, Q2 → A2, Q3 → A1, Q4 → A2
79 | 
80 | **When to use:**
81 | 
82 | - When there are more questions than answers.
83 | - When answers are general-purpose (e.g., common hints).
84 | 
85 | ##
86 | 
87 | ### `random` (_Random_)
88 | 
89 | **How it works:**
90 | 
91 | - Selects a random answer from the list using `random.choice(answers)`.
92 | 
93 | **Example:**
94 | 
95 |     questions = ["Q1", "Q2", "Q3"]
96 |     answers = ["A1", "A2", "A3"]
97 | 
98 |     Possible result:
99 |     Q1 → A3, Q2 → A1, Q3 → A3
100 | 
101 | **When to use:**
102 | 
103 | - To add variety to responses.
104 | 
105 | ##
106 | 
107 | ### `most_similar`
108 | 
109 | **How it works:**
110 | 
111 | 1. For each question, its **embedding** (vector representation) is calculated.
112 | 2. The **embeddings** of all answers are **pre-cached** (for speed).
113 | 3. The answer **most semantically similar** to the question is selected (via cosine similarity).
114 | 
115 | **Example**
116 | 
117 |     questions = ["How to reset password?", "Payment failed", "Contact support"]
118 |     answers = ["Click 'Forgot password'", "Check balance", "Email us at help@site.com"]
119 | 
120 |     # Embeddings:
121 |     q_embeddings = model.encode(questions)  # Vector for each question
122 |     a_embeddings = model.encode(answers)    # Vector for each answer
123 | 
124 |     # For the question "Payment failed":
125 |     question_idx = 1
126 |     question_embedding = q_embeddings[1]
127 | 
128 |     # Compare with answer embeddings:
129 |     similarities = cosine_similarity([question_embedding], a_embeddings)[0]
130 |     best_answer_idx = similarities.argmax()  # Index of the most similar answer
131 | 
132 |     Result:
133 |     "Payment failed" → "Check balance" (as their embeddings are the closest)
134 | 
135 | **When to use:**
136 | 
137 | - When **answers are not tied** to specific questions (e.g., a general knowledge base).
138 | - For complex questions, where **direct matching** (`last`, `cycle`) produces poor results.
139 | - In **RAG systems**, where finding semantic matches is important.
140 | 
141 | ## ⬇️🚀Installation and Launch
142 | 
143 | **Requirements: Python 3.9+**
144 | 
145 | **Install dependencies:**
146 | 
147 |     pip install -r requirements.txt
148 | 
149 | **Add your training data to the `data/` directory**
150 | 
151 | > An example is provided in the `data/example.json` file.
152 | 
153 | **Launch the interactive program:**
154 | 
155 |     python main.py
156 | 
157 | ## 🔗🧩Integration with the Project
158 | 
159 | _The assembled pipelines with models are saved in the `build/your_pipeline` directory. This folder contains the `pipeline.py` module for working with the pipeline._
160 | 
161 | **Working with the assembled pipeline**
162 | 
163 |     from your_pipeline.pipeline import Pipeline
164 | 
165 |     pipe = Pipeline()
166 |     result = pipe.query("Shall we have a cup of coffee?")
167 | 
168 |     print(result)
169 | 
170 | **Result:**
171 | 
172 |     {
173 |         "answer": "I suggest having a freshly squeezed juice",
174 |         "score": 0.8474252223968506,
175 |         "is_match": True,
176 |         "strategy": "cycle"
177 |     }
178 | 
179 | **Where:**
180 | 
181 | - `answer` - _The answer_
182 | - `score` - _Cosine-similarity score of the best match (higher means closer)_
183 | - `is_match` - _Whether the predefined similarity threshold was exceeded_
184 | - `strategy` - _Training strategy of the pipeline_
185 | 
186 | ## 🌟In conclusion
187 | 
188 | _This program **will not create a real artificial intelligence**. It will only train a pipeline on existing data. It is not self-learning, it doesn't think, and it can't come up with answers.
It simply helps to automate responses._
--------------------------------------------------------------------------------
/ai/education.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | import shutil
 4 | import os
 5 | from datetime import datetime
 6 | import time
 7 | import numpy as np
 8 | import torch
 9 | from sentence_transformers import SentenceTransformer, util
10 | from typing import Literal, Optional, Dict, List
11 | from pathlib import Path
12 | from tqdm import tqdm
13 | 
14 | class Education:
15 |     def __init__(self, model_name='paraphrase-multilingual-MiniLM-L12-v2'):
16 |         """
17 |         Initialization of model training
18 |         :param model_name: model name (with or without the org prefix);
19 |             a matching folder in the local hub directory is tried first
20 |         """
21 |         self.model_name = model_name
22 |         self.hub_dir = 'hub'
23 |         self.data_dir = 'data'
24 |         self.pipeline_dir = 'build'
25 |         self._answer_embeddings_cache = {}
26 |         self._current_answers_hash = None
27 |         self._ensure_dirs_exist()
28 |         self.model = self._init_model()
29 | 
30 |     def _init_model(self):
31 |         """Initializes the model, first trying the local hub, then loading directly"""
32 |         # Trying to load from hub if the folder is specified
33 |         if self.hub_dir:
34 |             local_path = os.path.join(self.hub_dir, self.model_name)
35 |             if os.path.exists(local_path):
36 |                 try:
37 |                     model = SentenceTransformer(local_path)
38 |                     print("Model loaded from hub")
39 |                     return model
40 |                 except Exception as e:
41 |                     print(f"Failed to load the model from hub: {str(e)}")
42 | 
43 |         # If not found in the hub, load directly
44 |         full_model_name = self.model_name
45 |         model = SentenceTransformer(full_model_name)
46 |         print("Model loaded directly")
47 |         return model
48 | 
49 |     def _load_model_from_hub(self, model_name):
50 |         """Loads the model from the local hub folder"""
51 |         model_path = os.path.join(self.hub_dir, model_name)
52 | 
53 |         if not os.path.exists(model_path):
54 |             raise FileNotFoundError(
55 |                 f"Model {model_name} not found in folder {self.hub_dir}. 
" 56 | f"Available models: {os.listdir(self.hub_dir)}" 57 | ) 58 | 59 | return SentenceTransformer(model_path) 60 | 61 | def _ensure_dirs_exist(self): 62 | """Creates necessary directories if they do not exist""" 63 | Path(self.data_dir).mkdir(parents=True, exist_ok=True) 64 | Path(self.pipeline_dir).mkdir(parents=True, exist_ok=True) 65 | if self.hub_dir: 66 | Path(self.hub_dir).mkdir(parents=True, exist_ok=True) 67 | 68 | def _copy_pipeline_files(self, model_dir: str): 69 | """Copies the necessary files for the pipeline to work""" 70 | dest_path = Path(model_dir) 71 | 72 | # Copying the self-contained pipeline.py 73 | current_dir = Path(__file__).parent 74 | shutil.copy(current_dir / 'pipeline.py', dest_path) 75 | 76 | # Creating the __init__.py file 77 | with open(dest_path / '__init__.py', 'w') as f: 78 | f.write('# Auto-generated pipeline package\n') 79 | 80 | # Creating requirements.txt 81 | possible_req_paths = [ 82 | current_dir.parent / 'requirements.txt', # In the root of the project 83 | Path.cwd() / 'requirements.txt' # In the working directory 84 | ] 85 | 86 | for req_path in possible_req_paths: 87 | if req_path.exists(): 88 | shutil.copy(req_path, dest_path) 89 | break 90 | 91 | def _get_embeddings(self, answers: List[str], force_update: bool = False) -> torch.Tensor: 92 | """Smart caching of answer embeddings""" 93 | answers_tuple = tuple(answers) 94 | current_hash = hash(answers_tuple) 95 | 96 | if force_update or current_hash != self._current_answers_hash: 97 | self._clear_embeddings_cache() 98 | self._current_answers_hash = current_hash 99 | 100 | if current_hash not in self._answer_embeddings_cache: 101 | with torch.no_grad(): 102 | self._answer_embeddings_cache[current_hash] = { 103 | 'embeddings': self.model.encode(answers, convert_to_tensor=True), 104 | 'timestamp': time.time() 105 | } 106 | 107 | return self._answer_embeddings_cache[current_hash]['embeddings'] 108 | 109 | def _clear_embeddings_cache(self, max_items: int = 3, max_age_hours: int = 24): 110 | """Clearing old caches""" 111 | now = time.time() 112 | to_delete = [] 113 | 114 | if len(self._answer_embeddings_cache) > max_items: 115 | oldest = sorted(self._answer_embeddings_cache.items(), 116 | key=lambda x: x[1]['timestamp'])[0][0] 117 | to_delete.append(oldest) 118 | 119 | for h, data in self._answer_embeddings_cache.items(): 120 | if (now - data['timestamp']) > max_age_hours * 3600: 121 | to_delete.append(h) 122 | 123 | for h in set(to_delete): 124 | del self._answer_embeddings_cache[h] 125 | if h == self._current_answers_hash: 126 | self._current_answers_hash = None 127 | 128 | def train_on_file(self, data_file: str, model_name: str, 129 | answer_strategy: Literal['last', 'cycle', 'random', 'most_similar'] = 'last', 130 | show_progress: bool = True, chunk_size: int = 100): 131 | """ 132 | Train the model on the specified data file 133 | :param data_file: name of the data file (e.g. 
'faq.json') 134 | :param model_name: name for saving the model 135 | :param answer_strategy: answer selection strategy 136 | ('last' - last, 'cycle' - cyclic, 'random' - random, 'most_similar' - most similar) 137 | :param show_progress: whether to show progress bars 138 | :param chunk_size: batch size for question encoding 139 | :return: dictionary with training results 140 | """ 141 | # Data validation 142 | if not data_file.endswith('.json'): 143 | data_file += '.json' 144 | 145 | data_path = os.path.join(self.data_dir, data_file) 146 | if not os.path.exists(data_path): 147 | raise FileNotFoundError(f"Data file {data_path} not found") 148 | 149 | # Load data 150 | with open(data_path, 'r', encoding='utf-8') as f: 151 | try: 152 | faq = json.load(f) 153 | except json.JSONDecodeError as e: 154 | raise ValueError(f"JSON format error: {str(e)}") 155 | 156 | # Validate data structure 157 | if not isinstance(faq, list): 158 | raise ValueError("Data should be an array of objects") 159 | 160 | # Prepare data with progress bars 161 | all_questions = [] 162 | all_answers = [] 163 | 164 | # Main progress bar for FAQ items 165 | faq_iter = tqdm(faq, desc="Processing FAQ items", disable=not show_progress) 166 | for item in faq_iter: 167 | if not all(k in item for k in ['questions', 'answers']): 168 | raise ValueError("Each item must contain 'questions' and 'answers'") 169 | 170 | answers = item['answers'] 171 | questions = item['questions'] 172 | 173 | if not answers: 174 | raise ValueError("Answer list cannot be empty") 175 | 176 | # Nested progress bar for questions 177 | questions_iter = tqdm(questions, desc=" Processing questions", 178 | leave=False, disable=not show_progress) 179 | for i, question in enumerate(questions_iter): 180 | all_questions.append(question) 181 | 182 | # Select answer by strategy 183 | if answer_strategy == 'last': 184 | answer = answers[i] if i < len(answers) else answers[-1] 185 | elif answer_strategy == 'cycle': 186 | answer = answers[i % len(answers)] 187 | elif answer_strategy == 'random': 188 | answer = random.choice(answers) 189 | elif answer_strategy == 'most_similar': 190 | answer_embeddings = self._get_embeddings(answers) 191 | question_embedding = self.model.encode(question, convert_to_tensor=True) 192 | similarities = util.cos_sim(question_embedding, answer_embeddings)[0] 193 | answer = answers[similarities.argmax().item()] 194 | else: 195 | raise ValueError(f"Invalid strategy: {answer_strategy}") 196 | 197 | all_answers.append(answer) 198 | 199 | if not all_questions: 200 | raise ValueError("No questions found for training") 201 | 202 | # Encode questions with chunked progress bar 203 | question_embeddings = [] 204 | chunks = [all_questions[i:i + chunk_size] for i in range(0, len(all_questions), chunk_size)] 205 | 206 | encoding_iter = tqdm(chunks, desc="Encoding questions", disable=not show_progress) 207 | for chunk in encoding_iter: 208 | question_embeddings.extend(self.model.encode(chunk)) 209 | 210 | question_embeddings = np.array(question_embeddings) 211 | 212 | # Create model folder 213 | model_dir = os.path.join(self.pipeline_dir, model_name) 214 | Path(model_dir).mkdir(parents=True, exist_ok=True) 215 | 216 | # Save results 217 | print(f"Saving pipeline...") 218 | np.save(os.path.join(model_dir, 'question_embeddings.npy'), question_embeddings) 219 | with open(os.path.join(model_dir, 'answers.json'), 'w', encoding='utf-8') as f: 220 | json.dump(all_answers, f, ensure_ascii=False, indent=2) 221 | 222 | # Save the model 223 | model_files_path = 
os.path.join(model_dir, 'model_files')
224 |         self.model.save(model_files_path)
225 | 
226 |         # Get model name safely
227 |         try:
228 |             model_name_attr = getattr(self.model, 'model_name', None)
229 |             model_name_str = model_name_attr if model_name_attr else str(self.model[0].auto_model.config._name_or_path)
230 |             base_model_name = os.path.basename(model_name_str)
231 |         except Exception:
232 |             base_model_name = "unknown_model"
233 | 
234 |         # Model metadata
235 |         meta = {
236 |             'source_data': data_file,
237 |             'questions_count': len(all_questions),
238 |             'answers_count': len(all_answers),
239 |             'model_info': {
240 |                 'name': base_model_name,
241 |                 'source': 'local_hub',
242 |                 'embedding_dim': question_embeddings.shape[1],
243 |                 'max_seq_length': self.model.max_seq_length,
244 |                 'model_files_path': 'model_files'
245 |             },
246 |             'training_params': {
247 |                 'answer_strategy': answer_strategy,
248 |                 'created_at': datetime.now().isoformat(),
249 |                 'chunk_size': chunk_size
250 |             }
251 |         }
252 | 
253 |         with open(os.path.join(model_dir, 'meta.json'), 'w', encoding='utf-8') as f:
254 |             json.dump(meta, f, indent=2, ensure_ascii=False)
255 | 
256 |         self._copy_pipeline_files(model_dir)
257 | 
258 |         return {
259 |             'status': 'success',
260 |             'model_name': model_name,
261 |             'model_dir': model_dir,
262 |             'model_files_path': model_files_path,
263 |             'questions_processed': len(all_questions),
264 |             'answers_processed': len(all_answers),
265 |             'embedding_shape': question_embeddings.shape
266 |         }
267 | 
268 |     def update_answers(self, new_answers: List[str]):
269 |         """Force-refresh the cached answer embeddings"""
270 |         self._clear_embeddings_cache(max_items=0)
271 |         self._get_embeddings(new_answers, force_update=True)
272 | 
273 |     def get_trained_models(self):
274 |         """Returns a list of trained models"""
275 |         models = []
276 |         for model_dir in Path(self.pipeline_dir).iterdir():
277 |             if model_dir.is_dir():
278 |                 meta_path = model_dir / 'meta.json'
279 |                 if meta_path.exists():
280 |                     with open(meta_path, 'r', encoding='utf-8') as f:
281 |                         try:
282 |                             meta = json.load(f)
283 |                             models.append({
284 |                                 'name': model_dir.name,
285 |                                 'source': meta['source_data'],
286 |                                 'questions': meta['questions_count'],
287 |                                 'created_at': meta['training_params']['created_at'],
288 |                                 'path': str(model_dir),
289 |                                 'strategy': meta['training_params']['answer_strategy'],
290 |                                 'model_info': meta.get('model_info', {})
291 |                             })
292 |                         except (json.JSONDecodeError, KeyError) as e:
293 |                             print(f"Error reading metadata {meta_path}: {str(e)}")
294 |                             continue
295 |         return models
296 | 
--------------------------------------------------------------------------------
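Putting the pieces together, a minimal end-to-end run of the library code above might look like this (a sketch: it assumes `data/example.json` exists, and `faq_demo` is a made-up pipeline name):

    from ai.education import Education
    from ai.pipeline_tester import PipelineTester

    # Train: encode all questions from data/example.json and save the pipeline
    edu = Education(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    result = edu.train_on_file("example.json", "faq_demo", answer_strategy="cycle")
    print("Saved to", result["model_dir"])

    # Query: load the built pipeline from build/faq_demo and ask a question
    tester = PipelineTester("faq_demo")
    reply = tester.query("I forgot my password")
    print(reply["answer"], reply["score"])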