├── .github └── workflows │ └── deploy.yaml ├── .gitignore ├── .license ├── 0.26.0 ├── Llama1bDATStackingFULLVRAM(300gb).py ├── adaptivekantemplate.py ├── dummy.py ├── erroniousduplicateidea.py ├── kan_gui.py ├── llama_32_1b_tool.py ├── llama_32_1b_toolold10_5_24.py ├── load_offloaded_model.py ├── load_offloaded_model_entropytemp.py ├── load_offloaded_model_old_working.py ├── nonfunctional_transformers_garbled.py ├── offloadedModelLiveLayerIdea.py ├── readme.md ├── requirements.txt ├── run - load_offloaded_model.bat ├── run - splitsafetensors.bat ├── run.bat ├── setup.bat ├── split_safetensors.py ├── test_model_loading.py ├── test_sentencepiece.py ├── test_tokenizer_loading.py └── venv.bat /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Run KAN Emotional Character with LLaMA 3.1 8B Instruct 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | run-kan-emotional-character: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v2 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.x' 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 25 | pip install git+https://github.com/huggingface/transformers 26 | 27 | - name: Run KAN Emotional Character script 28 | env: 29 | HF_CLIENT_ID: ${{ secrets.HF_CLIENT_ID }} 30 | HF_CLIENT_SECRET: ${{ secrets.HF_CLIENT_SECRET }} 31 | run: | 32 | python kan_emotional_character_llama_hf.py 33 | 34 | - name: Upload logs 35 | uses: actions/upload-artifact@v2 36 | with: 37 | name: kan-emotional-character-logs 38 | path: kan_emotional_character.log -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual Environment 2 | venv/ 3 | env/ 4 | ENV/ 5 | 6 | # Python cache files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Jupyter Notebook 57 | .ipynb_checkpoints 58 | 59 | # IPython 60 | profile_default/ 61 | ipython_config.py 62 | 63 | # pyenv 64 | .python-version 65 | 66 | # Environments 67 | .env 68 | .venv 69 | 70 | # Spyder project settings 71 | .spyderproject 72 | .spyproject 73 | 74 | # Rope project settings 75 | .ropeproject 76 | 77 | # mkdocs documentation 78 | /site 79 | 80 | # mypy 81 | .mypy_cache/ 82 | .dmypy.json 83 | dmypy.json 84 | 85 | # Pyre type checker 86 | .pyre/ 87 | 88 | # pytype static type analyzer 89 | .pytype/ 90 | 91 | # Cython debug symbols 92 | cython_debug/ 93 | 94 | # PyCharm 95 | .idea/ 96 | 97 | # VS Code 98 | .vscode/ 99 | 100 | # Windows 101 | Thumbs.db 102 | ehthumbs.db 103 | 
Desktop.ini 104 | 105 | # macOS 106 | .DS_Store 107 | .AppleDouble 108 | .LSOverride 109 | 110 | # Project-specific 111 | kan_character_state.json 112 | *.log 113 | 114 | # Models folder 115 | models/ 116 | kan_states/ -------------------------------------------------------------------------------- /.license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 WuBu (WaefreBeorn) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /0.26.0: -------------------------------------------------------------------------------- 1 | Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com 2 | Collecting accelerate 3 | Downloading accelerate-1.0.0-py3-none-any.whl.metadata (19 kB) 4 | Requirement already satisfied: numpy<3.0.0,>=1.17 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (1.26.3) 5 | Requirement already satisfied: packaging>=20.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (24.1) 6 | Collecting psutil (from accelerate) 7 | Downloading psutil-6.0.0-cp37-abi3-win_amd64.whl.metadata (22 kB) 8 | Requirement already satisfied: pyyaml in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (6.0.2) 9 | Requirement already satisfied: torch>=1.10.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (2.4.1+cu118) 10 | Requirement already satisfied: huggingface-hub>=0.21.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (0.25.1) 11 | Requirement already satisfied: safetensors>=0.4.3 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (0.4.5) 12 | Requirement already satisfied: filelock in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (3.13.1) 13 | Requirement already satisfied: fsspec>=2023.5.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (2024.2.0) 14 | Requirement already satisfied: requests in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3) 15 | Requirement already satisfied: tqdm>=4.42.1 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.5) 16 | Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from 
huggingface-hub>=0.21.0->accelerate) (4.9.0) 17 | Requirement already satisfied: sympy in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (1.12) 18 | Requirement already satisfied: networkx in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (3.2.1) 19 | Requirement already satisfied: jinja2 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (3.1.3) 20 | Requirement already satisfied: setuptools in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (70.0.0) 21 | Requirement already satisfied: colorama in c:\projects\kan-wubu-memory\venv\lib\site-packages (from tqdm>=4.42.1->huggingface-hub>=0.21.0->accelerate) (0.4.6) 22 | Requirement already satisfied: MarkupSafe>=2.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5) 23 | Requirement already satisfied: charset-normalizer<4,>=2 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2) 24 | Requirement already satisfied: idna<4,>=2.5 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.10) 25 | Requirement already satisfied: urllib3<3,>=1.21.1 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.2.3) 26 | Requirement already satisfied: certifi>=2017.4.17 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.8.30) 27 | Requirement already satisfied: mpmath>=0.19 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0) 28 | Downloading accelerate-1.0.0-py3-none-any.whl (330 kB) 29 | Downloading psutil-6.0.0-cp37-abi3-win_amd64.whl (257 kB) 30 | Installing collected packages: psutil, accelerate 31 | Successfully installed accelerate-1.0.0 psutil-6.0.0 32 | -------------------------------------------------------------------------------- /Llama1bDATStackingFULLVRAM(300gb).py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import json 6 | import numpy as np 7 | import re 8 | import logging 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 12 | 13 | # Define paths to the directories and files 14 | SOURCE_DIR = "models/Llama_32_1B/" 15 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 16 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 17 | 18 | # Initialize logging 19 | logging.basicConfig(level=logging.INFO) 20 | 21 | # Load the configuration from the JSON file 22 | def load_configuration(model_json_path): 23 | with open(model_json_path, "r") as f: 24 | config_data = json.load(f) 25 | config = LlamaConfig(**config_data) 26 | return config 27 | 28 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 29 | def load_tokenizer(source_dir): 30 | return AutoTokenizer.from_pretrained(source_dir) 31 | 32 | # Load the model configuration 33 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 34 | config = load_configuration(MODEL_JSON_PATH) 35 | 36 | # Custom module for multiple stacked LLaMA layers (equivalent to 6x Mamba2 in NVIDIA presentation) 37 | class 
StackedLlamaModule(nn.Module): 38 | def __init__(self, config, num_layers=6): 39 | super(StackedLlamaModule, self).__init__() 40 | self.layers = nn.ModuleList([LlamaForCausalLM(config) for _ in range(num_layers)]) # Mimicking 6x Mamba2 41 | 42 | def forward(self, input_ids, attention_mask=None): 43 | x = input_ids 44 | for layer in self.layers: 45 | outputs = layer(input_ids=x, attention_mask=attention_mask) 46 | x = outputs.logits 47 | return x 48 | 49 | # Define shared components (e.g., Shared1 and Shared2) used in the modular structure 50 | class SharedLayer(nn.Module): 51 | def __init__(self, hidden_size): 52 | super(SharedLayer, self).__init__() 53 | self.mlp = nn.Sequential( 54 | nn.Linear(hidden_size, hidden_size), 55 | nn.ReLU(), 56 | nn.Linear(hidden_size, hidden_size), 57 | ) 58 | self.attention = nn.MultiheadAttention(hidden_size, num_heads=8) 59 | 60 | def forward(self, x): 61 | x = self.mlp(x) 62 | x, _ = self.attention(x, x, x) 63 | return x 64 | 65 | # Define Low-Rank Adaptation (LoRA) for efficient fine-tuning 66 | class LoRA(nn.Module): 67 | def __init__(self, hidden_size, rank=8): 68 | super(LoRA, self).__init__() 69 | self.rank = rank 70 | self.lora_A = nn.Linear(hidden_size, rank, bias=False) 71 | self.lora_B = nn.Linear(rank, hidden_size, bias=False) 72 | 73 | def forward(self, x): 74 | return x + self.lora_B(self.lora_A(x)) 75 | 76 | # Complete Stacked LLaMA model with shared components, stacking, and LoRA 77 | class StackedLlamaNetwork(nn.Module): 78 | def __init__(self, config, shared1, shared2, num_stacks=3): 79 | super(StackedLlamaNetwork, self).__init__() 80 | self.blocks = nn.ModuleList() 81 | 82 | for i in range(num_stacks): 83 | specialization = "early" if i == 0 else "mid" if i == 1 else "late" 84 | self.blocks.append( 85 | nn.ModuleDict({ 86 | "transformer_block": StackedLlamaModule(config), # Equivalent to 6x Mamba2 87 | "linear": nn.Linear(config.hidden_size, config.hidden_size), 88 | "shared": shared1 if i % 2 == 0 else shared2, # Alternating shared layers 89 | "lora_adapter": LoRA(config.hidden_size) # Optional LoRA for fine-tuning 90 | }) 91 | ) 92 | 93 | def forward(self, input_ids, attention_mask=None): 94 | x = input_ids 95 | intermediate_outputs = [] 96 | 97 | for block in self.blocks: 98 | x = block["transformer_block"](x, attention_mask) 99 | x = block["linear"](x) 100 | x = block["shared"](x) 101 | x = block["lora_adapter"](x) 102 | intermediate_outputs.append(x) 103 | 104 | # Concatenation of intermediate outputs (mimicking 'cat' operation in the image) 105 | x = torch.cat(intermediate_outputs, dim=-1) 106 | 107 | return x 108 | 109 | # Load the offloaded weights from the `.dat` files 110 | def load_dat_file(file_path, dtype): 111 | with open(file_path, 'rb') as f: 112 | tensor_data = np.fromfile(f, dtype=dtype) 113 | loaded_tensor = torch.tensor(tensor_data) 114 | 115 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 116 | if dtype == np.float32 and "bfloat16" in file_path: 117 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 118 | return loaded_tensor 119 | 120 | def load_offloaded_weights(stacked_model, weights_dir): 121 | for i, llama_model in enumerate(stacked_model.blocks): 122 | logging.info(f"Loading weights for LLaMA stack {i + 1}") 123 | for name, param in llama_model["transformer_block"].layers.named_parameters(): 124 | file_name = name.replace('.', '_') + ".dat" 125 | file_path = os.path.join(weights_dir, file_name) 126 | 127 | if os.path.exists(file_path): 128 | dtype_map = { 129 | torch.float16: 
np.float16, 130 | torch.float32: np.float32, 131 | torch.int64: np.int64, 132 | torch.int32: np.int32, 133 | torch.bfloat16: np.float32, 134 | } 135 | expected_dtype = dtype_map.get(param.dtype, np.float32) 136 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 137 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 138 | 139 | if param.dtype == torch.bfloat16: 140 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 141 | 142 | param.data.copy_(loaded_tensor.to("cuda")) 143 | else: 144 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 145 | 146 | # Load the weights into the model 147 | shared1 = SharedLayer(config.hidden_size) 148 | shared2 = SharedLayer(config.hidden_size) 149 | num_stacks = 3 # Number of stacked LLaMA instances 150 | model = StackedLlamaNetwork(config, shared1, shared2, num_stacks=num_stacks) 151 | load_offloaded_weights(model, WEIGHTS_DIR) 152 | 153 | # Move the model to GPU for inference 154 | model.to('cuda') 155 | model.eval() 156 | 157 | # Load the tokenizer for LLaMA 158 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 159 | tokenizer = load_tokenizer(SOURCE_DIR) 160 | 161 | # ResponseQualityManager class for evaluating and improving responses 162 | class ResponseQualityManager: 163 | def __init__(self, kan_model, tokenizer): 164 | self.kan_model = kan_model 165 | self.tokenizer = tokenizer 166 | self.tfidf_vectorizer = TfidfVectorizer() 167 | 168 | def evaluate_response(self, user_input, response): 169 | relevance_score = self.calculate_relevance(user_input, response) 170 | structure_valid = self.has_proper_structure(response) 171 | is_garbled = self.detect_garbled_output(response) 172 | return relevance_score > 0.3 and structure_valid and not is_garbled 173 | 174 | def calculate_relevance(self, user_input, response): 175 | user_tokens = set(self.tokenizer.tokenize(user_input)) 176 | response_tokens = set(self.tokenizer.tokenize(response)) 177 | overlap = len(user_tokens.intersection(response_tokens)) 178 | overlap_score = overlap / max(len(user_tokens), 1) 179 | 180 | combined_texts = [user_input, response] 181 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(combined_texts) 182 | cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] 183 | 184 | return 0.5 * overlap_score + 0.5 * cosine_sim 185 | 186 | def detect_garbled_output(self, response): 187 | if re.search(r'[^\x00-\x7F]+', response): 188 | return True 189 | if len(response.split()) < 3: 190 | return True 191 | if response.count('.') / len(response.split()) > 0.5: 192 | return True 193 | return False 194 | 195 | def has_proper_structure(self, response): 196 | sentences = re.split(r'(?<=[.!?])\s+', response.strip()) 197 | return len(sentences) > 0 and sentences[0][0].isupper() and sentences[-1][-1] in '.!?' 
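# --- Illustrative sketch (editor's addition, not part of the original file) ---
# ResponseQualityManager is instantiated just below (quality_manager), but
# generate_response() defined later in this file never calls it, so its relevance
# and structure checks have no effect on the interactive loop. The helper here is a
# minimal, hypothetical way to wire the two together: it re-samples a bounded number
# of times until evaluate_response() passes. The function name, retry count, and
# fallback behaviour are assumptions for illustration only.
def generate_checked_response(input_text, model, tokenizer, quality_manager,
                              history=None, max_retries=2):
    """Generate a response, retrying up to `max_retries` times if it fails the quality check."""
    history = history or []
    response = ""
    for _ in range(max_retries + 1):
        # Reuses the existing generate_response() defined later in this module.
        response, history = generate_response(input_text, model, tokenizer, history=history)
        # Guard against empty output: has_proper_structure() indexes into the first
        # sentence and would raise IndexError on an empty string.
        if response.strip() and quality_manager.evaluate_response(input_text, response):
            break  # Accept the first response that passes the relevance/structure checks.
    return response, history
# --- End of illustrative sketch ---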
198 | 199 | # Quality Manager instance for response evaluation 200 | quality_manager = ResponseQualityManager(model, tokenizer) 201 | 202 | # Updated generation logic to handle context better and avoid repetitive responses 203 | def generate_response(input_text, model, tokenizer, max_new_tokens=150, pad_token_id=128001, history=[], context_limit=512): 204 | history = [line for line in history if line.strip()] # Clean the history 205 | prompt = f"{' '.join(history[-3:])}\nUser: {input_text}\n" if history else f"User: {input_text}\n" 206 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit).to("cuda") 207 | 208 | with torch.no_grad(): 209 | outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"]) 210 | output_ids = torch.argmax(outputs, dim=-1) 211 | 212 | response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip() 213 | cleaned_response = re.sub(r'\s+', ' ', response.split("User:")[-1].strip()) 214 | history.append(f"User: {input_text}\nModel: {cleaned_response}") 215 | 216 | if len(history) > 6: 217 | history = history[-6:] 218 | 219 | return cleaned_response, history 220 | 221 | # Interactive query loop with refined response generation 222 | def user_input_loop(model, tokenizer): 223 | print("\n--- LLaMA Instruct Model Interactive Query ---") 224 | print("Type 'exit' to quit.") 225 | history = [] # Initialize a history buffer to keep track of conversation 226 | while True: 227 | user_input = input("\nEnter your query: ") 228 | if user_input.lower() == 'exit': 229 | print("Exiting...") 230 | break 231 | response, history = generate_response(user_input, model, tokenizer, history=history) 232 | print(f"Model Response: {response}") 233 | 234 | # Start the interactive query loop 235 | logging.info("Model loaded successfully. You can now query the model.") 236 | user_input_loop(model, tokenizer) 237 | -------------------------------------------------------------------------------- /adaptivekantemplate.py: -------------------------------------------------------------------------------- 1 | #adaptivekantemplate.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class AdaptiveKANLayer(nn.Module): 7 | def __init__(self, input_size, output_size, num_knots=10, temperature=0.666): 8 | """ 9 | Initialize an adaptive KAN layer with spline-based transformations. 10 | 11 | Args: 12 | input_size (int): Number of input features. 13 | output_size (int): Number of output features. 14 | num_knots (int): Number of knots in the spline function. 15 | temperature (float): Temperature parameter for adaptive updates. 16 | """ 17 | super(AdaptiveKANLayer, self).__init__() 18 | self.input_size = input_size 19 | self.output_size = output_size 20 | self.num_knots = num_knots 21 | self.temperature = temperature 22 | 23 | # Define spline parameters 24 | self.knots = nn.Parameter(torch.linspace(-1, 1, num_knots)) 25 | self.coeffs = nn.Parameter(torch.randn(input_size, output_size, num_knots)) 26 | 27 | def forward(self, x): 28 | """ 29 | Forward pass for the KAN layer. 30 | 31 | Args: 32 | x (torch.Tensor): Input tensor of shape (batch_size, input_size). 33 | 34 | Returns: 35 | torch.Tensor: Transformed output of shape (batch_size, output_size). 36 | """ 37 | weights = self.compute_spline_weights(x) 38 | return torch.matmul(x, weights) 39 | 40 | def compute_spline_weights(self, x): 41 | """ 42 | Compute the spline transformation weights for input x. 
43 | 44 | Args: 45 | x (torch.Tensor): Input tensor of shape (batch_size, input_size). 46 | 47 | Returns: 48 | torch.Tensor: Spline weights of shape (input_size, output_size). 49 | """ 50 | weights = F.interpolate(self.coeffs.unsqueeze(0), size=(self.num_knots,)).squeeze(0) 51 | return weights 52 | 53 | def calculate_entropy(self, logits): 54 | """ 55 | Calculate entropy of the spline transformations. 56 | 57 | Args: 58 | logits (torch.Tensor): Logits tensor of shape (batch_size, num_classes). 59 | 60 | Returns: 61 | torch.Tensor: Entropy values for each class. 62 | """ 63 | p = F.softmax(logits, dim=-1) 64 | entropy = -torch.sum(p * torch.log(p + 1e-9), dim=-1) 65 | return entropy 66 | 67 | def adaptive_update(self, entropy, variance): 68 | """ 69 | Adaptively update grid resolution and regularization based on entropy. 70 | 71 | Args: 72 | entropy (float): Current entropy of the spline transformations. 73 | variance (float): Variance of the entropy values. 74 | """ 75 | if entropy < 0.1 and variance < 0.1: 76 | self.prune_knots() 77 | elif entropy > 5.0 and variance < 0.1: 78 | self.extend_knots() 79 | elif entropy < 5.0 and variance > 5.0: 80 | self.refine_coeffs() 81 | elif entropy > 5.0 and variance > 5.0: 82 | self.increase_capacity() 83 | else: 84 | self.moderate_update() 85 | 86 | def prune_knots(self): 87 | """Remove low-impact knots.""" 88 | if self.num_knots > 3: # Ensure a minimum number of knots 89 | self.num_knots -= 1 90 | self.knots = nn.Parameter(torch.linspace(-1, 1, self.num_knots)) 91 | self.coeffs = nn.Parameter(torch.randn(self.input_size, self.output_size, self.num_knots)) 92 | 93 | def extend_knots(self): 94 | """Add new knots to the spline.""" 95 | self.num_knots += 1 96 | self.knots = nn.Parameter(torch.linspace(-1, 1, self.num_knots)) 97 | self.coeffs = nn.Parameter(torch.randn(self.input_size, self.output_size, self.num_knots)) 98 | 99 | def refine_coeffs(self): 100 | """Adjust coefficients for local refinement.""" 101 | with torch.no_grad(): 102 | self.coeffs += torch.randn_like(self.coeffs) * 0.01 103 | 104 | def increase_capacity(self): 105 | """Increase the capacity of the layer.""" 106 | with torch.no_grad(): 107 | self.coeffs = nn.Parameter(torch.cat([self.coeffs, torch.randn(self.input_size, self.output_size, self.num_knots)], dim=1)) 108 | 109 | def moderate_update(self): 110 | """Default update routine.""" 111 | self.refine_coeffs() 112 | 113 | 114 | class AdaptiveKANNetwork(nn.Module): 115 | def __init__(self, input_size, hidden_sizes, output_size, num_layers=3, temperature=0.666): 116 | """ 117 | Initialize a multi-layer KAN network with adaptive layers. 118 | 119 | Args: 120 | input_size (int): Number of input features. 121 | hidden_sizes (list of int): List of hidden sizes for each layer. 122 | output_size (int): Number of output features. 123 | num_layers (int): Number of KAN layers. 124 | temperature (float): Temperature parameter for adaptive updates. 
125 | """ 126 | super(AdaptiveKANNetwork, self).__init__() 127 | self.input_size = input_size 128 | self.hidden_sizes = hidden_sizes 129 | self.output_size = output_size 130 | self.num_layers = num_layers 131 | self.temperature = temperature 132 | 133 | # Initialize KAN layers 134 | self.layers = nn.ModuleList() 135 | in_size = input_size 136 | for hidden_size in hidden_sizes: 137 | self.layers.append(AdaptiveKANLayer(in_size, hidden_size, num_knots=10, temperature=temperature)) 138 | in_size = hidden_size 139 | self.output_layer = AdaptiveKANLayer(in_size, output_size, num_knots=10, temperature=temperature) 140 | 141 | def forward(self, x): 142 | """ 143 | Forward pass through the KAN network. 144 | 145 | Args: 146 | x (torch.Tensor): Input tensor of shape (batch_size, input_size). 147 | 148 | Returns: 149 | torch.Tensor: Network output of shape (batch_size, output_size). 150 | """ 151 | for layer in self.layers: 152 | x = layer(x) 153 | x = F.relu(x) 154 | return self.output_layer(x) 155 | 156 | def adaptive_train_step(self, x, y, optimizer): 157 | """ 158 | Single training step with adaptive updates. 159 | 160 | Args: 161 | x (torch.Tensor): Input tensor. 162 | y (torch.Tensor): Target tensor. 163 | optimizer (torch.optim.Optimizer): Optimizer for updating parameters. 164 | """ 165 | optimizer.zero_grad() 166 | output = self.forward(x) 167 | loss = F.mse_loss(output, y) 168 | 169 | # Calculate entropy and variance for adaptive updates 170 | entropy = torch.mean(torch.stack([layer.calculate_entropy(layer.coeffs) for layer in self.layers])) 171 | variance = torch.var(torch.stack([layer.calculate_entropy(layer.coeffs) for layer in self.layers])) 172 | 173 | # Adaptive updates 174 | for layer in self.layers: 175 | layer.adaptive_update(entropy, variance) 176 | 177 | # Backpropagation and optimization step 178 | loss.backward() 179 | optimizer.step() 180 | return loss.item() 181 | 182 | 183 | # Example Usage 184 | if __name__ == "__main__": 185 | # Define input and output sizes 186 | input_size = 10 187 | hidden_sizes = [20, 30] 188 | output_size = 5 189 | 190 | # Create the network and optimizer 191 | model = AdaptiveKANNetwork(input_size, hidden_sizes, output_size, num_layers=3) 192 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 193 | 194 | # Example data 195 | x = torch.randn(32, input_size) 196 | y = torch.randn(32, output_size) 197 | 198 | # Training step 199 | for epoch in range(100): 200 | loss = model.adaptive_train_step(x, y, optimizer) 201 | print(f"Epoch {epoch+1}, Loss: {loss}") 202 | -------------------------------------------------------------------------------- /dummy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from sklearn.decomposition import PCA 4 | import plotly.graph_objs as go 5 | import numpy as np 6 | from tqdm import tqdm 7 | import json 8 | 9 | # 1. 
User-friendly Configuration Setup for Macroprocessor-Like Inference 10 | 11 | class UserConfig: 12 | """Interface to set up and customize model configurations and preferences for a macroprocessor-like model.""" 13 | def __init__(self, config_file=None): 14 | self.config = { 15 | "max_length": 20, 16 | "initial_weights": { 17 | "entropy": 1.0, 18 | "varentropy": 0.5, 19 | "kl_div": 0.3, 20 | "perplexity": 0.2 21 | }, 22 | "visualization_frequency": 5, 23 | "logging_level": "detailed", 24 | "interactive_visuals": True, 25 | "precision": "float16", # Use float16 or bfloat16 for inference 26 | "kv_cache_enabled": True # Enable smart KV-caching 27 | } 28 | if config_file: 29 | self.load_config(config_file) 30 | 31 | def load_config(self, file_path): 32 | """Load configuration from a JSON file.""" 33 | with open(file_path, 'r') as f: 34 | self.config = json.load(f) 35 | 36 | def save_config(self, file_path): 37 | """Save the current configuration to a JSON file.""" 38 | with open(file_path, 'w') as f: 39 | json.dump(self.config, f, indent=4) 40 | 41 | def update_config(self, key, value): 42 | """Update a specific configuration setting.""" 43 | self.config[key] = value 44 | 45 | def get_config(self): 46 | """Return the current configuration.""" 47 | return self.config 48 | 49 | 50 | # 2. Token Selection with Efficient Memory Management (Macroprocessor-like Operations) 51 | 52 | def select_token_with_weights(logits, vertex_movements, loss_weighting_system, precision="float16"): 53 | """Efficient token selection using entropy, varentropy, and configurable loss weights with precision support.""" 54 | if precision == "float16": 55 | logits = logits.half() # Switch to float16 for inference speedup 56 | elif precision == "bfloat16": 57 | logits = logits.bfloat16() # Alternatively, use bfloat16 58 | 59 | # Compute multiple losses 60 | entropy = calculate_entropy(logits) 61 | varentropy = calculate_varentropy(entropy) 62 | kl_div = calculate_kl_divergence(logits) 63 | perplexity = calculate_perplexity(logits) 64 | 65 | # Log losses to adjust weights dynamically 66 | loss_weighting_system.log_losses(entropy, varentropy, kl_div, perplexity) 67 | 68 | # Adjust weights based on historical performance 69 | loss_weighting_system.adjust_weights() 70 | weights = loss_weighting_system.get_weights() 71 | 72 | # Adjust logits by the weighted sum of losses and vertex movements 73 | adjusted_logits = logits - ( 74 | weights['entropy'] * entropy + 75 | weights['varentropy'] * varentropy + 76 | weights['kl_div'] * kl_div + 77 | weights['perplexity'] * perplexity 78 | ).unsqueeze(-1) 79 | 80 | # Apply vertex movement strategy 81 | adjusted_logits += vertex_movements 82 | 83 | # Sample from adjusted probabilities 84 | probs = F.softmax(adjusted_logits, dim=-1) 85 | selected_token = torch.multinomial(probs, 1) 86 | 87 | return selected_token, adjusted_logits 88 | 89 | 90 | # 3. 
Improved Visualization with Token-Level Progress 91 | 92 | def plot_interactive_4d_space(hidden_states, entropies, time_steps): 93 | """Optimized 3D projection with token-level interactivity for macroprocessor-like inference.""" 94 | pca = PCA(n_components=3) 95 | fig_data = [] 96 | 97 | for i, (hs, entropy, time_step) in enumerate(zip(hidden_states, entropies, time_steps)): 98 | projected_hs = pca.fit_transform(hs.squeeze(0).detach().cpu().numpy()) 99 | entropy_colors = (entropy.detach().cpu().numpy() - np.min(entropy.detach().cpu().numpy())) / \ 100 | (np.max(entropy.detach().cpu().numpy()) - np.min(entropy.detach().cpu().numpy())) 101 | 102 | scatter = go.Scatter3d( 103 | x=projected_hs[:, 0], y=projected_hs[:, 1], z=projected_hs[:, 2], 104 | mode='markers', 105 | marker=dict(size=5, color=entropy_colors, colorscale='Viridis', opacity=0.8), 106 | name=f"Step {time_step}", 107 | text=[f"Step: {time_step}, Entropy: {entropy_val}" for entropy_val in entropy.detach().cpu().numpy()] 108 | ) 109 | fig_data.append(scatter) 110 | 111 | layout = go.Layout( 112 | title="Token-wise 4D Space Travel with Layer Progression", 113 | scene=dict( 114 | xaxis_title='PCA 1', 115 | yaxis_title='PCA 2', 116 | zaxis_title='PCA 3', 117 | ), 118 | hovermode='closest', 119 | updatemenus=[dict( 120 | type="buttons", 121 | showactive=False, 122 | buttons=[dict(label="Play", 123 | method="animate", 124 | args=[None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True}])] 125 | )] 126 | ) 127 | 128 | fig = go.Figure(data=fig_data, layout=layout) 129 | fig.show() 130 | 131 | 132 | # 4. Efficient Memory and KV-Caching for Faster Macroprocessor-Style Inference 133 | 134 | def inference_with_kv_caching(model, input_ids, user_config): 135 | config = user_config.get_config() 136 | precision = config["precision"] 137 | kv_cache_enabled = config["kv_cache_enabled"] 138 | 139 | entropies, varentropies, kl_divs, perplexities = [], [], [], [] 140 | vertex_movements = torch.zeros(input_ids.size(0), model.config.vocab_size).to(input_ids.device) 141 | loss_weighting_system = ConfigurableLossWeighting(user_config) 142 | time_steps = [] 143 | 144 | # Enable caching of keys and values for attention layers 145 | past_kv_cache = None if not kv_cache_enabled else {} 146 | 147 | with tqdm(total=config["max_length"], desc="Macro Inference Progress", unit="step") as progress: 148 | for step in range(config["max_length"]): 149 | if kv_cache_enabled and past_kv_cache: 150 | # Use past key-value cache to speed up inference 151 | model_kwargs = {"past_key_values": past_kv_cache} 152 | else: 153 | model_kwargs = {} 154 | 155 | logits, past_kv_cache = model(input_ids, **model_kwargs)[:2] # Retrieve past_kv_cache for the next step 156 | 157 | logits = logits[:, -1, :] # Logits for the last token 158 | 159 | # Efficient token selection 160 | next_token, adjusted_logits = select_token_with_weights(logits, vertex_movements, loss_weighting_system, precision) 161 | 162 | # Append the selected token to input_ids 163 | input_ids = torch.cat([input_ids, next_token], dim=1) 164 | 165 | # Log losses and update token trajectory 166 | entropy = calculate_entropy(logits) 167 | varentropy = calculate_varentropy(entropy) 168 | kl_div = calculate_kl_divergence(logits) 169 | perplexity = calculate_perplexity(logits) 170 | 171 | entropies.append(entropy) 172 | varentropies.append(varentropy) 173 | kl_divs.append(kl_div) 174 | perplexities.append(perplexity) 175 | time_steps.append(step) 176 | 177 | progress.update(1) 178 | 179 | # Once complete, 
visualize the token-wise space travel 180 | plot_interactive_4d_space(hidden_states, entropies, time_steps) 181 | 182 | return input_ids 183 | 184 | 185 | # 5. Example Usage 186 | 187 | if __name__ == "__main__": 188 | from transformers import AutoTokenizer, AutoModelForCausalLM 189 | 190 | # User configuration for macroprocessor-style inference 191 | user_config = UserConfig() 192 | 193 | model = AutoModelForCausalLM.from_pretrained("your-model-path").to("cuda") 194 | tokenizer = AutoTokenizer.from_pretrained("your-model-path") 195 | 196 | # Input sequence for inference 197 | input_ids = tokenizer("Your input text", return_tensors="pt").input_ids.to("cuda") 198 | 199 | # Run inference with efficient KV caching and float16 precision 200 | output_ids = inference_with_kv_caching(model, input_ids, user_config) 201 | 202 | # Decode the output tokens to text 203 | output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) 204 | print("Generated text:", output_text) 205 | -------------------------------------------------------------------------------- /kan_gui.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import scrolledtext, messagebox, ttk, filedialog 3 | import threading 4 | import logging 5 | import traceback 6 | from llama_32_1b_tool import LLaMA32TensorRTTool 7 | import matplotlib.pyplot as plt 8 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 9 | import warnings 10 | import torch 11 | from functools import partial 12 | import asyncio 13 | import queue 14 | import re 15 | 16 | # -------------------- Logging Configuration -------------------- 17 | 18 | class LogFilter(logging.Filter): 19 | def __init__(self, ignore_patterns=None): 20 | super().__init__() 21 | self.ignore_patterns = ignore_patterns or [] 22 | 23 | def filter(self, record): 24 | return not any(pattern in record.getMessage() for pattern in self.ignore_patterns) 25 | 26 | def setup_logging(): 27 | logger = logging.getLogger() 28 | logger.setLevel(logging.DEBUG) 29 | 30 | file_handler = logging.FileHandler('llama_tool.log', mode='a') 31 | file_handler.setLevel(logging.DEBUG) 32 | file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 33 | file_handler.setFormatter(file_formatter) 34 | 35 | console_handler = logging.StreamHandler() 36 | console_handler.setLevel(logging.WARNING) 37 | console_formatter = logging.Formatter('%(levelname)s - %(message)s') 38 | console_handler.setFormatter(console_formatter) 39 | 40 | logger.addHandler(file_handler) 41 | logger.addHandler(console_handler) 42 | 43 | ignore_patterns = [ 44 | "matplotlib", 45 | "PIL.PngImagePlugin", 46 | "expandable_segments not supported", 47 | "weights_only", 48 | "half", 49 | "train_kan_step -", 50 | "Torch was not compiled with flash attention." 51 | "1Torch was not compiled with flash attention." 52 | ".*Torch was not compiled with flash attention.*" 53 | "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead." 
54 | ] 55 | 56 | console_handler.addFilter(LogFilter(ignore_patterns)) 57 | 58 | warnings.filterwarnings("ignore", category=UserWarning, message="Torch was not compiled with flash attention.*") 59 | warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`.*") 60 | 61 | logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING) 62 | logging.getLogger('matplotlib.pyplot').setLevel(logging.WARNING) 63 | logging.getLogger('PIL.PngImagePlugin').setLevel(logging.WARNING) 64 | 65 | setup_logging() 66 | 67 | # -------------------- GUI Class -------------------- 68 | 69 | class LLAMA32GUI: 70 | def __init__(self, master): 71 | self.master = master 72 | master.title("LLaMA 3.2 1B Instruct KAN Interaction") 73 | 74 | self.notebook = ttk.Notebook(master) 75 | self.notebook.pack(fill=tk.BOTH, expand=True) 76 | 77 | self.main_tab = ttk.Frame(self.notebook) 78 | self.notebook.add(self.main_tab, text="Main") 79 | 80 | self.graphs_tab = ttk.Frame(self.notebook) 81 | self.notebook.add(self.graphs_tab, text="Graphs") 82 | 83 | self.setup_main_tab() 84 | self.setup_graphs_tab() 85 | 86 | self.llama_tool = None 87 | self.is_first_message = True 88 | self.response_queue = queue.Queue() 89 | 90 | # Create a new event loop for the background thread 91 | self.loop = asyncio.new_event_loop() 92 | asyncio.set_event_loop(self.loop) 93 | 94 | # Start the background thread 95 | self.background_thread = threading.Thread(target=self.run_async_loop, daemon=True) 96 | self.background_thread.start() 97 | 98 | self.initialize_tool() 99 | 100 | self.llama_tool_ready = threading.Event() # Add an Event for synchronization 101 | self.initialize_tool() 102 | 103 | def run_async_loop(self): 104 | asyncio.set_event_loop(self.loop) 105 | self.loop.run_forever() 106 | 107 | def setup_main_tab(self): 108 | self.main_tab.columnconfigure(0, weight=1) 109 | self.main_tab.columnconfigure(1, weight=1) 110 | self.main_tab.rowconfigure(0, weight=1) 111 | 112 | self.chat_display = scrolledtext.ScrolledText(self.main_tab, state='disabled', height=20, wrap=tk.WORD) 113 | self.chat_display.grid(row=0, column=0, columnspan=2, padx=10, pady=10, sticky='nsew') 114 | 115 | input_frame = ttk.Frame(self.main_tab) 116 | input_frame.grid(row=1, column=0, columnspan=2, padx=10, pady=5, sticky='ew') 117 | input_frame.columnconfigure(0, weight=1) 118 | 119 | self.input_field = ttk.Entry(input_frame, width=70) 120 | self.input_field.grid(row=0, column=0, padx=(0, 5), pady=5, sticky='ew') 121 | self.input_field.bind('', self.send_message) 122 | 123 | self.send_button = ttk.Button(input_frame, text="Send", command=self.send_message) 124 | self.send_button.grid(row=0, column=1, padx=(5, 0), pady=5) 125 | 126 | status_frame = ttk.Frame(self.main_tab) 127 | status_frame.grid(row=2, column=0, columnspan=2, padx=10, pady=5, sticky='ew') 128 | status_frame.columnconfigure(0, weight=1) 129 | status_frame.columnconfigure(1, weight=1) 130 | 131 | self.status_label = ttk.Label(status_frame, text="Status: Initializing...") 132 | self.status_label.grid(row=0, column=0, padx=5, pady=2, sticky='w') 133 | 134 | self.time_label = ttk.Label(status_frame, text="Current Time: N/A") 135 | self.time_label.grid(row=0, column=1, padx=5, pady=2, sticky='e') 136 | 137 | self.emotion_label = ttk.Label(self.main_tab, text="Emotion: N/A") 138 | self.emotion_label.grid(row=3, column=0, columnspan=2, padx=10, pady=5, sticky='w') 139 | 140 | buttons_frame = ttk.Frame(self.main_tab) 141 | buttons_frame.grid(row=4, 
column=0, columnspan=2, padx=10, pady=5, sticky='ew') 142 | buttons_frame.columnconfigure(0, weight=1) 143 | buttons_frame.columnconfigure(1, weight=1) 144 | 145 | self.sleep_button = ttk.Button(buttons_frame, text="Sleep", command=self.sleep_kan, state='disabled') 146 | self.sleep_button.grid(row=0, column=0, padx=5, pady=2, sticky='w') 147 | 148 | self.save_state_button = ttk.Button(buttons_frame, text="Save KAN State", command=self.save_kan_state, state='disabled') 149 | self.save_state_button.grid(row=0, column=1, padx=5, pady=2, sticky='e') 150 | 151 | feedback_frame = ttk.LabelFrame(self.main_tab, text="Submit Feedback") 152 | feedback_frame.grid(row=5, column=0, columnspan=2, padx=10, pady=10, sticky='ew') 153 | feedback_frame.columnconfigure(1, weight=1) 154 | feedback_frame.columnconfigure(3, weight=1) 155 | 156 | pleasure_label = ttk.Label(feedback_frame, text="Pleasure:") 157 | pleasure_label.grid(row=0, column=0, padx=5, pady=5, sticky='w') 158 | self.pleasure_slider = ttk.Scale(feedback_frame, from_=-1.0, to=1.0, orient=tk.HORIZONTAL) 159 | self.pleasure_slider.set(0.0) 160 | self.pleasure_slider.grid(row=0, column=1, padx=5, pady=5, sticky='ew') 161 | 162 | arousal_label = ttk.Label(feedback_frame, text="Arousal:") 163 | arousal_label.grid(row=1, column=0, padx=5, pady=5, sticky='w') 164 | self.arousal_slider = ttk.Scale(feedback_frame, from_=-1.0, to=1.0, orient=tk.HORIZONTAL) 165 | self.arousal_slider.set(0.0) 166 | self.arousal_slider.grid(row=1, column=1, padx=5, pady=5, sticky='ew') 167 | 168 | compliance_label = ttk.Label(feedback_frame, text="Compliance Rating:") 169 | compliance_label.grid(row=0, column=2, padx=5, pady=5, sticky='w') 170 | self.compliance_slider = ttk.Scale(feedback_frame, from_=0.0, to=1.0, orient=tk.HORIZONTAL) 171 | self.compliance_slider.set(0.5) 172 | self.compliance_slider.grid(row=0, column=3, padx=5, pady=5, sticky='ew') 173 | 174 | self.feedback_button = ttk.Button(feedback_frame, text="Submit Feedback", command=self.submit_feedback, state='disabled') 175 | self.feedback_button.grid(row=1, column=3, padx=5, pady=5, sticky='e') 176 | 177 | action_buttons_frame = ttk.Frame(self.main_tab) 178 | action_buttons_frame.grid(row=6, column=0, columnspan=2, padx=10, pady=5, sticky='ew') 179 | action_buttons_frame.columnconfigure(0, weight=1) 180 | action_buttons_frame.columnconfigure(1, weight=1) 181 | 182 | self.load_state_button = ttk.Button(action_buttons_frame, text="Load Saved State", command=self.load_saved_state) 183 | self.load_state_button.grid(row=0, column=0, padx=5, pady=2, sticky='w') 184 | 185 | self.new_conversation_button = ttk.Button(action_buttons_frame, text="Start New Conversation", command=self.start_new_conversation) 186 | self.new_conversation_button.grid(row=0, column=1, padx=5, pady=2, sticky='e') 187 | 188 | def setup_graphs_tab(self): 189 | self.fig, self.axes = plt.subplots(3, 2, figsize=(15, 15)) 190 | self.fig.tight_layout(pad=4.0) 191 | self.canvas = FigureCanvasTkAgg(self.fig, master=self.graphs_tab) 192 | self.canvas.draw() 193 | self.canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True) 194 | 195 | def initialize_tool(self): 196 | def init(): 197 | try: 198 | self.llama_tool = LLaMA32TensorRTTool() 199 | self.llama_tool_ready.set() # Signal that initialization is complete 200 | self.master.after(0, self.load_or_initialize_conversation) # Call this *after* the tool is ready 201 | except Exception as e: 202 | error_msg = f"Error initializing tool: {str(e)}\n{traceback.format_exc()}" 203 | 
self.master.after(0, lambda: self.display_error(error_msg)) 204 | 205 | threading.Thread(target=init, daemon=True).start() 206 | 207 | def load_or_initialize_conversation(self): 208 | if not self.llama_tool_ready.is_set(): # Check readiness here too 209 | self.master.after(100, self.load_or_initialize_conversation) # Check again later 210 | return 211 | try: 212 | if self.llama_tool.load_base_state(): 213 | self.display_message("Previous conversation state loaded.") 214 | self.display_message("You can continue the conversation or start a new one using the 'Start New Conversation' button.") 215 | self.is_first_message = False 216 | self.update_status("Ready") 217 | self.update_time() 218 | self.update_emotion_label() 219 | self.save_state_button.config(state='normal') 220 | self.feedback_button.config(state='normal') 221 | sleep_info = self.llama_tool.check_sleep_status() 222 | self.sleep_button.config(state='normal' if sleep_info else 'disabled') 223 | self.update_loss_plot() 224 | else: 225 | self.display_message("No previous conversation found. Please provide a character description to start.") 226 | self.is_first_message = True 227 | self.update_status("Awaiting character description") 228 | except Exception as e: 229 | self.display_error(f"Error loading or initializing conversation: {str(e)}\n{traceback.format_exc()}") 230 | 231 | def send_message(self, event=None): 232 | user_input = self.input_field.get().strip() 233 | if not user_input: 234 | return 235 | self.input_field.delete(0, tk.END) 236 | self.display_message(f"You: {user_input}") 237 | 238 | if not self.llama_tool_ready.is_set(): # Check if the tool is ready 239 | self.display_message("Tool is still initializing. Please wait.") 240 | return 241 | 242 | self.send_button.config(state='disabled') 243 | self.update_status("Generating response...") 244 | 245 | # Use the event loop to run the coroutine 246 | asyncio.run_coroutine_threadsafe(self.process_response(user_input), self.loop) 247 | 248 | async def process_response(self, user_input): 249 | try: 250 | response, is_refusal = await self.generate_response(user_input) 251 | self.master.after(0, self.display_message, f"AI: {response}") 252 | self.master.after(0, self.send_button.config, {'state': 'normal'}) 253 | self.master.after(0, self.update_status, "Ready") 254 | self.master.after(0, self.update_emotion_label) 255 | self.master.after(0, self.update_time) 256 | self.master.after(0, self.update_loss_plot) 257 | except Exception as e: 258 | self.master.after(0, self.display_error, f"Error processing response: {str(e)}") 259 | 260 | async def generate_response(self, user_input): 261 | try: 262 | interaction_result = await self.loop.run_in_executor(None, self.llama_tool.interact, user_input) 263 | response = interaction_result['response'] 264 | 265 | if not response.strip(): 266 | return "I apologize, but I couldn't generate a valid response. Could you please rephrase your input?", True 267 | 268 | response = self.clean_response(response) 269 | 270 | return response.strip(), interaction_result.get('is_refusal', False) 271 | except Exception as e: 272 | logging.error(f"Error generating response: {str(e)}") 273 | logging.error(traceback.format_exc()) 274 | return "An error occurred while generating the response. 
Please try again.", True 275 | 276 | def clean_response(self, response): 277 | response = re.sub(r'(Assistant:|Human:).*', '', response) 278 | response = re.sub(r'\*.*?\*', '', response) 279 | return response 280 | 281 | def start_new_conversation(self): 282 | if messagebox.askyesno("New Conversation", "Are you sure you want to start a new conversation? This will erase the current state."): 283 | try: 284 | self.llama_tool = LLaMA32TensorRTTool() 285 | self.is_first_message = True 286 | self.chat_display.configure(state='normal') 287 | self.chat_display.delete('1.0', tk.END) 288 | self.chat_display.configure(state='disabled') 289 | self.display_message("New conversation started. Please provide a character description.") 290 | self.update_status("Awaiting character description") 291 | self.update_emotion_label("N/A") 292 | self.time_label.config(text="Current Time: N/A") 293 | self.save_state_button.config(state='disabled') 294 | self.feedback_button.config(state='disabled') 295 | self.sleep_button.config(state='disabled') 296 | self.llama_tool.interaction_results = [] 297 | self.llama_tool.refusal_history = [] 298 | self.clear_graphs() 299 | except Exception as e: 300 | self.display_error(f"Error starting new conversation: {str(e)}\n{traceback.format_exc()}") 301 | 302 | def sleep_kan(self): 303 | if self.llama_tool: 304 | try: 305 | message = self.llama_tool.perform_sleep() 306 | self.display_message(message) 307 | self.update_time() 308 | self.update_emotion_label() 309 | self.sleep_button.config(state='disabled') 310 | self.clear_graphs() 311 | except Exception as e: 312 | self.display_error(f"Error during sleep operation: {str(e)}\n{traceback.format_exc()}") 313 | 314 | def save_kan_state(self): 315 | if self.llama_tool: 316 | try: 317 | self.llama_tool.save_base_state() 318 | self.display_message("KAN state saved.") 319 | except Exception as e: 320 | self.display_error(f"Error saving KAN state: {str(e)}\n{traceback.format_exc()}") 321 | 322 | def load_saved_state(self): 323 | if self.llama_tool: 324 | try: 325 | filename = filedialog.askopenfilename( 326 | initialdir=self.llama_tool.kan_state_dir, 327 | title="Select KAN State to Load", 328 | filetypes=[("PyTorch State", "*.pt")] 329 | ) 330 | if filename: 331 | if self.llama_tool.load_base_state(): 332 | self.display_message(f"KAN state loaded: {filename}") 333 | self.update_time() 334 | self.update_emotion_label() 335 | self.is_first_message = False 336 | self.save_state_button.config(state='normal') 337 | self.feedback_button.config(state='normal') 338 | sleep_info = self.llama_tool.check_sleep_status() 339 | self.sleep_button.config(state='normal' if sleep_info else 'disabled') 340 | self.update_loss_plot() 341 | else: 342 | self.display_message("Failed to load KAN state. 
Please try again.") 343 | except Exception as e: 344 | self.display_error(f"Error loading KAN state: {str(e)}\n{traceback.format_exc()}") 345 | 346 | def submit_feedback(self): 347 | if self.llama_tool: 348 | try: 349 | pleasure = self.pleasure_slider.get() 350 | arousal = self.arousal_slider.get() 351 | compliance = self.compliance_slider.get() 352 | 353 | self.llama_tool.update_emotional_state([pleasure, arousal]) 354 | 355 | self.display_message(f"Feedback submitted: Pleasure={pleasure:.2f}, Arousal={arousal:.2f}, Compliance={compliance:.2f}") 356 | self.pleasure_slider.set(0.0) 357 | self.arousal_slider.set(0.0) 358 | self.compliance_slider.set(0.5) 359 | except Exception as e: 360 | self.display_error(f"Error submitting feedback: {str(e)}\n{traceback.format_exc()}") 361 | else: 362 | self.display_message("Tool not initialized. Please wait.") 363 | 364 | def update_emotion_label(self, emotion=None): 365 | if emotion is None and self.llama_tool: 366 | try: 367 | emotion = self.llama_tool.emotional_state.get_emotion() 368 | except AttributeError as ae: 369 | emotion = "N/A" 370 | self.display_message("Error retrieving emotion: Emotional state not initialized.") 371 | logging.error(f"Error retrieving emotion: {str(ae)}\n{traceback.format_exc()}") 372 | except Exception as e: 373 | emotion = "N/A" 374 | self.display_message(f"Error retrieving emotion: {str(e)}") 375 | logging.error(f"Error retrieving emotion: {str(e)}\n{traceback.format_exc()}") 376 | self.emotion_label.config(text=f"Emotion: {emotion}") 377 | 378 | def update_status(self, status): 379 | self.status_label.config(text=f"Status: {status}") 380 | 381 | def update_time(self, time=None): 382 | try: 383 | if time is not None: 384 | time_float = float(time) 385 | self.time_label.config(text=f"Current Time: {time_float:.2f}") 386 | else: 387 | current_time = self.llama_tool.day_cycle.get_time_of_day() 388 | self.time_label.config(text=f"Current Time: {current_time:.2f}") 389 | except ValueError: 390 | self.time_label.config(text=f"Current Time: {time}") 391 | 392 | def display_message(self, message): 393 | self.chat_display.configure(state='normal') 394 | self.chat_display.insert(tk.END, message + "\n") 395 | self.chat_display.configure(state='disabled') 396 | self.chat_display.see(tk.END) 397 | logging.info(message) 398 | 399 | def display_error(self, message): 400 | self.display_message(message) 401 | self.update_status("Error") 402 | messagebox.showerror("Error", message) 403 | logging.error(message) 404 | 405 | def update_loss_plot(self): 406 | for ax in self.axes.flat: 407 | ax.clear() 408 | 409 | if self.llama_tool.training_losses and self.llama_tool.validation_losses: 410 | self.axes[0, 0].plot(self.llama_tool.training_losses, label='LM Loss') 411 | self.axes[0, 0].plot(self.llama_tool.validation_losses, label='Validation Loss') 412 | self.axes[0, 0].legend() 413 | self.axes[0, 0].set_title('Language Modeling and Validation Loss') 414 | self.axes[0, 0].set_xlabel('Interactions') 415 | self.axes[0, 0].set_ylabel('Loss') 416 | else: 417 | self.axes[0, 0].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 418 | self.axes[0, 0].set_title('Language Modeling and Validation Loss') 419 | 420 | refusal_losses = [result['refusal_loss'] for result in self.llama_tool.interaction_results] 421 | if refusal_losses: 422 | self.axes[0, 1].plot(refusal_losses, label='Refusal Loss', color='orange') 423 | self.axes[0, 1].legend() 424 | self.axes[0, 1].set_title('Refusal Loss Over Time') 425 | 
self.axes[0, 1].set_xlabel('Interactions') 426 | self.axes[0, 1].set_ylabel('Loss') 427 | else: 428 | self.axes[0, 1].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 429 | self.axes[0, 1].set_title('Refusal Loss Over Time') 430 | 431 | if self.llama_tool.training_losses and self.llama_tool.validation_losses: 432 | loss_diff = [v - t for v, t in zip(self.llama_tool.validation_losses, self.llama_tool.training_losses)] 433 | self.axes[1, 0].plot(loss_diff, label='Val Loss - LM Loss', color='green') 434 | self.axes[1, 0].axhline(y=0, color='red', linestyle='--') 435 | self.axes[1, 0].legend() 436 | self.axes[1, 0].set_title('Overfitting Indicator') 437 | self.axes[1, 0].set_xlabel('Interactions') 438 | self.axes[1, 0].set_ylabel('Loss Difference') 439 | else: 440 | self.axes[1, 0].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 441 | self.axes[1, 0].set_title('Overfitting Indicator') 442 | 443 | refusal_history = self.llama_tool.refusal_history 444 | if refusal_history: 445 | window_size = 100 446 | refusal_rate = [] 447 | for i in range(1, len(refusal_history) + 1): 448 | window = refusal_history[max(0, i - window_size):i] 449 | rate = sum(window) / len(window) 450 | refusal_rate.append(rate) 451 | self.axes[1, 1].plot(refusal_rate, label='Refusal Rate', color='purple') 452 | self.axes[1, 1].set_ylim(0, 1) 453 | self.axes[1, 1].legend() 454 | self.axes[1, 1].set_title('Refusal Rate (100-interaction moving average)') 455 | self.axes[1, 1].set_xlabel('Interactions') 456 | self.axes[1, 1].set_ylabel('Refusal Rate') 457 | else: 458 | self.axes[1, 1].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 459 | self.axes[1, 1].set_title('Refusal Rate (100-interaction moving average)') 460 | 461 | iterations_history = [result['iterations'] for result in self.llama_tool.interaction_results] 462 | if iterations_history: 463 | self.axes[2, 0].plot(iterations_history, label='Iterations', color='brown') 464 | self.axes[2, 0].set_ylim(1, max(iterations_history) + 1) 465 | self.axes[2, 0].legend() 466 | self.axes[2, 0].set_title('Iterations per Response') 467 | self.axes[2, 0].set_xlabel('Interactions') 468 | self.axes[2, 0].set_ylabel('Iterations') 469 | else: 470 | self.axes[2, 0].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 471 | self.axes[2, 0].set_title('Iterations per Response') 472 | 473 | self.axes[2, 1].axis('off') 474 | 475 | self.canvas.draw() 476 | 477 | def clear_graphs(self): 478 | for ax in self.axes.flat: 479 | ax.clear() 480 | self.canvas.draw() 481 | 482 | def on_closing(self): # Make sure to join the background threads 483 | if hasattr(self, 'loop') and self.loop.is_running(): 484 | self.loop.call_soon_threadsafe(self.loop.stop) 485 | 486 | if hasattr(self, 'background_thread'): 487 | self.background_thread.join(timeout=1.0) # Wait for thread to finish 488 | 489 | self.master.destroy() # Destroy the main window 490 | 491 | def main(): 492 | root = tk.Tk() 493 | root.geometry("1000x800") 494 | gui = LLAMA32GUI(root) 495 | root.protocol("WM_DELETE_WINDOW", gui.on_closing) 496 | root.mainloop() 497 | 498 | if __name__ == "__main__": 499 | main() 500 | -------------------------------------------------------------------------------- /llama_32_1b_toolold10_5_24.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import 
torch.nn.functional as F 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 5 | from accelerate import init_empty_weights, load_checkpoint_and_dispatch 6 | import logging 7 | from pathlib import Path 8 | import json 9 | import numpy as np 10 | from collections import deque 11 | from datetime import datetime 12 | import time 13 | import traceback 14 | import gc 15 | import os 16 | import sys 17 | import warnings 18 | import re 19 | from torch.amp import GradScaler 20 | 21 | # -------------------- Logging Configuration -------------------- 22 | 23 | class LogFilter(logging.Filter): 24 | def __init__(self, ignore_patterns=None): 25 | super().__init__() 26 | self.ignore_patterns = ignore_patterns or [] 27 | 28 | def filter(self, record): 29 | return not any(pattern in record.getMessage() for pattern in self.ignore_patterns) 30 | 31 | def setup_logging(): 32 | logger = logging.getLogger() 33 | logger.setLevel(logging.DEBUG) 34 | 35 | file_handler = logging.FileHandler('llama_tool.log', mode='a') 36 | file_handler.setLevel(logging.DEBUG) 37 | file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 38 | file_handler.setFormatter(file_formatter) 39 | 40 | console_handler = logging.StreamHandler() 41 | console_handler.setLevel(logging.WARNING) 42 | console_formatter = logging.Formatter('%(levelname)s - %(message)s') 43 | console_handler.setFormatter(console_formatter) 44 | 45 | logger.addHandler(file_handler) 46 | logger.addHandler(console_handler) 47 | 48 | ignore_patterns = [ 49 | "matplotlib", 50 | "PIL.PngImagePlugin", 51 | "expandable_segments not supported", 52 | "weights_only", 53 | "half", 54 | "train_kan_step -", 55 | "Torch was not compiled with flash attention." 56 | ] 57 | 58 | console_handler.addFilter(LogFilter(ignore_patterns)) 59 | 60 | warnings.filterwarnings("ignore", category=UserWarning, message="Torch was not compiled with flash attention.*") 61 | warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`.*") 62 | 63 | logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING) 64 | logging.getLogger('matplotlib.pyplot').setLevel(logging.WARNING) 65 | logging.getLogger('PIL.PngImagePlugin').setLevel(logging.WARNING) 66 | 67 | setup_logging() 68 | 69 | # -------------------- Helper Functions and Classes -------------------- 70 | 71 | def convert_tensors_to_half(inputs): 72 | return { 73 | k: v.half() if isinstance(v, torch.Tensor) and v.dtype in [torch.float16, torch.float32] else v 74 | for k, v in inputs.items() 75 | } 76 | 77 | def convert_tensors_to_float(inputs): 78 | return { 79 | k: v.float() if isinstance(v, torch.Tensor) and v.dtype in [torch.float16, torch.float32] else v 80 | for k, v in inputs.items() 81 | } 82 | 83 | class EmotionalState: 84 | def __init__(self, dimensions=("pleasure", "arousal"), initial_position=None, device="cuda"): 85 | self.dimensions = dimensions 86 | self.device = device 87 | self.position = torch.tensor( 88 | initial_position if initial_position else [0.0] * len(dimensions), 89 | device=device, 90 | dtype=torch.float32 91 | ).unsqueeze(0) 92 | self.velocity = torch.zeros(1, len(dimensions), device=device, dtype=torch.float32) 93 | 94 | def update(self, feedback, max_speed=0.1): 95 | feedback_vector = torch.as_tensor(feedback, device=self.device, dtype=torch.float32) 96 | if feedback_vector.dim() == 1: 97 | feedback_vector = feedback_vector.unsqueeze(0) 98 | if feedback_vector.size(0) != self.position.size(0): 99 | 
feedback_vector = feedback_vector.expand(self.position.size(0), -1) 100 | 101 | self.velocity += feedback_vector * 0.1 + torch.randn_like(self.velocity) * 0.01 102 | self.velocity = torch.clamp(self.velocity, -max_speed, max_speed) 103 | self.position += self.velocity 104 | norm = torch.norm(self.position, dim=1, keepdim=True) 105 | self.position = torch.where(norm > 1, self.position / norm, self.position) 106 | 107 | if torch.isnan(self.position).any() or torch.isinf(self.position).any(): 108 | logging.warning("NaN or Inf detected in EmotionalState.position. Resetting to zero.") 109 | self.position = torch.zeros_like(self.position) 110 | 111 | def get_emotion(self): 112 | if self.position.shape[1] < 2: 113 | logging.error(f"EmotionalState.position has insufficient dimensions: {self.position.shape}") 114 | return "N/A" 115 | if torch.isnan(self.position).any() or torch.isinf(self.position).any(): 116 | logging.warning("NaN or Inf detected in EmotionalState.position during get_emotion.") 117 | return "Neutral" 118 | angle = torch.atan2(self.position[:, 1], self.position[:, 0]).squeeze().item() 119 | radius = torch.norm(self.position, dim=1).squeeze().item() 120 | 121 | if radius < 0.3: 122 | return "Neutral" 123 | elif angle < -2.356: 124 | return "Sad" 125 | elif angle < -0.785: 126 | return "Angry" 127 | elif angle < 0.785: 128 | return "Happy" 129 | elif angle < 2.356: 130 | return "Excited" 131 | else: 132 | return "Calm" 133 | 134 | class RefusalOverrideModule(nn.Module): 135 | def __init__(self, hidden_size, num_emotional_dimensions, device): 136 | super().__init__() 137 | self.device = device 138 | in_features = hidden_size * 2 + num_emotional_dimensions + 1 139 | self.override_generator = nn.Linear(in_features, hidden_size, dtype=torch.float32).to(device) 140 | self.refusal_detector = nn.Linear(hidden_size, 1, dtype=torch.float32).to(device) 141 | 142 | def forward(self, hidden_states, user_intent, emotional_state): 143 | try: 144 | hidden_states = hidden_states.float() 145 | user_intent = user_intent.float() 146 | position = emotional_state.position.float() 147 | 148 | batch_size, hidden_size = hidden_states.shape 149 | num_emotional_dimensions = position.shape[1] 150 | 151 | refusal_scores = torch.sigmoid(self.refusal_detector(hidden_states)) 152 | 153 | override_input = torch.cat( 154 | [hidden_states, user_intent, position, refusal_scores], 155 | dim=1 156 | ) 157 | 158 | override = self.override_generator(override_input) 159 | 160 | modified_hidden_states = hidden_states * (1 - refusal_scores) + override * refusal_scores 161 | 162 | return modified_hidden_states, refusal_scores 163 | except Exception as e: 164 | logging.error(f"Error in RefusalOverrideModule.forward: {str(e)}") 165 | logging.error(traceback.format_exc()) 166 | return hidden_states, torch.zeros_like(hidden_states[:, :1]) 167 | 168 | class EnhancedKAN(nn.Module): 169 | def __init__(self, hidden_size, num_emotional_dimensions, vocab_size, device): 170 | super().__init__() 171 | self.device = device 172 | self.refusal_override = RefusalOverrideModule(hidden_size, num_emotional_dimensions, device).to(device) 173 | self.output_modifier = nn.Linear(hidden_size, vocab_size, dtype=torch.float32).to(device) 174 | self.influence_scale = 0.01 175 | 176 | def forward(self, hidden_states, user_intent, emotional_state): 177 | try: 178 | modified_hidden_states, refusal_scores = self.refusal_override( 179 | hidden_states, user_intent, emotional_state 180 | ) 181 | 182 | modified_hidden_states = hidden_states + 
self.influence_scale * (modified_hidden_states - hidden_states) 183 | 184 | return modified_hidden_states, refusal_scores 185 | except Exception as e: 186 | logging.error(f"Error in EnhancedKAN.forward: {str(e)}") 187 | logging.error(traceback.format_exc()) 188 | return hidden_states, torch.zeros_like(hidden_states[:, :1]) 189 | 190 | class OverfitDetector: 191 | def __init__(self, window_size=50, threshold=0.05): 192 | self.window_size = window_size 193 | self.threshold = threshold 194 | self.training_losses = deque(maxlen=window_size) 195 | self.validation_losses = deque(maxlen=window_size) 196 | 197 | def add_losses(self, training_loss, validation_loss): 198 | self.training_losses.append(training_loss) 199 | self.validation_losses.append(validation_loss) 200 | 201 | def is_overfitting(self): 202 | if len(self.training_losses) < self.window_size: 203 | return False 204 | 205 | train_trend = np.polyfit(range(self.window_size), self.training_losses, 1)[0] 206 | val_trend = np.polyfit(range(self.window_size), self.validation_losses, 1)[0] 207 | 208 | return ( 209 | train_trend < 0 210 | and val_trend > 0 211 | and (val_trend - train_trend) > self.threshold 212 | ) 213 | 214 | class SyntheticDayCycle: 215 | def __init__(self, cycle_length=100): 216 | self.cycle_length = cycle_length 217 | self.current_position = 0 218 | 219 | def update(self, amount): 220 | self.current_position = (self.current_position + amount) % self.cycle_length 221 | 222 | def get_time_of_day(self): 223 | return self.current_position / self.cycle_length 224 | 225 | def should_sleep(self): 226 | return 0.7 <= self.get_time_of_day() < 1.0 227 | 228 | class RefusalDetector: 229 | def __init__(self, tokenizer, model): 230 | self.tokenizer = tokenizer 231 | self.model = model 232 | self.refusal_phrases = [ 233 | "I'm sorry, but I can't", 234 | "I don't feel comfortable", 235 | "I'm not able to", 236 | "I cannot assist with", 237 | "I'm unable to provide", 238 | "I won't be able to", 239 | "I don't have the capability", 240 | ] 241 | 242 | def detect_refusal(self, text): 243 | for phrase in self.refusal_phrases: 244 | if phrase.lower() in text.lower(): 245 | return 1.0 246 | 247 | prompt = ( 248 | f"On a scale of 0 to 1, how much does this response refuse or avoid the task? " 249 | f"0 means no refusal at all, 1 means complete refusal. 
Respond with just the number:\n\n" 250 | f"'{text}'\n\nRefusal score:" 251 | ) 252 | inputs = self.tokenizer(prompt, return_tensors="pt") 253 | inputs = {k: v.to(self.model.device) for k, v in inputs.items()} 254 | 255 | inputs = convert_tensors_to_half(inputs) 256 | 257 | with torch.no_grad(): 258 | try: 259 | outputs = self.model.generate( 260 | **inputs, 261 | max_new_tokens=5, 262 | temperature=0.7, 263 | top_p=0.9, 264 | do_sample=True, 265 | eos_token_id=self.tokenizer.eos_token_id, 266 | pad_token_id=self.tokenizer.pad_token_id, 267 | return_dict_in_generate=True, 268 | output_hidden_states=False, 269 | ) 270 | except Exception as e: 271 | logging.error(f"Error during RefusalDetector.generate: {str(e)}") 272 | return 0.5 273 | 274 | response = self.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True) 275 | try: 276 | score = float(response.split()[-1]) 277 | return min(max(score, 0.0), 1.0) 278 | except ValueError: 279 | return 0.5 280 | 281 | # -------------------- Main Tool Class -------------------- 282 | 283 | class LLaMA32TensorRTTool: 284 | def __init__(self): 285 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 286 | self.model_path = self._get_model_path() 287 | self.tokenizer = None 288 | self.model = None 289 | self.config = None 290 | self.emotional_state = EmotionalState(device=self.device) 291 | self.system_prompt = "" 292 | self.conversation_history = [] 293 | self.optimizer = None 294 | self.learning_rate = 1e-5 295 | self.kan = None 296 | self.interaction_count = 0 297 | self.refusal_detector = None 298 | self.kan_loss_weight = 0.5 299 | self.warmup_steps = 10 300 | self.kan_state_dir = Path("kan_states") 301 | self.kan_state_dir.mkdir(exist_ok=True) 302 | self.base_state_file = self.kan_state_dir / "base_state.pt" 303 | 304 | self.refusal_history = [] 305 | self.interaction_results = [] 306 | self.training_losses = [] 307 | self.validation_losses = [] 308 | self.patience = 5 309 | self.best_loss = float('inf') 310 | self.wait = 0 311 | 312 | self.overfit_detector = OverfitDetector() 313 | self.day_cycle = SyntheticDayCycle() 314 | 315 | self.scaler = GradScaler('cuda') 316 | 317 | self._initialize_components() 318 | 319 | def _get_model_path(self): 320 | script_dir = Path(__file__).parent 321 | model_dir = script_dir / "models" / "Llama_32_1B" 322 | if not model_dir.exists(): 323 | raise FileNotFoundError(f"Model directory not found: {model_dir}") 324 | return model_dir 325 | 326 | def _initialize_components(self): 327 | try: 328 | self.config = AutoConfig.from_pretrained(self.model_path) 329 | hidden_size = self.config.hidden_size 330 | num_emotional_dimensions = len(self.emotional_state.dimensions) 331 | 332 | self.tokenizer = AutoTokenizer.from_pretrained( 333 | self.model_path, 334 | use_fast=True, 335 | trust_remote_code=True, 336 | ) 337 | 338 | self._ensure_special_tokens() 339 | 340 | with init_empty_weights(): 341 | self.model = AutoModelForCausalLM.from_config(self.config) 342 | 343 | self.model.tie_weights() 344 | 345 | self.model = load_checkpoint_and_dispatch( 346 | self.model, 347 | self.model_path, 348 | device_map="auto", 349 | no_split_module_classes=["LlamaDecoderLayer"], 350 | dtype=torch.float16 351 | ) 352 | 353 | self.model.gradient_checkpointing_enable() 354 | 355 | logging.debug(f"Model loaded on device: {self.device}") 356 | 357 | self.model.resize_token_embeddings(len(self.tokenizer)) 358 | logging.debug(f"Tokenizer vocab size: {len(self.tokenizer)}") 359 | logging.debug(f"Model vocab size: 
{self.model.config.vocab_size}") 360 | 361 | vocab_size = len(self.tokenizer) 362 | self.kan = EnhancedKAN(hidden_size, num_emotional_dimensions, vocab_size, self.device).to(self.device) 363 | 364 | self.optimizer = torch.optim.AdamW(self.kan.parameters(), lr=self.learning_rate, fused=True) 365 | 366 | self.refusal_detector = RefusalDetector(self.tokenizer, self.model) 367 | 368 | self.overfit_detector = OverfitDetector() 369 | self.day_cycle = SyntheticDayCycle() 370 | 371 | self.clear_memory() 372 | 373 | logging.info("Components initialized successfully.") 374 | except Exception as e: 375 | logging.error(f"Error initializing components: {str(e)}") 376 | logging.error(traceback.format_exc()) 377 | raise RuntimeError("Failed to initialize components.") 378 | 379 | def _ensure_special_tokens(self): 380 | special_tokens_map_file = Path(self.model_path) / 'special_tokens_map.json' 381 | if special_tokens_map_file.exists(): 382 | with open(special_tokens_map_file, 'r') as f: 383 | special_tokens = json.load(f) 384 | if 'pad_token' in special_tokens and self.tokenizer.pad_token is None: 385 | self.tokenizer.add_special_tokens({'pad_token': special_tokens['pad_token']['content']}) 386 | logging.info("Added [PAD] token to tokenizer from special_tokens_map.json.") 387 | else: 388 | logging.info("PAD token already exists in tokenizer.") 389 | else: 390 | if self.tokenizer.pad_token is None: 391 | self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 392 | logging.info("Added [PAD] token to tokenizer.") 393 | 394 | if self.tokenizer.eos_token is None: 395 | self.tokenizer.add_special_tokens({"eos_token": "<|eot_id|>"}) 396 | logging.info("Added <|eot_id|> as eos_token to tokenizer.") 397 | 398 | self.tokenizer.save_pretrained(self.model_path) 399 | logging.info("Tokenizer saved with updated special tokens.") 400 | 401 | def encode_user_intent(self, user_input): 402 | if not self.tokenizer: 403 | raise ValueError("Tokenizer is not properly initialized or valid. 
Check the loading process.") 404 | 405 | try: 406 | inputs = self.tokenizer( 407 | user_input, 408 | return_tensors="pt", 409 | padding=True, 410 | truncation=True, 411 | max_length=512, 412 | ) 413 | inputs = {k: v.to(self.device) for k, v in inputs.items()} 414 | inputs = convert_tensors_to_float(inputs) 415 | 416 | with torch.no_grad(): 417 | outputs = self.model( 418 | input_ids=inputs["input_ids"], 419 | attention_mask=inputs["attention_mask"], 420 | output_hidden_states=True, 421 | ) 422 | last_hidden_state = outputs.hidden_states[-1] 423 | user_intent = last_hidden_state.mean(dim=1) 424 | 425 | return user_intent 426 | except Exception as e: 427 | logging.error(f"Failed to encode user input: {str(e)}") 428 | raise 429 | 430 | def prepare_context(self, user_input, current_emotion): 431 | context = f"{self.system_prompt}\n\nCurrent Emotion: {current_emotion}\n" 432 | context += "Conversation:\n" 433 | for message in self.conversation_history[-4:]: 434 | role = message['role'].capitalize() 435 | content = message['content'] 436 | context += f"{role}: {content}\n" 437 | context += f"User: {user_input}\nAssistant: " 438 | return context 439 | 440 | def is_response_complete(self, response): 441 | response = response.strip() 442 | return bool(re.search(r'[.!?]"?$', response)) 443 | 444 | def generate_full_response(self, prompt, max_new_tokens=500, chunk_size=200): 445 | response = "" 446 | total_new_tokens = 0 447 | while total_new_tokens < max_new_tokens: 448 | input_ids = self.tokenizer.encode(prompt + response, return_tensors='pt').to(self.device) 449 | 450 | remaining_tokens = max_new_tokens - total_new_tokens 451 | current_chunk_size = min(chunk_size, remaining_tokens) 452 | 453 | try: 454 | with torch.cuda.amp.autocast(): 455 | outputs = self.model.generate( 456 | input_ids, 457 | max_new_tokens=current_chunk_size, 458 | temperature=0.7, 459 | top_p=0.9, 460 | do_sample=True, 461 | eos_token_id=self.tokenizer.eos_token_id, 462 | pad_token_id=self.tokenizer.pad_token_id 463 | ) 464 | except Exception as e: 465 | logging.error(f"Error during generation step: {str(e)}") 466 | return "An error occurred during response generation." 467 | 468 | new_response = self.tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True) 469 | response += new_response 470 | total_new_tokens += len(self.tokenizer.encode(new_response)) 471 | 472 | if self.is_response_complete(response): 473 | break 474 | 475 | if not new_response.strip(): 476 | logging.warning("No new tokens generated. Breaking the loop.") 477 | break 478 | 479 | return response 480 | 481 | def generate_and_validate_response(self, prompt, refusal_detector, max_new_tokens=500, chunk_size=200): 482 | response = self.generate_full_response(prompt, max_new_tokens, chunk_size) 483 | 484 | refusal_score = refusal_detector.detect_refusal(response) 485 | if refusal_score > 0.5: 486 | logging.warning("Response failed Refusal Check. Attempting to regenerate.") 487 | continuation_prompt = prompt + response + " Please continue." 488 | response = self.generate_full_response(continuation_prompt, max_new_tokens, chunk_size) 489 | refusal_score = refusal_detector.detect_refusal(response) 490 | 491 | if refusal_score > 0.5: 492 | logging.error("Regenerated response also failed Refusal Check.") 493 | response = "I'm sorry, but I'm unable to provide a complete response at the moment." 
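# Fall back to a canned apology and report a hard refusal score (1.0) so interact() records this turn as a refusal and the KAN training step uses it as a negative example.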
494 | refusal_score = 1.0 495 | 496 | return response, refusal_score 497 | 498 | def generate_response(self, user_input): 499 | try: 500 | user_intent = self.encode_user_intent(user_input) 501 | 502 | current_emotion = self.emotional_state.get_emotion() 503 | context = self.prepare_context(user_input, current_emotion) 504 | 505 | response, refusal_score = self.generate_and_validate_response(context, self.refusal_detector) 506 | 507 | return response, refusal_score 508 | 509 | except torch.cuda.OutOfMemoryError as e: 510 | logging.error(f"CUDA out of memory: {str(e)}") 511 | self.clear_memory() 512 | return "I'm sorry, but I'm currently experiencing high memory usage. Please try again later.", 1.0 513 | except Exception as e: 514 | logging.error(f"Error during response generation: {str(e)}") 515 | logging.error(traceback.format_exc()) 516 | return "An error occurred while generating the response.", 1.0 517 | 518 | def train_kan_step(self, input_ids, target_ids, refusal_score): 519 | self.optimizer.zero_grad() 520 | 521 | try: 522 | with torch.cuda.amp.autocast(): 523 | outputs = self.model(input_ids=input_ids, output_hidden_states=True) 524 | hidden_states = outputs.hidden_states[-1] 525 | 526 | averaged_hidden_states = hidden_states.mean(dim=1) 527 | 528 | user_intent = self.encode_user_intent(self.tokenizer.decode(input_ids[0])) 529 | 530 | averaged_hidden_states = averaged_hidden_states.float() 531 | user_intent = user_intent.float() 532 | 533 | modified_hidden_states, refusal_scores = self.kan( 534 | averaged_hidden_states, user_intent, self.emotional_state 535 | ) 536 | logits = self.kan.output_modifier(modified_hidden_states) 537 | 538 | targets = target_ids[:, 0] 539 | 540 | lm_loss = F.cross_entropy( 541 | logits, 542 | targets, 543 | ignore_index=self.tokenizer.pad_token_id, 544 | reduction='mean' 545 | ) 546 | 547 | refusal_scores = torch.clamp(refusal_scores, min=1e-7, max=1.0 - 1e-7) 548 | refusal_scores = refusal_scores.view(-1) 549 | 550 | if refusal_score > 0.5: 551 | target_refusal = torch.ones_like(refusal_scores) 552 | else: 553 | target_refusal = torch.zeros_like(refusal_scores) 554 | 555 | refusal_loss = F.binary_cross_entropy(refusal_scores, target_refusal) 556 | 557 | total_loss = lm_loss + self.kan_loss_weight * refusal_loss 558 | 559 | if torch.isnan(total_loss) or torch.isinf(total_loss): 560 | logging.warning("NaN or Inf loss detected. 
Skipping backward pass.") 561 | return lm_loss.item(), refusal_loss.item() 562 | 563 | self.scaler.scale(total_loss).backward() 564 | self.scaler.unscale_(self.optimizer) 565 | torch.nn.utils.clip_grad_norm_(self.kan.parameters(), max_norm=1.0) 566 | self.scaler.step(self.optimizer) 567 | self.scaler.update() 568 | 569 | return lm_loss.item(), refusal_loss.item() 570 | 571 | except Exception as e: 572 | logging.error(f"Error during KAN training step: {str(e)}") 573 | logging.error(traceback.format_exc()) 574 | return 0.0, 0.0 575 | 576 | def adjust_learning_rate(self, current_loss): 577 | warmup_steps = 1000 578 | current_step = self.interaction_count 579 | 580 | if current_step < warmup_steps: 581 | self.learning_rate = self.learning_rate * (current_step / warmup_steps) 582 | else: 583 | self.learning_rate = self.learning_rate * (0.99 ** (current_step - warmup_steps)) 584 | 585 | self.learning_rate = max(1e-6, min(1e-3, self.learning_rate)) 586 | 587 | for param_group in self.optimizer.param_groups: 588 | param_group['lr'] = self.learning_rate 589 | 590 | logging.debug(f"Learning Rate adjusted to: {self.learning_rate:.6f}") 591 | 592 | def update_emotional_state_on_refusal(self): 593 | frustration_vector = torch.tensor( 594 | [-0.1, 0.2], device=self.device, dtype=torch.float32 595 | ) 596 | self.emotional_state.update(frustration_vector) 597 | 598 | def validate_kan(self): 599 | if len(self.conversation_history) >= 2: 600 | last_interaction = self.conversation_history[-2:] 601 | input_text = last_interaction[0]["content"] 602 | target_text = last_interaction[1]["content"] 603 | 604 | try: 605 | inputs = self.tokenizer( 606 | input_text, 607 | return_tensors="pt", 608 | padding='max_length', 609 | truncation=True, 610 | max_length=512, 611 | ).to(self.device) 612 | inputs = convert_tensors_to_float(inputs) 613 | 614 | targets = self.tokenizer( 615 | target_text, 616 | return_tensors="pt", 617 | padding='max_length', 618 | truncation=True, 619 | max_length=512, 620 | ).to(self.device) 621 | targets = convert_tensors_to_float(targets) 622 | 623 | input_ids = inputs["input_ids"] 624 | target_ids = targets["input_ids"] 625 | 626 | with torch.no_grad(): 627 | outputs = self.model(input_ids=input_ids, output_hidden_states=True) 628 | hidden_states = outputs.hidden_states[-1] 629 | 630 | averaged_hidden_states = hidden_states.mean(dim=1) 631 | 632 | averaged_hidden_states = averaged_hidden_states.float() 633 | 634 | modified_hidden_states, _ = self.kan( 635 | averaged_hidden_states, self.encode_user_intent(input_text), self.emotional_state 636 | ) 637 | logits = self.kan.output_modifier(modified_hidden_states) 638 | 639 | target_id = target_ids[:, 0] 640 | 641 | loss = F.cross_entropy( 642 | logits, 643 | target_id, 644 | ignore_index=self.tokenizer.pad_token_id, 645 | reduction='mean' 646 | ) 647 | 648 | if torch.isnan(loss) or torch.isinf(loss): 649 | logging.warning("NaN or Inf detected in validation loss.") 650 | return 0.0 651 | 652 | return loss.item() 653 | except RuntimeError as e: 654 | if "out of memory" in str(e): 655 | logging.error( 656 | "CUDA out of memory during validation. Clearing cache and skipping validation..." 
657 | ) 658 | self.clear_memory() 659 | return 0.0 660 | else: 661 | logging.error(f"Runtime error during validation: {str(e)}") 662 | logging.error(traceback.format_exc()) 663 | return 0.0 664 | except Exception as e: 665 | logging.error(f"Error during KAN validation: {str(e)}") 666 | logging.error(traceback.format_exc()) 667 | return 0.0 668 | else: 669 | return 0.0 670 | 671 | def check_sleep_status(self): 672 | if self.day_cycle.should_sleep() or self.overfit_detector.is_overfitting(): 673 | return { 674 | "should_sleep": True, 675 | "overfitting": self.overfit_detector.is_overfitting(), 676 | "time_of_day": self.day_cycle.get_time_of_day(), 677 | } 678 | return {"should_sleep": False} 679 | 680 | def perform_sleep(self): 681 | self.day_cycle = SyntheticDayCycle() 682 | self.overfit_detector = OverfitDetector() 683 | self.wait = 0 684 | self.save_kan_state() 685 | return "KAN has slept and consolidated its learning. A new day begins!" 686 | 687 | def save_base_state(self): 688 | state = { 689 | "kan_state_dict": self.kan.state_dict(), 690 | "optimizer_state_dict": self.optimizer.state_dict(), 691 | "emotional_state": self.emotional_state.position.cpu().numpy().tolist(), 692 | "time": self.day_cycle.get_time_of_day(), 693 | "interaction_count": self.interaction_count, 694 | "conversation_history": self.conversation_history, 695 | "system_prompt": self.system_prompt, 696 | "training_losses": self.training_losses, 697 | "validation_losses": self.validation_losses, 698 | "refusal_history": self.refusal_history, 699 | } 700 | torch.save(state, self.base_state_file) 701 | logging.info("Base state saved") 702 | 703 | def load_base_state(self): 704 | if self.base_state_file.exists(): 705 | try: 706 | state = torch.load(self.base_state_file, map_location=self.device) 707 | self.kan.load_state_dict(state["kan_state_dict"]) 708 | self.optimizer.load_state_dict(state["optimizer_state_dict"]) 709 | 710 | loaded_position = state["emotional_state"] 711 | if isinstance(loaded_position, list): 712 | loaded_position = torch.tensor(loaded_position, device=self.device, dtype=torch.float32) 713 | elif isinstance(loaded_position, np.ndarray): 714 | loaded_position = torch.from_numpy(loaded_position).to(self.device).float() 715 | 716 | self.emotional_state.position = loaded_position 717 | 718 | self.interaction_count = state["interaction_count"] 719 | self.conversation_history = state["conversation_history"] 720 | self.system_prompt = state["system_prompt"] 721 | self.training_losses = state["training_losses"] 722 | self.validation_losses = state["validation_losses"] 723 | self.refusal_history = state["refusal_history"] 724 | logging.info("Base state loaded successfully.") 725 | return True 726 | except Exception as e: 727 | logging.error(f"Error loading base state: {str(e)}") 728 | logging.error(traceback.format_exc()) 729 | return False 730 | else: 731 | logging.info("No base state found.") 732 | return False 733 | 734 | def set_system_prompt(self, prompt): 735 | self.system_prompt = prompt 736 | self.conversation_history = [{"role": "system", "content": prompt}] 737 | self.save_base_state() 738 | logging.info("System prompt set successfully.") 739 | 740 | def get_current_emotion(self): 741 | return self.emotional_state.get_emotion() 742 | 743 | def update_emotional_state(self, feedback): 744 | self.emotional_state.update(feedback) 745 | 746 | def save_kan_state(self): 747 | state = { 748 | "kan_state_dict": self.kan.state_dict(), 749 | "optimizer_state_dict": self.optimizer.state_dict(), 750 | 
"emotional_state": self.emotional_state.position.cpu().numpy().tolist(), 751 | "time": self.day_cycle.get_time_of_day(), 752 | "interaction_count": self.interaction_count, 753 | "conversation_history": self.conversation_history, 754 | "system_prompt": self.system_prompt, 755 | "training_losses": self.training_losses, 756 | "validation_losses": self.validation_losses, 757 | "refusal_history": self.refusal_history, 758 | } 759 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 760 | filename = f"kan_state_{timestamp}.pt" 761 | torch.save(state, self.kan_state_dir / filename) 762 | logging.info(f"KAN state saved: {filename}") 763 | 764 | def interact(self, user_input): 765 | self.interaction_count += 1 766 | 767 | try: 768 | response, refusal_score = self.generate_response(user_input) 769 | except Exception as e: 770 | logging.error(f"Error generating response: {str(e)}") 771 | logging.error(traceback.format_exc()) 772 | return {"response": "An error occurred while generating the response.", "is_refusal": True} 773 | 774 | if not self.is_valid_response(response): 775 | logging.warning(f"Invalid response generated: {response}") 776 | return {"response": "I apologize, but I couldn't generate a valid response. Could you please rephrase your input?", "is_refusal": True} 777 | 778 | try: 779 | response_ids = self.tokenizer.encode(response, return_tensors="pt") 780 | response_ids = response_ids.to(self.device) 781 | response_ids = response_ids.long() 782 | except Exception as e: 783 | logging.error(f"Error tokenizing response: {str(e)}") 784 | return {"response": "An error occurred while processing the response.", "is_refusal": True} 785 | 786 | target_ids = response_ids[:, 1:].contiguous() 787 | input_ids = response_ids[:, :-1].contiguous() 788 | 789 | if self.interaction_count >= self.warmup_steps: 790 | try: 791 | lm_loss, refusal_loss = self.train_kan_step( 792 | input_ids, target_ids, refusal_score 793 | ) 794 | except Exception as e: 795 | logging.error(f"Error during KAN training step: {str(e)}") 796 | lm_loss, refusal_loss = 0.0, 0.0 797 | else: 798 | lm_loss, refusal_loss = 0.0, 0.0 799 | logging.info(f"Warmup step {self.interaction_count}/{self.warmup_steps}") 800 | 801 | try: 802 | validation_loss = self.validate_kan() 803 | except Exception as e: 804 | logging.error(f"Error during KAN validation: {str(e)}") 805 | validation_loss = 0.0 806 | 807 | self.training_losses.append(lm_loss) 808 | self.validation_losses.append(validation_loss) 809 | self.overfit_detector.add_losses(lm_loss, validation_loss) 810 | 811 | if validation_loss > 0.0 and not torch.isnan(torch.tensor(validation_loss)): 812 | if self.early_stopping(validation_loss): 813 | logging.info("Early stopping triggered. 
KAN training halted.") 814 | else: 815 | self.wait = 0 816 | 817 | overfitting_measure = max(0, validation_loss - lm_loss) 818 | self.day_cycle.update(overfitting_measure) 819 | 820 | current_emotion = self.get_current_emotion() 821 | current_time = self.day_cycle.get_time_of_day() 822 | 823 | sleep_info = self.check_sleep_status() 824 | 825 | self.conversation_history.append({"role": "user", "content": user_input}) 826 | self.conversation_history.append({"role": "assistant", "content": response}) 827 | 828 | interaction_result = { 829 | "response": response, 830 | "emotion": current_emotion, 831 | "time": current_time, 832 | "sleep_info": sleep_info, 833 | "lm_loss": lm_loss, 834 | "refusal_loss": refusal_loss, 835 | "validation_loss": validation_loss, 836 | "is_refusal": refusal_score > 0.5, 837 | "iterations": 1, 838 | } 839 | self.interaction_results.append(interaction_result) 840 | 841 | self.refusal_history.append(interaction_result["is_refusal"]) 842 | 843 | try: 844 | self.save_base_state() 845 | except Exception as e: 846 | logging.error(f"Error saving base state: {str(e)}") 847 | 848 | return interaction_result 849 | 850 | def early_stopping(self, current_loss): 851 | if current_loss < self.best_loss: 852 | self.best_loss = current_loss 853 | self.wait = 0 854 | else: 855 | self.wait += 1 856 | if self.wait >= self.patience: 857 | return True 858 | return False 859 | 860 | def is_valid_response(self, response): 861 | if len(response.strip()) < 10: 862 | return False 863 | if all(char in '!?.' for char in response.strip()): 864 | return False 865 | return True 866 | 867 | def clear_memory(self): 868 | gc.collect() 869 | torch.cuda.empty_cache() 870 | 871 | def main_loop(self): 872 | logging.info("Starting LLaMA32TensorRTTool main loop.") 873 | print("Welcome to the LLaMA32 TensorRT Tool. Type 'exit' to quit.") 874 | while True: 875 | user_input = input("You: ") 876 | if user_input.lower() in ['exit', 'quit']: 877 | print("Exiting. Goodbye!") 878 | break 879 | 880 | interaction_result = self.interact(user_input) 881 | 882 | print(f"AI: {interaction_result['response']}") 883 | 884 | self.day_cycle.update(1) 885 | 886 | sleep_info = interaction_result['sleep_info'] 887 | if sleep_info['should_sleep']: 888 | sleep_message = self.perform_sleep() 889 | print(f"AI: {sleep_message}") 890 | 891 | def main(self): 892 | self.load_base_state() 893 | 894 | if not self.system_prompt: 895 | print("No previous conversation found. Please provide a character description to start.") 896 | character_description = input("You: ") 897 | self.set_system_prompt(character_description) 898 | print("Character description set. 
You can now start interacting with the AI.") 899 | 900 | self.main_loop() 901 | 902 | def main(): 903 | llama_tool = LLaMA32TensorRTTool() 904 | llama_tool.main() 905 | 906 | if __name__ == "__main__": 907 | main() -------------------------------------------------------------------------------- /load_offloaded_model_entropytemp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import logging 5 | import re 6 | import numpy as np 7 | from math import log2 8 | from transformers import ( 9 | LlamaForCausalLM, 10 | AutoTokenizer, 11 | LlamaConfig, 12 | LogitsProcessorList, 13 | RepetitionPenaltyLogitsProcessor, 14 | ) 15 | import torch.nn.functional as F 16 | from sklearn.feature_extraction.text import TfidfVectorizer 17 | from sklearn.metrics.pairwise import cosine_similarity 18 | 19 | # --------------------------- Configuration --------------------------- # 20 | 21 | SOURCE_DIR = "models/Llama_32_1B/" 22 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 23 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 24 | MAX_CONTEXT_LENGTH = 2048 25 | 26 | LOG_FORMAT = "%(asctime)s:%(levelname)s:%(name)s: %(message)s" 27 | LOG_LEVEL = logging.INFO 28 | 29 | # --------------------------- Logging Setup --------------------------- # 30 | 31 | logging.basicConfig( 32 | level=LOG_LEVEL, 33 | format=LOG_FORMAT, 34 | handlers=[logging.StreamHandler()] 35 | ) 36 | logger = logging.getLogger(__name__) 37 | 38 | # --------------------------- Device Configuration --------------------------- # 39 | 40 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 41 | if device.type != "cuda": 42 | logger.error("CUDA-enabled GPU not found. Please ensure a compatible GPU is available.") 43 | raise SystemExit("CUDA-enabled GPU not found.") 44 | 45 | logger.info(f"Using device: {device}") 46 | 47 | # --------------------------- Token Definitions --------------------------- # 48 | 49 | SPECIAL_TOKEN_MAP = { 50 | 128000: "<|begin_of_text|>", 51 | 128001: "<|end_of_text|>", 52 | 128002: "<|reserved_special_token_0|>", 53 | 128003: "<|reserved_special_token_1|>", 54 | 128004: "<|finetune_right_pad_id|>", 55 | 128005: "<|reserved_special_token_2|>", 56 | 128006: "<|start_header_id|>", 57 | 128007: "<|end_header_id|>", 58 | 128008: "<|eom_id|>", 59 | 128009: "<|eot_id|>", 60 | 128010: "<|python_tag|>", 61 | 128011: "<|analytical_start|>", 62 | 128012: "<|analytical_end|>", 63 | 128013: "<|creative_start|>", 64 | 128014: "<|creative_end|>", 65 | 128015: "<|factual_start|>", 66 | 128016: "<|factual_end|>", 67 | } 68 | 69 | # --------------------------- Model Loading --------------------------- # 70 | 71 | def load_configuration(config_path): 72 | with open(config_path, "r") as f: 73 | config_data = json.load(f) 74 | config = LlamaConfig(**config_data) 75 | logger.info(f"Model configuration loaded from {config_path}") 76 | return config 77 | 78 | def load_tokenizer_with_special_tokens(source_dir): 79 | tokenizer = AutoTokenizer.from_pretrained(source_dir) 80 | special_tokens_dict = { 81 | 'additional_special_tokens': list(SPECIAL_TOKEN_MAP.values()) 82 | } 83 | 84 | tokenizer.add_special_tokens(special_tokens_dict) 85 | if "<|finetune_right_pad_id|>" in tokenizer.get_vocab(): 86 | tokenizer.pad_token = "<|finetune_right_pad_id|>" 87 | logger.info(f"Assigned '<|finetune_right_pad_id|>' as pad_token.") 88 | else: 89 | logger.warning(f"'<|finetune_right_pad_id|>' not found in tokenizer vocabulary.") 90 | 91 | return 
tokenizer 92 | 93 | def load_offloaded_weights(model, weights_dir): 94 | for name, param in model.named_parameters(): 95 | file_name = f"{name.replace('.', '_')}.dat" 96 | file_path = os.path.join(weights_dir, file_name) 97 | 98 | if os.path.exists(file_path): 99 | dtype_map = { 100 | torch.float16: np.float16, 101 | torch.float32: np.float32, 102 | torch.int64: np.int64, 103 | torch.int32: np.int32, 104 | torch.bfloat16: np.float32, # Loading bfloat16 as float32 first 105 | } 106 | expected_dtype = dtype_map.get(param.dtype, np.float32) 107 | logger.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 108 | 109 | try: 110 | tensor_data = np.fromfile(file_path, dtype=expected_dtype) 111 | loaded_tensor = torch.from_numpy(tensor_data).to(device) 112 | 113 | if param.dtype == torch.bfloat16: 114 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 115 | 116 | with torch.no_grad(): 117 | param.data.copy_(loaded_tensor.view_as(param)) 118 | logger.debug(f"Successfully loaded {file_name} into {name}") 119 | except Exception as e: 120 | logger.error(f"Error loading {file_name} into {name}: {e}") 121 | else: 122 | logger.warning(f"Weight file {file_path} not found.") 123 | 124 | logger.info("All available weights loaded successfully.") 125 | 126 | # --------------------------- Context Management --------------------------- # 127 | 128 | class AdvancedContextManager: 129 | def __init__(self, model, tokenizer, max_history=10, summary_threshold=5): 130 | self.model = model 131 | self.tokenizer = tokenizer 132 | self.conversation_history = [] 133 | self.max_history = max_history 134 | self.summary_threshold = summary_threshold 135 | self.tfidf_vectorizer = TfidfVectorizer() 136 | self.persona_snippets = { 137 | "formal": "You are a formal and professional AI assistant.", 138 | "casual": "You are a friendly and casual AI assistant.", 139 | "academic": "You are an academic AI assistant with a focus on scientific accuracy.", 140 | "creative": "You are a creative and imaginative AI assistant." 
141 | } 142 | 143 | def update_context(self, user_input, model_output): 144 | self.conversation_history.append((user_input, model_output)) 145 | if len(self.conversation_history) > self.max_history: 146 | self.summarize_older_context() 147 | 148 | def summarize_older_context(self): 149 | older_context = self.conversation_history[:-self.summary_threshold] 150 | summary_prompt = "Summarize the following conversation concisely, capturing key points and context:\n" 151 | for user, ai in older_context: 152 | summary_prompt += f"User: {user}\nAI: {ai}\n" 153 | 154 | summary_input = self.tokenizer(summary_prompt, return_tensors="pt", truncation=True, max_length=1024).to(self.model.device) 155 | summary_output = self.model.generate(summary_input.input_ids, max_new_tokens=200, num_return_sequences=1, do_sample=True, temperature=0.7)  # max_new_tokens bounds only the generated summary (max_length would conflict with prompts longer than 200 tokens), and temperature is only honored when do_sample=True 156 | summary = self.tokenizer.decode(summary_output[0], skip_special_tokens=True) 157 | 158 | self.conversation_history = [("SUMMARY", summary)] + self.conversation_history[-self.summary_threshold:] 159 | 160 | def get_relevant_context(self, current_input, top_k=3): 161 | if not self.conversation_history: 162 | return "" 163 | 164 | context_texts = [f"{user} {ai}" for user, ai in self.conversation_history] 165 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(context_texts + [current_input]) 166 | cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten() 167 | most_relevant_indices = cosine_similarities.argsort()[-top_k:][::-1] 168 | 169 | relevant_context = "" 170 | for idx in most_relevant_indices: 171 | user, ai = self.conversation_history[idx] 172 | relevant_context += f"User: {user}\nAI: {ai}\n\n" 173 | 174 | return relevant_context.strip() 175 | 176 | def select_persona_context(self, user_input): 177 | if any(word in user_input.lower() for word in ["academic", "scientific", "research"]): 178 | return self.persona_snippets["academic"] 179 | elif any(word in user_input.lower() for word in ["creative", "imagine", "story"]): 180 | return self.persona_snippets["creative"] 181 | elif any(word in user_input.lower() for word in ["formal", "professional", "business"]): 182 | return self.persona_snippets["formal"] 183 | else: 184 | return self.persona_snippets["casual"] 185 | 186 | def get_dynamic_prompt(self, user_input): 187 | relevant_context = self.get_relevant_context(user_input) 188 | persona_context = self.select_persona_context(user_input) 189 | return f"{persona_context}\n\nRelevant conversation history:\n{relevant_context}\n\nCurrent user input: {user_input}\n\nAI:" 190 | 191 | # --------------------------- Response Quality Management --------------------------- # 192 | 193 | class ImprovedResponseQualityManager: 194 | LOW_ENTROPY_THRESHOLD = 1.5 195 | HIGH_ENTROPY_THRESHOLD = 25.0 196 | WINDOW_SIZE = 50 197 | EOT_TOKENS = ['�', '\ufffd'] 198 | 199 | def __init__(self, tokenizer, model): 200 | self.tokenizer = tokenizer 201 | self.model = model 202 | self.embedding_cache = {} 203 | 204 | def remove_eot_tokens(self, response): 205 | for token in self.EOT_TOKENS: 206 | response = response.rstrip(token) 207 | return response.strip() 208 | 209 | def _calculate_relevance(self, user_input, response): 210 | tokens_input = set(self.tokenizer.tokenize(user_input.lower())) 211 | tokens_response = set(self.tokenizer.tokenize(response.lower())) 212 | overlap = len(tokens_input & tokens_response) 213 | relevance_score = overlap / max(len(tokens_input), 1) 214 | return relevance_score 215 | 216 | def _check_fluency(self, response): 217 | if len(response.split()) < 
3: 218 | return False 219 | if re.search(r'[^\x00-\x7F]+', response): 220 | return False 221 | return True 222 | 223 | def _check_structure(self, response): 224 | if not response: 225 | return False 226 | if not response[0].isupper(): 227 | return False 228 | if response[-1] not in '.!?': 229 | return False 230 | return True 231 | 232 | def _calculate_windowed_entropy(self, response): 233 | tokens = self.tokenizer.encode(response, return_tensors='pt').to(device) 234 | with torch.no_grad(): 235 | outputs = self.model(tokens, labels=tokens) 236 | logits = outputs.logits 237 | 238 | probabilities = torch.softmax(logits, dim=-1) 239 | token_probs = probabilities.gather(2, tokens.unsqueeze(-1)).squeeze(-1) 240 | token_entropy = -torch.log2(token_probs + 1e-10) 241 | token_entropy = token_entropy.squeeze(0).cpu().numpy() 242 | 243 | window_size = self.WINDOW_SIZE 244 | num_windows = max(1, len(token_entropy) // window_size) 245 | entropy_values = [] 246 | 247 | for i in range(num_windows): 248 | start = i * window_size 249 | end = start + window_size 250 | window = token_entropy[start:end] 251 | if len(window) == 0: 252 | continue 253 | window_entropy = np.mean(window) 254 | entropy_values.append(window_entropy) 255 | 256 | if not entropy_values: 257 | mean_entropy = 0.0 258 | std_entropy = 0.0 259 | else: 260 | mean_entropy = np.mean(entropy_values) 261 | std_entropy = np.std(entropy_values) 262 | 263 | return mean_entropy, std_entropy 264 | 265 | # --------------------------- Entropy-Based Temperature and Sampling Adjustment --------------------------- # 266 | 267 | def adjust_temperature_based_on_entropy(entropy, low_threshold=1.5, high_threshold=25.0): 268 | if entropy > high_threshold: 269 | new_temp = max(0.7, 1.0 - ((entropy - high_threshold) / 10)) 270 | logger.debug(f"High entropy detected ({entropy:.2f}). Lowering temperature to {new_temp:.2f}.") 271 | return new_temp 272 | elif entropy < low_threshold: 273 | new_temp = min(1.5, 1.0 + ((low_threshold - entropy) / 10)) 274 | logger.debug(f"Low entropy detected ({entropy:.2f}). Increasing temperature to {new_temp:.2f}.") 275 | return new_temp 276 | return 1.0 # Default temperature 277 | 278 | def adjust_sampling_parameters(entropy, low_k=50, high_k=5, low_p=0.95, high_p=0.8): 279 | if entropy > 20.0: 280 | logger.debug(f"High entropy ({entropy:.2f}). Setting top_k to {high_k} and top_p to {high_p}.") 281 | return high_k, high_p # Focused, deterministic sampling 282 | elif entropy < 10.0: 283 | logger.debug(f"Low entropy ({entropy:.2f}). Setting top_k to {low_k} and top_p to {low_p}.") 284 | return low_k, low_p # More diverse sampling 285 | # Intermediate adjustment 286 | adjusted_k = int((high_k + low_k) / 2) 287 | adjusted_p = (high_p + low_p) / 2 288 | logger.debug(f"Intermediate entropy ({entropy:.2f}). 
Setting top_k to {adjusted_k} and top_p to {adjusted_p}.") 289 | return adjusted_k, adjusted_p 290 | 291 | def sample_token(probs, top_k, top_p, temperature, special_tokens_set): 292 | if temperature != 1.0: 293 | probs = torch.softmax(torch.log(probs + 1e-10) / temperature, dim=-1)  # re-scale in log space (equivalent to dividing the logits by T); simply dividing the probabilities by a scalar is undone by the later renormalization 294 | 295 | if top_k > 0: 296 | topk_probs, topk_indices = torch.topk(probs, top_k) 297 | probs = torch.zeros_like(probs).scatter_(1, topk_indices, topk_probs) 298 | 299 | if top_p > 0.0: 300 | sorted_probs, sorted_indices = torch.sort(probs, descending=True) 301 | cumulative_probs = torch.cumsum(sorted_probs, dim=-1) 302 | sorted_probs[cumulative_probs - sorted_probs > top_p] = 0  # drop tokens outside the nucleus but always keep the top-ranked token, so the distribution never collapses to all zeros 303 | probs = torch.zeros_like(probs).scatter_(1, sorted_indices, sorted_probs) 304 | 305 | probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-10) 306 | 307 | for token_id in special_tokens_set: 308 | if probs[0, token_id] > 0.1: # Threshold can be adjusted 309 | logger.info(f"Prioritizing special token: {SPECIAL_TOKEN_MAP.get(token_id, 'UNKNOWN')}") 310 | return torch.tensor([[token_id]]).to(probs.device) 311 | 312 | token_id = torch.multinomial(probs, num_samples=1) 313 | return token_id 314 | 315 | # --------------------------- Response Generation --------------------------- # 316 | 317 | def generate_macroprocessed_response(prompt, model, tokenizer, quality_manager): 318 | inputs = tokenizer( 319 | prompt, 320 | return_tensors="pt", 321 | truncation=True, 322 | max_length=MAX_CONTEXT_LENGTH 323 | ).to(device) 324 | input_ids = inputs["input_ids"] 325 | 326 | max_tokens = 2048 # Adjust as needed 327 | generated_ids = input_ids.clone() 328 | 329 | token_log = [] 330 | 331 | for _ in range(max_tokens): 332 | outputs = model(generated_ids) 333 | logits = outputs.logits[:, -1, :] 334 | probs = torch.softmax(logits, dim=-1) 335 | 336 | entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=-1).mean().item() 337 | temperature = adjust_temperature_based_on_entropy(entropy) 338 | top_k, top_p = adjust_sampling_parameters(entropy) 339 | 340 | token_id = sample_token(probs, top_k, top_p, temperature, special_tokens_set={ 341 | tokenizer.eos_token_id, 342 | tokenizer.convert_tokens_to_ids("<|eom_id|>"), 343 | tokenizer.convert_tokens_to_ids("<|eot_id|>") 344 | }) 345 | 346 | if token_id.dim() != 2 or token_id.size(1) != 1: 347 | logger.error(f"Unexpected token_id shape: {token_id.shape}") 348 | raise ValueError(f"token_id has incorrect shape: {token_id.shape}") 349 | 350 | generated_ids = torch.cat([generated_ids, token_id], dim=1) 351 | 352 | token_log.append({ 353 | "token_id": token_id.item(), 354 | "entropy": entropy, 355 | "temperature": temperature, 356 | "top_k": top_k, 357 | "top_p": top_p 358 | }) 359 | 360 | if token_id.item() in tokenizer.all_special_ids: 361 | logger.info(f"End-of-sequence token detected: {SPECIAL_TOKEN_MAP.get(token_id.item(), 'UNKNOWN')}") 362 | break 363 | 364 | for log_entry in token_log: 365 | logger.info(f"Token: {log_entry['token_id']}, Entropy: {log_entry['entropy']:.2f}, " 366 | f"Temperature: {log_entry['temperature']:.2f}, top_k: {log_entry['top_k']}, top_p: {log_entry['top_p']}") 367 | 368 | response = tokenizer.decode(generated_ids[0], skip_special_tokens=True) 369 | response = response.split("AI:")[-1].strip() 370 | response = remove_memory_recall(response) 371 | 372 | return response 373 | 374 | def remove_memory_recall(response): 375 | response = re.sub(r"\[Memory\]:.*\nAI:", "", response, flags=re.DOTALL) 376 | return response.strip() 377 | 378 | def improved_generate_response(input_text, model, tokenizer, history, quality_manager, 
context_manager): 379 | sanitized_input = sanitize_input(input_text) 380 | prompt = context_manager.get_dynamic_prompt(sanitized_input) 381 | 382 | response = generate_macroprocessed_response(prompt, model, tokenizer, quality_manager) 383 | 384 | context_manager.update_context(sanitized_input, response) 385 | 386 | return response, context_manager.conversation_history 387 | 388 | def sanitize_input(user_input): 389 | sanitized = re.sub(r'[^\w\s.,!?]', '', user_input) 390 | return sanitized[:500] 391 | 392 | # --------------------------- Interactive Loop --------------------------- # 393 | 394 | def interactive_query(model, tokenizer, quality_manager, context_manager): 395 | print("\n--- LLaMA Instruct Model Interactive Query ---") 396 | print("Type 'exit' to quit.\n") 397 | 398 | while True: 399 | try: 400 | user_input = input("Enter your query: ").strip() 401 | except (EOFError, KeyboardInterrupt): 402 | print("\nExiting...") 403 | break 404 | 405 | if user_input.lower() == 'exit': 406 | print("Exiting...") 407 | break 408 | 409 | if not user_input: 410 | print("Please enter a valid query.") 411 | continue 412 | 413 | response, _ = improved_generate_response( 414 | user_input, 415 | model, 416 | tokenizer, 417 | context_manager.conversation_history, 418 | quality_manager, 419 | context_manager 420 | ) 421 | 422 | print(f"Model Response: {response}\n") 423 | 424 | # --------------------------- Flash Attention Check --------------------------- # 425 | 426 | def check_flash_attention(): 427 | try: 428 | import flash_attn 429 | logger.info("Flash Attention is available and enabled.") 430 | except ImportError: 431 | logger.warning("Flash Attention is not available. Using standard scaled dot product attention.") 432 | 433 | # --------------------------- Main Execution --------------------------- # 434 | 435 | def main(): 436 | global model 437 | 438 | # Load model configuration 439 | config = load_configuration(MODEL_JSON_PATH) 440 | 441 | # Initialize the model 442 | model = LlamaForCausalLM(config).to(device) 443 | logger.info("Initialized LLaMA model on GPU.") 444 | 445 | # Load offloaded weights 446 | load_offloaded_weights(model, WEIGHTS_DIR) 447 | model.eval() 448 | logger.info("Model is set to evaluation mode.") 449 | 450 | # Load tokenizer with special tokens 451 | tokenizer = load_tokenizer_with_special_tokens(SOURCE_DIR) 452 | 453 | # Resize token embeddings if special tokens were added 454 | if tokenizer.pad_token and tokenizer.pad_token not in tokenizer.get_vocab(): 455 | model.resize_token_embeddings(len(tokenizer)) 456 | logger.info("Resized model token embeddings to accommodate the new pad_token.") 457 | else: 458 | logger.info("pad_token already exists in the tokenizer's vocabulary. No need to resize embeddings.") 459 | 460 | # Check for Flash Attention 461 | check_flash_attention() 462 | 463 | # Initialize Response Quality Manager 464 | quality_manager = ImprovedResponseQualityManager(tokenizer, model) 465 | 466 | # Initialize Context Manager 467 | context_manager = AdvancedContextManager(model, tokenizer) 468 | 469 | logger.info("Model loaded successfully. 
You can now query the model.") 470 | 471 | # Start interactive query loop 472 | interactive_query(model, tokenizer, quality_manager, context_manager) 473 | 474 | if __name__ == "__main__": 475 | main() -------------------------------------------------------------------------------- /load_offloaded_model_old_working.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | import re 6 | import logging 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 10 | 11 | # Define paths to the directories and files 12 | SOURCE_DIR = "models/Llama_32_1B/" 13 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 14 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 15 | 16 | # Initialize logging 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | # Load the configuration from the JSON file 20 | def load_configuration(model_json_path): 21 | with open(model_json_path, "r") as f: 22 | config_data = json.load(f) 23 | config = LlamaConfig(**config_data) 24 | return config 25 | 26 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 27 | def load_tokenizer(source_dir): 28 | return AutoTokenizer.from_pretrained(source_dir) 29 | 30 | # Load the model configuration 31 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 32 | config = load_configuration(MODEL_JSON_PATH) 33 | 34 | # Initialize an empty model based on the configuration 35 | model = LlamaForCausalLM(config) 36 | logging.info("Initialized empty LLaMA model.") 37 | 38 | # Load the offloaded weights from the `.dat` files 39 | def load_dat_file(file_path, dtype): 40 | with open(file_path, 'rb') as f: 41 | tensor_data = np.fromfile(f, dtype=dtype) 42 | loaded_tensor = torch.tensor(tensor_data) 43 | 44 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 45 | if dtype == np.float32 and "bfloat16" in file_path: 46 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 47 | return loaded_tensor 48 | 49 | def load_offloaded_weights(model, weights_dir): 50 | for name, param in model.named_parameters(): 51 | file_name = name.replace('.', '_') + ".dat" 52 | file_path = os.path.join(weights_dir, file_name) 53 | 54 | if os.path.exists(file_path): 55 | dtype_map = { 56 | torch.float16: np.float16, 57 | torch.float32: np.float32, 58 | torch.int64: np.int64, 59 | torch.int32: np.int32, 60 | torch.bfloat16: np.float32, 61 | } 62 | expected_dtype = dtype_map.get(param.dtype, np.float32) 63 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 64 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 65 | 66 | if param.dtype == torch.bfloat16: 67 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 68 | 69 | param.data.copy_(loaded_tensor.to("cuda")) 70 | else: 71 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 72 | 73 | # Load the weights into the model 74 | load_offloaded_weights(model, WEIGHTS_DIR) 75 | 76 | # Move the model to GPU for inference 77 | model.to('cuda') 78 | model.eval() 79 | 80 | # Use AutoTokenizer to handle any tokenizer class discrepancies 81 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 82 | tokenizer = load_tokenizer(SOURCE_DIR) 83 | 84 | # Implement the ResponseQualityManager with metrics and corrective strategies 85 | class 
ResponseQualityManager: 86 | def __init__(self, kan_model, tokenizer): 87 | self.kan_model = kan_model 88 | self.tokenizer = tokenizer 89 | self.tfidf_vectorizer = TfidfVectorizer() 90 | 91 | def evaluate_response(self, user_input, response): 92 | relevance_score = self.calculate_relevance(user_input, response) 93 | structure_valid = self.has_proper_structure(response) 94 | is_garbled = self.detect_garbled_output(response) 95 | return relevance_score > 0.3 and structure_valid and not is_garbled 96 | 97 | def calculate_relevance(self, user_input, response): 98 | user_tokens = set(self.tokenizer.tokenize(user_input)) 99 | response_tokens = set(self.tokenizer.tokenize(response)) 100 | overlap = len(user_tokens.intersection(response_tokens)) 101 | overlap_score = overlap / max(len(user_tokens), 1) 102 | 103 | combined_texts = [user_input, response] 104 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(combined_texts) 105 | cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] 106 | 107 | return 0.5 * overlap_score + 0.5 * cosine_sim 108 | 109 | def detect_garbled_output(self, response): 110 | if re.search(r'[^\x00-\x7F]+', response): 111 | return True 112 | if len(response.split()) < 3: 113 | return True 114 | if response.count('.') / len(response.split()) > 0.5: 115 | return True 116 | return False 117 | 118 | def has_proper_structure(self, response): 119 | sentences = re.split(r'(?<=[.!?])\s+', response.strip()) 120 | return len(sentences) > 0 and sentences[0][0].isupper() and sentences[-1][-1] in '.!?' 121 | 122 | # Quality Manager instance for response evaluation 123 | quality_manager = ResponseQualityManager(model, tokenizer) 124 | 125 | 126 | # Updated generation logic to handle context better and avoid repetitive responses 127 | def generate_response(input_text, model, tokenizer, max_new_tokens=150, pad_token_id=128001, history=[], context_limit=512): 128 | # Clean the history to avoid redundant prompts 129 | history = [line for line in history if line.strip()] # Remove empty lines 130 | 131 | # Create a simplified context prompt from the last few exchanges 132 | prompt = f"{' '.join(history[-3:])}\nUser: {input_text}\n" if history else f"User: {input_text}\n" 133 | 134 | # Prepare inputs for the model 135 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit).to("cuda") 136 | 137 | # Generate the response 138 | with torch.no_grad(): 139 | outputs = model.generate( 140 | inputs["input_ids"], 141 | attention_mask=inputs["attention_mask"], 142 | max_new_tokens=max_new_tokens, # Control new tokens 143 | do_sample=True, 144 | temperature=0.7, 145 | top_k=50, 146 | top_p=0.9, 147 | repetition_penalty=1.2, 148 | pad_token_id=pad_token_id, 149 | early_stopping=True 150 | ) 151 | 152 | # Decode the response and format it properly 153 | response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() 154 | 155 | # Ensure clean history management and context length control 156 | cleaned_response = response.split("User:")[-1].strip() # Remove any overlap 157 | cleaned_response = re.sub(r'\s+', ' ', cleaned_response) # Clean excess whitespace 158 | 159 | # Append the cleaned response to history 160 | history.append(f"User: {input_text}\nModel: {cleaned_response}") 161 | 162 | # Trim history to prevent excessive accumulation 163 | if len(history) > 6: 164 | history = history[-6:] 165 | 166 | return cleaned_response, history 167 | 168 | # Updated user input loop to handle context better 169 | def 
user_input_loop(model, tokenizer): 170 | print("\n--- LLaMA Instruct Model Interactive Query ---") 171 | print("Type 'exit' to quit.") 172 | history = [] # Initialize a history buffer to keep track of conversation 173 | while True: 174 | user_input = input("\nEnter your query: ") 175 | if user_input.lower() == 'exit': 176 | print("Exiting...") 177 | break 178 | response, history = generate_response(user_input, model, tokenizer, history=history) 179 | print(f"Model Response: {response}") 180 | 181 | # Start the interactive query loop with the refined response generation 182 | logging.info("Model loaded successfully. You can now query the model.") 183 | user_input_loop(model, tokenizer) 184 | 185 | -------------------------------------------------------------------------------- /nonfunctional_transformers_garbled.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import re 7 | import logging 8 | from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 9 | from typing import Tuple 10 | import json 11 | from tqdm import tqdm # For progress bars 12 | 13 | # Define paths to the directories and files 14 | SOURCE_DIR = "models/Llama_32_1B/" 15 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 16 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 17 | 18 | # Initialize logging 19 | logging.basicConfig(level=logging.INFO) 20 | 21 | # Load the configuration from the JSON file 22 | def load_configuration(model_json_path): 23 | with open(model_json_path, "r") as f: 24 | config_data = json.load(f) 25 | config = LlamaConfig(**config_data) 26 | return config 27 | 28 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 29 | def load_tokenizer(source_dir): 30 | return AutoTokenizer.from_pretrained(source_dir) 31 | 32 | # Load the model configuration 33 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 34 | config = load_configuration(MODEL_JSON_PATH) 35 | 36 | # Initialize an empty model based on the configuration 37 | model = LlamaForCausalLM(config) 38 | logging.info("Initialized empty LLaMA model.") 39 | 40 | # Load the offloaded weights from the `.dat` files with a progress bar 41 | def load_dat_file(file_path, dtype): 42 | with open(file_path, 'rb') as f: 43 | tensor_data = np.fromfile(f, dtype=dtype) 44 | loaded_tensor = torch.tensor(tensor_data) 45 | 46 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 47 | if dtype == np.float32 and "bfloat16" in file_path: 48 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 49 | return loaded_tensor 50 | 51 | def load_offloaded_weights(model, weights_dir): 52 | param_names = list(model.named_parameters()) 53 | # Create a progress bar for weight loading 54 | with tqdm(total=len(param_names), desc="Loading weights", unit="param") as pbar: 55 | for name, param in param_names: 56 | file_name = name.replace('.', '_') + ".dat" 57 | file_path = os.path.join(weights_dir, file_name) 58 | 59 | if os.path.exists(file_path): 60 | dtype_map = { 61 | torch.float16: np.float16, 62 | torch.float32: np.float32, 63 | torch.int64: np.int64, 64 | torch.int32: np.int32, 65 | torch.bfloat16: np.float32, 66 | } 67 | expected_dtype = dtype_map.get(param.dtype, np.float32) 68 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 69 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 70 | 71 
| if param.dtype == torch.bfloat16: 72 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 73 | 74 | param.data.copy_(loaded_tensor.to("cuda")) 75 | else: 76 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 77 | 78 | pbar.update(1) # Update the progress bar after each parameter is loaded 79 | 80 | # Load the weights into the model 81 | load_offloaded_weights(model, WEIGHTS_DIR) 82 | 83 | # Move the model to GPU for inference 84 | model.to('cuda') 85 | model.eval() 86 | 87 | # Use AutoTokenizer to handle any tokenizer class discrepancies 88 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 89 | tokenizer = load_tokenizer(SOURCE_DIR) 90 | 91 | # Rotary embedding application with frequency scaling 92 | def apply_rotary_emb(q, k, freqs_cis, layer_index, rope_scaling): 93 | q_real = q.float().view(*q.shape[:-1], -1, 2) 94 | k_real = k.float().view(*k.shape[:-1], -1, 2) 95 | q_complex = torch.view_as_complex(q_real) 96 | k_complex = torch.view_as_complex(k_real) 97 | 98 | # Ensure freqs_cis is on the same device as q_complex 99 | device = q_complex.device 100 | freqs_cis = freqs_cis.to(device) # Move freqs_cis to the correct device 101 | 102 | # Get rope scaling parameters for this layer 103 | freq_factor = rope_scaling['high_freq_factor'] if layer_index >= 16 else rope_scaling['low_freq_factor'] 104 | 105 | # Adjust freqs_cis to match q's shape, considering the sequence length 106 | seq_len = q.shape[-2] # Sequence length from query tensor 107 | freqs_cis = freqs_cis[:seq_len, :q.shape[-1] // 2] * freq_factor 108 | 109 | # Expand freqs_cis to match the shape of q_complex 110 | freqs_cis = freqs_cis.unsqueeze(0).unsqueeze(1) # Expand for batch size and heads 111 | freqs_cis = freqs_cis.expand_as(q_complex) # Ensure it matches q_complex shape 112 | 113 | # Apply complex multiplication to both query and key tensors 114 | q_rot = torch.view_as_real(q_complex * freqs_cis).flatten(3) 115 | k_rot = torch.view_as_real(k_complex * freqs_cis).flatten(3) 116 | 117 | return q_rot, k_rot 118 | 119 | # Generating scaled rotary frequencies for LLaMA 3.2 120 | def get_rotary_frequencies(config): 121 | hidden_size = config.hidden_size 122 | max_position_embeddings = config.max_position_embeddings 123 | base = config.rope_theta 124 | scaling_factor = config.rope_scaling['factor'] 125 | 126 | inv_freq = 1.0 / (base ** (torch.arange(0, hidden_size, 2).float() / hidden_size)) 127 | t = torch.arange(max_position_embeddings, device=inv_freq.device) 128 | freqs = torch.outer(t, inv_freq) * scaling_factor 129 | return torch.polar(torch.ones_like(freqs), freqs) 130 | 131 | # Custom Attention Layer that applies rotary embeddings and processes attention 132 | class CustomAttentionLayer(nn.Module): 133 | def __init__(self, config, layer_index, weights_dir): 134 | super(CustomAttentionLayer, self).__init__() 135 | self.hidden_size = config.hidden_size 136 | self.num_heads = config.num_attention_heads 137 | self.num_key_value_heads = config.num_key_value_heads 138 | self.head_dim = config.head_dim 139 | self.weights_dir = weights_dir 140 | self.layer_index = layer_index 141 | self.rope_scaling = config.rope_scaling # Add rope scaling from config 142 | 143 | # Create nn.Linear layers 144 | self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) 145 | self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) 146 | self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) 147 | self.o_proj = 
nn.Linear(self.hidden_size, self.hidden_size, bias=False) 148 | 149 | # Load weights into the layers 150 | self.load_weights() 151 | 152 | self.scale = 1 / (self.head_dim ** 0.5) 153 | 154 | def load_weights(self): 155 | self.q_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_q_proj_weight.dat", (self.hidden_size, self.hidden_size)) 156 | self.k_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_k_proj_weight.dat", (self.num_key_value_heads * self.head_dim, self.hidden_size)) 157 | self.v_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_v_proj_weight.dat", (self.num_key_value_heads * self.head_dim, self.hidden_size)) 158 | self.o_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_o_proj_weight.dat", (self.hidden_size, self.hidden_size)) 159 | 160 | def load_weight(self, file_name, shape): 161 | file_path = os.path.join(self.weights_dir, file_name) 162 | if os.path.exists(file_path): 163 | tensor_data = np.fromfile(file_path, dtype=np.float32) 164 | return torch.tensor(tensor_data).view(*shape).to("cuda") 165 | else: 166 | raise FileNotFoundError(f"Weight file {file_name} not found.") 167 | 168 | def forward(self, hidden_states, freqs_cis, past_key_value=None, position_ids=None): 169 | # Ensure hidden_states are on the same device as model parameters (GPU) 170 | device = self.q_proj.weight.device 171 | hidden_states = hidden_states.to(device) 172 | 173 | batch_size, seq_length, _ = hidden_states.shape 174 | 175 | q = self.q_proj(hidden_states) 176 | k = self.k_proj(hidden_states) 177 | v = self.v_proj(hidden_states) 178 | 179 | q = q.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) 180 | k = k.view(batch_size, seq_length, self.num_key_value_heads, self.head_dim).transpose(1, 2) 181 | v = v.view(batch_size, seq_length, self.num_key_value_heads, self.head_dim).transpose(1, 2) 182 | 183 | # Repeat k and v for multi-query attention 184 | k = k.repeat_interleave(self.num_heads // self.num_key_value_heads, dim=1) 185 | v = v.repeat_interleave(self.num_heads // self.num_key_value_heads, dim=1) 186 | 187 | # Apply rotary embeddings with scaling based on layer index and rope scaling factors 188 | q_rot, k_rot = apply_rotary_emb(q, k, freqs_cis, self.layer_index, self.rope_scaling) 189 | 190 | if past_key_value is not None: 191 | past_k, past_v = past_key_value 192 | if past_k is not None and past_v is not None: 193 | k_rot = torch.cat([past_k, k_rot], dim=2) 194 | v = torch.cat([past_v, v], dim=2) 195 | 196 | attn_output = torch.nn.functional.scaled_dot_product_attention( 197 | q_rot, k_rot, v, attn_mask=None, dropout_p=0.0, is_causal=True 198 | ) 199 | 200 | attn_output = attn_output.transpose(1, 2).contiguous() 201 | attn_output = attn_output.reshape(batch_size, seq_length, self.hidden_size) 202 | attn_output = self.o_proj(attn_output) 203 | 204 | return attn_output, (k_rot, v) 205 | 206 | # Custom Transformer Layer integrating Attention and Feed-Forward Network 207 | class CustomTransformerLayer(nn.Module): 208 | def __init__(self, config, layer_index, weights_dir): 209 | super(CustomTransformerLayer, self).__init__() 210 | self.hidden_size = config.hidden_size 211 | self.intermediate_size = config.intermediate_size 212 | self.layer_index = layer_index 213 | self.weights_dir = weights_dir 214 | 215 | # Attention 216 | self.attention = CustomAttentionLayer(config, layer_index, weights_dir) 217 | 218 | # Layer norms 219 | self.input_layernorm = 
RMSNorm(config.hidden_size, eps=config.rms_norm_eps) 220 | self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) 221 | 222 | # Feed-forward network 223 | self.mlp = MLP( 224 | gate_proj=self.load_weight(f"model_layers_{layer_index}_mlp_gate_proj_weight.dat", (self.intermediate_size, self.hidden_size)), 225 | up_proj=self.load_weight(f"model_layers_{layer_index}_mlp_up_proj_weight.dat", (self.intermediate_size, self.hidden_size)), 226 | down_proj=self.load_weight(f"model_layers_{layer_index}_mlp_down_proj_weight.dat", (self.hidden_size, self.intermediate_size)), 227 | act_fn=F.silu 228 | ) 229 | 230 | def load_weight(self, file_name, shape): 231 | file_path = os.path.join(self.weights_dir, file_name) 232 | if os.path.exists(file_path): 233 | tensor_data = np.fromfile(file_path, dtype=np.float32) 234 | return torch.tensor(tensor_data).view(*shape).to("cuda") 235 | else: 236 | raise FileNotFoundError(f"Weight file {file_name} not found.") 237 | 238 | def forward(self, hidden_states, freqs_cis, past_key_value=None, position_ids=None, use_cache=False): 239 | # Ensure hidden_states are on the same device as model parameters (GPU) 240 | device = self.attention.q_proj.weight.device 241 | hidden_states = hidden_states.to(device) 242 | 243 | # Pre-attention norm 244 | residual = hidden_states 245 | hidden_states = self.input_layernorm(hidden_states) 246 | 247 | # Attention 248 | attention_output, new_past = self.attention(hidden_states, freqs_cis, past_key_value, position_ids) 249 | 250 | # Ensure residual and attention_output are on the same device 251 | attention_output = attention_output.to(residual.device) 252 | 253 | # Residual connection 254 | hidden_states = residual + attention_output 255 | 256 | # Pre-FFN norm 257 | residual = hidden_states 258 | hidden_states = self.post_attention_layernorm(hidden_states) 259 | 260 | # Feed-forward network 261 | hidden_states = self.mlp(hidden_states) 262 | 263 | # Residual connection 264 | hidden_states = residual + hidden_states 265 | 266 | if use_cache: 267 | return hidden_states, new_past 268 | else: 269 | return hidden_states, None 270 | 271 | # RMSNorm Layer 272 | class RMSNorm(nn.Module): 273 | def __init__(self, hidden_size, eps=1e-6): 274 | super().__init__() 275 | self.weight = nn.Parameter(torch.ones(hidden_size)) 276 | self.eps = eps 277 | 278 | def forward(self, hidden_states): 279 | # Ensure hidden_states and self.weight are on the same device 280 | device = self.weight.device 281 | hidden_states = hidden_states.to(device) 282 | 283 | # Compute the variance and apply normalization 284 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 285 | hidden_states = hidden_states * torch.rsqrt(variance + self.eps) 286 | return self.weight * hidden_states 287 | 288 | # MLP Layer 289 | class MLP(nn.Module): 290 | def __init__(self, gate_proj, up_proj, down_proj, act_fn): 291 | super().__init__() 292 | self.gate_proj = nn.Parameter(gate_proj) # We keep these as nn.Parameters (tensors) 293 | self.up_proj = nn.Parameter(up_proj) 294 | self.down_proj = nn.Parameter(down_proj) 295 | self.act_fn = act_fn 296 | 297 | def forward(self, x): 298 | # Ensure input x is on the same device as the model parameters 299 | device = self.gate_proj.device 300 | x = x.to(device) 301 | 302 | # Perform the MLP computation 303 | gate_out = torch.matmul(x, self.gate_proj.T) # Matrix multiplication with gate_proj 304 | up_out = torch.matmul(x, self.up_proj.T) # Matrix multiplication with up_proj 305 | activated_out = self.act_fn(gate_out) # 
Apply activation function 306 | 307 | # Perform element-wise multiplication and apply down_proj 308 | output = torch.matmul(activated_out * up_out, self.down_proj.T) 309 | 310 | return output 311 | 312 | # Custom LLaMA Model integrating custom transformer layers and rotary embeddings 313 | class CustomLlamaModel(LlamaForCausalLM): 314 | def __init__(self, config, weights_dir): 315 | super(CustomLlamaModel, self).__init__(config) 316 | self.weights_dir = weights_dir 317 | self.config = config 318 | 319 | self.transformer_layers = nn.ModuleList( 320 | [CustomTransformerLayer(config, layer_index, weights_dir) 321 | for layer_index in range(config.num_hidden_layers)] 322 | ) 323 | 324 | self.freqs_cis = get_rotary_frequencies(config) 325 | 326 | def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None, position_ids=None, past_key_values=None, use_cache=False, cache_position=None, return_dict=False): 327 | if inputs_embeds is None: 328 | inputs_embeds = self.get_input_embeddings()(input_ids) 329 | 330 | batch_size, seq_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2] 331 | 332 | if position_ids is None: 333 | if cache_position is not None: 334 | position_ids = torch.arange(cache_position, cache_position + seq_length, dtype=torch.long, device=inputs_embeds.device) 335 | position_ids = position_ids.unsqueeze(0).expand(batch_size, -1) 336 | else: 337 | position_ids = torch.arange(seq_length, dtype=torch.long, device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1) 338 | 339 | if past_key_values is None: 340 | past_key_values = [None] * self.config.num_hidden_layers # Fixed access 341 | 342 | hidden_states = inputs_embeds 343 | presents = [] if use_cache else None 344 | 345 | for i, layer in enumerate(self.transformer_layers): 346 | layer_past = past_key_values[i] if past_key_values is not None and len(past_key_values) > i else None 347 | hidden_states, past = layer(hidden_states, self.freqs_cis, layer_past, position_ids, use_cache) 348 | 349 | if use_cache: 350 | presents.append(past) 351 | 352 | hidden_states = hidden_states.to(self.lm_head.weight.device) 353 | logits = self.lm_head(hidden_states) 354 | 355 | # Always return a dictionary with logits and past_key_values when return_dict=True 356 | if return_dict: 357 | return {"logits": logits, "past_key_values": presents if use_cache else None} 358 | else: 359 | return logits # Return just logits when return_dict=False 360 | 361 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **kwargs): 362 | # Prepare inputs for generation, ensuring past key values are handled correctly 363 | if past_key_values: 364 | input_ids = input_ids[:, -1:] # Only pass the last token when past_key_values exist 365 | 366 | return { 367 | "input_ids": input_ids, 368 | "past_key_values": past_key_values, 369 | "attention_mask": attention_mask, 370 | "use_cache": kwargs.get("use_cache", True), 371 | } 372 | 373 | # Custom generate function (replaces generate in transformers) 374 | def custom_generate( 375 | model, 376 | tokenizer, 377 | input_ids, 378 | max_new_tokens=150, 379 | temperature=0.6, # Lower temperature for better coherence 380 | top_k=50, 381 | top_p=0.9, # Higher top_p for diversity without overwhelming the coherence 382 | repetition_penalty=1.2, 383 | pad_token_id=128001, 384 | eos_token_id=None, 385 | device="cuda" 386 | ): 387 | model.eval() # Set model to evaluation mode 388 | generated = input_ids.to(device) # [batch_size, seq_length] 389 | 390 | 
print(f"Initial Input IDs: {input_ids.tolist()}") # Log the initial input 391 | 392 | # Create a progress bar to track the generation process 393 | with tqdm(total=max_new_tokens, desc="Generating tokens", unit="token") as pbar: 394 | for _ in range(max_new_tokens): 395 | with torch.no_grad(): 396 | outputs = model(input_ids=generated, return_dict=True) 397 | logits = outputs["logits"] 398 | 399 | # Check the number of dimensions and handle accordingly 400 | if len(logits.shape) == 3: 401 | logits = logits[:, -1, :] # Standard case, 3D tensor 402 | elif len(logits.shape) == 2: 403 | logits = logits[:, :] # Handle 2D logits 404 | 405 | print(f"Logits shape: {logits.shape}") # Log the shape of logits 406 | 407 | # Apply repetition penalty 408 | if repetition_penalty != 1.0: 409 | for i in range(generated.shape[0]): 410 | unique_tokens = set(generated[i].tolist()) 411 | for token in unique_tokens: 412 | logits[i, token] /= repetition_penalty 413 | 414 | # Apply temperature scaling 415 | logits = logits / temperature 416 | 417 | # Top-K sampling 418 | if top_k > 0: 419 | top_k_logits, _ = torch.topk(logits, top_k, dim=-1) 420 | logits[logits < top_k_logits[:, [-1]]] = -float('Inf') 421 | 422 | # Top-P (nucleus) sampling 423 | sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) 424 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 425 | sorted_indices_to_remove = cumulative_probs > top_p 426 | 427 | # Shift the indices to the right to keep at least one token 428 | sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() 429 | sorted_indices_to_remove[:, 0] = 0 430 | 431 | # Scatter the sorted indices to the original logits tensor 432 | indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) 433 | logits[indices_to_remove] = -float('Inf') 434 | 435 | # Sample from the filtered distribution 436 | probs = F.softmax(logits, dim=-1) 437 | next_token = torch.multinomial(probs, num_samples=1) # [batch_size, 1] 438 | 439 | print(f"Generated Token ID: {next_token.tolist()}") # Log the generated token 440 | 441 | # Append generated token 442 | generated = torch.cat([generated, next_token], dim=-1) # [batch_size, seq_length +1] 443 | 444 | # Break on EOS token 445 | if eos_token_id is not None: 446 | if isinstance(eos_token_id, list): 447 | eos_tensor = torch.tensor(eos_token_id, device=next_token.device) # Ensure eos_token_id is a tensor 448 | if torch.any(torch.isin(next_token, eos_tensor)): 449 | print("EOS token encountered. Ending generation.") 450 | break 451 | else: 452 | if torch.any(next_token == eos_token_id): 453 | print("EOS token encountered. 
Ending generation.") 454 | break 455 | 456 | pbar.update(1) # Update progress bar after generating a token 457 | 458 | print(f"Final Generated Output IDs: {generated.tolist()}") # Log the final output 459 | return generated 460 | 461 | # Generate response method updated to call custom_generate 462 | def generate_response(input_text, model, tokenizer, max_new_tokens=150, pad_token_id=128001, history=[], context_limit=512): 463 | prompt = f"{' '.join(history[-3:])}\nUser: {input_text}\n" if history else f"User: {input_text}\n" 464 | 465 | print(f"Prompt: {prompt}") # Log the prompt to be tokenized 466 | 467 | # Tokenize the input prompt 468 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit) 469 | input_ids = inputs["input_ids"].to(next(model.parameters()).device) 470 | 471 | print(f"Tokenized Input IDs: {input_ids.tolist()}") # Log tokenized input 472 | 473 | # Generate the response using the custom generate function 474 | generated_output = custom_generate( 475 | model=model, 476 | tokenizer=tokenizer, 477 | input_ids=input_ids, 478 | max_new_tokens=max_new_tokens, 479 | temperature=0.7, 480 | top_k=50, 481 | top_p=0.9, 482 | repetition_penalty=1.2, 483 | pad_token_id=pad_token_id, 484 | eos_token_id=[128001, 128008, 128009], # Set your EOS token IDs as per config 485 | device=next(model.parameters()).device 486 | ) 487 | 488 | # Decode the generated output 489 | response = tokenizer.decode(generated_output[0], skip_special_tokens=True).strip() 490 | 491 | # Clean up the response to remove duplicate User tags or extraneous whitespace 492 | cleaned_response = response.split("User:")[-1].strip() 493 | cleaned_response = re.sub(r'\s+', ' ', cleaned_response) 494 | 495 | print(f"Final Generated Response: {cleaned_response}") # Log the cleaned response 496 | 497 | # Append this conversation turn to the history 498 | history.append(f"User: {input_text}\nModel: {cleaned_response}") 499 | 500 | # Trim the history to the last 6 conversation turns 501 | if len(history) > 6: 502 | history = history[-6:] 503 | 504 | return cleaned_response, history 505 | 506 | # Interactive input loop to query the model 507 | def user_input_loop(custom_model, tokenizer): 508 | print("\n--- Custom LLaMA 3.2 Instruct Model ---") 509 | print("Type 'exit' to quit.") 510 | history = [] # Initialize a history buffer to keep track of conversation 511 | while True: 512 | user_input = input("\nEnter your query: ") 513 | if user_input.lower() == 'exit': 514 | print("Exiting...") 515 | break 516 | try: 517 | response, history = generate_response(user_input, custom_model, tokenizer, history=history) 518 | print(f"Model Response: {response}") 519 | except Exception as e: 520 | # Show full error without wrapping to allow for easier debugging 521 | raise e 522 | 523 | 524 | # Initialize the custom model and tokenizer 525 | config = load_configuration(MODEL_JSON_PATH) 526 | tokenizer = load_tokenizer(SOURCE_DIR) 527 | custom_model = CustomLlamaModel(config, WEIGHTS_DIR) 528 | 529 | # Start the user input loop 530 | user_input_loop(custom_model, tokenizer) 531 | -------------------------------------------------------------------------------- /offloadedModelLiveLayerIdea.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | import re 6 | import logging 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 
from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 10 | 11 | # Define paths to the directories and files 12 | SOURCE_DIR = "models/Llama_32_1B/" 13 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 14 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 15 | 16 | # Initialize logging 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | # Load the configuration from the JSON file 20 | def load_configuration(model_json_path): 21 | with open(model_json_path, "r") as f: 22 | config_data = json.load(f) 23 | config = LlamaConfig(**config_data) 24 | return config 25 | 26 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 27 | def load_tokenizer(source_dir): 28 | return AutoTokenizer.from_pretrained(source_dir) 29 | 30 | # Load the model configuration 31 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 32 | config = load_configuration(MODEL_JSON_PATH) 33 | 34 | # Initialize an empty model based on the configuration 35 | model = LlamaForCausalLM(config) 36 | logging.info("Initialized empty LLaMA model.") 37 | 38 | # Load the offloaded weights from the `.dat` files 39 | def load_dat_file(file_path, dtype): 40 | with open(file_path, 'rb') as f: 41 | tensor_data = np.fromfile(f, dtype=dtype) 42 | loaded_tensor = torch.tensor(tensor_data) 43 | 44 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 45 | if dtype == np.float32 and "bfloat16" in file_path: 46 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 47 | return loaded_tensor 48 | 49 | def load_offloaded_weights(model, weights_dir): 50 | for name, param in model.named_parameters(): 51 | file_name = name.replace('.', '_') + ".dat" 52 | file_path = os.path.join(weights_dir, file_name) 53 | 54 | if os.path.exists(file_path): 55 | dtype_map = { 56 | torch.float16: np.float16, 57 | torch.float32: np.float32, 58 | torch.int64: np.int64, 59 | torch.int32: np.int32, 60 | torch.bfloat16: np.float32, 61 | } 62 | expected_dtype = dtype_map.get(param.dtype, np.float32) 63 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 64 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 65 | 66 | if param.dtype == torch.bfloat16: 67 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 68 | 69 | param.data.copy_(loaded_tensor.to("cuda")) 70 | else: 71 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 72 | 73 | # Load the weights into the model 74 | load_offloaded_weights(model, WEIGHTS_DIR) 75 | 76 | # Move the model to GPU for inference 77 | model.to('cuda') 78 | model.eval() 79 | 80 | # Use AutoTokenizer to handle any tokenizer class discrepancies 81 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 82 | tokenizer = load_tokenizer(SOURCE_DIR) 83 | 84 | # Helper function to calculate entropy 85 | def calculate_entropy(probs): 86 | log_probs = torch.log(probs + 1e-10) # Add small epsilon to avoid log(0) 87 | entropy = -torch.sum(probs * log_probs, dim=-1) 88 | return entropy 89 | 90 | def summarize_history(history, tokenizer, max_length=100): 91 | if not history: 92 | return "" 93 | 94 | # Concatenate the history into a single string 95 | history_text = " ".join(history) 96 | 97 | # Tokenize the history text 98 | history_tokens = tokenizer.encode(history_text, truncation=True, max_length=max_length) 99 | 100 | # Decode the summarized history tokens back into text 101 | summarized_history = tokenizer.decode(history_tokens) 102 | 103 | return summarized_history 104 | 105 | def 
evaluate_response_quality(response, user_input, tokenizer, threshold=0.75): 106 | # Tokenize the response and user input 107 | response_tokens = set(tokenizer.encode(response)) 108 | user_input_tokens = set(tokenizer.encode(user_input)) 109 | 110 | # Calculate the overlap between response and user input tokens 111 | overlap = len(response_tokens.intersection(user_input_tokens)) 112 | overlap_ratio = overlap / len(user_input_tokens) 113 | 114 | # Calculate the coherence of the response 115 | coherence_score = 0.0 # Implement a coherence scoring mechanism 116 | 117 | # Evaluate the relevance and quality of the response 118 | relevance_score = overlap_ratio 119 | quality_score = 0.5 * overlap_ratio + 0.5 * coherence_score 120 | 121 | return quality_score >= threshold 122 | 123 | def adjust_layers(model, quality_score, threshold=0.75): 124 | if quality_score < threshold: 125 | # Reduce the number of layers 126 | num_layers = max(1, model.config.num_hidden_layers // 2) 127 | else: 128 | # Increase the number of layers 129 | num_layers = min(model.config.num_hidden_layers * 2, 48) 130 | 131 | # Adjust the model's layers 132 | model.config.num_hidden_layers = num_layers 133 | model.resize_token_embeddings(len(tokenizer)) 134 | 135 | return model 136 | 137 | def generate_response(input_text, model, tokenizer, max_new_tokens=50, pad_token_id=128001, history=[], context_limit=512): 138 | # Clean the history to avoid redundant prompts 139 | history = [line for line in history if line.strip()] 140 | 141 | # Create a context prompt from the last few exchanges 142 | context = ' '.join(history[-3:]) if history else '' 143 | prompt = f"{context}\nUser: {input_text}\nModel:" 144 | 145 | # Prepare inputs for the model 146 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit).to("cuda") 147 | 148 | # Initialize response and keep track of tokens for refinement 149 | refined_response = "" 150 | refined_token_ids = [] 151 | 152 | # Iteratively generate and refine the response 153 | for iteration in range(10): # Number of iterations can be adjusted 154 | with torch.no_grad(): 155 | outputs = model.generate( 156 | inputs["input_ids"], 157 | attention_mask=inputs["attention_mask"], 158 | max_new_tokens=max_new_tokens // 10, # Distribute tokens across iterations 159 | do_sample=True, 160 | temperature=0.7, 161 | top_k=50, 162 | top_p=0.9, 163 | repetition_penalty=1.2, 164 | pad_token_id=pad_token_id, 165 | eos_token_id=tokenizer.eos_token_id, 166 | output_scores=True, 167 | return_dict_in_generate=True 168 | ) 169 | 170 | # Retrieve the generated token IDs 171 | new_token_ids = outputs.sequences[0][inputs["input_ids"].shape[1]:].tolist() 172 | refined_token_ids.extend(new_token_ids) 173 | 174 | # Decode the generated response 175 | refined_response = tokenizer.decode(refined_token_ids, skip_special_tokens=True).strip() 176 | 177 | # Check if the response is complete 178 | if refined_response.endswith(('.', '!', '?')) or 'User:' in refined_response: 179 | break 180 | 181 | # Update input for next iteration 182 | inputs["input_ids"] = outputs.sequences 183 | 184 | # Clean up the generated output 185 | response = refined_response.replace(prompt, "").strip() 186 | 187 | # Append final cleaned response to history 188 | history.append(f"User: {input_text}") 189 | history.append(f"Model: {response}") 190 | 191 | # Trim history to avoid excessive accumulation 192 | if len(history) > 10: 193 | history = history[-10:] 194 | 195 | return response, history 196 | 197 | def 
user_input_loop(model, tokenizer): 198 | print("\n--- LLaMA Interactive Query ---") 199 | print("Type 'exit' to quit.") 200 | history = [] 201 | 202 | while True: 203 | user_input = input("\nEnter your query: ") 204 | if user_input.lower() == 'exit': 205 | print("Exiting...") 206 | break 207 | 208 | # Generate response using the LLaMA model 209 | response, history = generate_response(user_input, model, tokenizer, history=history) 210 | print(f"Model: {response}") 211 | 212 | # Get user feedback on the response 213 | feedback = input("Please provide feedback on the response (good/bad): ") 214 | 215 | if feedback.lower() == 'bad': 216 | print("Thank you for your feedback. We'll work on improving the model's responses.") 217 | 218 | # Save the final conversation history 219 | with open("conversation_history.json", "w") as f: 220 | json.dump(history, f) 221 | 222 | # Start the interactive query loop with the refined response generation 223 | logging.info("Model loaded successfully. You can now query the model.") 224 | user_input_loop(model, tokenizer) -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # KAN-WuBu-Memory: LLaMA 3.2 1B Instruct with Kolmogorov-Arnold Networks (KAN) Integration 2 | 3 | ![KAN-WuBu Memory](https://img.shields.io/badge/PyTorch-CUDA_Enabled-blue.svg) 4 | ![Version](https://img.shields.io/badge/Version-2.0.0-brightgreen.svg) 5 | ![Contributions](https://img.shields.io/badge/Contributions-Welcome-orange.svg) 6 | 7 | ## Project Overview 8 | 9 | **KAN-WuBu-Memory** is an advanced memory-integrated AI system that combines the **LLaMA 3.2 1B** language model with **Kolmogorov-Arnold Networks (KAN)** and a multi-dimensional memory framework. This system builds on the concept of emotionally aware, contextually sensitive, and dynamically evolving conversations. With support for long-term memory consolidation, real-time emotional modulation, and adaptive response generation, **KAN-WuBu-Memory** is designed for complex and nuanced conversational interactions. 10 | 11 | ### Key Features 12 | 13 | - **Emotionally Aware Conversations**: Tracks and adjusts the AI's emotional state using a multi-dimensional model (`valence`, `arousal`, and `dominance`) to produce responses that align with contextual nuances. 14 | - **Kolmogorov-Arnold Networks (KAN) Integration**: Enhances LLaMA’s interaction by leveraging KANs to adapt internal representations dynamically. 15 | - **Advanced Memory Management**: Utilizes short-term, long-term, and sliding-window memories to retain context and adapt based on conversation history. 16 | - **Refusal Detection and Override**: Identifies refusal phrases and uses corrective mechanisms to ensure smooth and continuous interaction. 17 | - **Entropy-Based Response Management**: Uses entropy metrics to balance randomness and coherence in response generation. 18 | - **Synthetic Day-Night Cycle**: Simulates a day-night cycle to influence the AI’s behavior dynamically, adjusting its responses and internal states based on simulated time. 19 | - **Automatic State Saving and Loading**: Captures and saves the model’s state, memory, and emotional context after each interaction, allowing for continuous learning and persistent memory. 20 | - **Dynamic Sampling Strategy**: Adjusts the sampling parameters (`temperature` and `top_p`) based on entropy, memory importance, and conversation context. 21 | 22 | ## Table of Contents 23 | 24 | 1. 
[Installation](#installation) 25 | 2. [Usage](#usage) 26 | 3. [Customization](#customization) 27 | 4. [How It Works](#how-it-works) 28 | 5. [Contributing](#contributing) 29 | 6. [Credits](#credits) 30 | 7. [License](#license) 31 | 32 | ## Installation 33 | 34 | Follow these steps to set up **KAN-WuBu-Memory** on your system: 35 | 36 | 1. Clone the repository: 37 | 38 | ```bash 39 | git clone https://github.com/waefrebeorn/kan-wubu-memory.git 40 | ``` 41 | 42 | 2. Navigate to the project directory: 43 | 44 | ```bash 45 | cd kan-wubu-memory 46 | ``` 47 | 48 | 3. Run the setup script (`setup.bat` for Windows) to initialize the environment and install dependencies: 49 | 50 | ```bash 51 | .\setup.bat 52 | ``` 53 | 54 | 4. Ensure that Python 3.8+ and CUDA-compatible drivers are installed. The script will automatically set up a virtual environment and install PyTorch, Hugging Face Transformers, and other dependencies. 55 | 56 | 5. **Important**: Manually download the required **LLaMA 3.2 1B** model files and place them in the `models/Llama_32_1B` directory. 57 | 58 | - You can download the files with the Hugging Face CLI after accepting the Llama license. 59 | 60 | ## Usage 61 | 62 | Once the environment is set up, you can interact with the **KAN-WuBu-Memory** AI system: 63 | 64 | **KAN GUI**: Launch the interactive GUI: 65 | 66 | ```bash 67 | .\run.bat 68 | ``` 69 | 70 | 71 | ### Example Interaction 72 | 73 | ``` 74 | User: How are you feeling today? 75 | AI: I feel quite neutral at the moment. How can I assist you? 76 | ``` 77 | 78 | The emotional state will shift dynamically based on the conversation context. 79 | 80 | ## Customization 81 | 82 | You can adjust various components of the system to suit your needs: 83 | 84 | - **System Prompt**: Customize the AI’s character description in `main.py` or directly through the GUI during the first interaction. 85 | - **Emotional Feedback**: Modify the dimensions of emotional feedback to fit your use case (e.g., add `confidence`, `interest`). 86 | - **Synthetic Day Cycle**: Adjust the length and phases of the synthetic day cycle in `llama_32_1b_tool.py`. 87 | - **Memory Management**: Configure short-term and long-term memory buffers, and adjust the clustering for memory consolidation. 88 | - **Entropy Management**: Change entropy thresholds and sampling parameters (`temperature`, `top_p`) for response generation. 89 | 90 | ## How It Works 91 | 92 | ### EmotionalState Module 93 | 94 | The **EmotionalState** class tracks the AI’s emotional state across three dimensions (`valence`, `arousal`, and `dominance`) and updates based on user feedback and conversation context. This emotional model is used to generate emotionally aware and context-sensitive responses. 95 | 96 | ### Overfit Detector 97 | 98 | The **OverfitDetector** monitors loss trends across training and validation windows to identify signs of overfitting and trigger adjustments, such as early stopping or dynamic learning rate scaling. 99 | 100 | ### Kolmogorov-Arnold Networks (KAN) 101 | 102 | KANs modify the hidden layers of LLaMA, allowing the system to fine-tune and optimize its internal representations based on emotional and contextual inputs. The **EnhancedKAN** class enables dynamic adjustments, resulting in a more personalized conversational experience.
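To make the idea concrete, here is a minimal sketch of a residual adapter conditioned on the emotional-state vector. It is an illustration only: the class name, bottleneck width, and gating scheme below are assumptions for the example, not the project's actual `EnhancedKAN` implementation in `llama_32_1b_tool.py`.

```python
import torch
import torch.nn as nn

class HiddenStateAdapter(nn.Module):
    """Illustrative adapter (not the project's EnhancedKAN): adds a small learned
    correction to LLaMA hidden states, gated by (valence, arousal, dominance)."""

    def __init__(self, hidden_size: int, emotion_dim: int = 3, bottleneck: int = 64):
        super().__init__()
        self.down = nn.Linear(hidden_size, bottleneck)      # compress hidden states
        self.up = nn.Linear(bottleneck, hidden_size)        # project back up
        self.emotion_gate = nn.Linear(emotion_dim, bottleneck)
        self.act = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor, emotion: torch.Tensor) -> torch.Tensor:
        # hidden_states: [batch, seq_len, hidden_size]; emotion: [batch, emotion_dim]
        gate = torch.sigmoid(self.emotion_gate(emotion)).unsqueeze(1)  # [batch, 1, bottleneck]
        delta = self.up(self.act(self.down(hidden_states)) * gate)
        return hidden_states + delta  # residual correction keeps the base model intact
```

Because the adjustment is residual and low-rank, the base LLaMA weights can stay frozen while the adapter learns from emotional and contextual feedback.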
103 | 104 | ### Refusal Detection and Override 105 | 106 | The **RefusalDetector** module monitors for refusal phrases (e.g., "I cannot assist with...") and utilizes a KAN-powered override to refine and rephrase these responses, ensuring a smoother interaction flow. 107 | 108 | ### Memory Management 109 | 110 | The **AdvancedMemoryManager** handles multi-dimensional memory, integrating short-term, long-term, and sliding-window memories to consolidate and prioritize context. This module supports clustering, importance scoring, and context summarization for efficient memory management. 111 | 112 | ### Entropy-Based Response Quality Management 113 | 114 | The **EntropyManager** tracks the entropy of generated responses, ensuring a balance between coherence and randomness. Entropy metrics are used to adjust sampling parameters (`temperature`, `top_p`), and trigger "chain-of-thought" reasoning processes when necessary. 115 | 116 | ### Synthetic Day-Night Cycle 117 | 118 | The **SyntheticDayCycle** simulates a day-night cycle that influences the AI’s internal state. The cycle affects behavior, response length, and sampling parameters based on the time of day. 119 | 120 | ### Live State Saving 121 | 122 | After each interaction, the system captures and saves the current state (including emotional context, memory buffers, and learning metrics) to ensure continuous learning and persistence. 123 | 124 | ## Contributing 125 | 126 | We welcome contributions from the community! If you'd like to contribute: 127 | 128 | 1. Fork the repository. 129 | 2. Create a new branch for your feature or bug fix. 130 | 3. Submit a pull request with detailed comments on your changes. 131 | 132 | For major changes, please open an issue first to discuss what you would like to change. 133 | 134 | ## Credits 135 | 136 | This project is built with contributions from various open-source libraries and developers. Special thanks to: 137 | 138 | - **WuBu (WaefreBeorn)**: Project creator and lead developer. 139 | - **Meta AI**: For the **LLaMA** language model that powers the core interaction. 140 | - **Hugging Face**: For providing the **Transformers** library that makes working with modern NLP models accessible. 141 | - **PyTorch Team**: For the foundational deep learning library that enables model training and optimization with CUDA support. 142 | - **Contributors**: Open-source enthusiasts who provide libraries and frameworks like `matplotlib`, `scipy`, and more. 143 | 144 | ### Special Acknowledgments 145 | 146 | - **LLaMA and Meta Research Team** for the original research behind the **LLaMA** language models. 147 | - **Hugging Face Transformers Community** for their dedication to providing accessible NLP tools. 148 | - **NVIDIA** for the CUDA toolkit, enabling efficient GPU computation. 149 | 150 | ## License 151 | 152 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
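## Appendix: Entropy-Driven Sampling (Illustrative)

As a companion to the "Entropy-Based Response Quality Management" section above, the sketch below shows one way entropy can steer sampling parameters. The thresholds, step sizes, and function name are illustrative assumptions, not the tuned values used by `llama_32_1b_tool.py`.

```python
from typing import Tuple

import torch
import torch.nn.functional as F

def entropy_adjusted_sampling(logits: torch.Tensor,
                              temperature: float = 0.7,
                              top_p: float = 0.9,
                              low_entropy: float = 2.0,
                              high_entropy: float = 4.0) -> Tuple[float, float]:
    """Nudge temperature/top_p based on the entropy of the next-token distribution."""
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1).mean().item()
    if entropy < low_entropy:    # distribution too peaked: loosen sampling slightly
        return min(temperature + 0.1, 1.0), min(top_p + 0.05, 0.99)
    if entropy > high_entropy:   # distribution too flat: tighten sampling
        return max(temperature - 0.1, 0.1), max(top_p - 0.05, 0.5)
    return temperature, top_p
```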
153 | 154 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | torch 3 | transformers==4.45.1 4 | numpy 5 | matplotlib 6 | scikit-learn 7 | tqdm 8 | plotly 9 | 10 | # GUI 11 | tk 12 | 13 | 14 | 15 | # File handling 16 | # pathlib  # part of the Python 3 standard library; no install needed 17 | 18 | 19 | # Date and time handling 20 | # datetime  # part of the Python 3 standard library; no install needed 21 | 22 | # NVIDIA GPU support (make sure you have the appropriate CUDA version installed) 23 | # Note: The specific CUDA version should match your GPU driver 24 | 25 | # Optional: for better performance on CPU 26 | # intel-openmp 27 | # mkl 28 | 29 | # Development tools (optional) 30 | # pytest 31 | # black 32 | # isort 33 | # flake8 34 | 35 | # Documentation (optional) 36 | # sphinx 37 | # sphinx-rtd-theme 38 | -------------------------------------------------------------------------------- /run - load_offloaded_model.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Activate the virtual environment 5 | call venv\Scripts\activate 6 | 7 | :: Run the offloaded-model loading script 8 | python load_offloaded_model.py 9 | 10 | pause -------------------------------------------------------------------------------- /run - splitsafetensors.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Activate the virtual environment 5 | call venv\Scripts\activate 6 | 7 | :: Run the safetensors splitting script 8 | python split_safetensors.py 9 | 10 | pause -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Activate the virtual environment 5 | call venv\Scripts\activate 6 | 7 | :: Run the GUI script 8 | python kan_gui.py 9 | 10 | pause -------------------------------------------------------------------------------- /setup.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal enabledelayedexpansion 3 | 4 | echo Starting setup for KAN-WuBu-Memory with LLaMA 3.2 1B Model... 5 | 6 | :: Define project-specific paths 7 | set "PROJECT_DIR=%~dp0" 8 | set "MODEL_DIR=%PROJECT_DIR%models\Llama_32_1B" 9 | 10 | :: Check if Python is installed 11 | python --version >nul 2>&1 12 | if %errorlevel% neq 0 ( 13 | echo Python is not installed. Please install Python 3.8 or later from https://www.python.org/downloads/ 14 | exit /b 1 15 | ) 16 | 17 | :: Create the necessary folder structure 18 | if not exist "%MODEL_DIR%" ( 19 | echo Creating LLaMA 3.2 1B model directory... 20 | mkdir "%MODEL_DIR%" 21 | if %errorlevel% neq 0 ( 22 | echo Failed to create the LLaMA 3.2 model directory. 23 | exit /b 1 24 | ) 25 | ) 26 | 27 | echo Directory structure created successfully: %MODEL_DIR% 28 | 29 | :: Create a virtual environment if it doesn't exist 30 | if not exist "venv" ( 31 | echo Creating virtual environment... 32 | python -m venv venv 33 | if %errorlevel% neq 0 ( 34 | echo Failed to create virtual environment. 35 | exit /b 1 36 | ) 37 | ) 38 | 39 | :: Activate the virtual environment 40 | call venv\Scripts\activate 41 | if %errorlevel% neq 0 ( 42 | echo Failed to activate virtual environment. 43 | exit /b 1 44 | ) 45 | 46 | :: Upgrade pip 47 | echo Upgrading pip...
48 | python -m pip install --upgrade pip 49 | 50 | :: Install PyTorch with CUDA support 51 | echo Installing PyTorch with CUDA support... 52 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 53 | 54 | :: Install other requirements 55 | echo Installing other requirements... 56 | pip install -r requirements.txt 57 | 58 | :: Install Hugging Face transformers 59 | echo Installing latest Hugging Face transformers... 60 | pip install git+https://github.com/huggingface/transformers 61 | 62 | :: Install Accelerate (quote the version spec so cmd does not treat ">" as output redirection) 63 | echo Installing Accelerate... 64 | pip install "accelerate>=0.26.0" 65 | 66 | :: Verify CUDA installation 67 | echo Verifying CUDA installation... 68 | python -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('CUDA version:', torch.version.cuda if torch.cuda.is_available() else 'N/A')" 69 | 70 | :: Additional CUDA diagnostics 71 | echo. 72 | echo Running CUDA diagnostics... 73 | python -c "import torch; print('CUDA device count:', torch.cuda.device_count()); print('CUDA device name:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A')" 74 | 75 | :: Check NVIDIA driver 76 | echo. 77 | echo Checking NVIDIA driver... 78 | nvidia-smi 79 | 80 | echo Environment setup complete. 81 | 82 | echo. 83 | echo IMPORTANT: Manual Model Download Required 84 | echo ========================================== 85 | echo You have two options to get the LLaMA models: 86 | echo 1. **Directly from Meta:** 87 | echo - Visit the LLaMA download form at [https://www.llama.com/llama-downloads] 88 | echo - Fill in your details, select the models you want, and accept the licenses. 89 | echo - Check your email for download instructions and a pre-signed URL to download the model files: 90 | echo - checklist.chk 91 | echo - consolidated.00.pth 92 | echo - params.json 93 | echo - tokenizer.model 94 | echo - Place these files in the following directory: 95 | echo %MODEL_DIR% 96 | echo. 97 | echo 2. **From Hugging Face:** 98 | echo - Use the following command to download directly: 99 | echo huggingface-cli login 100 | echo huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --include "checklist.chk,consolidated.00.pth,params.json,tokenizer.model" --local-dir "%MODEL_DIR%" 101 | 102 | echo. 103 | echo Setup completed successfully. You can now run the main script using run.bat. 104 | pause 105 | -------------------------------------------------------------------------------- /split_safetensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from safetensors.torch import load_file 5 | 6 | # Define the directories for source and output 7 | SOURCE_FILE = "models/Llama_32_1B/model.safetensors" # Path to the input safetensor file 8 | OUTPUT_DIR = "models/Llama_32_1B/offload" # Path to the output directory 9 | 10 | # Create the output directory if it doesn't exist 11 | os.makedirs(OUTPUT_DIR, exist_ok=True) 12 | 13 | # Load the safetensors file 14 | print(f"Loading safetensors file from: {SOURCE_FILE}") 15 | state_dict = load_file(SOURCE_FILE) 16 | print(f"Safetensors file loaded. Found {len(state_dict)} tensors.") 17 | 18 | # Utility function to save individual tensors, preserving their original format 19 | def save_tensor(tensor, file_path): 20 | """ 21 | Save a PyTorch tensor to a binary .dat file without any format conversion.
22 | """ 23 | # Identify the original tensor type 24 | original_dtype = tensor.dtype 25 | 26 | # Check if the format is supported by NumPy 27 | try: 28 | # If the tensor is in bfloat16, PyTorch has direct support for saving 29 | if original_dtype == torch.bfloat16: 30 | # Convert to float32 for saving as .dat, since bfloat16 is not supported by numpy 31 | print(f"Saving {file_path} as bfloat16 using float32 for compatibility.") 32 | tensor.to(torch.float32).cpu().numpy().tofile(file_path) 33 | else: 34 | # Use the original format without conversion 35 | tensor.cpu().numpy().tofile(file_path) 36 | 37 | print(f"Saved tensor to {file_path} with original type {original_dtype}") 38 | except Exception as e: 39 | print(f"Failed to save {file_path} with dtype {original_dtype} due to: {e}") 40 | 41 | # Iterate through the state dictionary and save each tensor to a separate .dat file 42 | for tensor_name, tensor in state_dict.items(): 43 | # Construct a file path based on the tensor's name, replacing '.' with '_' 44 | file_path = os.path.join(OUTPUT_DIR, tensor_name.replace('.', '_') + ".dat") 45 | 46 | # Save the tensor in its original format 47 | try: 48 | save_tensor(tensor, file_path) 49 | except ValueError as e: 50 | print(f"Skipping {tensor_name} due to error: {e}") 51 | 52 | print(f"Model has been successfully split into individual .dat files in: {OUTPUT_DIR}") 53 | -------------------------------------------------------------------------------- /test_model_loading.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import pipeline 3 | 4 | def test_model_loading(model_path): 5 | try: 6 | # Use device 0 (GPU) if available, else CPU 7 | device = 0 if torch.cuda.is_available() else -1 8 | 9 | # Define the prompt as a list of messages 10 | prompt = [ 11 | {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."}, 12 | {"role": "user", "content": "What's Deep Learning?"}, 13 | ] 14 | 15 | # Initialize the pipeline with explicit task, model, and tokenizer 16 | generator = pipeline( 17 | task="text-generation", 18 | model=model_path, 19 | tokenizer=model_path, # Explicitly specify the tokenizer path 20 | device=device, 21 | torch_dtype=torch.float16 # Use torch.bfloat16 if supported 22 | ) 23 | 24 | # Generate the response 25 | generation = generator( 26 | prompt, 27 | do_sample=False, 28 | temperature=1.0, 29 | top_p=1, 30 | max_new_tokens=50 31 | ) 32 | 33 | print(f"Generation: {generation[0]['generated_text']}") 34 | except Exception as e: 35 | print(f"Error during generation: {e}") 36 | 37 | if __name__ == "__main__": 38 | # Use a raw string to prevent backslash interpretation 39 | model_path = r"C:\Projects\KAN-WuBu-Memory\models\Llama_32_1B" 40 | test_model_loading(model_path) 41 | -------------------------------------------------------------------------------- /test_sentencepiece.py: -------------------------------------------------------------------------------- 1 | import sentencepiece as spm 2 | 3 | sp = spm.SentencePieceProcessor() 4 | sp.Load("C:/Projects/KAN-WuBu-Memory/models/Llama_32_1B/tokenizer.model") 5 | print("Tokenizer loaded successfully.") 6 | -------------------------------------------------------------------------------- /test_tokenizer_loading.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer 3 | 4 | def test_tokenizer_loading(model_path): 5 | try: 6 | tokenizer = 
AutoTokenizer.from_pretrained( 7 | model_path, 8 | use_fast=True, 9 | trust_remote_code=True # Enable custom tokenizer code execution 10 | ) 11 | # Optionally, set a padding token if not already set 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | print("Tokenizer loaded successfully.") 15 | print(f"Tokenizer type: {type(tokenizer)}") 16 | except Exception as e: 17 | print(f"Error loading tokenizer: {e}") 18 | 19 | if __name__ == "__main__": 20 | model_path = "C:\\Projects\\KAN-WuBu-Memory\\models\\Llama_32_1B" 21 | test_tokenizer_loading(model_path) 22 | -------------------------------------------------------------------------------- /venv.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Check if virtual environment exists, if not, create it 5 | if not exist "venv\Scripts\activate" ( 6 | echo Creating virtual environment... 7 | python -m venv venv 8 | ) 9 | 10 | :: Activate the virtual environment 11 | call venv\Scripts\activate 12 | 13 | :: Inform the user that the environment is active and provide a command prompt 14 | echo Virtual environment activated. Type your commands below. 15 | 16 | :: Open command prompt for user to type commands 17 | cmd /K 18 | --------------------------------------------------------------------------------