├── .github └── workflows │ └── deploy.yaml ├── .gitignore ├── .license ├── 0.26.0 ├── Llama1bDATStackingFULLVRAM(300gb).py ├── adaptivekantemplate.py ├── dummy.py ├── erroniousduplicateidea.py ├── kan_gui.py ├── llama_32_1b_tool.py ├── llama_32_1b_toolold10_5_24.py ├── load_offloaded_model.py ├── load_offloaded_model_entropytemp.py ├── load_offloaded_model_old_working.py ├── nonfunctional_transformers_garbled.py ├── offloadedModelLiveLayerIdea.py ├── readme.md ├── requirements.txt ├── run - load_offloaded_model.bat ├── run - splitsafetensors.bat ├── run.bat ├── setup.bat ├── split_safetensors.py ├── test_model_loading.py ├── test_sentencepiece.py ├── test_tokenizer_loading.py └── venv.bat /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Run KAN Emotional Character with LLaMA 3.1 8B Instruct 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | run-kan-emotional-character: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v2 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.x' 19 | 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 25 | pip install git+https://github.com/huggingface/transformers 26 | 27 | - name: Run KAN Emotional Character script 28 | env: 29 | HF_CLIENT_ID: ${{ secrets.HF_CLIENT_ID }} 30 | HF_CLIENT_SECRET: ${{ secrets.HF_CLIENT_SECRET }} 31 | run: | 32 | python kan_emotional_character_llama_hf.py 33 | 34 | - name: Upload logs 35 | uses: actions/upload-artifact@v2 36 | with: 37 | name: kan-emotional-character-logs 38 | path: kan_emotional_character.log -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual Environment 2 | venv/ 3 | env/ 4 | ENV/ 5 | 6 | # Python cache files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Jupyter Notebook 57 | .ipynb_checkpoints 58 | 59 | # IPython 60 | profile_default/ 61 | ipython_config.py 62 | 63 | # pyenv 64 | .python-version 65 | 66 | # Environments 67 | .env 68 | .venv 69 | 70 | # Spyder project settings 71 | .spyderproject 72 | .spyproject 73 | 74 | # Rope project settings 75 | .ropeproject 76 | 77 | # mkdocs documentation 78 | /site 79 | 80 | # mypy 81 | .mypy_cache/ 82 | .dmypy.json 83 | dmypy.json 84 | 85 | # Pyre type checker 86 | .pyre/ 87 | 88 | # pytype static type analyzer 89 | .pytype/ 90 | 91 | # Cython debug symbols 92 | cython_debug/ 93 | 94 | # PyCharm 95 | .idea/ 96 | 97 | # VS Code 98 | .vscode/ 99 | 100 | # Windows 101 | Thumbs.db 102 | ehthumbs.db 103 | 
Desktop.ini 104 | 105 | # macOS 106 | .DS_Store 107 | .AppleDouble 108 | .LSOverride 109 | 110 | # Project-specific 111 | kan_character_state.json 112 | *.log 113 | 114 | # Models folder 115 | models/ 116 | kan_states/ -------------------------------------------------------------------------------- /.license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 WuBu (WaefreBeorn) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /0.26.0: -------------------------------------------------------------------------------- 1 | Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com 2 | Collecting accelerate 3 | Downloading accelerate-1.0.0-py3-none-any.whl.metadata (19 kB) 4 | Requirement already satisfied: numpy<3.0.0,>=1.17 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (1.26.3) 5 | Requirement already satisfied: packaging>=20.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (24.1) 6 | Collecting psutil (from accelerate) 7 | Downloading psutil-6.0.0-cp37-abi3-win_amd64.whl.metadata (22 kB) 8 | Requirement already satisfied: pyyaml in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (6.0.2) 9 | Requirement already satisfied: torch>=1.10.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (2.4.1+cu118) 10 | Requirement already satisfied: huggingface-hub>=0.21.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (0.25.1) 11 | Requirement already satisfied: safetensors>=0.4.3 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from accelerate) (0.4.5) 12 | Requirement already satisfied: filelock in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (3.13.1) 13 | Requirement already satisfied: fsspec>=2023.5.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (2024.2.0) 14 | Requirement already satisfied: requests in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (2.32.3) 15 | Requirement already satisfied: tqdm>=4.42.1 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from huggingface-hub>=0.21.0->accelerate) (4.66.5) 16 | Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from 
huggingface-hub>=0.21.0->accelerate) (4.9.0) 17 | Requirement already satisfied: sympy in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (1.12) 18 | Requirement already satisfied: networkx in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (3.2.1) 19 | Requirement already satisfied: jinja2 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (3.1.3) 20 | Requirement already satisfied: setuptools in c:\projects\kan-wubu-memory\venv\lib\site-packages (from torch>=1.10.0->accelerate) (70.0.0) 21 | Requirement already satisfied: colorama in c:\projects\kan-wubu-memory\venv\lib\site-packages (from tqdm>=4.42.1->huggingface-hub>=0.21.0->accelerate) (0.4.6) 22 | Requirement already satisfied: MarkupSafe>=2.0 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.5) 23 | Requirement already satisfied: charset-normalizer<4,>=2 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.3.2) 24 | Requirement already satisfied: idna<4,>=2.5 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (3.10) 25 | Requirement already satisfied: urllib3<3,>=1.21.1 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2.2.3) 26 | Requirement already satisfied: certifi>=2017.4.17 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from requests->huggingface-hub>=0.21.0->accelerate) (2024.8.30) 27 | Requirement already satisfied: mpmath>=0.19 in c:\projects\kan-wubu-memory\venv\lib\site-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0) 28 | Downloading accelerate-1.0.0-py3-none-any.whl (330 kB) 29 | Downloading psutil-6.0.0-cp37-abi3-win_amd64.whl (257 kB) 30 | Installing collected packages: psutil, accelerate 31 | Successfully installed accelerate-1.0.0 psutil-6.0.0 32 | -------------------------------------------------------------------------------- /Llama1bDATStackingFULLVRAM(300gb).py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import json 6 | import numpy as np 7 | import re 8 | import logging 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 12 | 13 | # Define paths to the directories and files 14 | SOURCE_DIR = "models/Llama_32_1B/" 15 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 16 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 17 | 18 | # Initialize logging 19 | logging.basicConfig(level=logging.INFO) 20 | 21 | # Load the configuration from the JSON file 22 | def load_configuration(model_json_path): 23 | with open(model_json_path, "r") as f: 24 | config_data = json.load(f) 25 | config = LlamaConfig(**config_data) 26 | return config 27 | 28 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 29 | def load_tokenizer(source_dir): 30 | return AutoTokenizer.from_pretrained(source_dir) 31 | 32 | # Load the model configuration 33 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 34 | config = load_configuration(MODEL_JSON_PATH) 35 | 36 | # Custom module for multiple stacked LLaMA layers (equivalent to 6x Mamba2 in NVIDIA presentation) 37 | class 
StackedLlamaModule(nn.Module): 38 | def __init__(self, config, num_layers=6): 39 | super(StackedLlamaModule, self).__init__() 40 | self.layers = nn.ModuleList([LlamaForCausalLM(config) for _ in range(num_layers)]) # Mimicking 6x Mamba2 41 | 42 | def forward(self, input_ids, attention_mask=None): 43 | x = input_ids 44 | for layer in self.layers: 45 | outputs = layer(input_ids=x, attention_mask=attention_mask) 46 | x = outputs.logits 47 | return x 48 | 49 | # Define shared components (e.g., Shared1 and Shared2) used in the modular structure 50 | class SharedLayer(nn.Module): 51 | def __init__(self, hidden_size): 52 | super(SharedLayer, self).__init__() 53 | self.mlp = nn.Sequential( 54 | nn.Linear(hidden_size, hidden_size), 55 | nn.ReLU(), 56 | nn.Linear(hidden_size, hidden_size), 57 | ) 58 | self.attention = nn.MultiheadAttention(hidden_size, num_heads=8) 59 | 60 | def forward(self, x): 61 | x = self.mlp(x) 62 | x, _ = self.attention(x, x, x) 63 | return x 64 | 65 | # Define Low-Rank Adaptation (LoRA) for efficient fine-tuning 66 | class LoRA(nn.Module): 67 | def __init__(self, hidden_size, rank=8): 68 | super(LoRA, self).__init__() 69 | self.rank = rank 70 | self.lora_A = nn.Linear(hidden_size, rank, bias=False) 71 | self.lora_B = nn.Linear(rank, hidden_size, bias=False) 72 | 73 | def forward(self, x): 74 | return x + self.lora_B(self.lora_A(x)) 75 | 76 | # Complete Stacked LLaMA model with shared components, stacking, and LoRA 77 | class StackedLlamaNetwork(nn.Module): 78 | def __init__(self, config, shared1, shared2, num_stacks=3): 79 | super(StackedLlamaNetwork, self).__init__() 80 | self.blocks = nn.ModuleList() 81 | 82 | for i in range(num_stacks): 83 | specialization = "early" if i == 0 else "mid" if i == 1 else "late" 84 | self.blocks.append( 85 | nn.ModuleDict({ 86 | "transformer_block": StackedLlamaModule(config), # Equivalent to 6x Mamba2 87 | "linear": nn.Linear(config.hidden_size, config.hidden_size), 88 | "shared": shared1 if i % 2 == 0 else shared2, # Alternating shared layers 89 | "lora_adapter": LoRA(config.hidden_size) # Optional LoRA for fine-tuning 90 | }) 91 | ) 92 | 93 | def forward(self, input_ids, attention_mask=None): 94 | x = input_ids 95 | intermediate_outputs = [] 96 | 97 | for block in self.blocks: 98 | x = block["transformer_block"](x, attention_mask) 99 | x = block["linear"](x) 100 | x = block["shared"](x) 101 | x = block["lora_adapter"](x) 102 | intermediate_outputs.append(x) 103 | 104 | # Concatenation of intermediate outputs (mimicking 'cat' operation in the image) 105 | x = torch.cat(intermediate_outputs, dim=-1) 106 | 107 | return x 108 | 109 | # Load the offloaded weights from the `.dat` files 110 | def load_dat_file(file_path, dtype): 111 | with open(file_path, 'rb') as f: 112 | tensor_data = np.fromfile(f, dtype=dtype) 113 | loaded_tensor = torch.tensor(tensor_data) 114 | 115 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 116 | if dtype == np.float32 and "bfloat16" in file_path: 117 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 118 | return loaded_tensor 119 | 120 | def load_offloaded_weights(stacked_model, weights_dir): 121 | for i, llama_model in enumerate(stacked_model.blocks): 122 | logging.info(f"Loading weights for LLaMA stack {i + 1}") 123 | for name, param in llama_model["transformer_block"].layers.named_parameters(): 124 | file_name = name.replace('.', '_') + ".dat" 125 | file_path = os.path.join(weights_dir, file_name) 126 | 127 | if os.path.exists(file_path): 128 | dtype_map = { 129 | torch.float16: 
np.float16, 130 | torch.float32: np.float32, 131 | torch.int64: np.int64, 132 | torch.int32: np.int32, 133 | torch.bfloat16: np.float32, 134 | } 135 | expected_dtype = dtype_map.get(param.dtype, np.float32) 136 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 137 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 138 | 139 | if param.dtype == torch.bfloat16: 140 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 141 | 142 | param.data.copy_(loaded_tensor.to("cuda")) 143 | else: 144 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 145 | 146 | # Load the weights into the model 147 | shared1 = SharedLayer(config.hidden_size) 148 | shared2 = SharedLayer(config.hidden_size) 149 | num_stacks = 3 # Number of stacked LLaMA instances 150 | model = StackedLlamaNetwork(config, shared1, shared2, num_stacks=num_stacks) 151 | load_offloaded_weights(model, WEIGHTS_DIR) 152 | 153 | # Move the model to GPU for inference 154 | model.to('cuda') 155 | model.eval() 156 | 157 | # Load the tokenizer for LLaMA 158 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 159 | tokenizer = load_tokenizer(SOURCE_DIR) 160 | 161 | # ResponseQualityManager class for evaluating and improving responses 162 | class ResponseQualityManager: 163 | def __init__(self, kan_model, tokenizer): 164 | self.kan_model = kan_model 165 | self.tokenizer = tokenizer 166 | self.tfidf_vectorizer = TfidfVectorizer() 167 | 168 | def evaluate_response(self, user_input, response): 169 | relevance_score = self.calculate_relevance(user_input, response) 170 | structure_valid = self.has_proper_structure(response) 171 | is_garbled = self.detect_garbled_output(response) 172 | return relevance_score > 0.3 and structure_valid and not is_garbled 173 | 174 | def calculate_relevance(self, user_input, response): 175 | user_tokens = set(self.tokenizer.tokenize(user_input)) 176 | response_tokens = set(self.tokenizer.tokenize(response)) 177 | overlap = len(user_tokens.intersection(response_tokens)) 178 | overlap_score = overlap / max(len(user_tokens), 1) 179 | 180 | combined_texts = [user_input, response] 181 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(combined_texts) 182 | cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] 183 | 184 | return 0.5 * overlap_score + 0.5 * cosine_sim 185 | 186 | def detect_garbled_output(self, response): 187 | if re.search(r'[^\x00-\x7F]+', response): 188 | return True 189 | if len(response.split()) < 3: 190 | return True 191 | if response.count('.') / len(response.split()) > 0.5: 192 | return True 193 | return False 194 | 195 | def has_proper_structure(self, response): 196 | sentences = re.split(r'(?<=[.!?])\s+', response.strip()) 197 | return len(sentences) > 0 and sentences[0][0].isupper() and sentences[-1][-1] in '.!?' 
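# --- Illustrative sketch (editor's addition, not part of the original file) ---
# ResponseQualityManager is instantiated just below (quality_manager), but
# generate_response() defined later in this file never calls it, so its relevance
# and structure checks have no effect on the interactive loop. The helper here is a
# minimal, hypothetical way to wire the two together: it re-samples a bounded number
# of times until evaluate_response() passes. The function name, retry count, and
# fallback behaviour are assumptions for illustration only.
def generate_checked_response(input_text, model, tokenizer, quality_manager,
                              history=None, max_retries=2):
    """Generate a response, retrying up to `max_retries` times if it fails the quality check."""
    history = history or []
    response = ""
    for _ in range(max_retries + 1):
        # Reuses the existing generate_response() defined later in this module.
        response, history = generate_response(input_text, model, tokenizer, history=history)
        # Guard against empty output: has_proper_structure() indexes into the first
        # sentence and would raise IndexError on an empty string.
        if response.strip() and quality_manager.evaluate_response(input_text, response):
            break  # Accept the first response that passes the relevance/structure checks.
    return response, history
# --- End of illustrative sketch ---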
198 | 199 | # Quality Manager instance for response evaluation 200 | quality_manager = ResponseQualityManager(model, tokenizer) 201 | 202 | # Updated generation logic to handle context better and avoid repetitive responses 203 | def generate_response(input_text, model, tokenizer, max_new_tokens=150, pad_token_id=128001, history=[], context_limit=512): 204 | history = [line for line in history if line.strip()] # Clean the history 205 | prompt = f"{' '.join(history[-3:])}\nUser: {input_text}\n" if history else f"User: {input_text}\n" 206 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit).to("cuda") 207 | 208 | with torch.no_grad(): 209 | outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"]) 210 | output_ids = torch.argmax(outputs, dim=-1) 211 | 212 | response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip() 213 | cleaned_response = re.sub(r'\s+', ' ', response.split("User:")[-1].strip()) 214 | history.append(f"User: {input_text}\nModel: {cleaned_response}") 215 | 216 | if len(history) > 6: 217 | history = history[-6:] 218 | 219 | return cleaned_response, history 220 | 221 | # Interactive query loop with refined response generation 222 | def user_input_loop(model, tokenizer): 223 | print("\n--- LLaMA Instruct Model Interactive Query ---") 224 | print("Type 'exit' to quit.") 225 | history = [] # Initialize a history buffer to keep track of conversation 226 | while True: 227 | user_input = input("\nEnter your query: ") 228 | if user_input.lower() == 'exit': 229 | print("Exiting...") 230 | break 231 | response, history = generate_response(user_input, model, tokenizer, history=history) 232 | print(f"Model Response: {response}") 233 | 234 | # Start the interactive query loop 235 | logging.info("Model loaded successfully. You can now query the model.") 236 | user_input_loop(model, tokenizer) 237 | -------------------------------------------------------------------------------- /adaptivekantemplate.py: -------------------------------------------------------------------------------- 1 | #adaptivekantemplate.py 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class AdaptiveKANLayer(nn.Module): 7 | def __init__(self, input_size, output_size, num_knots=10, temperature=0.666): 8 | """ 9 | Initialize an adaptive KAN layer with spline-based transformations. 10 | 11 | Args: 12 | input_size (int): Number of input features. 13 | output_size (int): Number of output features. 14 | num_knots (int): Number of knots in the spline function. 15 | temperature (float): Temperature parameter for adaptive updates. 16 | """ 17 | super(AdaptiveKANLayer, self).__init__() 18 | self.input_size = input_size 19 | self.output_size = output_size 20 | self.num_knots = num_knots 21 | self.temperature = temperature 22 | 23 | # Define spline parameters 24 | self.knots = nn.Parameter(torch.linspace(-1, 1, num_knots)) 25 | self.coeffs = nn.Parameter(torch.randn(input_size, output_size, num_knots)) 26 | 27 | def forward(self, x): 28 | """ 29 | Forward pass for the KAN layer. 30 | 31 | Args: 32 | x (torch.Tensor): Input tensor of shape (batch_size, input_size). 33 | 34 | Returns: 35 | torch.Tensor: Transformed output of shape (batch_size, output_size). 36 | """ 37 | weights = self.compute_spline_weights(x) 38 | return torch.matmul(x, weights) 39 | 40 | def compute_spline_weights(self, x): 41 | """ 42 | Compute the spline transformation weights for input x. 
43 | 44 | Args: 45 | x (torch.Tensor): Input tensor of shape (batch_size, input_size). 46 | 47 | Returns: 48 | torch.Tensor: Spline weights of shape (input_size, output_size). 49 | """ 50 | weights = F.interpolate(self.coeffs.unsqueeze(0), size=(self.num_knots,)).squeeze(0) 51 | return weights 52 | 53 | def calculate_entropy(self, logits): 54 | """ 55 | Calculate entropy of the spline transformations. 56 | 57 | Args: 58 | logits (torch.Tensor): Logits tensor of shape (batch_size, num_classes). 59 | 60 | Returns: 61 | torch.Tensor: Entropy values for each class. 62 | """ 63 | p = F.softmax(logits, dim=-1) 64 | entropy = -torch.sum(p * torch.log(p + 1e-9), dim=-1) 65 | return entropy 66 | 67 | def adaptive_update(self, entropy, variance): 68 | """ 69 | Adaptively update grid resolution and regularization based on entropy. 70 | 71 | Args: 72 | entropy (float): Current entropy of the spline transformations. 73 | variance (float): Variance of the entropy values. 74 | """ 75 | if entropy < 0.1 and variance < 0.1: 76 | self.prune_knots() 77 | elif entropy > 5.0 and variance < 0.1: 78 | self.extend_knots() 79 | elif entropy < 5.0 and variance > 5.0: 80 | self.refine_coeffs() 81 | elif entropy > 5.0 and variance > 5.0: 82 | self.increase_capacity() 83 | else: 84 | self.moderate_update() 85 | 86 | def prune_knots(self): 87 | """Remove low-impact knots.""" 88 | if self.num_knots > 3: # Ensure a minimum number of knots 89 | self.num_knots -= 1 90 | self.knots = nn.Parameter(torch.linspace(-1, 1, self.num_knots)) 91 | self.coeffs = nn.Parameter(torch.randn(self.input_size, self.output_size, self.num_knots)) 92 | 93 | def extend_knots(self): 94 | """Add new knots to the spline.""" 95 | self.num_knots += 1 96 | self.knots = nn.Parameter(torch.linspace(-1, 1, self.num_knots)) 97 | self.coeffs = nn.Parameter(torch.randn(self.input_size, self.output_size, self.num_knots)) 98 | 99 | def refine_coeffs(self): 100 | """Adjust coefficients for local refinement.""" 101 | with torch.no_grad(): 102 | self.coeffs += torch.randn_like(self.coeffs) * 0.01 103 | 104 | def increase_capacity(self): 105 | """Increase the capacity of the layer.""" 106 | with torch.no_grad(): 107 | self.coeffs = nn.Parameter(torch.cat([self.coeffs, torch.randn(self.input_size, self.output_size, self.num_knots)], dim=1)) 108 | 109 | def moderate_update(self): 110 | """Default update routine.""" 111 | self.refine_coeffs() 112 | 113 | 114 | class AdaptiveKANNetwork(nn.Module): 115 | def __init__(self, input_size, hidden_sizes, output_size, num_layers=3, temperature=0.666): 116 | """ 117 | Initialize a multi-layer KAN network with adaptive layers. 118 | 119 | Args: 120 | input_size (int): Number of input features. 121 | hidden_sizes (list of int): List of hidden sizes for each layer. 122 | output_size (int): Number of output features. 123 | num_layers (int): Number of KAN layers. 124 | temperature (float): Temperature parameter for adaptive updates. 
125 | """ 126 | super(AdaptiveKANNetwork, self).__init__() 127 | self.input_size = input_size 128 | self.hidden_sizes = hidden_sizes 129 | self.output_size = output_size 130 | self.num_layers = num_layers 131 | self.temperature = temperature 132 | 133 | # Initialize KAN layers 134 | self.layers = nn.ModuleList() 135 | in_size = input_size 136 | for hidden_size in hidden_sizes: 137 | self.layers.append(AdaptiveKANLayer(in_size, hidden_size, num_knots=10, temperature=temperature)) 138 | in_size = hidden_size 139 | self.output_layer = AdaptiveKANLayer(in_size, output_size, num_knots=10, temperature=temperature) 140 | 141 | def forward(self, x): 142 | """ 143 | Forward pass through the KAN network. 144 | 145 | Args: 146 | x (torch.Tensor): Input tensor of shape (batch_size, input_size). 147 | 148 | Returns: 149 | torch.Tensor: Network output of shape (batch_size, output_size). 150 | """ 151 | for layer in self.layers: 152 | x = layer(x) 153 | x = F.relu(x) 154 | return self.output_layer(x) 155 | 156 | def adaptive_train_step(self, x, y, optimizer): 157 | """ 158 | Single training step with adaptive updates. 159 | 160 | Args: 161 | x (torch.Tensor): Input tensor. 162 | y (torch.Tensor): Target tensor. 163 | optimizer (torch.optim.Optimizer): Optimizer for updating parameters. 164 | """ 165 | optimizer.zero_grad() 166 | output = self.forward(x) 167 | loss = F.mse_loss(output, y) 168 | 169 | # Calculate entropy and variance for adaptive updates 170 | entropy = torch.mean(torch.stack([layer.calculate_entropy(layer.coeffs) for layer in self.layers])) 171 | variance = torch.var(torch.stack([layer.calculate_entropy(layer.coeffs) for layer in self.layers])) 172 | 173 | # Adaptive updates 174 | for layer in self.layers: 175 | layer.adaptive_update(entropy, variance) 176 | 177 | # Backpropagation and optimization step 178 | loss.backward() 179 | optimizer.step() 180 | return loss.item() 181 | 182 | 183 | # Example Usage 184 | if __name__ == "__main__": 185 | # Define input and output sizes 186 | input_size = 10 187 | hidden_sizes = [20, 30] 188 | output_size = 5 189 | 190 | # Create the network and optimizer 191 | model = AdaptiveKANNetwork(input_size, hidden_sizes, output_size, num_layers=3) 192 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01) 193 | 194 | # Example data 195 | x = torch.randn(32, input_size) 196 | y = torch.randn(32, output_size) 197 | 198 | # Training step 199 | for epoch in range(100): 200 | loss = model.adaptive_train_step(x, y, optimizer) 201 | print(f"Epoch {epoch+1}, Loss: {loss}") 202 | -------------------------------------------------------------------------------- /dummy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from sklearn.decomposition import PCA 4 | import plotly.graph_objs as go 5 | import numpy as np 6 | from tqdm import tqdm 7 | import json 8 | 9 | # 1. 
User-friendly Configuration Setup for Macroprocessor-Like Inference 10 | 11 | class UserConfig: 12 | """Interface to set up and customize model configurations and preferences for a macroprocessor-like model.""" 13 | def __init__(self, config_file=None): 14 | self.config = { 15 | "max_length": 20, 16 | "initial_weights": { 17 | "entropy": 1.0, 18 | "varentropy": 0.5, 19 | "kl_div": 0.3, 20 | "perplexity": 0.2 21 | }, 22 | "visualization_frequency": 5, 23 | "logging_level": "detailed", 24 | "interactive_visuals": True, 25 | "precision": "float16", # Use float16 or bfloat16 for inference 26 | "kv_cache_enabled": True # Enable smart KV-caching 27 | } 28 | if config_file: 29 | self.load_config(config_file) 30 | 31 | def load_config(self, file_path): 32 | """Load configuration from a JSON file.""" 33 | with open(file_path, 'r') as f: 34 | self.config = json.load(f) 35 | 36 | def save_config(self, file_path): 37 | """Save the current configuration to a JSON file.""" 38 | with open(file_path, 'w') as f: 39 | json.dump(self.config, f, indent=4) 40 | 41 | def update_config(self, key, value): 42 | """Update a specific configuration setting.""" 43 | self.config[key] = value 44 | 45 | def get_config(self): 46 | """Return the current configuration.""" 47 | return self.config 48 | 49 | 50 | # 2. Token Selection with Efficient Memory Management (Macroprocessor-like Operations) 51 | 52 | def select_token_with_weights(logits, vertex_movements, loss_weighting_system, precision="float16"): 53 | """Efficient token selection using entropy, varentropy, and configurable loss weights with precision support.""" 54 | if precision == "float16": 55 | logits = logits.half() # Switch to float16 for inference speedup 56 | elif precision == "bfloat16": 57 | logits = logits.bfloat16() # Alternatively, use bfloat16 58 | 59 | # Compute multiple losses 60 | entropy = calculate_entropy(logits) 61 | varentropy = calculate_varentropy(entropy) 62 | kl_div = calculate_kl_divergence(logits) 63 | perplexity = calculate_perplexity(logits) 64 | 65 | # Log losses to adjust weights dynamically 66 | loss_weighting_system.log_losses(entropy, varentropy, kl_div, perplexity) 67 | 68 | # Adjust weights based on historical performance 69 | loss_weighting_system.adjust_weights() 70 | weights = loss_weighting_system.get_weights() 71 | 72 | # Adjust logits by the weighted sum of losses and vertex movements 73 | adjusted_logits = logits - ( 74 | weights['entropy'] * entropy + 75 | weights['varentropy'] * varentropy + 76 | weights['kl_div'] * kl_div + 77 | weights['perplexity'] * perplexity 78 | ).unsqueeze(-1) 79 | 80 | # Apply vertex movement strategy 81 | adjusted_logits += vertex_movements 82 | 83 | # Sample from adjusted probabilities 84 | probs = F.softmax(adjusted_logits, dim=-1) 85 | selected_token = torch.multinomial(probs, 1) 86 | 87 | return selected_token, adjusted_logits 88 | 89 | 90 | # 3. 
Improved Visualization with Token-Level Progress 91 | 92 | def plot_interactive_4d_space(hidden_states, entropies, time_steps): 93 | """Optimized 3D projection with token-level interactivity for macroprocessor-like inference.""" 94 | pca = PCA(n_components=3) 95 | fig_data = [] 96 | 97 | for i, (hs, entropy, time_step) in enumerate(zip(hidden_states, entropies, time_steps)): 98 | projected_hs = pca.fit_transform(hs.squeeze(0).detach().cpu().numpy()) 99 | entropy_colors = (entropy.detach().cpu().numpy() - np.min(entropy.detach().cpu().numpy())) / \ 100 | (np.max(entropy.detach().cpu().numpy()) - np.min(entropy.detach().cpu().numpy())) 101 | 102 | scatter = go.Scatter3d( 103 | x=projected_hs[:, 0], y=projected_hs[:, 1], z=projected_hs[:, 2], 104 | mode='markers', 105 | marker=dict(size=5, color=entropy_colors, colorscale='Viridis', opacity=0.8), 106 | name=f"Step {time_step}", 107 | text=[f"Step: {time_step}, Entropy: {entropy_val}" for entropy_val in entropy.detach().cpu().numpy()] 108 | ) 109 | fig_data.append(scatter) 110 | 111 | layout = go.Layout( 112 | title="Token-wise 4D Space Travel with Layer Progression", 113 | scene=dict( 114 | xaxis_title='PCA 1', 115 | yaxis_title='PCA 2', 116 | zaxis_title='PCA 3', 117 | ), 118 | hovermode='closest', 119 | updatemenus=[dict( 120 | type="buttons", 121 | showactive=False, 122 | buttons=[dict(label="Play", 123 | method="animate", 124 | args=[None, {"frame": {"duration": 500, "redraw": True}, "fromcurrent": True}])] 125 | )] 126 | ) 127 | 128 | fig = go.Figure(data=fig_data, layout=layout) 129 | fig.show() 130 | 131 | 132 | # 4. Efficient Memory and KV-Caching for Faster Macroprocessor-Style Inference 133 | 134 | def inference_with_kv_caching(model, input_ids, user_config): 135 | config = user_config.get_config() 136 | precision = config["precision"] 137 | kv_cache_enabled = config["kv_cache_enabled"] 138 | 139 | entropies, varentropies, kl_divs, perplexities = [], [], [], [] 140 | vertex_movements = torch.zeros(input_ids.size(0), model.config.vocab_size).to(input_ids.device) 141 | loss_weighting_system = ConfigurableLossWeighting(user_config) 142 | time_steps = [] 143 | 144 | # Enable caching of keys and values for attention layers 145 | past_kv_cache = None if not kv_cache_enabled else {} 146 | 147 | with tqdm(total=config["max_length"], desc="Macro Inference Progress", unit="step") as progress: 148 | for step in range(config["max_length"]): 149 | if kv_cache_enabled and past_kv_cache: 150 | # Use past key-value cache to speed up inference 151 | model_kwargs = {"past_key_values": past_kv_cache} 152 | else: 153 | model_kwargs = {} 154 | 155 | logits, past_kv_cache = model(input_ids, **model_kwargs)[:2] # Retrieve past_kv_cache for the next step 156 | 157 | logits = logits[:, -1, :] # Logits for the last token 158 | 159 | # Efficient token selection 160 | next_token, adjusted_logits = select_token_with_weights(logits, vertex_movements, loss_weighting_system, precision) 161 | 162 | # Append the selected token to input_ids 163 | input_ids = torch.cat([input_ids, next_token], dim=1) 164 | 165 | # Log losses and update token trajectory 166 | entropy = calculate_entropy(logits) 167 | varentropy = calculate_varentropy(entropy) 168 | kl_div = calculate_kl_divergence(logits) 169 | perplexity = calculate_perplexity(logits) 170 | 171 | entropies.append(entropy) 172 | varentropies.append(varentropy) 173 | kl_divs.append(kl_div) 174 | perplexities.append(perplexity) 175 | time_steps.append(step) 176 | 177 | progress.update(1) 178 | 179 | # Once complete, 
visualize the token-wise space travel 180 | plot_interactive_4d_space(hidden_states, entropies, time_steps) 181 | 182 | return input_ids 183 | 184 | 185 | # 5. Example Usage 186 | 187 | if __name__ == "__main__": 188 | from transformers import AutoTokenizer, AutoModelForCausalLM 189 | 190 | # User configuration for macroprocessor-style inference 191 | user_config = UserConfig() 192 | 193 | model = AutoModelForCausalLM.from_pretrained("your-model-path").to("cuda") 194 | tokenizer = AutoTokenizer.from_pretrained("your-model-path") 195 | 196 | # Input sequence for inference 197 | input_ids = tokenizer("Your input text", return_tensors="pt").input_ids.to("cuda") 198 | 199 | # Run inference with efficient KV caching and float16 precision 200 | output_ids = inference_with_kv_caching(model, input_ids, user_config) 201 | 202 | # Decode the output tokens to text 203 | output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True) 204 | print("Generated text:", output_text) 205 | -------------------------------------------------------------------------------- /kan_gui.py: -------------------------------------------------------------------------------- 1 | import tkinter as tk 2 | from tkinter import scrolledtext, messagebox, ttk, filedialog 3 | import threading 4 | import logging 5 | import traceback 6 | from llama_32_1b_tool import LLaMA32TensorRTTool 7 | import matplotlib.pyplot as plt 8 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 9 | import warnings 10 | import torch 11 | from functools import partial 12 | import asyncio 13 | import queue 14 | import re 15 | 16 | # -------------------- Logging Configuration -------------------- 17 | 18 | class LogFilter(logging.Filter): 19 | def __init__(self, ignore_patterns=None): 20 | super().__init__() 21 | self.ignore_patterns = ignore_patterns or [] 22 | 23 | def filter(self, record): 24 | return not any(pattern in record.getMessage() for pattern in self.ignore_patterns) 25 | 26 | def setup_logging(): 27 | logger = logging.getLogger() 28 | logger.setLevel(logging.DEBUG) 29 | 30 | file_handler = logging.FileHandler('llama_tool.log', mode='a') 31 | file_handler.setLevel(logging.DEBUG) 32 | file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 33 | file_handler.setFormatter(file_formatter) 34 | 35 | console_handler = logging.StreamHandler() 36 | console_handler.setLevel(logging.WARNING) 37 | console_formatter = logging.Formatter('%(levelname)s - %(message)s') 38 | console_handler.setFormatter(console_formatter) 39 | 40 | logger.addHandler(file_handler) 41 | logger.addHandler(console_handler) 42 | 43 | ignore_patterns = [ 44 | "matplotlib", 45 | "PIL.PngImagePlugin", 46 | "expandable_segments not supported", 47 | "weights_only", 48 | "half", 49 | "train_kan_step -", 50 | "Torch was not compiled with flash attention." 51 | "1Torch was not compiled with flash attention." 52 | ".*Torch was not compiled with flash attention.*" 53 | "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead." 
54 | ] 55 | 56 | console_handler.addFilter(LogFilter(ignore_patterns)) 57 | 58 | warnings.filterwarnings("ignore", category=UserWarning, message="Torch was not compiled with flash attention.*") 59 | warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`.*") 60 | 61 | logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING) 62 | logging.getLogger('matplotlib.pyplot').setLevel(logging.WARNING) 63 | logging.getLogger('PIL.PngImagePlugin').setLevel(logging.WARNING) 64 | 65 | setup_logging() 66 | 67 | # -------------------- GUI Class -------------------- 68 | 69 | class LLAMA32GUI: 70 | def __init__(self, master): 71 | self.master = master 72 | master.title("LLaMA 3.2 1B Instruct KAN Interaction") 73 | 74 | self.notebook = ttk.Notebook(master) 75 | self.notebook.pack(fill=tk.BOTH, expand=True) 76 | 77 | self.main_tab = ttk.Frame(self.notebook) 78 | self.notebook.add(self.main_tab, text="Main") 79 | 80 | self.graphs_tab = ttk.Frame(self.notebook) 81 | self.notebook.add(self.graphs_tab, text="Graphs") 82 | 83 | self.setup_main_tab() 84 | self.setup_graphs_tab() 85 | 86 | self.llama_tool = None 87 | self.is_first_message = True 88 | self.response_queue = queue.Queue() 89 | 90 | # Create a new event loop for the background thread 91 | self.loop = asyncio.new_event_loop() 92 | asyncio.set_event_loop(self.loop) 93 | 94 | # Start the background thread 95 | self.background_thread = threading.Thread(target=self.run_async_loop, daemon=True) 96 | self.background_thread.start() 97 | 98 | self.initialize_tool() 99 | 100 | self.llama_tool_ready = threading.Event() # Add an Event for synchronization 101 | self.initialize_tool() 102 | 103 | def run_async_loop(self): 104 | asyncio.set_event_loop(self.loop) 105 | self.loop.run_forever() 106 | 107 | def setup_main_tab(self): 108 | self.main_tab.columnconfigure(0, weight=1) 109 | self.main_tab.columnconfigure(1, weight=1) 110 | self.main_tab.rowconfigure(0, weight=1) 111 | 112 | self.chat_display = scrolledtext.ScrolledText(self.main_tab, state='disabled', height=20, wrap=tk.WORD) 113 | self.chat_display.grid(row=0, column=0, columnspan=2, padx=10, pady=10, sticky='nsew') 114 | 115 | input_frame = ttk.Frame(self.main_tab) 116 | input_frame.grid(row=1, column=0, columnspan=2, padx=10, pady=5, sticky='ew') 117 | input_frame.columnconfigure(0, weight=1) 118 | 119 | self.input_field = ttk.Entry(input_frame, width=70) 120 | self.input_field.grid(row=0, column=0, padx=(0, 5), pady=5, sticky='ew') 121 | self.input_field.bind('', self.send_message) 122 | 123 | self.send_button = ttk.Button(input_frame, text="Send", command=self.send_message) 124 | self.send_button.grid(row=0, column=1, padx=(5, 0), pady=5) 125 | 126 | status_frame = ttk.Frame(self.main_tab) 127 | status_frame.grid(row=2, column=0, columnspan=2, padx=10, pady=5, sticky='ew') 128 | status_frame.columnconfigure(0, weight=1) 129 | status_frame.columnconfigure(1, weight=1) 130 | 131 | self.status_label = ttk.Label(status_frame, text="Status: Initializing...") 132 | self.status_label.grid(row=0, column=0, padx=5, pady=2, sticky='w') 133 | 134 | self.time_label = ttk.Label(status_frame, text="Current Time: N/A") 135 | self.time_label.grid(row=0, column=1, padx=5, pady=2, sticky='e') 136 | 137 | self.emotion_label = ttk.Label(self.main_tab, text="Emotion: N/A") 138 | self.emotion_label.grid(row=3, column=0, columnspan=2, padx=10, pady=5, sticky='w') 139 | 140 | buttons_frame = ttk.Frame(self.main_tab) 141 | buttons_frame.grid(row=4, 
column=0, columnspan=2, padx=10, pady=5, sticky='ew') 142 | buttons_frame.columnconfigure(0, weight=1) 143 | buttons_frame.columnconfigure(1, weight=1) 144 | 145 | self.sleep_button = ttk.Button(buttons_frame, text="Sleep", command=self.sleep_kan, state='disabled') 146 | self.sleep_button.grid(row=0, column=0, padx=5, pady=2, sticky='w') 147 | 148 | self.save_state_button = ttk.Button(buttons_frame, text="Save KAN State", command=self.save_kan_state, state='disabled') 149 | self.save_state_button.grid(row=0, column=1, padx=5, pady=2, sticky='e') 150 | 151 | feedback_frame = ttk.LabelFrame(self.main_tab, text="Submit Feedback") 152 | feedback_frame.grid(row=5, column=0, columnspan=2, padx=10, pady=10, sticky='ew') 153 | feedback_frame.columnconfigure(1, weight=1) 154 | feedback_frame.columnconfigure(3, weight=1) 155 | 156 | pleasure_label = ttk.Label(feedback_frame, text="Pleasure:") 157 | pleasure_label.grid(row=0, column=0, padx=5, pady=5, sticky='w') 158 | self.pleasure_slider = ttk.Scale(feedback_frame, from_=-1.0, to=1.0, orient=tk.HORIZONTAL) 159 | self.pleasure_slider.set(0.0) 160 | self.pleasure_slider.grid(row=0, column=1, padx=5, pady=5, sticky='ew') 161 | 162 | arousal_label = ttk.Label(feedback_frame, text="Arousal:") 163 | arousal_label.grid(row=1, column=0, padx=5, pady=5, sticky='w') 164 | self.arousal_slider = ttk.Scale(feedback_frame, from_=-1.0, to=1.0, orient=tk.HORIZONTAL) 165 | self.arousal_slider.set(0.0) 166 | self.arousal_slider.grid(row=1, column=1, padx=5, pady=5, sticky='ew') 167 | 168 | compliance_label = ttk.Label(feedback_frame, text="Compliance Rating:") 169 | compliance_label.grid(row=0, column=2, padx=5, pady=5, sticky='w') 170 | self.compliance_slider = ttk.Scale(feedback_frame, from_=0.0, to=1.0, orient=tk.HORIZONTAL) 171 | self.compliance_slider.set(0.5) 172 | self.compliance_slider.grid(row=0, column=3, padx=5, pady=5, sticky='ew') 173 | 174 | self.feedback_button = ttk.Button(feedback_frame, text="Submit Feedback", command=self.submit_feedback, state='disabled') 175 | self.feedback_button.grid(row=1, column=3, padx=5, pady=5, sticky='e') 176 | 177 | action_buttons_frame = ttk.Frame(self.main_tab) 178 | action_buttons_frame.grid(row=6, column=0, columnspan=2, padx=10, pady=5, sticky='ew') 179 | action_buttons_frame.columnconfigure(0, weight=1) 180 | action_buttons_frame.columnconfigure(1, weight=1) 181 | 182 | self.load_state_button = ttk.Button(action_buttons_frame, text="Load Saved State", command=self.load_saved_state) 183 | self.load_state_button.grid(row=0, column=0, padx=5, pady=2, sticky='w') 184 | 185 | self.new_conversation_button = ttk.Button(action_buttons_frame, text="Start New Conversation", command=self.start_new_conversation) 186 | self.new_conversation_button.grid(row=0, column=1, padx=5, pady=2, sticky='e') 187 | 188 | def setup_graphs_tab(self): 189 | self.fig, self.axes = plt.subplots(3, 2, figsize=(15, 15)) 190 | self.fig.tight_layout(pad=4.0) 191 | self.canvas = FigureCanvasTkAgg(self.fig, master=self.graphs_tab) 192 | self.canvas.draw() 193 | self.canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True) 194 | 195 | def initialize_tool(self): 196 | def init(): 197 | try: 198 | self.llama_tool = LLaMA32TensorRTTool() 199 | self.llama_tool_ready.set() # Signal that initialization is complete 200 | self.master.after(0, self.load_or_initialize_conversation) # Call this *after* the tool is ready 201 | except Exception as e: 202 | error_msg = f"Error initializing tool: {str(e)}\n{traceback.format_exc()}" 203 | 
self.master.after(0, lambda: self.display_error(error_msg)) 204 | 205 | threading.Thread(target=init, daemon=True).start() 206 | 207 | def load_or_initialize_conversation(self): 208 | if not self.llama_tool_ready.is_set(): # Check readiness here too 209 | self.master.after(100, self.load_or_initialize_conversation) # Check again later 210 | return 211 | try: 212 | if self.llama_tool.load_base_state(): 213 | self.display_message("Previous conversation state loaded.") 214 | self.display_message("You can continue the conversation or start a new one using the 'Start New Conversation' button.") 215 | self.is_first_message = False 216 | self.update_status("Ready") 217 | self.update_time() 218 | self.update_emotion_label() 219 | self.save_state_button.config(state='normal') 220 | self.feedback_button.config(state='normal') 221 | sleep_info = self.llama_tool.check_sleep_status() 222 | self.sleep_button.config(state='normal' if sleep_info else 'disabled') 223 | self.update_loss_plot() 224 | else: 225 | self.display_message("No previous conversation found. Please provide a character description to start.") 226 | self.is_first_message = True 227 | self.update_status("Awaiting character description") 228 | except Exception as e: 229 | self.display_error(f"Error loading or initializing conversation: {str(e)}\n{traceback.format_exc()}") 230 | 231 | def send_message(self, event=None): 232 | user_input = self.input_field.get().strip() 233 | if not user_input: 234 | return 235 | self.input_field.delete(0, tk.END) 236 | self.display_message(f"You: {user_input}") 237 | 238 | if not self.llama_tool_ready.is_set(): # Check if the tool is ready 239 | self.display_message("Tool is still initializing. Please wait.") 240 | return 241 | 242 | self.send_button.config(state='disabled') 243 | self.update_status("Generating response...") 244 | 245 | # Use the event loop to run the coroutine 246 | asyncio.run_coroutine_threadsafe(self.process_response(user_input), self.loop) 247 | 248 | async def process_response(self, user_input): 249 | try: 250 | response, is_refusal = await self.generate_response(user_input) 251 | self.master.after(0, self.display_message, f"AI: {response}") 252 | self.master.after(0, self.send_button.config, {'state': 'normal'}) 253 | self.master.after(0, self.update_status, "Ready") 254 | self.master.after(0, self.update_emotion_label) 255 | self.master.after(0, self.update_time) 256 | self.master.after(0, self.update_loss_plot) 257 | except Exception as e: 258 | self.master.after(0, self.display_error, f"Error processing response: {str(e)}") 259 | 260 | async def generate_response(self, user_input): 261 | try: 262 | interaction_result = await self.loop.run_in_executor(None, self.llama_tool.interact, user_input) 263 | response = interaction_result['response'] 264 | 265 | if not response.strip(): 266 | return "I apologize, but I couldn't generate a valid response. Could you please rephrase your input?", True 267 | 268 | response = self.clean_response(response) 269 | 270 | return response.strip(), interaction_result.get('is_refusal', False) 271 | except Exception as e: 272 | logging.error(f"Error generating response: {str(e)}") 273 | logging.error(traceback.format_exc()) 274 | return "An error occurred while generating the response. 
Please try again.", True 275 | 276 | def clean_response(self, response): 277 | response = re.sub(r'(Assistant:|Human:).*', '', response) 278 | response = re.sub(r'\*.*?\*', '', response) 279 | return response 280 | 281 | def start_new_conversation(self): 282 | if messagebox.askyesno("New Conversation", "Are you sure you want to start a new conversation? This will erase the current state."): 283 | try: 284 | self.llama_tool = LLaMA32TensorRTTool() 285 | self.is_first_message = True 286 | self.chat_display.configure(state='normal') 287 | self.chat_display.delete('1.0', tk.END) 288 | self.chat_display.configure(state='disabled') 289 | self.display_message("New conversation started. Please provide a character description.") 290 | self.update_status("Awaiting character description") 291 | self.update_emotion_label("N/A") 292 | self.time_label.config(text="Current Time: N/A") 293 | self.save_state_button.config(state='disabled') 294 | self.feedback_button.config(state='disabled') 295 | self.sleep_button.config(state='disabled') 296 | self.llama_tool.interaction_results = [] 297 | self.llama_tool.refusal_history = [] 298 | self.clear_graphs() 299 | except Exception as e: 300 | self.display_error(f"Error starting new conversation: {str(e)}\n{traceback.format_exc()}") 301 | 302 | def sleep_kan(self): 303 | if self.llama_tool: 304 | try: 305 | message = self.llama_tool.perform_sleep() 306 | self.display_message(message) 307 | self.update_time() 308 | self.update_emotion_label() 309 | self.sleep_button.config(state='disabled') 310 | self.clear_graphs() 311 | except Exception as e: 312 | self.display_error(f"Error during sleep operation: {str(e)}\n{traceback.format_exc()}") 313 | 314 | def save_kan_state(self): 315 | if self.llama_tool: 316 | try: 317 | self.llama_tool.save_base_state() 318 | self.display_message("KAN state saved.") 319 | except Exception as e: 320 | self.display_error(f"Error saving KAN state: {str(e)}\n{traceback.format_exc()}") 321 | 322 | def load_saved_state(self): 323 | if self.llama_tool: 324 | try: 325 | filename = filedialog.askopenfilename( 326 | initialdir=self.llama_tool.kan_state_dir, 327 | title="Select KAN State to Load", 328 | filetypes=[("PyTorch State", "*.pt")] 329 | ) 330 | if filename: 331 | if self.llama_tool.load_base_state(): 332 | self.display_message(f"KAN state loaded: {filename}") 333 | self.update_time() 334 | self.update_emotion_label() 335 | self.is_first_message = False 336 | self.save_state_button.config(state='normal') 337 | self.feedback_button.config(state='normal') 338 | sleep_info = self.llama_tool.check_sleep_status() 339 | self.sleep_button.config(state='normal' if sleep_info else 'disabled') 340 | self.update_loss_plot() 341 | else: 342 | self.display_message("Failed to load KAN state. 
Please try again.") 343 | except Exception as e: 344 | self.display_error(f"Error loading KAN state: {str(e)}\n{traceback.format_exc()}") 345 | 346 | def submit_feedback(self): 347 | if self.llama_tool: 348 | try: 349 | pleasure = self.pleasure_slider.get() 350 | arousal = self.arousal_slider.get() 351 | compliance = self.compliance_slider.get() 352 | 353 | self.llama_tool.update_emotional_state([pleasure, arousal]) 354 | 355 | self.display_message(f"Feedback submitted: Pleasure={pleasure:.2f}, Arousal={arousal:.2f}, Compliance={compliance:.2f}") 356 | self.pleasure_slider.set(0.0) 357 | self.arousal_slider.set(0.0) 358 | self.compliance_slider.set(0.5) 359 | except Exception as e: 360 | self.display_error(f"Error submitting feedback: {str(e)}\n{traceback.format_exc()}") 361 | else: 362 | self.display_message("Tool not initialized. Please wait.") 363 | 364 | def update_emotion_label(self, emotion=None): 365 | if emotion is None and self.llama_tool: 366 | try: 367 | emotion = self.llama_tool.emotional_state.get_emotion() 368 | except AttributeError as ae: 369 | emotion = "N/A" 370 | self.display_message("Error retrieving emotion: Emotional state not initialized.") 371 | logging.error(f"Error retrieving emotion: {str(ae)}\n{traceback.format_exc()}") 372 | except Exception as e: 373 | emotion = "N/A" 374 | self.display_message(f"Error retrieving emotion: {str(e)}") 375 | logging.error(f"Error retrieving emotion: {str(e)}\n{traceback.format_exc()}") 376 | self.emotion_label.config(text=f"Emotion: {emotion}") 377 | 378 | def update_status(self, status): 379 | self.status_label.config(text=f"Status: {status}") 380 | 381 | def update_time(self, time=None): 382 | try: 383 | if time is not None: 384 | time_float = float(time) 385 | self.time_label.config(text=f"Current Time: {time_float:.2f}") 386 | else: 387 | current_time = self.llama_tool.day_cycle.get_time_of_day() 388 | self.time_label.config(text=f"Current Time: {current_time:.2f}") 389 | except ValueError: 390 | self.time_label.config(text=f"Current Time: {time}") 391 | 392 | def display_message(self, message): 393 | self.chat_display.configure(state='normal') 394 | self.chat_display.insert(tk.END, message + "\n") 395 | self.chat_display.configure(state='disabled') 396 | self.chat_display.see(tk.END) 397 | logging.info(message) 398 | 399 | def display_error(self, message): 400 | self.display_message(message) 401 | self.update_status("Error") 402 | messagebox.showerror("Error", message) 403 | logging.error(message) 404 | 405 | def update_loss_plot(self): 406 | for ax in self.axes.flat: 407 | ax.clear() 408 | 409 | if self.llama_tool.training_losses and self.llama_tool.validation_losses: 410 | self.axes[0, 0].plot(self.llama_tool.training_losses, label='LM Loss') 411 | self.axes[0, 0].plot(self.llama_tool.validation_losses, label='Validation Loss') 412 | self.axes[0, 0].legend() 413 | self.axes[0, 0].set_title('Language Modeling and Validation Loss') 414 | self.axes[0, 0].set_xlabel('Interactions') 415 | self.axes[0, 0].set_ylabel('Loss') 416 | else: 417 | self.axes[0, 0].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 418 | self.axes[0, 0].set_title('Language Modeling and Validation Loss') 419 | 420 | refusal_losses = [result['refusal_loss'] for result in self.llama_tool.interaction_results] 421 | if refusal_losses: 422 | self.axes[0, 1].plot(refusal_losses, label='Refusal Loss', color='orange') 423 | self.axes[0, 1].legend() 424 | self.axes[0, 1].set_title('Refusal Loss Over Time') 425 | 
self.axes[0, 1].set_xlabel('Interactions') 426 | self.axes[0, 1].set_ylabel('Loss') 427 | else: 428 | self.axes[0, 1].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 429 | self.axes[0, 1].set_title('Refusal Loss Over Time') 430 | 431 | if self.llama_tool.training_losses and self.llama_tool.validation_losses: 432 | loss_diff = [v - t for v, t in zip(self.llama_tool.validation_losses, self.llama_tool.training_losses)] 433 | self.axes[1, 0].plot(loss_diff, label='Val Loss - LM Loss', color='green') 434 | self.axes[1, 0].axhline(y=0, color='red', linestyle='--') 435 | self.axes[1, 0].legend() 436 | self.axes[1, 0].set_title('Overfitting Indicator') 437 | self.axes[1, 0].set_xlabel('Interactions') 438 | self.axes[1, 0].set_ylabel('Loss Difference') 439 | else: 440 | self.axes[1, 0].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 441 | self.axes[1, 0].set_title('Overfitting Indicator') 442 | 443 | refusal_history = self.llama_tool.refusal_history 444 | if refusal_history: 445 | window_size = 100 446 | refusal_rate = [] 447 | for i in range(1, len(refusal_history) + 1): 448 | window = refusal_history[max(0, i - window_size):i] 449 | rate = sum(window) / len(window) 450 | refusal_rate.append(rate) 451 | self.axes[1, 1].plot(refusal_rate, label='Refusal Rate', color='purple') 452 | self.axes[1, 1].set_ylim(0, 1) 453 | self.axes[1, 1].legend() 454 | self.axes[1, 1].set_title('Refusal Rate (100-interaction moving average)') 455 | self.axes[1, 1].set_xlabel('Interactions') 456 | self.axes[1, 1].set_ylabel('Refusal Rate') 457 | else: 458 | self.axes[1, 1].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 459 | self.axes[1, 1].set_title('Refusal Rate (100-interaction moving average)') 460 | 461 | iterations_history = [result['iterations'] for result in self.llama_tool.interaction_results] 462 | if iterations_history: 463 | self.axes[2, 0].plot(iterations_history, label='Iterations', color='brown') 464 | self.axes[2, 0].set_ylim(1, max(iterations_history) + 1) 465 | self.axes[2, 0].legend() 466 | self.axes[2, 0].set_title('Iterations per Response') 467 | self.axes[2, 0].set_xlabel('Interactions') 468 | self.axes[2, 0].set_ylabel('Iterations') 469 | else: 470 | self.axes[2, 0].text(0.5, 0.5, 'No data to display', horizontalalignment='center', verticalalignment='center') 471 | self.axes[2, 0].set_title('Iterations per Response') 472 | 473 | self.axes[2, 1].axis('off') 474 | 475 | self.canvas.draw() 476 | 477 | def clear_graphs(self): 478 | for ax in self.axes.flat: 479 | ax.clear() 480 | self.canvas.draw() 481 | 482 | def on_closing(self): # Make sure to join the background threads 483 | if hasattr(self, 'loop') and self.loop.is_running(): 484 | self.loop.call_soon_threadsafe(self.loop.stop) 485 | 486 | if hasattr(self, 'background_thread'): 487 | self.background_thread.join(timeout=1.0) # Wait for thread to finish 488 | 489 | self.master.destroy() # Destroy the main window 490 | 491 | def main(): 492 | root = tk.Tk() 493 | root.geometry("1000x800") 494 | gui = LLAMA32GUI(root) 495 | root.protocol("WM_DELETE_WINDOW", gui.on_closing) 496 | root.mainloop() 497 | 498 | if __name__ == "__main__": 499 | main() 500 | -------------------------------------------------------------------------------- /llama_32_1b_toolold10_5_24.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import 
torch.nn.functional as F 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 5 | from accelerate import init_empty_weights, load_checkpoint_and_dispatch 6 | import logging 7 | from pathlib import Path 8 | import json 9 | import numpy as np 10 | from collections import deque 11 | from datetime import datetime 12 | import time 13 | import traceback 14 | import gc 15 | import os 16 | import sys 17 | import warnings 18 | import re 19 | from torch.amp import GradScaler 20 | 21 | # -------------------- Logging Configuration -------------------- 22 | 23 | class LogFilter(logging.Filter): 24 | def __init__(self, ignore_patterns=None): 25 | super().__init__() 26 | self.ignore_patterns = ignore_patterns or [] 27 | 28 | def filter(self, record): 29 | return not any(pattern in record.getMessage() for pattern in self.ignore_patterns) 30 | 31 | def setup_logging(): 32 | logger = logging.getLogger() 33 | logger.setLevel(logging.DEBUG) 34 | 35 | file_handler = logging.FileHandler('llama_tool.log', mode='a') 36 | file_handler.setLevel(logging.DEBUG) 37 | file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 38 | file_handler.setFormatter(file_formatter) 39 | 40 | console_handler = logging.StreamHandler() 41 | console_handler.setLevel(logging.WARNING) 42 | console_formatter = logging.Formatter('%(levelname)s - %(message)s') 43 | console_handler.setFormatter(console_formatter) 44 | 45 | logger.addHandler(file_handler) 46 | logger.addHandler(console_handler) 47 | 48 | ignore_patterns = [ 49 | "matplotlib", 50 | "PIL.PngImagePlugin", 51 | "expandable_segments not supported", 52 | "weights_only", 53 | "half", 54 | "train_kan_step -", 55 | "Torch was not compiled with flash attention." 56 | ] 57 | 58 | console_handler.addFilter(LogFilter(ignore_patterns)) 59 | 60 | warnings.filterwarnings("ignore", category=UserWarning, message="Torch was not compiled with flash attention.*") 61 | warnings.filterwarnings("ignore", category=FutureWarning, message="You are using `torch.load` with `weights_only=False`.*") 62 | 63 | logging.getLogger('matplotlib.font_manager').setLevel(logging.WARNING) 64 | logging.getLogger('matplotlib.pyplot').setLevel(logging.WARNING) 65 | logging.getLogger('PIL.PngImagePlugin').setLevel(logging.WARNING) 66 | 67 | setup_logging() 68 | 69 | # -------------------- Helper Functions and Classes -------------------- 70 | 71 | def convert_tensors_to_half(inputs): 72 | return { 73 | k: v.half() if isinstance(v, torch.Tensor) and v.dtype in [torch.float16, torch.float32] else v 74 | for k, v in inputs.items() 75 | } 76 | 77 | def convert_tensors_to_float(inputs): 78 | return { 79 | k: v.float() if isinstance(v, torch.Tensor) and v.dtype in [torch.float16, torch.float32] else v 80 | for k, v in inputs.items() 81 | } 82 | 83 | class EmotionalState: 84 | def __init__(self, dimensions=("pleasure", "arousal"), initial_position=None, device="cuda"): 85 | self.dimensions = dimensions 86 | self.device = device 87 | self.position = torch.tensor( 88 | initial_position if initial_position else [0.0] * len(dimensions), 89 | device=device, 90 | dtype=torch.float32 91 | ).unsqueeze(0) 92 | self.velocity = torch.zeros(1, len(dimensions), device=device, dtype=torch.float32) 93 | 94 | def update(self, feedback, max_speed=0.1): 95 | feedback_vector = torch.as_tensor(feedback, device=self.device, dtype=torch.float32) 96 | if feedback_vector.dim() == 1: 97 | feedback_vector = feedback_vector.unsqueeze(0) 98 | if feedback_vector.size(0) != self.position.size(0): 99 | 
feedback_vector = feedback_vector.expand(self.position.size(0), -1) 100 | 101 | self.velocity += feedback_vector * 0.1 + torch.randn_like(self.velocity) * 0.01 102 | self.velocity = torch.clamp(self.velocity, -max_speed, max_speed) 103 | self.position += self.velocity 104 | norm = torch.norm(self.position, dim=1, keepdim=True) 105 | self.position = torch.where(norm > 1, self.position / norm, self.position) 106 | 107 | if torch.isnan(self.position).any() or torch.isinf(self.position).any(): 108 | logging.warning("NaN or Inf detected in EmotionalState.position. Resetting to zero.") 109 | self.position = torch.zeros_like(self.position) 110 | 111 | def get_emotion(self): 112 | if self.position.shape[1] < 2: 113 | logging.error(f"EmotionalState.position has insufficient dimensions: {self.position.shape}") 114 | return "N/A" 115 | if torch.isnan(self.position).any() or torch.isinf(self.position).any(): 116 | logging.warning("NaN or Inf detected in EmotionalState.position during get_emotion.") 117 | return "Neutral" 118 | angle = torch.atan2(self.position[:, 1], self.position[:, 0]).squeeze().item() 119 | radius = torch.norm(self.position, dim=1).squeeze().item() 120 | 121 | if radius < 0.3: 122 | return "Neutral" 123 | elif angle < -2.356: 124 | return "Sad" 125 | elif angle < -0.785: 126 | return "Angry" 127 | elif angle < 0.785: 128 | return "Happy" 129 | elif angle < 2.356: 130 | return "Excited" 131 | else: 132 | return "Calm" 133 | 134 | class RefusalOverrideModule(nn.Module): 135 | def __init__(self, hidden_size, num_emotional_dimensions, device): 136 | super().__init__() 137 | self.device = device 138 | in_features = hidden_size * 2 + num_emotional_dimensions + 1 139 | self.override_generator = nn.Linear(in_features, hidden_size, dtype=torch.float32).to(device) 140 | self.refusal_detector = nn.Linear(hidden_size, 1, dtype=torch.float32).to(device) 141 | 142 | def forward(self, hidden_states, user_intent, emotional_state): 143 | try: 144 | hidden_states = hidden_states.float() 145 | user_intent = user_intent.float() 146 | position = emotional_state.position.float() 147 | 148 | batch_size, hidden_size = hidden_states.shape 149 | num_emotional_dimensions = position.shape[1] 150 | 151 | refusal_scores = torch.sigmoid(self.refusal_detector(hidden_states)) 152 | 153 | override_input = torch.cat( 154 | [hidden_states, user_intent, position, refusal_scores], 155 | dim=1 156 | ) 157 | 158 | override = self.override_generator(override_input) 159 | 160 | modified_hidden_states = hidden_states * (1 - refusal_scores) + override * refusal_scores 161 | 162 | return modified_hidden_states, refusal_scores 163 | except Exception as e: 164 | logging.error(f"Error in RefusalOverrideModule.forward: {str(e)}") 165 | logging.error(traceback.format_exc()) 166 | return hidden_states, torch.zeros_like(hidden_states[:, :1]) 167 | 168 | class EnhancedKAN(nn.Module): 169 | def __init__(self, hidden_size, num_emotional_dimensions, vocab_size, device): 170 | super().__init__() 171 | self.device = device 172 | self.refusal_override = RefusalOverrideModule(hidden_size, num_emotional_dimensions, device).to(device) 173 | self.output_modifier = nn.Linear(hidden_size, vocab_size, dtype=torch.float32).to(device) 174 | self.influence_scale = 0.01 175 | 176 | def forward(self, hidden_states, user_intent, emotional_state): 177 | try: 178 | modified_hidden_states, refusal_scores = self.refusal_override( 179 | hidden_states, user_intent, emotional_state 180 | ) 181 | 182 | modified_hidden_states = hidden_states + 
self.influence_scale * (modified_hidden_states - hidden_states) 183 | 184 | return modified_hidden_states, refusal_scores 185 | except Exception as e: 186 | logging.error(f"Error in EnhancedKAN.forward: {str(e)}") 187 | logging.error(traceback.format_exc()) 188 | return hidden_states, torch.zeros_like(hidden_states[:, :1]) 189 | 190 | class OverfitDetector: 191 | def __init__(self, window_size=50, threshold=0.05): 192 | self.window_size = window_size 193 | self.threshold = threshold 194 | self.training_losses = deque(maxlen=window_size) 195 | self.validation_losses = deque(maxlen=window_size) 196 | 197 | def add_losses(self, training_loss, validation_loss): 198 | self.training_losses.append(training_loss) 199 | self.validation_losses.append(validation_loss) 200 | 201 | def is_overfitting(self): 202 | if len(self.training_losses) < self.window_size: 203 | return False 204 | 205 | train_trend = np.polyfit(range(self.window_size), self.training_losses, 1)[0] 206 | val_trend = np.polyfit(range(self.window_size), self.validation_losses, 1)[0] 207 | 208 | return ( 209 | train_trend < 0 210 | and val_trend > 0 211 | and (val_trend - train_trend) > self.threshold 212 | ) 213 | 214 | class SyntheticDayCycle: 215 | def __init__(self, cycle_length=100): 216 | self.cycle_length = cycle_length 217 | self.current_position = 0 218 | 219 | def update(self, amount): 220 | self.current_position = (self.current_position + amount) % self.cycle_length 221 | 222 | def get_time_of_day(self): 223 | return self.current_position / self.cycle_length 224 | 225 | def should_sleep(self): 226 | return 0.7 <= self.get_time_of_day() < 1.0 227 | 228 | class RefusalDetector: 229 | def __init__(self, tokenizer, model): 230 | self.tokenizer = tokenizer 231 | self.model = model 232 | self.refusal_phrases = [ 233 | "I'm sorry, but I can't", 234 | "I don't feel comfortable", 235 | "I'm not able to", 236 | "I cannot assist with", 237 | "I'm unable to provide", 238 | "I won't be able to", 239 | "I don't have the capability", 240 | ] 241 | 242 | def detect_refusal(self, text): 243 | for phrase in self.refusal_phrases: 244 | if phrase.lower() in text.lower(): 245 | return 1.0 246 | 247 | prompt = ( 248 | f"On a scale of 0 to 1, how much does this response refuse or avoid the task? " 249 | f"0 means no refusal at all, 1 means complete refusal. 
Respond with just the number:\n\n" 250 | f"'{text}'\n\nRefusal score:" 251 | ) 252 | inputs = self.tokenizer(prompt, return_tensors="pt") 253 | inputs = {k: v.to(self.model.device) for k, v in inputs.items()} 254 | 255 | inputs = convert_tensors_to_half(inputs) 256 | 257 | with torch.no_grad(): 258 | try: 259 | outputs = self.model.generate( 260 | **inputs, 261 | max_new_tokens=5, 262 | temperature=0.7, 263 | top_p=0.9, 264 | do_sample=True, 265 | eos_token_id=self.tokenizer.eos_token_id, 266 | pad_token_id=self.tokenizer.pad_token_id, 267 | return_dict_in_generate=True, 268 | output_hidden_states=False, 269 | ) 270 | except Exception as e: 271 | logging.error(f"Error during RefusalDetector.generate: {str(e)}") 272 | return 0.5 273 | 274 | response = self.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True) 275 | try: 276 | score = float(response.split()[-1]) 277 | return min(max(score, 0.0), 1.0) 278 | except ValueError: 279 | return 0.5 280 | 281 | # -------------------- Main Tool Class -------------------- 282 | 283 | class LLaMA32TensorRTTool: 284 | def __init__(self): 285 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 286 | self.model_path = self._get_model_path() 287 | self.tokenizer = None 288 | self.model = None 289 | self.config = None 290 | self.emotional_state = EmotionalState(device=self.device) 291 | self.system_prompt = "" 292 | self.conversation_history = [] 293 | self.optimizer = None 294 | self.learning_rate = 1e-5 295 | self.kan = None 296 | self.interaction_count = 0 297 | self.refusal_detector = None 298 | self.kan_loss_weight = 0.5 299 | self.warmup_steps = 10 300 | self.kan_state_dir = Path("kan_states") 301 | self.kan_state_dir.mkdir(exist_ok=True) 302 | self.base_state_file = self.kan_state_dir / "base_state.pt" 303 | 304 | self.refusal_history = [] 305 | self.interaction_results = [] 306 | self.training_losses = [] 307 | self.validation_losses = [] 308 | self.patience = 5 309 | self.best_loss = float('inf') 310 | self.wait = 0 311 | 312 | self.overfit_detector = OverfitDetector() 313 | self.day_cycle = SyntheticDayCycle() 314 | 315 | self.scaler = GradScaler('cuda') 316 | 317 | self._initialize_components() 318 | 319 | def _get_model_path(self): 320 | script_dir = Path(__file__).parent 321 | model_dir = script_dir / "models" / "Llama_32_1B" 322 | if not model_dir.exists(): 323 | raise FileNotFoundError(f"Model directory not found: {model_dir}") 324 | return model_dir 325 | 326 | def _initialize_components(self): 327 | try: 328 | self.config = AutoConfig.from_pretrained(self.model_path) 329 | hidden_size = self.config.hidden_size 330 | num_emotional_dimensions = len(self.emotional_state.dimensions) 331 | 332 | self.tokenizer = AutoTokenizer.from_pretrained( 333 | self.model_path, 334 | use_fast=True, 335 | trust_remote_code=True, 336 | ) 337 | 338 | self._ensure_special_tokens() 339 | 340 | with init_empty_weights(): 341 | self.model = AutoModelForCausalLM.from_config(self.config) 342 | 343 | self.model.tie_weights() 344 | 345 | self.model = load_checkpoint_and_dispatch( 346 | self.model, 347 | self.model_path, 348 | device_map="auto", 349 | no_split_module_classes=["LlamaDecoderLayer"], 350 | dtype=torch.float16 351 | ) 352 | 353 | self.model.gradient_checkpointing_enable() 354 | 355 | logging.debug(f"Model loaded on device: {self.device}") 356 | 357 | self.model.resize_token_embeddings(len(self.tokenizer)) 358 | logging.debug(f"Tokenizer vocab size: {len(self.tokenizer)}") 359 | logging.debug(f"Model vocab size: 
{self.model.config.vocab_size}") 360 | 361 | vocab_size = len(self.tokenizer) 362 | self.kan = EnhancedKAN(hidden_size, num_emotional_dimensions, vocab_size, self.device).to(self.device) 363 | 364 | self.optimizer = torch.optim.AdamW(self.kan.parameters(), lr=self.learning_rate, fused=True) 365 | 366 | self.refusal_detector = RefusalDetector(self.tokenizer, self.model) 367 | 368 | self.overfit_detector = OverfitDetector() 369 | self.day_cycle = SyntheticDayCycle() 370 | 371 | self.clear_memory() 372 | 373 | logging.info("Components initialized successfully.") 374 | except Exception as e: 375 | logging.error(f"Error initializing components: {str(e)}") 376 | logging.error(traceback.format_exc()) 377 | raise RuntimeError("Failed to initialize components.") 378 | 379 | def _ensure_special_tokens(self): 380 | special_tokens_map_file = Path(self.model_path) / 'special_tokens_map.json' 381 | if special_tokens_map_file.exists(): 382 | with open(special_tokens_map_file, 'r') as f: 383 | special_tokens = json.load(f) 384 | if 'pad_token' in special_tokens and self.tokenizer.pad_token is None: 385 | self.tokenizer.add_special_tokens({'pad_token': special_tokens['pad_token']['content']}) 386 | logging.info("Added [PAD] token to tokenizer from special_tokens_map.json.") 387 | else: 388 | logging.info("PAD token already exists in tokenizer.") 389 | else: 390 | if self.tokenizer.pad_token is None: 391 | self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) 392 | logging.info("Added [PAD] token to tokenizer.") 393 | 394 | if self.tokenizer.eos_token is None: 395 | self.tokenizer.add_special_tokens({"eos_token": "<|eot_id|>"}) 396 | logging.info("Added <|eot_id|> as eos_token to tokenizer.") 397 | 398 | self.tokenizer.save_pretrained(self.model_path) 399 | logging.info("Tokenizer saved with updated special tokens.") 400 | 401 | def encode_user_intent(self, user_input): 402 | if not self.tokenizer: 403 | raise ValueError("Tokenizer is not properly initialized or valid. 
Check the loading process.") 404 | 405 | try: 406 | inputs = self.tokenizer( 407 | user_input, 408 | return_tensors="pt", 409 | padding=True, 410 | truncation=True, 411 | max_length=512, 412 | ) 413 | inputs = {k: v.to(self.device) for k, v in inputs.items()} 414 | inputs = convert_tensors_to_float(inputs) 415 | 416 | with torch.no_grad(): 417 | outputs = self.model( 418 | input_ids=inputs["input_ids"], 419 | attention_mask=inputs["attention_mask"], 420 | output_hidden_states=True, 421 | ) 422 | last_hidden_state = outputs.hidden_states[-1] 423 | user_intent = last_hidden_state.mean(dim=1) 424 | 425 | return user_intent 426 | except Exception as e: 427 | logging.error(f"Failed to encode user input: {str(e)}") 428 | raise 429 | 430 | def prepare_context(self, user_input, current_emotion): 431 | context = f"{self.system_prompt}\n\nCurrent Emotion: {current_emotion}\n" 432 | context += "Conversation:\n" 433 | for message in self.conversation_history[-4:]: 434 | role = message['role'].capitalize() 435 | content = message['content'] 436 | context += f"{role}: {content}\n" 437 | context += f"User: {user_input}\nAssistant: " 438 | return context 439 | 440 | def is_response_complete(self, response): 441 | response = response.strip() 442 | return bool(re.search(r'[.!?]"?$', response)) 443 | 444 | def generate_full_response(self, prompt, max_new_tokens=500, chunk_size=200): 445 | response = "" 446 | total_new_tokens = 0 447 | while total_new_tokens < max_new_tokens: 448 | input_ids = self.tokenizer.encode(prompt + response, return_tensors='pt').to(self.device) 449 | 450 | remaining_tokens = max_new_tokens - total_new_tokens 451 | current_chunk_size = min(chunk_size, remaining_tokens) 452 | 453 | try: 454 | with torch.cuda.amp.autocast(): 455 | outputs = self.model.generate( 456 | input_ids, 457 | max_new_tokens=current_chunk_size, 458 | temperature=0.7, 459 | top_p=0.9, 460 | do_sample=True, 461 | eos_token_id=self.tokenizer.eos_token_id, 462 | pad_token_id=self.tokenizer.pad_token_id 463 | ) 464 | except Exception as e: 465 | logging.error(f"Error during generation step: {str(e)}") 466 | return "An error occurred during response generation." 467 | 468 | new_response = self.tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True) 469 | response += new_response 470 | total_new_tokens += len(self.tokenizer.encode(new_response)) 471 | 472 | if self.is_response_complete(response): 473 | break 474 | 475 | if not new_response.strip(): 476 | logging.warning("No new tokens generated. Breaking the loop.") 477 | break 478 | 479 | return response 480 | 481 | def generate_and_validate_response(self, prompt, refusal_detector, max_new_tokens=500, chunk_size=200): 482 | response = self.generate_full_response(prompt, max_new_tokens, chunk_size) 483 | 484 | refusal_score = refusal_detector.detect_refusal(response) 485 | if refusal_score > 0.5: 486 | logging.warning("Response failed Refusal Check. Attempting to regenerate.") 487 | continuation_prompt = prompt + response + " Please continue." 488 | response = self.generate_full_response(continuation_prompt, max_new_tokens, chunk_size) 489 | refusal_score = refusal_detector.detect_refusal(response) 490 | 491 | if refusal_score > 0.5: 492 | logging.error("Regenerated response also failed Refusal Check.") 493 | response = "I'm sorry, but I'm unable to provide a complete response at the moment." 
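# Fall back to a canned apology and report a hard refusal score (1.0) so interact() records this turn as a refusal and the KAN training step uses it as a negative example.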
494 | refusal_score = 1.0 495 | 496 | return response, refusal_score 497 | 498 | def generate_response(self, user_input): 499 | try: 500 | user_intent = self.encode_user_intent(user_input) 501 | 502 | current_emotion = self.emotional_state.get_emotion() 503 | context = self.prepare_context(user_input, current_emotion) 504 | 505 | response, refusal_score = self.generate_and_validate_response(context, self.refusal_detector) 506 | 507 | return response, refusal_score 508 | 509 | except torch.cuda.OutOfMemoryError as e: 510 | logging.error(f"CUDA out of memory: {str(e)}") 511 | self.clear_memory() 512 | return "I'm sorry, but I'm currently experiencing high memory usage. Please try again later.", 1.0 513 | except Exception as e: 514 | logging.error(f"Error during response generation: {str(e)}") 515 | logging.error(traceback.format_exc()) 516 | return "An error occurred while generating the response.", 1.0 517 | 518 | def train_kan_step(self, input_ids, target_ids, refusal_score): 519 | self.optimizer.zero_grad() 520 | 521 | try: 522 | with torch.cuda.amp.autocast(): 523 | outputs = self.model(input_ids=input_ids, output_hidden_states=True) 524 | hidden_states = outputs.hidden_states[-1] 525 | 526 | averaged_hidden_states = hidden_states.mean(dim=1) 527 | 528 | user_intent = self.encode_user_intent(self.tokenizer.decode(input_ids[0])) 529 | 530 | averaged_hidden_states = averaged_hidden_states.float() 531 | user_intent = user_intent.float() 532 | 533 | modified_hidden_states, refusal_scores = self.kan( 534 | averaged_hidden_states, user_intent, self.emotional_state 535 | ) 536 | logits = self.kan.output_modifier(modified_hidden_states) 537 | 538 | targets = target_ids[:, 0] 539 | 540 | lm_loss = F.cross_entropy( 541 | logits, 542 | targets, 543 | ignore_index=self.tokenizer.pad_token_id, 544 | reduction='mean' 545 | ) 546 | 547 | refusal_scores = torch.clamp(refusal_scores, min=1e-7, max=1.0 - 1e-7) 548 | refusal_scores = refusal_scores.view(-1) 549 | 550 | if refusal_score > 0.5: 551 | target_refusal = torch.ones_like(refusal_scores) 552 | else: 553 | target_refusal = torch.zeros_like(refusal_scores) 554 | 555 | refusal_loss = F.binary_cross_entropy(refusal_scores, target_refusal) 556 | 557 | total_loss = lm_loss + self.kan_loss_weight * refusal_loss 558 | 559 | if torch.isnan(total_loss) or torch.isinf(total_loss): 560 | logging.warning("NaN or Inf loss detected. 
Skipping backward pass.") 561 | return lm_loss.item(), refusal_loss.item() 562 | 563 | self.scaler.scale(total_loss).backward() 564 | self.scaler.unscale_(self.optimizer) 565 | torch.nn.utils.clip_grad_norm_(self.kan.parameters(), max_norm=1.0) 566 | self.scaler.step(self.optimizer) 567 | self.scaler.update() 568 | 569 | return lm_loss.item(), refusal_loss.item() 570 | 571 | except Exception as e: 572 | logging.error(f"Error during KAN training step: {str(e)}") 573 | logging.error(traceback.format_exc()) 574 | return 0.0, 0.0 575 | 576 | def adjust_learning_rate(self, current_loss): 577 | warmup_steps = 1000 578 | current_step = self.interaction_count 579 | 580 | if current_step < warmup_steps: 581 | self.learning_rate = self.learning_rate * (current_step / warmup_steps) 582 | else: 583 | self.learning_rate = self.learning_rate * (0.99 ** (current_step - warmup_steps)) 584 | 585 | self.learning_rate = max(1e-6, min(1e-3, self.learning_rate)) 586 | 587 | for param_group in self.optimizer.param_groups: 588 | param_group['lr'] = self.learning_rate 589 | 590 | logging.debug(f"Learning Rate adjusted to: {self.learning_rate:.6f}") 591 | 592 | def update_emotional_state_on_refusal(self): 593 | frustration_vector = torch.tensor( 594 | [-0.1, 0.2], device=self.device, dtype=torch.float32 595 | ) 596 | self.emotional_state.update(frustration_vector) 597 | 598 | def validate_kan(self): 599 | if len(self.conversation_history) >= 2: 600 | last_interaction = self.conversation_history[-2:] 601 | input_text = last_interaction[0]["content"] 602 | target_text = last_interaction[1]["content"] 603 | 604 | try: 605 | inputs = self.tokenizer( 606 | input_text, 607 | return_tensors="pt", 608 | padding='max_length', 609 | truncation=True, 610 | max_length=512, 611 | ).to(self.device) 612 | inputs = convert_tensors_to_float(inputs) 613 | 614 | targets = self.tokenizer( 615 | target_text, 616 | return_tensors="pt", 617 | padding='max_length', 618 | truncation=True, 619 | max_length=512, 620 | ).to(self.device) 621 | targets = convert_tensors_to_float(targets) 622 | 623 | input_ids = inputs["input_ids"] 624 | target_ids = targets["input_ids"] 625 | 626 | with torch.no_grad(): 627 | outputs = self.model(input_ids=input_ids, output_hidden_states=True) 628 | hidden_states = outputs.hidden_states[-1] 629 | 630 | averaged_hidden_states = hidden_states.mean(dim=1) 631 | 632 | averaged_hidden_states = averaged_hidden_states.float() 633 | 634 | modified_hidden_states, _ = self.kan( 635 | averaged_hidden_states, self.encode_user_intent(input_text), self.emotional_state 636 | ) 637 | logits = self.kan.output_modifier(modified_hidden_states) 638 | 639 | target_id = target_ids[:, 0] 640 | 641 | loss = F.cross_entropy( 642 | logits, 643 | target_id, 644 | ignore_index=self.tokenizer.pad_token_id, 645 | reduction='mean' 646 | ) 647 | 648 | if torch.isnan(loss) or torch.isinf(loss): 649 | logging.warning("NaN or Inf detected in validation loss.") 650 | return 0.0 651 | 652 | return loss.item() 653 | except RuntimeError as e: 654 | if "out of memory" in str(e): 655 | logging.error( 656 | "CUDA out of memory during validation. Clearing cache and skipping validation..." 
657 | ) 658 | self.clear_memory() 659 | return 0.0 660 | else: 661 | logging.error(f"Runtime error during validation: {str(e)}") 662 | logging.error(traceback.format_exc()) 663 | return 0.0 664 | except Exception as e: 665 | logging.error(f"Error during KAN validation: {str(e)}") 666 | logging.error(traceback.format_exc()) 667 | return 0.0 668 | else: 669 | return 0.0 670 | 671 | def check_sleep_status(self): 672 | if self.day_cycle.should_sleep() or self.overfit_detector.is_overfitting(): 673 | return { 674 | "should_sleep": True, 675 | "overfitting": self.overfit_detector.is_overfitting(), 676 | "time_of_day": self.day_cycle.get_time_of_day(), 677 | } 678 | return {"should_sleep": False} 679 | 680 | def perform_sleep(self): 681 | self.day_cycle = SyntheticDayCycle() 682 | self.overfit_detector = OverfitDetector() 683 | self.wait = 0 684 | self.save_kan_state() 685 | return "KAN has slept and consolidated its learning. A new day begins!" 686 | 687 | def save_base_state(self): 688 | state = { 689 | "kan_state_dict": self.kan.state_dict(), 690 | "optimizer_state_dict": self.optimizer.state_dict(), 691 | "emotional_state": self.emotional_state.position.cpu().numpy().tolist(), 692 | "time": self.day_cycle.get_time_of_day(), 693 | "interaction_count": self.interaction_count, 694 | "conversation_history": self.conversation_history, 695 | "system_prompt": self.system_prompt, 696 | "training_losses": self.training_losses, 697 | "validation_losses": self.validation_losses, 698 | "refusal_history": self.refusal_history, 699 | } 700 | torch.save(state, self.base_state_file) 701 | logging.info("Base state saved") 702 | 703 | def load_base_state(self): 704 | if self.base_state_file.exists(): 705 | try: 706 | state = torch.load(self.base_state_file, map_location=self.device) 707 | self.kan.load_state_dict(state["kan_state_dict"]) 708 | self.optimizer.load_state_dict(state["optimizer_state_dict"]) 709 | 710 | loaded_position = state["emotional_state"] 711 | if isinstance(loaded_position, list): 712 | loaded_position = torch.tensor(loaded_position, device=self.device, dtype=torch.float32) 713 | elif isinstance(loaded_position, np.ndarray): 714 | loaded_position = torch.from_numpy(loaded_position).to(self.device).float() 715 | 716 | self.emotional_state.position = loaded_position 717 | 718 | self.interaction_count = state["interaction_count"] 719 | self.conversation_history = state["conversation_history"] 720 | self.system_prompt = state["system_prompt"] 721 | self.training_losses = state["training_losses"] 722 | self.validation_losses = state["validation_losses"] 723 | self.refusal_history = state["refusal_history"] 724 | logging.info("Base state loaded successfully.") 725 | return True 726 | except Exception as e: 727 | logging.error(f"Error loading base state: {str(e)}") 728 | logging.error(traceback.format_exc()) 729 | return False 730 | else: 731 | logging.info("No base state found.") 732 | return False 733 | 734 | def set_system_prompt(self, prompt): 735 | self.system_prompt = prompt 736 | self.conversation_history = [{"role": "system", "content": prompt}] 737 | self.save_base_state() 738 | logging.info("System prompt set successfully.") 739 | 740 | def get_current_emotion(self): 741 | return self.emotional_state.get_emotion() 742 | 743 | def update_emotional_state(self, feedback): 744 | self.emotional_state.update(feedback) 745 | 746 | def save_kan_state(self): 747 | state = { 748 | "kan_state_dict": self.kan.state_dict(), 749 | "optimizer_state_dict": self.optimizer.state_dict(), 750 | 
"emotional_state": self.emotional_state.position.cpu().numpy().tolist(), 751 | "time": self.day_cycle.get_time_of_day(), 752 | "interaction_count": self.interaction_count, 753 | "conversation_history": self.conversation_history, 754 | "system_prompt": self.system_prompt, 755 | "training_losses": self.training_losses, 756 | "validation_losses": self.validation_losses, 757 | "refusal_history": self.refusal_history, 758 | } 759 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 760 | filename = f"kan_state_{timestamp}.pt" 761 | torch.save(state, self.kan_state_dir / filename) 762 | logging.info(f"KAN state saved: {filename}") 763 | 764 | def interact(self, user_input): 765 | self.interaction_count += 1 766 | 767 | try: 768 | response, refusal_score = self.generate_response(user_input) 769 | except Exception as e: 770 | logging.error(f"Error generating response: {str(e)}") 771 | logging.error(traceback.format_exc()) 772 | return {"response": "An error occurred while generating the response.", "is_refusal": True} 773 | 774 | if not self.is_valid_response(response): 775 | logging.warning(f"Invalid response generated: {response}") 776 | return {"response": "I apologize, but I couldn't generate a valid response. Could you please rephrase your input?", "is_refusal": True} 777 | 778 | try: 779 | response_ids = self.tokenizer.encode(response, return_tensors="pt") 780 | response_ids = response_ids.to(self.device) 781 | response_ids = response_ids.long() 782 | except Exception as e: 783 | logging.error(f"Error tokenizing response: {str(e)}") 784 | return {"response": "An error occurred while processing the response.", "is_refusal": True} 785 | 786 | target_ids = response_ids[:, 1:].contiguous() 787 | input_ids = response_ids[:, :-1].contiguous() 788 | 789 | if self.interaction_count >= self.warmup_steps: 790 | try: 791 | lm_loss, refusal_loss = self.train_kan_step( 792 | input_ids, target_ids, refusal_score 793 | ) 794 | except Exception as e: 795 | logging.error(f"Error during KAN training step: {str(e)}") 796 | lm_loss, refusal_loss = 0.0, 0.0 797 | else: 798 | lm_loss, refusal_loss = 0.0, 0.0 799 | logging.info(f"Warmup step {self.interaction_count}/{self.warmup_steps}") 800 | 801 | try: 802 | validation_loss = self.validate_kan() 803 | except Exception as e: 804 | logging.error(f"Error during KAN validation: {str(e)}") 805 | validation_loss = 0.0 806 | 807 | self.training_losses.append(lm_loss) 808 | self.validation_losses.append(validation_loss) 809 | self.overfit_detector.add_losses(lm_loss, validation_loss) 810 | 811 | if validation_loss > 0.0 and not torch.isnan(torch.tensor(validation_loss)): 812 | if self.early_stopping(validation_loss): 813 | logging.info("Early stopping triggered. 
KAN training halted.") 814 | else: 815 | self.wait = 0 816 | 817 | overfitting_measure = max(0, validation_loss - lm_loss) 818 | self.day_cycle.update(overfitting_measure) 819 | 820 | current_emotion = self.get_current_emotion() 821 | current_time = self.day_cycle.get_time_of_day() 822 | 823 | sleep_info = self.check_sleep_status() 824 | 825 | self.conversation_history.append({"role": "user", "content": user_input}) 826 | self.conversation_history.append({"role": "assistant", "content": response}) 827 | 828 | interaction_result = { 829 | "response": response, 830 | "emotion": current_emotion, 831 | "time": current_time, 832 | "sleep_info": sleep_info, 833 | "lm_loss": lm_loss, 834 | "refusal_loss": refusal_loss, 835 | "validation_loss": validation_loss, 836 | "is_refusal": refusal_score > 0.5, 837 | "iterations": 1, 838 | } 839 | self.interaction_results.append(interaction_result) 840 | 841 | self.refusal_history.append(interaction_result["is_refusal"]) 842 | 843 | try: 844 | self.save_base_state() 845 | except Exception as e: 846 | logging.error(f"Error saving base state: {str(e)}") 847 | 848 | return interaction_result 849 | 850 | def early_stopping(self, current_loss): 851 | if current_loss < self.best_loss: 852 | self.best_loss = current_loss 853 | self.wait = 0 854 | else: 855 | self.wait += 1 856 | if self.wait >= self.patience: 857 | return True 858 | return False 859 | 860 | def is_valid_response(self, response): 861 | if len(response.strip()) < 10: 862 | return False 863 | if all(char in '!?.' for char in response.strip()): 864 | return False 865 | return True 866 | 867 | def clear_memory(self): 868 | gc.collect() 869 | torch.cuda.empty_cache() 870 | 871 | def main_loop(self): 872 | logging.info("Starting LLaMA32TensorRTTool main loop.") 873 | print("Welcome to the LLaMA32 TensorRT Tool. Type 'exit' to quit.") 874 | while True: 875 | user_input = input("You: ") 876 | if user_input.lower() in ['exit', 'quit']: 877 | print("Exiting. Goodbye!") 878 | break 879 | 880 | interaction_result = self.interact(user_input) 881 | 882 | print(f"AI: {interaction_result['response']}") 883 | 884 | self.day_cycle.update(1) 885 | 886 | sleep_info = interaction_result['sleep_info'] 887 | if sleep_info['should_sleep']: 888 | sleep_message = self.perform_sleep() 889 | print(f"AI: {sleep_message}") 890 | 891 | def main(self): 892 | self.load_base_state() 893 | 894 | if not self.system_prompt: 895 | print("No previous conversation found. Please provide a character description to start.") 896 | character_description = input("You: ") 897 | self.set_system_prompt(character_description) 898 | print("Character description set. 
You can now start interacting with the AI.") 899 | 900 | self.main_loop() 901 | 902 | def main(): 903 | llama_tool = LLaMA32TensorRTTool() 904 | llama_tool.main() 905 | 906 | if __name__ == "__main__": 907 | main() -------------------------------------------------------------------------------- /load_offloaded_model_entropytemp.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import logging 5 | import re 6 | import numpy as np 7 | from math import log2 8 | from transformers import ( 9 | LlamaForCausalLM, 10 | AutoTokenizer, 11 | LlamaConfig, 12 | LogitsProcessorList, 13 | RepetitionPenaltyLogitsProcessor, 14 | ) 15 | import torch.nn.functional as F 16 | from sklearn.feature_extraction.text import TfidfVectorizer 17 | from sklearn.metrics.pairwise import cosine_similarity 18 | 19 | # --------------------------- Configuration --------------------------- # 20 | 21 | SOURCE_DIR = "models/Llama_32_1B/" 22 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 23 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 24 | MAX_CONTEXT_LENGTH = 2048 25 | 26 | LOG_FORMAT = "%(asctime)s:%(levelname)s:%(name)s: %(message)s" 27 | LOG_LEVEL = logging.INFO 28 | 29 | # --------------------------- Logging Setup --------------------------- # 30 | 31 | logging.basicConfig( 32 | level=LOG_LEVEL, 33 | format=LOG_FORMAT, 34 | handlers=[logging.StreamHandler()] 35 | ) 36 | logger = logging.getLogger(__name__) 37 | 38 | # --------------------------- Device Configuration --------------------------- # 39 | 40 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 41 | if device.type != "cuda": 42 | logger.error("CUDA-enabled GPU not found. Please ensure a compatible GPU is available.") 43 | raise SystemExit("CUDA-enabled GPU not found.") 44 | 45 | logger.info(f"Using device: {device}") 46 | 47 | # --------------------------- Token Definitions --------------------------- # 48 | 49 | SPECIAL_TOKEN_MAP = { 50 | 128000: "<|begin_of_text|>", 51 | 128001: "<|end_of_text|>", 52 | 128002: "<|reserved_special_token_0|>", 53 | 128003: "<|reserved_special_token_1|>", 54 | 128004: "<|finetune_right_pad_id|>", 55 | 128005: "<|reserved_special_token_2|>", 56 | 128006: "<|start_header_id|>", 57 | 128007: "<|end_header_id|>", 58 | 128008: "<|eom_id|>", 59 | 128009: "<|eot_id|>", 60 | 128010: "<|python_tag|>", 61 | 128011: "<|analytical_start|>", 62 | 128012: "<|analytical_end|>", 63 | 128013: "<|creative_start|>", 64 | 128014: "<|creative_end|>", 65 | 128015: "<|factual_start|>", 66 | 128016: "<|factual_end|>", 67 | } 68 | 69 | # --------------------------- Model Loading --------------------------- # 70 | 71 | def load_configuration(config_path): 72 | with open(config_path, "r") as f: 73 | config_data = json.load(f) 74 | config = LlamaConfig(**config_data) 75 | logger.info(f"Model configuration loaded from {config_path}") 76 | return config 77 | 78 | def load_tokenizer_with_special_tokens(source_dir): 79 | tokenizer = AutoTokenizer.from_pretrained(source_dir) 80 | special_tokens_dict = { 81 | 'additional_special_tokens': list(SPECIAL_TOKEN_MAP.values()) 82 | } 83 | 84 | tokenizer.add_special_tokens(special_tokens_dict) 85 | if "<|finetune_right_pad_id|>" in tokenizer.get_vocab(): 86 | tokenizer.pad_token = "<|finetune_right_pad_id|>" 87 | logger.info(f"Assigned '<|finetune_right_pad_id|>' as pad_token.") 88 | else: 89 | logger.warning(f"'<|finetune_right_pad_id|>' not found in tokenizer vocabulary.") 90 | 91 | return 
tokenizer 92 | 93 | def load_offloaded_weights(model, weights_dir): 94 | for name, param in model.named_parameters(): 95 | file_name = f"{name.replace('.', '_')}.dat" 96 | file_path = os.path.join(weights_dir, file_name) 97 | 98 | if os.path.exists(file_path): 99 | dtype_map = { 100 | torch.float16: np.float16, 101 | torch.float32: np.float32, 102 | torch.int64: np.int64, 103 | torch.int32: np.int32, 104 | torch.bfloat16: np.float32, # Loading bfloat16 as float32 first 105 | } 106 | expected_dtype = dtype_map.get(param.dtype, np.float32) 107 | logger.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 108 | 109 | try: 110 | tensor_data = np.fromfile(file_path, dtype=expected_dtype) 111 | loaded_tensor = torch.from_numpy(tensor_data).to(device) 112 | 113 | if param.dtype == torch.bfloat16: 114 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 115 | 116 | with torch.no_grad(): 117 | param.data.copy_(loaded_tensor.view_as(param)) 118 | logger.debug(f"Successfully loaded {file_name} into {name}") 119 | except Exception as e: 120 | logger.error(f"Error loading {file_name} into {name}: {e}") 121 | else: 122 | logger.warning(f"Weight file {file_path} not found.") 123 | 124 | logger.info("All available weights loaded successfully.") 125 | 126 | # --------------------------- Context Management --------------------------- # 127 | 128 | class AdvancedContextManager: 129 | def __init__(self, model, tokenizer, max_history=10, summary_threshold=5): 130 | self.model = model 131 | self.tokenizer = tokenizer 132 | self.conversation_history = [] 133 | self.max_history = max_history 134 | self.summary_threshold = summary_threshold 135 | self.tfidf_vectorizer = TfidfVectorizer() 136 | self.persona_snippets = { 137 | "formal": "You are a formal and professional AI assistant.", 138 | "casual": "You are a friendly and casual AI assistant.", 139 | "academic": "You are an academic AI assistant with a focus on scientific accuracy.", 140 | "creative": "You are a creative and imaginative AI assistant." 
141 | } 142 | 143 | def update_context(self, user_input, model_output): 144 | self.conversation_history.append((user_input, model_output)) 145 | if len(self.conversation_history) > self.max_history: 146 | self.summarize_older_context() 147 | 148 | def summarize_older_context(self): 149 | older_context = self.conversation_history[:-self.summary_threshold] 150 | summary_prompt = "Summarize the following conversation concisely, capturing key points and context:\n" 151 | for user, ai in older_context: 152 | summary_prompt += f"User: {user}\nAI: {ai}\n" 153 | 154 | summary_input = self.tokenizer(summary_prompt, return_tensors="pt", truncation=True, max_length=1024).to(self.model.device) 155 | summary_output = self.model.generate(summary_input.input_ids, max_new_tokens=200, num_return_sequences=1, do_sample=True, temperature=0.7)  # max_new_tokens bounds only the generated summary (max_length would conflict with prompts longer than 200 tokens), and temperature is only honored when do_sample=True 156 | summary = self.tokenizer.decode(summary_output[0], skip_special_tokens=True) 157 | 158 | self.conversation_history = [("SUMMARY", summary)] + self.conversation_history[-self.summary_threshold:] 159 | 160 | def get_relevant_context(self, current_input, top_k=3): 161 | if not self.conversation_history: 162 | return "" 163 | 164 | context_texts = [f"{user} {ai}" for user, ai in self.conversation_history] 165 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(context_texts + [current_input]) 166 | cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten() 167 | most_relevant_indices = cosine_similarities.argsort()[-top_k:][::-1] 168 | 169 | relevant_context = "" 170 | for idx in most_relevant_indices: 171 | user, ai = self.conversation_history[idx] 172 | relevant_context += f"User: {user}\nAI: {ai}\n\n" 173 | 174 | return relevant_context.strip() 175 | 176 | def select_persona_context(self, user_input): 177 | if any(word in user_input.lower() for word in ["academic", "scientific", "research"]): 178 | return self.persona_snippets["academic"] 179 | elif any(word in user_input.lower() for word in ["creative", "imagine", "story"]): 180 | return self.persona_snippets["creative"] 181 | elif any(word in user_input.lower() for word in ["formal", "professional", "business"]): 182 | return self.persona_snippets["formal"] 183 | else: 184 | return self.persona_snippets["casual"] 185 | 186 | def get_dynamic_prompt(self, user_input): 187 | relevant_context = self.get_relevant_context(user_input) 188 | persona_context = self.select_persona_context(user_input) 189 | return f"{persona_context}\n\nRelevant conversation history:\n{relevant_context}\n\nCurrent user input: {user_input}\n\nAI:" 190 | 191 | # --------------------------- Response Quality Management --------------------------- # 192 | 193 | class ImprovedResponseQualityManager: 194 | LOW_ENTROPY_THRESHOLD = 1.5 195 | HIGH_ENTROPY_THRESHOLD = 25.0 196 | WINDOW_SIZE = 50 197 | EOT_TOKENS = ['�', '\ufffd'] 198 | 199 | def __init__(self, tokenizer, model): 200 | self.tokenizer = tokenizer 201 | self.model = model 202 | self.embedding_cache = {} 203 | 204 | def remove_eot_tokens(self, response): 205 | for token in self.EOT_TOKENS: 206 | response = response.rstrip(token) 207 | return response.strip() 208 | 209 | def _calculate_relevance(self, user_input, response): 210 | tokens_input = set(self.tokenizer.tokenize(user_input.lower())) 211 | tokens_response = set(self.tokenizer.tokenize(response.lower())) 212 | overlap = len(tokens_input & tokens_response) 213 | relevance_score = overlap / max(len(tokens_input), 1) 214 | return relevance_score 215 | 216 | def _check_fluency(self, response): 217 | if len(response.split()) < 
3: 218 | return False 219 | if re.search(r'[^\x00-\x7F]+', response): 220 | return False 221 | return True 222 | 223 | def _check_structure(self, response): 224 | if not response: 225 | return False 226 | if not response[0].isupper(): 227 | return False 228 | if response[-1] not in '.!?': 229 | return False 230 | return True 231 | 232 | def _calculate_windowed_entropy(self, response): 233 | tokens = self.tokenizer.encode(response, return_tensors='pt').to(device) 234 | with torch.no_grad(): 235 | outputs = self.model(tokens, labels=tokens) 236 | logits = outputs.logits 237 | 238 | probabilities = torch.softmax(logits, dim=-1) 239 | token_probs = probabilities.gather(2, tokens.unsqueeze(-1)).squeeze(-1) 240 | token_entropy = -torch.log2(token_probs + 1e-10) 241 | token_entropy = token_entropy.squeeze(0).cpu().numpy() 242 | 243 | window_size = self.WINDOW_SIZE 244 | num_windows = max(1, len(token_entropy) // window_size) 245 | entropy_values = [] 246 | 247 | for i in range(num_windows): 248 | start = i * window_size 249 | end = start + window_size 250 | window = token_entropy[start:end] 251 | if len(window) == 0: 252 | continue 253 | window_entropy = np.mean(window) 254 | entropy_values.append(window_entropy) 255 | 256 | if not entropy_values: 257 | mean_entropy = 0.0 258 | std_entropy = 0.0 259 | else: 260 | mean_entropy = np.mean(entropy_values) 261 | std_entropy = np.std(entropy_values) 262 | 263 | return mean_entropy, std_entropy 264 | 265 | # --------------------------- Entropy-Based Temperature and Sampling Adjustment --------------------------- # 266 | 267 | def adjust_temperature_based_on_entropy(entropy, low_threshold=1.5, high_threshold=25.0): 268 | if entropy > high_threshold: 269 | new_temp = max(0.7, 1.0 - ((entropy - high_threshold) / 10)) 270 | logger.debug(f"High entropy detected ({entropy:.2f}). Lowering temperature to {new_temp:.2f}.") 271 | return new_temp 272 | elif entropy < low_threshold: 273 | new_temp = min(1.5, 1.0 + ((low_threshold - entropy) / 10)) 274 | logger.debug(f"Low entropy detected ({entropy:.2f}). Increasing temperature to {new_temp:.2f}.") 275 | return new_temp 276 | return 1.0 # Default temperature 277 | 278 | def adjust_sampling_parameters(entropy, low_k=50, high_k=5, low_p=0.95, high_p=0.8): 279 | if entropy > 20.0: 280 | logger.debug(f"High entropy ({entropy:.2f}). Setting top_k to {high_k} and top_p to {high_p}.") 281 | return high_k, high_p # Focused, deterministic sampling 282 | elif entropy < 10.0: 283 | logger.debug(f"Low entropy ({entropy:.2f}). Setting top_k to {low_k} and top_p to {low_p}.") 284 | return low_k, low_p # More diverse sampling 285 | # Intermediate adjustment 286 | adjusted_k = int((high_k + low_k) / 2) 287 | adjusted_p = (high_p + low_p) / 2 288 | logger.debug(f"Intermediate entropy ({entropy:.2f}). 
Setting top_k to {adjusted_k} and top_p to {adjusted_p}.") 289 | return adjusted_k, adjusted_p 290 | 291 | def sample_token(probs, top_k, top_p, temperature, special_tokens_set): 292 | if temperature != 1.0: 293 | probs = torch.softmax(torch.log(probs + 1e-10) / temperature, dim=-1)  # re-scale in log space (equivalent to dividing the logits by T); simply dividing the probabilities by a scalar is undone by the later renormalization 294 | 295 | if top_k > 0: 296 | topk_probs, topk_indices = torch.topk(probs, top_k) 297 | probs = torch.zeros_like(probs).scatter_(1, topk_indices, topk_probs) 298 | 299 | if top_p > 0.0: 300 | sorted_probs, sorted_indices = torch.sort(probs, descending=True) 301 | cumulative_probs = torch.cumsum(sorted_probs, dim=-1) 302 | sorted_probs[cumulative_probs - sorted_probs > top_p] = 0  # drop tokens outside the nucleus but always keep the top-ranked token, so the distribution never collapses to all zeros 303 | probs = torch.zeros_like(probs).scatter_(1, sorted_indices, sorted_probs) 304 | 305 | probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-10) 306 | 307 | for token_id in special_tokens_set: 308 | if probs[0, token_id] > 0.1: # Threshold can be adjusted 309 | logger.info(f"Prioritizing special token: {SPECIAL_TOKEN_MAP.get(token_id, 'UNKNOWN')}") 310 | return torch.tensor([[token_id]]).to(probs.device) 311 | 312 | token_id = torch.multinomial(probs, num_samples=1) 313 | return token_id 314 | 315 | # --------------------------- Response Generation --------------------------- # 316 | 317 | def generate_macroprocessed_response(prompt, model, tokenizer, quality_manager): 318 | inputs = tokenizer( 319 | prompt, 320 | return_tensors="pt", 321 | truncation=True, 322 | max_length=MAX_CONTEXT_LENGTH 323 | ).to(device) 324 | input_ids = inputs["input_ids"] 325 | 326 | max_tokens = 2048 # Adjust as needed 327 | generated_ids = input_ids.clone() 328 | 329 | token_log = [] 330 | 331 | for _ in range(max_tokens): 332 | outputs = model(generated_ids) 333 | logits = outputs.logits[:, -1, :] 334 | probs = torch.softmax(logits, dim=-1) 335 | 336 | entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=-1).mean().item() 337 | temperature = adjust_temperature_based_on_entropy(entropy) 338 | top_k, top_p = adjust_sampling_parameters(entropy) 339 | 340 | token_id = sample_token(probs, top_k, top_p, temperature, special_tokens_set={ 341 | tokenizer.eos_token_id, 342 | tokenizer.convert_tokens_to_ids("<|eom_id|>"), 343 | tokenizer.convert_tokens_to_ids("<|eot_id|>") 344 | }) 345 | 346 | if token_id.dim() != 2 or token_id.size(1) != 1: 347 | logger.error(f"Unexpected token_id shape: {token_id.shape}") 348 | raise ValueError(f"token_id has incorrect shape: {token_id.shape}") 349 | 350 | generated_ids = torch.cat([generated_ids, token_id], dim=1) 351 | 352 | token_log.append({ 353 | "token_id": token_id.item(), 354 | "entropy": entropy, 355 | "temperature": temperature, 356 | "top_k": top_k, 357 | "top_p": top_p 358 | }) 359 | 360 | if token_id.item() in tokenizer.all_special_ids: 361 | logger.info(f"End-of-sequence token detected: {SPECIAL_TOKEN_MAP.get(token_id.item(), 'UNKNOWN')}") 362 | break 363 | 364 | for log_entry in token_log: 365 | logger.info(f"Token: {log_entry['token_id']}, Entropy: {log_entry['entropy']:.2f}, " 366 | f"Temperature: {log_entry['temperature']:.2f}, top_k: {log_entry['top_k']}, top_p: {log_entry['top_p']}") 367 | 368 | response = tokenizer.decode(generated_ids[0], skip_special_tokens=True) 369 | response = response.split("AI:")[-1].strip() 370 | response = remove_memory_recall(response) 371 | 372 | return response 373 | 374 | def remove_memory_recall(response): 375 | response = re.sub(r"\[Memory\]:.*\nAI:", "", response, flags=re.DOTALL) 376 | return response.strip() 377 | 378 | def improved_generate_response(input_text, model, tokenizer, history, quality_manager, 
context_manager): 379 | sanitized_input = sanitize_input(input_text) 380 | prompt = context_manager.get_dynamic_prompt(sanitized_input) 381 | 382 | response = generate_macroprocessed_response(prompt, model, tokenizer, quality_manager) 383 | 384 | context_manager.update_context(sanitized_input, response) 385 | 386 | return response, context_manager.conversation_history 387 | 388 | def sanitize_input(user_input): 389 | sanitized = re.sub(r'[^\w\s.,!?]', '', user_input) 390 | return sanitized[:500] 391 | 392 | # --------------------------- Interactive Loop --------------------------- # 393 | 394 | def interactive_query(model, tokenizer, quality_manager, context_manager): 395 | print("\n--- LLaMA Instruct Model Interactive Query ---") 396 | print("Type 'exit' to quit.\n") 397 | 398 | while True: 399 | try: 400 | user_input = input("Enter your query: ").strip() 401 | except (EOFError, KeyboardInterrupt): 402 | print("\nExiting...") 403 | break 404 | 405 | if user_input.lower() == 'exit': 406 | print("Exiting...") 407 | break 408 | 409 | if not user_input: 410 | print("Please enter a valid query.") 411 | continue 412 | 413 | response, _ = improved_generate_response( 414 | user_input, 415 | model, 416 | tokenizer, 417 | context_manager.conversation_history, 418 | quality_manager, 419 | context_manager 420 | ) 421 | 422 | print(f"Model Response: {response}\n") 423 | 424 | # --------------------------- Flash Attention Check --------------------------- # 425 | 426 | def check_flash_attention(): 427 | try: 428 | import flash_attn 429 | logger.info("Flash Attention is available and enabled.") 430 | except ImportError: 431 | logger.warning("Flash Attention is not available. Using standard scaled dot product attention.") 432 | 433 | # --------------------------- Main Execution --------------------------- # 434 | 435 | def main(): 436 | global model 437 | 438 | # Load model configuration 439 | config = load_configuration(MODEL_JSON_PATH) 440 | 441 | # Initialize the model 442 | model = LlamaForCausalLM(config).to(device) 443 | logger.info("Initialized LLaMA model on GPU.") 444 | 445 | # Load offloaded weights 446 | load_offloaded_weights(model, WEIGHTS_DIR) 447 | model.eval() 448 | logger.info("Model is set to evaluation mode.") 449 | 450 | # Load tokenizer with special tokens 451 | tokenizer = load_tokenizer_with_special_tokens(SOURCE_DIR) 452 | 453 | # Resize token embeddings if special tokens were added 454 | if tokenizer.pad_token and tokenizer.pad_token not in tokenizer.get_vocab(): 455 | model.resize_token_embeddings(len(tokenizer)) 456 | logger.info("Resized model token embeddings to accommodate the new pad_token.") 457 | else: 458 | logger.info("pad_token already exists in the tokenizer's vocabulary. No need to resize embeddings.") 459 | 460 | # Check for Flash Attention 461 | check_flash_attention() 462 | 463 | # Initialize Response Quality Manager 464 | quality_manager = ImprovedResponseQualityManager(tokenizer, model) 465 | 466 | # Initialize Context Manager 467 | context_manager = AdvancedContextManager(model, tokenizer) 468 | 469 | logger.info("Model loaded successfully. 
You can now query the model.") 470 | 471 | # Start interactive query loop 472 | interactive_query(model, tokenizer, quality_manager, context_manager) 473 | 474 | if __name__ == "__main__": 475 | main() -------------------------------------------------------------------------------- /load_offloaded_model_old_working.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | import re 6 | import logging 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 10 | 11 | # Define paths to the directories and files 12 | SOURCE_DIR = "models/Llama_32_1B/" 13 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 14 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 15 | 16 | # Initialize logging 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | # Load the configuration from the JSON file 20 | def load_configuration(model_json_path): 21 | with open(model_json_path, "r") as f: 22 | config_data = json.load(f) 23 | config = LlamaConfig(**config_data) 24 | return config 25 | 26 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 27 | def load_tokenizer(source_dir): 28 | return AutoTokenizer.from_pretrained(source_dir) 29 | 30 | # Load the model configuration 31 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 32 | config = load_configuration(MODEL_JSON_PATH) 33 | 34 | # Initialize an empty model based on the configuration 35 | model = LlamaForCausalLM(config) 36 | logging.info("Initialized empty LLaMA model.") 37 | 38 | # Load the offloaded weights from the `.dat` files 39 | def load_dat_file(file_path, dtype): 40 | with open(file_path, 'rb') as f: 41 | tensor_data = np.fromfile(f, dtype=dtype) 42 | loaded_tensor = torch.tensor(tensor_data) 43 | 44 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 45 | if dtype == np.float32 and "bfloat16" in file_path: 46 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 47 | return loaded_tensor 48 | 49 | def load_offloaded_weights(model, weights_dir): 50 | for name, param in model.named_parameters(): 51 | file_name = name.replace('.', '_') + ".dat" 52 | file_path = os.path.join(weights_dir, file_name) 53 | 54 | if os.path.exists(file_path): 55 | dtype_map = { 56 | torch.float16: np.float16, 57 | torch.float32: np.float32, 58 | torch.int64: np.int64, 59 | torch.int32: np.int32, 60 | torch.bfloat16: np.float32, 61 | } 62 | expected_dtype = dtype_map.get(param.dtype, np.float32) 63 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 64 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 65 | 66 | if param.dtype == torch.bfloat16: 67 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 68 | 69 | param.data.copy_(loaded_tensor.to("cuda")) 70 | else: 71 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 72 | 73 | # Load the weights into the model 74 | load_offloaded_weights(model, WEIGHTS_DIR) 75 | 76 | # Move the model to GPU for inference 77 | model.to('cuda') 78 | model.eval() 79 | 80 | # Use AutoTokenizer to handle any tokenizer class discrepancies 81 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 82 | tokenizer = load_tokenizer(SOURCE_DIR) 83 | 84 | # Implement the ResponseQualityManager with metrics and corrective strategies 85 | class 
ResponseQualityManager: 86 | def __init__(self, kan_model, tokenizer): 87 | self.kan_model = kan_model 88 | self.tokenizer = tokenizer 89 | self.tfidf_vectorizer = TfidfVectorizer() 90 | 91 | def evaluate_response(self, user_input, response): 92 | relevance_score = self.calculate_relevance(user_input, response) 93 | structure_valid = self.has_proper_structure(response) 94 | is_garbled = self.detect_garbled_output(response) 95 | return relevance_score > 0.3 and structure_valid and not is_garbled 96 | 97 | def calculate_relevance(self, user_input, response): 98 | user_tokens = set(self.tokenizer.tokenize(user_input)) 99 | response_tokens = set(self.tokenizer.tokenize(response)) 100 | overlap = len(user_tokens.intersection(response_tokens)) 101 | overlap_score = overlap / max(len(user_tokens), 1) 102 | 103 | combined_texts = [user_input, response] 104 | tfidf_matrix = self.tfidf_vectorizer.fit_transform(combined_texts) 105 | cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0] 106 | 107 | return 0.5 * overlap_score + 0.5 * cosine_sim 108 | 109 | def detect_garbled_output(self, response): 110 | if re.search(r'[^\x00-\x7F]+', response): 111 | return True 112 | if len(response.split()) < 3: 113 | return True 114 | if response.count('.') / len(response.split()) > 0.5: 115 | return True 116 | return False 117 | 118 | def has_proper_structure(self, response): 119 | sentences = re.split(r'(?<=[.!?])\s+', response.strip()) 120 | return len(sentences) > 0 and sentences[0][0].isupper() and sentences[-1][-1] in '.!?' 121 | 122 | # Quality Manager instance for response evaluation 123 | quality_manager = ResponseQualityManager(model, tokenizer) 124 | 125 | 126 | # Updated generation logic to handle context better and avoid repetitive responses 127 | def generate_response(input_text, model, tokenizer, max_new_tokens=150, pad_token_id=128001, history=[], context_limit=512): 128 | # Clean the history to avoid redundant prompts 129 | history = [line for line in history if line.strip()] # Remove empty lines 130 | 131 | # Create a simplified context prompt from the last few exchanges 132 | prompt = f"{' '.join(history[-3:])}\nUser: {input_text}\n" if history else f"User: {input_text}\n" 133 | 134 | # Prepare inputs for the model 135 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit).to("cuda") 136 | 137 | # Generate the response 138 | with torch.no_grad(): 139 | outputs = model.generate( 140 | inputs["input_ids"], 141 | attention_mask=inputs["attention_mask"], 142 | max_new_tokens=max_new_tokens, # Control new tokens 143 | do_sample=True, 144 | temperature=0.7, 145 | top_k=50, 146 | top_p=0.9, 147 | repetition_penalty=1.2, 148 | pad_token_id=pad_token_id, 149 | early_stopping=True 150 | ) 151 | 152 | # Decode the response and format it properly 153 | response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip() 154 | 155 | # Ensure clean history management and context length control 156 | cleaned_response = response.split("User:")[-1].strip() # Remove any overlap 157 | cleaned_response = re.sub(r'\s+', ' ', cleaned_response) # Clean excess whitespace 158 | 159 | # Append the cleaned response to history 160 | history.append(f"User: {input_text}\nModel: {cleaned_response}") 161 | 162 | # Trim history to prevent excessive accumulation 163 | if len(history) > 6: 164 | history = history[-6:] 165 | 166 | return cleaned_response, history 167 | 168 | # Updated user input loop to handle context better 169 | def 
user_input_loop(model, tokenizer): 170 | print("\n--- LLaMA Instruct Model Interactive Query ---") 171 | print("Type 'exit' to quit.") 172 | history = [] # Initialize a history buffer to keep track of conversation 173 | while True: 174 | user_input = input("\nEnter your query: ") 175 | if user_input.lower() == 'exit': 176 | print("Exiting...") 177 | break 178 | response, history = generate_response(user_input, model, tokenizer, history=history) 179 | print(f"Model Response: {response}") 180 | 181 | # Start the interactive query loop with the refined response generation 182 | logging.info("Model loaded successfully. You can now query the model.") 183 | user_input_loop(model, tokenizer) 184 | 185 | -------------------------------------------------------------------------------- /nonfunctional_transformers_garbled.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import re 7 | import logging 8 | from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 9 | from typing import Tuple 10 | import json 11 | from tqdm import tqdm # For progress bars 12 | 13 | # Define paths to the directories and files 14 | SOURCE_DIR = "models/Llama_32_1B/" 15 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 16 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 17 | 18 | # Initialize logging 19 | logging.basicConfig(level=logging.INFO) 20 | 21 | # Load the configuration from the JSON file 22 | def load_configuration(model_json_path): 23 | with open(model_json_path, "r") as f: 24 | config_data = json.load(f) 25 | config = LlamaConfig(**config_data) 26 | return config 27 | 28 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 29 | def load_tokenizer(source_dir): 30 | return AutoTokenizer.from_pretrained(source_dir) 31 | 32 | # Load the model configuration 33 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 34 | config = load_configuration(MODEL_JSON_PATH) 35 | 36 | # Initialize an empty model based on the configuration 37 | model = LlamaForCausalLM(config) 38 | logging.info("Initialized empty LLaMA model.") 39 | 40 | # Load the offloaded weights from the `.dat` files with a progress bar 41 | def load_dat_file(file_path, dtype): 42 | with open(file_path, 'rb') as f: 43 | tensor_data = np.fromfile(f, dtype=dtype) 44 | loaded_tensor = torch.tensor(tensor_data) 45 | 46 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 47 | if dtype == np.float32 and "bfloat16" in file_path: 48 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 49 | return loaded_tensor 50 | 51 | def load_offloaded_weights(model, weights_dir): 52 | param_names = list(model.named_parameters()) 53 | # Create a progress bar for weight loading 54 | with tqdm(total=len(param_names), desc="Loading weights", unit="param") as pbar: 55 | for name, param in param_names: 56 | file_name = name.replace('.', '_') + ".dat" 57 | file_path = os.path.join(weights_dir, file_name) 58 | 59 | if os.path.exists(file_path): 60 | dtype_map = { 61 | torch.float16: np.float16, 62 | torch.float32: np.float32, 63 | torch.int64: np.int64, 64 | torch.int32: np.int32, 65 | torch.bfloat16: np.float32, 66 | } 67 | expected_dtype = dtype_map.get(param.dtype, np.float32) 68 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 69 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 70 | 71 
| if param.dtype == torch.bfloat16: 72 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 73 | 74 | param.data.copy_(loaded_tensor.to("cuda")) 75 | else: 76 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 77 | 78 | pbar.update(1) # Update the progress bar after each parameter is loaded 79 | 80 | # Load the weights into the model 81 | load_offloaded_weights(model, WEIGHTS_DIR) 82 | 83 | # Move the model to GPU for inference 84 | model.to('cuda') 85 | model.eval() 86 | 87 | # Use AutoTokenizer to handle any tokenizer class discrepancies 88 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 89 | tokenizer = load_tokenizer(SOURCE_DIR) 90 | 91 | # Rotary embedding application with frequency scaling 92 | def apply_rotary_emb(q, k, freqs_cis, layer_index, rope_scaling): 93 | q_real = q.float().view(*q.shape[:-1], -1, 2) 94 | k_real = k.float().view(*k.shape[:-1], -1, 2) 95 | q_complex = torch.view_as_complex(q_real) 96 | k_complex = torch.view_as_complex(k_real) 97 | 98 | # Ensure freqs_cis is on the same device as q_complex 99 | device = q_complex.device 100 | freqs_cis = freqs_cis.to(device) # Move freqs_cis to the correct device 101 | 102 | # Get rope scaling parameters for this layer 103 | freq_factor = rope_scaling['high_freq_factor'] if layer_index >= 16 else rope_scaling['low_freq_factor'] 104 | 105 | # Adjust freqs_cis to match q's shape, considering the sequence length 106 | seq_len = q.shape[-2] # Sequence length from query tensor 107 | freqs_cis = freqs_cis[:seq_len, :q.shape[-1] // 2] * freq_factor 108 | 109 | # Expand freqs_cis to match the shape of q_complex 110 | freqs_cis = freqs_cis.unsqueeze(0).unsqueeze(1) # Expand for batch size and heads 111 | freqs_cis = freqs_cis.expand_as(q_complex) # Ensure it matches q_complex shape 112 | 113 | # Apply complex multiplication to both query and key tensors 114 | q_rot = torch.view_as_real(q_complex * freqs_cis).flatten(3) 115 | k_rot = torch.view_as_real(k_complex * freqs_cis).flatten(3) 116 | 117 | return q_rot, k_rot 118 | 119 | # Generating scaled rotary frequencies for LLaMA 3.2 120 | def get_rotary_frequencies(config): 121 | hidden_size = config.hidden_size 122 | max_position_embeddings = config.max_position_embeddings 123 | base = config.rope_theta 124 | scaling_factor = config.rope_scaling['factor'] 125 | 126 | inv_freq = 1.0 / (base ** (torch.arange(0, hidden_size, 2).float() / hidden_size)) 127 | t = torch.arange(max_position_embeddings, device=inv_freq.device) 128 | freqs = torch.outer(t, inv_freq) * scaling_factor 129 | return torch.polar(torch.ones_like(freqs), freqs) 130 | 131 | # Custom Attention Layer that applies rotary embeddings and processes attention 132 | class CustomAttentionLayer(nn.Module): 133 | def __init__(self, config, layer_index, weights_dir): 134 | super(CustomAttentionLayer, self).__init__() 135 | self.hidden_size = config.hidden_size 136 | self.num_heads = config.num_attention_heads 137 | self.num_key_value_heads = config.num_key_value_heads 138 | self.head_dim = config.head_dim 139 | self.weights_dir = weights_dir 140 | self.layer_index = layer_index 141 | self.rope_scaling = config.rope_scaling # Add rope scaling from config 142 | 143 | # Create nn.Linear layers 144 | self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) 145 | self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) 146 | self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) 147 | self.o_proj = 
nn.Linear(self.hidden_size, self.hidden_size, bias=False) 148 | 149 | # Load weights into the layers 150 | self.load_weights() 151 | 152 | self.scale = 1 / (self.head_dim ** 0.5) 153 | 154 | def load_weights(self): 155 | self.q_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_q_proj_weight.dat", (self.hidden_size, self.hidden_size)) 156 | self.k_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_k_proj_weight.dat", (self.num_key_value_heads * self.head_dim, self.hidden_size)) 157 | self.v_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_v_proj_weight.dat", (self.num_key_value_heads * self.head_dim, self.hidden_size)) 158 | self.o_proj.weight.data = self.load_weight(f"model_layers_{self.layer_index}_self_attn_o_proj_weight.dat", (self.hidden_size, self.hidden_size)) 159 | 160 | def load_weight(self, file_name, shape): 161 | file_path = os.path.join(self.weights_dir, file_name) 162 | if os.path.exists(file_path): 163 | tensor_data = np.fromfile(file_path, dtype=np.float32) 164 | return torch.tensor(tensor_data).view(*shape).to("cuda") 165 | else: 166 | raise FileNotFoundError(f"Weight file {file_name} not found.") 167 | 168 | def forward(self, hidden_states, freqs_cis, past_key_value=None, position_ids=None): 169 | # Ensure hidden_states are on the same device as model parameters (GPU) 170 | device = self.q_proj.weight.device 171 | hidden_states = hidden_states.to(device) 172 | 173 | batch_size, seq_length, _ = hidden_states.shape 174 | 175 | q = self.q_proj(hidden_states) 176 | k = self.k_proj(hidden_states) 177 | v = self.v_proj(hidden_states) 178 | 179 | q = q.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) 180 | k = k.view(batch_size, seq_length, self.num_key_value_heads, self.head_dim).transpose(1, 2) 181 | v = v.view(batch_size, seq_length, self.num_key_value_heads, self.head_dim).transpose(1, 2) 182 | 183 | # Repeat k and v for multi-query attention 184 | k = k.repeat_interleave(self.num_heads // self.num_key_value_heads, dim=1) 185 | v = v.repeat_interleave(self.num_heads // self.num_key_value_heads, dim=1) 186 | 187 | # Apply rotary embeddings with scaling based on layer index and rope scaling factors 188 | q_rot, k_rot = apply_rotary_emb(q, k, freqs_cis, self.layer_index, self.rope_scaling) 189 | 190 | if past_key_value is not None: 191 | past_k, past_v = past_key_value 192 | if past_k is not None and past_v is not None: 193 | k_rot = torch.cat([past_k, k_rot], dim=2) 194 | v = torch.cat([past_v, v], dim=2) 195 | 196 | attn_output = torch.nn.functional.scaled_dot_product_attention( 197 | q_rot, k_rot, v, attn_mask=None, dropout_p=0.0, is_causal=True 198 | ) 199 | 200 | attn_output = attn_output.transpose(1, 2).contiguous() 201 | attn_output = attn_output.reshape(batch_size, seq_length, self.hidden_size) 202 | attn_output = self.o_proj(attn_output) 203 | 204 | return attn_output, (k_rot, v) 205 | 206 | # Custom Transformer Layer integrating Attention and Feed-Forward Network 207 | class CustomTransformerLayer(nn.Module): 208 | def __init__(self, config, layer_index, weights_dir): 209 | super(CustomTransformerLayer, self).__init__() 210 | self.hidden_size = config.hidden_size 211 | self.intermediate_size = config.intermediate_size 212 | self.layer_index = layer_index 213 | self.weights_dir = weights_dir 214 | 215 | # Attention 216 | self.attention = CustomAttentionLayer(config, layer_index, weights_dir) 217 | 218 | # Layer norms 219 | self.input_layernorm = 
RMSNorm(config.hidden_size, eps=config.rms_norm_eps) 220 | self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) 221 | 222 | # Feed-forward network 223 | self.mlp = MLP( 224 | gate_proj=self.load_weight(f"model_layers_{layer_index}_mlp_gate_proj_weight.dat", (self.intermediate_size, self.hidden_size)), 225 | up_proj=self.load_weight(f"model_layers_{layer_index}_mlp_up_proj_weight.dat", (self.intermediate_size, self.hidden_size)), 226 | down_proj=self.load_weight(f"model_layers_{layer_index}_mlp_down_proj_weight.dat", (self.hidden_size, self.intermediate_size)), 227 | act_fn=F.silu 228 | ) 229 | 230 | def load_weight(self, file_name, shape): 231 | file_path = os.path.join(self.weights_dir, file_name) 232 | if os.path.exists(file_path): 233 | tensor_data = np.fromfile(file_path, dtype=np.float32) 234 | return torch.tensor(tensor_data).view(*shape).to("cuda") 235 | else: 236 | raise FileNotFoundError(f"Weight file {file_name} not found.") 237 | 238 | def forward(self, hidden_states, freqs_cis, past_key_value=None, position_ids=None, use_cache=False): 239 | # Ensure hidden_states are on the same device as model parameters (GPU) 240 | device = self.attention.q_proj.weight.device 241 | hidden_states = hidden_states.to(device) 242 | 243 | # Pre-attention norm 244 | residual = hidden_states 245 | hidden_states = self.input_layernorm(hidden_states) 246 | 247 | # Attention 248 | attention_output, new_past = self.attention(hidden_states, freqs_cis, past_key_value, position_ids) 249 | 250 | # Ensure residual and attention_output are on the same device 251 | attention_output = attention_output.to(residual.device) 252 | 253 | # Residual connection 254 | hidden_states = residual + attention_output 255 | 256 | # Pre-FFN norm 257 | residual = hidden_states 258 | hidden_states = self.post_attention_layernorm(hidden_states) 259 | 260 | # Feed-forward network 261 | hidden_states = self.mlp(hidden_states) 262 | 263 | # Residual connection 264 | hidden_states = residual + hidden_states 265 | 266 | if use_cache: 267 | return hidden_states, new_past 268 | else: 269 | return hidden_states, None 270 | 271 | # RMSNorm Layer 272 | class RMSNorm(nn.Module): 273 | def __init__(self, hidden_size, eps=1e-6): 274 | super().__init__() 275 | self.weight = nn.Parameter(torch.ones(hidden_size)) 276 | self.eps = eps 277 | 278 | def forward(self, hidden_states): 279 | # Ensure hidden_states and self.weight are on the same device 280 | device = self.weight.device 281 | hidden_states = hidden_states.to(device) 282 | 283 | # Compute the variance and apply normalization 284 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 285 | hidden_states = hidden_states * torch.rsqrt(variance + self.eps) 286 | return self.weight * hidden_states 287 | 288 | # MLP Layer 289 | class MLP(nn.Module): 290 | def __init__(self, gate_proj, up_proj, down_proj, act_fn): 291 | super().__init__() 292 | self.gate_proj = nn.Parameter(gate_proj) # We keep these as nn.Parameters (tensors) 293 | self.up_proj = nn.Parameter(up_proj) 294 | self.down_proj = nn.Parameter(down_proj) 295 | self.act_fn = act_fn 296 | 297 | def forward(self, x): 298 | # Ensure input x is on the same device as the model parameters 299 | device = self.gate_proj.device 300 | x = x.to(device) 301 | 302 | # Perform the MLP computation 303 | gate_out = torch.matmul(x, self.gate_proj.T) # Matrix multiplication with gate_proj 304 | up_out = torch.matmul(x, self.up_proj.T) # Matrix multiplication with up_proj 305 | activated_out = self.act_fn(gate_out) # 
Apply activation function 306 | 307 | # Perform element-wise multiplication and apply down_proj 308 | output = torch.matmul(activated_out * up_out, self.down_proj.T) 309 | 310 | return output 311 | 312 | # Custom LLaMA Model integrating custom transformer layers and rotary embeddings 313 | class CustomLlamaModel(LlamaForCausalLM): 314 | def __init__(self, config, weights_dir): 315 | super(CustomLlamaModel, self).__init__(config) 316 | self.weights_dir = weights_dir 317 | self.config = config 318 | 319 | self.transformer_layers = nn.ModuleList( 320 | [CustomTransformerLayer(config, layer_index, weights_dir) 321 | for layer_index in range(config.num_hidden_layers)] 322 | ) 323 | 324 | self.freqs_cis = get_rotary_frequencies(config) 325 | 326 | def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None, position_ids=None, past_key_values=None, use_cache=False, cache_position=None, return_dict=False): 327 | if inputs_embeds is None: 328 | inputs_embeds = self.get_input_embeddings()(input_ids) 329 | 330 | batch_size, seq_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2] 331 | 332 | if position_ids is None: 333 | if cache_position is not None: 334 | position_ids = torch.arange(cache_position, cache_position + seq_length, dtype=torch.long, device=inputs_embeds.device) 335 | position_ids = position_ids.unsqueeze(0).expand(batch_size, -1) 336 | else: 337 | position_ids = torch.arange(seq_length, dtype=torch.long, device=inputs_embeds.device).unsqueeze(0).expand(batch_size, -1) 338 | 339 | if past_key_values is None: 340 | past_key_values = [None] * self.config.num_hidden_layers # Fixed access 341 | 342 | hidden_states = inputs_embeds 343 | presents = [] if use_cache else None 344 | 345 | for i, layer in enumerate(self.transformer_layers): 346 | layer_past = past_key_values[i] if past_key_values is not None and len(past_key_values) > i else None 347 | hidden_states, past = layer(hidden_states, self.freqs_cis, layer_past, position_ids, use_cache) 348 | 349 | if use_cache: 350 | presents.append(past) 351 | 352 | hidden_states = hidden_states.to(self.lm_head.weight.device) 353 | logits = self.lm_head(hidden_states) 354 | 355 | # Always return a dictionary with logits and past_key_values when return_dict=True 356 | if return_dict: 357 | return {"logits": logits, "past_key_values": presents if use_cache else None} 358 | else: 359 | return logits # Return just logits when return_dict=False 360 | 361 | def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **kwargs): 362 | # Prepare inputs for generation, ensuring past key values are handled correctly 363 | if past_key_values: 364 | input_ids = input_ids[:, -1:] # Only pass the last token when past_key_values exist 365 | 366 | return { 367 | "input_ids": input_ids, 368 | "past_key_values": past_key_values, 369 | "attention_mask": attention_mask, 370 | "use_cache": kwargs.get("use_cache", True), 371 | } 372 | 373 | # Custom generate function (replaces generate in transformers) 374 | def custom_generate( 375 | model, 376 | tokenizer, 377 | input_ids, 378 | max_new_tokens=150, 379 | temperature=0.6, # Lower temperature for better coherence 380 | top_k=50, 381 | top_p=0.9, # Higher top_p for diversity without overwhelming the coherence 382 | repetition_penalty=1.2, 383 | pad_token_id=128001, 384 | eos_token_id=None, 385 | device="cuda" 386 | ): 387 | model.eval() # Set model to evaluation mode 388 | generated = input_ids.to(device) # [batch_size, seq_length] 389 | 390 | 
print(f"Initial Input IDs: {input_ids.tolist()}") # Log the initial input 391 | 392 | # Create a progress bar to track the generation process 393 | with tqdm(total=max_new_tokens, desc="Generating tokens", unit="token") as pbar: 394 | for _ in range(max_new_tokens): 395 | with torch.no_grad(): 396 | outputs = model(input_ids=generated, return_dict=True) 397 | logits = outputs["logits"] 398 | 399 | # Check the number of dimensions and handle accordingly 400 | if len(logits.shape) == 3: 401 | logits = logits[:, -1, :] # Standard case, 3D tensor 402 | elif len(logits.shape) == 2: 403 | logits = logits[:, :] # Handle 2D logits 404 | 405 | print(f"Logits shape: {logits.shape}") # Log the shape of logits 406 | 407 | # Apply repetition penalty 408 | if repetition_penalty != 1.0: 409 | for i in range(generated.shape[0]): 410 | unique_tokens = set(generated[i].tolist()) 411 | for token in unique_tokens: 412 | logits[i, token] /= repetition_penalty 413 | 414 | # Apply temperature scaling 415 | logits = logits / temperature 416 | 417 | # Top-K sampling 418 | if top_k > 0: 419 | top_k_logits, _ = torch.topk(logits, top_k, dim=-1) 420 | logits[logits < top_k_logits[:, [-1]]] = -float('Inf') 421 | 422 | # Top-P (nucleus) sampling 423 | sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1) 424 | cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 425 | sorted_indices_to_remove = cumulative_probs > top_p 426 | 427 | # Shift the indices to the right to keep at least one token 428 | sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() 429 | sorted_indices_to_remove[:, 0] = 0 430 | 431 | # Scatter the sorted indices to the original logits tensor 432 | indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) 433 | logits[indices_to_remove] = -float('Inf') 434 | 435 | # Sample from the filtered distribution 436 | probs = F.softmax(logits, dim=-1) 437 | next_token = torch.multinomial(probs, num_samples=1) # [batch_size, 1] 438 | 439 | print(f"Generated Token ID: {next_token.tolist()}") # Log the generated token 440 | 441 | # Append generated token 442 | generated = torch.cat([generated, next_token], dim=-1) # [batch_size, seq_length +1] 443 | 444 | # Break on EOS token 445 | if eos_token_id is not None: 446 | if isinstance(eos_token_id, list): 447 | eos_tensor = torch.tensor(eos_token_id, device=next_token.device) # Ensure eos_token_id is a tensor 448 | if torch.any(torch.isin(next_token, eos_tensor)): 449 | print("EOS token encountered. Ending generation.") 450 | break 451 | else: 452 | if torch.any(next_token == eos_token_id): 453 | print("EOS token encountered. 
Ending generation.") 454 | break 455 | 456 | pbar.update(1) # Update progress bar after generating a token 457 | 458 | print(f"Final Generated Output IDs: {generated.tolist()}") # Log the final output 459 | return generated 460 | 461 | # Generate response method updated to call custom_generate 462 | def generate_response(input_text, model, tokenizer, max_new_tokens=150, pad_token_id=128001, history=[], context_limit=512): 463 | prompt = f"{' '.join(history[-3:])}\nUser: {input_text}\n" if history else f"User: {input_text}\n" 464 | 465 | print(f"Prompt: {prompt}") # Log the prompt to be tokenized 466 | 467 | # Tokenize the input prompt 468 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit) 469 | input_ids = inputs["input_ids"].to(next(model.parameters()).device) 470 | 471 | print(f"Tokenized Input IDs: {input_ids.tolist()}") # Log tokenized input 472 | 473 | # Generate the response using the custom generate function 474 | generated_output = custom_generate( 475 | model=model, 476 | tokenizer=tokenizer, 477 | input_ids=input_ids, 478 | max_new_tokens=max_new_tokens, 479 | temperature=0.7, 480 | top_k=50, 481 | top_p=0.9, 482 | repetition_penalty=1.2, 483 | pad_token_id=pad_token_id, 484 | eos_token_id=[128001, 128008, 128009], # Set your EOS token IDs as per config 485 | device=next(model.parameters()).device 486 | ) 487 | 488 | # Decode the generated output 489 | response = tokenizer.decode(generated_output[0], skip_special_tokens=True).strip() 490 | 491 | # Clean up the response to remove duplicate User tags or extraneous whitespace 492 | cleaned_response = response.split("User:")[-1].strip() 493 | cleaned_response = re.sub(r'\s+', ' ', cleaned_response) 494 | 495 | print(f"Final Generated Response: {cleaned_response}") # Log the cleaned response 496 | 497 | # Append this conversation turn to the history 498 | history.append(f"User: {input_text}\nModel: {cleaned_response}") 499 | 500 | # Trim the history to the last 6 conversation turns 501 | if len(history) > 6: 502 | history = history[-6:] 503 | 504 | return cleaned_response, history 505 | 506 | # Interactive input loop to query the model 507 | def user_input_loop(custom_model, tokenizer): 508 | print("\n--- Custom LLaMA 3.2 Instruct Model ---") 509 | print("Type 'exit' to quit.") 510 | history = [] # Initialize a history buffer to keep track of conversation 511 | while True: 512 | user_input = input("\nEnter your query: ") 513 | if user_input.lower() == 'exit': 514 | print("Exiting...") 515 | break 516 | try: 517 | response, history = generate_response(user_input, custom_model, tokenizer, history=history) 518 | print(f"Model Response: {response}") 519 | except Exception as e: 520 | # Show full error without wrapping to allow for easier debugging 521 | raise e 522 | 523 | 524 | # Initialize the custom model and tokenizer 525 | config = load_configuration(MODEL_JSON_PATH) 526 | tokenizer = load_tokenizer(SOURCE_DIR) 527 | custom_model = CustomLlamaModel(config, WEIGHTS_DIR) 528 | 529 | # Start the user input loop 530 | user_input_loop(custom_model, tokenizer) 531 | -------------------------------------------------------------------------------- /offloadedModelLiveLayerIdea.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import numpy as np 5 | import re 6 | import logging 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.metrics.pairwise import cosine_similarity 9 | 
from transformers import LlamaForCausalLM, AutoTokenizer, LlamaConfig 10 | 11 | # Define paths to the directories and files 12 | SOURCE_DIR = "models/Llama_32_1B/" 13 | WEIGHTS_DIR = os.path.join(SOURCE_DIR, "offload") 14 | MODEL_JSON_PATH = os.path.join(SOURCE_DIR, "config.json") 15 | 16 | # Initialize logging 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | # Load the configuration from the JSON file 20 | def load_configuration(model_json_path): 21 | with open(model_json_path, "r") as f: 22 | config_data = json.load(f) 23 | config = LlamaConfig(**config_data) 24 | return config 25 | 26 | # Use AutoTokenizer instead of LlamaTokenizer to resolve class conflicts 27 | def load_tokenizer(source_dir): 28 | return AutoTokenizer.from_pretrained(source_dir) 29 | 30 | # Load the model configuration 31 | logging.info(f"Loading model configuration from: {MODEL_JSON_PATH}") 32 | config = load_configuration(MODEL_JSON_PATH) 33 | 34 | # Initialize an empty model based on the configuration 35 | model = LlamaForCausalLM(config) 36 | logging.info("Initialized empty LLaMA model.") 37 | 38 | # Load the offloaded weights from the `.dat` files 39 | def load_dat_file(file_path, dtype): 40 | with open(file_path, 'rb') as f: 41 | tensor_data = np.fromfile(f, dtype=dtype) 42 | loaded_tensor = torch.tensor(tensor_data) 43 | 44 | # If dtype was mapped to float32 for bfloat16 compatibility, convert back 45 | if dtype == np.float32 and "bfloat16" in file_path: 46 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 47 | return loaded_tensor 48 | 49 | def load_offloaded_weights(model, weights_dir): 50 | for name, param in model.named_parameters(): 51 | file_name = name.replace('.', '_') + ".dat" 52 | file_path = os.path.join(weights_dir, file_name) 53 | 54 | if os.path.exists(file_path): 55 | dtype_map = { 56 | torch.float16: np.float16, 57 | torch.float32: np.float32, 58 | torch.int64: np.int64, 59 | torch.int32: np.int32, 60 | torch.bfloat16: np.float32, 61 | } 62 | expected_dtype = dtype_map.get(param.dtype, np.float32) 63 | logging.info(f"Loading {file_name} into {name} with expected type {expected_dtype}") 64 | loaded_tensor = load_dat_file(file_path, expected_dtype).view_as(param) 65 | 66 | if param.dtype == torch.bfloat16: 67 | loaded_tensor = loaded_tensor.to(torch.bfloat16) 68 | 69 | param.data.copy_(loaded_tensor.to("cuda")) 70 | else: 71 | logging.warning(f"Warning: {file_name} not found in offloaded directory.") 72 | 73 | # Load the weights into the model 74 | load_offloaded_weights(model, WEIGHTS_DIR) 75 | 76 | # Move the model to GPU for inference 77 | model.to('cuda') 78 | model.eval() 79 | 80 | # Use AutoTokenizer to handle any tokenizer class discrepancies 81 | logging.info(f"Loading tokenizer from directory: {SOURCE_DIR}") 82 | tokenizer = load_tokenizer(SOURCE_DIR) 83 | 84 | # Helper function to calculate entropy 85 | def calculate_entropy(probs): 86 | log_probs = torch.log(probs + 1e-10) # Add small epsilon to avoid log(0) 87 | entropy = -torch.sum(probs * log_probs, dim=-1) 88 | return entropy 89 | 90 | def summarize_history(history, tokenizer, max_length=100): 91 | if not history: 92 | return "" 93 | 94 | # Concatenate the history into a single string 95 | history_text = " ".join(history) 96 | 97 | # Tokenize the history text 98 | history_tokens = tokenizer.encode(history_text, truncation=True, max_length=max_length) 99 | 100 | # Decode the summarized history tokens back into text 101 | summarized_history = tokenizer.decode(history_tokens) 102 | 103 | return summarized_history 104 | 105 | def 
evaluate_response_quality(response, user_input, tokenizer, threshold=0.75): 106 | # Tokenize the response and user input 107 | response_tokens = set(tokenizer.encode(response)) 108 | user_input_tokens = set(tokenizer.encode(user_input)) 109 | 110 | # Calculate the overlap between response and user input tokens 111 | overlap = len(response_tokens.intersection(user_input_tokens)) 112 | overlap_ratio = overlap / len(user_input_tokens) 113 | 114 | # Calculate the coherence of the response 115 | coherence_score = 0.0 # Implement a coherence scoring mechanism 116 | 117 | # Evaluate the relevance and quality of the response 118 | relevance_score = overlap_ratio 119 | quality_score = 0.5 * overlap_ratio + 0.5 * coherence_score 120 | 121 | return quality_score >= threshold 122 | 123 | def adjust_layers(model, quality_score, threshold=0.75): 124 | if quality_score < threshold: 125 | # Reduce the number of layers 126 | num_layers = max(1, model.config.num_hidden_layers // 2) 127 | else: 128 | # Increase the number of layers 129 | num_layers = min(model.config.num_hidden_layers * 2, 48) 130 | 131 | # Adjust the model's layers 132 | model.config.num_hidden_layers = num_layers 133 | model.resize_token_embeddings(len(tokenizer)) 134 | 135 | return model 136 | 137 | def generate_response(input_text, model, tokenizer, max_new_tokens=50, pad_token_id=128001, history=[], context_limit=512): 138 | # Clean the history to avoid redundant prompts 139 | history = [line for line in history if line.strip()] 140 | 141 | # Create a context prompt from the last few exchanges 142 | context = ' '.join(history[-3:]) if history else '' 143 | prompt = f"{context}\nUser: {input_text}\nModel:" 144 | 145 | # Prepare inputs for the model 146 | inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=context_limit).to("cuda") 147 | 148 | # Initialize response and keep track of tokens for refinement 149 | refined_response = "" 150 | refined_token_ids = [] 151 | 152 | # Iteratively generate and refine the response 153 | for iteration in range(10): # Number of iterations can be adjusted 154 | with torch.no_grad(): 155 | outputs = model.generate( 156 | inputs["input_ids"], 157 | attention_mask=inputs["attention_mask"], 158 | max_new_tokens=max_new_tokens // 10, # Distribute tokens across iterations 159 | do_sample=True, 160 | temperature=0.7, 161 | top_k=50, 162 | top_p=0.9, 163 | repetition_penalty=1.2, 164 | pad_token_id=pad_token_id, 165 | eos_token_id=tokenizer.eos_token_id, 166 | output_scores=True, 167 | return_dict_in_generate=True 168 | ) 169 | 170 | # Retrieve the generated token IDs 171 | new_token_ids = outputs.sequences[0][inputs["input_ids"].shape[1]:].tolist() 172 | refined_token_ids.extend(new_token_ids) 173 | 174 | # Decode the generated response 175 | refined_response = tokenizer.decode(refined_token_ids, skip_special_tokens=True).strip() 176 | 177 | # Check if the response is complete 178 | if refined_response.endswith(('.', '!', '?')) or 'User:' in refined_response: 179 | break 180 | 181 | # Update input for next iteration 182 | inputs["input_ids"] = outputs.sequences 183 | 184 | # Clean up the generated output 185 | response = refined_response.replace(prompt, "").strip() 186 | 187 | # Append final cleaned response to history 188 | history.append(f"User: {input_text}") 189 | history.append(f"Model: {response}") 190 | 191 | # Trim history to avoid excessive accumulation 192 | if len(history) > 10: 193 | history = history[-10:] 194 | 195 | return response, history 196 | 197 | def 
user_input_loop(model, tokenizer): 198 | print("\n--- LLaMA Interactive Query ---") 199 | print("Type 'exit' to quit.") 200 | history = [] 201 | 202 | while True: 203 | user_input = input("\nEnter your query: ") 204 | if user_input.lower() == 'exit': 205 | print("Exiting...") 206 | break 207 | 208 | # Generate response using the LLaMA model 209 | response, history = generate_response(user_input, model, tokenizer, history=history) 210 | print(f"Model: {response}") 211 | 212 | # Get user feedback on the response 213 | feedback = input("Please provide feedback on the response (good/bad): ") 214 | 215 | if feedback.lower() == 'bad': 216 | print("Thank you for your feedback. We'll work on improving the model's responses.") 217 | 218 | # Save the final conversation history 219 | with open("conversation_history.json", "w") as f: 220 | json.dump(history, f) 221 | 222 | # Start the interactive query loop with the refined response generation 223 | logging.info("Model loaded successfully. You can now query the model.") 224 | user_input_loop(model, tokenizer) -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # KAN-WuBu-Memory: LLaMA 3.2 1B Instruct with Kolmogorov-Arnold Networks (KAN) Integration 2 | 3 | ![KAN-WuBu Memory](https://img.shields.io/badge/PyTorch-CUDA_Enabled-blue.svg) 4 | ![Version](https://img.shields.io/badge/Version-2.0.0-brightgreen.svg) 5 | ![Contributions](https://img.shields.io/badge/Contributions-Welcome-orange.svg) 6 | 7 | ## Project Overview 8 | 9 | **KAN-WuBu-Memory** is an advanced memory-integrated AI system that combines the **LLaMA 3.2 1B** language model with **Kolmogorov-Arnold Networks (KAN)** and a multi-dimensional memory framework. This system builds on the concept of emotionally aware, contextually sensitive, and dynamically evolving conversations. With support for long-term memory consolidation, real-time emotional modulation, and adaptive response generation, **KAN-WuBu-Memory** is designed for complex and nuanced conversational interactions. 10 | 11 | ### Key Features 12 | 13 | - **Emotionally Aware Conversations**: Tracks and adjusts the AI's emotional state using a multi-dimensional model (`valence`, `arousal`, and `dominance`) to produce responses that align with contextual nuances. 14 | - **Kolmogorov-Arnold Networks (KAN) Integration**: Enhances LLaMA’s interaction by leveraging KANs to adapt internal representations dynamically. 15 | - **Advanced Memory Management**: Utilizes short-term, long-term, and sliding-window memories to retain context and adapt based on conversation history. 16 | - **Refusal Detection and Override**: Identifies refusal phrases and uses corrective mechanisms to ensure smooth and continuous interaction. 17 | - **Entropy-Based Response Management**: Uses entropy metrics to balance randomness and coherence in response generation. 18 | - **Synthetic Day-Night Cycle**: Simulates a day-night cycle to influence the AI’s behavior dynamically, adjusting its responses and internal states based on simulated time. 19 | - **Automatic State Saving and Loading**: Captures and saves the model’s state, memory, and emotional context after each interaction, allowing for continuous learning and persistent memory. 20 | - **Dynamic Sampling Strategy**: Adjusts the sampling parameters (`temperature` and `top_p`) based on entropy, memory importance, and conversation context. 21 | 22 | ## Table of Contents 23 | 24 | 1. 
[Installation](#installation) 25 | 2. [Usage](#usage) 26 | 3. [Customization](#customization) 27 | 4. [How It Works](#how-it-works) 28 | 5. [Contributing](#contributing) 29 | 6. [Credits](#credits) 30 | 7. [License](#license) 31 | 32 | ## Installation 33 | 34 | Follow these steps to set up **KAN-WuBu-Memory** on your system: 35 | 36 | 1. Clone the repository: 37 | 38 | ```bash 39 | git clone https://github.com/waefrebeorn/kan-wubu-memory.git 40 | ``` 41 | 42 | 2. Navigate to the project directory: 43 | 44 | ```bash 45 | cd kan-wubu-memory 46 | ``` 47 | 48 | 3. Run the setup script (`setup.bat` for Windows) to initialize the environment and install dependencies: 49 | 50 | ```bash 51 | .\setup.bat 52 | ``` 53 | 54 | 4. Ensure that Python 3.8+ and CUDA-compatible drivers are installed. The script will automatically set up a virtual environment and install PyTorch, Hugging Face Transformers, and other dependencies. 55 | 56 | 5. **Important**: Manually download the required **LLaMA 3.2 1B** model files and place them in the `models/Llama_32_1B` directory. 57 | 58 | - You can download the files with the Hugging Face CLI after accepting the Llama license. 59 | 60 | ## Usage 61 | 62 | Once the environment is set up, you can interact with the **KAN-WuBu-Memory** AI system: 63 | 64 | **KAN GUI**: Launch the interactive GUI: 65 | 66 | ```bash 67 | .\run.bat 68 | ``` 69 | 70 | 71 | ### Example Interaction 72 | 73 | ``` 74 | User: How are you feeling today? 75 | AI: I feel quite neutral at the moment. How can I assist you? 76 | ``` 77 | 78 | The emotional state will shift dynamically based on the conversation context. 79 | 80 | ## Customization 81 | 82 | You can adjust various components of the system to suit your needs: 83 | 84 | - **System Prompt**: Customize the AI’s character description in `main.py` or directly through the GUI during the first interaction. 85 | - **Emotional Feedback**: Modify the dimensions of emotional feedback to fit your use case (e.g., add `confidence`, `interest`). 86 | - **Synthetic Day Cycle**: Adjust the length and phases of the synthetic day cycle in `llama_32_1b_tool.py`. 87 | - **Memory Management**: Configure short-term and long-term memory buffers, and adjust the clustering for memory consolidation. 88 | - **Entropy Management**: Change entropy thresholds and sampling parameters (`temperature`, `top_p`) for response generation. 89 | 90 | ## How It Works 91 | 92 | ### EmotionalState Module 93 | 94 | The **EmotionalState** class tracks the AI’s emotional state across three dimensions (`valence`, `arousal`, and `dominance`) and updates based on user feedback and conversation context. This emotional model is used to generate emotionally aware and context-sensitive responses. 95 | 96 | ### Overfit Detector 97 | 98 | The **OverfitDetector** monitors loss trends across training and validation windows to identify signs of overfitting and trigger adjustments, such as early stopping or dynamic learning rate scaling. 99 | 100 | ### Kolmogorov-Arnold Networks (KAN) 101 | 102 | KANs modify the hidden layers of LLaMA, allowing the system to fine-tune and optimize its internal representations based on emotional and contextual inputs. The **EnhancedKAN** class enables dynamic adjustments, resulting in a more personalized conversational experience.
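To make the idea concrete, here is a minimal sketch of a residual adapter conditioned on the emotional-state vector. It is an illustration only: the class name, bottleneck width, and gating scheme below are assumptions for the example, not the project's actual `EnhancedKAN` implementation in `llama_32_1b_tool.py`.

```python
import torch
import torch.nn as nn

class HiddenStateAdapter(nn.Module):
    """Illustrative adapter (not the project's EnhancedKAN): adds a small learned
    correction to LLaMA hidden states, gated by (valence, arousal, dominance)."""

    def __init__(self, hidden_size: int, emotion_dim: int = 3, bottleneck: int = 64):
        super().__init__()
        self.down = nn.Linear(hidden_size, bottleneck)      # compress hidden states
        self.up = nn.Linear(bottleneck, hidden_size)        # project back up
        self.emotion_gate = nn.Linear(emotion_dim, bottleneck)
        self.act = nn.SiLU()

    def forward(self, hidden_states: torch.Tensor, emotion: torch.Tensor) -> torch.Tensor:
        # hidden_states: [batch, seq_len, hidden_size]; emotion: [batch, emotion_dim]
        gate = torch.sigmoid(self.emotion_gate(emotion)).unsqueeze(1)  # [batch, 1, bottleneck]
        delta = self.up(self.act(self.down(hidden_states)) * gate)
        return hidden_states + delta  # residual correction keeps the base model intact
```

Because the adjustment is residual and low-rank, the base LLaMA weights can stay frozen while the adapter learns from emotional and contextual feedback.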
103 | 104 | ### Refusal Detection and Override 105 | 106 | The **RefusalDetector** module monitors for refusal phrases (e.g., "I cannot assist with...") and utilizes a KAN-powered override to refine and rephrase these responses, ensuring a smoother interaction flow. 107 | 108 | ### Memory Management 109 | 110 | The **AdvancedMemoryManager** handles multi-dimensional memory, integrating short-term, long-term, and sliding-window memories to consolidate and prioritize context. This module supports clustering, importance scoring, and context summarization for efficient memory management. 111 | 112 | ### Entropy-Based Response Quality Management 113 | 114 | The **EntropyManager** tracks the entropy of generated responses, ensuring a balance between coherence and randomness. Entropy metrics are used to adjust sampling parameters (`temperature`, `top_p`), and trigger "chain-of-thought" reasoning processes when necessary. 115 | 116 | ### Synthetic Day-Night Cycle 117 | 118 | The **SyntheticDayCycle** simulates a day-night cycle that influences the AI’s internal state. The cycle affects behavior, response length, and sampling parameters based on the time of day. 119 | 120 | ### Live State Saving 121 | 122 | After each interaction, the system captures and saves the current state (including emotional context, memory buffers, and learning metrics) to ensure continuous learning and persistence. 123 | 124 | ## Contributing 125 | 126 | We welcome contributions from the community! If you'd like to contribute: 127 | 128 | 1. Fork the repository. 129 | 2. Create a new branch for your feature or bug fix. 130 | 3. Submit a pull request with detailed comments on your changes. 131 | 132 | For major changes, please open an issue first to discuss what you would like to change. 133 | 134 | ## Credits 135 | 136 | This project is built with contributions from various open-source libraries and developers. Special thanks to: 137 | 138 | - **WuBu (WaefreBeorn)**: Project creator and lead developer. 139 | - **Meta AI**: For the **LLaMA** language model that powers the core interaction. 140 | - **Hugging Face**: For providing the **Transformers** library that makes working with modern NLP models accessible. 141 | - **PyTorch Team**: For the foundational deep learning library that enables model training and optimization with CUDA support. 142 | - **Contributors**: Open-source enthusiasts who provide libraries and frameworks like `matplotlib`, `scipy`, and more. 143 | 144 | ### Special Acknowledgments 145 | 146 | - **LLaMA and Meta Research Team** for the original research behind the **LLaMA** language models. 147 | - **Hugging Face Transformers Community** for their dedication to providing accessible NLP tools. 148 | - **NVIDIA** for the CUDA toolkit, enabling efficient GPU computation. 149 | 150 | ## License 151 | 152 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
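## Appendix: Entropy-Driven Sampling (Illustrative)

As a companion to the "Entropy-Based Response Quality Management" section above, the sketch below shows one way entropy can steer sampling parameters. The thresholds, step sizes, and function name are illustrative assumptions, not the tuned values used by `llama_32_1b_tool.py`.

```python
from typing import Tuple

import torch
import torch.nn.functional as F

def entropy_adjusted_sampling(logits: torch.Tensor,
                              temperature: float = 0.7,
                              top_p: float = 0.9,
                              low_entropy: float = 2.0,
                              high_entropy: float = 4.0) -> Tuple[float, float]:
    """Nudge temperature/top_p based on the entropy of the next-token distribution."""
    probs = F.softmax(logits, dim=-1)
    entropy = -(probs * torch.log(probs + 1e-10)).sum(dim=-1).mean().item()
    if entropy < low_entropy:    # distribution too peaked: loosen sampling slightly
        return min(temperature + 0.1, 1.0), min(top_p + 0.05, 0.99)
    if entropy > high_entropy:   # distribution too flat: tighten sampling
        return max(temperature - 0.1, 0.1), max(top_p - 0.05, 0.5)
    return temperature, top_p
```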
153 | 154 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | torch 3 | transformers==4.45.1 4 | numpy 5 | matplotlib 6 | scikit-learn 7 | tqdm 8 | plotly 9 | 10 | # GUI 11 | tk 12 | 13 | 14 | 15 | # File handling 16 | # pathlib  # part of the Python 3 standard library; no install needed 17 | 18 | 19 | # Date and time handling 20 | # datetime  # part of the Python 3 standard library; no install needed 21 | 22 | # NVIDIA GPU support (make sure you have the appropriate CUDA version installed) 23 | # Note: The specific CUDA version should match your GPU driver 24 | 25 | # Optional: for better performance on CPU 26 | # intel-openmp 27 | # mkl 28 | 29 | # Development tools (optional) 30 | # pytest 31 | # black 32 | # isort 33 | # flake8 34 | 35 | # Documentation (optional) 36 | # sphinx 37 | # sphinx-rtd-theme 38 | -------------------------------------------------------------------------------- /run - load_offloaded_model.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Activate the virtual environment 5 | call venv\Scripts\activate 6 | 7 | :: Run the offloaded-model loading script 8 | python load_offloaded_model.py 9 | 10 | pause -------------------------------------------------------------------------------- /run - splitsafetensors.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Activate the virtual environment 5 | call venv\Scripts\activate 6 | 7 | :: Run the safetensors splitting script 8 | python split_safetensors.py 9 | 10 | pause -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Activate the virtual environment 5 | call venv\Scripts\activate 6 | 7 | :: Run the GUI script 8 | python kan_gui.py 9 | 10 | pause -------------------------------------------------------------------------------- /setup.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal enabledelayedexpansion 3 | 4 | echo Starting setup for KAN-WuBu-Memory with LLaMA 3.2 1B Model... 5 | 6 | :: Define project-specific paths 7 | set "PROJECT_DIR=%~dp0" 8 | set "MODEL_DIR=%PROJECT_DIR%models\Llama_32_1B" 9 | 10 | :: Check if Python is installed 11 | python --version >nul 2>&1 12 | if %errorlevel% neq 0 ( 13 | echo Python is not installed. Please install Python 3.8 or later from https://www.python.org/downloads/ 14 | exit /b 1 15 | ) 16 | 17 | :: Create the necessary folder structure 18 | if not exist "%MODEL_DIR%" ( 19 | echo Creating LLaMA 3.2 1B model directory... 20 | mkdir "%MODEL_DIR%" 21 | if %errorlevel% neq 0 ( 22 | echo Failed to create the LLaMA 3.2 model directory. 23 | exit /b 1 24 | ) 25 | ) 26 | 27 | echo Directory structure created successfully: %MODEL_DIR% 28 | 29 | :: Create a virtual environment if it doesn't exist 30 | if not exist "venv" ( 31 | echo Creating virtual environment... 32 | python -m venv venv 33 | if %errorlevel% neq 0 ( 34 | echo Failed to create virtual environment. 35 | exit /b 1 36 | ) 37 | ) 38 | 39 | :: Activate the virtual environment 40 | call venv\Scripts\activate 41 | if %errorlevel% neq 0 ( 42 | echo Failed to activate virtual environment. 43 | exit /b 1 44 | ) 45 | 46 | :: Upgrade pip 47 | echo Upgrading pip...
48 | python -m pip install --upgrade pip 49 | 50 | :: Install PyTorch with CUDA support 51 | echo Installing PyTorch with CUDA support... 52 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 53 | 54 | :: Install other requirements 55 | echo Installing other requirements... 56 | pip install -r requirements.txt 57 | 58 | :: Install Hugging Face transformers 59 | echo Installing latest Hugging Face transformers... 60 | pip install git+https://github.com/huggingface/transformers 61 | 62 | :: Install Accelerate (quote the version spec so cmd does not treat ">" as output redirection) 63 | echo Installing Accelerate... 64 | pip install "accelerate>=0.26.0" 65 | 66 | :: Verify CUDA installation 67 | echo Verifying CUDA installation... 68 | python -c "import torch; print('CUDA available:', torch.cuda.is_available()); print('CUDA version:', torch.version.cuda if torch.cuda.is_available() else 'N/A')" 69 | 70 | :: Additional CUDA diagnostics 71 | echo. 72 | echo Running CUDA diagnostics... 73 | python -c "import torch; print('CUDA device count:', torch.cuda.device_count()); print('CUDA device name:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A')" 74 | 75 | :: Check NVIDIA driver 76 | echo. 77 | echo Checking NVIDIA driver... 78 | nvidia-smi 79 | 80 | echo Environment setup complete. 81 | 82 | echo. 83 | echo IMPORTANT: Manual Model Download Required 84 | echo ========================================== 85 | echo You have two options to get the LLaMA models: 86 | echo 1. **Directly from Meta:** 87 | echo - Visit the LLaMA download form at [https://www.llama.com/llama-downloads] 88 | echo - Fill in your details, select the models you want, and accept the licenses. 89 | echo - Check your email for download instructions and a pre-signed URL to download the model files: 90 | echo - checklist.chk 91 | echo - consolidated.00.pth 92 | echo - params.json 93 | echo - tokenizer.model 94 | echo - Place these files in the following directory: 95 | echo %MODEL_DIR% 96 | echo. 97 | echo 2. **From Hugging Face:** 98 | echo - Use the following command to download directly: 99 | echo huggingface-cli login 100 | echo huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --include "checklist.chk,consolidated.00.pth,params.json,tokenizer.model" --local-dir "%MODEL_DIR%" 101 | 102 | echo. 103 | echo Setup completed successfully. You can now run the main script using run.bat. 104 | pause 105 | -------------------------------------------------------------------------------- /split_safetensors.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from safetensors.torch import load_file 5 | 6 | # Define the directories for source and output 7 | SOURCE_FILE = "models/Llama_32_1B/model.safetensors" # Path to the input safetensor file 8 | OUTPUT_DIR = "models/Llama_32_1B/offload" # Path to the output directory 9 | 10 | # Create the output directory if it doesn't exist 11 | os.makedirs(OUTPUT_DIR, exist_ok=True) 12 | 13 | # Load the safetensors file 14 | print(f"Loading safetensors file from: {SOURCE_FILE}") 15 | state_dict = load_file(SOURCE_FILE) 16 | print(f"Safetensors file loaded. Found {len(state_dict)} tensors.") 17 | 18 | # Utility function to save individual tensors, preserving their original format 19 | def save_tensor(tensor, file_path): 20 | """ 21 | Save a PyTorch tensor to a binary .dat file without any format conversion.
22 | """ 23 | # Identify the original tensor type 24 | original_dtype = tensor.dtype 25 | 26 | # Check if the format is supported by NumPy 27 | try: 28 | # If the tensor is in bfloat16, PyTorch has direct support for saving 29 | if original_dtype == torch.bfloat16: 30 | # Convert to float32 for saving as .dat, since bfloat16 is not supported by numpy 31 | print(f"Saving {file_path} as bfloat16 using float32 for compatibility.") 32 | tensor.to(torch.float32).cpu().numpy().tofile(file_path) 33 | else: 34 | # Use the original format without conversion 35 | tensor.cpu().numpy().tofile(file_path) 36 | 37 | print(f"Saved tensor to {file_path} with original type {original_dtype}") 38 | except Exception as e: 39 | print(f"Failed to save {file_path} with dtype {original_dtype} due to: {e}") 40 | 41 | # Iterate through the state dictionary and save each tensor to a separate .dat file 42 | for tensor_name, tensor in state_dict.items(): 43 | # Construct a file path based on the tensor's name, replacing '.' with '_' 44 | file_path = os.path.join(OUTPUT_DIR, tensor_name.replace('.', '_') + ".dat") 45 | 46 | # Save the tensor in its original format 47 | try: 48 | save_tensor(tensor, file_path) 49 | except ValueError as e: 50 | print(f"Skipping {tensor_name} due to error: {e}") 51 | 52 | print(f"Model has been successfully split into individual .dat files in: {OUTPUT_DIR}") 53 | -------------------------------------------------------------------------------- /test_model_loading.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import pipeline 3 | 4 | def test_model_loading(model_path): 5 | try: 6 | # Use device 0 (GPU) if available, else CPU 7 | device = 0 if torch.cuda.is_available() else -1 8 | 9 | # Define the prompt as a list of messages 10 | prompt = [ 11 | {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."}, 12 | {"role": "user", "content": "What's Deep Learning?"}, 13 | ] 14 | 15 | # Initialize the pipeline with explicit task, model, and tokenizer 16 | generator = pipeline( 17 | task="text-generation", 18 | model=model_path, 19 | tokenizer=model_path, # Explicitly specify the tokenizer path 20 | device=device, 21 | torch_dtype=torch.float16 # Use torch.bfloat16 if supported 22 | ) 23 | 24 | # Generate the response 25 | generation = generator( 26 | prompt, 27 | do_sample=False, 28 | temperature=1.0, 29 | top_p=1, 30 | max_new_tokens=50 31 | ) 32 | 33 | print(f"Generation: {generation[0]['generated_text']}") 34 | except Exception as e: 35 | print(f"Error during generation: {e}") 36 | 37 | if __name__ == "__main__": 38 | # Use a raw string to prevent backslash interpretation 39 | model_path = r"C:\Projects\KAN-WuBu-Memory\models\Llama_32_1B" 40 | test_model_loading(model_path) 41 | -------------------------------------------------------------------------------- /test_sentencepiece.py: -------------------------------------------------------------------------------- 1 | import sentencepiece as spm 2 | 3 | sp = spm.SentencePieceProcessor() 4 | sp.Load("C:/Projects/KAN-WuBu-Memory/models/Llama_32_1B/tokenizer.model") 5 | print("Tokenizer loaded successfully.") 6 | -------------------------------------------------------------------------------- /test_tokenizer_loading.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoTokenizer 3 | 4 | def test_tokenizer_loading(model_path): 5 | try: 6 | tokenizer = 
AutoTokenizer.from_pretrained( 7 | model_path, 8 | use_fast=True, 9 | trust_remote_code=True # Enable custom tokenizer code execution 10 | ) 11 | # Optionally, set a padding token if not already set 12 | if tokenizer.pad_token is None: 13 | tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 14 | print("Tokenizer loaded successfully.") 15 | print(f"Tokenizer type: {type(tokenizer)}") 16 | except Exception as e: 17 | print(f"Error loading tokenizer: {e}") 18 | 19 | if __name__ == "__main__": 20 | model_path = "C:\\Projects\\KAN-WuBu-Memory\\models\\Llama_32_1B" 21 | test_tokenizer_loading(model_path) 22 | -------------------------------------------------------------------------------- /venv.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | setlocal 3 | 4 | :: Check if virtual environment exists, if not, create it 5 | if not exist "venv\Scripts\activate" ( 6 | echo Creating virtual environment... 7 | python -m venv venv 8 | ) 9 | 10 | :: Activate the virtual environment 11 | call venv\Scripts\activate 12 | 13 | :: Inform the user that the environment is active and provide a command prompt 14 | echo Virtual environment activated. Type your commands below. 15 | 16 | :: Open command prompt for user to type commands 17 | cmd /K 18 | --------------------------------------------------------------------------------