├── logo.png
├── easykv
├── __init__.py
├── __pycache__
│ ├── utils.cpython-310.pyc
│ ├── __init__.cpython-310.pyc
│ ├── easykv.cpython-310.pyc
│ ├── llama_patch.cpython-310.pyc
│ └── mistral_patch.cpython-310.pyc
├── utils.py
├── mistral_patch.py
├── llama_patch.py
└── easykv.py
├── test_decoding.py
├── test_ppl.py
├── test_passkey.py
├── test_passkey_NTK.py
├── test_summarization.py
├── README.md
└── doc.txt
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/logo.png
--------------------------------------------------------------------------------
/easykv/__init__.py:
--------------------------------------------------------------------------------
1 | from .easykv import enable_fixed_kv
2 | from .utils import set_dynamicntk_rope_length
--------------------------------------------------------------------------------
/easykv/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/utils.cpython-310.pyc
--------------------------------------------------------------------------------
/easykv/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/easykv/__pycache__/easykv.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/easykv.cpython-310.pyc
--------------------------------------------------------------------------------
/easykv/__pycache__/llama_patch.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/llama_patch.cpython-310.pyc
--------------------------------------------------------------------------------
/easykv/__pycache__/mistral_patch.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/mistral_patch.cpython-310.pyc
--------------------------------------------------------------------------------
/easykv/utils.py:
--------------------------------------------------------------------------------
1 | from types import MethodType
2 | from transformers.models.llama.modeling_llama import LlamaAttention
3 |
4 |
5 | def modify_method_of_instance(instance, target_class_name, target_method_name, new_method, visited_instances=None):
6 | """
7 | This function modifies the method of an instance of a model class.
8 | It's part from chat-GPT.
9 | It will replace the method with the new method.
10 | Currently, we only use this function to modify the attention method of a model. Do not test it further.
11 |
12 | instance:
13 | instance of a model to modify.
14 | target_class_name:
15 | name of the attention class to modify. E.g. 'LlamaAttention', 'GPTNeoXAttention', etc.
16 | new_method: new method to replace the original method. E.g. 'self_extend_forward'.
17 | It should include a parameter 'self' to be binded to the instance.
18 | """
19 | if visited_instances is None:
20 | visited_instances = set()
21 | # Unique identifier for the instance (using id() since object's id is unique)
22 | instance_id = id(instance)
23 | if instance_id in visited_instances:
24 | return
25 | # Add the instance to the already_visited set
26 | visited_instances.add(instance_id)
27 |
28 | # Check if this instance is of the target class
29 | if instance.__class__.__name__ == target_class_name:
30 | bond_method = MethodType(new_method, instance)
31 | setattr(instance, target_method_name, bond_method)
32 | elif hasattr(instance, '__dict__'):
33 | for attr_name, attr_value in instance.__dict__.items():
34 | if isinstance(attr_value, object) and not isinstance(attr_value, (list, tuple, dict, set)):
35 | modify_method_of_instance(attr_value, target_class_name, target_method_name, new_method, visited_instances)
36 | elif isinstance(attr_value, (list, tuple)):
37 | for item in attr_value:
38 | if isinstance(item, object):
39 | modify_method_of_instance(item, target_class_name, target_method_name, new_method, visited_instances)
40 | # If attribute value is a dictionary, iterate over its values and recurse
41 | # E.g, for a ModuleList, its moudels are stored in a dictionary: ._modules
42 | elif isinstance(attr_value, dict):
43 | for key, value in attr_value.items():
44 | if isinstance(value, object):
45 | modify_method_of_instance(value, target_class_name, target_method_name, new_method, visited_instances)
46 |
47 | # If attribute value is a set, iterate and recurse
48 | elif isinstance(attr_value, set):
49 | for item in attr_value:
50 | if isinstance(item, object):
51 | modify_method_of_instance(item, target_class_name, target_method_name, new_method, visited_instances)
52 |
53 | def set_dynamicntk_rope_length(model, max_length):
54 | for name, module in model.named_modules():
55 | if isinstance(module, LlamaAttention):
56 | module.rotary_emb._set_cos_sin_cache(max_length, device=model.device, dtype=module.rotary_emb.inv_freq.dtype)
57 | print(f"DynamicNTKRoPE max length reset to {max_length}")
--------------------------------------------------------------------------------
/test_decoding.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 | import torch
4 | from transformers import (AutoModelForCausalLM, AutoTokenizer)
5 | from easykv import enable_fixed_kv
6 |
7 | # define the model path and the corresponding prompt template
8 | MODEL_CONFIGS = {
9 | 'wizardlm_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--WizardLM--WizardLM-13B-V1.2/snapshots/cf5f40382559f19e13874e45b39575171ca46ef8', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!\nUSER: {inst}\nASSISTANT:"),
10 | 'llama2_13b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496/', template="[INST] <
3 | 9 | Update • 10 | Features • 11 | Installation • 12 | Example • 13 | Passkey Retrieval • 14 | Summarization • 15 | Instruction Following • 16 | Perplexity • 17 | Todos • 18 | Acknowledgement 19 |
20 | 21 | ## Update 22 | + [2024.2.12] The paper of EasyKV is now available on arxiv(https://arxiv.org/abs/2402.06262). 23 | + [2024.1.28] Add support for streaming mode by setting ```streaming``` in generation config to ```True```. 24 | + [2024.1.19] Add support for stopping condition during generation. 25 | + [2024.1.17] Add ```auto``` mode. See [example](#auto-mode). 26 | + [2024.1.17] Add examples for [perplexity computation](#perplexity) using LLaMa2-13B and DynamicNTK. 27 | + [2024.1.16] Add examples for [Instruction Following](#instruction-following) using LLaMa2-7B-Chat. 28 | + [2024.1.15] Add examples for [Passkey Retrieval](#passkey-retrieval-example) using long-context LLM(Vicuna-7B-16K) and DynamicNTK-scaled LLaMa2-7B-Chat. 29 | + [2024.1.15] Add examples for [Summarization](#summarization-example) using LLaMa2-7B-Chat. 30 | + [2024.1.14] Uploaded the standalone Pytorch implementation. Pypi package and paper describing the details of our integrated eviction policy design are coming soon. 31 | 32 | ## Features 33 | + Offer control over the memory budget allocated for the KV cache during LLM inference, with easy-to-use interface. 34 | + Support both prompt encoding and auto-regressive decoding. 35 | + Support Multi-Head Attention(MHA), Multi-Query Attention(MQA), and Grouped-Query Attention(GQA). 36 | + Support LLaMa, LLaMa2, and Mistral. 37 | + Support various stride for prompt encoding(larger stride leads to faster encoding). 38 | 39 | ## Installation 40 | First of all, clone this repo into your working directory. 41 | ```bash 42 | git clone https://github.com/DRSY/EasyKV.git 43 | cd EasyKV 44 | ``` 45 | Then import ```enable_fixed_kv``` in your Python script: 46 | ```python 47 | from easykv import enable_fixed_kv 48 | ``` 49 | 50 | ## Example Usage 51 | There are two different phases in LLM generative inference, i.e., prompt encoding and auto-regressive decoding. 52 | ### Prompt Encoding/Prefilling 53 | For prefilling stage, please specify ```budget``` in the range of (0,1), e.g., 0.5, which leads to 50% savings in KV cache memory footprint. 54 | ```python 55 | import torch 56 | from transformers import AutoModelForCausalLM, AutoTokenizer 57 | 58 | # Define your model path and template in a dict MODEL_CONFIGS 59 | model_name = 'zephyr_7b' 60 | path = MODEL_CONFIGS[model_name]['path'] 61 | template = MODEL_CONFIGS[model_name]['template'] 62 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto').eval() 63 | tokenizer = AutoTokenizer.from_pretrained(path) 64 | 65 | # Turn on fixed KV cache mode for prefilling phase 66 | stride=8 67 | enable_fixed_kv(model, tokenizer, mode='encoding', stride=stride) 68 | 69 | # Test input 70 | article = "###\nArticle: It was the first time the Single Transferable Vote (STV) system had been used to select two members in the same ward in a by-election. The SNP topped the vote in the Leith Walk by-election, while Scottish Labour won the second seat from the Greens. The by-election was called after Deidre Brock of the SNP and Maggie Chapman of the Scottish Greens stood down. The SNP's John Lewis Ritchie topped the Leith Walk poll with 2,290 votes. He was elected at stage one in the STV process with a swing in first-preference votes of 7.6% from Labour. Labour's Marion Donaldson received 1,623 votes, ahead of Susan Jane Rae of the Scottish Greens on 1,381. Ms Donaldson was elected at stage 10 of the voting process after other preferences had been considered. The by-election was called after Ms Brock stood down when she was elected as the SNP MP for Edinburgh North and Leith in May. Ms Chapman, of the Scottish Greens, resigned from her post to concentrate on standing for the Scottish Parliament in next May's election. The turnout for the by-election was 25.1%. The SNP also held the Midlothian West seat on Midlothian Council with a swing of 6.3% from Labour. The party's Kelly Parry secured 1,540 votes, ahead of Labour's Ian Miller on 945 votes. The by-election was called after Owen Thompson was elected as SNP MP for the Midlothian constituency.\n\nSummarize the above article in 1 sentence.\n" 71 | prompt = f"Write a SHORT summary of the following text delimited by triple backticks. Return your response which covers the key points of the text.\n```{article}```" 72 | input_prompt = template.format(inst=prompt) 73 | 74 | # Define eviction policy 75 | kv_policy = 'roco' 76 | # Define sampling parameters 77 | gen_kwargs = dict( 78 | temperature=1e-9, 79 | top_p=1.0, 80 | max_new_tokens=256, 81 | budget=0.5, 82 | kv_policy=kv_policy, 83 | keep_attention=False, # set to True if your DRAM is not tight and you can get better performance 84 | eos_token_ids=[tokenizer.eos_token_id] 85 | ) 86 | input_ids = tokenizer([input_prompt], return_tensors='pt').input_ids.to(model.device) 87 | output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs) 88 | print(f"{'='*20} {kv_policy} {'='*20}\n{output}") 89 | ``` 90 | ### Auto-regressive Decoding 91 | For auto-regressive decoding phase, please specify ```budget``` as an integer, which represents the maximum length of KV cache, e.g, 200. 92 | ```python 93 | import torch 94 | from transformers import AutoModelForCausalLM, AutoTokenizer 95 | 96 | # Define your model path and template in a dict MODEL_CONFIGS 97 | model_name = 'llama2_7b_chat' 98 | path = MODEL_CONFIGS[model_name]['path'] 99 | template = MODEL_CONFIGS[model_name]['template'] 100 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto').eval() 101 | tokenizer = AutoTokenizer.from_pretrained(path) 102 | 103 | # Turn on fixed KV cache mode for decoding phase 104 | enable_fixed_kv(model, tokenizer, mode='decoding', stride=1) 105 | 106 | # Test input 107 | prompt = f"What are the names of some famous actors that started their careers on Broadway?" 108 | input_prompt = template.format(inst=prompt) 109 | kv_policy = 'roco' 110 | # Define sampling parameters 111 | gen_kwargs = dict( 112 | temperature=1e-9, 113 | top_p=1.0, 114 | max_new_tokens=2048, 115 | budget=200, 116 | kv_policy=kv_policy, 117 | eos_token_ids=[tokenizer.eos_token_id] 118 | ) 119 | input_ids = tokenizer([input_prompt], return_tensors='pt').input_ids.to(model.device) 120 | output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs) 121 | print(f"{'='*20} {kv_policy} {'='*20}\n{output}") 122 | ``` 123 | ### Auto Mode 124 | In case both the prompt and generation are long, ```auto``` mode can help automatically handle KV cache throught the prefilling and decoding stages. 125 | ```python 126 | stride = 64 # stride for sliding window 127 | kv_policy = "roco" # cache eviction policy 128 | budget = 1024 # an integer specifying the maximum KV cache 129 | enable_fixed_kv(model, tokenizer, mode='auto', stride=stride) 130 | gen_kwargs = dict( 131 | temperature=1e-9, 132 | top_p=1.0, 133 | max_new_tokens=64, 134 | budget=budget, 135 | kv_policy=kv_policy, 136 | keep_attention=False, # set to True if your DRAM is not tight and you can get better performance 137 | eos_token_ids=[tokenizer.eos_token_id] 138 | ) 139 | output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs) 140 | ``` 141 | 142 | ### Passkey Retrieval Example 143 | We provide examplar code for passkey retrieval in [test_passkey.py](./test_passkey.py) and [test_passkey_NTK.py](./test_passkey_NTK.py) using Vicuna-7B-16K and DynamicNTK-scaled LLaMa2-7B-Chat, respectively. 144 | 145 | The results of DynamicNTK-scaled LLaMa2-7B-Chat on ```5K``` passkey retrieval task is shown below: 146 | ```bash 147 | #Tokens of Prompt: 5144 Passkey target: 89427 148 | KV cache budget ratio: 100.00%(5144/5144) 149 | Current GPU memory usage: 18.359 GB 150 | Peak GPU memory usage: 21.751 GB 151 | Llama2-EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 89427.] 152 | 153 | KV cache budget ratio: 50.08%(2576/5144) 154 | Current GPU memory usage: 15.625 GB 155 | Peak GPU memory usage: 18.423 GB 156 | Llama2-EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 89427.] 157 | ------------------------------------------------------------------------------------ 158 | #Tokens of Prompt: 5144 Passkey target: 51906 159 | KV cache budget ratio: 100.00%(5144/5144) 160 | Current GPU memory usage: 18.359 GB 161 | Peak GPU memory usage: 21.751 GB 162 | Llama2-EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 51906.] 163 | 164 | KV cache budget ratio: 50.08%(2576/5144) 165 | Current GPU memory usage: 15.625 GB 166 | Peak GPU memory usage: 18.427 GB 167 | Llama2-EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 51906.] 168 | ------------------------------------------------------------------------------------ 169 | #Tokens of Prompt: 5144 Passkey target: 38117 170 | KV cache budget ratio: 100.00%(5144/5144) 171 | Current GPU memory usage: 18.359 GB 172 | Peak GPU memory usage: 21.751 GB 173 | Llama2-EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 38117.] 174 | 175 | KV cache budget ratio: 50.08%(2576/5144) 176 | Current GPU memory usage: 15.625 GB 177 | Peak GPU memory usage: 18.427 GB 178 | Llama2-EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 38117.] 179 | ------------------------------------------------------------------------------------ 180 | #Tokens of Prompt: 5144 Passkey target: 60151 181 | KV cache budget ratio: 100.00%(5144/5144) 182 | Current GPU memory usage: 18.359 GB 183 | Peak GPU memory usage: 21.751 GB 184 | Llama2-EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 60151.] 185 | 186 | KV cache budget ratio: 50.08%(2576/5144) 187 | Current GPU memory usage: 15.625 GB 188 | Peak GPU memory usage: 18.427 GB 189 | Llama2-EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 60151.] 190 | ------------------------------------------------------------------------------------ 191 | #Tokens of Prompt: 5144 Passkey target: 23789 192 | KV cache budget ratio: 100.00%(5144/5144) 193 | Current GPU memory usage: 18.359 GB 194 | Peak GPU memory usage: 21.752 GB 195 | Llama2-EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 23789.] 196 | 197 | KV cache budget ratio: 50.08%(2576/5144) 198 | Current GPU memory usage: 15.626 GB 199 | Peak GPU memory usage: 18.427 GB 200 | Llama2-EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 23789.] 201 | ``` 202 | 203 | The results of Vicuna-7B-16K on ```10K``` passkey retrieval task is shown below: 204 | ```bash 205 | #Tokens of Prompt: 9994 Passkey target: 51013 206 | KV cache budget ratio: 100.00%(9994/9994) 207 | Current GPU memory usage: 23.666 GB 208 | Peak GPU memory usage: 41.896 GB 209 | EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 51013.] 210 | 211 | KV cache budget ratio: 50.05%(5002/9994) 212 | Current GPU memory usage: 18.4 GB 213 | Peak GPU memory usage: 25.36 GB 214 | EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 51013.] 215 | ------------------------------------------------------------------------------------ 216 | #Tokens of Prompt: 9994 Passkey target: 36920 217 | KV cache budget ratio: 100.00%(9994/9994) 218 | Current GPU memory usage: 23.666 GB 219 | Peak GPU memory usage: 41.896 GB 220 | EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 36920.] 221 | 222 | KV cache budget ratio: 50.05%(5002/9994) 223 | Current GPU memory usage: 18.378 GB 224 | Peak GPU memory usage: 25.36 GB 225 | EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 36920.] 226 | ------------------------------------------------------------------------------------ 227 | #Tokens of Prompt: 9994 Passkey target: 83493 228 | KV cache budget ratio: 100.00%(9994/9994) 229 | Current GPU memory usage: 23.666 GB 230 | Peak GPU memory usage: 41.896 GB 231 | EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 83493.] 232 | 233 | KV cache budget ratio: 50.05%(5002/9994) 234 | Current GPU memory usage: 18.378 GB 235 | Peak GPU memory usage: 25.36 GB 236 | EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 83493.] 237 | ------------------------------------------------------------------------------------ 238 | #Tokens of Prompt: 9994 Passkey target: 78585 239 | KV cache budget ratio: 100.00%(9994/9994) 240 | Current GPU memory usage: 23.666 GB 241 | Peak GPU memory usage: 41.896 GB 242 | EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 78585.] 243 | 244 | KV cache budget ratio: 50.05%(5002/9994) 245 | Current GPU memory usage: 18.378 GB 246 | Peak GPU memory usage: 25.36 GB 247 | EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 78585.] 248 | ------------------------------------------------------------------------------------ 249 | #Tokens of Prompt: 9994 Passkey target: 58328 250 | KV cache budget ratio: 100.00%(9994/9994) 251 | Current GPU memory usage: 23.666 GB 252 | Peak GPU memory usage: 41.896 GB 253 | EasyKV-h2o_head_std_avg(100.00%): [What is the pass key? The pass key is 58328.] 254 | 255 | KV cache budget ratio: 50.05%(5002/9994) 256 | Current GPU memory usage: 18.378 GB 257 | Peak GPU memory usage: 25.36 GB 258 | EasyKV-h2o_head_std_avg(50.00%): [What is the pass key? The pass key is 58328.] 259 | ``` 260 | 261 | ### Summarization Example 262 | We provide examplar code for summarization in [test_summarization.py](./test_summarization.py). 263 | The results of full KV cache and 50%-constrained KV cache using EasyKV is shown below: 264 | ```bash 265 | EasyKV(100.00%): The 2016 European Championship, also known as Euro 2016, will take place in France from June 10 to July 10, featuring 24 teams, including France, Spain, Germany, England, Wales, and Northern Ireland, with the tournament kicking off with France playing Romania on Friday, June 10, and the final taking place at the Stade de France in Paris on July 10. 266 | EasyKV(50.00%): The 2016 European Championship, also known as Euro 2016, will be held in France from June 10th to July 10th, featuring 24 teams, including defending champions Spain, and will be marked by a number of changes to the rules of the game, as well as increased security measures due to the ongoing terror threat. 267 | ``` 268 | 269 | ### Instruction Following 270 | We provide examplar code for instruction-following in [test_decoding.py](./test_decoding.py). 271 | The results of EasyKV using different KV budget(300/150) with LLaMa2-7B-Chat are shown below: 272 | ```bash 273 | Instruction: What are the names of some famous actors that started their careers on Broadway? 274 | KV cache budget ratio: 58.14%(300/516) 275 | ==================== EasyKV-h2o_head_decay_avg_std-300 ==================== 276 | Thank you for your question! There are many talented actors who have started their careers on Broadway and gone on to achieve great success in the entertainment industry. Here are some famous actors who got their start on Broadway: 277 | 1. Hugh Jackman: Known for his roles in "The Boy from Oz" and "The Greatest Showman," Jackman got his start on Broadway in the musical "Oklahoma!" 278 | 2. Audra McDonald: A six-time Tony Award winner, McDonald got her start on Broadway in the musical "Ragtime." 279 | 3. Idina Menzel: Menzel got her start on Broadway in the musical "Rent" and went on to star in "Wicked" and "Frozen." 280 | 4. Lin-Manuel Miranda: Miranda got his start on Broadway in the musical "In the Heights" and went on to create the hit musical "Hamilton." 281 | 5. Bernadette Peters: A legendary actress and singer, Peters got her start on Broadway in the musical "The Beautyful Nose" and has since starred in numerous productions, including "Gypsy" and "Sweeney Todd." 282 | 6. James Corden: Corden got his start on Broadway in the musical "Les Miserables" before becoming a late-night talk show host on "The Late Late Show with James Corden." 283 | 7. Christine Baranski: Baranski got her start on Broadway in the musical "The Producers" and has since appeared in numerous productions, including "The Good Wife" and "The Good Fight." 284 | 8. Nathan Lane: Lane got his start on Broadway in the musical "A Funny Thing Happened on the Way to the Forum" and has since starred in numerous productions, including "The Producers" and "It's Only a Play." 285 | 9. Bette Midler: Midler got her start on Broadway in the musical "Fiddler on the Roof" and went on to star in "Wicked" and "Hello, Dolly!" 286 | 10. John Leguizamo: Leguizamo got his start on Broadway in the play "A Day in the Death of Jose Marti" and has since appeared in numerous productions, including "Spanglish" and "The Lion King." 287 | I hope this list helps! Let me know if you have any other questions. 288 | KV cache budget ratio: 28.30%(150/530) 289 | ==================== EasyKV-h2o_head_decay_avg_std-150 ==================== 290 | Thank you for your question! There are many talented actors who have started their careers on Broadway and gone on to achieve great success in the entertainment industry. Here are some famous actors who got their start on Broadway: 291 | 1. Hugh Jackman: Known for his roles in "The Boy from Oz" and "The Greatest Showman," Jackman got his start on Broadway in the musical "Oklahoma!" 292 | 2. Audra McDonald: A six-time Tony Award winner, McDonald got her start on Broadway in the musical "Ragtime." 293 | 3. Idina Menzel: Menzel got her start on Broadway in the musical "Rent" and went on to star in "Wicked" and "Frozen." 294 | 4. Lin-Manuel Miranda: Miranda got his start on Broadway in the musical "In the Heights" and went on to create the hit musical "Hamilton." 295 | 5. Bernadette Peters: A legendary actress and singer, Peters got her start on Broadway in the musical "The Beautyful Nose" and has since starred in numerous Broadway productions. 296 | 6. James Corden: Corden got his start on Broadway in the musical "Les Miserables" before becoming a late-night talk show host on "The Late Late Show with James Corden." 297 | 7. Christine Baranski: Baranski got her start on Broadway in the musical "The Producers" before going on to star in the TV show "The Good Wife" and the movie "The Big Sick." 298 | 8. Nathan Lane: Lane got his start on Broadway in the musical "A Funny Thing Happened on the Way to the Forum" and has since starred in numerous Broadway productions, including "The Producers" and "The Birdcage." 299 | 9. Bette Midler: Midler got her start on Broadway in the musical "Fiddler on the Roof" before going on to star in the TV show "The Rose" and the movie "Hocus Pocus." 300 | 10. John Leguizamo: Leguizamo got his start on Broadway in the play "A Day in the Death of Jose Marti" before going on to star in numerous TV shows and movies, including "ER" and "Ice Age." 301 | These are just a few examples of actors who got their start on Broadway. There are many other talented actors who have also gotten their start on the Great White Way. 302 | ``` 303 | 304 | ### Perplexity 305 | We provide examplar code for perplexity computation in [test_ppl.py](./test_ppl.py). 306 | The result with LLaMa2-13B with DynamicNTK on 10000-token document([doc.txt](./doc.txt)) is shown below: 307 | ```bash 308 | DynamicNTKRoPE max length reset to 11000 309 | Fixed KV Cache for ppl enabled 310 | 311 | Input token length: 10253 312 | EasyKV-100.00% PPL: 7.44 313 | ------------------------------------------ 314 | KV cache budget ratio: 50.38%(5165/10253) 315 | EasyKV-recency-50.00% PPL: 7.68 316 | ------------------------------------------ 317 | KV cache budget ratio: 50.38%(5165/10253) 318 | EasyKV-h2o_head_std_avg-50.00% PPL: 7.47 319 | ``` 320 | 321 | ## List of Supported KV Eviction Policies: 322 | + random: drop kv cache of a randomly chosen position. 323 | + recency: similar to StreamingLLM, dropping the least recent token's kv cache. 324 | + h2o_head: Heavy-hitter oracle, which drops kv cache whose accumulated attention score is smallest. 325 | + tova: Token Omission Via Attention, which uses attention weights of the last token only. 326 | + roco: newly proposed eviction policy with better evivtion candidate selection and importance estimation. 327 | 328 | 329 | ## Todos 330 | + [x] Add ```auto``` mode so that users don't have to manually specify ```encoding``` or ```decoding```. 331 | + [ ] Add [LongBench](https://github.com/THUDM/LongBench/tree/main?tab=readme-ov-file#how-to-evaluate-on-LongBench) evaluation. 332 | + [ ] Add filtering mechanism to prevent dropping KV cache of specified tokens. 333 | 334 | 335 | ## Acknowledgement 336 | ```latex 337 | @article{xiao2023efficient, 338 | title={Efficient streaming language models with attention sinks}, 339 | author={Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike}, 340 | journal={arXiv preprint arXiv:2309.17453}, 341 | year={2023} 342 | } 343 | 344 | @article{liu2023scissorhands, 345 | title={Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time}, 346 | author={Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali}, 347 | journal={arXiv preprint arXiv:2305.17118}, 348 | year={2023} 349 | } 350 | 351 | @article{zhang2023h, 352 | title={H $ \_2 $ O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models}, 353 | author={Zhang, Zhenyu and Sheng, Ying and Zhou, Tianyi and Chen, Tianlong and Zheng, Lianmin and Cai, Ruisi and Song, Zhao and Tian, Yuandong and R{\'e}, Christopher and Barrett, Clark and others}, 354 | journal={arXiv preprint arXiv:2306.14048}, 355 | year={2023} 356 | } 357 | 358 | @article{oren2024transformers, 359 | title={Transformers are Multi-State RNNs}, 360 | author={Oren, Matanel and Hassid, Michael and Adi, Yossi and Schwartz, Roy}, 361 | journal={arXiv preprint arXiv:2401.06104}, 362 | year={2024} 363 | } 364 | ``` -------------------------------------------------------------------------------- /doc.txt: -------------------------------------------------------------------------------- 1 | Chapter: "Mine ear is open, and my heart prepared: The worst is worldly loss thou canst unfold: Say, is my kingdom lost?" SHAKESPEARE. It was a feature peculiar to the colonial wars of North America, that the toils and dangers of the wilderness were to be encountered before the adverse hosts could meet. A wide and apparently an impervious boundary of forests severed the possessions of the hostile provinces of France and England. The hardy colonist, and the trained European who fought at his side, frequently expended months in struggling against the rapids of the streams, or in effecting the rugged passes of the mountains, in quest of an opportunity to exhibit their courage in a more martial conflict. But, emulating the patience and self-denial of the practised native warriors, they learned to overcome every difficulty; and it would seem that, in time, there was no recess of the woods so dark, nor any secret place so lovely, that it might claim exemption from the inroads of those who had pledged their blood to satiate their vengeance, or to uphold the cold and selfish policy of the distant monarchs of Europe. Perhaps no district throughout the wide extent of the intermediate frontiers can furnish a livelier picture of the cruelty and fierceness of the savage warfare of those periods than the country which lies between the head waters of the Hudson and the adjacent lakes. The facilities which nature had there offered to the march of the combatants were too obvious to be neglected. The lengthened sheet of the Champlain stretched from the frontiers of Canada, deep within the borders of the neighboring province of New York, forming a natural passage across half the distance that the French were compelled to master in order to strike their enemies. Near its southern termination, it received the contributions of another lake, whose waters were so limpid as to have been exclusively selected by the Jesuit missionaries to perform the typical purification of baptism, and to obtain for it the title of lake "du Saint Sacrement." The less zealous English thought they conferred a sufficient honor on its unsullied fountains, when they bestowed the name of their reigning prince, the second of the house of Hanover. The two united to rob the untutored possessors of its wooded scenery of their native right to perpetuate its original appellation of "Horican."[1] Winding its way among countless islands, and imbedded in mountains, the "holy lake" extended a dozen leagues still farther to the south. With the high plain that there interposed itself to the further passage of the water, commenced a portage of as many miles, which conducted the adventurer to the banks of the Hudson, at a point where, with the usual obstructions of the rapids, or rifts, as they were then termed in the language of the country, the river became navigable to the tide. While, in the pursuit of their daring plans of annoyance, the restless enterprise of the French even attempted the distant and difficult gorges of the Alleghany, it may easily be imagined that their proverbial acuteness would not overlook the natural advantages of the district we have just described. It became, emphatically, the bloody arena, in which most of the battles for the mastery of the colonies were contested. Forts were erected at the different points that commanded the facilities of the route, and were taken and retaken, razed and rebuilt, as victory alighted on the hostile banners. While the husbandman shrank back from the dangerous passes, within the safer boundaries of the more ancient settlements, armies larger than those that had often disposed of the sceptres of the mother countries, were seen to bury themselves in these forests, whence they rarely returned but in skeleton bands, that were haggard with care, or dejected by defeat. Though the arts of peace were unknown to this fatal region, its forests were alive with men; its shades and glens rang with the sounds of martial music, and the echoes of its mountains threw back the laugh, or repeated the wanton cry, of many a gallant and reckless youth, as he hurried by them, in the noontide of his spirits, to slumber in a long night of forgetfulness. It was in this scene of strife and bloodshed that the incidents we shall attempt to relate occurred, during the third year of the war which England and France last waged for the possession of a country that neither was destined to retain. The imbecility of her military leaders abroad, and the fatal want of energy in her councils at home, had lowered the character of Great Britain from the proud elevation on which it had been placed, by the talents and enterprise of her former warriors and statesmen. No longer dreaded by her enemies, her servants were fast losing the confidence of self-respect. In this mortifying abasement, the colonists, though innocent of her imbecility, and too humble to be the agents of her blunders, were but the natural participators. They had recently seen a chosen army from that country, which, reverencing as a mother, they had blindly believed invincible--an army led by a chief who had been selected from a crowd of trained warriors, for his rare military endowments, disgracefully routed by a handful of French and Indians, and only saved from annihilation by the coolness and spirit of a Virginian boy, whose riper fame has since diffused itself, with the steady influence of moral truth, to the uttermost confines of Christendom.[2] A wide frontier had been laid naked by this unexpected disaster, and more substantial evils were preceded by a thousand fanciful and imaginary dangers. The alarmed colonists believed that the yells of the savages mingled with every fitful gust of wind that issued from the interminable forests of the west. The terrific character of their merciless enemies increased immeasurably the natural horrors of warfare. Numberless recent massacres were still vivid in their recollections; nor was there any ear in the provinces so deaf as not to have drunk in with avidity the narrative of some fearful tale of midnight murder, in which the natives of the forests were the principal and barbarous actors. As the credulous and excited traveller related the hazardous chances of the wilderness, the blood of the timid curdled with terror, and mothers cast anxious glances even at those children which slumbered within the security of the largest towns. In short, the magnifying influence of fear began to set at naught the calculations of reason, and to render those who should have remembered their manhood, the slaves of the basest of passions. Even the most confident and the stoutest hearts began to think the issue of the contest was becoming doubtful; and that abject class was hourly increasing in numbers, who thought they foresaw all the possessions of the English crown in America subdued by their Christian foes, or laid waste by the inroads of their relentless allies. When, therefore, intelligence was received at the fort, which covered the southern termination of the portage between the Hudson and the lakes, that Montcalm had been seen moving up the Champlain, with an army "numerous as the leaves on the trees," its truth was admitted with more of the craven reluctance of fear than with the stern joy that a warrior should feel, in finding an enemy within reach of his blow. The news had been brought, towards the decline of a day in midsummer, by an Indian runner, who also bore an urgent request from Munro, the commander of a work on the shore of the "holy lake," for a speedy and powerful reinforcement. It has already been mentioned that the distance between these two posts was less than five leagues. The rude path, which originally formed their line of communication, had been widened for the passage of wagons; so that the distance which had been travelled by the son of the forest in two hours, might easily be effected by a detachment of troops, with their necessary baggage, between the rising and setting of a summer sun. The loyal servants of the British crown had given to one of these forest fastnesses the name of William Henry, and to the other that of Fort Edward; calling each after a favorite prince of the reigning family. The veteran Scotchman just named held the first, with a regiment of regulars and a few provincials; a force really by far too small to make head against the formidable power that Montcalm was leading to the foot of his earthen mounds. At the latter, however, lay General Webb, who commanded the armies of the king in the northern provinces, with a body of more than five thousand men. By uniting the several detachments of his command, this officer might have arrayed nearly double that number of combatants against the enterprising Frenchman, who had ventured so far from his reinforcements, with an army but little superior in numbers. But under the influence of their degraded fortunes, both officers and men appeared better disposed to await the approach of their formidable antagonists, within their works, than to resist the progress of their march, by emulating the successful example of the French at Fort du Quesne, and striking a blow on their advance. After the first surprise of the intelligence had a little abated, a rumor was spread through the entrenched camp, which stretched along the margin of the Hudson, forming a chain of outworks to the body of the fort itself, that a chosen detachment of fifteen hundred men was to depart, with the dawn, for William Henry, the post at the northern extremity of the portage. That which at first was only rumor, soon became certainty, as orders passed from the quarters of the commander-in-chief to the several corps he had selected for this service, to prepare for their speedy departure. All doubt as to the intention of Webb now vanished, and an hour or two of hurried footsteps and anxious faces succeeded. The novice in the military art flew from point to point, retarding his own preparations by the excess of his violent and somewhat distempered zeal; while the more practised veteran made his arrangements with a deliberation that scorned every appearance of haste; though his sober lineaments and anxious eye sufficiently betrayed that he had no very strong professional relish for the as yet untried and dreaded warfare of the wilderness. At length the sun set in a flood of glory, behind the distant western hills, and as darkness drew its veil around the secluded spot the sounds of preparation diminished; the last light finally disappeared from the log cabin of some officer; the trees cast their deeper shadows over the mounds and the rippling stream, and a silence soon pervaded the camp, as deep as that which reigned in the vast forest by which it was environed. According to the orders of the preceding night, the heavy sleep of the army was broken by the rolling of the warning drums, whose rattling echoes were heard issuing, on the damp morning air, out of every vista of the woods, just as day began to draw the shaggy outlines of some tall pines of the vicinity, on the opening brightness of a soft and cloudless eastern sky. In an instant the whole camp was in motion; the meanest soldier arousing from his lair to witness the departure of his comrades, and to share in the excitement and incidents of the hour. The simple array of the chosen band was soon completed. While the regular and trained hirelings of the king marched with haughtiness to the right of the line, the less pretending colonists took their humbler position on its left, with a docility that long practice had rendered easy. The scouts departed; strong guards preceded and followed the lumbering vehicles that bore the baggage; and before the gray light of the morning was mellowed by the rays of the sun, the main body of the combatants wheeled into column, and left the encampment with a show of high military bearing, that served to drown the slumbering apprehensions of many a novice, who was now about to make his first essay in arms. While in view of their admiring comrades, the same proud front and ordered array was observed, until the notes of their fifes growing fainter in distance, the forest at length appeared to swallow up the living mass which had slowly entered its bosom. The deepest sounds of the retiring and invisible column had ceased to be borne on the breeze to the listeners, and the latest straggler had already disappeared in pursuit; but there still remained the signs of another departure, before a log cabin of unusual size and accommodations, in front of which those sentinels paced their rounds, who were known to guard the person of the English general. At this spot were gathered some half dozen horses, caparisoned in a manner which showed that two, at least, were destined to bear the persons of females, of a rank that it was not usual to meet so far in the wilds of the country. A third wore the trappings and arms of an officer of the staff; while the rest, from the plainness of the housings, and the travelling mails with which they were encumbered, were evidently fitted for the reception of as many menials, who were, seemingly, already awaiting the pleasure of those they served. At a respectful distance from this unusual show were gathered divers groups of curious idlers; some admiring the blood and bone of the high-mettled military charger, and others gazing at the preparations, with dull wonder of vulgar curiosity. There was one man, however, who, by his countenance and actions, formed a marked exception to those who composed the latter class of spectators, being neither idle, nor seemingly very ignorant. The person of this individual was to the last degree ungainly, without being in any particular manner deformed. He had all the bones and joints of other men, without any of their proportions. Erect, his stature surpassed that of his fellows; seated, he appeared reduced within the ordinary limits of the race. The same contrariety in his members seemed to exist throughout the whole man. His head was large; his shoulders narrow; his arms long and dangling; while his hands were small, if not delicate. His legs and thighs were thin, nearly to emaciation, but of extraordinary length; and his knees would have been considered tremendous, had they not been outdone by the broader foundations on which this false superstructure of the blended human orders was so profanely reared. The ill-assorted and injudicious attire of the individual only served to render his awkwardness more conspicuous. A sky-blue coat, with short and broad skirts and low cape, exposed a long thin neck, and longer and thinner legs, to the worst animadversions of the evil disposed. His nether garment was of yellow nankeen, closely fitted to the shape, and tied at his bunches of knees by large knots of white ribbon, a good deal sullied by use. Clouded cotton stockings, and shoes, on one of the latter of which was a plated spur, completed the costume of the lower extremity of this figure, no curve or angle of which was concealed, but, on the other hand, studiously exhibited, through the vanity or simplicity of its owner. From beneath the flap of an enormous pocket of a soiled vest of embossed silk, heavily ornamented with tarnished silver lace, projected an instrument, which, from being seen in such martial company, might have been easily mistaken for some mischievous and unknown implement of war. Small as it was, this uncommon engine had excited the curiosity of most of the Europeans in the camp, though several of the provincials were seen to handle it, not only without fear, but with the utmost familiarity. A large, civil cocked hat, like those worn by clergymen within the last thirty years, surmounted the whole, furnishing dignity to a good-natured and somewhat vacant countenance, that apparently needed such artificial aid, to support the gravity of some high and extraordinary trust. While the common herd stood aloof, in deference to the quarters of Webb, the figure we have described stalked in the centre of the domestics, freely expressing his censures or commendations on the merits of the horses, as by chance they displeased or satisfied his judgment. "This beast, I rather conclude, friend, is not of home raising, but is from foreign lands, or perhaps from the little island itself over the blue water?" he said, in a voice as remarkable for the softness and sweetness of its tones, as was his person for its rare proportions: "I may speak of these things, and be no braggart; for I have been down at both havens; that which is situate at the mouth of Thames, and is named after the capital of Old England, and that which is called 'Haven,' with the addition of the word 'New'; and have seen the snows and brigantines collecting their droves, like the gathering to the ark, being outward bound to the Island of Jamaica, for the purpose of barter and traffic in four-footed animals; but never before have I beheld a beast which verified the true Scripture war-horse like this: 'He paweth in the valley, and rejoiceth in his strength: he goeth on to meet the armed men. He saith among the trumpets, Ha, ha; and he smelleth the battle afar off, the thunder of the captains, and the shouting.' It would seem that the stock of the horse of Israel has descended to our own time; would it not, friend?" Receiving no reply to this extraordinary appeal, which in truth, as it was delivered with the vigor of full and sonorous tones, merited some sort of notice, he who had thus sung forth the language of the Holy Book turned to the silent figure to whom he had unwittingly addressed himself, and found a new and more powerful subject of admiration in the object that encountered his gaze. His eyes fell on the still, upright, and rigid form of the "Indian runner," who had borne to the camp the unwelcome tidings of the preceding evening. Although in a state of perfect repose, and apparently disregarding, with characteristic stoicism, the excitement and bustle around him, there was a sullen fierceness mingled with the quiet of the savage, that was likely to arrest the attention of much more experienced eyes than those which now scanned him, in unconcealed amazement. The native bore both the tomahawk and knife of his tribe; and yet his appearance was not altogether that of a warrior. On the contrary, there was an air of neglect about his person, like that which might have proceeded from great and recent exertion, which he had not yet found leisure to repair. The colors of the war-paint had blended in dark confusion about his fierce countenance, and rendered his swarthy lineaments still more savage and repulsive than if art had attempted an effect which had been thus produced by chance. His eye, alone, which glistened like a fiery star amid lowering clouds, was to be seen in its state of native wildness. For a single instant, his searching and yet wary glance met the wondering look of the other, and then changing its direction, partly in cunning, and partly in disdain, it remained fixed, as if penetrating the distant air. It is impossible to say what unlooked-for remark this short and silent communication, between two such singular men, might have elicited from the white man, had not his active curiosity been again drawn to other objects. A general movement among the domestics, and a low sound of gentle voices, announced the approach of those whose presence alone was wanted to enable the cavalcade to move. The simple admirer of the war-horse instantly fell back to a low, gaunt, switch-tailed mare, that was unconsciously gleaning the faded herbage of the camp nigh by; where, leaning with one elbow on the blanket that concealed an apology for a saddle, he became a spectator of the departure, while a foal was quietly making its morning repast, on the opposite side of the same animal. A young man, in the dress of an officer, conducted to their steeds two females, who, as it was apparent by their dresses, were prepared to encounter the fatigues of a journey in the woods. One, and she was the most juvenile in her appearance, though both were young, permitted glimpses of her dazzling complexion, fair golden hair, and bright blue eyes, to be caught, as she artlessly suffered the morning air to blow aside the green veil which descended low from her beaver. The flush which still lingered above the pines in the western sky was not more bright nor delicate than the bloom on her cheek; nor was the opening day more cheering than the animated smile which she bestowed on the youth, as he assisted her into the saddle. The other, who appeared to share equally in the attentions of the young officer, concealed her charms from the gaze of the soldiery, with a care that seemed better fitted to the experience of four or five additional years. It could be seen, however, that her person, though moulded with the same exquisite proportions, of which none of the graces were lost by the travelling dress she wore, was rather fuller and more mature than that of her companion. No sooner were these females seated, than their attendant sprang lightly into the saddle of the war-horse, when the whole three bowed to Webb, who, in courtesy, awaited their parting on the threshold of his cabin, and turning their horses' heads, they proceeded at a slow amble, followed by their train, towards the northern entrance of the encampment. As they traversed that short distance, not a voice was heard amongst them; but a slight exclamation proceeded from the younger of the females, as the Indian runner glided by her, unexpectedly, and led the way along the military road in her front. Though this sudden and startling movement of the Indian produced no sound from the other, in the surprise her veil also was allowed to open its folds, and betrayed an indescribable look of pity, admiration, and horror, as her dark eye followed the easy motions of the savage. The tresses of this lady were shining and black, like the plumage of the raven. Her complexion was not brown, but it rather appeared charged with the color of the rich blood, that seemed ready to burst its bounds. And yet there was neither coarseness nor want of shadowing in a countenance that was exquisitely regular and dignified, and surpassingly beautiful. She smiled, as if in pity at her own momentary forgetfulness, discovering by the act a row of teeth that would have shamed the purest ivory; when, replacing the veil, she bowed her face, and rode in silence, like one whose thoughts were abstracted from the scene around her. "Sola, sola, wo, ha, ho, sola!" SHAKESPEARE. While one of the lovely beings we have so cursorily presented to the reader was thus lost in thought, the other quickly recovered from the alarm which induced the exclamation, and, laughing at her own weakness, she inquired of the youth who rode by her side,-- "Are such spectres frequent in the woods, Heyward; or is this sight an especial entertainment on our behalf? If the latter, gratitude must close our mouths; but if the former, both Cora and I shall have need to draw largely on that stock of hereditary courage which we boast, even before we are made to encounter the redoubtable Montcalm." "Yon Indian is a 'runner' of the army; and, after the fashion of his people, he may be accounted a hero," returned the officer. "He has volunteered to guide us to the lake, by a path but little known, sooner than if we followed the tardy movements of the column: and, by consequence, more agreeably." "I like him not," said the lady, shuddering, partly in assumed, yet more in real terror. "You know him, Duncan, or you would not trust yourself so freely to his keeping?" "Say, rather, Alice, that I would not trust you. I do know him, or he would not have my confidence, and least of all at this moment. He is said to be a Canadian, too; and yet he served with our friends the Mohawks, who, as you know, are one of the six allied nations.[3] He was brought among us, as I have heard, by some strange accident in which your father was interested, and in which the savage was rigidly dealt by--but I forget the idle tale; it is enough, that he is now our friend." "If he has been my father's enemy, I like him still less!" exclaimed the now really anxious girl. "Will you not speak to him, Major Heyward, that I may hear his tones? Foolish though it may be, you have often heard me avow my faith in the tones of the human voice!" "It would be in vain; and answered, most probably, by an ejaculation. Though he may understand it, he affects, like most of his people, to be ignorant of the English; and least of all will he condescend to speak it, now that war demands the utmost exercise of his dignity. But he stops; the private path by which we are to journey is, doubtless, at hand." The conjecture of Major Heyward was true. When they reached the spot where the Indian stood, pointing into the thicket that fringed the military road, a narrow and blind path, which might, with some little inconvenience, receive one person at a time, became visible. "Here, then, lies our way," said the young man, in a low voice. "Manifest no distrust, or you may invite the danger you appear to apprehend." "Cora, what think you?" asked the reluctant fair one. "If we journey with the troops, though we may find their presence irksome, shall we not feel better assurance of our safety?" "Being little accustomed to the practices of the savages, Alice, you mistake the place of real danger," said Heyward. "If enemies have reached the portage at all, a thing by no means probable, as our scouts are abroad, they will surely be found skirting the column where scalps abound the most. The route of the detachment is known, while ours, having been determined within the hour, must still be secret." "Should we distrust the man because his manners are not our manners, and that his skin is dark?" coldly asked Cora. Alice hesitated no longer; but giving her Narragansett[4] a smart cut of the whip, she was the first to dash aside the slight branches of the bushes, and to follow the runner along the dark and tangled pathway. The young man regarded the last speaker in open admiration, and even permitted her fairer though certainly not more beautiful companion to proceed unattended, while he sedulously opened the way himself for the passage of her who has been called Cora. It would seem that the domestics had been previously instructed; for, instead of penetrating the thicket, they followed the route of the column; a measure which Heyward stated had been dictated by the sagacity of their guide, in order to diminish the marks of their trail, if, haply, the Canadian savages should be lurking so far in advance of their army. For many minutes the intricacy of the route admitted of no further dialogue; after which they emerged from the broad border of underbrush which grew along the line of the highway, and entered under the high but dark arches of the forest. Here their progress was less interrupted, and the instant the guide perceived that the females could command their steeds, he moved on, at a pace between a trot and a walk, and at a rate which kept the sure-footed and peculiar animals they rode, at a fast yet easy amble. The youth had turned to speak to the dark-eyed Cora, when the distant sound of horses' hoofs, clattering over the roots of the broken way in his rear, caused him to check his charger; and, as his companions drew their reins at the same instant, the whole party came to a halt, in order to obtain an explanation of the unlooked-for interruption. In a few moments a colt was seen gliding, like a fallow-deer, among the straight trunks of the pines; and, in another instant, the person of the ungainly man described in the preceding chapter, came into view, with as much rapidity as he could excite his meagre beast to endure without coming to an open rupture. Until now this personage had escaped the observation of the travellers. If he possessed the power to arrest any wandering eye when exhibiting the glories of his altitude on foot, his equestrian graces were still more likely to attract attention. Notwithstanding a constant application of his one armed heel to the flanks of the mare, the most confirmed gait that he could establish was a Canterbury gallop with the hind legs, in which those more forward assisted for doubtful moments, though generally content to maintain a loping trot. Perhaps the rapidity of the changes from one of these paces to the other created an optical illusion, which might thus magnify the powers of the beast; for it is certain that Heyward, who possessed a true eye for the merits of a horse, was unable, with his utmost ingenuity, to decide by what sort of movement his pursuer worked his sinuous way on his footsteps with such persevering hardihood. The industry and movements of the rider were not less remarkable than those of the ridden. At each change in the evolutions of the latter, the former raised his tall person in the stirrups; producing, in this manner, by the undue elongation of his legs, such sudden growths and diminishings of the stature, as baffled every conjecture that might be made as to his dimensions. If to this be added the fact that, in consequence of the ex parte application of the spur, one side of the mare appeared to journey faster than the other; and that the aggrieved flank was resolutely indicated by unremitted flourishes of a bushy tail, we finish the picture of both horse and man. The frown which had gathered around the handsome, open, and manly brow of Heyward, gradually relaxed, and his lips curled into a slight smile, as he regarded the stranger. Alice made no very powerful effort to control her merriment; and even the dark, thoughtful eye of Cora lighted with a humor that, it would seem, the habit, rather than the nature of its mistress repressed. "Seek you any here?" demanded Heyward, when the other had arrived sufficiently nigh to abate his speed; "I trust you are no messenger of evil tidings?" "Even so," replied the stranger, making diligent use of his triangular castor, to produce a circulation in the close air of the woods, and leaving his hearers in doubt to which of the young man's questions he responded; when, however, he had cooled his face, and recovered his breath, he continued, "I hear you are riding to William Henry; as I am journeying thitherward myself, I concluded good company would seem consistent to the wishes of both parties." "You appear to possess the privilege of a casting vote," returned Heyward; "we are three, whilst you have consulted no one but yourself." "Even so. The first point to be obtained is to know one's own mind. Once sure of that, and where women are concerned, it is not easy, the next is, to act up to the decision. I have endeavored to do both, and here I am." "If you journey to the lake, you have mistaken your route," said Heyward, haughtily; "the highway thither is at least half a mile behind you." "Even so," returned the stranger, nothing daunted by this cold reception; "I have tarried at 'Edward' a week, and I should be dumb not to have inquired the road I was to journey; and if dumb there would be an end to my calling." After simpering in a small way, like one whose modesty prohibited a more open expression of his admiration of a witticism that was perfectly unintelligible to his hearers, he continued: "It is not prudent for any one of my profession to be too familiar with those he is to instruct; for which reason I follow not the line of the army; besides which, I conclude that a gentleman of your character has the best judgment in matters of wayfaring; I have therefore decided to join company, in order that the ride may be made agreeable, and partake of social communion." "A most arbitrary, if not a hasty decision!" exclaimed Heyward, undecided whether to give vent to his growing anger, or to laugh in the other's face. "But you speak of instruction, and of a profession; are you an adjunct to the provincial corps, as a master of the noble science of defence and offence; or, perhaps, you are one who draws lines and angles, under the pretence of expounding the mathematics?" The stranger regarded his interrogator a moment, in wonder; and then, losing every mark of self-satisfaction in an expression of solemn humility, he answered:-- "Of offence, I hope there is none, to either party: of defence, I make none--by God's good mercy, having committed no palpable sin since last entreating his pardoning grace. I understand not your allusions about lines and angles; and I leave expounding to those who have been called and set apart for that holy office. I lay claim to no higher gift than a small insight into the glorious art of petitioning and thanksgiving, as practised in psalmody." "The man is, most manifestly, a disciple of Apollo," cried the amused Alice, "and I take him under my own especial protection. Nay, throw aside that frown, Heyward, and in pity to my longing ears, suffer him to journey in our train. Besides," she added, in a low and hurried voice, casting a glance at the distant Cora, who slowly followed the footsteps of their silent but sullen guide, "it may be a friend added to our strength, in time of need." "Think you, Alice, that I would trust those I love by this secret path, did I imagine such need could happen?" "Nay, nay, I think not of it now; but this strange man amuses me; and if he 'hath music in his soul,' let us not churlishly reject his company." She pointed persuasively along the path with her riding-whip, while their eyes met in a look which the young man lingered a moment to prolong; then yielding to her gentle influence, he clapped his spurs into his charger, and in a few bounds was again at the side of Cora. "I am glad to encounter thee, friend," continued the maiden, waving her hand to the stranger to proceed, as she urged her Narragansett to renew its amble. "Partial relatives have almost persuaded me that I am not entirely worthless in a duet myself; and we may enliven our wayfaring by indulging in our favorite pursuit. It might be of signal advantage to one, ignorant as I, to hear the opinions and experience of a master in the art." "It is refreshing both to the spirits and to the body to indulge in psalmody, in befitting seasons," returned the master of song, unhesitatingly complying with her intimation to follow; "and nothing would relieve the mind more than such a consoling communion. But four parts are altogether necessary to the perfection of melody. You have all the manifestations of a soft and rich treble; I can, by especial aid, carry a full tenor to the highest letter; but we lack counter and bass! Yon officer of the king, who hesitated to admit me to his company, might fill the latter, if one may judge from the intonations of his voice in common dialogue." "Judge not too rashly from hasty and deceptive appearances," said the lady, smiling; "though Major Heyward can assume such deep notes on occasion, believe me, his natural tones are better fitted for a mellow tenor than the bass you heard." "Is he, then, much practised in the art of psalmody?" demanded her simple companion. Alice felt disposed to laugh, though she succeeded in suppressing her merriment, ere she answered,-- "I apprehend that he is rather addicted to profane song. The chances of a soldier's life are but little fitted for the encouragement of more sober inclinations." "Man's voice is given to him, like his other talents, to be used, and not to be abused. None can say they have ever known me neglect my gifts! I am thankful that, though my boyhood may be said to have been set apart, like the youth of the royal David, for the purposes of music, no syllable of rude verse has ever profaned my lips." "You have, then, limited your efforts to sacred song?" "Even so. As the psalms of David exceed all other language, so does the psalmody that has been fitted to them by the divines and sages of the land, surpass all vain poetry. Happily, I may say that I utter nothing but the thoughts and the wishes of the King of Israel himself; for though the times may call for some slight changes, yet does this version which we use in the colonies of New England, so much exceed all other versions, that, by its richness, its exactness, and its spiritual simplicity, it approacheth, as near as may be, to the great work of the inspired writer. I never abide in any place, sleeping or waking, without an example of this gifted work. 'Tis the six-and-twentieth edition, promulgated at Boston, Anno Domini 1744; and is entitled, _The Psalms, Hymns, and Spiritual Songs of the Old and New Testaments; faithfully translated into English Metre, for the Use, Edification, and Comfort of the Saints, in Public and Private, especially in New England_." During this eulogium on the rare production of his native poets, the stranger had drawn the book from his pocket, and, fitting a pair of iron-rimmed spectacles to his nose, opened the volume with a care and veneration suited to its sacred purposes. Then, without circumlocution or apology, first pronouncing the word "Standish," and placing the unknown engine, already described, to his mouth, from which he drew a high, shrill sound, that was followed by an octave below, from his own voice, he commenced singing the following words, in full, sweet, and melodious tones, that set the music, the poetry, and even the uneasy motion of his ill-trained beast at defiance:-- "How good it is, O see, And how it pleaseth well, Together, e'en in unity, For brethren so to dwell. It's like the choice ointment, From the head to the beard did go: Down Aaron's beard, that downward went, His garment's skirts unto." The delivery of these skilful rhymes was accompanied, on the part of the stranger, by a regular rise and fall of his right hand, which terminated at the descent, by suffering the fingers to dwell a moment on the leaves of the little volume; and on the ascent, by such a flourish of the member as none but the initiated may ever hope to imitate. It would seem that long practice had rendered this manual accompaniment necessary; for it did not cease until the preposition which the poet had selected for the close of his verse, had been duly delivered like a word of two syllables. Such an innovation on the silence and retirement of the forest could not fail to enlist the ears of those who journeyed at so short a distance in advance. The Indian muttered a few words in broken English to Heyward, who, in his turn, spoke to the stranger; at once interrupting, and, for the time, closing his musical efforts. "Though we are not in danger, common prudence would teach us to journey through this wilderness in as quiet a manner as possible. You will, then, pardon me, Alice, should I diminish your enjoyments, by requesting this gentleman to postpone his chant until a safer opportunity." "You will diminish them, indeed," returned the arch girl, "for never did I hear a more unworthy conjunction of execution and language, than that to which I have been listening; and I was far gone in a learned inquiry into the causes of such an unfitness between sound and sense, when you broke the charm of my musings by that bass of yours, Duncan!" "I know not what you call my bass," said Heyward, piqued at her remark, "but I know that your safety, and that of Cora, is far dearer to me than could be any orchestra of Handel's music." He paused and turned his head quickly towards a thicket, and then bent his eyes suspiciously on their guide, who continued his steady pace, in undisturbed gravity. The young man smiled to himself, for he believed he had mistaken some shining berry of the woods for the glistening eyeballs of a prowling savage, and he rode forward, continuing the conversation which had been interrupted by the passing thought. Major Heyward was mistaken only in suffering his youthful and generous pride to suppress his active watchfulness. The cavalcade had not long passed, before the branches of the bushes that formed the thicket were cautiously moved asunder, and a human visage, as fiercely wild as savage art and unbridled passions could make it, peered out on the retiring footsteps of the travellers. A gleam of exultation shot across the darkly painted lineaments of the inhabitant of the forest, as he traced the route of his intended victims, who rode unconsciously onward; the light and graceful forms of the females waving among the trees, in the curvatures of their path, followed at each bend by the manly figure of Heyward, until, finally, the shapeless person of the singing-master was concealed behind the numberless trunks of trees, that rose, in dark lines, in the intermediate space. Q: Can you write an appropriate summary of the above paragraphs? A: Before any characters appear, the time and geography are made clear. Though it is the last war that England and France waged for a country that neither would retain, the wilderness between the forces still has to be overcome first. Thus it is in 1757, in the New York area between the head waters of the Hudson River and Lake George to the north. Because only two years earlier General Braddock was disgracefully routed by a handful of French and Indians, the frontier is now exposed to real and imaginary savage disasters as well as to the horrors of warfare. Fear has replaced reason. Near dusk of a day in July, an Indian runner named Magua arrives at Fort Edward on the upper Hudson. He has come from Fort William Henry at the southern tip of Lake George with the news that the French General Montcalm is moving south with a very large army and that Munro, commander of Fort William Henry, is in urgent need of plentiful reinforcements from General Webb. Early the next morning, a limited detachment of fifteen hundred regulars and colonists departs as if swallowed by the forest. Shortly afterwards, Major Duncan Heyward and Alice and Cora Munro, guided by Magua on foot, take by horseback a secret route toward William Henry for the girls to join their father. Blonde Alice is doubtful about Magua, covered with war paint and showing a sullen fierceness; but dark-haired Cora is stoically common sense about him, even though Heyward mentions that their father had once had to deal rigidly with the Indian. As the small party pushes on, they are overtaken by David Gamut, a tall, ungainly psalmodist ridiculously dressed and carrying a pitch pipe while riding a mare followed by its young colt. He desires to join them, and after some banter between him and Alice, he pulls out the twenty-sixth edition of The Bay Psalm Book, sounds his pipe, and renders a song "in full, sweet, and melodious tones." At a muttered comment from Magua, Heyward insists upon silence for safety. Then he glances about them and, satisfied that he has seen only shining berries, smiles to himself as they move on. But he is wrong. The branches move and a man peers exultingly after them as they disappear among the dark lines of trees. -------------------------------------------------------------------------------- /easykv/easykv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Tuple 3 | import math 4 | import statistics 5 | from functools import partial 6 | from .utils import modify_method_of_instance 7 | from .llama_patch import llama_forward, llama_forward_stream 8 | from .mistral_patch import mistral_forward, mistral_forward_stream 9 | 10 | def cache_size(kv_cache): 11 | """ 12 | the amount of memory(MB) 13 | """ 14 | cnt = 0 15 | for i in range(len(kv_cache)): 16 | for tensor in kv_cache[i]: 17 | cnt += tensor.numel() 18 | return cnt*2/(1024**2) 19 | 20 | def gpu_stats(): 21 | torch.cuda.empty_cache() 22 | memory_stats = torch.cuda.memory_stats() 23 | print("Current GPU memory usage:", round(memory_stats["allocated_bytes.all.current"]/(1024**3), 3), "GB") 24 | print("Peak GPU memory usage:", round(memory_stats["allocated_bytes.all.peak"]/(1024**3), 3), "GB") 25 | # print("Reserved GPU memory:", round(memory_stats["reserved_bytes.all.allocated"]/(1024**3), 3), "GB") 26 | 27 | 28 | # ANSI code for different colors 29 | class Color: 30 | RESET = '\033[0m' 31 | RED = '\033[91m' 32 | GREEN = '\033[92m' 33 | YELLOW = '\033[93m' 34 | BLUE = '\033[94m' 35 | PURPLE = '\033[95m' 36 | CYAN = '\033[96m' 37 | 38 | @staticmethod 39 | def print(content, color: str): 40 | print(f"{getattr(Color, color.upper())}{content}{Color.RESET}") 41 | 42 | 43 | def relu_normalize(p, q): 44 | """ 45 | Construct the modified sampling distribution 46 | """ 47 | tmp_dist = torch.relu(p-q) 48 | return tmp_dist / tmp_dist.sum(dim=-1, keepdim=True) 49 | 50 | def entropy(p): 51 | """ 52 | Shano entropy of a distribution 53 | """ 54 | return -torch.sum(p*p.log(), dim=-1) 55 | 56 | def truncate_kv_cache_silo(kv_cache, eviction_ids): 57 | kv_cache = list(kv_cache) 58 | for i in range(len(kv_cache)): 59 | kv_cache[i] = list(kv_cache[i]) 60 | l = kv_cache[0][0].shape[2] 61 | head_dim = kv_cache[0][0].shape[-1] 62 | num_heads = kv_cache[0][0].shape[1] 63 | for i in range(len(eviction_ids)): 64 | _index = torch.arange(l, device=kv_cache[0][0].device).unsqueeze(0).repeat(len(eviction_ids[i]), 1) # (num_heads, l) 65 | mask = (_index != torch.tensor(eviction_ids[i], device=kv_cache[0][0].device).unsqueeze(-1)) # (num_heads, l) 66 | kv_cache[i][0] = kv_cache[i][0][0][mask, ...].view(1, num_heads, -1, head_dim) 67 | kv_cache[i][1] = kv_cache[i][1][0][mask, ...].view(1, num_heads, -1, head_dim) 68 | return kv_cache 69 | 70 | def truncate_kv_cache_liso(kv_cache, eviction_ids): 71 | kv_cache = list(kv_cache) 72 | for i in range(len(kv_cache)): 73 | kv_cache[i] = list(kv_cache[i]) 74 | l = kv_cache[0][0].shape[2] 75 | head_dim = kv_cache[0][0].shape[-1] 76 | num_heads = kv_cache[0][0].shape[1] 77 | for i in range(eviction_ids.shape[0]): 78 | src_ = torch.zeros(num_heads, eviction_ids.shape[-1]).to(kv_cache[0][0].device) 79 | mask = torch.ones(num_heads, l, device=kv_cache[0][0].device).scatter(dim=-1, index=eviction_ids[i], src=src_).bool() 80 | kv_cache[i][0] = kv_cache[i][0][0][mask, ...].view(1, num_heads, -1, head_dim) 81 | kv_cache[i][1] = kv_cache[i][1][0][mask, ...].view(1, num_heads, -1, head_dim) 82 | return kv_cache 83 | 84 | def truncate_kv_cache_liso_mean(kv_cache, eviction_ids): 85 | """ 86 | eviction_ids: (num_layers, num_heads, k+1) 87 | """ 88 | kv_cache = list(kv_cache) 89 | for i in range(len(kv_cache)): 90 | kv_cache[i] = list(kv_cache[i]) 91 | l = kv_cache[0][0].shape[2] 92 | head_dim = kv_cache[0][0].shape[-1] 93 | num_heads = kv_cache[0][0].shape[1] 94 | for i in range(eviction_ids.shape[0]): 95 | src_ = torch.zeros(num_heads, eviction_ids.shape[-1]).to(kv_cache[0][0].device) 96 | mask = torch.ones(num_heads, l, device=kv_cache[0][0].device).scatter(dim=-1, index=eviction_ids[i], src=src_).bool() 97 | evicted_mask = ~mask 98 | key_evicted_mean = torch.mean(kv_cache[i][0][0][evicted_mask, ...].view(1, num_heads, -1, head_dim), dim=2, keepdim=True) 99 | value_evicted_mean = torch.mean(kv_cache[i][1][0][evicted_mask, ...].view(1, num_heads, -1, head_dim), dim=2, keepdim=True) 100 | kv_cache[i][0] = torch.cat((kv_cache[i][0][0][mask, ...].view(1, num_heads, -1, head_dim), key_evicted_mean), dim=2) 101 | kv_cache[i][1] = torch.cat((kv_cache[i][1][0][mask, ...].view(1, num_heads, -1, head_dim), value_evicted_mean), dim=2) 102 | return kv_cache 103 | 104 | 105 | def truncate_kv_cache(kv_cache: Tuple, start, end): 106 | remain_id = torch.tensor(list(sorted(list(set(list(range(kv_cache[0][0].shape[2]))).difference(set(list(range(start, end))))))), device=kv_cache[0][0].device) 107 | kv_cache = list(kv_cache) 108 | for i in range(len(kv_cache)): 109 | kv_cache[i] = list(kv_cache[i]) 110 | kv_cache[i][0] = kv_cache[i][0][:, :, remain_id, :] 111 | kv_cache[i][1] = kv_cache[i][1][:, :, remain_id, :] 112 | return kv_cache 113 | 114 | 115 | def logits_adapter(logits: torch.Tensor, temperature: float, top_p: float): 116 | """ 117 | Apply given transformation to the input logits, including temperature scaling and top_p renormalization 118 | """ 119 | flag = False 120 | if logits.ndim==3: 121 | bsz = logits.shape[0] 122 | l = logits.shape[1] 123 | logits = logits.view(-1, logits.shape[-1]) 124 | flag = True 125 | prob = torch.softmax(logits / temperature, dim=-1) 126 | sorted_prob, sorted_prob_idx = torch.sort(prob, descending=True, dim=-1) 127 | cumsum = torch.cumsum(sorted_prob, dim=-1) 128 | mask = (cumsum - sorted_prob) > top_p 129 | sorted_prob[mask] = 0.0 130 | sorted_prob.div_(sorted_prob.sum(dim=-1, keepdim=True)) 131 | _, gather_pos = torch.sort(sorted_prob_idx, descending=False, dim=-1) 132 | final_prob = torch.gather(sorted_prob, -1, gather_pos) 133 | if flag: final_prob = final_prob.view(bsz, l, -1) 134 | return final_prob, torch.softmax(logits, dim=-1) 135 | 136 | 137 | def h2o_head_decay_score(attention_map, decay_factor, device, stride): 138 | num_heads = attention_map[0].shape[1] 139 | num_layers = len(attention_map) 140 | budget = attention_map[0].shape[-1] 141 | cache_attn_scores = torch.tensor([[[0.0]*(budget+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=device) 142 | decay_tensor = torch.tensor([decay_factor**power for power in range(budget)], device=device).flip(dims=(0,)).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (1, budget, budget) 143 | for l in range(num_layers): 144 | cache_attn_scores[l, :, :-stride] = torch.sum(attention_map[l][0] * decay_tensor, dim=1) * (1.0 - decay_factor) 145 | return cache_attn_scores 146 | 147 | def h2o_head_decay_prob_score(attention_map, decay_factor, device, probs): 148 | num_heads = attention_map[0].shape[1] 149 | num_layers = len(attention_map) 150 | budget = attention_map[0].shape[-1] 151 | cache_attn_scores = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=device) 152 | decay_tensor = torch.tensor([decay_factor**power for power in range(budget)], device=device).flip(dims=(0,)).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (1, budget, budget) 153 | probs = torch.tensor(probs, device=device).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (num_heads, budgett, budget) 154 | for l in range(num_layers): 155 | cache_attn_scores[l, :, :-1] = torch.sum(attention_map[l][0] * decay_tensor * probs, dim=1) * (1.0 - decay_factor) 156 | return cache_attn_scores 157 | 158 | def h2o_head_prob_score(attention_map, device, probs, mode:str='v1'): 159 | num_heads = attention_map[0].shape[1] 160 | num_layers = len(attention_map) 161 | budget = attention_map[0].shape[-1] 162 | cache_attn_scores = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=device) 163 | cache_attn_scores_square = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=device) 164 | if mode == 'v1': 165 | probs = 1.0-torch.tensor(probs, device=device).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (num_heads, budgett, budget) 166 | elif mode == 'v2': 167 | probs = torch.tensor(probs, device=device).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (num_heads, budgett, budget) 168 | for l in range(num_layers): 169 | cache_attn_scores[l, :, :-1] = torch.sum(attention_map[l][0] * probs, dim=1) 170 | cache_attn_scores_square[l, :, :-1] = torch.sum((attention_map[l][0] * probs)**2, dim=1) 171 | return cache_attn_scores, cache_attn_scores_square 172 | 173 | def h2o_head_score(attention_map, device, stride, budget, num_layers, num_heads, empty=False): 174 | # if attention_map is not None: 175 | # attention_map = list(attention_map) 176 | # num_layers = len(attention_map) 177 | # budget = attention_map[0].shape[-1] 178 | cache_attn_scores = torch.tensor([[[0.0]*(budget+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=device) 179 | cache_attn_scores_square = torch.tensor([[[0.0]*(budget+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=device) 180 | if not empty: 181 | for l in range(num_layers): 182 | attention_map[l] = attention_map[l].to('cuda') 183 | cache_attn_scores[l, :, :attention_map[l].shape[-1]] = torch.sum(attention_map[l][0], dim=1) 184 | cache_attn_scores_square[l, :, :attention_map[l].shape[-1]] = torch.sum(attention_map[l][0]**2, dim=1) 185 | attention_map[l] = None 186 | return cache_attn_scores, cache_attn_scores_square 187 | 188 | def process_for_mqa_gqa(attentions, num_layers, num_heads, rep_n): 189 | # Unified processing for MQA, GQA and MHA 190 | attentions = list(attentions) 191 | for l in range(num_layers): 192 | bs = attentions[l].shape[0] 193 | sl = attentions[l].shape[2] 194 | tl = attentions[l].shape[3] 195 | attentions[l] = attentions[l].reshape(bs, num_heads, rep_n, sl, tl).mean(dim=2) # (bs, num_kv_heads, sl, tl) 196 | return attentions 197 | 198 | 199 | @torch.inference_mode() 200 | def generate(self, input_ids, generation_config, kv_mode='encoding', stride=1, report_decoding_latency: bool=False): 201 | temperature = generation_config.get('temperature', 1.0) 202 | top_p = generation_config.get('top_p', 1.0) 203 | max_new_tokens = generation_config.get('max_new_tokens', 1024) 204 | budget = generation_config.get('budget', 0.5) 205 | mode = generation_config.get('kv_policy', 'recency') 206 | temp_length = generation_config.get('temp_length', 4) 207 | recent_ratio = generation_config.get('recent_ratio', 0.1) 208 | keep_attention = generation_config.get('keep_attention', False) 209 | eos_token_ids = generation_config.get('eos_token_ids', [self.tokenizer.eos_token_id]) 210 | streaming = generation_config.get('streaming', False) 211 | num_layers = self.config.num_hidden_layers 212 | if not hasattr(self.config, "num_key_value_heads"): num_heads = self.config.num_attention_heads 213 | else: num_heads = self.config.num_key_value_heads 214 | tokenizer = self.tokenizer 215 | # Handle MQA and GQA 216 | is_gqa = hasattr(self.config, "num_key_value_heads") and getattr(self.config, "num_key_value_heads") != getattr(self.config, "num_attention_heads") 217 | if is_gqa: rep_n = self.config.num_attention_heads // self.config.num_key_value_heads 218 | else: rep_n = 1 219 | length = input_ids.shape[-1] 220 | if kv_mode == 'auto': 221 | length = input_ids.shape[-1] 222 | assert type(budget) == int 223 | if budget > length: 224 | kv_mode = 'decoding' 225 | budget -= length 226 | else: 227 | kv_mode = 'encoding_decoding' 228 | if kv_mode == 'decoding': 229 | """ 230 | auto-regressive decoding 231 | """ 232 | outputs_prefilling = self(input_ids=input_ids, use_cache=True) 233 | prefix_token_lst = input_ids[0].cpu().numpy().tolist() 234 | past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits 235 | logits_prev_step = logits[:, -1, :] 236 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 237 | 238 | cache_tokens = [] 239 | cache_probs = [] 240 | cache_cur_probs = [] 241 | cache_positions = [] 242 | cache_attn_scores = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) 243 | cache_attn_scores_square = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) 244 | cache_counter = torch.tensor([[[1.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) 245 | cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - 1.0 246 | cache_counter_token = torch.tensor([1.0]*(budget+1), device=self.device) 247 | cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - 1.0 248 | n = 0 249 | output_ids = [] 250 | token_probs = [] 251 | cur_pos_id = past_key_values[0][0].shape[2] 252 | evicted_positions = [] 253 | if 'llama' in self.config.architectures[0].lower(): 254 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 255 | else: 256 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 257 | while n < max_new_tokens: 258 | next_token = torch.multinomial(prob_prev_step, num_samples=1) 259 | output_ids.append(next_token[0, 0].cpu().item()) 260 | next_token_prob = torch.gather(raw_prob_prev_step, -1, next_token) # (bsz, 1) 261 | token_probs.append((tokenizer.convert_ids_to_tokens([output_ids[-1]])[0], next_token_prob[0, 0].cpu().item())) 262 | n += 1 263 | if output_ids[-1] in eos_token_ids: break 264 | outputs = self(input_ids=next_token, 265 | past_key_values=past_key_values, 266 | attention_mask=torch.ones(next_token.shape[0], 1+past_key_values[0][0].shape[2], dtype=torch.long, device=next_token.device), 267 | position_ids=torch.LongTensor([cur_pos_id]).to(self.device).view(-1, 1), 268 | use_cache=True, 269 | output_attentions=True) 270 | # unified processing for GQA and MHA 271 | outputs.attentions = list(outputs.attentions) 272 | for l in range(num_layers): 273 | bs = outputs.attentions[l].shape[0] 274 | sl = outputs.attentions[l].shape[2] 275 | tl = outputs.attentions[l].shape[3] 276 | outputs.attentions[l] = outputs.attentions[l].reshape(bs, num_heads, rep_n, sl, tl).mean(dim=2) # (bs, num_kv_heads, sl, tl) 277 | past_key_values = outputs.past_key_values 278 | logits_prev_step = outputs.logits[:, -1, :] 279 | cache_cur_probs.append(torch.exp(-entropy(raw_prob_prev_step))[0].cpu().item()) 280 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 281 | 282 | # update 283 | cache_probs.append(next_token_prob[0,0].cpu().item()) 284 | cache_tokens.append(output_ids[-1]) 285 | cache_positions.append(cur_pos_id) 286 | 287 | # update accumulated attention scores 288 | if 'h2o_head' == mode: 289 | for l in range(num_layers): 290 | attention_map = outputs.attentions[l][0, :, 0, len(prefix_token_lst):] # (num_heads, l) 291 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 292 | elif 'roco' == mode: 293 | for l in range(num_layers): 294 | attention_map = outputs.attentions[l][0, :, 0, len(prefix_token_lst):] # (num_heads, l) 295 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 296 | cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map ** 2 297 | elif 'tova' == mode: 298 | for l in range(num_layers): 299 | attention_map = outputs.attentions[l][0, :, 0, len(prefix_token_lst):] # (num_heads, l) 300 | cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map 301 | # evict if current kv cache size exceeds the budget 302 | cur_kv_size = past_key_values[0][0].shape[2] 303 | if (cur_kv_size-len(prefix_token_lst)) > budget and mode != 'full': 304 | cache_counter += 1.0 305 | cache_counter_token += 1.0 306 | positions_tensor = torch.tensor(cache_positions, device=self.device).float() 307 | positions_tensor = positions_tensor / float(cur_pos_id) 308 | recent_ratio = 0.3 309 | recent_window = int(budget*recent_ratio) 310 | if mode in ['h2o_head']: 311 | eviction_ids = torch.argmin(cache_attn_scores[:, :, :-recent_window], dim=-1) + len(prefix_token_lst) 312 | _eviction_ids = eviction_ids 313 | eviction_ids = eviction_ids.cpu().numpy().tolist() 314 | past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids) 315 | _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1) 316 | _eviction_ids -= len(prefix_token_lst) 317 | mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1]) 318 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 319 | elif mode in ['roco']: 320 | cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2) 321 | cur_std[:, :, -10:] = 1e9 322 | _, feasible_ids = torch.topk(cur_std, largest=False, k=budget-recent_window, dim=-1) # (layers, heads, k) 323 | argmin_id = torch.argmin(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1).unsqueeze(-1) # (layers, heads) 324 | eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id).squeeze(-1) + len(prefix_token_lst) 325 | _eviction_ids = eviction_ids 326 | eviction_ids = eviction_ids.cpu().numpy().tolist() 327 | past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids) 328 | _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1) 329 | _eviction_ids -= len(prefix_token_lst) 330 | mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1]) 331 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 332 | cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 333 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 334 | elif mode == 'tova': 335 | eviction_ids = torch.argmin(cache_attn_scores, dim=-1) + len(prefix_token_lst) 336 | _eviction_ids = eviction_ids 337 | eviction_ids = eviction_ids.cpu().numpy().tolist() 338 | past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids) 339 | _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1) 340 | _eviction_ids -= len(prefix_token_lst) 341 | mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1]) 342 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 343 | elif mode == 'recency': 344 | scores = 1.0 - positions_tensor 345 | _, evict_id = torch.topk(scores, k=1, dim=-1) 346 | evict_id = evict_id[0].cpu().item() 347 | past_key_values = truncate_kv_cache(past_key_values, start=len(prefix_token_lst)+evict_id, end=len(prefix_token_lst)+evict_id+1) 348 | evicted_positions.append(cache_positions[evict_id]-len(prefix_token_lst)) 349 | cache_probs.pop(evict_id) 350 | cache_tokens.pop(evict_id) 351 | cache_cur_probs.pop(evict_id) 352 | cache_positions.pop(evict_id) 353 | elif mode == 'random': 354 | scores = torch.rand(*positions_tensor.shape).to(self.device) 355 | _, evict_id = torch.topk(scores, k=1, dim=-1) 356 | evict_id = evict_id[0].cpu().item() 357 | past_key_values = truncate_kv_cache(past_key_values, start=len(prefix_token_lst)+evict_id, end=len(prefix_token_lst)+evict_id+1) 358 | evicted_positions.append(cache_positions[evict_id]-len(prefix_token_lst)) 359 | cache_probs.pop(evict_id) 360 | cache_tokens.pop(evict_id) 361 | cache_cur_probs.pop(evict_id) 362 | cache_positions.pop(evict_id) 363 | cur_pos_id += 1 364 | _tmp = past_key_values[0][0].shape[2]-len(prefix_token_lst) 365 | print(f"KV cache budget ratio: {_tmp / len(output_ids) *100:.2f}%({_tmp}/{len(output_ids)})") 366 | return tokenizer.decode(output_ids, skip_special_tokens=True).strip() 367 | elif kv_mode == 'encoding': 368 | """ 369 | prompt encoding/prefilling 370 | """ 371 | length = input_ids.shape[-1] 372 | if type(budget) == float and budget >= 1.0 or type(budget) == int and budget >= length: 373 | outputs_prefilling = self(input_ids=input_ids, use_cache=True) 374 | past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits 375 | logits_prev_step = logits[:, -1, :] 376 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 377 | cur_pos_id = past_key_values[0][0].shape[2] 378 | else: 379 | # In case budget is also large, the attention_map will occupy a lot of memory 380 | # We offload attention_map to CPU first and move it layer by laer to GPU to compute eviction score 381 | if 'llama' in self.config.architectures[0].lower(): 382 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 383 | else: 384 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 385 | if type(budget) == float: 386 | budget = int(length * budget) + stride 387 | elif type(budget) == int: 388 | budget += stride 389 | for idx in range(budget, -1, -1): 390 | if (length-idx)%stride==0: break 391 | for r_idx in range(idx-1, -1, -1): 392 | if (idx-r_idx)%stride==0: break 393 | prefix = input_ids[:, :r_idx] 394 | recent_window = int(budget*recent_ratio) 395 | sink_length = temp_length 396 | outputs_prefilling = self(input_ids=prefix, use_cache=True, output_attentions=keep_attention) 397 | past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits 398 | loss_fct = torch.nn.CrossEntropyLoss(reduction='none') 399 | logits_prev_step = logits[:, -1, :] 400 | _, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 401 | prefix_token_lst = input_ids[0].cpu().numpy().tolist() 402 | cache_tokens = prefix[0].cpu().numpy().tolist() 403 | if keep_attention: 404 | outputs_prefilling.attentions = process_for_mqa_gqa(outputs_prefilling.attentions, num_layers, num_heads, rep_n) 405 | cache_attn_scores, cache_attn_scores_square = h2o_head_score(outputs_prefilling.attentions, self.device, stride, idx, num_layers, num_heads, empty=not keep_attention) 406 | # Back to GPU 407 | if 'llama' in self.config.architectures[0].lower(): 408 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 409 | else: 410 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 411 | 412 | if keep_attention: 413 | cache_counter = torch.tensor([[[1.0]*(idx+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) 414 | cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - float(stride) 415 | else: 416 | cache_counter = torch.tensor([[[float(stride)]*idx+torch.arange(stride, 0, -1).numpy().tolist() for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) - float(stride) 417 | cache_counter_token = torch.tensor([1.0]*(idx+stride), device=self.device) 418 | cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - float(stride) 419 | n = 0 420 | output_ids = [] 421 | token_probs = [] 422 | cur_pos_id = past_key_values[0][0].shape[2] 423 | evicted_positions = [] 424 | log_probs = [] 425 | # for token_i in range(idx, length, stride): 426 | for token_i in range(r_idx, length, stride): 427 | n += stride 428 | outputs = self(input_ids=input_ids[:, token_i:token_i+stride], 429 | past_key_values=past_key_values, 430 | attention_mask=torch.ones(1, stride+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device), 431 | position_ids=torch.LongTensor(list(range(cur_pos_id, cur_pos_id+stride))).to(self.device).view(1, -1), 432 | use_cache=True, 433 | output_attentions=True) 434 | past_key_values = outputs.past_key_values 435 | logits_prev_step = outputs.logits[:, -1, :] 436 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 437 | 438 | # Unified processing for MQA, GQA and MHA 439 | outputs.attentions = process_for_mqa_gqa(outputs.attentions, num_layers, num_heads, rep_n) 440 | 441 | cur_kv_size = past_key_values[0][0].shape[2] 442 | # update accumulated attention scores 443 | if cur_kv_size>idx or keep_attention: 444 | if 'h2o_head' == mode: 445 | for l in range(num_layers): 446 | attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, stride, stride+l) 447 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 448 | elif 'roco' == mode: 449 | for l in range(num_layers): 450 | attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, l) 451 | attention_map_sq = ((outputs.attentions[l][0, :, :, :])**2).sum(dim=1) 452 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 453 | cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map_sq 454 | elif 'tova' == mode: 455 | for l in range(num_layers): 456 | attention_map = outputs.attentions[l][0, :, -1, :].mean(dim=0).unsqueeze(0).repeat(num_heads, 1) # (num_heads, l) 457 | cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map 458 | # evict if current kv cache size exceeds the budget 459 | if mode != 'full' and cur_kv_size>idx: 460 | cache_counter += float(stride) 461 | cache_counter_token += float(stride) 462 | if mode in ['h2o_head']: 463 | eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length 464 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 465 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 466 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 467 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 468 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 469 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1) 470 | elif mode in ['roco']: 471 | cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2) 472 | cur_std[:, :, -10:] = 1e9 473 | cur_std[:, :, :sink_length] = 1e9 474 | _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-recent_window-sink_length, stride), dim=-1) # (layers, heads, k) 475 | argmin_id = torch.topk(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1, largest=False, k=stride)[1] # (layers, heads) 476 | eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id) 477 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 478 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 479 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 480 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 481 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 482 | cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 483 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1) 484 | elif mode == 'tova': 485 | eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length 486 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 487 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 488 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 489 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 490 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 491 | elif mode == 'recency': 492 | evict_id = sink_length 493 | past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride) 494 | elif mode == 'random': 495 | scores = torch.rand(cache_attn_scores.shape[-1]).to(self.device) 496 | scores[-stride:] = -1e9 497 | _, evict_id = torch.topk(scores, k=1, dim=-1) 498 | evict_id = evict_id[0].cpu().item() 499 | past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride) 500 | cur_pos_id += stride 501 | cur_pos_id = input_ids.shape[-1] 502 | _tmp = past_key_values[0][0].shape[2] 503 | print(f"KV cache budget ratio: {_tmp / input_ids.shape[-1]*100:.2f}%({_tmp}/{input_ids.shape[-1]})") 504 | n = 0 505 | output_ids = [] 506 | decoding_times = [] 507 | import time 508 | while n < max_new_tokens: 509 | next_token = torch.multinomial(prob_prev_step, num_samples=1) 510 | output_ids.append(next_token[0, 0].cpu().item()) 511 | next_token_prob = torch.gather(raw_prob_prev_step, -1, next_token) # (bsz, 1) 512 | n += 1 513 | if output_ids[-1] in eos_token_ids: break 514 | s = time.time() 515 | outputs = self(input_ids=next_token, 516 | past_key_values=past_key_values, 517 | attention_mask=torch.ones(next_token.shape[0], 1+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device), 518 | position_ids=torch.LongTensor([cur_pos_id]).to(self.device).view(-1, 1), 519 | use_cache=True) 520 | past_key_values = outputs.past_key_values 521 | logits_prev_step = outputs.logits[:, -1, :] 522 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 523 | e = time.time() 524 | cur_step_time = e-s 525 | decoding_times.append(cur_step_time) 526 | cur_pos_id += 1 527 | decoded_output = tokenizer.decode(output_ids, skip_special_tokens=True).strip() 528 | if report_decoding_latency: print(f"Per-step decoding latency: {statistics.mean(decoding_times[1:]):.3f}") 529 | return decoded_output 530 | elif kv_mode == 'encoding_decoding': 531 | """ 532 | after encoding, budget-1 decoding 533 | """ 534 | length = input_ids.shape[-1] 535 | assert type(budget) == int and budget <= length 536 | white_lst = ['random', 'recency', 'tova', 'roco'] 537 | assert mode in white_lst, f"mode must be within {white_lst}, get {mode} instead" 538 | # In case budget is also large, the attention_map will occupy a lot of memory 539 | # We offload attention_map to CPU first and move it layer by layer to GPU to compute eviction score 540 | if 'llama' in self.config.architectures[0].lower(): 541 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 542 | else: 543 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 544 | if type(budget) == float: 545 | budget = int(length * budget) + stride 546 | elif type(budget) == int: 547 | budget += stride 548 | if budget >= length: budget -= stride 549 | for idx in range(budget, -1, -1): 550 | if (length-idx)%stride==0: break 551 | for r_idx in range(1, idx): 552 | if (idx-r_idx)%stride==0: break 553 | # prefix = input_ids[:, :idx] 554 | prefix = input_ids[:, :r_idx] 555 | recent_window = int(budget*recent_ratio) 556 | sink_length = temp_length 557 | outputs_prefilling = self(input_ids=prefix, use_cache=True, output_attentions=keep_attention) 558 | past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits 559 | loss_fct = torch.nn.CrossEntropyLoss(reduction='none') 560 | logits_prev_step = logits[:, -1, :] 561 | _, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 562 | prefix_token_lst = input_ids[0].cpu().numpy().tolist() 563 | cache_tokens = prefix[0].cpu().numpy().tolist() 564 | if keep_attention: 565 | outputs_prefilling.attentions = process_for_mqa_gqa(outputs_prefilling.attentions, num_layers, num_heads, rep_n) 566 | cache_attn_scores, cache_attn_scores_square = h2o_head_score(outputs_prefilling.attentions, self.device, stride, idx, num_layers, num_heads, empty=not keep_attention) 567 | # Back to GPU 568 | if 'llama' in self.config.architectures[0].lower(): 569 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 570 | else: 571 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 572 | 573 | if keep_attention: 574 | cache_counter = torch.tensor([[[1.0]*(idx+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) 575 | cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - float(stride) 576 | else: 577 | cache_counter = torch.tensor([[[float(stride)]*idx+torch.arange(stride, 0, -1).numpy().tolist() for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) - float(stride) 578 | cache_counter_token = torch.tensor([1.0]*(idx+stride), device=self.device) 579 | cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - float(stride) 580 | n = 0 581 | output_ids = [] 582 | token_probs = [] 583 | cur_pos_id = past_key_values[0][0].shape[2] 584 | evicted_positions = [] 585 | log_probs = [] 586 | # for token_i in range(idx, length, stride): 587 | for token_i in range(r_idx, length, stride): 588 | n += stride 589 | outputs = self(input_ids=input_ids[:, token_i:token_i+stride], 590 | past_key_values=past_key_values, 591 | attention_mask=torch.ones(1, stride+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device), 592 | position_ids=torch.LongTensor(list(range(cur_pos_id, cur_pos_id+stride))).to(self.device).view(1, -1), 593 | use_cache=True, 594 | output_attentions=True) 595 | past_key_values = outputs.past_key_values 596 | logits_prev_step = outputs.logits[:, -1, :] 597 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 598 | 599 | # Unified processing for MQA, GQA and MHA 600 | outputs.attentions = process_for_mqa_gqa(outputs.attentions, num_layers, num_heads, rep_n) 601 | 602 | cur_kv_size = past_key_values[0][0].shape[2] 603 | if cur_kv_size>idx or keep_attention: 604 | # update accumulated attention scores 605 | if 'h2o_head' == mode: 606 | for l in range(num_layers): 607 | attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, stride, stride+l) 608 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 609 | elif 'roco' == mode: 610 | for l in range(num_layers): 611 | attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, l) 612 | attention_map_sq = ((outputs.attentions[l][0, :, :, :])**2).sum(dim=1) 613 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 614 | cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map_sq 615 | elif 'tova' == mode: 616 | for l in range(num_layers): 617 | attention_map = outputs.attentions[l][0, :, -1, :] # (num_heads, l) 618 | cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map 619 | # evict if current kv cache size exceeds the budget 620 | if mode != 'full' and cur_kv_size>idx: 621 | cache_counter += float(stride) 622 | cache_counter_token += float(stride) 623 | if mode in ['h2o_head']: 624 | eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length 625 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 626 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 627 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 628 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 629 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 630 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1) 631 | elif mode in ['roco']: 632 | cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2) 633 | cur_std[:, :, -10:] = 1e9 634 | cur_std[:, :, :sink_length] = 1e9 635 | _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-recent_window-sink_length, stride), dim=-1) # (layers, heads, k) 636 | argmin_id = torch.topk(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1, largest=False, k=stride)[1] # (layers, heads) 637 | eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id) 638 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 639 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 640 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 641 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 642 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 643 | cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 644 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1) 645 | elif mode == 'tova': 646 | eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length 647 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 648 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 649 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 650 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 651 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 652 | elif mode == 'recency': 653 | evict_id = sink_length 654 | past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride) 655 | elif mode == 'random': 656 | scores = torch.rand(cache_attn_scores.shape[-1]).to(self.device) 657 | scores[-stride:] = -1e9 658 | _, evict_id = torch.topk(scores, k=1, dim=-1) 659 | evict_id = evict_id[0].cpu().item() 660 | past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride) 661 | cur_pos_id += stride 662 | cur_pos_id = input_ids.shape[-1] 663 | _tmp = past_key_values[0][0].shape[2] 664 | n = 0 665 | output_ids = [] 666 | cache_attn_scores = cache_attn_scores[:, :, :-(stride-1)] 667 | cache_attn_scores_square = cache_attn_scores_square[:, :, :-(stride-1)] 668 | cache_counter = cache_counter[:, :, :-(stride-1)] 669 | assert cache_attn_scores.shape[-1] == _tmp+1 670 | while n < max_new_tokens: 671 | next_token = torch.multinomial(prob_prev_step, num_samples=1) 672 | output_ids.append(next_token[0, 0].cpu().item()) 673 | next_token_prob = torch.gather(raw_prob_prev_step, -1, next_token) # (bsz, 1) 674 | n += 1 675 | if output_ids[-1] in eos_token_ids: break 676 | outputs = self(input_ids=next_token, 677 | past_key_values=past_key_values, 678 | attention_mask=torch.ones(next_token.shape[0], 1+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device), 679 | position_ids=torch.LongTensor([cur_pos_id]).to(self.device).view(-1, 1), 680 | use_cache=True, 681 | output_attentions=True) 682 | # unified processing for GQA and MHA 683 | outputs.attentions = list(outputs.attentions) 684 | for l in range(num_layers): 685 | bs = outputs.attentions[l].shape[0] 686 | sl = outputs.attentions[l].shape[2] 687 | tl = outputs.attentions[l].shape[3] 688 | outputs.attentions[l] = outputs.attentions[l].reshape(bs, num_heads, rep_n, sl, tl).mean(dim=2) # (bs, num_kv_heads, sl, tl) 689 | past_key_values = outputs.past_key_values 690 | logits_prev_step = outputs.logits[:, -1, :] 691 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 692 | 693 | # update accumulated attention scores 694 | if 'h2o_head' == mode: 695 | for l in range(num_layers): 696 | attention_map = outputs.attentions[l][0, :, 0, :] 697 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 698 | elif 'roco' == mode: 699 | for l in range(num_layers): 700 | attention_map = outputs.attentions[l][0, :, 0, :] 701 | attention_map_sq = ((outputs.attentions[l][0, :, 0, :])**2) 702 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 703 | cache_attn_scores_square[l, :, :attention_map_sq.shape[-1]] += attention_map_sq 704 | elif 'tova' == mode: 705 | for l in range(num_layers): 706 | attention_map = outputs.attentions[l][0, :, 0, :] 707 | cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map 708 | cache_counter += 1.0 709 | recent_ratio = 0.3 710 | recent_window = int(budget*recent_ratio) 711 | if mode in ['h2o_head']: 712 | eviction_ids = torch.argmin(cache_attn_scores[:, :, :-recent_window], dim=-1) 713 | _eviction_ids = eviction_ids 714 | eviction_ids = eviction_ids.cpu().numpy().tolist() 715 | past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids) 716 | _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1) 717 | mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1]) 718 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 719 | elif mode in ['roco']: 720 | cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2) 721 | cur_std[:, :, -10:] = 1e9 722 | _, feasible_ids = torch.topk(cur_std, largest=False, k=budget-recent_window, dim=-1) # (layers, heads, k) 723 | argmin_id = torch.argmin(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1).unsqueeze(-1) # (layers, heads) 724 | eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id).squeeze(-1) 725 | _eviction_ids = eviction_ids 726 | eviction_ids = eviction_ids.cpu().numpy().tolist() 727 | past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids) 728 | _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1) 729 | mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1]) 730 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 731 | cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 732 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 733 | elif mode == 'tova': 734 | eviction_ids = torch.argmin(cache_attn_scores, dim=-1) 735 | _eviction_ids = eviction_ids 736 | eviction_ids = eviction_ids.cpu().numpy().tolist() 737 | past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids) 738 | _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1) 739 | mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1]) 740 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1) 741 | elif mode == 'recency': 742 | past_key_values = truncate_kv_cache(past_key_values, start=sink_length, end=sink_length+1) 743 | elif mode == 'random': 744 | scores = torch.rand(*positions_tensor.shape).to(self.device) 745 | _, evict_id = torch.topk(scores, k=1, dim=-1) 746 | evict_id = evict_id[0].cpu().item() 747 | past_key_values = truncate_kv_cache(past_key_values, start=sink_length+evict_id, end=sink_length+evict_id+1) 748 | cur_pos_id += 1 749 | cache_size = past_key_values[0][0].shape[2] 750 | total_length = length + len(output_ids) 751 | print(f"KV Cache Budget ratio {cache_size / total_length*100:.2f}%[{cache_size}/({length}+{len(output_ids)})]") 752 | decoded_output = tokenizer.decode(output_ids, skip_special_tokens=True).strip() 753 | return decoded_output 754 | elif kv_mode == 'ppl': 755 | """ 756 | perplexity computation with fixed kv cache 757 | """ 758 | length = input_ids.shape[-1] 759 | if budget >= 1.0: 760 | outputs_prefilling = self(input_ids=input_ids, use_cache=False) 761 | logits = outputs_prefilling.logits 762 | loss_fct = torch.nn.CrossEntropyLoss(reduction='none') 763 | log_probs = loss_fct(logits[0, :-1], input_ids.clone()[0, 1:]).cpu().numpy().tolist() 764 | ppl = math.exp(statistics.mean(log_probs)) 765 | return ppl 766 | else: 767 | # In case budget is also large, the attention_map will occupy a lot of memory 768 | # We offload attention_map to CPU first and move it layer by laer to GPU to compute eviction score 769 | if 'llama' in self.config.architectures[0].lower(): 770 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 771 | else: 772 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 773 | if type(budget) == float: 774 | budget = int(length * budget) + stride 775 | elif type(budget) == int: 776 | budget += stride 777 | for idx in range(budget, -1, -1): 778 | if (length-idx)%stride==0: break 779 | for r_idx in range(1, idx): 780 | if (idx-r_idx)%stride==0: break 781 | prefix = input_ids[:, :r_idx] 782 | recent_window = int(budget*recent_ratio) 783 | sink_length = temp_length 784 | outputs_prefilling = self(input_ids=prefix, use_cache=True, output_attentions=keep_attention) 785 | past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits 786 | loss_fct = torch.nn.CrossEntropyLoss(reduction='none') 787 | logits_prev_step = logits[:, -1, :] 788 | _, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 789 | prefix_token_lst = input_ids[0].cpu().numpy().tolist() 790 | cache_tokens = prefix[0].cpu().numpy().tolist() 791 | if keep_attention: 792 | outputs_prefilling.attentions = process_for_mqa_gqa(outputs_prefilling.attentions, num_layers, num_heads, rep_n) 793 | cache_attn_scores, cache_attn_scores_square = h2o_head_score(outputs_prefilling.attentions, self.device, stride, idx, num_layers, num_heads, empty=not keep_attention) 794 | # Back to GPU 795 | if 'llama' in self.config.architectures[0].lower(): 796 | modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda')) 797 | else: 798 | modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda')) 799 | 800 | if keep_attention: 801 | cache_counter = torch.tensor([[[1.0]*(idx+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) 802 | cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - float(stride) 803 | else: 804 | cache_counter = torch.tensor([[[float(stride)]*idx+torch.arange(stride, 0, -1).numpy().tolist() for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) - float(stride) 805 | cache_counter_token = torch.tensor([1.0]*(idx+stride), device=self.device) 806 | cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - float(stride) 807 | n = 0 808 | output_ids = [] 809 | token_probs = [] 810 | cur_pos_id = past_key_values[0][0].shape[2] 811 | evicted_positions = [] 812 | log_probs = [] 813 | all_logits = [] 814 | all_ids = [] 815 | # for token_i in range(idx, length, stride): 816 | for token_i in range(r_idx, length, stride): 817 | n += stride 818 | outputs = self(input_ids=input_ids[:, token_i:token_i+stride], 819 | past_key_values=past_key_values, 820 | attention_mask=torch.ones(1, stride+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device), 821 | position_ids=torch.LongTensor(list(range(cur_pos_id, cur_pos_id+stride))).to(self.device).view(1, -1), 822 | use_cache=True, 823 | output_attentions=True) 824 | past_key_values = outputs.past_key_values 825 | logits_prev_step = outputs.logits[:, -1, :] 826 | all_logits.append(outputs.logits[0]) 827 | all_ids.append(input_ids[0, token_i:token_i+stride]) 828 | prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p) 829 | 830 | # Unified processing for MQA, GQA and MHA 831 | outputs.attentions = process_for_mqa_gqa(outputs.attentions, num_layers, num_heads, rep_n) 832 | cur_kv_size = past_key_values[0][0].shape[2] 833 | # update accumulated attention scores 834 | if cur_kv_size>idx or keep_attention: 835 | if 'h2o_head' == mode: 836 | for l in range(num_layers): 837 | attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, stride, stride+l) 838 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 839 | elif 'roco' == mode: 840 | for l in range(num_layers): 841 | attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, l) 842 | attention_map_sq = ((outputs.attentions[l][0, :, :, :])**2).sum(dim=1) 843 | cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map 844 | cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map_sq 845 | elif 'tova' == mode: 846 | for l in range(num_layers): 847 | attention_map = outputs.attentions[l][0, :, -1, :].mean(dim=0).unsqueeze(0).repeat(num_heads, 1) # (num_heads, l) 848 | cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map 849 | # evict if current kv cache size exceeds the budget 850 | if mode != 'full' and cur_kv_size>idx: 851 | cache_counter += float(stride) 852 | cache_counter_token += float(stride) 853 | if mode in ['h2o_head']: 854 | eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length 855 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 856 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 857 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 858 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 859 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 860 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1) 861 | elif mode in ['roco']: 862 | cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2) 863 | cur_std[:, :, -10:] = 1e9 864 | cur_std[:, :, :sink_length] = 1e9 865 | _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-recent_window-sink_length, stride), dim=-1) # (layers, heads, k) 866 | # _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-int(budget*0.1)-sink_length, stride), dim=-1) # (layers, heads, k) 867 | argmin_id = torch.topk(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1, largest=False, k=stride)[1] # (layers, heads) 868 | eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id) 869 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 870 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 871 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 872 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 873 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 874 | cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 875 | cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1) 876 | elif mode == 'tova': 877 | eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length 878 | past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids) 879 | _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1) 880 | _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1) 881 | mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool() 882 | cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1) 883 | elif mode == 'recency': 884 | evict_id = sink_length 885 | past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride) 886 | elif mode == 'random': 887 | scores = torch.rand(cache_attn_scores.shape[-1]).to(self.device) 888 | scores[-stride:] = -1e9 889 | _, evict_id = torch.topk(scores, k=1, dim=-1) 890 | evict_id = evict_id[0].cpu().item() 891 | past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride) 892 | cur_pos_id += stride 893 | cur_pos_id = input_ids.shape[-1] 894 | _tmp = past_key_values[0][0].shape[2] 895 | print(f"KV cache budget ratio: {_tmp / input_ids.shape[-1]*100:.2f}%({_tmp}/{input_ids.shape[-1]})") 896 | all_ids = torch.cat(all_ids) 897 | all_logits = torch.cat(all_logits, dim=0) 898 | assert all_ids.shape[0] == all_logits.shape[0] 899 | log_probs = loss_fct(all_logits[:-1], all_ids[1:]).cpu().numpy().tolist() 900 | ppl = math.exp(statistics.mean(log_probs)) 901 | return ppl 902 | 903 | def enable_fixed_kv(model, tokenizer, mode, stride=1, verbose=False): 904 | model.tokenizer = tokenizer 905 | import functools 906 | model.easykv_generate = functools.partial(generate, self=model, kv_mode=mode, stride=stride, report_decoding_latency=verbose) 907 | model.easykv_ppl = functools.partial(generate, self=model, kv_mode='ppl', stride=stride) 908 | print(f"Fixed KV Cache for {mode} enabled") --------------------------------------------------------------------------------