├── logo.png
├── easykv
    ├── __init__.py
    ├── __pycache__
    │   ├── utils.cpython-310.pyc
    │   ├── __init__.cpython-310.pyc
    │   ├── easykv.cpython-310.pyc
    │   ├── llama_patch.cpython-310.pyc
    │   └── mistral_patch.cpython-310.pyc
    ├── utils.py
    ├── mistral_patch.py
    ├── llama_patch.py
    └── easykv.py
├── test_decoding.py
├── test_ppl.py
├── test_passkey.py
├── test_passkey_NTK.py
├── test_summarization.py
├── README.md
└── doc.txt


/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/logo.png


--------------------------------------------------------------------------------
/easykv/__init__.py:
--------------------------------------------------------------------------------
1 | from .easykv import enable_fixed_kv
2 | from .utils import set_dynamicntk_rope_length


--------------------------------------------------------------------------------
/easykv/__pycache__/utils.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/utils.cpython-310.pyc


--------------------------------------------------------------------------------
/easykv/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/__init__.cpython-310.pyc


--------------------------------------------------------------------------------
/easykv/__pycache__/easykv.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/easykv.cpython-310.pyc


--------------------------------------------------------------------------------
/easykv/__pycache__/llama_patch.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/llama_patch.cpython-310.pyc


--------------------------------------------------------------------------------
/easykv/__pycache__/mistral_patch.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DRSY/EasyKV/HEAD/easykv/__pycache__/mistral_patch.cpython-310.pyc


--------------------------------------------------------------------------------
/easykv/utils.py:
--------------------------------------------------------------------------------
 1 | from types import MethodType
 2 | from transformers.models.llama.modeling_llama import LlamaAttention
 3 | 
 4 | 
 5 | def modify_method_of_instance(instance, target_class_name, target_method_name, new_method, visited_instances=None):
 6 |     """
 7 |         This function modifies the method of an instance of a model class. 
 8 |         It's part from chat-GPT.
 9 |         It will replace the method  with the new method.
10 |         Currently, we only use this function to modify the attention method of a model. Do not test it further. 
11 | 
12 |         instance: 
13 |             instance of a model to modify.
14 |         target_class_name: 
15 |             name of the attention class to modify. E.g. 'LlamaAttention', 'GPTNeoXAttention', etc.
16 |         new_method: new method to replace the original method. E.g. 'self_extend_forward'. 
17 |             It should include a parameter 'self' to be binded to the instance.
18 |     """
19 |     if visited_instances is None:
20 |         visited_instances = set()
21 |     # Unique identifier for the instance (using id() since object's id is unique)
22 |     instance_id = id(instance)
23 |     if instance_id in visited_instances:
24 |         return 
25 |     # Add the instance to the already_visited set
26 |     visited_instances.add(instance_id)
27 | 
28 |     # Check if this instance is of the target class
29 |     if instance.__class__.__name__ == target_class_name:
30 |         bond_method = MethodType(new_method, instance) 
31 |         setattr(instance, target_method_name, bond_method)
32 |     elif hasattr(instance, '__dict__'):
33 |         for attr_name, attr_value in instance.__dict__.items():
34 |             if isinstance(attr_value, object) and not isinstance(attr_value, (list, tuple, dict, set)):
35 |                 modify_method_of_instance(attr_value, target_class_name, target_method_name, new_method, visited_instances)
36 |             elif isinstance(attr_value, (list, tuple)):
37 |                 for item in attr_value:
38 |                     if isinstance(item, object):
39 |                         modify_method_of_instance(item, target_class_name, target_method_name, new_method, visited_instances)
40 |             # If attribute value is a dictionary, iterate over its values and recurse
41 |             # E.g, for a ModuleList, its moudels are stored in a dictionary: ._modules
42 |             elif isinstance(attr_value, dict):
43 |                 for key, value in attr_value.items():
44 |                     if isinstance(value, object):
45 |                         modify_method_of_instance(value, target_class_name, target_method_name, new_method, visited_instances)
46 |             
47 |             # If attribute value is a set, iterate and recurse
48 |             elif isinstance(attr_value, set):
49 |                 for item in attr_value:
50 |                     if isinstance(item, object):
51 |                         modify_method_of_instance(item, target_class_name, target_method_name, new_method, visited_instances)
52 |     
53 | def set_dynamicntk_rope_length(model, max_length):
54 |     for name, module in model.named_modules():
55 |         if isinstance(module, LlamaAttention):
56 |             module.rotary_emb._set_cos_sin_cache(max_length, device=model.device, dtype=module.rotary_emb.inv_freq.dtype)
57 |     print(f"DynamicNTKRoPE max length reset to {max_length}")


--------------------------------------------------------------------------------
/test_decoding.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | import torch
 4 | from transformers import (AutoModelForCausalLM, AutoTokenizer)
 5 | from easykv import enable_fixed_kv
 6 | 
 7 | # define the model path and the corresponding prompt template
 8 | MODEL_CONFIGS = {
 9 |     'wizardlm_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--WizardLM--WizardLM-13B-V1.2/snapshots/cf5f40382559f19e13874e45b39575171ca46ef8', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
10 |     'llama2_13b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
11 |     'vicuna_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-13b-v1.5/snapshots/3deb0106f72a3a433f0c6ea0cb978bdf14bcd3a6/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
12 |     'openchat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--openchat--openchat_v3.2_super/snapshots/aab7ce4d48b31a295a0116b61569d8e87a09bb7a/', template="GPT4 User: {inst}<|end_of_turn|>GPT4 Assistant:"),
13 |     'vicuna_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5/snapshots/de56c35b1763eaae20f4d60efd64af0a9091ebe5/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
14 |     'wizardlm_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_fp16_recycledWiz70k_llama2_7b_512', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
15 |     'alpaca_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_recycledAlpaca52k_llama2_7b_512_ds', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
16 |     'zephyr_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--HuggingFaceH4--zephyr-7b-beta/snapshots/dc24cabd13eacd3ae3a5fe574bd645483a335a4a/', template="<|system|>\nYou are a friendly chatbot who always responds in a helpful and detailed manner to the user's questions.</s>\n<|user|>\n{inst}</s>\n<|assistant|>\n"),
17 |     'llama2_7b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-chat-hf/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
18 |     'llama2_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-hf'),
19 |     'llama2_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-hf/snapshots/dc1d3b3bfdb69df26f8fc966c16353274b138c55/'),
20 | }
21 | 
22 | model_name = 'llama2_7b_chat'
23 | path = MODEL_CONFIGS[model_name]['path']
24 | template = MODEL_CONFIGS[model_name]['template']
25 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto').eval()
26 | tokenizer = AutoTokenizer.from_pretrained(path)
27 | 
28 | # Setup KV caching mode
29 | enable_fixed_kv(model, tokenizer, mode='decoding', stride=1)
30 | 
31 | # Test input
32 | inst = "What are the names of some famous actors that started their careers on Broadway?"
33 | input_prompt = template.format(inst=inst)
34 | 
35 | # Define eviction policy
36 | kv_policy = 'roco'
37 | 
38 | for budget in [300, 150]:
39 |     # Define sampling parameters
40 |     gen_kwargs = dict(
41 |         temperature=1e-9,
42 |         top_p=1.0,
43 |         max_new_tokens=2048,
44 |         budget=budget,
45 |         kv_policy=kv_policy,
46 |     )
47 |     input_ids = tokenizer([input_prompt], return_tensors='pt').input_ids.to(model.device)
48 |     output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
49 |     print(f"{'='*20} EasyKV-{kv_policy}-{budget} {'='*20}\n{output}")


--------------------------------------------------------------------------------
/test_ppl.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | import torch
 4 | from transformers import (AutoModelForCausalLM, AutoTokenizer, AutoConfig)
 5 | from easykv import enable_fixed_kv, set_dynamicntk_rope_length
 6 | import json
 7 | 
 8 | # Define the model path and the corresponding prompt template
 9 | MODEL_CONFIGS = {
10 |     'wizardlm_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--WizardLM--WizardLM-13B-V1.2/snapshots/cf5f40382559f19e13874e45b39575171ca46ef8', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
11 |     'llama2_13b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
12 |     'vicuna_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-13b-v1.5/snapshots/3deb0106f72a3a433f0c6ea0cb978bdf14bcd3a6/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
13 |     'openchat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--openchat--openchat_v3.2_super/snapshots/aab7ce4d48b31a295a0116b61569d8e87a09bb7a/', template="GPT4 User: {inst}<|end_of_turn|>GPT4 Assistant:"),
14 |     'vicuna_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5/snapshots/de56c35b1763eaae20f4d60efd64af0a9091ebe5/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
15 |     'wizardlm_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_fp16_recycledWiz70k_llama2_7b_512', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
16 |     'vicuna_7b_16k': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5-16k/snapshots/c8df3ca4436a3bce5c4b5877e0117032081852b4/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
17 |     'alpaca_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_recycledAlpaca52k_llama2_7b_512_ds', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
18 |     'zephyr_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--HuggingFaceH4--zephyr-7b-beta/snapshots/dc24cabd13eacd3ae3a5fe574bd645483a335a4a/', template="<|system|>\nYou are a friendly chatbot who always responds in a helpful and detailed manner to the user's questions.</s>\n<|user|>\n{inst}</s>\n<|assistant|>\n"),
19 |     'llama2_7b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-chat-hf/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
20 |     'llama2_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-hf'),
21 |     'llama2_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-hf/snapshots/dc1d3b3bfdb69df26f8fc966c16353274b138c55/'),
22 | }
23 | 
24 | # Define model config
25 | model_name = 'llama2_13b'
26 | path = MODEL_CONFIGS[model_name]['path']
27 | 
28 | # Enable DynamicNTK for extending LLaMa2 to longer sequences
29 | config = AutoConfig.from_pretrained(path)
30 | config.rope_scaling = dict(type="dynamic", factor=2)
31 | config.max_position_embeddings = 4096
32 | 
33 | # Load model
34 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto', config=config).eval()
35 | tokenizer = AutoTokenizer.from_pretrained(path)
36 | 
37 | # Set the max sequence length before inference to avoid inconsistency of RoPE's base parameter
38 | set_dynamicntk_rope_length(model, 11000)
39 | 
40 | enable_fixed_kv(model, tokenizer, mode='ppl', stride=96)
41 | 
42 | doc = open("./doc.txt").read().strip()
43 | input_ids = tokenizer(doc, return_tensors="pt").input_ids.cuda()
44 | print("Input token length:", input_ids.shape[-1])
45 | 
46 | 
47 | budgets = [1.0, 0.5]
48 | for budget in budgets:
49 |     # Define sampling parameters
50 |     for kv_policy in ['recency', 'roco']:
51 |         ppl_kwargs = dict(
52 |             budget=budget,
53 |             kv_policy=kv_policy,
54 |             keep_attention=False
55 |         )
56 |         ppl = model.easykv_ppl(input_ids=input_ids, generation_config=ppl_kwargs)
57 |         print(f"EasyKV-{kv_policy}-{budget*100:.2f}% PPL: {ppl:.2f}")
58 |         if budget == 1.0: break


--------------------------------------------------------------------------------
/test_passkey.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | import torch
 4 | from transformers import (AutoModelForCausalLM, AutoTokenizer, AutoConfig)
 5 | from easykv import enable_fixed_kv, set_dynamicntk_rope_length
 6 | import json
 7 | 
 8 | # Define the model path and the corresponding prompt template
 9 | MODEL_CONFIGS = {
10 |     'wizardlm_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--WizardLM--WizardLM-13B-V1.2/snapshots/cf5f40382559f19e13874e45b39575171ca46ef8', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
11 |     'llama2_13b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
12 |     'vicuna_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-13b-v1.5/snapshots/3deb0106f72a3a433f0c6ea0cb978bdf14bcd3a6/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
13 |     'openchat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--openchat--openchat_v3.2_super/snapshots/aab7ce4d48b31a295a0116b61569d8e87a09bb7a/', template="GPT4 User: {inst}<|end_of_turn|>GPT4 Assistant:"),
14 |     'vicuna_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5/snapshots/de56c35b1763eaae20f4d60efd64af0a9091ebe5/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
15 |     'wizardlm_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_fp16_recycledWiz70k_llama2_7b_512', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
16 |     'vicuna_7b_16k': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5-16k/snapshots/c8df3ca4436a3bce5c4b5877e0117032081852b4/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
17 |     'alpaca_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_recycledAlpaca52k_llama2_7b_512_ds', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
18 |     'zephyr_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--HuggingFaceH4--zephyr-7b-beta/snapshots/dc24cabd13eacd3ae3a5fe574bd645483a335a4a/', template="<|system|>\nYou are a friendly chatbot who always responds in a helpful and detailed manner to the user's questions.</s>\n<|user|>\n{inst}</s>\n<|assistant|>\n"),
19 |     'llama2_7b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-chat-hf/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
20 |     'llama2_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-hf'),
21 |     'llama2_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-hf/snapshots/dc1d3b3bfdb69df26f8fc966c16353274b138c55/'),
22 | }
23 | 
24 | # Define model config
25 | model_name = 'vicuna_7b_16k'
26 | path = MODEL_CONFIGS[model_name]['path']
27 | template = MODEL_CONFIGS[model_name]['template']
28 | 
29 | 
30 | # Load model
31 | config = AutoConfig.from_pretrained(path)
32 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto', config=config).eval()
33 | tokenizer = AutoTokenizer.from_pretrained(path)
34 | 
35 | # Define KV cache eviction policy
36 | kv_policy = "roco"
37 | 
38 | enable_fixed_kv(model, tokenizer, mode='encoding', stride=96)
39 | 
40 | 
41 | with torch.no_grad():
42 |     # Test the passkey retrieval task
43 |     for line in open("./passkey_examples_10k.jsonl", "r"):
44 |         example = json.loads(line)
45 |         prompt_postfix = "What is the pass key? The pass key is "
46 |         prompt = example["input"] + prompt_postfix
47 |         input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
48 |         print("-----------------------------------")
49 |         print(f"#Tokens of Prompt:", input_ids.shape[1], end=" ")
50 |         print("Passkey target:", example["target"])
51 | 
52 |         # EasyKV generate
53 |         budgets = [0.5]
54 |         for budget in budgets:
55 |             # Define sampling parameters
56 |             for kv_policy in ['h2o_head_std_avg']:
57 |                 gen_kwargs = dict(
58 |                     temperature=1e-9,
59 |                     top_p=1.0,
60 |                     max_new_tokens=6,
61 |                     budget=budget,
62 |                     kv_policy=kv_policy,
63 |                     keep_attention=False
64 |                 )
65 |                 output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
66 |                 answer= f"EasyKV-{kv_policy}({gen_kwargs['budget']*100:.2f}%):     [" + prompt_postfix + output  + "]"
67 |                 answer = answer.replace("\n", "\\n")
68 |                 print(answer)
69 |                 torch.cuda.empty_cache()


--------------------------------------------------------------------------------
/test_passkey_NTK.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | import torch
 4 | from transformers import (AutoModelForCausalLM, AutoTokenizer, AutoConfig)
 5 | from easykv import enable_fixed_kv, set_dynamicntk_rope_length
 6 | import json
 7 | 
 8 | # Define the model path and the corresponding prompt template
 9 | MODEL_CONFIGS = {
10 |     'wizardlm_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--WizardLM--WizardLM-13B-V1.2/snapshots/cf5f40382559f19e13874e45b39575171ca46ef8', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
11 |     'llama2_13b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
12 |     'vicuna_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-13b-v1.5/snapshots/3deb0106f72a3a433f0c6ea0cb978bdf14bcd3a6/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
13 |     'openchat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--openchat--openchat_v3.2_super/snapshots/aab7ce4d48b31a295a0116b61569d8e87a09bb7a/', template="GPT4 User: {inst}<|end_of_turn|>GPT4 Assistant:"),
14 |     'vicuna_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5/snapshots/de56c35b1763eaae20f4d60efd64af0a9091ebe5/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
15 |     'wizardlm_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_fp16_recycledWiz70k_llama2_7b_512', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
16 |     'vicuna_7b_16k': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5-16k/snapshots/c8df3ca4436a3bce5c4b5877e0117032081852b4/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
17 |     'alpaca_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_recycledAlpaca52k_llama2_7b_512_ds', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
18 |     'zephyr_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--HuggingFaceH4--zephyr-7b-beta/snapshots/dc24cabd13eacd3ae3a5fe574bd645483a335a4a/', template="<|system|>\nYou are a friendly chatbot who always responds in a helpful and detailed manner to the user's questions.</s>\n<|user|>\n{inst}</s>\n<|assistant|>\n"),
19 |     'llama2_7b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-chat-hf/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
20 |     'llama2_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-hf'),
21 |     'llama2_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-hf/snapshots/dc1d3b3bfdb69df26f8fc966c16353274b138c55/'),
22 | }
23 | 
24 | # Define model config
25 | model_name = 'llama2_7b_chat'
26 | path = MODEL_CONFIGS[model_name]['path']
27 | template = MODEL_CONFIGS[model_name]['template']
28 | 
29 | # Enable DynamicNTK for extending LLaMa2 to longer sequences
30 | config = AutoConfig.from_pretrained(path)
31 | config.rope_scaling = dict(type="dynamic", factor=2)
32 | config.max_position_embeddings = 4096
33 | 
34 | # Load model
35 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto', config=config).eval()
36 | tokenizer = AutoTokenizer.from_pretrained(path)
37 | 
38 | # Set the max sequence length before inference to avoid inconsistency of RoPE's base parameter
39 | set_dynamicntk_rope_length(model, 5200)
40 | 
41 | # Define KV cache eviction policy
42 | kv_policy = "roco"
43 | 
44 | enable_fixed_kv(model, tokenizer, mode='encoding', stride=24)
45 | 
46 | 
47 | # Test the passkey retrieval task
48 | for line in open("./passkey_examples_5k.jsonl", "r"):
49 |     example = json.loads(line)
50 |     prompt_postfix = "What is the pass key? The pass key is "
51 |     prompt = example["input"] + prompt_postfix
52 |     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
53 |     print("-----------------------------------")
54 |     print(f"#Tokens of Prompt:", input_ids.shape[1], end=" ")
55 |     print("Passkey target:", example["target"])
56 | 
57 |     # EasyKV generate
58 |     budgets = [0.5]
59 |     for budget in budgets:
60 |         # Define sampling parameters
61 |         for kv_policy in ['h2o_head_std_avg']:
62 |             gen_kwargs = dict(
63 |                 temperature=1e-9,
64 |                 top_p=1.0,
65 |                 max_new_tokens=6,
66 |                 budget=budget,
67 |                 kv_policy=kv_policy,
68 |                 keep_attention=True
69 |             )
70 |             output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
71 |             answer= f"Llama2-EasyKV-{kv_policy}({gen_kwargs['budget']*100:.2f}%):     [" + prompt_postfix + output  + "]"
72 |             answer = answer.replace("\n", "\\n")
73 |             print(answer)


--------------------------------------------------------------------------------
/easykv/mistral_patch.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import warnings
  3 | from typing import List, Optional, Tuple, Union
  4 | 
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torch.utils.checkpoint
  8 | from torch import nn
  9 | from transformers.cache_utils import Cache
 10 | 
 11 | 
 12 | def rotate_half(x):
 13 |     """Rotates half the hidden dims of the input."""
 14 |     x1 = x[..., : x.shape[-1] // 2]
 15 |     x2 = x[..., x.shape[-1] // 2:]
 16 |     return torch.cat((-x2, x1), dim=-1)
 17 | 
 18 | # Copied from transformers.models.llama.modeling_llama.repeat_kv
 19 | 
 20 | 
 21 | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 22 |     """
 23 |     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
 24 |     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
 25 |     """
 26 |     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
 27 |     if n_rep == 1:
 28 |         return hidden_states
 29 |     hidden_states = hidden_states[:, :, None, :, :].expand(
 30 |         batch, num_key_value_heads, n_rep, slen, head_dim)
 31 |     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 32 | 
 33 | # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
 34 | 
 35 | def apply_rotary_pos_emb_sep(x, cos, sin, position_ids, unsqueeze_dim=1):
 36 |     """Applies Rotary Position Embedding to the query and key tensors.
 37 | 
 38 |     Args:
 39 |         q (`torch.Tensor`): The query tensor.
 40 |         k (`torch.Tensor`): The key tensor.
 41 |         cos (`torch.Tensor`): The cosine part of the rotary embedding.
 42 |         sin (`torch.Tensor`): The sine part of the rotary embedding.
 43 |         position_ids (`torch.Tensor`):
 44 |             The position indices of the tokens corresponding to the query and key tensors. For example, this can be
 45 |             used to pass offsetted position ids when working with a KV-cache.
 46 |         unsqueeze_dim (`int`, *optional*, defaults to 1):
 47 |             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
 48 |             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
 49 |             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
 50 |             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
 51 |             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
 52 |             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 53 |     Returns:
 54 |         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
 55 |     """
 56 |     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
 57 |     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
 58 |     x_embed = (x * cos) + (rotate_half(x) * sin)
 59 |     return x_embed
 60 | 
 61 | 
 62 | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
 63 |     """Applies Rotary Position Embedding to the query and key tensors.
 64 | 
 65 |     Args:
 66 |         q (`torch.Tensor`): The query tensor.
 67 |         k (`torch.Tensor`): The key tensor.
 68 |         cos (`torch.Tensor`): The cosine part of the rotary embedding.
 69 |         sin (`torch.Tensor`): The sine part of the rotary embedding.
 70 |         position_ids (`torch.Tensor`):
 71 |             The position indices of the tokens corresponding to the query and key tensors. For example, this can be
 72 |             used to pass offsetted position ids when working with a KV-cache.
 73 |         unsqueeze_dim (`int`, *optional*, defaults to 1):
 74 |             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
 75 |             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
 76 |             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
 77 |             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
 78 |             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
 79 |             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 80 |     Returns:
 81 |         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
 82 |     """
 83 |     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
 84 |     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
 85 |     q_embed = (q * cos) + (rotate_half(q) * sin)
 86 |     k_embed = (k * cos) + (rotate_half(k) * sin)
 87 |     return q_embed, k_embed
 88 | 
 89 | 
 90 | def mistral_forward(
 91 |     self,
 92 |     hidden_states: torch.Tensor,
 93 |     attention_mask: Optional[torch.Tensor] = None,
 94 |     position_ids: Optional[torch.LongTensor] = None,
 95 |     past_key_value: Optional[Cache] = None,
 96 |     output_attentions: bool = False,
 97 |     use_cache: bool = False,
 98 |     attn_device = 'cuda',
 99 |     **kwargs,
100 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
101 |     if "padding_mask" in kwargs:
102 |         warnings.warn(
103 |             "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
104 |         )
105 |     bsz, q_len, _ = hidden_states.size()
106 | 
107 |     query_states = self.q_proj(hidden_states)
108 |     key_states = self.k_proj(hidden_states)
109 |     value_states = self.v_proj(hidden_states)
110 | 
111 |     query_states = query_states.view(
112 |         bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
113 |     key_states = key_states.view(
114 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
115 |     value_states = value_states.view(
116 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
117 | 
118 |     kv_seq_len = key_states.shape[-2]
119 |     if past_key_value is not None:
120 |         if self.layer_idx is None:
121 |             raise ValueError(
122 |                 f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
123 |                 "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
124 |                 "with a layer index."
125 |             )
126 |         kv_seq_len += past_key_value.get_usable_length(
127 |             kv_seq_len, self.layer_idx)
128 |     # for our purpose
129 |     max_Pos_id = position_ids[0, -1].cpu().item()
130 |     cos, sin = self.rotary_emb(
131 |         value_states, seq_len=max(kv_seq_len, max_Pos_id+1))
132 | 
133 |     # orig impl
134 |     # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
135 |     query_states, key_states = apply_rotary_pos_emb(
136 |         query_states, key_states, cos, sin, position_ids)
137 | 
138 |     if past_key_value is not None:
139 |         cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
140 |         key_states, value_states = past_key_value.update(
141 |             key_states, value_states, self.layer_idx, cache_kwargs)
142 | 
143 |     # repeat k/v heads if n_kv_heads < n_heads
144 |     key_states = repeat_kv(key_states, self.num_key_value_groups)
145 |     value_states = repeat_kv(value_states, self.num_key_value_groups)
146 | 
147 |     attn_weights = torch.matmul(
148 |         query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
149 | 
150 |     if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
151 |         raise ValueError(
152 |             f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
153 |             f" {attn_weights.size()}"
154 |         )
155 | 
156 |     if attention_mask is not None:
157 |         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
158 |             raise ValueError(
159 |                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
160 |             )
161 | 
162 |         attn_weights = attn_weights + attention_mask
163 | 
164 |     # upcast attention to fp32
165 |     attn_weights = nn.functional.softmax(
166 |         attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
167 |     attn_weights = nn.functional.dropout(
168 |         attn_weights, p=self.attention_dropout, training=self.training)
169 |     attn_output = torch.matmul(attn_weights, value_states)
170 | 
171 |     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
172 |         raise ValueError(
173 |             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
174 |             f" {attn_output.size()}"
175 |         )
176 | 
177 |     attn_output = attn_output.transpose(1, 2).contiguous()
178 |     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
179 | 
180 |     attn_output = self.o_proj(attn_output)
181 | 
182 |     if not output_attentions:
183 |         attn_weights = None
184 |     else: attn_weights = attn_weights.to(attn_device)
185 | 
186 |     return attn_output, attn_weights, past_key_value
187 | 
188 | 
189 | def mistral_forward_stream(
190 |     self,
191 |     hidden_states: torch.Tensor,
192 |     attention_mask: Optional[torch.Tensor] = None,
193 |     position_ids: Optional[torch.LongTensor] = None,
194 |     past_key_value: Optional[Cache] = None,
195 |     output_attentions: bool = False,
196 |     use_cache: bool = False,
197 |     attn_device = 'cuda',
198 |     **kwargs,
199 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
200 |     if "padding_mask" in kwargs:
201 |         warnings.warn(
202 |             "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
203 |         )
204 |     bsz, q_len, _ = hidden_states.size()
205 | 
206 |     query_states = self.q_proj(hidden_states)
207 |     key_states = self.k_proj(hidden_states)
208 |     value_states = self.v_proj(hidden_states)
209 | 
210 |     query_states = query_states.view(
211 |         bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
212 |     key_states = key_states.view(
213 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
214 |     value_states = value_states.view(
215 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
216 | 
217 |     kv_seq_len = key_states.shape[-2]
218 |     if past_key_value is not None:
219 |         if self.layer_idx is None:
220 |             raise ValueError(
221 |                 f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
222 |                 "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
223 |                 "with a layer index."
224 |             )
225 |         prev_kv_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
226 |         query_position_ids = torch.arange(prev_kv_length, prev_kv_length+query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
227 |         key_position_ids = torch.arange(0, prev_kv_length+query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
228 |         kv_seq_len += past_key_value.get_usable_length(
229 |             kv_seq_len, self.layer_idx)
230 |     else:
231 |         query_position_ids = torch.arange(0, query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
232 |         key_position_ids = torch.arange(0, query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
233 |     cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
234 | 
235 |     if past_key_value is not None:
236 |         cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
237 |         key_states, value_states = past_key_value.update(
238 |             key_states, value_states, self.layer_idx, cache_kwargs)
239 | 
240 |     query_states = apply_rotary_pos_emb_sep(query_states, cos, sin, query_position_ids)
241 |     key_states = apply_rotary_pos_emb_sep(key_states, cos, sin, key_position_ids)
242 | 
243 |     # repeat k/v heads if n_kv_heads < n_heads
244 |     key_states = repeat_kv(key_states, self.num_key_value_groups)
245 |     value_states = repeat_kv(value_states, self.num_key_value_groups)
246 | 
247 |     attn_weights = torch.matmul(
248 |         query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
249 | 
250 |     if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
251 |         raise ValueError(
252 |             f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
253 |             f" {attn_weights.size()}"
254 |         )
255 | 
256 |     if attention_mask is not None:
257 |         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
258 |             raise ValueError(
259 |                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
260 |             )
261 | 
262 |         attn_weights = attn_weights + attention_mask
263 | 
264 |     # upcast attention to fp32
265 |     attn_weights = nn.functional.softmax(
266 |         attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
267 |     attn_weights = nn.functional.dropout(
268 |         attn_weights, p=self.attention_dropout, training=self.training)
269 |     attn_output = torch.matmul(attn_weights, value_states)
270 | 
271 |     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
272 |         raise ValueError(
273 |             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
274 |             f" {attn_output.size()}"
275 |         )
276 | 
277 |     attn_output = attn_output.transpose(1, 2).contiguous()
278 |     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
279 | 
280 |     attn_output = self.o_proj(attn_output)
281 | 
282 |     if not output_attentions:
283 |         attn_weights = None
284 |     else: attn_weights = attn_weights.to(attn_device)
285 | 
286 |     return attn_output, attn_weights, past_key_value
287 | 


--------------------------------------------------------------------------------
/test_summarization.py:
--------------------------------------------------------------------------------
 1 | import warnings
 2 | warnings.filterwarnings("ignore")
 3 | import torch
 4 | from transformers import (AutoModelForCausalLM, AutoTokenizer)
 5 | from easykv import enable_fixed_kv
 6 | 
 7 | # define the model path and the corresponding prompt template
 8 | MODEL_CONFIGS = {
 9 |     'wizardlm_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--WizardLM--WizardLM-13B-V1.2/snapshots/cf5f40382559f19e13874e45b39575171ca46ef8', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
10 |     'llama2_13b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
11 |     'vicuna_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-13b-v1.5/snapshots/3deb0106f72a3a433f0c6ea0cb978bdf14bcd3a6/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
12 |     'openchat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--openchat--openchat_v3.2_super/snapshots/aab7ce4d48b31a295a0116b61569d8e87a09bb7a/', template="GPT4 User: {inst}<|end_of_turn|>GPT4 Assistant:"),
13 |     'vicuna_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--lmsys--vicuna-7b-v1.5/snapshots/de56c35b1763eaae20f4d60efd64af0a9091ebe5/', template="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\nUSER: Hello!\nASSISTANT: Hello!</s>\nUSER: {inst}\nASSISTANT:"),
14 |     'wizardlm_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_fp16_recycledWiz70k_llama2_7b_512', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
15 |     'alpaca_7b': dict(path='/cpfs01/user/rensiyu/language_modeling/stanford_alpaca/output_mle_recycledAlpaca52k_llama2_7b_512_ds', template="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{inst}\n\n### Response:"),
16 |     'zephyr_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--HuggingFaceH4--zephyr-7b-beta/snapshots/dc24cabd13eacd3ae3a5fe574bd645483a335a4a/', template="<|system|>\nYou are a friendly chatbot who always responds in a helpful and detailed manner to the user's questions.</s>\n<|user|>\n{inst}</s>\n<|assistant|>\n"),
17 |     'llama2_7b_chat': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-chat-hf/', template="[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n{inst}[/INST]"),
18 |     'llama2_7b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/llama2/model_weights_hf/llama-2-7b-hf'),
19 |     'llama2_13b': dict(path='/cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub/models--meta-llama--Llama-2-13b-hf/snapshots/dc1d3b3bfdb69df26f8fc966c16353274b138c55/'),
20 | }
21 | 
22 | model_name = 'llama2_7b_chat'
23 | path = MODEL_CONFIGS[model_name]['path']
24 | template = MODEL_CONFIGS[model_name]['template']
25 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto').eval()
26 | tokenizer = AutoTokenizer.from_pretrained(path)
27 | 
28 | # Setup KV caching mode
29 | stride = 24
30 | enable_fixed_kv(model, tokenizer, mode='encoding', stride=stride)
31 | 
32 | # Test input
33 | article = "###\nArticle: Media playback is not supported on this device The tournament kicks off with France playing Romania on Friday, 10 June and the final will take place at the Stade de France in Paris on 10 July. Spain are hoping to retain the title they won in 2008 and 2012. England will be appearing at their ninth finals, while Wales and Northern Ireland have qualified for the first time. Good question - because, for the first time, there are 24 teams competing in the finals. That is an increase from the 16 that had taken part in every edition since it was hosted in England in 1996. With six groups of four teams, it means the top two will qualify for the last 16, plus the four best third-placed finishers. In other words, only eight teams will fail to qualify from the group stage. One point could be enough to put your team into the last 16 - and from then the tournament goes to a knockout format. As Greece proved by pulling off a sensational triumph at Euro 2004, trying to predict a winner is a difficult game. World champions Germany are understandably one of the favourites, while holders Spain are also short odds with the bookmakers. Germany finished top of their qualification group but booking their place in France was not without its hiccups - with defeats by Poland and the Republic of Ireland. They have been beaten by France and England since qualifying but remain one to watch. Spain dominated international football between 2008 and 2012 but failed to qualify from their group at the 2014 World Cup. Could this be a last hurrah for Vicente del Bosque's ageing but brilliant side? France won the World Cup as hosts in 1998 - can they repeat the famous success of the team led by Didier Deschamps, Zinedine Zidane and Laurent Blanc? A run into the latter stages by Les Bleus is likely to help restore national morale in a country still recovering from last year's deadly Paris attacks. France are the bookmakers' favourites to win the whole thing, but their form is tricky to gauge because they did not have to qualify for this tournament. Fast on the counter and unified after years of internal division, watch out for talented midfielders Paul Pogba and Blaise Matuidi. But they cannot call upon prolific Real Madrid striker Karim Benzema. The French Football Federation (FFF) said he would not be picked after being investigated for his part in an alleged plot to blackmail Les Bleus team-mate Mathieu Valbuena, who did not make the squad. Well, there are three of them in France - only Scotland missed out - plus the Republic of Ireland. It is going to get crowded too, because England and Wales are both in Group B, along with Russia and debutants Slovakia. Don't expect much work to get done when England and Wales play each other - that match in Lens will be shown live on BBC One, kick-off 14:00 BST, on Thursday, 16 June. Roy Hodgson's England have a good young team but doubts persist about their defence. They breezed through their qualifying group with a perfect record of 10 wins, while confidence was further boosted with March's impressive win away at world champions Germany. Wales have got Real Madrid star Gareth Bale and plenty of heart. Chris Coleman's side reached their first major tournament in 57 years after losing just once in qualification. Northern Ireland might have numerous players from England's lower leagues but they finished top of their qualifying group, losing just one of their 10 matches. Striker Kyle Lafferty could not get a game for club side Norwich during qualifying (he has since been loaned to Birmingham) but he scored seven crucial goals in nine games for his country. With the 24-team format offering hope to traditionally less-successful nations, five countries have taken advantage to secure their European Championship finals debut. Northern Ireland and Wales, of course, are two of them. Slovakia, who have been drawn alongside England and Wales in Group B, have qualified for the first time as an independent state. A 1-0 win against Spain showed the 2010 World Cup qualifiers can mix it with Europe's elite. Beware England and Wales. Albania have never been near a major finals, but edged out Denmark - the 1992 champions - in their qualifying group thanks to a shock win in Portugal. Oh, and because they were awarded a 3-0 win in Serbia by the Court of Arbitration for Sport after a riot. With a population of roughly 330,000 (comparable to Coventry) and only 21,508 registered players, Iceland are unsurprisingly the smallest nation to ever qualify for the finals. How did they do that? By beating the Netherlands - 1988 winners and three-time semi-finalists - home and away during qualifying. The Dutch, for the first time since 1984, will not be there. Real Madrid superstar Cristiano Ronaldo, who was voted the world's second best player behind Argentina's Lionel Messi in the 2015 Ballon d'Or vote, heads a star-studded cast. Ronaldo will be leading Portugal's challenge, while Wales hope his club-mate Gareth Bale - the world's most expensive player - can transform his impressive La Liga form onto the international stage. World Cup winners Germany boast a host of stellar names who have impressed in previous tournaments, most notably Bayern Munich goalkeeper Manuel Neuer and forward Thomas Muller plus Real Madrid midfielder Toni Kroos. France's challenge is set to be driven by energetic Juventus midfielder Paul Pogba, who is still reportedly courting the attention of several English Premier League sides. Belgium are not short of star quality either. Premier League pair Eden Hazard and Kevin de Bruyne will spearhead their quest for a first major tournament win, but the Red Devils are without injured captain Vincent Kompany. And Sweden superstar Zlatan Ibrahimovic, 34, will be hoping to illuminate an international tournament for what could be the final time. The Netherlands' failure to qualify leaves Bayern Munich winger Arjen Robben watching from home, while his club-mate Franck Ribery was not named in the France squad, despite suggestions he was considering ending his international retirement. The tournament's 51 games will be staged at 10 locations across France, including new stadiums in Bordeaux, Lille, Lyon and Nice. The opening match - between France and Romania on 10 June - and the 10 July final will be played at the Stade de France in Paris. Building the new venues and renovating historic grounds such as Marseille's Stade Velodrome has cost 1.6bn euros (\u00a31.2bn) - modernisation which was necessary, organisers say, because France did not fully capitalise on hosting the 1998 World Cup. Vibrating stands, floating roofs and adjacent slag heaps - read BBC Sport's venue-by-venue guide In March this year, the International Football Association Board (IFAB) ratified a host of revisions to the laws of the game in an attempt to remove inconsistencies and meet the needs of the modern game. The changes came into effect on 1 June, so will apply for Euro 2016. More than 90 revisions were made, but these are some of the key changes: Kick-off: Previously, the ball had to go forward from kick-off but the rule has been changed to allow it to go in any direction. Pre-match red cards: Players can now be sent off before a match gets under way, although they can be replaced by another player in the match-day squad. Leaving the pitch after treatment: If a player is injured in a challenge resulting in a yellow or red card, they no longer have to leave the field and can have a quick assessment or medical treatment. This change is designed to prevent situations where a team would be temporarily down to 10 players. The end of 'triple punishment': A professional foul inside the area will now normally result in a yellow card for the offender, and not a red. This is to end the so-called triple punishment of penalty, dismissal and suspension, which was seen by some as excessive. There are exceptions for when the offender will receive a red which include holding, pushing or pulling and violent conduct. Goalline technology, already established in the Premier League and elsewhere, will be used at Euro 2016. Yes. But you had better be quick. Most of the group games - including all of England's matches - are sold out. However, as of 9 June there are tickets available to watch Wales and Northern Ireland, plus other games involving some of the smaller nations. There are also tickets available for the opening game between France and Romania - at 395 euros each. Find the latest ticket details on Uefa's official Euro 2016 website Each of the 10 host cities will have official Uefa fan zones. The fan zones will have a giant screen for showing all of the matches throughout the tournament. The biggest will be in Paris, where up to 90,000 supporters can gather on the Champ de Mars - in the shadow of the Eiffel Tower. You won't miss a kick. The BBC and ITV will bring audiences closer to the heart of the action than ever before with extensive coverage of Euro 2016. The BBC will showcase 26 live matches on TV and streamed online within its 140 hours of programming, including highlights on BBC One, BBC Two and the Red Button plus 24 days of coverage on Radio 5 live. The BBC's Euro 2016 digital service will feature live broadcasts of all BBC games, commentary and live text coverage of every game featuring in-game highlight clips, on-demand highlights of every goal, breaking news, exclusive features and expert analysis. ITV will be home to 26 live games across ITV1 and ITV4, which will be simulcast live on the ITV Hub. READ MORE: Ferdinand and Henry join BBC coverage for the Euros The French government has extended a state of emergency imposed after the Paris attacks in November to cover Euro 2016. The coordinated gun and bomb attacks in Paris on 13 November killed 130 people and were claimed by so-called Islamic State. The Stade de France, which was targeted by suicide bombers, will host the opening match of Euro 2016 and the final. The current state of emergency gives police extra powers to conduct searches and put people under house arrest. More than 90,000 police, soldiers and private security agents are being deployed as well. US warns of Euro 2016 terror threat\n\nSummarize the above article in 1 sentence.\n"
34 | prompt = f"Write a SHORT summary of the following text delimited by triple backticks. Return your response which covers the key points of the text.\n```{article}```"
35 | input_prompt = template.format(inst=prompt)
36 | 
37 | # Define eviction policy
38 | kv_policy = 'roco'
39 | 
40 | for budget in [1.0, 0.5]:
41 |     # Define sampling parameters
42 |     gen_kwargs = dict(
43 |         temperature=0.3,
44 |         top_p=1.0,
45 |         max_new_tokens=256,
46 |         budget=budget,
47 |         kv_policy=kv_policy,
48 |         keep_attention=True
49 |     )
50 |     input_ids = tokenizer([input_prompt], return_tensors='pt').input_ids.to(model.device)
51 |     output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
52 |     print(f"{'='*20} EasyKV-{kv_policy} {'='*20}\n{output}")


--------------------------------------------------------------------------------
/easykv/llama_patch.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import warnings
  3 | from typing import List, Optional, Tuple, Union
  4 | 
  5 | import torch
  6 | import torch.nn.functional as F
  7 | import torch.utils.checkpoint
  8 | from torch import nn
  9 | from transformers.cache_utils import Cache
 10 | from transformers.models.llama.modeling_llama import *
 11 | 
 12 | 
 13 | def rotate_half(x):
 14 |     """Rotates half the hidden dims of the input."""
 15 |     x1 = x[..., : x.shape[-1] // 2]
 16 |     x2 = x[..., x.shape[-1] // 2:]
 17 |     return torch.cat((-x2, x1), dim=-1)
 18 | 
 19 | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 20 |     """
 21 |     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
 22 |     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
 23 |     """
 24 |     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
 25 |     if n_rep == 1:
 26 |         return hidden_states
 27 |     hidden_states = hidden_states[:, :, None, :, :].expand(
 28 |         batch, num_key_value_heads, n_rep, slen, head_dim)
 29 |     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 30 | 
 31 | def apply_grouped_rotary_pos_emb(q, k, cos, sin, position_ids, g_size_1=8, g_size_2=1024):
 32 |     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
 33 |     position_ids_q = position_ids//g_size_1 + g_size_2 - g_size_2//g_size_1
 34 |     position_ids_k = position_ids//g_size_1
 35 | 
 36 |     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
 37 |     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
 38 |     cos_q = cos[position_ids_q].unsqueeze(1)  # [bs, 1, seq_len, dim]
 39 |     sin_q = sin[position_ids_q].unsqueeze(1)  # [bs, 1, seq_len, dim]
 40 |     cos_k = cos[position_ids_k].unsqueeze(1)  # [bs, 1, seq_len, dim]
 41 |     sin_k = sin[position_ids_k].unsqueeze(1)  # [bs, 1, seq_len, dim]
 42 |     q_embed = (q * cos_q) + (rotate_half(q) * sin_q) 
 43 |     k_embed = (k * cos_k) + (rotate_half(k) * sin_k) 
 44 |     return q_embed, k_embed
 45 | 
 46 | 
 47 | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
 48 |     """Applies Rotary Position Embedding to the query and key tensors.
 49 | 
 50 |     Args:
 51 |         q (`torch.Tensor`): The query tensor.
 52 |         k (`torch.Tensor`): The key tensor.
 53 |         cos (`torch.Tensor`): The cosine part of the rotary embedding.
 54 |         sin (`torch.Tensor`): The sine part of the rotary embedding.
 55 |         position_ids (`torch.Tensor`):
 56 |             The position indices of the tokens corresponding to the query and key tensors. For example, this can be
 57 |             used to pass offsetted position ids when working with a KV-cache.
 58 |         unsqueeze_dim (`int`, *optional*, defaults to 1):
 59 |             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
 60 |             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
 61 |             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
 62 |             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
 63 |             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
 64 |             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 65 |     Returns:
 66 |         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
 67 |     """
 68 |     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
 69 |     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
 70 |     q_embed = (q * cos) + (rotate_half(q) * sin)
 71 |     k_embed = (k * cos) + (rotate_half(k) * sin)
 72 |     return q_embed, k_embed
 73 | 
 74 | def apply_rotary_pos_emb_sep(x, cos, sin, position_ids, unsqueeze_dim=1):
 75 |     """Applies Rotary Position Embedding to the query and key tensors.
 76 | 
 77 |     Args:
 78 |         q (`torch.Tensor`): The query tensor.
 79 |         k (`torch.Tensor`): The key tensor.
 80 |         cos (`torch.Tensor`): The cosine part of the rotary embedding.
 81 |         sin (`torch.Tensor`): The sine part of the rotary embedding.
 82 |         position_ids (`torch.Tensor`):
 83 |             The position indices of the tokens corresponding to the query and key tensors. For example, this can be
 84 |             used to pass offsetted position ids when working with a KV-cache.
 85 |         unsqueeze_dim (`int`, *optional*, defaults to 1):
 86 |             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
 87 |             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
 88 |             that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
 89 |             k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
 90 |             cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
 91 |             the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 92 |     Returns:
 93 |         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
 94 |     """
 95 |     cos = cos[position_ids].unsqueeze(unsqueeze_dim)
 96 |     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
 97 |     x_embed = (x * cos) + (rotate_half(x) * sin)
 98 |     return x_embed
 99 | 
100 | 
101 | def apply_neighbor_rotary_pos_emb(q, k, cos, sin, position_ids, g_size=1):
102 |     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
103 |     position_ids = position_ids % g_size
104 | 
105 |     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
106 |     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
107 |     cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
108 |     sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
109 |     q_embed = (q * cos) + (rotate_half(q) * sin)
110 |     k_embed = (k * cos) + (rotate_half(k) * sin)
111 |     return q_embed, k_embed
112 | 
113 | def apply_identical_rotary_pos_emb(q, k, cos, sin, position_ids, idd_position=1024):
114 |     position_ids = torch.ones_like(position_ids) * idd_position
115 | 
116 |     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
117 |     sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
118 |     cos = cos[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
119 |     sin = sin[position_ids].unsqueeze(1)  # [bs, 1, seq_len, dim]
120 |     q_embed = (q * cos) + (rotate_half(q) * sin)
121 |     k_embed = (k * cos) + (rotate_half(k) * sin)
122 |     return q_embed, k_embed
123 | 
124 | 
125 | def llama_forward(
126 |     self,
127 |     hidden_states: torch.Tensor,
128 |     attention_mask: Optional[torch.Tensor] = None,
129 |     position_ids: Optional[torch.LongTensor] = None,
130 |     past_key_value: Optional[Cache] = None,
131 |     output_attentions: bool = False,
132 |     use_cache: bool = False,
133 |     attn_device = 'cuda',
134 |     **kwargs,
135 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
136 |     if "padding_mask" in kwargs:
137 |         warnings.warn(
138 |             "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
139 |         )
140 | 
141 |     bsz, q_len, _ = hidden_states.size()
142 | 
143 |     if self.config.pretraining_tp > 1:
144 |         key_value_slicing = (self.num_key_value_heads *
145 |                              self.head_dim) // self.config.pretraining_tp
146 |         query_slices = self.q_proj.weight.split(
147 |             (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
148 |         )
149 |         key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
150 |         value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
151 | 
152 |         query_states = [F.linear(hidden_states, query_slices[i])
153 |                         for i in range(self.config.pretraining_tp)]
154 |         query_states = torch.cat(query_states, dim=-1)
155 | 
156 |         key_states = [F.linear(hidden_states, key_slices[i])
157 |                       for i in range(self.config.pretraining_tp)]
158 |         key_states = torch.cat(key_states, dim=-1)
159 | 
160 |         value_states = [F.linear(hidden_states, value_slices[i])
161 |                         for i in range(self.config.pretraining_tp)]
162 |         value_states = torch.cat(value_states, dim=-1)
163 | 
164 |     else:
165 |         query_states = self.q_proj(hidden_states)
166 |         key_states = self.k_proj(hidden_states)
167 |         value_states = self.v_proj(hidden_states)
168 | 
169 |     query_states = query_states.view(
170 |         bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
171 |     key_states = key_states.view(
172 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
173 |     value_states = value_states.view(
174 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
175 | 
176 |     kv_seq_len = key_states.shape[-2]
177 |     if past_key_value is not None:
178 |         if self.layer_idx is None:
179 |             raise ValueError(
180 |                 f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
181 |                 "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
182 |                 "with a layer index."
183 |             )
184 |         kv_seq_len += past_key_value.get_usable_length(
185 |             kv_seq_len, self.layer_idx)
186 |     # for KV eviction purpose
187 |     max_Pos = torch.topk(position_ids[0], k=1, dim=-1)[0].cpu().item()+1
188 |     cos, sin = self.rotary_emb(
189 |         value_states, seq_len=max(kv_seq_len, max_Pos))
190 |     query_states, key_states = apply_rotary_pos_emb(
191 |         query_states, key_states, cos, sin, position_ids)
192 | 
193 |     if past_key_value is not None:
194 |         cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
195 |         key_states, value_states = past_key_value.update(
196 |             key_states, value_states, self.layer_idx, cache_kwargs)
197 | 
198 |     key_states = repeat_kv(key_states, self.num_key_value_groups)
199 |     value_states = repeat_kv(value_states, self.num_key_value_groups)
200 | 
201 |     attn_weights = torch.matmul(
202 |         query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
203 | 
204 |     if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
205 |         raise ValueError(
206 |             f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
207 |             f" {attn_weights.size()}"
208 |         )
209 | 
210 |     if attention_mask is not None:
211 |         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
212 |             raise ValueError(
213 |                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
214 |             )
215 |         attn_weights = attn_weights + attention_mask
216 | 
217 |     # upcast attention to fp32
218 |     attn_weights = nn.functional.softmax(
219 |         attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
220 |     attn_weights = nn.functional.dropout(
221 |         attn_weights, p=self.attention_dropout, training=self.training)
222 |     attn_output = torch.matmul(attn_weights, value_states)
223 | 
224 |     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
225 |         raise ValueError(
226 |             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
227 |             f" {attn_output.size()}"
228 |         )
229 | 
230 |     attn_output = attn_output.transpose(1, 2).contiguous()
231 | 
232 |     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
233 | 
234 |     if self.config.pretraining_tp > 1:
235 |         attn_output = attn_output.split(
236 |             self.hidden_size // self.config.pretraining_tp, dim=2)
237 |         o_proj_slices = self.o_proj.weight.split(
238 |             self.hidden_size // self.config.pretraining_tp, dim=1)
239 |         attn_output = sum([F.linear(attn_output[i], o_proj_slices[i])
240 |                           for i in range(self.config.pretraining_tp)])
241 |     else:
242 |         attn_output = self.o_proj(attn_output)
243 | 
244 |     if not output_attentions:
245 |         attn_weights = None
246 |     else: attn_weights = attn_weights.to(attn_device)
247 | 
248 |     return attn_output, attn_weights, past_key_value
249 | 
250 | 
251 | def llama_forward_stream(
252 |     self,
253 |     hidden_states: torch.Tensor,
254 |     attention_mask: Optional[torch.Tensor] = None,
255 |     position_ids: Optional[torch.LongTensor] = None,
256 |     past_key_value: Optional[Cache] = None,
257 |     output_attentions: bool = False,
258 |     use_cache: bool = False,
259 |     attn_device = 'cuda',
260 |     **kwargs,
261 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
262 |     if "padding_mask" in kwargs:
263 |         warnings.warn(
264 |             "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
265 |         )
266 | 
267 |     bsz, q_len, _ = hidden_states.size()
268 | 
269 |     if self.config.pretraining_tp > 1:
270 |         key_value_slicing = (self.num_key_value_heads *
271 |                              self.head_dim) // self.config.pretraining_tp
272 |         query_slices = self.q_proj.weight.split(
273 |             (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
274 |         )
275 |         key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
276 |         value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
277 | 
278 |         query_states = [F.linear(hidden_states, query_slices[i])
279 |                         for i in range(self.config.pretraining_tp)]
280 |         query_states = torch.cat(query_states, dim=-1)
281 | 
282 |         key_states = [F.linear(hidden_states, key_slices[i])
283 |                       for i in range(self.config.pretraining_tp)]
284 |         key_states = torch.cat(key_states, dim=-1)
285 | 
286 |         value_states = [F.linear(hidden_states, value_slices[i])
287 |                         for i in range(self.config.pretraining_tp)]
288 |         value_states = torch.cat(value_states, dim=-1)
289 | 
290 |     else:
291 |         query_states = self.q_proj(hidden_states)
292 |         key_states = self.k_proj(hidden_states)
293 |         value_states = self.v_proj(hidden_states)
294 | 
295 |     query_states = query_states.view(
296 |         bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
297 |     key_states = key_states.view(
298 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
299 |     value_states = value_states.view(
300 |         bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
301 | 
302 |     kv_seq_len = key_states.shape[-2]
303 |     if past_key_value is not None:
304 |         if self.layer_idx is None:
305 |             raise ValueError(
306 |                 f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
307 |                 "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
308 |                 "with a layer index."
309 |             )
310 |         prev_kv_length = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
311 |         query_position_ids = torch.arange(prev_kv_length, prev_kv_length+query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
312 |         key_position_ids = torch.arange(0, prev_kv_length+query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
313 |         kv_seq_len += past_key_value.get_usable_length(
314 |             kv_seq_len, self.layer_idx)
315 |     else:
316 |         query_position_ids = torch.arange(0, query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
317 |         key_position_ids = torch.arange(0, query_states.shape[-2], device=attn_device).unsqueeze(0).repeat(bsz, 1)
318 |     cos, sin = self.rotary_emb(
319 |         value_states, seq_len=kv_seq_len)
320 | 
321 |     if past_key_value is not None:
322 |         cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
323 |         key_states, value_states = past_key_value.update(
324 |             key_states, value_states, self.layer_idx, cache_kwargs)
325 | 
326 |     query_states = apply_rotary_pos_emb_sep(query_states, cos, sin, query_position_ids)
327 |     key_states = apply_rotary_pos_emb_sep(key_states, cos, sin, key_position_ids)
328 | 
329 |     key_states = repeat_kv(key_states, self.num_key_value_groups)
330 |     value_states = repeat_kv(value_states, self.num_key_value_groups)
331 | 
332 |     attn_weights = torch.matmul(
333 |         query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
334 | 
335 |     if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
336 |         raise ValueError(
337 |             f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
338 |             f" {attn_weights.size()}"
339 |         )
340 | 
341 |     if attention_mask is not None:
342 |         if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
343 |             raise ValueError(
344 |                 f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
345 |             )
346 |         attn_weights = attn_weights + attention_mask
347 | 
348 |     # upcast attention to fp32
349 |     attn_weights = nn.functional.softmax(
350 |         attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
351 |     attn_weights = nn.functional.dropout(
352 |         attn_weights, p=self.attention_dropout, training=self.training)
353 |     attn_output = torch.matmul(attn_weights, value_states)
354 | 
355 |     if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
356 |         raise ValueError(
357 |             f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
358 |             f" {attn_output.size()}"
359 |         )
360 | 
361 |     attn_output = attn_output.transpose(1, 2).contiguous()
362 | 
363 |     attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
364 | 
365 |     if self.config.pretraining_tp > 1:
366 |         attn_output = attn_output.split(
367 |             self.hidden_size // self.config.pretraining_tp, dim=2)
368 |         o_proj_slices = self.o_proj.weight.split(
369 |             self.hidden_size // self.config.pretraining_tp, dim=1)
370 |         attn_output = sum([F.linear(attn_output[i], o_proj_slices[i])
371 |                           for i in range(self.config.pretraining_tp)])
372 |     else:
373 |         attn_output = self.o_proj(attn_output)
374 | 
375 |     if not output_attentions:
376 |         attn_weights = None
377 |     else: attn_weights = attn_weights.to(attn_device)
378 | 
379 |     return attn_output, attn_weights, past_key_value
380 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">
  2 | <img src="./logo.png" alt="EasyKV" width="250" height="200"/>
  3 | <br>
  4 | EasyKV
  5 | </h1>
  6 | 
  7 | EasyKV is a Pytorch package dedicated for ***key-value cache constrained*** generative language model inference.
  8 | <p align="center">
  9 |   <a href="#update">Update</a> •
 10 |   <a href="#features">Features</a> •
 11 |   <a href="#installation">Installation</a> •
 12 |   <a href="#example-usage">Example</a> •
 13 |   <a href="#passkey-retrieval-example">Passkey Retrieval</a> •
 14 |   <a href="#summarization-example">Summarization</a> •
 15 |   <a href="#instruction-following">Instruction Following</a> •
 16 |   <a href="#perplexity">Perplexity</a> •
 17 |   <a href="#todos">Todos</a> •
 18 |   <a href="#acknowledgement">Acknowledgement</a>
 19 | </p>
 20 | 
 21 | ## Update
 22 | + [2024.2.12] The paper of EasyKV is now available on arxiv(https://arxiv.org/abs/2402.06262).
 23 | + [2024.1.28] Add support for streaming mode by setting ```streaming``` in generation config to ```True```.
 24 | + [2024.1.19] Add support for stopping condition during generation.
 25 | + [2024.1.17] Add ```auto``` mode. See [example](#auto-mode).
 26 | + [2024.1.17] Add examples for [perplexity computation](#perplexity) using LLaMa2-13B and DynamicNTK.
 27 | + [2024.1.16] Add examples for [Instruction Following](#instruction-following) using LLaMa2-7B-Chat.
 28 | + [2024.1.15] Add examples for [Passkey Retrieval](#passkey-retrieval-example) using long-context LLM(Vicuna-7B-16K) and DynamicNTK-scaled LLaMa2-7B-Chat.
 29 | + [2024.1.15] Add examples for [Summarization](#summarization-example) using LLaMa2-7B-Chat.
 30 | + [2024.1.14] Uploaded the standalone Pytorch implementation. Pypi package and paper describing the details of our integrated eviction policy design are coming soon.
 31 | 
 32 | ## Features
 33 | + Offer control over the memory budget allocated for the KV cache during LLM inference, with easy-to-use interface.
 34 | + Support both prompt encoding and auto-regressive decoding.
 35 | + Support Multi-Head Attention(MHA), Multi-Query Attention(MQA), and Grouped-Query Attention(GQA).
 36 | + Support LLaMa, LLaMa2, and Mistral.
 37 | + Support various stride for prompt encoding(larger stride leads to faster encoding).
 38 | 
 39 | ## Installation
 40 | First of all, clone this repo into your working directory.
 41 | ```bash
 42 | git clone https://github.com/DRSY/EasyKV.git
 43 | cd EasyKV
 44 | ```
 45 | Then import ```enable_fixed_kv``` in your Python script:
 46 | ```python
 47 | from easykv import enable_fixed_kv
 48 | ```
 49 | 
 50 | ## Example Usage
 51 | There are two different phases in LLM generative inference, i.e., prompt encoding and auto-regressive decoding. 
 52 | ### Prompt Encoding/Prefilling
 53 | For prefilling stage, please specify ```budget``` in the range of (0,1), e.g., 0.5, which leads to 50% savings in KV cache memory footprint.
 54 | ```python
 55 | import torch
 56 | from transformers import AutoModelForCausalLM, AutoTokenizer
 57 | 
 58 | # Define your model path and template in a dict MODEL_CONFIGS
 59 | model_name = 'zephyr_7b'
 60 | path = MODEL_CONFIGS[model_name]['path']
 61 | template = MODEL_CONFIGS[model_name]['template']
 62 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto').eval()
 63 | tokenizer = AutoTokenizer.from_pretrained(path)
 64 | 
 65 | # Turn on fixed KV cache mode for prefilling phase
 66 | stride=8
 67 | enable_fixed_kv(model, tokenizer, mode='encoding', stride=stride)
 68 | 
 69 | # Test input
 70 | article = "###\nArticle: It was the first time the Single Transferable Vote (STV) system had been used to select two members in the same ward in a by-election. The SNP topped the vote in the Leith Walk by-election, while Scottish Labour won the second seat from the Greens. The by-election was called after Deidre Brock of the SNP and Maggie Chapman of the Scottish Greens stood down. The SNP's John Lewis Ritchie topped the Leith Walk poll with 2,290 votes. He was elected at stage one in the STV process with a swing in first-preference votes of 7.6% from Labour. Labour's Marion Donaldson received 1,623 votes, ahead of Susan Jane Rae of the Scottish Greens on 1,381. Ms Donaldson was elected at stage 10 of the voting process after other preferences had been considered. The by-election was called after Ms Brock stood down when she was elected as the SNP MP for Edinburgh North and Leith in May. Ms Chapman, of the Scottish Greens, resigned from her post to concentrate on standing for the Scottish Parliament in next May's election. The turnout for the by-election was 25.1%. The SNP also held the Midlothian West seat on Midlothian Council with a swing of 6.3% from Labour. The party's Kelly Parry secured 1,540 votes, ahead of Labour's Ian Miller on 945 votes. The by-election was called after Owen Thompson was elected as SNP MP for the Midlothian constituency.\n\nSummarize the above article in 1 sentence.\n"
 71 | prompt = f"Write a SHORT summary of the following text delimited by triple backticks. Return your response which covers the key points of the text.\n```{article}```"
 72 | input_prompt = template.format(inst=prompt)
 73 | 
 74 | # Define eviction policy
 75 | kv_policy = 'roco'
 76 | # Define sampling parameters
 77 | gen_kwargs = dict(
 78 |     temperature=1e-9,
 79 |     top_p=1.0,
 80 |     max_new_tokens=256,
 81 |     budget=0.5,
 82 |     kv_policy=kv_policy,
 83 |     keep_attention=False, # set to True if your DRAM is not tight and you can get better performance
 84 |     eos_token_ids=[tokenizer.eos_token_id]
 85 | )
 86 | input_ids = tokenizer([input_prompt], return_tensors='pt').input_ids.to(model.device)
 87 | output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
 88 | print(f"{'='*20} {kv_policy} {'='*20}\n{output}")
 89 | ```
 90 | ### Auto-regressive Decoding
 91 | For auto-regressive decoding phase, please specify ```budget``` as an integer, which represents the maximum length of KV cache, e.g, 200.
 92 | ```python
 93 | import torch
 94 | from transformers import AutoModelForCausalLM, AutoTokenizer
 95 | 
 96 | # Define your model path and template in a dict MODEL_CONFIGS
 97 | model_name = 'llama2_7b_chat'
 98 | path = MODEL_CONFIGS[model_name]['path']
 99 | template = MODEL_CONFIGS[model_name]['template']
100 | model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map='auto').eval()
101 | tokenizer = AutoTokenizer.from_pretrained(path)
102 | 
103 | # Turn on fixed KV cache mode for decoding phase
104 | enable_fixed_kv(model, tokenizer, mode='decoding', stride=1)
105 | 
106 | # Test input
107 | prompt = f"What are the names of some famous actors that started their careers on Broadway?"
108 | input_prompt = template.format(inst=prompt)
109 | kv_policy = 'roco'
110 | # Define sampling parameters
111 | gen_kwargs = dict(
112 |     temperature=1e-9,
113 |     top_p=1.0,
114 |     max_new_tokens=2048,
115 |     budget=200,
116 |     kv_policy=kv_policy,
117 |     eos_token_ids=[tokenizer.eos_token_id]
118 | )
119 | input_ids = tokenizer([input_prompt], return_tensors='pt').input_ids.to(model.device)
120 | output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
121 | print(f"{'='*20} {kv_policy} {'='*20}\n{output}")
122 | ```
123 | ### Auto Mode
124 | In case both the prompt and generation are long, ```auto``` mode can help automatically handle KV cache throught the prefilling and decoding stages.
125 | ```python
126 | stride = 64 # stride for sliding window
127 | kv_policy = "roco" # cache eviction policy
128 | budget = 1024 # an integer specifying the maximum KV cache
129 | enable_fixed_kv(model, tokenizer, mode='auto', stride=stride)
130 | gen_kwargs = dict(
131 |     temperature=1e-9,
132 |     top_p=1.0,
133 |     max_new_tokens=64,
134 |     budget=budget, 
135 |     kv_policy=kv_policy,
136 |     keep_attention=False, # set to True if your DRAM is not tight and you can get better performance
137 |     eos_token_ids=[tokenizer.eos_token_id]
138 | )
139 | output = model.easykv_generate(input_ids=input_ids, generation_config=gen_kwargs)
140 | ```
141 | 
142 | ### Passkey Retrieval Example
143 | We provide examplar code for passkey retrieval in [test_passkey.py](./test_passkey.py) and [test_passkey_NTK.py](./test_passkey_NTK.py) using Vicuna-7B-16K and DynamicNTK-scaled LLaMa2-7B-Chat, respectively.
144 | 
145 | The results of DynamicNTK-scaled LLaMa2-7B-Chat on ```5K``` passkey retrieval task is shown below:
146 | ```bash
147 | #Tokens of Prompt: 5144 Passkey target: 89427
148 | KV cache budget ratio: 100.00%(5144/5144)
149 | Current GPU memory usage: 18.359 GB
150 | Peak GPU memory usage: 21.751 GB
151 | Llama2-EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 89427.]
152 | 
153 | KV cache budget ratio: 50.08%(2576/5144)
154 | Current GPU memory usage: 15.625 GB
155 | Peak GPU memory usage: 18.423 GB
156 | Llama2-EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 89427.]
157 | ------------------------------------------------------------------------------------
158 | #Tokens of Prompt: 5144 Passkey target: 51906
159 | KV cache budget ratio: 100.00%(5144/5144)
160 | Current GPU memory usage: 18.359 GB
161 | Peak GPU memory usage: 21.751 GB
162 | Llama2-EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 51906.]
163 | 
164 | KV cache budget ratio: 50.08%(2576/5144)
165 | Current GPU memory usage: 15.625 GB
166 | Peak GPU memory usage: 18.427 GB
167 | Llama2-EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 51906.]
168 | ------------------------------------------------------------------------------------
169 | #Tokens of Prompt: 5144 Passkey target: 38117
170 | KV cache budget ratio: 100.00%(5144/5144)
171 | Current GPU memory usage: 18.359 GB
172 | Peak GPU memory usage: 21.751 GB
173 | Llama2-EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 38117.]
174 | 
175 | KV cache budget ratio: 50.08%(2576/5144)
176 | Current GPU memory usage: 15.625 GB
177 | Peak GPU memory usage: 18.427 GB
178 | Llama2-EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 38117.]
179 | ------------------------------------------------------------------------------------
180 | #Tokens of Prompt: 5144 Passkey target: 60151
181 | KV cache budget ratio: 100.00%(5144/5144)
182 | Current GPU memory usage: 18.359 GB
183 | Peak GPU memory usage: 21.751 GB
184 | Llama2-EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 60151.]
185 | 
186 | KV cache budget ratio: 50.08%(2576/5144)
187 | Current GPU memory usage: 15.625 GB
188 | Peak GPU memory usage: 18.427 GB
189 | Llama2-EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 60151.]
190 | ------------------------------------------------------------------------------------
191 | #Tokens of Prompt: 5144 Passkey target: 23789
192 | KV cache budget ratio: 100.00%(5144/5144)
193 | Current GPU memory usage: 18.359 GB
194 | Peak GPU memory usage: 21.752 GB
195 | Llama2-EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 23789.]
196 | 
197 | KV cache budget ratio: 50.08%(2576/5144)
198 | Current GPU memory usage: 15.626 GB
199 | Peak GPU memory usage: 18.427 GB
200 | Llama2-EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 23789.]
201 | ```
202 | 
203 | The results of Vicuna-7B-16K on ```10K``` passkey retrieval task is shown below:
204 | ```bash
205 | #Tokens of Prompt: 9994 Passkey target: 51013
206 | KV cache budget ratio: 100.00%(9994/9994)
207 | Current GPU memory usage: 23.666 GB
208 | Peak GPU memory usage: 41.896 GB
209 | EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 51013.]
210 | 
211 | KV cache budget ratio: 50.05%(5002/9994)
212 | Current GPU memory usage: 18.4 GB
213 | Peak GPU memory usage: 25.36 GB
214 | EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 51013.]
215 | ------------------------------------------------------------------------------------
216 | #Tokens of Prompt: 9994 Passkey target: 36920
217 | KV cache budget ratio: 100.00%(9994/9994)
218 | Current GPU memory usage: 23.666 GB
219 | Peak GPU memory usage: 41.896 GB
220 | EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 36920.]
221 | 
222 | KV cache budget ratio: 50.05%(5002/9994)
223 | Current GPU memory usage: 18.378 GB
224 | Peak GPU memory usage: 25.36 GB
225 | EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 36920.]
226 | ------------------------------------------------------------------------------------
227 | #Tokens of Prompt: 9994 Passkey target: 83493
228 | KV cache budget ratio: 100.00%(9994/9994)
229 | Current GPU memory usage: 23.666 GB
230 | Peak GPU memory usage: 41.896 GB
231 | EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 83493.]
232 | 
233 | KV cache budget ratio: 50.05%(5002/9994)
234 | Current GPU memory usage: 18.378 GB
235 | Peak GPU memory usage: 25.36 GB
236 | EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 83493.]
237 | ------------------------------------------------------------------------------------
238 | #Tokens of Prompt: 9994 Passkey target: 78585
239 | KV cache budget ratio: 100.00%(9994/9994)
240 | Current GPU memory usage: 23.666 GB
241 | Peak GPU memory usage: 41.896 GB
242 | EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 78585.]
243 | 
244 | KV cache budget ratio: 50.05%(5002/9994)
245 | Current GPU memory usage: 18.378 GB
246 | Peak GPU memory usage: 25.36 GB
247 | EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 78585.]
248 | ------------------------------------------------------------------------------------
249 | #Tokens of Prompt: 9994 Passkey target: 58328
250 | KV cache budget ratio: 100.00%(9994/9994)
251 | Current GPU memory usage: 23.666 GB
252 | Peak GPU memory usage: 41.896 GB
253 | EasyKV-h2o_head_std_avg(100.00%):     [What is the pass key? The pass key is 58328.]
254 | 
255 | KV cache budget ratio: 50.05%(5002/9994)
256 | Current GPU memory usage: 18.378 GB
257 | Peak GPU memory usage: 25.36 GB
258 | EasyKV-h2o_head_std_avg(50.00%):     [What is the pass key? The pass key is 58328.]
259 | ```
260 | 
261 | ### Summarization Example
262 | We provide examplar code for summarization in [test_summarization.py](./test_summarization.py).
263 | The results of full KV cache and 50%-constrained KV cache using EasyKV is shown below:
264 | ```bash
265 | EasyKV(100.00%): The 2016 European Championship, also known as Euro 2016, will take place in France from June 10 to July 10, featuring 24 teams, including France, Spain, Germany, England, Wales, and Northern Ireland, with the tournament kicking off with France playing Romania on Friday, June 10, and the final taking place at the Stade de France in Paris on July 10.
266 | EasyKV(50.00%): The 2016 European Championship, also known as Euro 2016, will be held in France from June 10th to July 10th, featuring 24 teams, including defending champions Spain, and will be marked by a number of changes to the rules of the game, as well as increased security measures due to the ongoing terror threat.
267 | ```
268 | 
269 | ### Instruction Following
270 | We provide examplar code for instruction-following in [test_decoding.py](./test_decoding.py).
271 | The results of EasyKV using different KV budget(300/150) with LLaMa2-7B-Chat are shown below:
272 | ```bash
273 | Instruction: What are the names of some famous actors that started their careers on Broadway?
274 | KV cache budget ratio: 58.14%(300/516)
275 | ==================== EasyKV-h2o_head_decay_avg_std-300 ====================
276 | Thank you for your question! There are many talented actors who have started their careers on Broadway and gone on to achieve great success in the entertainment industry. Here are some famous actors who got their start on Broadway:
277 | 1. Hugh Jackman: Known for his roles in "The Boy from Oz" and "The Greatest Showman," Jackman got his start on Broadway in the musical "Oklahoma!"
278 | 2. Audra McDonald: A six-time Tony Award winner, McDonald got her start on Broadway in the musical "Ragtime."
279 | 3. Idina Menzel: Menzel got her start on Broadway in the musical "Rent" and went on to star in "Wicked" and "Frozen."
280 | 4. Lin-Manuel Miranda: Miranda got his start on Broadway in the musical "In the Heights" and went on to create the hit musical "Hamilton."
281 | 5. Bernadette Peters: A legendary actress and singer, Peters got her start on Broadway in the musical "The Beautyful Nose" and has since starred in numerous productions, including "Gypsy" and "Sweeney Todd."
282 | 6. James Corden: Corden got his start on Broadway in the musical "Les Miserables" before becoming a late-night talk show host on "The Late Late Show with James Corden."
283 | 7. Christine Baranski: Baranski got her start on Broadway in the musical "The Producers" and has since appeared in numerous productions, including "The Good Wife" and "The Good Fight."
284 | 8. Nathan Lane: Lane got his start on Broadway in the musical "A Funny Thing Happened on the Way to the Forum" and has since starred in numerous productions, including "The Producers" and "It's Only a Play."
285 | 9. Bette Midler: Midler got her start on Broadway in the musical "Fiddler on the Roof" and went on to star in "Wicked" and "Hello, Dolly!"
286 | 10. John Leguizamo: Leguizamo got his start on Broadway in the play "A Day in the Death of Jose Marti" and has since appeared in numerous productions, including "Spanglish" and "The Lion King."
287 | I hope this list helps! Let me know if you have any other questions.
288 | KV cache budget ratio: 28.30%(150/530)
289 | ==================== EasyKV-h2o_head_decay_avg_std-150 ====================
290 | Thank you for your question! There are many talented actors who have started their careers on Broadway and gone on to achieve great success in the entertainment industry. Here are some famous actors who got their start on Broadway:
291 | 1. Hugh Jackman: Known for his roles in "The Boy from Oz" and "The Greatest Showman," Jackman got his start on Broadway in the musical "Oklahoma!"
292 | 2. Audra McDonald: A six-time Tony Award winner, McDonald got her start on Broadway in the musical "Ragtime."
293 | 3. Idina Menzel: Menzel got her start on Broadway in the musical "Rent" and went on to star in "Wicked" and "Frozen."
294 | 4. Lin-Manuel Miranda: Miranda got his start on Broadway in the musical "In the Heights" and went on to create the hit musical "Hamilton."
295 | 5. Bernadette Peters: A legendary actress and singer, Peters got her start on Broadway in the musical "The Beautyful Nose" and has since starred in numerous Broadway productions.
296 | 6. James Corden: Corden got his start on Broadway in the musical "Les Miserables" before becoming a late-night talk show host on "The Late Late Show with James Corden."
297 | 7. Christine Baranski: Baranski got her start on Broadway in the musical "The Producers" before going on to star in the TV show "The Good Wife" and the movie "The Big Sick."
298 | 8. Nathan Lane: Lane got his start on Broadway in the musical "A Funny Thing Happened on the Way to the Forum" and has since starred in numerous Broadway productions, including "The Producers" and "The Birdcage."
299 | 9. Bette Midler: Midler got her start on Broadway in the musical "Fiddler on the Roof" before going on to star in the TV show "The Rose" and the movie "Hocus Pocus."
300 | 10. John Leguizamo: Leguizamo got his start on Broadway in the play "A Day in the Death of Jose Marti" before going on to star in numerous TV shows and movies, including "ER" and "Ice Age."
301 | These are just a few examples of actors who got their start on Broadway. There are many other talented actors who have also gotten their start on the Great White Way.
302 | ```
303 | 
304 | ### Perplexity
305 | We provide examplar code for perplexity computation in [test_ppl.py](./test_ppl.py).
306 | The result with LLaMa2-13B with DynamicNTK on 10000-token document([doc.txt](./doc.txt)) is shown below:
307 | ```bash
308 | DynamicNTKRoPE max length reset to 11000
309 | Fixed KV Cache for ppl enabled
310 | 
311 | Input token length: 10253
312 | EasyKV-100.00% PPL: 7.44
313 | ------------------------------------------
314 | KV cache budget ratio: 50.38%(5165/10253)
315 | EasyKV-recency-50.00% PPL: 7.68
316 | ------------------------------------------
317 | KV cache budget ratio: 50.38%(5165/10253)
318 | EasyKV-h2o_head_std_avg-50.00% PPL: 7.47
319 | ```
320 | 
321 | ## List of Supported KV Eviction Policies:
322 | + random: drop kv cache of a randomly chosen position.
323 | + recency: similar to StreamingLLM, dropping the least recent token's kv cache.
324 | + h2o_head: Heavy-hitter oracle, which drops kv cache whose accumulated attention score is smallest.
325 | + tova: Token Omission Via Attention, which uses attention weights of the last token only.
326 | + roco: newly proposed eviction policy with better evivtion candidate selection and importance estimation.
327 | 
328 | 
329 | ## Todos
330 | + [x] Add ```auto``` mode so that users don't have to manually specify ```encoding``` or ```decoding```.
331 | + [ ] Add [LongBench](https://github.com/THUDM/LongBench/tree/main?tab=readme-ov-file#how-to-evaluate-on-LongBench) evaluation.
332 | + [ ] Add filtering mechanism to prevent dropping KV cache of specified tokens.
333 | 
334 | 
335 | ## Acknowledgement
336 | ```latex
337 | @article{xiao2023efficient,
338 |   title={Efficient streaming language models with attention sinks},
339 |   author={Xiao, Guangxuan and Tian, Yuandong and Chen, Beidi and Han, Song and Lewis, Mike},
340 |   journal={arXiv preprint arXiv:2309.17453},
341 |   year={2023}
342 | }
343 | 
344 | @article{liu2023scissorhands,
345 |   title={Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression at Test Time},
346 |   author={Liu, Zichang and Desai, Aditya and Liao, Fangshuo and Wang, Weitao and Xie, Victor and Xu, Zhaozhuo and Kyrillidis, Anastasios and Shrivastava, Anshumali},
347 |   journal={arXiv preprint arXiv:2305.17118},
348 |   year={2023}
349 | }
350 | 
351 | @article{zhang2023h,
352 |   title={H $ \_2 $ O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models},
353 |   author={Zhang, Zhenyu and Sheng, Ying and Zhou, Tianyi and Chen, Tianlong and Zheng, Lianmin and Cai, Ruisi and Song, Zhao and Tian, Yuandong and R{\'e}, Christopher and Barrett, Clark and others},
354 |   journal={arXiv preprint arXiv:2306.14048},
355 |   year={2023}
356 | }
357 | 
358 | @article{oren2024transformers,
359 |   title={Transformers are Multi-State RNNs},
360 |   author={Oren, Matanel and Hassid, Michael and Adi, Yossi and Schwartz, Roy},
361 |   journal={arXiv preprint arXiv:2401.06104},
362 |   year={2024}
363 | }
364 | ```


--------------------------------------------------------------------------------
/doc.txt:
--------------------------------------------------------------------------------
1 | Chapter: "Mine ear is open, and my heart prepared: The worst is worldly loss thou canst unfold: Say, is my kingdom lost?" SHAKESPEARE. It was a feature peculiar to the colonial wars of North America, that the toils and dangers of the wilderness were to be encountered before the adverse hosts could meet. A wide and apparently an impervious boundary of forests severed the possessions of the hostile provinces of France and England. The hardy colonist, and the trained European who fought at his side, frequently expended months in struggling against the rapids of the streams, or in effecting the rugged passes of the mountains, in quest of an opportunity to exhibit their courage in a more martial conflict. But, emulating the patience and self-denial of the practised native warriors, they learned to overcome every difficulty; and it would seem that, in time, there was no recess of the woods so dark, nor any secret place so lovely, that it might claim exemption from the inroads of those who had pledged their blood to satiate their vengeance, or to uphold the cold and selfish policy of the distant monarchs of Europe. Perhaps no district throughout the wide extent of the intermediate frontiers can furnish a livelier picture of the cruelty and fierceness of the savage warfare of those periods than the country which lies between the head waters of the Hudson and the adjacent lakes. The facilities which nature had there offered to the march of the combatants were too obvious to be neglected. The lengthened sheet of the Champlain stretched from the frontiers of Canada, deep within the borders of the neighboring province of New York, forming a natural passage across half the distance that the French were compelled to master in order to strike their enemies. Near its southern termination, it received the contributions of another lake, whose waters were so limpid as to have been exclusively selected by the Jesuit missionaries to perform the typical purification of baptism, and to obtain for it the title of lake "du Saint Sacrement." The less zealous English thought they conferred a sufficient honor on its unsullied fountains, when they bestowed the name of their reigning prince, the second of the house of Hanover. The two united to rob the untutored possessors of its wooded scenery of their native right to perpetuate its original appellation of "Horican."[1] Winding its way among countless islands, and imbedded in mountains, the "holy lake" extended a dozen leagues still farther to the south. With the high plain that there interposed itself to the further passage of the water, commenced a portage of as many miles, which conducted the adventurer to the banks of the Hudson, at a point where, with the usual obstructions of the rapids, or rifts, as they were then termed in the language of the country, the river became navigable to the tide. While, in the pursuit of their daring plans of annoyance, the restless enterprise of the French even attempted the distant and difficult gorges of the Alleghany, it may easily be imagined that their proverbial acuteness would not overlook the natural advantages of the district we have just described. It became, emphatically, the bloody arena, in which most of the battles for the mastery of the colonies were contested. Forts were erected at the different points that commanded the facilities of the route, and were taken and retaken, razed and rebuilt, as victory alighted on the hostile banners. While the husbandman shrank back from the dangerous passes, within the safer boundaries of the more ancient settlements, armies larger than those that had often disposed of the sceptres of the mother countries, were seen to bury themselves in these forests, whence they rarely returned but in skeleton bands, that were haggard with care, or dejected by defeat. Though the arts of peace were unknown to this fatal region, its forests were alive with men; its shades and glens rang with the sounds of martial music, and the echoes of its mountains threw back the laugh, or repeated the wanton cry, of many a gallant and reckless youth, as he hurried by them, in the noontide of his spirits, to slumber in a long night of forgetfulness. It was in this scene of strife and bloodshed that the incidents we shall attempt to relate occurred, during the third year of the war which England and France last waged for the possession of a country that neither was destined to retain. The imbecility of her military leaders abroad, and the fatal want of energy in her councils at home, had lowered the character of Great Britain from the proud elevation on which it had been placed, by the talents and enterprise of her former warriors and statesmen. No longer dreaded by her enemies, her servants were fast losing the confidence of self-respect. In this mortifying abasement, the colonists, though innocent of her imbecility, and too humble to be the agents of her blunders, were but the natural participators. They had recently seen a chosen army from that country, which, reverencing as a mother, they had blindly believed invincible--an army led by a chief who had been selected from a crowd of trained warriors, for his rare military endowments, disgracefully routed by a handful of French and Indians, and only saved from annihilation by the coolness and spirit of a Virginian boy, whose riper fame has since diffused itself, with the steady influence of moral truth, to the uttermost confines of Christendom.[2] A wide frontier had been laid naked by this unexpected disaster, and more substantial evils were preceded by a thousand fanciful and imaginary dangers. The alarmed colonists believed that the yells of the savages mingled with every fitful gust of wind that issued from the interminable forests of the west. The terrific character of their merciless enemies increased immeasurably the natural horrors of warfare. Numberless recent massacres were still vivid in their recollections; nor was there any ear in the provinces so deaf as not to have drunk in with avidity the narrative of some fearful tale of midnight murder, in which the natives of the forests were the principal and barbarous actors. As the credulous and excited traveller related the hazardous chances of the wilderness, the blood of the timid curdled with terror, and mothers cast anxious glances even at those children which slumbered within the security of the largest towns. In short, the magnifying influence of fear began to set at naught the calculations of reason, and to render those who should have remembered their manhood, the slaves of the basest of passions. Even the most confident and the stoutest hearts began to think the issue of the contest was becoming doubtful; and that abject class was hourly increasing in numbers, who thought they foresaw all the possessions of the English crown in America subdued by their Christian foes, or laid waste by the inroads of their relentless allies. When, therefore, intelligence was received at the fort, which covered the southern termination of the portage between the Hudson and the lakes, that Montcalm had been seen moving up the Champlain, with an army "numerous as the leaves on the trees," its truth was admitted with more of the craven reluctance of fear than with the stern joy that a warrior should feel, in finding an enemy within reach of his blow. The news had been brought, towards the decline of a day in midsummer, by an Indian runner, who also bore an urgent request from Munro, the commander of a work on the shore of the "holy lake," for a speedy and powerful reinforcement. It has already been mentioned that the distance between these two posts was less than five leagues. The rude path, which originally formed their line of communication, had been widened for the passage of wagons; so that the distance which had been travelled by the son of the forest in two hours, might easily be effected by a detachment of troops, with their necessary baggage, between the rising and setting of a summer sun. The loyal servants of the British crown had given to one of these forest fastnesses the name of William Henry, and to the other that of Fort Edward; calling each after a favorite prince of the reigning family. The veteran Scotchman just named held the first, with a regiment of regulars and a few provincials; a force really by far too small to make head against the formidable power that Montcalm was leading to the foot of his earthen mounds. At the latter, however, lay General Webb, who commanded the armies of the king in the northern provinces, with a body of more than five thousand men. By uniting the several detachments of his command, this officer might have arrayed nearly double that number of combatants against the enterprising Frenchman, who had ventured so far from his reinforcements, with an army but little superior in numbers. But under the influence of their degraded fortunes, both officers and men appeared better disposed to await the approach of their formidable antagonists, within their works, than to resist the progress of their march, by emulating the successful example of the French at Fort du Quesne, and striking a blow on their advance. After the first surprise of the intelligence had a little abated, a rumor was spread through the entrenched camp, which stretched along the margin of the Hudson, forming a chain of outworks to the body of the fort itself, that a chosen detachment of fifteen hundred men was to depart, with the dawn, for William Henry, the post at the northern extremity of the portage. That which at first was only rumor, soon became certainty, as orders passed from the quarters of the commander-in-chief to the several corps he had selected for this service, to prepare for their speedy departure. All doubt as to the intention of Webb now vanished, and an hour or two of hurried footsteps and anxious faces succeeded. The novice in the military art flew from point to point, retarding his own preparations by the excess of his violent and somewhat distempered zeal; while the more practised veteran made his arrangements with a deliberation that scorned every appearance of haste; though his sober lineaments and anxious eye sufficiently betrayed that he had no very strong professional relish for the as yet untried and dreaded warfare of the wilderness. At length the sun set in a flood of glory, behind the distant western hills, and as darkness drew its veil around the secluded spot the sounds of preparation diminished; the last light finally disappeared from the log cabin of some officer; the trees cast their deeper shadows over the mounds and the rippling stream, and a silence soon pervaded the camp, as deep as that which reigned in the vast forest by which it was environed. According to the orders of the preceding night, the heavy sleep of the army was broken by the rolling of the warning drums, whose rattling echoes were heard issuing, on the damp morning air, out of every vista of the woods, just as day began to draw the shaggy outlines of some tall pines of the vicinity, on the opening brightness of a soft and cloudless eastern sky. In an instant the whole camp was in motion; the meanest soldier arousing from his lair to witness the departure of his comrades, and to share in the excitement and incidents of the hour. The simple array of the chosen band was soon completed. While the regular and trained hirelings of the king marched with haughtiness to the right of the line, the less pretending colonists took their humbler position on its left, with a docility that long practice had rendered easy. The scouts departed; strong guards preceded and followed the lumbering vehicles that bore the baggage; and before the gray light of the morning was mellowed by the rays of the sun, the main body of the combatants wheeled into column, and left the encampment with a show of high military bearing, that served to drown the slumbering apprehensions of many a novice, who was now about to make his first essay in arms. While in view of their admiring comrades, the same proud front and ordered array was observed, until the notes of their fifes growing fainter in distance, the forest at length appeared to swallow up the living mass which had slowly entered its bosom. The deepest sounds of the retiring and invisible column had ceased to be borne on the breeze to the listeners, and the latest straggler had already disappeared in pursuit; but there still remained the signs of another departure, before a log cabin of unusual size and accommodations, in front of which those sentinels paced their rounds, who were known to guard the person of the English general. At this spot were gathered some half dozen horses, caparisoned in a manner which showed that two, at least, were destined to bear the persons of females, of a rank that it was not usual to meet so far in the wilds of the country. A third wore the trappings and arms of an officer of the staff; while the rest, from the plainness of the housings, and the travelling mails with which they were encumbered, were evidently fitted for the reception of as many menials, who were, seemingly, already awaiting the pleasure of those they served. At a respectful distance from this unusual show were gathered divers groups of curious idlers; some admiring the blood and bone of the high-mettled military charger, and others gazing at the preparations, with dull wonder of vulgar curiosity. There was one man, however, who, by his countenance and actions, formed a marked exception to those who composed the latter class of spectators, being neither idle, nor seemingly very ignorant. The person of this individual was to the last degree ungainly, without being in any particular manner deformed. He had all the bones and joints of other men, without any of their proportions. Erect, his stature surpassed that of his fellows; seated, he appeared reduced within the ordinary limits of the race. The same contrariety in his members seemed to exist throughout the whole man. His head was large; his shoulders narrow; his arms long and dangling; while his hands were small, if not delicate. His legs and thighs were thin, nearly to emaciation, but of extraordinary length; and his knees would have been considered tremendous, had they not been outdone by the broader foundations on which this false superstructure of the blended human orders was so profanely reared. The ill-assorted and injudicious attire of the individual only served to render his awkwardness more conspicuous. A sky-blue coat, with short and broad skirts and low cape, exposed a long thin neck, and longer and thinner legs, to the worst animadversions of the evil disposed. His nether garment was of yellow nankeen, closely fitted to the shape, and tied at his bunches of knees by large knots of white ribbon, a good deal sullied by use. Clouded cotton stockings, and shoes, on one of the latter of which was a plated spur, completed the costume of the lower extremity of this figure, no curve or angle of which was concealed, but, on the other hand, studiously exhibited, through the vanity or simplicity of its owner. From beneath the flap of an enormous pocket of a soiled vest of embossed silk, heavily ornamented with tarnished silver lace, projected an instrument, which, from being seen in such martial company, might have been easily mistaken for some mischievous and unknown implement of war. Small as it was, this uncommon engine had excited the curiosity of most of the Europeans in the camp, though several of the provincials were seen to handle it, not only without fear, but with the utmost familiarity. A large, civil cocked hat, like those worn by clergymen within the last thirty years, surmounted the whole, furnishing dignity to a good-natured and somewhat vacant countenance, that apparently needed such artificial aid, to support the gravity of some high and extraordinary trust. While the common herd stood aloof, in deference to the quarters of Webb, the figure we have described stalked in the centre of the domestics, freely expressing his censures or commendations on the merits of the horses, as by chance they displeased or satisfied his judgment. "This beast, I rather conclude, friend, is not of home raising, but is from foreign lands, or perhaps from the little island itself over the blue water?" he said, in a voice as remarkable for the softness and sweetness of its tones, as was his person for its rare proportions: "I may speak of these things, and be no braggart; for I have been down at both havens; that which is situate at the mouth of Thames, and is named after the capital of Old England, and that which is called 'Haven,' with the addition of the word 'New'; and have seen the snows and brigantines collecting their droves, like the gathering to the ark, being outward bound to the Island of Jamaica, for the purpose of barter and traffic in four-footed animals; but never before have I beheld a beast which verified the true Scripture war-horse like this: 'He paweth in the valley, and rejoiceth in his strength: he goeth on to meet the armed men. He saith among the trumpets, Ha, ha; and he smelleth the battle afar off, the thunder of the captains, and the shouting.' It would seem that the stock of the horse of Israel has descended to our own time; would it not, friend?" Receiving no reply to this extraordinary appeal, which in truth, as it was delivered with the vigor of full and sonorous tones, merited some sort of notice, he who had thus sung forth the language of the Holy Book turned to the silent figure to whom he had unwittingly addressed himself, and found a new and more powerful subject of admiration in the object that encountered his gaze. His eyes fell on the still, upright, and rigid form of the "Indian runner," who had borne to the camp the unwelcome tidings of the preceding evening. Although in a state of perfect repose, and apparently disregarding, with characteristic stoicism, the excitement and bustle around him, there was a sullen fierceness mingled with the quiet of the savage, that was likely to arrest the attention of much more experienced eyes than those which now scanned him, in unconcealed amazement. The native bore both the tomahawk and knife of his tribe; and yet his appearance was not altogether that of a warrior. On the contrary, there was an air of neglect about his person, like that which might have proceeded from great and recent exertion, which he had not yet found leisure to repair. The colors of the war-paint had blended in dark confusion about his fierce countenance, and rendered his swarthy lineaments still more savage and repulsive than if art had attempted an effect which had been thus produced by chance. His eye, alone, which glistened like a fiery star amid lowering clouds, was to be seen in its state of native wildness. For a single instant, his searching and yet wary glance met the wondering look of the other, and then changing its direction, partly in cunning, and partly in disdain, it remained fixed, as if penetrating the distant air. It is impossible to say what unlooked-for remark this short and silent communication, between two such singular men, might have elicited from the white man, had not his active curiosity been again drawn to other objects. A general movement among the domestics, and a low sound of gentle voices, announced the approach of those whose presence alone was wanted to enable the cavalcade to move. The simple admirer of the war-horse instantly fell back to a low, gaunt, switch-tailed mare, that was unconsciously gleaning the faded herbage of the camp nigh by; where, leaning with one elbow on the blanket that concealed an apology for a saddle, he became a spectator of the departure, while a foal was quietly making its morning repast, on the opposite side of the same animal. A young man, in the dress of an officer, conducted to their steeds two females, who, as it was apparent by their dresses, were prepared to encounter the fatigues of a journey in the woods. One, and she was the most juvenile in her appearance, though both were young, permitted glimpses of her dazzling complexion, fair golden hair, and bright blue eyes, to be caught, as she artlessly suffered the morning air to blow aside the green veil which descended low from her beaver. The flush which still lingered above the pines in the western sky was not more bright nor delicate than the bloom on her cheek; nor was the opening day more cheering than the animated smile which she bestowed on the youth, as he assisted her into the saddle. The other, who appeared to share equally in the attentions of the young officer, concealed her charms from the gaze of the soldiery, with a care that seemed better fitted to the experience of four or five additional years. It could be seen, however, that her person, though moulded with the same exquisite proportions, of which none of the graces were lost by the travelling dress she wore, was rather fuller and more mature than that of her companion. No sooner were these females seated, than their attendant sprang lightly into the saddle of the war-horse, when the whole three bowed to Webb, who, in courtesy, awaited their parting on the threshold of his cabin, and turning their horses' heads, they proceeded at a slow amble, followed by their train, towards the northern entrance of the encampment. As they traversed that short distance, not a voice was heard amongst them; but a slight exclamation proceeded from the younger of the females, as the Indian runner glided by her, unexpectedly, and led the way along the military road in her front. Though this sudden and startling movement of the Indian produced no sound from the other, in the surprise her veil also was allowed to open its folds, and betrayed an indescribable look of pity, admiration, and horror, as her dark eye followed the easy motions of the savage. The tresses of this lady were shining and black, like the plumage of the raven. Her complexion was not brown, but it rather appeared charged with the color of the rich blood, that seemed ready to burst its bounds. And yet there was neither coarseness nor want of shadowing in a countenance that was exquisitely regular and dignified, and surpassingly beautiful. She smiled, as if in pity at her own momentary forgetfulness, discovering by the act a row of teeth that would have shamed the purest ivory; when, replacing the veil, she bowed her face, and rode in silence, like one whose thoughts were abstracted from the scene around her. "Sola, sola, wo, ha, ho, sola!" SHAKESPEARE. While one of the lovely beings we have so cursorily presented to the reader was thus lost in thought, the other quickly recovered from the alarm which induced the exclamation, and, laughing at her own weakness, she inquired of the youth who rode by her side,-- "Are such spectres frequent in the woods, Heyward; or is this sight an especial entertainment on our behalf? If the latter, gratitude must close our mouths; but if the former, both Cora and I shall have need to draw largely on that stock of hereditary courage which we boast, even before we are made to encounter the redoubtable Montcalm." "Yon Indian is a 'runner' of the army; and, after the fashion of his people, he may be accounted a hero," returned the officer. "He has volunteered to guide us to the lake, by a path but little known, sooner than if we followed the tardy movements of the column: and, by consequence, more agreeably." "I like him not," said the lady, shuddering, partly in assumed, yet more in real terror. "You know him, Duncan, or you would not trust yourself so freely to his keeping?" "Say, rather, Alice, that I would not trust you. I do know him, or he would not have my confidence, and least of all at this moment. He is said to be a Canadian, too; and yet he served with our friends the Mohawks, who, as you know, are one of the six allied nations.[3] He was brought among us, as I have heard, by some strange accident in which your father was interested, and in which the savage was rigidly dealt by--but I forget the idle tale; it is enough, that he is now our friend." "If he has been my father's enemy, I like him still less!" exclaimed the now really anxious girl. "Will you not speak to him, Major Heyward, that I may hear his tones? Foolish though it may be, you have often heard me avow my faith in the tones of the human voice!" "It would be in vain; and answered, most probably, by an ejaculation. Though he may understand it, he affects, like most of his people, to be ignorant of the English; and least of all will he condescend to speak it, now that war demands the utmost exercise of his dignity. But he stops; the private path by which we are to journey is, doubtless, at hand." The conjecture of Major Heyward was true. When they reached the spot where the Indian stood, pointing into the thicket that fringed the military road, a narrow and blind path, which might, with some little inconvenience, receive one person at a time, became visible. "Here, then, lies our way," said the young man, in a low voice. "Manifest no distrust, or you may invite the danger you appear to apprehend." "Cora, what think you?" asked the reluctant fair one. "If we journey with the troops, though we may find their presence irksome, shall we not feel better assurance of our safety?" "Being little accustomed to the practices of the savages, Alice, you mistake the place of real danger," said Heyward. "If enemies have reached the portage at all, a thing by no means probable, as our scouts are abroad, they will surely be found skirting the column where scalps abound the most. The route of the detachment is known, while ours, having been determined within the hour, must still be secret." "Should we distrust the man because his manners are not our manners, and that his skin is dark?" coldly asked Cora. Alice hesitated no longer; but giving her Narragansett[4] a smart cut of the whip, she was the first to dash aside the slight branches of the bushes, and to follow the runner along the dark and tangled pathway. The young man regarded the last speaker in open admiration, and even permitted her fairer though certainly not more beautiful companion to proceed unattended, while he sedulously opened the way himself for the passage of her who has been called Cora. It would seem that the domestics had been previously instructed; for, instead of penetrating the thicket, they followed the route of the column; a measure which Heyward stated had been dictated by the sagacity of their guide, in order to diminish the marks of their trail, if, haply, the Canadian savages should be lurking so far in advance of their army. For many minutes the intricacy of the route admitted of no further dialogue; after which they emerged from the broad border of underbrush which grew along the line of the highway, and entered under the high but dark arches of the forest. Here their progress was less interrupted, and the instant the guide perceived that the females could command their steeds, he moved on, at a pace between a trot and a walk, and at a rate which kept the sure-footed and peculiar animals they rode, at a fast yet easy amble. The youth had turned to speak to the dark-eyed Cora, when the distant sound of horses' hoofs, clattering over the roots of the broken way in his rear, caused him to check his charger; and, as his companions drew their reins at the same instant, the whole party came to a halt, in order to obtain an explanation of the unlooked-for interruption. In a few moments a colt was seen gliding, like a fallow-deer, among the straight trunks of the pines; and, in another instant, the person of the ungainly man described in the preceding chapter, came into view, with as much rapidity as he could excite his meagre beast to endure without coming to an open rupture. Until now this personage had escaped the observation of the travellers. If he possessed the power to arrest any wandering eye when exhibiting the glories of his altitude on foot, his equestrian graces were still more likely to attract attention. Notwithstanding a constant application of his one armed heel to the flanks of the mare, the most confirmed gait that he could establish was a Canterbury gallop with the hind legs, in which those more forward assisted for doubtful moments, though generally content to maintain a loping trot. Perhaps the rapidity of the changes from one of these paces to the other created an optical illusion, which might thus magnify the powers of the beast; for it is certain that Heyward, who possessed a true eye for the merits of a horse, was unable, with his utmost ingenuity, to decide by what sort of movement his pursuer worked his sinuous way on his footsteps with such persevering hardihood. The industry and movements of the rider were not less remarkable than those of the ridden. At each change in the evolutions of the latter, the former raised his tall person in the stirrups; producing, in this manner, by the undue elongation of his legs, such sudden growths and diminishings of the stature, as baffled every conjecture that might be made as to his dimensions. If to this be added the fact that, in consequence of the ex parte application of the spur, one side of the mare appeared to journey faster than the other; and that the aggrieved flank was resolutely indicated by unremitted flourishes of a bushy tail, we finish the picture of both horse and man. The frown which had gathered around the handsome, open, and manly brow of Heyward, gradually relaxed, and his lips curled into a slight smile, as he regarded the stranger. Alice made no very powerful effort to control her merriment; and even the dark, thoughtful eye of Cora lighted with a humor that, it would seem, the habit, rather than the nature of its mistress repressed. "Seek you any here?" demanded Heyward, when the other had arrived sufficiently nigh to abate his speed; "I trust you are no messenger of evil tidings?" "Even so," replied the stranger, making diligent use of his triangular castor, to produce a circulation in the close air of the woods, and leaving his hearers in doubt to which of the young man's questions he responded; when, however, he had cooled his face, and recovered his breath, he continued, "I hear you are riding to William Henry; as I am journeying thitherward myself, I concluded good company would seem consistent to the wishes of both parties." "You appear to possess the privilege of a casting vote," returned Heyward; "we are three, whilst you have consulted no one but yourself." "Even so. The first point to be obtained is to know one's own mind. Once sure of that, and where women are concerned, it is not easy, the next is, to act up to the decision. I have endeavored to do both, and here I am." "If you journey to the lake, you have mistaken your route," said Heyward, haughtily; "the highway thither is at least half a mile behind you." "Even so," returned the stranger, nothing daunted by this cold reception; "I have tarried at 'Edward' a week, and I should be dumb not to have inquired the road I was to journey; and if dumb there would be an end to my calling." After simpering in a small way, like one whose modesty prohibited a more open expression of his admiration of a witticism that was perfectly unintelligible to his hearers, he continued: "It is not prudent for any one of my profession to be too familiar with those he is to instruct; for which reason I follow not the line of the army; besides which, I conclude that a gentleman of your character has the best judgment in matters of wayfaring; I have therefore decided to join company, in order that the ride may be made agreeable, and partake of social communion." "A most arbitrary, if not a hasty decision!" exclaimed Heyward, undecided whether to give vent to his growing anger, or to laugh in the other's face. "But you speak of instruction, and of a profession; are you an adjunct to the provincial corps, as a master of the noble science of defence and offence; or, perhaps, you are one who draws lines and angles, under the pretence of expounding the mathematics?" The stranger regarded his interrogator a moment, in wonder; and then, losing every mark of self-satisfaction in an expression of solemn humility, he answered:-- "Of offence, I hope there is none, to either party: of defence, I make none--by God's good mercy, having committed no palpable sin since last entreating his pardoning grace. I understand not your allusions about lines and angles; and I leave expounding to those who have been called and set apart for that holy office. I lay claim to no higher gift than a small insight into the glorious art of petitioning and thanksgiving, as practised in psalmody." "The man is, most manifestly, a disciple of Apollo," cried the amused Alice, "and I take him under my own especial protection. Nay, throw aside that frown, Heyward, and in pity to my longing ears, suffer him to journey in our train. Besides," she added, in a low and hurried voice, casting a glance at the distant Cora, who slowly followed the footsteps of their silent but sullen guide, "it may be a friend added to our strength, in time of need." "Think you, Alice, that I would trust those I love by this secret path, did I imagine such need could happen?" "Nay, nay, I think not of it now; but this strange man amuses me; and if he 'hath music in his soul,' let us not churlishly reject his company." She pointed persuasively along the path with her riding-whip, while their eyes met in a look which the young man lingered a moment to prolong; then yielding to her gentle influence, he clapped his spurs into his charger, and in a few bounds was again at the side of Cora. "I am glad to encounter thee, friend," continued the maiden, waving her hand to the stranger to proceed, as she urged her Narragansett to renew its amble. "Partial relatives have almost persuaded me that I am not entirely worthless in a duet myself; and we may enliven our wayfaring by indulging in our favorite pursuit. It might be of signal advantage to one, ignorant as I, to hear the opinions and experience of a master in the art." "It is refreshing both to the spirits and to the body to indulge in psalmody, in befitting seasons," returned the master of song, unhesitatingly complying with her intimation to follow; "and nothing would relieve the mind more than such a consoling communion. But four parts are altogether necessary to the perfection of melody. You have all the manifestations of a soft and rich treble; I can, by especial aid, carry a full tenor to the highest letter; but we lack counter and bass! Yon officer of the king, who hesitated to admit me to his company, might fill the latter, if one may judge from the intonations of his voice in common dialogue." "Judge not too rashly from hasty and deceptive appearances," said the lady, smiling; "though Major Heyward can assume such deep notes on occasion, believe me, his natural tones are better fitted for a mellow tenor than the bass you heard." "Is he, then, much practised in the art of psalmody?" demanded her simple companion. Alice felt disposed to laugh, though she succeeded in suppressing her merriment, ere she answered,-- "I apprehend that he is rather addicted to profane song. The chances of a soldier's life are but little fitted for the encouragement of more sober inclinations." "Man's voice is given to him, like his other talents, to be used, and not to be abused. None can say they have ever known me neglect my gifts! I am thankful that, though my boyhood may be said to have been set apart, like the youth of the royal David, for the purposes of music, no syllable of rude verse has ever profaned my lips." "You have, then, limited your efforts to sacred song?" "Even so. As the psalms of David exceed all other language, so does the psalmody that has been fitted to them by the divines and sages of the land, surpass all vain poetry. Happily, I may say that I utter nothing but the thoughts and the wishes of the King of Israel himself; for though the times may call for some slight changes, yet does this version which we use in the colonies of New England, so much exceed all other versions, that, by its richness, its exactness, and its spiritual simplicity, it approacheth, as near as may be, to the great work of the inspired writer. I never abide in any place, sleeping or waking, without an example of this gifted work. 'Tis the six-and-twentieth edition, promulgated at Boston, Anno Domini 1744; and is entitled, _The Psalms, Hymns, and Spiritual Songs of the Old and New Testaments; faithfully translated into English Metre, for the Use, Edification, and Comfort of the Saints, in Public and Private, especially in New England_." During this eulogium on the rare production of his native poets, the stranger had drawn the book from his pocket, and, fitting a pair of iron-rimmed spectacles to his nose, opened the volume with a care and veneration suited to its sacred purposes. Then, without circumlocution or apology, first pronouncing the word "Standish," and placing the unknown engine, already described, to his mouth, from which he drew a high, shrill sound, that was followed by an octave below, from his own voice, he commenced singing the following words, in full, sweet, and melodious tones, that set the music, the poetry, and even the uneasy motion of his ill-trained beast at defiance:-- "How good it is, O see, And how it pleaseth well, Together, e'en in unity, For brethren so to dwell. It's like the choice ointment, From the head to the beard did go: Down Aaron's beard, that downward went, His garment's skirts unto." The delivery of these skilful rhymes was accompanied, on the part of the stranger, by a regular rise and fall of his right hand, which terminated at the descent, by suffering the fingers to dwell a moment on the leaves of the little volume; and on the ascent, by such a flourish of the member as none but the initiated may ever hope to imitate. It would seem that long practice had rendered this manual accompaniment necessary; for it did not cease until the preposition which the poet had selected for the close of his verse, had been duly delivered like a word of two syllables. Such an innovation on the silence and retirement of the forest could not fail to enlist the ears of those who journeyed at so short a distance in advance. The Indian muttered a few words in broken English to Heyward, who, in his turn, spoke to the stranger; at once interrupting, and, for the time, closing his musical efforts. "Though we are not in danger, common prudence would teach us to journey through this wilderness in as quiet a manner as possible. You will, then, pardon me, Alice, should I diminish your enjoyments, by requesting this gentleman to postpone his chant until a safer opportunity." "You will diminish them, indeed," returned the arch girl, "for never did I hear a more unworthy conjunction of execution and language, than that to which I have been listening; and I was far gone in a learned inquiry into the causes of such an unfitness between sound and sense, when you broke the charm of my musings by that bass of yours, Duncan!" "I know not what you call my bass," said Heyward, piqued at her remark, "but I know that your safety, and that of Cora, is far dearer to me than could be any orchestra of Handel's music." He paused and turned his head quickly towards a thicket, and then bent his eyes suspiciously on their guide, who continued his steady pace, in undisturbed gravity. The young man smiled to himself, for he believed he had mistaken some shining berry of the woods for the glistening eyeballs of a prowling savage, and he rode forward, continuing the conversation which had been interrupted by the passing thought. Major Heyward was mistaken only in suffering his youthful and generous pride to suppress his active watchfulness. The cavalcade had not long passed, before the branches of the bushes that formed the thicket were cautiously moved asunder, and a human visage, as fiercely wild as savage art and unbridled passions could make it, peered out on the retiring footsteps of the travellers. A gleam of exultation shot across the darkly painted lineaments of the inhabitant of the forest, as he traced the route of his intended victims, who rode unconsciously onward; the light and graceful forms of the females waving among the trees, in the curvatures of their path, followed at each bend by the manly figure of Heyward, until, finally, the shapeless person of the singing-master was concealed behind the numberless trunks of trees, that rose, in dark lines, in the intermediate space. Q: Can you write an appropriate summary of the above paragraphs? A: Before any characters appear, the time and geography are made clear. Though it is the last war that England and France waged for a country that neither would retain, the wilderness between the forces still has to be overcome first. Thus it is in 1757, in the New York area between the head waters of the Hudson River and Lake George to the north. Because only two years earlier General Braddock was disgracefully routed by a handful of French and Indians, the frontier is now exposed to real and imaginary savage disasters as well as to the horrors of warfare. Fear has replaced reason. Near dusk of a day in July, an Indian runner named Magua arrives at Fort Edward on the upper Hudson. He has come from Fort William Henry at the southern tip of Lake George with the news that the French General Montcalm is moving south with a very large army and that Munro, commander of Fort William Henry, is in urgent need of plentiful reinforcements from General Webb. Early the next morning, a limited detachment of fifteen hundred regulars and colonists departs as if swallowed by the forest. Shortly afterwards, Major Duncan Heyward and Alice and Cora Munro, guided by Magua on foot, take by horseback a secret route toward William Henry for the girls to join their father. Blonde Alice is doubtful about Magua, covered with war paint and showing a sullen fierceness; but dark-haired Cora is stoically common sense about him, even though Heyward mentions that their father had once had to deal rigidly with the Indian. As the small party pushes on, they are overtaken by David Gamut, a tall, ungainly psalmodist ridiculously dressed and carrying a pitch pipe while riding a mare followed by its young colt. He desires to join them, and after some banter between him and Alice, he pulls out the twenty-sixth edition of The Bay Psalm Book, sounds his pipe, and renders a song "in full, sweet, and melodious tones." At a muttered comment from Magua, Heyward insists upon silence for safety. Then he glances about them and, satisfied that he has seen only shining berries, smiles to himself as they move on. But he is wrong. The branches move and a man peers exultingly after them as they disappear among the dark lines of trees.


--------------------------------------------------------------------------------
/easykv/easykv.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from typing import Tuple
  3 | import math
  4 | import statistics
  5 | from functools import partial
  6 | from .utils import modify_method_of_instance
  7 | from .llama_patch import llama_forward, llama_forward_stream
  8 | from .mistral_patch import mistral_forward, mistral_forward_stream
  9 | 
 10 | def cache_size(kv_cache):
 11 |     """
 12 |     the amount of memory(MB)
 13 |     """
 14 |     cnt = 0
 15 |     for i in range(len(kv_cache)):
 16 |         for tensor in kv_cache[i]:
 17 |             cnt += tensor.numel()
 18 |     return cnt*2/(1024**2)
 19 | 
 20 | def gpu_stats():
 21 |     torch.cuda.empty_cache()
 22 |     memory_stats = torch.cuda.memory_stats()
 23 |     print("Current GPU memory usage:", round(memory_stats["allocated_bytes.all.current"]/(1024**3), 3), "GB")
 24 |     print("Peak GPU memory usage:", round(memory_stats["allocated_bytes.all.peak"]/(1024**3), 3), "GB")
 25 |     # print("Reserved GPU memory:", round(memory_stats["reserved_bytes.all.allocated"]/(1024**3), 3), "GB")
 26 | 
 27 | 
 28 | # ANSI code for different colors
 29 | class Color:
 30 |     RESET = '\033[0m'
 31 |     RED = '\033[91m'
 32 |     GREEN = '\033[92m'
 33 |     YELLOW = '\033[93m'
 34 |     BLUE = '\033[94m'
 35 |     PURPLE = '\033[95m'
 36 |     CYAN = '\033[96m'
 37 | 
 38 |     @staticmethod
 39 |     def print(content, color: str):
 40 |         print(f"{getattr(Color, color.upper())}{content}{Color.RESET}")
 41 | 
 42 | 
 43 | def relu_normalize(p, q):
 44 |     """
 45 |     Construct the modified sampling distribution
 46 |     """
 47 |     tmp_dist = torch.relu(p-q)
 48 |     return tmp_dist / tmp_dist.sum(dim=-1, keepdim=True)
 49 | 
 50 | def entropy(p):
 51 |     """
 52 |     Shano entropy of a distribution
 53 |     """
 54 |     return -torch.sum(p*p.log(), dim=-1)
 55 | 
 56 | def truncate_kv_cache_silo(kv_cache, eviction_ids):
 57 |     kv_cache = list(kv_cache)
 58 |     for i in range(len(kv_cache)):
 59 |         kv_cache[i] = list(kv_cache[i])
 60 |     l = kv_cache[0][0].shape[2]
 61 |     head_dim = kv_cache[0][0].shape[-1]
 62 |     num_heads = kv_cache[0][0].shape[1]
 63 |     for i in range(len(eviction_ids)):
 64 |         _index = torch.arange(l, device=kv_cache[0][0].device).unsqueeze(0).repeat(len(eviction_ids[i]), 1) # (num_heads, l)
 65 |         mask = (_index != torch.tensor(eviction_ids[i], device=kv_cache[0][0].device).unsqueeze(-1)) # (num_heads, l)
 66 |         kv_cache[i][0] = kv_cache[i][0][0][mask, ...].view(1, num_heads, -1, head_dim)
 67 |         kv_cache[i][1] = kv_cache[i][1][0][mask, ...].view(1, num_heads, -1, head_dim)
 68 |     return kv_cache
 69 | 
 70 | def truncate_kv_cache_liso(kv_cache, eviction_ids):
 71 |     kv_cache = list(kv_cache)
 72 |     for i in range(len(kv_cache)):
 73 |         kv_cache[i] = list(kv_cache[i])
 74 |     l = kv_cache[0][0].shape[2]
 75 |     head_dim = kv_cache[0][0].shape[-1]
 76 |     num_heads = kv_cache[0][0].shape[1]
 77 |     for i in range(eviction_ids.shape[0]):
 78 |         src_ = torch.zeros(num_heads, eviction_ids.shape[-1]).to(kv_cache[0][0].device)
 79 |         mask = torch.ones(num_heads, l, device=kv_cache[0][0].device).scatter(dim=-1, index=eviction_ids[i], src=src_).bool()
 80 |         kv_cache[i][0] = kv_cache[i][0][0][mask, ...].view(1, num_heads, -1, head_dim)
 81 |         kv_cache[i][1] = kv_cache[i][1][0][mask, ...].view(1, num_heads, -1, head_dim)
 82 |     return kv_cache
 83 | 
 84 | def truncate_kv_cache_liso_mean(kv_cache, eviction_ids):
 85 |     """
 86 |     eviction_ids: (num_layers, num_heads, k+1)
 87 |     """
 88 |     kv_cache = list(kv_cache)
 89 |     for i in range(len(kv_cache)):
 90 |         kv_cache[i] = list(kv_cache[i])
 91 |     l = kv_cache[0][0].shape[2]
 92 |     head_dim = kv_cache[0][0].shape[-1]
 93 |     num_heads = kv_cache[0][0].shape[1]
 94 |     for i in range(eviction_ids.shape[0]):
 95 |         src_ = torch.zeros(num_heads, eviction_ids.shape[-1]).to(kv_cache[0][0].device)
 96 |         mask = torch.ones(num_heads, l, device=kv_cache[0][0].device).scatter(dim=-1, index=eviction_ids[i], src=src_).bool()
 97 |         evicted_mask = ~mask
 98 |         key_evicted_mean = torch.mean(kv_cache[i][0][0][evicted_mask, ...].view(1, num_heads, -1, head_dim), dim=2, keepdim=True)
 99 |         value_evicted_mean = torch.mean(kv_cache[i][1][0][evicted_mask, ...].view(1, num_heads, -1, head_dim), dim=2, keepdim=True)
100 |         kv_cache[i][0] = torch.cat((kv_cache[i][0][0][mask, ...].view(1, num_heads, -1, head_dim), key_evicted_mean), dim=2)
101 |         kv_cache[i][1] = torch.cat((kv_cache[i][1][0][mask, ...].view(1, num_heads, -1, head_dim), value_evicted_mean), dim=2)
102 |     return kv_cache
103 | 
104 | 
105 | def truncate_kv_cache(kv_cache: Tuple, start, end):
106 |     remain_id = torch.tensor(list(sorted(list(set(list(range(kv_cache[0][0].shape[2]))).difference(set(list(range(start, end))))))), device=kv_cache[0][0].device)
107 |     kv_cache = list(kv_cache)
108 |     for i in range(len(kv_cache)):
109 |         kv_cache[i] = list(kv_cache[i])
110 |         kv_cache[i][0] = kv_cache[i][0][:, :, remain_id, :]
111 |         kv_cache[i][1] = kv_cache[i][1][:, :, remain_id, :]
112 |     return kv_cache
113 | 
114 | 
115 | def logits_adapter(logits: torch.Tensor, temperature: float, top_p: float):
116 |     """
117 |     Apply given transformation to the input logits, including temperature scaling and top_p renormalization
118 |     """
119 |     flag = False
120 |     if logits.ndim==3:
121 |         bsz = logits.shape[0]
122 |         l = logits.shape[1]
123 |         logits = logits.view(-1, logits.shape[-1])
124 |         flag = True
125 |     prob = torch.softmax(logits / temperature, dim=-1)
126 |     sorted_prob, sorted_prob_idx = torch.sort(prob, descending=True, dim=-1)
127 |     cumsum = torch.cumsum(sorted_prob, dim=-1)
128 |     mask = (cumsum - sorted_prob) > top_p
129 |     sorted_prob[mask] = 0.0
130 |     sorted_prob.div_(sorted_prob.sum(dim=-1, keepdim=True))
131 |     _, gather_pos = torch.sort(sorted_prob_idx, descending=False, dim=-1)
132 |     final_prob = torch.gather(sorted_prob, -1, gather_pos)
133 |     if flag: final_prob = final_prob.view(bsz, l, -1)
134 |     return final_prob, torch.softmax(logits, dim=-1)
135 | 
136 | 
137 | def h2o_head_decay_score(attention_map, decay_factor, device, stride):
138 |     num_heads = attention_map[0].shape[1]
139 |     num_layers = len(attention_map)
140 |     budget = attention_map[0].shape[-1]
141 |     cache_attn_scores = torch.tensor([[[0.0]*(budget+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=device)
142 |     decay_tensor = torch.tensor([decay_factor**power for power in range(budget)], device=device).flip(dims=(0,)).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (1, budget, budget)
143 |     for l in range(num_layers):
144 |         cache_attn_scores[l, :, :-stride] = torch.sum(attention_map[l][0] * decay_tensor, dim=1) * (1.0 - decay_factor)
145 |     return cache_attn_scores
146 | 
147 | def h2o_head_decay_prob_score(attention_map, decay_factor, device, probs):
148 |     num_heads = attention_map[0].shape[1]
149 |     num_layers = len(attention_map)
150 |     budget = attention_map[0].shape[-1]
151 |     cache_attn_scores = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=device)
152 |     decay_tensor = torch.tensor([decay_factor**power for power in range(budget)], device=device).flip(dims=(0,)).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (1, budget, budget)
153 |     probs = torch.tensor(probs, device=device).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (num_heads, budgett, budget)
154 |     for l in range(num_layers):
155 |         cache_attn_scores[l, :, :-1] = torch.sum(attention_map[l][0] * decay_tensor * probs, dim=1) * (1.0 - decay_factor)
156 |     return cache_attn_scores
157 | 
158 | def h2o_head_prob_score(attention_map, device, probs, mode:str='v1'):
159 |     num_heads = attention_map[0].shape[1]
160 |     num_layers = len(attention_map)
161 |     budget = attention_map[0].shape[-1]
162 |     cache_attn_scores = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=device)
163 |     cache_attn_scores_square = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=device)
164 |     if mode == 'v1':
165 |         probs = 1.0-torch.tensor(probs, device=device).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (num_heads, budgett, budget)
166 |     elif mode == 'v2':
167 |         probs = torch.tensor(probs, device=device).unsqueeze(-1).unsqueeze(0).repeat(num_heads, 1, budget) # (num_heads, budgett, budget)
168 |     for l in range(num_layers):
169 |         cache_attn_scores[l, :, :-1] = torch.sum(attention_map[l][0] * probs, dim=1)
170 |         cache_attn_scores_square[l, :, :-1] = torch.sum((attention_map[l][0] * probs)**2, dim=1)
171 |     return cache_attn_scores, cache_attn_scores_square
172 | 
173 | def h2o_head_score(attention_map, device, stride, budget, num_layers, num_heads, empty=False):
174 |     # if attention_map is not None:
175 |     #     attention_map = list(attention_map)
176 |     #     num_layers = len(attention_map)
177 |     #     budget = attention_map[0].shape[-1]
178 |     cache_attn_scores = torch.tensor([[[0.0]*(budget+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=device)
179 |     cache_attn_scores_square = torch.tensor([[[0.0]*(budget+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=device)
180 |     if not empty:
181 |         for l in range(num_layers):
182 |             attention_map[l] = attention_map[l].to('cuda')
183 |             cache_attn_scores[l, :, :attention_map[l].shape[-1]] = torch.sum(attention_map[l][0], dim=1)
184 |             cache_attn_scores_square[l, :, :attention_map[l].shape[-1]] = torch.sum(attention_map[l][0]**2, dim=1)
185 |             attention_map[l] = None
186 |     return cache_attn_scores, cache_attn_scores_square
187 | 
188 | def process_for_mqa_gqa(attentions, num_layers, num_heads, rep_n):
189 |     # Unified processing for MQA, GQA and MHA
190 |     attentions = list(attentions)
191 |     for l in range(num_layers):
192 |         bs = attentions[l].shape[0]
193 |         sl = attentions[l].shape[2]
194 |         tl = attentions[l].shape[3]
195 |         attentions[l] = attentions[l].reshape(bs, num_heads, rep_n, sl, tl).mean(dim=2) # (bs, num_kv_heads, sl, tl)
196 |     return attentions
197 | 
198 | 
199 | @torch.inference_mode()
200 | def generate(self, input_ids, generation_config, kv_mode='encoding', stride=1, report_decoding_latency: bool=False):
201 |     temperature = generation_config.get('temperature', 1.0)
202 |     top_p = generation_config.get('top_p', 1.0)
203 |     max_new_tokens = generation_config.get('max_new_tokens', 1024)
204 |     budget = generation_config.get('budget', 0.5)
205 |     mode = generation_config.get('kv_policy', 'recency')
206 |     temp_length = generation_config.get('temp_length', 4)
207 |     recent_ratio = generation_config.get('recent_ratio', 0.1)
208 |     keep_attention = generation_config.get('keep_attention', False)
209 |     eos_token_ids = generation_config.get('eos_token_ids', [self.tokenizer.eos_token_id])
210 |     streaming = generation_config.get('streaming', False)
211 |     num_layers = self.config.num_hidden_layers
212 |     if not hasattr(self.config, "num_key_value_heads"): num_heads = self.config.num_attention_heads
213 |     else: num_heads = self.config.num_key_value_heads
214 |     tokenizer = self.tokenizer
215 |     # Handle MQA and GQA
216 |     is_gqa = hasattr(self.config, "num_key_value_heads") and getattr(self.config, "num_key_value_heads") != getattr(self.config, "num_attention_heads")
217 |     if is_gqa: rep_n = self.config.num_attention_heads // self.config.num_key_value_heads
218 |     else: rep_n = 1
219 |     length = input_ids.shape[-1]
220 |     if kv_mode == 'auto':
221 |         length = input_ids.shape[-1]
222 |         assert type(budget) == int
223 |         if budget > length:
224 |             kv_mode = 'decoding'
225 |             budget -= length
226 |         else:
227 |             kv_mode = 'encoding_decoding'
228 |     if kv_mode == 'decoding':
229 |         """
230 |         auto-regressive decoding
231 |         """
232 |         outputs_prefilling = self(input_ids=input_ids, use_cache=True)
233 |         prefix_token_lst = input_ids[0].cpu().numpy().tolist()
234 |         past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits
235 |         logits_prev_step = logits[:, -1, :]
236 |         prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
237 | 
238 |         cache_tokens = []
239 |         cache_probs = []
240 |         cache_cur_probs = []
241 |         cache_positions = []
242 |         cache_attn_scores = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device)
243 |         cache_attn_scores_square = torch.tensor([[[0.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device)
244 |         cache_counter = torch.tensor([[[1.0]*(budget+1) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device)
245 |         cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - 1.0
246 |         cache_counter_token = torch.tensor([1.0]*(budget+1), device=self.device)
247 |         cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - 1.0
248 |         n = 0
249 |         output_ids = []
250 |         token_probs = []
251 |         cur_pos_id = past_key_values[0][0].shape[2]
252 |         evicted_positions = []
253 |         if 'llama' in self.config.architectures[0].lower():
254 |             modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
255 |         else:
256 |             modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
257 |         while n < max_new_tokens:
258 |             next_token = torch.multinomial(prob_prev_step, num_samples=1)
259 |             output_ids.append(next_token[0, 0].cpu().item())
260 |             next_token_prob = torch.gather(raw_prob_prev_step, -1, next_token) # (bsz, 1)
261 |             token_probs.append((tokenizer.convert_ids_to_tokens([output_ids[-1]])[0], next_token_prob[0, 0].cpu().item()))
262 |             n += 1
263 |             if output_ids[-1] in eos_token_ids: break
264 |             outputs = self(input_ids=next_token, 
265 |                             past_key_values=past_key_values,
266 |                             attention_mask=torch.ones(next_token.shape[0], 1+past_key_values[0][0].shape[2], dtype=torch.long, device=next_token.device),
267 |                             position_ids=torch.LongTensor([cur_pos_id]).to(self.device).view(-1, 1),
268 |                             use_cache=True,
269 |                             output_attentions=True)
270 |             # unified processing for GQA and MHA
271 |             outputs.attentions = list(outputs.attentions)
272 |             for l in range(num_layers):
273 |                 bs = outputs.attentions[l].shape[0]
274 |                 sl = outputs.attentions[l].shape[2]
275 |                 tl = outputs.attentions[l].shape[3]
276 |                 outputs.attentions[l] = outputs.attentions[l].reshape(bs, num_heads, rep_n, sl, tl).mean(dim=2) # (bs, num_kv_heads, sl, tl)
277 |             past_key_values = outputs.past_key_values
278 |             logits_prev_step = outputs.logits[:, -1, :]
279 |             cache_cur_probs.append(torch.exp(-entropy(raw_prob_prev_step))[0].cpu().item())
280 |             prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
281 | 
282 |             # update 
283 |             cache_probs.append(next_token_prob[0,0].cpu().item())
284 |             cache_tokens.append(output_ids[-1])
285 |             cache_positions.append(cur_pos_id)
286 | 
287 |             # update accumulated attention scores
288 |             if 'h2o_head' == mode:
289 |                 for l in range(num_layers):
290 |                     attention_map = outputs.attentions[l][0, :, 0, len(prefix_token_lst):] # (num_heads, l)
291 |                     cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
292 |             elif 'roco' == mode:
293 |                 for l in range(num_layers):
294 |                     attention_map = outputs.attentions[l][0, :, 0, len(prefix_token_lst):] # (num_heads, l)
295 |                     cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
296 |                     cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map ** 2
297 |             elif 'tova' == mode:
298 |                 for l in range(num_layers):
299 |                     attention_map = outputs.attentions[l][0, :, 0, len(prefix_token_lst):] # (num_heads, l)
300 |                     cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map
301 |             # evict if current kv cache size exceeds the budget
302 |             cur_kv_size = past_key_values[0][0].shape[2]
303 |             if (cur_kv_size-len(prefix_token_lst)) > budget and mode != 'full':
304 |                 cache_counter += 1.0
305 |                 cache_counter_token += 1.0
306 |                 positions_tensor = torch.tensor(cache_positions, device=self.device).float()
307 |                 positions_tensor = positions_tensor / float(cur_pos_id)
308 |                 recent_ratio = 0.3
309 |                 recent_window = int(budget*recent_ratio)
310 |                 if mode in ['h2o_head']:
311 |                     eviction_ids = torch.argmin(cache_attn_scores[:, :, :-recent_window], dim=-1) + len(prefix_token_lst)
312 |                     _eviction_ids = eviction_ids
313 |                     eviction_ids = eviction_ids.cpu().numpy().tolist()
314 |                     past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids)
315 |                     _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1)
316 |                     _eviction_ids -= len(prefix_token_lst)
317 |                     mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1])
318 |                     cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
319 |                 elif mode in ['roco']:
320 |                     cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2)
321 |                     cur_std[:, :, -10:] = 1e9
322 |                     _, feasible_ids = torch.topk(cur_std, largest=False, k=budget-recent_window, dim=-1) # (layers, heads, k)
323 |                     argmin_id = torch.argmin(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1).unsqueeze(-1) # (layers, heads)
324 |                     eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id).squeeze(-1) + len(prefix_token_lst)
325 |                     _eviction_ids = eviction_ids
326 |                     eviction_ids = eviction_ids.cpu().numpy().tolist()
327 |                     past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids)
328 |                     _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1)
329 |                     _eviction_ids -= len(prefix_token_lst)
330 |                     mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1])
331 |                     cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
332 |                     cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
333 |                     cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
334 |                 elif mode == 'tova':
335 |                     eviction_ids = torch.argmin(cache_attn_scores, dim=-1) + len(prefix_token_lst)
336 |                     _eviction_ids = eviction_ids
337 |                     eviction_ids = eviction_ids.cpu().numpy().tolist()
338 |                     past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids)
339 |                     _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1)
340 |                     _eviction_ids -= len(prefix_token_lst)
341 |                     mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1])
342 |                     cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, _index.shape[-1]-1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
343 |                 elif mode == 'recency':
344 |                     scores = 1.0 - positions_tensor
345 |                     _, evict_id = torch.topk(scores, k=1, dim=-1)
346 |                     evict_id = evict_id[0].cpu().item()
347 |                     past_key_values = truncate_kv_cache(past_key_values, start=len(prefix_token_lst)+evict_id, end=len(prefix_token_lst)+evict_id+1)
348 |                     evicted_positions.append(cache_positions[evict_id]-len(prefix_token_lst))
349 |                     cache_probs.pop(evict_id)
350 |                     cache_tokens.pop(evict_id)
351 |                     cache_cur_probs.pop(evict_id)
352 |                     cache_positions.pop(evict_id)
353 |                 elif mode == 'random':
354 |                     scores = torch.rand(*positions_tensor.shape).to(self.device)
355 |                     _, evict_id = torch.topk(scores, k=1, dim=-1)
356 |                     evict_id = evict_id[0].cpu().item()
357 |                     past_key_values = truncate_kv_cache(past_key_values, start=len(prefix_token_lst)+evict_id, end=len(prefix_token_lst)+evict_id+1)
358 |                     evicted_positions.append(cache_positions[evict_id]-len(prefix_token_lst))
359 |                     cache_probs.pop(evict_id)
360 |                     cache_tokens.pop(evict_id)
361 |                     cache_cur_probs.pop(evict_id)
362 |                     cache_positions.pop(evict_id)
363 |             cur_pos_id += 1
364 |         _tmp = past_key_values[0][0].shape[2]-len(prefix_token_lst)
365 |         print(f"KV cache budget ratio: {_tmp / len(output_ids) *100:.2f}%({_tmp}/{len(output_ids)})")
366 |         return tokenizer.decode(output_ids, skip_special_tokens=True).strip()
367 |     elif kv_mode == 'encoding':
368 |         """
369 |         prompt encoding/prefilling
370 |         """
371 |         length = input_ids.shape[-1]
372 |         if type(budget) == float and budget >= 1.0 or type(budget) == int and budget >= length:
373 |             outputs_prefilling = self(input_ids=input_ids, use_cache=True)
374 |             past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits
375 |             logits_prev_step = logits[:, -1, :]
376 |             prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
377 |             cur_pos_id = past_key_values[0][0].shape[2]
378 |         else:
379 |             # In case budget is also large, the attention_map will occupy a lot of memory
380 |             # We offload attention_map to CPU first and move it layer by laer to GPU to compute eviction score
381 |             if 'llama' in self.config.architectures[0].lower():
382 |                 modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
383 |             else:
384 |                 modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
385 |             if type(budget) == float:
386 |                 budget = int(length * budget) + stride
387 |             elif type(budget) == int: 
388 |                 budget += stride
389 |             for idx in range(budget, -1, -1):
390 |                 if (length-idx)%stride==0: break
391 |             for r_idx in range(idx-1, -1, -1):
392 |                 if (idx-r_idx)%stride==0: break
393 |             prefix = input_ids[:, :r_idx]
394 |             recent_window = int(budget*recent_ratio)
395 |             sink_length = temp_length
396 |             outputs_prefilling = self(input_ids=prefix, use_cache=True, output_attentions=keep_attention)
397 |             past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits
398 |             loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
399 |             logits_prev_step = logits[:, -1, :]
400 |             _, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
401 |             prefix_token_lst = input_ids[0].cpu().numpy().tolist()
402 |             cache_tokens = prefix[0].cpu().numpy().tolist()
403 |             if keep_attention:
404 |                 outputs_prefilling.attentions = process_for_mqa_gqa(outputs_prefilling.attentions, num_layers, num_heads, rep_n)
405 |             cache_attn_scores, cache_attn_scores_square = h2o_head_score(outputs_prefilling.attentions, self.device, stride, idx, num_layers, num_heads, empty=not keep_attention)
406 |             # Back to GPU
407 |             if 'llama' in self.config.architectures[0].lower():
408 |                 modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
409 |             else:
410 |                 modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
411 | 
412 |             if keep_attention:
413 |                 cache_counter = torch.tensor([[[1.0]*(idx+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device)
414 |                 cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - float(stride)
415 |             else:
416 |                 cache_counter = torch.tensor([[[float(stride)]*idx+torch.arange(stride, 0, -1).numpy().tolist() for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) - float(stride)
417 |             cache_counter_token = torch.tensor([1.0]*(idx+stride), device=self.device)
418 |             cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - float(stride)
419 |             n = 0
420 |             output_ids = []
421 |             token_probs = []
422 |             cur_pos_id = past_key_values[0][0].shape[2]
423 |             evicted_positions = []
424 |             log_probs = []
425 |             # for token_i in range(idx, length, stride):
426 |             for token_i in range(r_idx, length, stride):
427 |                 n += stride
428 |                 outputs = self(input_ids=input_ids[:, token_i:token_i+stride],
429 |                                 past_key_values=past_key_values,
430 |                                 attention_mask=torch.ones(1, stride+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device),
431 |                                 position_ids=torch.LongTensor(list(range(cur_pos_id, cur_pos_id+stride))).to(self.device).view(1, -1),
432 |                                 use_cache=True,
433 |                                 output_attentions=True)
434 |                 past_key_values = outputs.past_key_values
435 |                 logits_prev_step = outputs.logits[:, -1, :]
436 |                 prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
437 | 
438 |                 # Unified processing for MQA, GQA and MHA
439 |                 outputs.attentions = process_for_mqa_gqa(outputs.attentions, num_layers, num_heads, rep_n)
440 | 
441 |                 cur_kv_size = past_key_values[0][0].shape[2]
442 |                 # update accumulated attention scores
443 |                 if cur_kv_size>idx or keep_attention:
444 |                     if 'h2o_head' == mode:
445 |                         for l in range(num_layers):
446 |                             attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, stride, stride+l)
447 |                             cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
448 |                     elif 'roco' == mode:
449 |                         for l in range(num_layers):
450 |                             attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, l)
451 |                             attention_map_sq = ((outputs.attentions[l][0, :, :, :])**2).sum(dim=1)
452 |                             cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
453 |                             cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map_sq
454 |                     elif 'tova' == mode:
455 |                         for l in range(num_layers):
456 |                             attention_map = outputs.attentions[l][0, :, -1, :].mean(dim=0).unsqueeze(0).repeat(num_heads, 1) # (num_heads, l)
457 |                             cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map
458 |                 # evict if current kv cache size exceeds the budget
459 |                 if mode != 'full' and cur_kv_size>idx:
460 |                     cache_counter += float(stride)
461 |                     cache_counter_token += float(stride)
462 |                     if mode in ['h2o_head']:
463 |                         eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length
464 |                         past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
465 |                         _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
466 |                         _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
467 |                         mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
468 |                         cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
469 |                         cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1)
470 |                     elif mode in ['roco']:
471 |                         cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2)
472 |                         cur_std[:, :, -10:] = 1e9
473 |                         cur_std[:, :, :sink_length] = 1e9
474 |                         _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-recent_window-sink_length, stride), dim=-1) # (layers, heads, k)
475 |                         argmin_id = torch.topk(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1, largest=False, k=stride)[1] # (layers, heads)
476 |                         eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id)
477 |                         past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
478 |                         _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
479 |                         _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
480 |                         mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
481 |                         cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
482 |                         cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
483 |                         cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1)
484 |                     elif mode == 'tova':
485 |                         eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length
486 |                         past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
487 |                         _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
488 |                         _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
489 |                         mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
490 |                         cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
491 |                     elif mode == 'recency':
492 |                         evict_id = sink_length
493 |                         past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride)
494 |                     elif mode == 'random':
495 |                         scores = torch.rand(cache_attn_scores.shape[-1]).to(self.device)
496 |                         scores[-stride:] = -1e9
497 |                         _, evict_id = torch.topk(scores, k=1, dim=-1)
498 |                         evict_id = evict_id[0].cpu().item()
499 |                         past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride)
500 |                 cur_pos_id += stride
501 |         cur_pos_id = input_ids.shape[-1]
502 |         _tmp = past_key_values[0][0].shape[2]
503 |         print(f"KV cache budget ratio: {_tmp / input_ids.shape[-1]*100:.2f}%({_tmp}/{input_ids.shape[-1]})")
504 |         n = 0
505 |         output_ids = []
506 |         decoding_times = []
507 |         import time
508 |         while n < max_new_tokens:
509 |             next_token = torch.multinomial(prob_prev_step, num_samples=1)
510 |             output_ids.append(next_token[0, 0].cpu().item())
511 |             next_token_prob = torch.gather(raw_prob_prev_step, -1, next_token) # (bsz, 1)
512 |             n += 1
513 |             if output_ids[-1] in eos_token_ids: break
514 |             s = time.time()
515 |             outputs = self(input_ids=next_token, 
516 |                             past_key_values=past_key_values,
517 |                             attention_mask=torch.ones(next_token.shape[0], 1+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device),
518 |                             position_ids=torch.LongTensor([cur_pos_id]).to(self.device).view(-1, 1),
519 |                             use_cache=True)
520 |             past_key_values = outputs.past_key_values
521 |             logits_prev_step = outputs.logits[:, -1, :]
522 |             prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
523 |             e = time.time()
524 |             cur_step_time = e-s
525 |             decoding_times.append(cur_step_time)
526 |             cur_pos_id += 1
527 |         decoded_output = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
528 |         if report_decoding_latency: print(f"Per-step decoding latency: {statistics.mean(decoding_times[1:]):.3f}")
529 |         return decoded_output
530 |     elif kv_mode == 'encoding_decoding':
531 |         """
532 |         after encoding, budget-1 decoding
533 |         """
534 |         length = input_ids.shape[-1]
535 |         assert type(budget) == int and budget <= length
536 |         white_lst = ['random', 'recency', 'tova', 'roco']
537 |         assert mode in white_lst, f"mode must be within {white_lst}, get {mode} instead"
538 |         # In case budget is also large, the attention_map will occupy a lot of memory
539 |         # We offload attention_map to CPU first and move it layer by layer to GPU to compute eviction score
540 |         if 'llama' in self.config.architectures[0].lower():
541 |             modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
542 |         else:
543 |             modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
544 |         if type(budget) == float:
545 |             budget = int(length * budget) + stride
546 |         elif type(budget) == int: 
547 |             budget += stride
548 |             if budget >= length: budget -= stride
549 |         for idx in range(budget, -1, -1):
550 |             if (length-idx)%stride==0: break
551 |         for r_idx in range(1, idx):
552 |             if (idx-r_idx)%stride==0: break
553 |         # prefix = input_ids[:, :idx]
554 |         prefix = input_ids[:, :r_idx]
555 |         recent_window = int(budget*recent_ratio)
556 |         sink_length = temp_length
557 |         outputs_prefilling = self(input_ids=prefix, use_cache=True, output_attentions=keep_attention)
558 |         past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits
559 |         loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
560 |         logits_prev_step = logits[:, -1, :]
561 |         _, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
562 |         prefix_token_lst = input_ids[0].cpu().numpy().tolist()
563 |         cache_tokens = prefix[0].cpu().numpy().tolist()
564 |         if keep_attention:
565 |             outputs_prefilling.attentions = process_for_mqa_gqa(outputs_prefilling.attentions, num_layers, num_heads, rep_n)
566 |         cache_attn_scores, cache_attn_scores_square = h2o_head_score(outputs_prefilling.attentions, self.device, stride, idx, num_layers, num_heads, empty=not keep_attention)
567 |         # Back to GPU
568 |         if 'llama' in self.config.architectures[0].lower():
569 |             modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
570 |         else:
571 |             modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
572 | 
573 |         if keep_attention:
574 |             cache_counter = torch.tensor([[[1.0]*(idx+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device)
575 |             cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - float(stride)
576 |         else:
577 |             cache_counter = torch.tensor([[[float(stride)]*idx+torch.arange(stride, 0, -1).numpy().tolist() for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) - float(stride)
578 |         cache_counter_token = torch.tensor([1.0]*(idx+stride), device=self.device)
579 |         cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - float(stride)
580 |         n = 0
581 |         output_ids = []
582 |         token_probs = []
583 |         cur_pos_id = past_key_values[0][0].shape[2]
584 |         evicted_positions = []
585 |         log_probs = []
586 |         # for token_i in range(idx, length, stride):
587 |         for token_i in range(r_idx, length, stride):
588 |             n += stride
589 |             outputs = self(input_ids=input_ids[:, token_i:token_i+stride],
590 |                             past_key_values=past_key_values,
591 |                             attention_mask=torch.ones(1, stride+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device),
592 |                             position_ids=torch.LongTensor(list(range(cur_pos_id, cur_pos_id+stride))).to(self.device).view(1, -1),
593 |                             use_cache=True,
594 |                             output_attentions=True)
595 |             past_key_values = outputs.past_key_values
596 |             logits_prev_step = outputs.logits[:, -1, :]
597 |             prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
598 | 
599 |             # Unified processing for MQA, GQA and MHA
600 |             outputs.attentions = process_for_mqa_gqa(outputs.attentions, num_layers, num_heads, rep_n)
601 | 
602 |             cur_kv_size = past_key_values[0][0].shape[2]
603 |             if cur_kv_size>idx or keep_attention:
604 |                 # update accumulated attention scores
605 |                 if 'h2o_head' == mode:
606 |                     for l in range(num_layers):
607 |                         attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, stride, stride+l)
608 |                         cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
609 |                 elif 'roco' == mode:
610 |                     for l in range(num_layers):
611 |                         attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, l)
612 |                         attention_map_sq = ((outputs.attentions[l][0, :, :, :])**2).sum(dim=1)
613 |                         cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
614 |                         cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map_sq
615 |                 elif 'tova' == mode:
616 |                     for l in range(num_layers):
617 |                         attention_map = outputs.attentions[l][0, :, -1, :] # (num_heads, l)
618 |                         cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map
619 |             # evict if current kv cache size exceeds the budget
620 |             if mode != 'full' and cur_kv_size>idx:
621 |                 cache_counter += float(stride)
622 |                 cache_counter_token += float(stride)
623 |                 if mode in ['h2o_head']:
624 |                     eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length
625 |                     past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
626 |                     _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
627 |                     _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
628 |                     mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
629 |                     cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
630 |                     cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1)
631 |                 elif mode in ['roco']:
632 |                     cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2)
633 |                     cur_std[:, :, -10:] = 1e9
634 |                     cur_std[:, :, :sink_length] = 1e9
635 |                     _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-recent_window-sink_length, stride), dim=-1) # (layers, heads, k)
636 |                     argmin_id = torch.topk(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1, largest=False, k=stride)[1] # (layers, heads)
637 |                     eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id)
638 |                     past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
639 |                     _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
640 |                     _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
641 |                     mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
642 |                     cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
643 |                     cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
644 |                     cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1)
645 |                 elif mode == 'tova':
646 |                     eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length
647 |                     past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
648 |                     _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
649 |                     _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
650 |                     mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
651 |                     cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
652 |                 elif mode == 'recency':
653 |                     evict_id = sink_length
654 |                     past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride)
655 |                 elif mode == 'random':
656 |                     scores = torch.rand(cache_attn_scores.shape[-1]).to(self.device)
657 |                     scores[-stride:] = -1e9
658 |                     _, evict_id = torch.topk(scores, k=1, dim=-1)
659 |                     evict_id = evict_id[0].cpu().item()
660 |                     past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride)
661 |             cur_pos_id += stride
662 |         cur_pos_id = input_ids.shape[-1]
663 |         _tmp = past_key_values[0][0].shape[2]
664 |         n = 0
665 |         output_ids = []
666 |         cache_attn_scores = cache_attn_scores[:, :, :-(stride-1)]
667 |         cache_attn_scores_square = cache_attn_scores_square[:, :, :-(stride-1)]
668 |         cache_counter = cache_counter[:, :, :-(stride-1)]
669 |         assert cache_attn_scores.shape[-1] == _tmp+1
670 |         while n < max_new_tokens:
671 |             next_token = torch.multinomial(prob_prev_step, num_samples=1)
672 |             output_ids.append(next_token[0, 0].cpu().item())
673 |             next_token_prob = torch.gather(raw_prob_prev_step, -1, next_token) # (bsz, 1)
674 |             n += 1
675 |             if output_ids[-1] in eos_token_ids: break
676 |             outputs = self(input_ids=next_token, 
677 |                             past_key_values=past_key_values,
678 |                             attention_mask=torch.ones(next_token.shape[0], 1+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device),
679 |                             position_ids=torch.LongTensor([cur_pos_id]).to(self.device).view(-1, 1),
680 |                             use_cache=True,
681 |                             output_attentions=True)
682 |             # unified processing for GQA and MHA
683 |             outputs.attentions = list(outputs.attentions)
684 |             for l in range(num_layers):
685 |                 bs = outputs.attentions[l].shape[0]
686 |                 sl = outputs.attentions[l].shape[2]
687 |                 tl = outputs.attentions[l].shape[3]
688 |                 outputs.attentions[l] = outputs.attentions[l].reshape(bs, num_heads, rep_n, sl, tl).mean(dim=2) # (bs, num_kv_heads, sl, tl)
689 |             past_key_values = outputs.past_key_values
690 |             logits_prev_step = outputs.logits[:, -1, :]
691 |             prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
692 | 
693 |             # update accumulated attention scores
694 |             if 'h2o_head' == mode:
695 |                 for l in range(num_layers):
696 |                     attention_map = outputs.attentions[l][0, :, 0, :]
697 |                     cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
698 |             elif 'roco' == mode:
699 |                 for l in range(num_layers):
700 |                     attention_map = outputs.attentions[l][0, :, 0, :]
701 |                     attention_map_sq = ((outputs.attentions[l][0, :, 0, :])**2)
702 |                     cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
703 |                     cache_attn_scores_square[l, :, :attention_map_sq.shape[-1]] += attention_map_sq
704 |             elif 'tova' == mode:
705 |                 for l in range(num_layers):
706 |                     attention_map = outputs.attentions[l][0, :, 0, :]
707 |                     cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map
708 |             cache_counter += 1.0
709 |             recent_ratio = 0.3
710 |             recent_window = int(budget*recent_ratio)
711 |             if mode in ['h2o_head']:
712 |                 eviction_ids = torch.argmin(cache_attn_scores[:, :, :-recent_window], dim=-1)
713 |                 _eviction_ids = eviction_ids
714 |                 eviction_ids = eviction_ids.cpu().numpy().tolist()
715 |                 past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids)
716 |                 _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1)
717 |                 mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1])
718 |                 cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
719 |             elif mode in ['roco']:
720 |                 cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2)
721 |                 cur_std[:, :, -10:] = 1e9
722 |                 _, feasible_ids = torch.topk(cur_std, largest=False, k=budget-recent_window, dim=-1) # (layers, heads, k)
723 |                 argmin_id = torch.argmin(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1).unsqueeze(-1) # (layers, heads)
724 |                 eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id).squeeze(-1)
725 |                 _eviction_ids = eviction_ids
726 |                 eviction_ids = eviction_ids.cpu().numpy().tolist()
727 |                 past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids)
728 |                 _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1)
729 |                 mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1])
730 |                 cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
731 |                 cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
732 |                 cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
733 |             elif mode == 'tova':
734 |                 eviction_ids = torch.argmin(cache_attn_scores, dim=-1)
735 |                 _eviction_ids = eviction_ids
736 |                 eviction_ids = eviction_ids.cpu().numpy().tolist()
737 |                 past_key_values = truncate_kv_cache_silo(past_key_values, eviction_ids)
738 |                 _index = torch.arange(cache_attn_scores.shape[-1], device=self.device).unsqueeze(0).unsqueeze(0).repeat(num_layers, num_heads, 1)
739 |                 mask = (_eviction_ids.unsqueeze(-1)!=_index).view(-1, _index.shape[-1])
740 |                 cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, 1, device=self.device)), dim=-1)
741 |             elif mode == 'recency':
742 |                 past_key_values = truncate_kv_cache(past_key_values, start=sink_length, end=sink_length+1)
743 |             elif mode == 'random':
744 |                 scores = torch.rand(*positions_tensor.shape).to(self.device)
745 |                 _, evict_id = torch.topk(scores, k=1, dim=-1)
746 |                 evict_id = evict_id[0].cpu().item()
747 |                 past_key_values = truncate_kv_cache(past_key_values, start=sink_length+evict_id, end=sink_length+evict_id+1)
748 |             cur_pos_id += 1
749 |         cache_size = past_key_values[0][0].shape[2]
750 |         total_length = length + len(output_ids)
751 |         print(f"KV Cache Budget ratio {cache_size / total_length*100:.2f}%[{cache_size}/({length}+{len(output_ids)})]")
752 |         decoded_output = tokenizer.decode(output_ids, skip_special_tokens=True).strip()
753 |         return decoded_output
754 |     elif kv_mode == 'ppl':
755 |         """
756 |         perplexity computation with fixed kv cache
757 |         """
758 |         length = input_ids.shape[-1]
759 |         if budget >= 1.0:
760 |             outputs_prefilling = self(input_ids=input_ids, use_cache=False)
761 |             logits = outputs_prefilling.logits
762 |             loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
763 |             log_probs = loss_fct(logits[0, :-1], input_ids.clone()[0, 1:]).cpu().numpy().tolist()
764 |             ppl = math.exp(statistics.mean(log_probs))
765 |             return ppl
766 |         else:
767 |             # In case budget is also large, the attention_map will occupy a lot of memory
768 |             # We offload attention_map to CPU first and move it layer by laer to GPU to compute eviction score
769 |             if 'llama' in self.config.architectures[0].lower():
770 |                 modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
771 |             else:
772 |                 modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
773 |             if type(budget) == float:
774 |                 budget = int(length * budget) + stride
775 |             elif type(budget) == int: 
776 |                 budget += stride
777 |             for idx in range(budget, -1, -1):
778 |                 if (length-idx)%stride==0: break
779 |             for r_idx in range(1, idx):
780 |                 if (idx-r_idx)%stride==0: break
781 |             prefix = input_ids[:, :r_idx]
782 |             recent_window = int(budget*recent_ratio)
783 |             sink_length = temp_length
784 |             outputs_prefilling = self(input_ids=prefix, use_cache=True, output_attentions=keep_attention)
785 |             past_key_values, logits = outputs_prefilling.past_key_values, outputs_prefilling.logits
786 |             loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
787 |             logits_prev_step = logits[:, -1, :]
788 |             _, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
789 |             prefix_token_lst = input_ids[0].cpu().numpy().tolist()
790 |             cache_tokens = prefix[0].cpu().numpy().tolist()
791 |             if keep_attention:
792 |                 outputs_prefilling.attentions = process_for_mqa_gqa(outputs_prefilling.attentions, num_layers, num_heads, rep_n)
793 |             cache_attn_scores, cache_attn_scores_square = h2o_head_score(outputs_prefilling.attentions, self.device, stride, idx, num_layers, num_heads, empty=not keep_attention)
794 |             # Back to GPU
795 |             if 'llama' in self.config.architectures[0].lower():
796 |                 modify_method_of_instance(self, "LlamaAttention", "forward", partial(llama_forward if not streaming else llama_forward_stream, attn_device='cuda'))
797 |             else:
798 |                 modify_method_of_instance(self, "MistralAttention", "forward", partial(mistral_forward if not streaming else mistral_forward_stream, attn_device='cuda'))
799 | 
800 |             if keep_attention:
801 |                 cache_counter = torch.tensor([[[1.0]*(idx+stride) for _ in range(num_heads)] for _ in range(num_layers)], device=self.device)
802 |                 cache_counter = torch.cumsum(cache_counter, dim=-1).flip(dims=(2,)) - float(stride)
803 |             else:
804 |                 cache_counter = torch.tensor([[[float(stride)]*idx+torch.arange(stride, 0, -1).numpy().tolist() for _ in range(num_heads)] for _ in range(num_layers)], device=self.device) - float(stride)
805 |             cache_counter_token = torch.tensor([1.0]*(idx+stride), device=self.device)
806 |             cache_counter_token = torch.cumsum(cache_counter_token, dim=-1).flip(dims=(0, )) - float(stride)
807 |             n = 0
808 |             output_ids = []
809 |             token_probs = []
810 |             cur_pos_id = past_key_values[0][0].shape[2]
811 |             evicted_positions = []
812 |             log_probs = []
813 |             all_logits = []
814 |             all_ids = []
815 |             # for token_i in range(idx, length, stride):
816 |             for token_i in range(r_idx, length, stride):
817 |                 n += stride
818 |                 outputs = self(input_ids=input_ids[:, token_i:token_i+stride],
819 |                                 past_key_values=past_key_values,
820 |                                 attention_mask=torch.ones(1, stride+past_key_values[0][0].shape[2], dtype=torch.long, device=self.device),
821 |                                 position_ids=torch.LongTensor(list(range(cur_pos_id, cur_pos_id+stride))).to(self.device).view(1, -1),
822 |                                 use_cache=True,
823 |                                 output_attentions=True)
824 |                 past_key_values = outputs.past_key_values
825 |                 logits_prev_step = outputs.logits[:, -1, :]
826 |                 all_logits.append(outputs.logits[0])
827 |                 all_ids.append(input_ids[0, token_i:token_i+stride])
828 |                 prob_prev_step, raw_prob_prev_step = logits_adapter(logits_prev_step, temperature, top_p)
829 | 
830 |                 # Unified processing for MQA, GQA and MHA
831 |                 outputs.attentions = process_for_mqa_gqa(outputs.attentions, num_layers, num_heads, rep_n)
832 |                 cur_kv_size = past_key_values[0][0].shape[2]
833 |                 # update accumulated attention scores
834 |                 if cur_kv_size>idx or keep_attention:
835 |                     if 'h2o_head' == mode:
836 |                         for l in range(num_layers):
837 |                             attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, stride, stride+l)
838 |                             cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
839 |                     elif 'roco' == mode:
840 |                         for l in range(num_layers):
841 |                             attention_map = outputs.attentions[l][0, :, :, :].sum(dim=1) # (num_heads, l)
842 |                             attention_map_sq = ((outputs.attentions[l][0, :, :, :])**2).sum(dim=1)
843 |                             cache_attn_scores[l, :, :attention_map.shape[-1]] += attention_map
844 |                             cache_attn_scores_square[l, :, :attention_map.shape[-1]] += attention_map_sq
845 |                     elif 'tova' == mode:
846 |                         for l in range(num_layers):
847 |                             attention_map = outputs.attentions[l][0, :, -1, :].mean(dim=0).unsqueeze(0).repeat(num_heads, 1) # (num_heads, l)
848 |                             cache_attn_scores[l, :, :attention_map.shape[-1]] = attention_map
849 |                 # evict if current kv cache size exceeds the budget
850 |                 if mode != 'full' and cur_kv_size>idx:
851 |                     cache_counter += float(stride)
852 |                     cache_counter_token += float(stride)
853 |                     if mode in ['h2o_head']:
854 |                         eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length
855 |                         past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
856 |                         _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
857 |                         _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
858 |                         mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
859 |                         cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
860 |                         cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1)
861 |                     elif mode in ['roco']:
862 |                         cur_std = torch.sqrt(cache_attn_scores_square / cache_counter - (cache_attn_scores / cache_counter)**2)
863 |                         cur_std[:, :, -10:] = 1e9
864 |                         cur_std[:, :, :sink_length] = 1e9
865 |                         _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-recent_window-sink_length, stride), dim=-1) # (layers, heads, k)
866 |                         # _, feasible_ids = torch.topk(cur_std, largest=False, k=max(budget-int(budget*0.1)-sink_length, stride), dim=-1) # (layers, heads, k)
867 |                         argmin_id = torch.topk(cache_attn_scores.gather(dim=-1, index=feasible_ids) / cache_counter.gather(dim=-1, index=feasible_ids), dim=-1, largest=False, k=stride)[1] # (layers, heads)
868 |                         eviction_ids = feasible_ids.gather(dim=-1, index=argmin_id)
869 |                         past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
870 |                         _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
871 |                         _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
872 |                         mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
873 |                         cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
874 |                         cache_attn_scores_square = torch.cat((cache_attn_scores_square.view(-1, cache_attn_scores_square.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
875 |                         cache_counter = torch.cat((cache_counter.view(-1, cache_counter.shape[-1])[mask].view(num_layers, num_heads, -1), (torch.arange(stride)-stride+1).view(1, 1, -1).repeat(num_layers, num_heads, 1).flip(dims=(2,)).to(self.device)), dim=-1)
876 |                     elif mode == 'tova':
877 |                         eviction_ids = torch.topk(cache_attn_scores[:, :, sink_length:-recent_window], dim=-1, k=stride, largest=False)[1] + sink_length
878 |                         past_key_values = truncate_kv_cache_liso(past_key_values, eviction_ids)
879 |                         _index = torch.ones(num_layers, num_heads, cache_attn_scores.shape[-1], device=self.device).view(num_layers*num_heads, -1)
880 |                         _src = torch.zeros(num_layers, num_heads, stride, device=self.device).view(num_layers*num_heads, -1)
881 |                         mask = _index.scatter(dim=-1, index=eviction_ids.view(num_layers*num_heads, -1), src=_src).bool()
882 |                         cache_attn_scores = torch.cat((cache_attn_scores.view(-1, cache_attn_scores.shape[-1])[mask].view(num_layers, num_heads, -1), torch.zeros(num_layers, num_heads, stride, device=self.device)), dim=-1)
883 |                     elif mode == 'recency':
884 |                         evict_id = sink_length
885 |                         past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride)
886 |                     elif mode == 'random':
887 |                         scores = torch.rand(cache_attn_scores.shape[-1]).to(self.device)
888 |                         scores[-stride:] = -1e9
889 |                         _, evict_id = torch.topk(scores, k=1, dim=-1)
890 |                         evict_id = evict_id[0].cpu().item()
891 |                         past_key_values = truncate_kv_cache(past_key_values, start=evict_id, end=evict_id+stride)
892 |                 cur_pos_id += stride
893 |             cur_pos_id = input_ids.shape[-1]
894 |             _tmp = past_key_values[0][0].shape[2]
895 |             print(f"KV cache budget ratio: {_tmp / input_ids.shape[-1]*100:.2f}%({_tmp}/{input_ids.shape[-1]})")
896 |             all_ids = torch.cat(all_ids)
897 |             all_logits = torch.cat(all_logits, dim=0)
898 |             assert all_ids.shape[0] == all_logits.shape[0]
899 |             log_probs = loss_fct(all_logits[:-1], all_ids[1:]).cpu().numpy().tolist()
900 |             ppl = math.exp(statistics.mean(log_probs))
901 |             return ppl
902 | 
903 | def enable_fixed_kv(model, tokenizer, mode, stride=1, verbose=False):
904 |     model.tokenizer = tokenizer
905 |     import functools
906 |     model.easykv_generate = functools.partial(generate, self=model, kv_mode=mode, stride=stride, report_decoding_latency=verbose)
907 |     model.easykv_ppl = functools.partial(generate, self=model, kv_mode='ppl', stride=stride)
908 |     print(f"Fixed KV Cache for {mode} enabled")


--------------------------------------------------------------------------------