├── 2safetensors.py
├── eval
│   ├── exl2.human-eval.py
│   └── tf.human-eval.py
├── merge-lora.py
└── serve
    ├── exl2.server.py
    └── tf.server.py

/2safetensors.py:
--------------------------------------------------------------------------------
import json
import os
import sys
from collections import defaultdict
from tqdm import tqdm
import argparse
import torch

from safetensors.torch import load_file, save_file


def shared_pointers(tensors):
    # Group tensor names by storage pointer so duplicated (shared) weights can be dropped.
    ptrs = defaultdict(list)
    for k, v in tensors.items():
        ptrs[v.data_ptr()].append(k)
    failing = []
    for ptr, names in ptrs.items():
        if len(names) > 1:
            failing.append(names)
    return failing


def check_file_size(sf_filename: str, pt_filename: str):
    sf_size = os.stat(sf_filename).st_size
    pt_size = os.stat(pt_filename).st_size

    if (sf_size - pt_size) / pt_size > 0.01:
        raise RuntimeError(
            f"""The file size difference is more than 1%:
         - {sf_filename}: {sf_size}
         - {pt_filename}: {pt_size}
         """
        )


def convert_file(
    pt_filename: str,
    sf_filename: str,
):
    loaded = torch.load(pt_filename, map_location="cpu")
    if "state_dict" in loaded:
        loaded = loaded["state_dict"]

    # Keep only one name per shared storage; safetensors refuses duplicated tensors.
    shared = shared_pointers(loaded)
    for shared_weights in shared:
        for name in shared_weights[1:]:
            loaded.pop(name)

    # Force tensors to be contiguous and store them as fp16.
    loaded = {k: v.contiguous().half() for k, v in loaded.items()}

    dirname = os.path.dirname(sf_filename)
    os.makedirs(dirname, exist_ok=True)
    save_file(loaded, sf_filename, metadata={"format": "pt"})
    check_file_size(sf_filename, pt_filename)

    # Round-trip check: everything we saved must load back bit-identical.
    reloaded = load_file(sf_filename)
    for k in loaded:
        pt_tensor = loaded[k]
        sf_tensor = reloaded[k]
        if not torch.equal(pt_tensor, sf_tensor):
            raise RuntimeError(f"The output tensors do not match for key {k}")


def rename(pt_filename: str) -> str:
    filename, ext = os.path.splitext(pt_filename)
    local = f"{filename}.safetensors"
    local = local.replace("pytorch_model", "model")
    return local


def convert_multi(folder: str, delprv: bool):
    filename = "pytorch_model.bin.index.json"
    with open(os.path.join(folder, filename), "r") as f:
        data = json.load(f)

    filenames = set(data["weight_map"].values())
    local_filenames = []
    for filename in tqdm(filenames):
        pt_filename = os.path.join(folder, filename)
        # Rename the bare shard name, then join with the folder once,
        # so relative paths are not duplicated (folder/folder/...).
        sf_filename = os.path.join(folder, rename(filename))
        convert_file(pt_filename, sf_filename)
        local_filenames.append(sf_filename)
        if delprv:
            os.remove(pt_filename)

    index = os.path.join(folder, "model.safetensors.index.json")
    with open(index, "w") as f:
        newdata = {k: v for k, v in data.items()}
        newmap = {k: rename(v) for k, v in data["weight_map"].items()}
        newdata["weight_map"] = newmap
        json.dump(newdata, f, indent=4)
    local_filenames.append(index)
    if delprv:
        os.remove(os.path.join(folder, "pytorch_model.bin.index.json"))
    return


def convert_single(folder: str, delprv: bool):
    pt_name = "pytorch_model.bin"
    pt_filename = os.path.join(folder, pt_name)
    sf_name = "model.safetensors"
    sf_filename = os.path.join(folder, sf_name)
    convert_file(pt_filename, sf_filename)
    if delprv:
        os.remove(pt_filename)
    return


parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model', required=True, type=str, help="Path to the model dir")
parser.add_argument('-d', '--delete', action='store_true', help="Delete pytorch files after conversion")
args = parser.parse_args()

# A single-file checkpoint ships pytorch_model.bin; sharded checkpoints ship an index instead.
for filename in os.listdir(args.model):
    if filename == "pytorch_model.bin":
        convert_single(args.model, args.delete)
        sys.exit(0)
convert_multi(args.model, args.delete)
--------------------------------------------------------------------------------
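A quick way to sanity-check a conversion is to reload what the script wrote. This is a minimal sketch, not part of the repo: the ./my-model path is an assumption, and the load_file call only applies to a single-file (non-sharded) conversion.

import torch
from safetensors.torch import load_file
from transformers import AutoModelForCausalLM

# Inspect the converted file directly: keys should match the original state dict,
# minus any duplicated shared tensors the script dropped.
tensors = load_file("./my-model/model.safetensors")
print(len(tensors), "tensors, e.g.", next(iter(tensors)))

# Or load the folder the normal way; transformers picks up model.safetensors
# (or model.safetensors.index.json for sharded checkpoints) automatically.
model = AutoModelForCausalLM.from_pretrained("./my-model", torch_dtype=torch.float16)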
/eval/exl2.human-eval.py:
--------------------------------------------------------------------------------
import argparse
from tqdm import tqdm

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
)
from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model', required=True, type=str, help="Path to the model dir")
parser.add_argument('-c', '--length', default=0, required=False, type=int, help="Context length")
parser.add_argument('-n', '--samples', default=1, required=False, type=int, help="Number of samples per task")
parser.add_argument('-s', '--scale', default=1.0, required=False, type=float, help="Linear scale")
parser.add_argument('-A', '--ntk_scale', default=1.0, required=False, type=float, help="NTK scale")
parser.add_argument('-p', '--plus', action='store_true', help="Use HumanEvalPlus instead of original HumanEval")
parser.add_argument('-o', '--output', default='samples.jsonl', required=False, type=str, help="Output file name")
parser.add_argument('-t', '--temperature', default=0.1, required=False, type=float, help="Temperature")
parser.add_argument('--max_new_tokens', default=384, required=False, type=int, help="Max tokens to generate")
parser.add_argument('--top_k', default=40, required=False, type=int, help="Top K")
parser.add_argument('--top_p', default=0.75, required=False, type=float, help="Top P")
parser.add_argument('--repetition_penalty', default=1.0, required=False, type=float, help="Repetition penalty")
args = parser.parse_args()

# Initialize model, tokenizer and cache
config = ExLlamaV2Config()
config.model_dir = args.model
config.prepare()

if args.length != 0:
    config.max_seq_len = args.length

if args.scale != 1.0:
    config.scale_pos_emb = args.scale

if args.ntk_scale != 1.0:
    config.scale_alpha_value = args.ntk_scale

model = ExLlamaV2(config)
print("Loading model: " + args.model)
model.load()
tokenizer = ExLlamaV2Tokenizer(config)
cache = ExLlamaV2Cache(model)

generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
generator.warmup()


def generate_one_completion(prompt: str):
    settings = ExLlamaV2Sampler.Settings()
    settings.top_k = args.top_k
    settings.top_p = args.top_p
    settings.temperature = args.temperature
    settings.token_repetition_penalty = args.repetition_penalty

    input_ids = tokenizer.encode(prompt, add_bos=True)

    generator.set_stop_conditions([tokenizer.eos_token_id])
    generator.begin_stream(input_ids, settings)
    generated_tokens = 0
    new_text = ""
    while True:
        chunk, eos, _ = generator.stream()
        generated_tokens += 1
        new_text += chunk
        if eos or generated_tokens == args.max_new_tokens:
            break
    return new_text


if not args.plus:
    from human_eval.data import write_jsonl, read_problems
    problems = read_problems()
else:
    from evalplus.data import get_human_eval_plus, write_jsonl
    problems = get_human_eval_plus()

num_samples_per_task = args.samples
samples = [
    dict(task_id=task_id, completion=generate_one_completion(problems[task_id]["prompt"]))
    for task_id in tqdm(problems)
    for _ in range(num_samples_per_task)
]

write_jsonl(args.output, samples)
--------------------------------------------------------------------------------
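The script above only writes completions; scoring is a separate step. A minimal sketch of computing pass@1 from the resulting samples.jsonl with the human-eval package (the package ships with code execution disabled and its README asks you to enable it explicitly; run this in a sandbox, since it executes model-generated code):

from human_eval.evaluation import evaluate_functional_correctness

# Runs each sampled completion against the HumanEval unit tests and
# returns a dict of pass@k estimates, e.g. {'pass@1': 0.43}.
results = evaluate_functional_correctness("samples.jsonl", k=[1])
print(results)

For output produced with the --plus flag, the evalplus package provides its own evaluator; check its documentation for the equivalent command.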
/eval/tf.human-eval.py:
--------------------------------------------------------------------------------
import argparse
import torch
from tqdm import tqdm
from human_eval.data import write_jsonl, read_problems
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model', required=True, type=str, help="Path to the model dir or HF model id")
parser.add_argument('-f', '--eight', action='store_true', help="Load in INT8 instead of NF4")
parser.add_argument('-o', '--output', default='samples.jsonl', required=False, type=str, help="Output file name")
# argparse's type=bool treats any non-empty string as True, so parse the value explicitly.
parser.add_argument('-r', '--remote', required=False, default=True,
                    type=lambda v: str(v).lower() not in ('false', '0', 'no'),
                    help="Trust remote code (default is True)")
args = parser.parse_args()

model_id = args.model
tokenizer = AutoTokenizer.from_pretrained(model_id)

if not args.eight:
    # 4-bit NF4 with double quantization keeps memory usage low.
    nf4_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', quantization_config=nf4_config, trust_remote_code=args.remote)
else:
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', load_in_8bit=True, trust_remote_code=args.remote, use_flash_attention_2=True)

print(model.generation_config)

tokenizer.pad_token = tokenizer.eos_token


def generate_one_completion(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096)

    # Low-temperature sampling tuned for code generation.
    generate_ids = model.generate(inputs.input_ids.to("cuda"), max_new_tokens=384, do_sample=True, top_p=0.75, top_k=40, temperature=0.1)
    completion = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    # Strip the prompt and keep only the first generated block.
    completion = completion.replace(prompt, "").split("\n\n\n")[0]

    return completion


problems = read_problems()

num_samples_per_task = 1
samples = [
    dict(task_id=task_id, completion=generate_one_completion(problems[task_id]["prompt"]))
    for task_id in tqdm(problems)
    for _ in range(num_samples_per_task)
]
write_jsonl(args.output, samples)
--------------------------------------------------------------------------------
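Both evaluation scripts default to one sample per task, which only supports pass@1. When more samples are generated (the -n flag of the exl2 script), pass@k is normally computed with the unbiased estimator from the HumanEval paper (Chen et al., 2021); a small self-contained sketch:

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate for n samples of which c passed."""
    if n - c < k:
        return 1.0
    # 1 - C(n-c, k) / C(n, k), computed as a numerically stable running product.
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i
    return 1.0 - result

print(pass_at_k(n=10, c=3, k=1))  # 0.3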
/merge-lora.py:
--------------------------------------------------------------------------------
from transformers import LlamaForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--model", required=True, type=str, help="Path to the base model dir")
    parser.add_argument("-l", "--lora", required=True, type=str, help="Path to the LoRA adapter dir")
    parser.add_argument("-o", "--out_dir", type=str, default="./out", help="Output dir for the merged model")
    args = parser.parse_args()

    print(f"Loading base model: {args.model}")
    base_model = LlamaForCausalLM.from_pretrained(args.model, torch_dtype=torch.float16, device_map="cpu")

    print(f"Loading PEFT: {args.lora}")
    model = PeftModel.from_pretrained(base_model, args.lora)
    print("Running merge_and_unload")
    # Folds the LoRA deltas into the base weights and drops the adapter wrappers.
    model = model.merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    model.save_pretrained(args.out_dir)
    tokenizer.save_pretrained(args.out_dir)
    print(f"Model saved to {args.out_dir}")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
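The merged checkpoint is a plain fp16 model folder, so it feeds directly into 2safetensors.py, the eval scripts, or the servers. A minimal smoke test, assuming the default ./out output directory:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./out")
model = AutoModelForCausalLM.from_pretrained("./out", torch_dtype=torch.float16, device_map="auto")

prompt = "def fibonacci(n):"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))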
/serve/exl2.server.py:
--------------------------------------------------------------------------------
import argparse
import uuid
import time
import bottle
from bottle import Bottle, run, route, request

# Allow request bodies of up to 10 MB (long prompts / chat histories).
bottle.BaseRequest.MEMFILE_MAX = 1024 * 1024 * 10

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2Lora,
)
from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model', required=True, type=str, help="Path to the model dir")
parser.add_argument('-l', '--lora', required=False, type=str, default=None, help="Path to the lora dir")
parser.add_argument('-a', '--model_name', required=True, type=str, help="Model alias or ID")
parser.add_argument('-c', '--length', default=0, required=False, type=int, help="Context length")
parser.add_argument('-s', '--scale', default=1.0, required=False, type=float, help="Linear scale")
parser.add_argument('-A', '--ntk_scale', default=1.0, required=False, type=float, help="NTK scale")
parser.add_argument('--port', default=8013, required=False, type=int, help="Port to listen on")
parser.add_argument('--ip', default='127.0.0.1', required=False, type=str, help="IP to listen on")
args = parser.parse_args()

# Initialize model and cache
config = ExLlamaV2Config()
config.model_dir = args.model
config.prepare()

if args.length != 0:
    config.max_seq_len = args.length

if args.scale != 1.0:
    config.scale_pos_emb = args.scale

if args.ntk_scale != 1.0:
    config.scale_alpha_value = args.ntk_scale

model = ExLlamaV2(config)
print("Loading model: " + args.model)
model.load()
tokenizer = ExLlamaV2Tokenizer(config)
cache = ExLlamaV2Cache(model)

lora = None
if args.lora is not None:
    lora = ExLlamaV2Lora.from_directory(model, args.lora)

# Initialize generator
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
# Make sure CUDA is initialized so we can measure performance
generator.warmup()

conversations = {}
app = Bottle()


def full_conversation(idx):
    # Flatten the stored message list into a single prompt using the
    # conversation's prefix (user), suffix (assistant) and infix strings.
    chat = ''
    for message in conversations[idx]['messages']:
        if message['role'] == 'system':
            chat += message['content']
        if message['role'] == 'user':
            chat += conversations[idx]['prefix'] + message['content'] + conversations[idx]['infix']
        if message['role'] == 'assistant':
            chat += conversations[idx]['suffix'] + message['content'] + '\n'

    # If the last message is from the user, cue the model to answer.
    if conversations[idx]['messages'][-1]['role'] == 'user':
        chat += conversations[idx]['suffix']

    return chat


@app.route('/prompt', method='PUT')
def set_prompt():
    data = request.json
    conversation_uuid = data.get('uuid', str(uuid.uuid4()))
    messages = data.get('messages', [{'role': 'system', 'content': ''}])
    prefix = data.get('prefix', 'USER: ')
    suffix = data.get('suffix', 'ASSISTANT:')
    infix = data.get('infix', '\n')
    conversations[conversation_uuid] = {
        "messages": messages,
        "prefix": prefix,
        "suffix": suffix,
        "infix": infix
    }
    return {"message": "Prompt set", "uuid": conversation_uuid}


@app.route('/chat', method='POST')
def chat():
    data = request.json
    conversation_uuid = data['uuid']
    if conversation_uuid not in conversations:
        return {"uuid": conversation_uuid, "message": "not found"}

    temperature = data.get('temperature', 0.5)
    top_k = data.get('top_k', 40)
    top_p = data.get('top_p', 0.9)
    typical = data.get('typical', 0)
    repetition_penalty = data.get('repetition_penalty', 1.15)
    max_new_tokens = data.get('max_length', 256)
    query = data.get('query')

    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.typical = typical
    settings.token_repetition_penalty = repetition_penalty
    conversations[conversation_uuid]['messages'].append({'role': 'user', 'content': query})

    full_ctx = full_conversation(conversation_uuid)
    input_ids = tokenizer.encode(full_ctx, add_bos=True)
    prompt_tokens = input_ids.shape[-1]

    start_time = time.time_ns()

    # Stop on EOS, on the model starting a new user turn, or on a ChatML end-of-turn tag.
    generator.set_stop_conditions([tokenizer.eos_token_id, conversations[conversation_uuid]['prefix'].rstrip(), "<|im_end|>"])
    generator.begin_stream(input_ids, settings, loras=lora)
    generated_tokens = 0
    new_text = ""
    while True:
        chunk, eos, _ = generator.stream()
        generated_tokens += 1
        new_text += chunk
        if eos or generated_tokens == max_new_tokens:
            break

    end_time = time.time_ns()
    secs = (end_time - start_time) / 1e9

    conversations[conversation_uuid]['messages'].append({'role': 'assistant', 'content': new_text})
    return {
        "uuid": conversation_uuid,
        "text": new_text,
        "tokens": generated_tokens,
        "rate": generated_tokens / secs,
        "model": args.model_name,
        "type": 'exllama',
        "ctx": prompt_tokens + generated_tokens
    }

@app.route('/complete', method='POST')
def complete():
    data = request.json
    temperature = data.get('temperature', 0.5)
    max_new_tokens = data.get('max_length', 256)
    add_bos = data.get('add_bos', False)
    top_k = data.get('top_k', 40)
    top_p = data.get('top_p', 0.9)
    typical = data.get('typical', 0)
    repetition_penalty = data.get('repetition_penalty', 1.15)
    encode_special = data.get('encode_special', True)
    query = data.get('query')

    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.typical = typical
    settings.token_repetition_penalty = repetition_penalty

    input_ids = tokenizer.encode(query, add_bos=add_bos, encode_special_tokens=encode_special)
    prompt_tokens = input_ids.shape[-1]

    start_time = time.time_ns()

    generator.set_stop_conditions([tokenizer.eos_token_id])
    generator.begin_stream(input_ids, settings, loras=lora)
    generated_tokens = 0
    new_text = ""
    while True:
        chunk, eos, _ = generator.stream()
        generated_tokens += 1
        new_text += chunk
        if eos or generated_tokens == max_new_tokens:
            break

    end_time = time.time_ns()
    secs = (end_time - start_time) / 1e9
    return {
        "text": new_text,
        "ctx": generated_tokens + prompt_tokens,
        "tokens": generated_tokens,
        "rate": generated_tokens / secs,
        "type": 'exllama',
        "model": args.model_name
    }


run(app, host=args.ip, port=args.port)
--------------------------------------------------------------------------------
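A minimal client sketch for the server above (the requests dependency and the example prompt strings are assumptions; any HTTP client works). It registers a conversation template via PUT /prompt, then sends one turn to POST /chat:

import requests

BASE = "http://127.0.0.1:8013"  # default --ip / --port of the server

# Register a conversation; prefix/suffix/infix control how messages are flattened into the prompt.
r = requests.put(f"{BASE}/prompt", json={
    "messages": [{"role": "system", "content": "You are a helpful assistant.\n"}],
    "prefix": "USER: ",
    "suffix": "ASSISTANT:",
    "infix": "\n",
})
conv_uuid = r.json()["uuid"]

# Ask a question; the server appends both the query and the reply to the stored history.
r = requests.post(f"{BASE}/chat", json={
    "uuid": conv_uuid,
    "query": "Write a one-line hello world in Python.",
    "temperature": 0.5,
    "max_length": 128,
})
reply = r.json()
print(reply["text"])
print(f"{reply['tokens']} tokens at {reply['rate']:.1f} tok/s, ctx {reply['ctx']}")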
/serve/tf.server.py:
--------------------------------------------------------------------------------
import argparse
import time
import torch
import uuid
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers import StoppingCriteria, StoppingCriteriaList
from peft import PeftModel
from bottle import Bottle, run, route, request

parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model', required=True, type=str, help="Path to the model dir or HF model id")
parser.add_argument('-a', '--model_name', required=False, type=str, default="unknown", help="Model alias or ID")
parser.add_argument('-l', '--lora_dir', required=False, type=str, default='', help="Path to lora directory")
parser.add_argument('-f', '--eight', action='store_true', help="Load in INT8 instead of NF4")
parser.add_argument('-r', '--remote', action='store_true', help="Trust remote code (default is False)")
parser.add_argument('--port', default=8013, required=False, type=int, help="Port to listen on")
parser.add_argument('--ip', default='127.0.0.1', required=False, type=str, help="IP to listen on")
args = parser.parse_args()

app = Bottle()


class StoppingCriteriaSub(StoppingCriteria):

    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = [stop.to("cuda") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        # Stop as soon as the generated sequence ends with any of the stop token sequences.
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True

        return False


def load_model():
    model_id = args.model
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Cap per-GPU memory a little below what is currently free.
    free_in_gb = int(torch.cuda.mem_get_info()[0] / 1024**3)
    max_memory = f'{free_in_gb - 2}GB'
    n_gpus = torch.cuda.device_count()
    max_memory = {i: max_memory for i in range(n_gpus)}

    if not args.eight:
        nf4_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', quantization_config=nf4_config, max_memory=max_memory, trust_remote_code=args.remote, use_flash_attention_2=True)
    else:
        model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', load_in_8bit=True, max_memory=max_memory, trust_remote_code=args.remote, use_flash_attention_2=True)
    print(model.generation_config)

    if args.lora_dir != '':
        model = PeftModel.from_pretrained(model, args.lora_dir)

    return model, tokenizer


llm, tokenizer = load_model()

conversations = {}


def full_conversation(idx):
    chat = ''
    for message in conversations[idx]['messages']:
        if message['role'] == 'system':
            chat += message['content']
        if message['role'] == 'user':
            chat += conversations[idx]['prefix'] + message['content'] + conversations[idx]['postfix']
        if message['role'] == 'assistant':
            chat += conversations[idx]['suffix'] + message['content'] + '\n'

    if conversations[idx]['messages'][-1]['role'] == 'user':
        chat += conversations[idx]['suffix']
    return chat


@app.route('/prompt', method='PUT')
def set_prompt():
    data = request.json
    conversation_uuid = data.get('uuid', str(uuid.uuid4()))
    messages = data.get('messages', [{'role': 'system', 'content': ''}])
    prefix = data.get('prefix', 'USER: ')
    postfix = data.get('postfix', '\n')
    suffix = data.get('suffix', 'ASSISTANT:')
    conversations[conversation_uuid] = {
        "messages": messages,
        "prefix": prefix,
        "suffix": suffix,
        "postfix": postfix
    }
    return {"message": "Prompt set", "uuid": conversation_uuid}


@app.route('/chat', method='POST')
def chat():
    data = request.json
    conversation_uuid = data['uuid']
    if conversation_uuid not in conversations:
        return {"uuid": conversation_uuid, "message": "not found"}

    temperature = data.get('temperature', 0.5)
    max_new_tokens = data.get('max_length', 256)
    query = data.get('query')

    conversations[conversation_uuid]['messages'].append({'role': 'user', 'content': query})
    full_ctx = full_conversation(conversation_uuid)

    # Stop when the model starts a new user turn. Encode the stop word without special
    # tokens so it can actually match the tail of the generated sequence; EOS itself is
    # already handled by generate().
    stop_words = [conversations[conversation_uuid]['prefix'].rstrip()]
    stop_words_ids = [tokenizer(stop_word, return_tensors='pt', add_special_tokens=False)['input_ids'].squeeze() for stop_word in stop_words]

    start_time = time.time_ns()
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    input_ids = tokenizer(full_ctx, return_tensors="pt").input_ids.to('cuda')
    outputs = llm.generate(
        inputs=input_ids,
        do_sample=True,
        num_beams=1,
        stopping_criteria=stopping_criteria,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        num_return_sequences=1,
        remove_invalid_values=True,
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = answer.replace(full_ctx, "")
    conversations[conversation_uuid]['messages'].append({'role': 'assistant', 'content': answer})
    new_tokens = len(outputs[0]) - len(input_ids[0])
    end_time = time.time_ns()
    secs = (end_time - start_time) / 1e9
    return {
        "text": answer,
        "ctx": len(outputs[0]),
        "tokens": new_tokens,
        "rate": new_tokens / secs,
        "model": args.model_name,
    }

@app.route('/complete', method='POST')
def complete():
    data = request.json
    temperature = data.get('temperature', 0.5)
    max_new_tokens = data.get('max_length', 256)
    query = data.get('query')

    # Encode the raw completion prompt without a BOS token.
    tok = AutoTokenizer.from_pretrained(args.model, add_bos_token=False)
    start_time = time.time_ns()
    input_ids = tok(query, return_tensors="pt").input_ids.to('cuda')
    outputs = llm.generate(
        inputs=input_ids,
        do_sample=True,
        num_beams=1,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        num_return_sequences=1,
        remove_invalid_values=True,
    )
    answer = tokenizer.decode(outputs[0])
    new_tokens = len(outputs[0]) - len(input_ids[0])
    end_time = time.time_ns()
    secs = (end_time - start_time) / 1e9
    return {
        "text": answer,
        "ctx": len(outputs[0]),
        "tokens": new_tokens,
        "rate": new_tokens / secs,
        "model": args.model_name,
    }


run(app, host=args.ip, port=args.port)
--------------------------------------------------------------------------------
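Both servers also expose /complete for raw, template-free completions, so a completion client is a single POST (again assuming the default host/port and the requests library; the instruction-style prompt is only an example):

import requests

r = requests.post("http://127.0.0.1:8013/complete", json={
    "query": "### Instruction: Write a haiku about GPUs.\n### Response:",
    "temperature": 0.5,
    "max_length": 128,
})
out = r.json()
print(out["text"])
print(f"{out['tokens']} new tokens, {out['rate']:.1f} tok/s, model={out['model']}")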