├── README.md
├── LLM
│   └── LLaVA
│       ├── Dockerfile
│       ├── test.py
│       ├── README.md
│       └── run_inference.py
└── LICENSE

/README.md:
--------------------------------------------------------------------------------
# Jetson-Inference-Server
--------------------------------------------------------------------------------
/LLM/LLaVA/Dockerfile:
--------------------------------------------------------------------------------
# Base image with the local_llm runtime for JetPack r35.3.1
FROM dustynv/local_llm:r35.3.1

# Copy the inference server into the container
WORKDIR /opt/llava_inference/
COPY * /opt/llava_inference/
--------------------------------------------------------------------------------
/LLM/LLaVA/test.py:
--------------------------------------------------------------------------------
from requests import post

# Address of the inference server started by run_inference.py
url = "http://192.168.49.241:5001/v2/generate"

# The server reads these values from multipart form fields, so they are sent
# as form data (not JSON) together with the image file.
data = {
    'user_prompt': 'what is in the image',
    'sys_prompt': 'think step by step',
    'max_new_tokens': '32',
}

with open('/home/youjiang/Downloads/hoover.jpg', 'rb') as f:
    files = {'image': f}
    response = post(url, data=data, files=files)

print(response.text)
--------------------------------------------------------------------------------
/LLM/LLaVA/README.md:
--------------------------------------------------------------------------------
Build the image:

docker build -f Dockerfile . -t jis_llava:r35.3.1

Run the inference server:

docker run -it --rm --runtime nvidia --network host -v /home/seeed/JIS/jetson-containers/data:/data jis_llava:r35.3.1 python3 -m run_inference

Run with the host config files, data directory, and devices mounted into the container:

docker run -it --rm --runtime nvidia --network host \
    -v /etc/enctune.conf:/etc/enctune.conf \
    -v /etc/nv_tegra_release:/etc/nv_tegra_release \
    -v /home/seeed/jetson-containers/data:/data \
    -v /home/seeed/llava_inference/main.py:/opt/llava_inference/main.py \
    --device /dev/snd --device /dev/bus/usb \
    llava_jic:v0
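Once the container is up, the service listens on port 5001 (the --port default in run_inference.py) and accepts multipart form data on /v2/generate. As a rough sketch, a request from another machine could look like the following; the Jetson's IP address and the image path are placeholders:

curl -X POST http://<jetson-ip>:5001/v2/generate \
    -F 'user_prompt=describe the image.' \
    -F 'sys_prompt=think step by step' \
    -F 'max_new_tokens=64' \
    -F 'image=@/path/to/image.jpg'

The response body is the raw generated text. See test.py for the equivalent request using the Python requests library.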
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Seeed-Projects

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/LLM/LLaVA/run_inference.py:
--------------------------------------------------------------------------------
import threading
import argparse
import re
from io import BytesIO

from PIL import Image
from flask import Flask, request
from local_llm import LocalLM, ChatHistory


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='liuhaotian/llava-v1.5-7b', type=str, help="either the path to the model, or a HuggingFace model repo/name")
    parser.add_argument('--quant', default=None, type=str, help="either the quantization method or the path to the quantized model (AWQ and MLC APIs only)")
    parser.add_argument('--api', default='mlc', type=str, help="the model backend API to use: 'auto_gptq', 'awq', 'mlc', or 'hf'; if left as None, it is determined automatically")
    parser.add_argument('--vision_model', default=None, type=str, help="for VLMs, override the vision embedding model (CLIP); otherwise the CLIP variant from the model config is used")
    parser.add_argument('--port', default=5001, type=int, help="the port the HTTP service listens on")
    _args = parser.parse_args()
    return _args


class Inference():
    def __init__(self, args) -> None:
        # Load the (optionally quantized) LLaVA model through local_llm.
        self.model = LocalLM.from_pretrained(
            args.model,
            quant=args.quant,
            api=args.api,
            vision_model=args.vision_model,
        )

        self.app = Flask(__name__)
        self.setup_routes()

        # Serialize access to the model: only one generation runs at a time.
        self.lock = threading.Lock()

    def predict(self,
                user_text,
                image,
                system_prompt,
                max_new_tokens=1024,
                min_new_tokens=-1,
                do_sample=False,
                repetition_penalty=1.0,
                temperature=0.7,
                top_p=0.95,
                ):
        print(image, flush=True)
        # Build a fresh chat history containing the image and the user prompt.
        chat_history = ChatHistory(self.model, chat_template=None, system_prompt=system_prompt)
        entry = chat_history.append(role="user", msg=image)
        entry = chat_history.append(role="user", msg=user_text)
        if "image" in entry and "text" not in entry:
            return "only image message, waiting for user prompt"
        embedding, position = chat_history.embed_chat()
        reply = self.model.generate(
            embedding,
            streaming=False,
            max_new_tokens=int(max_new_tokens),
            min_new_tokens=int(min_new_tokens),
            do_sample=bool(do_sample),
            repetition_penalty=float(repetition_penalty),
            temperature=float(temperature),
            top_p=float(top_p),
        )
        return reply

    def is_chinese(self, text):
        # True if the text contains any CJK unified ideographs.
        pattern = re.compile(r"[\u4e00-\u9fff]")
        return bool(pattern.search(text))

    def setup_routes(self):

        @self.app.route("/v2/generate", methods=["POST"])
        def generate2():
            user_prompt = request.form.get("user_prompt") or "describe the image."
            if self.is_chinese(user_prompt):
                return "only english please."
            sys_prompt = (
                request.form.get("sys_prompt") or "you are combo AI robot, will think how to finish human question"
            )
            max_new_tokens = request.form.get("max_new_tokens") or 1024
            min_new_tokens = request.form.get("min_new_tokens") or -1
            # Form values arrive as strings, so parse the boolean explicitly
            # ("false" would otherwise be truthy).
            do_sample = request.form.get("do_sample", "false").lower() in ("true", "1", "yes")
            repetition_penalty = request.form.get("repetition_penalty") or 1.0
            temperature = request.form.get("temperature") or 0.7
            top_p = request.form.get("top_p") or 0.95
            with self.lock:
                file = request.files["image"]
                file_data = file.read()
                image = Image.open(BytesIO(file_data))
                output = self.predict(
                    user_prompt,
                    image,
                    sys_prompt,
                    max_new_tokens,
                    min_new_tokens,
                    do_sample,
                    repetition_penalty,
                    temperature,
                    top_p,
                )
            print(output)
            return output


if __name__ == "__main__":
    args = parse_args()
    runner = Inference(args)
    # Bind to all interfaces so the endpoint is reachable from other devices on the network.
    runner.app.run(host="0.0.0.0", port=args.port)
--------------------------------------------------------------------------------