├── README.md
├── LLM
│   └── LLaVA
│       ├── Dockerfile
│       ├── test.py
│       ├── README.md
│       └── run_inference.py
└── LICENSE

/README.md:
--------------------------------------------------------------------------------
# Jetson-Inference-Server
--------------------------------------------------------------------------------
/LLM/LLaVA/Dockerfile:
--------------------------------------------------------------------------------
# Base image with the local_llm runtime for JetPack r35.3.1
FROM dustynv/local_llm:r35.3.1

# Copy the inference server into the container
WORKDIR /opt/llava_inference/
COPY * /opt/llava_inference/
--------------------------------------------------------------------------------
/LLM/LLaVA/test.py:
--------------------------------------------------------------------------------
from requests import post

# Address of the inference server started by run_inference.py
url = "http://192.168.49.241:5001/v2/generate"

# The server reads these values from multipart form fields, so they are sent
# as form data (not JSON) together with the image file.
data = {
    'user_prompt': 'what is in the image',
    'sys_prompt': 'think step by step',
    'max_new_tokens': '32',
}

with open('/home/youjiang/Downloads/hoover.jpg', 'rb') as f:
    files = {'image': f}
    response = post(url, data=data, files=files)

print(response.text)
--------------------------------------------------------------------------------
/LLM/LLaVA/README.md:
--------------------------------------------------------------------------------
Build the image:

docker build -f Dockerfile . -t jis_llava:r35.3.1

Run the inference server:

docker run -it --rm --runtime nvidia --network host -v /home/seeed/JIS/jetson-containers/data:/data jis_llava:r35.3.1 python3 -m run_inference

Run with the host config files, data directory, and devices mounted into the container:

docker run -it --rm --runtime nvidia --network host \
    -v /etc/enctune.conf:/etc/enctune.conf \
    -v /etc/nv_tegra_release:/etc/nv_tegra_release \
    -v /home/seeed/jetson-containers/data:/data \
    -v /home/seeed/llava_inference/main.py:/opt/llava_inference/main.py \
    --device /dev/snd --device /dev/bus/usb \
    llava_jic:v0
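Once the container is up, the service listens on port 5001 (the --port default in run_inference.py) and accepts multipart form data on /v2/generate. As a rough sketch, a request from another machine could look like the following; the Jetson's IP address and the image path are placeholders:

curl -X POST http://<jetson-ip>:5001/v2/generate \
    -F 'user_prompt=describe the image.' \
    -F 'sys_prompt=think step by step' \
    -F 'max_new_tokens=64' \
    -F 'image=@/path/to/image.jpg'

The response body is the raw generated text. See test.py for the equivalent request using the Python requests library.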
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Seeed-Projects

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/LLM/LLaVA/run_inference.py:
--------------------------------------------------------------------------------
import threading
import argparse
import re
from io import BytesIO

from PIL import Image
from flask import Flask, request
from local_llm import LocalLM, ChatHistory


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='liuhaotian/llava-v1.5-7b', type=str, help="either the path to the model, or a HuggingFace model repo/name")
    parser.add_argument('--quant', default=None, type=str, help="either the quantization method or the path to the quantized model (AWQ and MLC APIs only)")
    parser.add_argument('--api', default='mlc', type=str, help="the model backend API to use: 'auto_gptq', 'awq', 'mlc', or 'hf'; if left as None, it is determined automatically")
    parser.add_argument('--vision_model', default=None, type=str, help="for VLMs, override the vision embedding model (CLIP); otherwise the CLIP variant from the model config is used")
    parser.add_argument('--port', default=5001, type=int, help="the port the HTTP service listens on")
    _args = parser.parse_args()
    return _args


class Inference():
    def __init__(self, args) -> None:
        # Load the (optionally quantized) LLaVA model through local_llm.
        self.model = LocalLM.from_pretrained(
            args.model,
            quant=args.quant,
            api=args.api,
            vision_model=args.vision_model,
        )

        self.app = Flask(__name__)
        self.setup_routes()

        # Serialize access to the model: only one generation runs at a time.
        self.lock = threading.Lock()

    def predict(self,
                user_text,
                image,
                system_prompt,
                max_new_tokens=1024,
                min_new_tokens=-1,
                do_sample=False,
                repetition_penalty=1.0,
                temperature=0.7,
                top_p=0.95,
                ):
        print(image, flush=True)
        # Build a fresh chat history containing the image and the user prompt.
        chat_history = ChatHistory(self.model, chat_template=None, system_prompt=system_prompt)
        entry = chat_history.append(role="user", msg=image)
        entry = chat_history.append(role="user", msg=user_text)
        if "image" in entry and "text" not in entry:
            return "only image message, waiting for user prompt"
        embedding, position = chat_history.embed_chat()
        reply = self.model.generate(
            embedding,
            streaming=False,
            max_new_tokens=int(max_new_tokens),
            min_new_tokens=int(min_new_tokens),
            do_sample=bool(do_sample),
            repetition_penalty=float(repetition_penalty),
            temperature=float(temperature),
            top_p=float(top_p),
        )
        return reply

    def is_chinese(self, text):
        # True if the text contains any CJK unified ideographs.
        pattern = re.compile(r"[\u4e00-\u9fff]")
        return bool(pattern.search(text))

    def setup_routes(self):

        @self.app.route("/v2/generate", methods=["POST"])
        def generate2():
            user_prompt = request.form.get("user_prompt") or "describe the image."
            if self.is_chinese(user_prompt):
                return "only english please."
            sys_prompt = (
                request.form.get("sys_prompt") or "you are combo AI robot, will think how to finish human question"
            )
            max_new_tokens = request.form.get("max_new_tokens") or 1024
            min_new_tokens = request.form.get("min_new_tokens") or -1
            # Form values arrive as strings, so parse the boolean explicitly
            # ("false" would otherwise be truthy).
            do_sample = request.form.get("do_sample", "false").lower() in ("true", "1", "yes")
            repetition_penalty = request.form.get("repetition_penalty") or 1.0
            temperature = request.form.get("temperature") or 0.7
            top_p = request.form.get("top_p") or 0.95
            with self.lock:
                file = request.files["image"]
                file_data = file.read()
                image = Image.open(BytesIO(file_data))
                output = self.predict(
                    user_prompt,
                    image,
                    sys_prompt,
                    max_new_tokens,
                    min_new_tokens,
                    do_sample,
                    repetition_penalty,
                    temperature,
                    top_p,
                )
            print(output)
            return output


if __name__ == "__main__":
    args = parse_args()
    runner = Inference(args)
    # Bind to all interfaces so the endpoint is reachable from other devices on the network.
    runner.app.run(host="0.0.0.0", port=args.port)
--------------------------------------------------------------------------------