├── assets
│   └── image.png
├── requirements.txt
├── demo
│   └── app_shizhengpt.py
└── README.md
/assets/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/ShizhenGPT/HEAD/assets/image.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.51.0
2 | vllm==0.6.4
3 | qwen_vl_utils
4 | gradio
5 | librosa
6 | soundfile
--------------------------------------------------------------------------------
/demo/app_shizhengpt.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import torch
4 | import gradio as gr
5 | import librosa
6 | import numpy as np
7 | import soundfile as sf
8 | from threading import Thread
9 | from transformers import (
10 | AutoModelForCausalLM,
11 | AutoProcessor,
12 | TextIteratorStreamer
13 | )
14 | from qwen_vl_utils import fetch_image
15 | from copy import deepcopy
16 |
17 | # Argument parsing for model path
18 | def parse_args():
19 | parser = argparse.ArgumentParser(description="Run multi-modal chatbot")
20 | parser.add_argument('--model_path', type=str, required=True, help="Path to the pre-trained model")
21 | return parser.parse_args()
22 |
23 | # Load the model
24 | def load_model(model_path):
25 | model = AutoModelForCausalLM.from_pretrained(
26 |         model_path, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
27 | )
28 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
29 | model.eval()
30 |     processor.chat_template = processor.tokenizer.chat_template  # expose the tokenizer's chat template for apply_chat_template
31 | return model, processor
32 |
33 | # Audio processing
34 | def process_audio(audio):
35 | if audio is None:
36 | return None
37 | try:
38 | sr, y = audio
39 |         y = y[:, 0] if y.ndim > 1 else y  # down-mix stereo to mono
40 |         save_path = "./temp.wav"
41 |         sf.write(save_path, y, sr)
42 |         y_resampled = librosa.load(save_path, sr=processor.feature_extractor.sampling_rate)[0]  # resample to the rate the feature extractor expects
43 | return y_resampled
44 | except Exception as e:
45 | print(f"Error processing audio: {e}")
46 | return None
47 |
48 | # Streaming response generation
49 | def generate_with_streaming(model, processor, text, images=None, audios=None, history=None):
50 |
51 | processed_images = []
52 | if images is not None and images:
53 |         text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>'] * len(images)) + text  # one image placeholder per image
54 | processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360*420})
55 | for img in images if img is not None]
56 | else:
57 | processed_images = None
58 |
59 | processed_audios = []
60 | if audios is not None and audios:
61 |         text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>'] * len(audios)) + text  # one audio placeholder per clip
62 | processed_audios = [audio for audio in audios if audio is not None]
63 | else:
64 | processed_audios = None
65 |
66 | messages = []
67 | if history:
68 | for user_msg, assistant_msg in history:
69 | messages.append({'role': 'user', 'content': user_msg})
70 |             if len(assistant_msg) > 0:  # skip turns where the assistant message is empty
71 | messages.append({'role': 'assistant', 'content': assistant_msg})
72 |
73 |     for msg in messages:  # strip modality placeholders from past turns; only the new turn carries them
74 |         msg['content'] = msg['content'].replace('<|audio_bos|><|AUDIO|><|audio_eos|>', '').replace('<|vision_start|><|image_pad|><|vision_end|>', '')
75 | messages.append({'role': 'user', 'content': text})
76 |
77 | print('messages',messages,flush=True)
78 | print('processed_images',processed_images,flush=True)
79 | print('processed_audios',processed_audios,flush=True)
80 |
81 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
82 | if not text:
83 |         text = ""  # guard against an empty prompt; it is wrapped in a list below
84 |
85 | input_data = processor(
86 | text=[text],
87 | audios=processed_audios,
88 | images=processed_images,
89 | return_tensors="pt",
90 | padding=True
91 | )
92 | print('input_ids',processor.tokenizer.decode(input_data['input_ids'][0]),flush=True)
93 |
94 | for k, v in input_data.items():
95 | if hasattr(v, "to"):
96 | input_data[k] = v.to(model.device)
97 |
98 |     streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
99 | generation_kwargs = dict(
100 | **input_data,
101 | streamer=streamer,
102 | max_new_tokens=1500,
103 | do_sample=True,
104 | temperature=0.2,
105 | top_p=0.8,
106 | )
107 |
108 | thread = Thread(target=model.generate, kwargs=generation_kwargs)
109 | thread.start()
110 |
111 | for new_text in streamer:
112 | yield new_text
113 |
114 | def predict(message, image, audio, chatbox):
115 |
116 | chat_history = deepcopy(chatbox)
117 | print('[chat_history]',chat_history,flush=True)
118 | print('[message]',message,flush=True)
119 |
120 | processed_audio = None
121 | if audio is not None:
122 | processed_audio = [process_audio(audio)]
123 |
124 | processed_image = None
125 | if image is not None:
126 | processed_image = [image]
127 |
128 | chatbox.append([message, ""])
129 | response = ""
130 |
131 | for chunk in generate_with_streaming(model, processor, message, processed_image, processed_audio, chat_history):
132 | response += chunk
133 | chatbox[-1][1] = response
134 | yield chatbox
135 |
136 | print("\n=== Complete Model Response ===")
137 | print(response)
138 | print("============================\n", flush=True)
139 |
140 | return chatbox
141 |
142 | # CSS for Gradio interface
143 | css = """
144 | .gradio-container {
145 | background-color: #f7f7f7;
146 | font-family: 'Arial', sans-serif;
147 | }
148 | .chat-message {
149 | padding: 15px;
150 | border-radius: 10px;
151 | margin-bottom: 10px;
152 | }
153 | .user-message {
154 | background-color: #e6f7ff;
155 | border-left: 5px solid #1890ff;
156 | }
157 | .bot-message {
158 | background-color: #f2f2f2;
159 | border-left: 5px solid #52c41a;
160 | }
161 | .title {
162 | text-align: center;
163 | color: #1890ff;
164 | font-size: 24px;
165 | margin-bottom: 20px;
166 | }
167 | """
168 |
169 | # Gradio UI setup
170 | def setup_gradio_interface():
171 | with gr.Blocks(css=css) as demo:
172 |         gr.HTML("<div class='title'>TCM-Omni 中医多模态大模型(融合望闻问切)</div>")
173 |
174 | with gr.Row():
175 | with gr.Column(scale=2):
176 | chatbot = gr.Chatbot(height=500)
177 | message = gr.Textbox(label="Input your question", placeholder="Please type your question here...")
178 |
179 | with gr.Row():
180 | submit_btn = gr.Button("Submit", variant="primary")
181 | clear_btn = gr.Button("Clear")
182 |
183 | with gr.Column(scale=1):
184 | image_input = gr.Image(type="filepath", label="Upload Image")
185 | audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")
186 |
187 | submit_btn.click(predict, inputs=[message, image_input, audio_input, chatbot], outputs=[chatbot], show_progress=True).then(
188 | lambda: "", outputs=[message]
189 | )
190 |
191 | clear_btn.click(lambda: (None, None, None, []), outputs=[message, image_input, audio_input, chatbot])
192 |
193 | demo.queue().launch(server_name="0.0.0.0", server_port=7862, share=True)
194 |
195 | # Main entry point
196 | if __name__ == "__main__":
197 | args = parse_args()
198 | model, processor = load_model(args.model_path)
199 | setup_gradio_interface()
200 |
201 |
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ShizhenGPT: Towards Multimodal LLMs for Traditional Chinese Medicine
2 |
21 | ## ⚡ Introduction
22 | Hello! Welcome to the repository for [ShizhenGPT](https://arxiv.org/abs/2508.14706)!
23 |
24 |
25 | ![ShizhenGPT](assets/image.png)
26 |
27 |
28 | **ShizhenGPT** is the first multimodal LLM designed for Traditional Chinese Medicine (TCM). Trained on large-scale TCM data, it excels at TCM knowledge and can understand images, sounds, smells, and pulse signals, covering the four diagnostic methods (望闻问切: inspection, listening and smelling, inquiry, and palpation).
29 |
30 | ## 📚 The Largest Open-Source TCM Dataset
31 |
32 | We open-source the largest available TCM dataset, consisting of a pretraining dataset and an instruction fine-tuning dataset.
33 |
34 | | | Quantity | Description | Download Link |
35 | | --------------------------- | ----------- | ------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
36 | | **TCM Pretraining Dataset** | \~6B tokens | Injects TCM knowledge and aligns visual and auditory understanding. | [FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT](https://huggingface.co/datasets/FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT) |
37 | | **TCM Instruction Dataset** | 27K items | Fine-tunes TCM LLMs to improve instruction-following and response quality. | [FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT](https://huggingface.co/datasets/FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT) |
38 |
39 |
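Both datasets are hosted on the Hugging Face Hub. The snippet below is a minimal sketch of loading them with the `datasets` library; the split and column names are not documented here and are assumptions, so inspect the loaded objects before relying on them.

```python
# Minimal sketch: pull the TCM corpora from the Hugging Face Hub.
# Split and column names are assumptions -- print the datasets to check.
from datasets import load_dataset

pretrain_ds = load_dataset("FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT")
sft_ds = load_dataset("FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT")

print(pretrain_ds)         # available splits and columns
print(sft_ds["train"][0])  # assumes a "train" split; adjust if it differs
```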
40 | ## 👨‍⚕️ Model
41 |
42 | #### Model Access
43 |
44 | > **ShizhenGPT** models are available on Hugging Face:
45 |
46 | | | Parameters | Supported Modalities | Link |
47 | | ---------------------- | ---------- | ----------------------------- | --------------------------------------------------------------------- |
48 | | **ShizhenGPT-7B-LLM** | 7B | Text | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-LLM) |
49 | | **ShizhenGPT-7B-VL** | 7B | Text, Image Understanding | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-VL) |
50 | | **ShizhenGPT-7B-Omni** | 7B | Text, Four Diagnostics (望闻问切) | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-Omni) |
51 | | **ShizhenGPT-32B-LLM** | 32B | Text | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-32B-LLM) |
52 | | **ShizhenGPT-32B-VL** | 32B | Text, Image Understanding | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-32B-VL) |
53 | | **ShizhenGPT-32B-Omni** | 32B | Text, Four Diagnostics (望闻问切) | Available soon |
54 |
55 | *Note: The LLM and VL models are parameter-split variants of the corresponding Omni model. Since their architectures align with Qwen2.5 and Qwen2.5-VL, they are easier to adapt to different environments. In contrast, ShizhenGPT-7B-Omni requires `transformers==4.51.0`.*
56 |
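If you prefer a local copy of the weights, for example to pass a filesystem path to the Gradio demo below, the checkpoints can be fetched with `huggingface_hub`. A minimal sketch (any repo ID from the table above works):

```python
# Minimal sketch: download a checkpoint snapshot to the local HF cache.
from huggingface_hub import snapshot_download

local_path = snapshot_download("FreedomIntelligence/ShizhenGPT-7B-Omni")
print(local_path)  # can be passed as --model_path to demo/app_shizhengpt.py
```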
57 |
58 | #### Model Inference
59 |
60 |
61 | **A. Launch with the Gradio Demo**
62 |
63 | ```shell
64 | pip install -r requirements.txt
65 | python demo/app_shizhengpt.py --model_path FreedomIntelligence/ShizhenGPT-7B-Omni
66 | ```
67 |
68 |
69 |
70 |
71 | **B. Text-based Inference**
72 |
73 | ```python
74 | from transformers import AutoModelForCausalLM, AutoTokenizer
75 |
76 | model = AutoModelForCausalLM.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-LLM", torch_dtype="auto", device_map="auto")
77 | tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-LLM")
78 |
79 | input_text = "为什么我总是手脚冰凉,是阳虚吗?"
80 | messages = [{"role": "user", "content": input_text}]
81 |
82 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
83 | inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
84 | outputs = model.generate(**inputs, max_new_tokens=2048)
85 | print(tokenizer.decode(outputs[0], skip_special_tokens=True))
86 | ```
87 |
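Because ShizhenGPT-7B-LLM follows the standard Qwen2.5 architecture (see the note above), it should also work with vLLM, which `requirements.txt` pins. The sketch below uses vLLM's offline API and has not been verified against this checkpoint, so treat it as a starting point rather than a reference implementation.

```python
# Hedged sketch: offline batch inference with vLLM for the text-only model.
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "FreedomIntelligence/ShizhenGPT-7B-LLM"
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm = LLM(model=model_id)

messages = [{"role": "user", "content": "为什么我总是手脚冰凉,是阳虚吗?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

outputs = llm.generate([prompt], SamplingParams(temperature=0.2, top_p=0.8, max_tokens=1024))
print(outputs[0].outputs[0].text)
```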
88 |
89 |
90 |
91 |
92 |
93 | **C. Image-Text-to-Text**
94 |
95 | ```python
96 | from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
97 | from qwen_vl_utils import process_vision_info
98 |
99 |
100 | processor = AutoProcessor.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-VL")
101 | model = Qwen2_5_VLForConditionalGeneration.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-VL", torch_dtype="auto", device_map="auto")
102 |
103 | messages = [
104 | {
105 | "role": "user",
106 | "content": [
107 | {
108 | "type": "image",
109 | "image": "/path/to/your/image.png",
110 | },
111 | {"type": "text", "text": "请从中医角度解读这张舌苔。"},
112 | ],
113 | }
114 | ]
115 |
116 | text = processor.apply_chat_template(
117 | messages, tokenize=False, add_generation_prompt=True
118 | )
119 | image_inputs, video_inputs = process_vision_info(messages)
120 | inputs = processor(
121 | text=[text],
122 | images=image_inputs,
123 | videos=video_inputs,
124 | padding=True,
125 | return_tensors="pt",
126 | )
127 | inputs = inputs.to("cuda")
128 |
129 | # Inference: Generation of the output
130 | generated_ids = model.generate(**inputs, max_new_tokens=128)
131 | generated_ids_trimmed = [
132 | out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
133 | ]
134 | output_text = processor.batch_decode(
135 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
136 | )
137 | print(output_text)
138 | ```
139 |
140 |
141 |
142 |
143 |
144 | **D. Signal-Image-Text-to-Text**
145 |
146 | ```python
147 | from transformers import AutoModelForCausalLM, AutoProcessor
148 | from qwen_vl_utils import fetch_image
149 | import librosa
150 |
151 | # Load model and processor
152 | model_path = 'FreedomIntelligence/ShizhenGPT-7B-Omni'
153 | model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype="auto").cuda()
154 | processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
155 |
156 | def generate(text, images=None, signals=None):
157 | # Process images if provided
158 | processed_images = []
159 | if images is not None and images:
160 | text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>']*len(images)) + text
161 | processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360*420})
162 | for img in images if img is not None]
163 | else:
164 | processed_images = None
165 |
166 | # Process audio signals if provided
167 | processed_signals = []
168 | if signals is not None and signals:
169 | text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>']*len(signals)) + text
170 | processed_signals = [librosa.load(signal, sr=processor.feature_extractor.sampling_rate)[0]
171 | for signal in signals if signal is not None]
172 | else:
173 | processed_signals = None
174 |
175 | # Prepare messages
176 | messages = [{'role': 'user', 'content': text}]
177 | text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
178 |
179 | # Ensure text is non-empty
180 | if not text:
181 |         text = ""  # guard against an empty prompt; it is wrapped in a list below
182 |
183 | # Process the input data
184 | input_data = processor(
185 | text=[text],
186 | audios=processed_signals,
187 | images=processed_images,
188 | return_tensors="pt",
189 | padding=True
190 | )
191 | input_data = input_data.to(model.device)
192 |
193 | # Generate the output
194 | generated_ids = model.generate(**input_data, max_new_tokens=1024)
195 | generated_ids_trimmed = [
196 | out_ids[len(in_ids):] for in_ids, out_ids in zip(input_data.input_ids, generated_ids)
197 | ]
198 | output_text = processor.batch_decode(
199 | generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
200 | )
201 |
202 | return output_text[0]
203 |
204 | # Example usage
205 | # Text input
206 | print(generate('为什么我总是手脚冰凉,是阳虚吗?'))
207 | # Image input
208 | print(generate('请从中医角度解读这张舌苔。', images=['path_to_image']))
209 | # Audio input
210 | print(generate('请回答这个语音问题', signals=['path_to_audio']))
211 | ```
212 |
213 |
214 |
215 | ## 🧐 Evaluation
216 |
217 |
218 | **Text Benchmark**
219 | The text benchmark is composed of five sections, each compiled from different national-level TCM examinations.
220 |
221 | | | Samples |
222 | | ------------------------------------ | ------------------------------ |
223 | | 2024 TCM Pharmacist (2024年中医药剂师考试) | 480 |
224 | | 2024 TCM Physician (2024年中医职业医师资格考试) | 184 |
225 | | 2024 TCM Assistant Physician (2024年中医助理职业医师资格考试) | 138 |
226 | | 2024 TCM Graduate Entrance Examination (2024年中医综合考研真题) | 147 |
227 | | 2025 TCM Graduate Entrance Examination (2025年中医综合考研真题) | 139 |
228 |
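No evaluation script is included in this repository; the sketch below is purely illustrative. It assumes each exam item is multiple-choice and that a model response can be reduced to a single option letter, which may not match the actual grading protocol.

```python
# Illustrative only: exact-match accuracy over multiple-choice exam items.
# The single-letter answer format is an assumption, not the official protocol.
def accuracy(predictions, answers):
    """Both arguments are lists of option letters such as 'A' or 'B'."""
    correct = sum(p.strip().upper() == a.strip().upper()
                  for p, a in zip(predictions, answers))
    return correct / len(answers)

# Toy example with made-up values, not real benchmark results.
print(f"{accuracy(['A', 'C', 'B'], ['A', 'B', 'B']):.2%}")
```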
229 |
230 |
231 | **Vision Benchmark**
232 | The vision benchmark is composed of eight sections, each compiled from different authoritative TCM illustrated books.
233 |
234 | | | Samples |
235 | | ------------------------------------ | ------------------------------ |
236 | | TCM Patent | 1119 |
237 | | TCM Material | 1020 |
238 | | TCM Herb | 1100 |
239 | | Tongue | 768 |
240 | | Palm | 640 |
241 | | Holism | 1011 |
242 | | Tuina | 831 |
243 | | Eye | 715 |
244 |
245 |
246 |
247 | ## 📖 Citation
248 | ```
249 | @misc{chen2025shizhengptmultimodalllmstraditional,
250 | title={ShizhenGPT: Towards Multimodal LLMs for Traditional Chinese Medicine},
251 | author={Junying Chen and Zhenyang Cai and Zhiheng Liu and Yunjin Yang and Rongsheng Wang and Qingying Xiao and Xiangyi Feng and Zhan Su and Jing Guo and Xiang Wan and Guangjun Yu and Haizhou Li and Benyou Wang},
252 | year={2025},
253 | eprint={2508.14706},
254 | archivePrefix={arXiv},
255 | primaryClass={cs.CL},
256 | url={https://arxiv.org/abs/2508.14706},
257 | }
258 | ```
259 |
--------------------------------------------------------------------------------