├── assets
│   └── image.png
├── requirements.txt
├── demo
│   └── app_shizhengpt.py
└── README.md

/assets/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/ShizhenGPT/HEAD/assets/image.png

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
transformers==4.51.0
vllm==0.6.4
qwen_vl_utils
gradio
librosa
soundfile

--------------------------------------------------------------------------------
/demo/app_shizhengpt.py:
--------------------------------------------------------------------------------
import argparse
import os
import torch
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    TextIteratorStreamer
)
from qwen_vl_utils import fetch_image
from copy import deepcopy

# Argument parsing for model path
def parse_args():
    parser = argparse.ArgumentParser(description="Run multi-modal chatbot")
    parser.add_argument('--model_path', type=str, required=True, help="Path to the pre-trained model")
    return parser.parse_args()

# Load the model and processor (adjust device_map to match your hardware)
def load_model(model_path):
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.bfloat16, device_map="cuda:1", trust_remote_code=True
    )
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    model.eval()
    processor.chat_template = processor.tokenizer.chat_template
    return model, processor

# Audio processing (uses the module-level `processor` created in __main__)
def process_audio(audio):
    if audio is None:
        return None
    try:
        sr, y = audio
        y = y[:, 0] if y.ndim > 1 else y
        save_path = "./temp.wav"
        sf.write(save_path, y, sr)
        y_resampled = librosa.load(save_path, sr=processor.feature_extractor.sampling_rate)[0]
        return y_resampled
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

# Streaming response generation
def generate_with_streaming(model, processor, text, images=None, audios=None, history=None):

    processed_images = []
    if images is not None and images:
        text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>'] * len(images)) + text
        processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360*420})
                            for img in images if img is not None]
    else:
        processed_images = None

    processed_audios = []
    if audios is not None and audios:
        text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>'] * len(audios)) + text
        processed_audios = [audio for audio in audios if audio is not None]
    else:
        processed_audios = None

    messages = []
    if history:
        for user_msg, assistant_msg in history:
            messages.append({'role': 'user', 'content': user_msg})
            if len(assistant_msg) > 0:  # Ensure the assistant message is not empty
                messages.append({'role': 'assistant', 'content': assistant_msg})

    for xx in messages:
        xx['content'] = xx['content'].replace('<|audio_bos|><|AUDIO|><|audio_eos|>', '').replace('<|vision_start|><|image_pad|><|vision_end|>', '')
    messages.append({'role': 'user', 'content': text})

    print('messages', messages, flush=True)
    print('processed_images', processed_images, flush=True)
    print('processed_audios', processed_audios, flush=True)

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    if not text:
        text = ""

    input_data = processor(
        text=[text],
        audios=processed_audios,
        images=processed_images,
        return_tensors="pt",
        padding=True
    )
    print('input_ids', processor.tokenizer.decode(input_data['input_ids'][0]), flush=True)

    for k, v in input_data.items():
        if hasattr(v, "to"):
            input_data[k] = v.to(model.device)

    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(
        **input_data,
        streamer=streamer,
        max_new_tokens=1500,
        do_sample=True,
        temperature=0.2,
        top_p=0.8,
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    for new_text in streamer:
        yield new_text

def predict(message, image, audio, chatbox):

    chat_history = deepcopy(chatbox)
    print('[chat_history]', chat_history, flush=True)
    print('[message]', message, flush=True)

    processed_audio = None
    if audio is not None:
        processed_audio = [process_audio(audio)]

    processed_image = None
    if image is not None:
        processed_image = [image]

    chatbox.append([message, ""])
    response = ""

    for chunk in generate_with_streaming(model, processor, message, processed_image, processed_audio, chat_history):
        response += chunk
        chatbox[-1][1] = response
        yield chatbox

    print("\n=== Complete Model Response ===")
    print(response)
    print("============================\n", flush=True)

    return chatbox

# CSS for Gradio interface
css = """
.gradio-container {
    background-color: #f7f7f7;
    font-family: 'Arial', sans-serif;
}
.chat-message {
    padding: 15px;
    border-radius: 10px;
    margin-bottom: 10px;
}
.user-message {
    background-color: #e6f7ff;
    border-left: 5px solid #1890ff;
}
.bot-message {
    background-color: #f2f2f2;
    border-left: 5px solid #52c41a;
}
.title {
    text-align: center;
    color: #1890ff;
    font-size: 24px;
    margin-bottom: 20px;
}
"""

# Gradio UI setup
def setup_gradio_interface():
    with gr.Blocks(css=css) as demo:
        gr.HTML("<div class='title'>TCM-Omni 中医多模态大模型(融合望闻问切)</div>")

        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(height=500)
                message = gr.Textbox(label="Input your question", placeholder="Please type your question here...")

                with gr.Row():
                    submit_btn = gr.Button("Submit", variant="primary")
                    clear_btn = gr.Button("Clear")

            with gr.Column(scale=1):
                image_input = gr.Image(type="filepath", label="Upload Image")
                audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")

        submit_btn.click(predict, inputs=[message, image_input, audio_input, chatbot], outputs=[chatbot], show_progress=True).then(
            lambda: "", outputs=[message]
        )

        clear_btn.click(lambda: (None, None, None, []), outputs=[message, image_input, audio_input, chatbot])

    demo.queue().launch(server_name="0.0.0.0", server_port=7862, share=True)

# Main entry point
if __name__ == "__main__":
    args = parse_args()
    model, processor = load_model(args.model_path)
    setup_gradio_interface()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Towards Multimodal LLMs for Traditional Chinese Medicine

ShizhenGPT

📃 Paper | 📚 TCM Pre-training Dataset | 📚 TCM Instruction Data

📚 TCM Text Benchmark | 📚 TCM Vision Benchmark

🤗 ShizhenGPT-7B | 🤗 ShizhenGPT-32B

## ⚡ Introduction
Hello! Welcome to the repository for [ShizhenGPT](https://arxiv.org/abs/2508.14706)!

![ShizhenGPT](assets/image.png)

**ShizhenGPT** is the first multimodal LLM designed for Traditional Chinese Medicine (TCM). Pretrained and instruction-tuned on large-scale TCM data, it excels in TCM knowledge and can understand images, sounds, smells, and pulse signals, supporting the four diagnostics (望闻问切).

## 📚 The Largest Open-Source TCM Dataset

We open-source the largest available TCM dataset, consisting of a pretraining dataset and an instruction fine-tuning dataset.

|                             | Quantity   | Description                                                                | Download Link                                                                                                                                   |
| --------------------------- | ---------- | -------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| **TCM Pretraining Dataset** | ~6B tokens | Injects TCM knowledge and aligns visual and auditory understanding.        | [FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT](https://huggingface.co/datasets/FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT)             |
| **TCM Instruction Dataset** | 27K items  | Fine-tunes TCM LLMs to improve instruction-following and response quality. | [FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT](https://huggingface.co/datasets/FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT)   |

## 👨‍⚕️ Model

#### Model Access

> **ShizhenGPT** models are available on Hugging Face:

|                         | Parameters | Supported Modalities          | Link                                                                       |
| ----------------------- | ---------- | ----------------------------- | -------------------------------------------------------------------------- |
| **ShizhenGPT-7B-LLM**   | 7B         | Text                          | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-LLM)    |
| **ShizhenGPT-7B-VL**    | 7B         | Text, Image Understanding     | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-VL)     |
| **ShizhenGPT-7B-Omni**  | 7B         | Text, Four Diagnostics (望闻问切) | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-Omni)   |
| **ShizhenGPT-32B-LLM**  | 32B        | Text                          | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-32B-LLM)   |
| **ShizhenGPT-32B-VL**   | 32B        | Text, Image Understanding     | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-32B-VL)    |
| **ShizhenGPT-32B-Omni** | 32B        | Text, Four Diagnostics (望闻问切) | Available soon                                                             |

*Note: The LLM and VL models are parameter-split variants of ShizhenGPT-7B-Omni. Since their architectures align with Qwen2.5 and Qwen2.5-VL, they are easier to adapt to different environments. In contrast, ShizhenGPT-7B-Omni requires `transformers==4.51.0`, as pinned in `requirements.txt` (see the version check below).*

#### Model Inference
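
The Omni checkpoint depends on the `transformers` release pinned in `requirements.txt`. The quick sanity check below is only a suggested sketch, not part of the original repository, but it can catch version mismatches before running the examples that follow:

```python
# Optional sanity check (sketch): ShizhenGPT-7B-Omni requires transformers 4.51.0,
# as pinned in requirements.txt. The LLM/VL variants are less version-sensitive.
import transformers

assert transformers.__version__.startswith("4.51"), (
    f"Found transformers {transformers.__version__}; the Omni model requires 4.51.0"
)
```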

**A. Launch the Gradio Demo**

```shell
pip install gradio
python demo/app_shizhengpt.py --model_path FreedomIntelligence/ShizhenGPT-7B-Omni
```

**B. Text-based Inference**

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-LLM", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-LLM")

input_text = "为什么我总是手脚冰凉,是阳虚吗?"  # "Why are my hands and feet always cold? Is it yang deficiency?"
messages = [{"role": "user", "content": input_text}]

inputs = tokenizer(
    tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
    return_tensors="pt"
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=2048)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
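
`requirements.txt` also pins `vllm`, and ShizhenGPT-7B-LLM follows the Qwen2.5 architecture, so the text-only model can presumably be served with vLLM as well. The snippet below is an illustrative sketch rather than an officially documented path; the sampling values simply mirror the Gradio demo:

```python
# Sketch: serving the text-only model with vLLM (not covered by the original docs).
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "FreedomIntelligence/ShizhenGPT-7B-LLM"
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm = LLM(model=model_id)

# Build a chat-formatted prompt, then sample with the demo's temperature/top_p.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "为什么我总是手脚冰凉,是阳虚吗?"}],
    tokenize=False, add_generation_prompt=True,
)
outputs = llm.generate([prompt], SamplingParams(temperature=0.2, top_p=0.8, max_tokens=1024))
print(outputs[0].outputs[0].text)
```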

**C. Image-Text-to-Text**

```python
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

processor = AutoProcessor.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-VL")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-VL", torch_dtype="auto", device_map="auto")

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/path/to/your/image.png",
            },
            {"type": "text", "text": "请从中医角度解读这张舌苔。"},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```

**D. Signal-Image-Text-to-Text**

```python
from transformers import AutoModelForCausalLM, AutoProcessor
from qwen_vl_utils import fetch_image
import librosa

# Load model and processor
model_path = 'FreedomIntelligence/ShizhenGPT-7B-Omni'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype="auto").cuda()
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

def generate(text, images=None, signals=None):
    # Process images if provided
    processed_images = []
    if images is not None and images:
        text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>'] * len(images)) + text
        processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360*420})
                            for img in images if img is not None]
    else:
        processed_images = None

    # Process audio signals if provided
    processed_signals = []
    if signals is not None and signals:
        text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>'] * len(signals)) + text
        processed_signals = [librosa.load(signal, sr=processor.feature_extractor.sampling_rate)[0]
                             for signal in signals if signal is not None]
    else:
        processed_signals = None

    # Prepare messages
    messages = [{'role': 'user', 'content': text}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Ensure text is non-empty
    if not text:
        text = ""

    # Process the input data
    input_data = processor(
        text=[text],
        audios=processed_signals,
        images=processed_images,
        return_tensors="pt",
        padding=True
    )
    input_data = input_data.to(model.device)

    # Generate the output
    generated_ids = model.generate(**input_data, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(input_data.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]

# Example usage
# Text input
print(generate('为什么我总是手脚冰凉,是阳虚吗?'))
# Image input
print(generate('请从中医角度解读这张舌苔。', images=['path_to_image']))
# Audio input
print(generate('请回答这个语音问题', signals=['path_to_audio']))
```

## 🧐 Evaluation

**Text Benchmark**

The text benchmark is composed of five sections, each compiled from a different national-level TCM examination.

| Exam                                                       | Samples |
| ---------------------------------------------------------- | ------- |
| 2024 TCM Pharmacist (2024年中医药剂师考试)                        | 480     |
| 2024 TCM Physician (2024年中医职业医师资格考试)                     | 184     |
| 2024 TCM Assistant Physician (2024年中医助理职业医师资格考试)        | 138     |
| 2024 TCM Graduate Entrance Examination (2024年中医综合考研真题)    | 147     |
| 2025 TCM Graduate Entrance Examination (2025年中医综合考研真题)    | 139     |

**Vision Benchmark**

The vision benchmark is composed of eight sections, each compiled from authoritative TCM illustrated books.

| Category     | Samples |
| ------------ | ------- |
| TCM Patent   | 1119    |
| TCM Material | 1020    |
| TCM Herb     | 1100    |
| Tongue       | 768     |
| Palm         | 640     |
| Holism       | 1011    |
| Tuina        | 831     |
| Eye          | 715     |

## 📖 Citation

```
@misc{chen2025shizhengptmultimodalllmstraditional,
      title={ShizhenGPT: Towards Multimodal LLMs for Traditional Chinese Medicine},
      author={Junying Chen and Zhenyang Cai and Zhiheng Liu and Yunjin Yang and Rongsheng Wang and Qingying Xiao and Xiangyi Feng and Zhan Su and Jing Guo and Xiang Wan and Guangjun Yu and Haizhou Li and Benyou Wang},
      year={2025},
      eprint={2508.14706},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2508.14706},
}
```
--------------------------------------------------------------------------------