├── assets
│   └── image.png
├── requirements.txt
├── demo
│   └── app_shizhengpt.py
└── README.md

/assets/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FreedomIntelligence/ShizhenGPT/HEAD/assets/image.png

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
transformers==4.51.0
vllm==0.6.4
qwen_vl_utils
gradio
librosa
soundfile

--------------------------------------------------------------------------------
/demo/app_shizhengpt.py:
--------------------------------------------------------------------------------
import argparse
import os
import torch
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from threading import Thread
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    TextIteratorStreamer
)
from qwen_vl_utils import fetch_image
from copy import deepcopy

# Argument parsing for model path
def parse_args():
    parser = argparse.ArgumentParser(description="Run multi-modal chatbot")
    parser.add_argument('--model_path', type=str, required=True, help="Path to the pre-trained model")
    return parser.parse_args()

# Load the model and processor (adjust device_map to match your hardware)
def load_model(model_path):
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.bfloat16, device_map="cuda:1", trust_remote_code=True
    )
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
    model.eval()
    processor.chat_template = processor.tokenizer.chat_template
    return model, processor

# Audio processing (uses the module-level `processor` created in __main__)
def process_audio(audio):
    if audio is None:
        return None
    try:
        sr, y = audio
        y = y[:, 0] if y.ndim > 1 else y
        save_path = "./temp.wav"
        sf.write(save_path, y, sr)
        y_resampled = librosa.load(save_path, sr=processor.feature_extractor.sampling_rate)[0]
        return y_resampled
    except Exception as e:
        print(f"Error processing audio: {e}")
        return None

# Streaming response generation
def generate_with_streaming(model, processor, text, images=None, audios=None, history=None):

    processed_images = []
    if images is not None and images:
        text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>'] * len(images)) + text
        processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360*420})
                            for img in images if img is not None]
    else:
        processed_images = None

    processed_audios = []
    if audios is not None and audios:
        text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>'] * len(audios)) + text
        processed_audios = [audio for audio in audios if audio is not None]
    else:
        processed_audios = None

    messages = []
    if history:
        for user_msg, assistant_msg in history:
            messages.append({'role': 'user', 'content': user_msg})
            if len(assistant_msg) > 0:  # Ensure the assistant message is not empty
                messages.append({'role': 'assistant', 'content': assistant_msg})

    for xx in messages:
        xx['content'] = xx['content'].replace('<|audio_bos|><|AUDIO|><|audio_eos|>', '').replace('<|vision_start|><|image_pad|><|vision_end|>', '')
    messages.append({'role': 'user', 'content': text})

    print('messages', messages, flush=True)
    print('processed_images', processed_images, flush=True)
    print('processed_audios', processed_audios, flush=True)

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    if not text:
        text = ""

    input_data = processor(
        text=[text],
        audios=processed_audios,
        images=processed_images,
        return_tensors="pt",
        padding=True
    )
    print('input_ids', processor.tokenizer.decode(input_data['input_ids'][0]), flush=True)

    for k, v in input_data.items():
        if hasattr(v, "to"):
            input_data[k] = v.to(model.device)

    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
    generation_kwargs = dict(
        **input_data,
        streamer=streamer,
        max_new_tokens=1500,
        do_sample=True,
        temperature=0.2,
        top_p=0.8,
    )

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    for new_text in streamer:
        yield new_text

def predict(message, image, audio, chatbox):

    chat_history = deepcopy(chatbox)
    print('[chat_history]', chat_history, flush=True)
    print('[message]', message, flush=True)

    processed_audio = None
    if audio is not None:
        processed_audio = [process_audio(audio)]

    processed_image = None
    if image is not None:
        processed_image = [image]

    chatbox.append([message, ""])
    response = ""

    for chunk in generate_with_streaming(model, processor, message, processed_image, processed_audio, chat_history):
        response += chunk
        chatbox[-1][1] = response
        yield chatbox

    print("\n=== Complete Model Response ===")
    print(response)
    print("============================\n", flush=True)

    return chatbox

# CSS for Gradio interface
css = """
.gradio-container {
    background-color: #f7f7f7;
    font-family: 'Arial', sans-serif;
}
.chat-message {
    padding: 15px;
    border-radius: 10px;
    margin-bottom: 10px;
}
.user-message {
    background-color: #e6f7ff;
    border-left: 5px solid #1890ff;
}
.bot-message {
    background-color: #f2f2f2;
    border-left: 5px solid #52c41a;
}
.title {
    text-align: center;
    color: #1890ff;
    font-size: 24px;
    margin-bottom: 20px;
}
"""

# Gradio UI setup
def setup_gradio_interface():
    with gr.Blocks(css=css) as demo:
        gr.HTML("<div class='title'>TCM-Omni 中医多模态大模型(融合望闻问切)</div>")

        with gr.Row():
            with gr.Column(scale=2):
                chatbot = gr.Chatbot(height=500)
                message = gr.Textbox(label="Input your question", placeholder="Please type your question here...")

                with gr.Row():
                    submit_btn = gr.Button("Submit", variant="primary")
                    clear_btn = gr.Button("Clear")

            with gr.Column(scale=1):
                image_input = gr.Image(type="filepath", label="Upload Image")
                audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")

        submit_btn.click(predict, inputs=[message, image_input, audio_input, chatbot], outputs=[chatbot], show_progress=True).then(
            lambda: "", outputs=[message]
        )

        clear_btn.click(lambda: (None, None, None, []), outputs=[message, image_input, audio_input, chatbot])

    demo.queue().launch(server_name="0.0.0.0", server_port=7862, share=True)

# Main entry point
if __name__ == "__main__":
    args = parse_args()
    model, processor = load_model(args.model_path)
    setup_gradio_interface()

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Towards Multimodal LLMs for Traditional Chinese Medicine

ShizhenGPT

📃 Paper | 📚 TCM Pre-training Dataset | 📚 TCM Instruction Data

📚 TCM Text Benchmark | 📚 TCM Vision Benchmark

🤗 ShizhenGPT-7B | 🤗 ShizhenGPT-32B

## ⚡ Introduction
Hello! Welcome to the repository for [ShizhenGPT](https://arxiv.org/abs/2508.14706)!

![ShizhenGPT](assets/image.png)

**ShizhenGPT** is the first multimodal LLM designed for Traditional Chinese Medicine (TCM). Pretrained and instruction-tuned on large-scale TCM data, it excels in TCM knowledge and can understand images, sounds, smells, and pulse signals, supporting the four diagnostics (望闻问切).

## 📚 The Largest Open-Source TCM Dataset

We open-source the largest available TCM dataset, consisting of a pretraining dataset and an instruction fine-tuning dataset.

|                             | Quantity   | Description                                                                | Download Link                                                                                                                                   |
| --------------------------- | ---------- | -------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| **TCM Pretraining Dataset** | ~6B tokens | Injects TCM knowledge and aligns visual and auditory understanding.        | [FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT](https://huggingface.co/datasets/FreedomIntelligence/TCM-Pretrain-Data-ShizhenGPT)             |
| **TCM Instruction Dataset** | 27K items  | Fine-tunes TCM LLMs to improve instruction-following and response quality. | [FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT](https://huggingface.co/datasets/FreedomIntelligence/TCM-Instruction-Tuning-ShizhenGPT)   |

## 👨‍⚕️ Model

#### Model Access

> **ShizhenGPT** models are available on Hugging Face:

|                         | Parameters | Supported Modalities          | Link                                                                       |
| ----------------------- | ---------- | ----------------------------- | -------------------------------------------------------------------------- |
| **ShizhenGPT-7B-LLM**   | 7B         | Text                          | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-LLM)    |
| **ShizhenGPT-7B-VL**    | 7B         | Text, Image Understanding     | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-VL)     |
| **ShizhenGPT-7B-Omni**  | 7B         | Text, Four Diagnostics (望闻问切) | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-7B-Omni)   |
| **ShizhenGPT-32B-LLM**  | 32B        | Text                          | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-32B-LLM)   |
| **ShizhenGPT-32B-VL**   | 32B        | Text, Image Understanding     | [HF Link](https://huggingface.co/FreedomIntelligence/ShizhenGPT-32B-VL)    |
| **ShizhenGPT-32B-Omni** | 32B        | Text, Four Diagnostics (望闻问切) | Available soon                                                             |

*Note: The LLM and VL models are parameter-split variants of ShizhenGPT-7B-Omni. Since their architectures align with Qwen2.5 and Qwen2.5-VL, they are easier to adapt to different environments. In contrast, ShizhenGPT-7B-Omni requires `transformers==4.51.0`, as pinned in `requirements.txt` (see the version check below).*

#### Model Inference
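
The Omni checkpoint depends on the `transformers` release pinned in `requirements.txt`. The quick sanity check below is only a suggested sketch, not part of the original repository, but it can catch version mismatches before running the examples that follow:

```python
# Optional sanity check (sketch): ShizhenGPT-7B-Omni requires transformers 4.51.0,
# as pinned in requirements.txt. The LLM/VL variants are less version-sensitive.
import transformers

assert transformers.__version__.startswith("4.51"), (
    f"Found transformers {transformers.__version__}; the Omni model requires 4.51.0"
)
```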

**A. Launch the Gradio Demo**

```shell
pip install gradio
python demo/app_shizhengpt.py --model_path FreedomIntelligence/ShizhenGPT-7B-Omni
```

**B. Text-based Inference**

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-LLM", torch_dtype="auto", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-LLM")

input_text = "为什么我总是手脚冰凉,是阳虚吗?"  # "Why are my hands and feet always cold? Is it yang deficiency?"
messages = [{"role": "user", "content": input_text}]

inputs = tokenizer(
    tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True),
    return_tensors="pt"
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=2048)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
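
`requirements.txt` also pins `vllm`, and ShizhenGPT-7B-LLM follows the Qwen2.5 architecture, so the text-only model can presumably be served with vLLM as well. The snippet below is an illustrative sketch rather than an officially documented path; the sampling values simply mirror the Gradio demo:

```python
# Sketch: serving the text-only model with vLLM (not covered by the original docs).
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "FreedomIntelligence/ShizhenGPT-7B-LLM"
tokenizer = AutoTokenizer.from_pretrained(model_id)
llm = LLM(model=model_id)

# Build a chat-formatted prompt, then sample with the demo's temperature/top_p.
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "为什么我总是手脚冰凉,是阳虚吗?"}],
    tokenize=False, add_generation_prompt=True,
)
outputs = llm.generate([prompt], SamplingParams(temperature=0.2, top_p=0.8, max_tokens=1024))
print(outputs[0].outputs[0].text)
```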

**C. Image-Text-to-Text**

```python
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

processor = AutoProcessor.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-VL")
model = Qwen2_5_VLForConditionalGeneration.from_pretrained("FreedomIntelligence/ShizhenGPT-7B-VL", torch_dtype="auto", device_map="auto")

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/path/to/your/image.png",
            },
            {"type": "text", "text": "请从中医角度解读这张舌苔。"},
        ],
    }
]

text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
```

**D. Signal-Image-Text-to-Text**

```python
from transformers import AutoModelForCausalLM, AutoProcessor
from qwen_vl_utils import fetch_image
import librosa

# Load model and processor
model_path = 'FreedomIntelligence/ShizhenGPT-7B-Omni'
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype="auto").cuda()
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

def generate(text, images=None, signals=None):
    # Process images if provided
    processed_images = []
    if images is not None and images:
        text = ''.join(['<|vision_start|><|image_pad|><|vision_end|>'] * len(images)) + text
        processed_images = [fetch_image({"type": "image", "image": img, "max_pixels": 360*420})
                            for img in images if img is not None]
    else:
        processed_images = None

    # Process audio signals if provided
    processed_signals = []
    if signals is not None and signals:
        text = ''.join(['<|audio_bos|><|AUDIO|><|audio_eos|>'] * len(signals)) + text
        processed_signals = [librosa.load(signal, sr=processor.feature_extractor.sampling_rate)[0]
                             for signal in signals if signal is not None]
    else:
        processed_signals = None

    # Prepare messages
    messages = [{'role': 'user', 'content': text}]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Ensure text is non-empty
    if not text:
        text = ""

    # Process the input data
    input_data = processor(
        text=[text],
        audios=processed_signals,
        images=processed_images,
        return_tensors="pt",
        padding=True
    )
    input_data = input_data.to(model.device)

    # Generate the output
    generated_ids = model.generate(**input_data, max_new_tokens=1024)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(input_data.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]

# Example usage
# Text input
print(generate('为什么我总是手脚冰凉,是阳虚吗?'))
# Image input
print(generate('请从中医角度解读这张舌苔。', images=['path_to_image']))
# Audio input
print(generate('请回答这个语音问题', signals=['path_to_audio']))
```

## 🧐 Evaluation

**Text Benchmark**

The text benchmark is composed of five sections, each compiled from a different national-level TCM examination.

| Exam                                                       | Samples |
| ---------------------------------------------------------- | ------- |
| 2024 TCM Pharmacist (2024年中医药剂师考试)                        | 480     |
| 2024 TCM Physician (2024年中医职业医师资格考试)                     | 184     |
| 2024 TCM Assistant Physician (2024年中医助理职业医师资格考试)        | 138     |
| 2024 TCM Graduate Entrance Examination (2024年中医综合考研真题)    | 147     |
| 2025 TCM Graduate Entrance Examination (2025年中医综合考研真题)    | 139     |

**Vision Benchmark**

The vision benchmark is composed of eight sections, each compiled from authoritative TCM illustrated books.

| Category     | Samples |
| ------------ | ------- |
| TCM Patent   | 1119    |
| TCM Material | 1020    |
| TCM Herb     | 1100    |
| Tongue       | 768     |
| Palm         | 640     |
| Holism       | 1011    |
| Tuina        | 831     |
| Eye          | 715     |

## 📖 Citation

```
@misc{chen2025shizhengptmultimodalllmstraditional,
      title={ShizhenGPT: Towards Multimodal LLMs for Traditional Chinese Medicine},
      author={Junying Chen and Zhenyang Cai and Zhiheng Liu and Yunjin Yang and Rongsheng Wang and Qingying Xiao and Xiangyi Feng and Zhan Su and Jing Guo and Xiang Wan and Guangjun Yu and Haizhou Li and Benyou Wang},
      year={2025},
      eprint={2508.14706},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2508.14706},
}
```
--------------------------------------------------------------------------------