├── .gitignore ├── BraTS2021_Training_Data ├── BraTS2021_00000 │ ├── BraTS2021_00000_flair │ │ └── 00000057_brain_flair.nii │ ├── BraTS2021_00000_seg │ │ └── 00000057_final_seg.nii │ ├── BraTS2021_00000_t1 │ │ └── 00000057_brain_t1.nii │ ├── BraTS2021_00000_t1ce │ │ └── 00000057_brain_t1ce.nii │ └── BraTS2021_00000_t2 │ │ └── 00000057_brain_t2.nii ├── BraTS2021_00002 │ ├── BraTS2021_00002_flair │ │ └── 00000014_brain_flair.nii │ ├── BraTS2021_00002_seg │ │ └── BraTS2021_00002_seg_new.nii │ ├── BraTS2021_00002_t1 │ │ └── 00000014_brain_t1.nii │ ├── BraTS2021_00002_t1ce │ │ └── 00000014_brain_t1ce.nii │ └── BraTS2021_00002_t2 │ │ └── 00000014_brain_t2.nii └── BraTS2021_00003 │ ├── BraTS2021_00003_flair │ └── 00000017_brain_flair.nii │ ├── BraTS2021_00003_seg │ └── BraTS2021_00003_seg_new.nii │ ├── BraTS2021_00003_t1 │ └── 00000017_brain_t1.nii │ ├── BraTS2021_00003_t1ce │ └── 00000017_brain_t1ce.nii │ └── BraTS2021_00003_t2 │ └── 00000017_brain_t2.nii ├── How_I_Use_LLM_to_DIY_metadata.ipynb ├── LLM_metadata.py ├── __pycache__ ├── config.cpython-310.pyc └── pipeline.cpython-310.pyc ├── after.png ├── before.png ├── pipeline.py ├── pipeline_example.py ├── readme.md ├── readme_en.md └── tutorial.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | BraTS2021_Training_Data 3 | result.json 4 | preprocessed_data 5 | __pycache__ 6 | .vscode -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii -------------------------------------------------------------------------------- /BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii -------------------------------------------------------------------------------- /How_I_Use_LLM_to_DIY_metadata.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 17, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "{\n", 13 | " \"root_directory\": \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data\",\n", 14 | " \"a_level_summary\": {\n", 15 | " \"total_a_folders\": 3,\n", 16 | " \"example_a_folders\": [\n", 17 | " \"BraTS2021_00000\",\n", 18 | " \"BraTS2021_00002\",\n", 19 | " \"BraTS2021_00003\"\n", 20 | " ]\n", 21 | " },\n", 22 | " \"sampled_a_folders\": [\n", 23 | " {\n", 24 | " \"a_folder_name\": \"BraTS2021_00002\",\n", 25 | " \"directory_tree\": [\n", 26 | " {\n", 27 | " \"level\": 0,\n", 28 | " \"folder_name\": \"BraTS2021_00002\",\n", 29 | " \"sub_folders\": [\n", 30 | " \"BraTS2021_00002_flair\",\n", 31 | " \"BraTS2021_00002_seg\",\n", 32 | " \"BraTS2021_00002_t1\",\n", 33 | " \"BraTS2021_00002_t1ce\",\n", 34 | " \"BraTS2021_00002_t2\"\n", 35 | " ],\n", 36 | " \"file_count\": 0\n", 37 | " },\n", 38 | " {\n", 39 | " \"level\": 1,\n", 40 | " \"folder_name\": \"BraTS2021_00002_flair\",\n", 41 | " \"sub_folders\": [],\n", 42 | " \"file_count\": 1\n", 43 | " },\n", 44 | " {\n", 45 | " \"level\": 1,\n", 46 | " \"folder_name\": \"BraTS2021_00002_seg\",\n", 47 | " \"sub_folders\": [],\n", 48 | " \"file_count\": 1\n", 49 | " },\n", 50 | " {\n", 51 | " \"level\": 1,\n", 52 | " \"folder_name\": \"BraTS2021_00002_t1\",\n", 53 | " \"sub_folders\": [],\n", 54 | " \"file_count\": 1\n", 55 | " },\n", 56 | " {\n", 57 | " \"level\": 1,\n", 58 | " \"folder_name\": \"BraTS2021_00002_t1ce\",\n", 59 | " \"sub_folders\": [],\n", 60 | " \"file_count\": 1\n", 61 | " },\n", 62 | " {\n", 63 | " \"level\": 1,\n", 64 | " \"folder_name\": \"BraTS2021_00002_t2\",\n", 65 | " \"sub_folders\": [],\n", 66 | " \"file_count\": 1\n", 67 | " }\n", 68 | " ],\n", 69 | " \"sampled_files\": [\n", 70 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii\",\n", 71 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii\",\n", 72 | " 
\"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii\",\n", 73 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii\",\n", 74 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii\"\n", 75 | " ]\n", 76 | " },\n", 77 | " {\n", 78 | " \"a_folder_name\": \"BraTS2021_00000\",\n", 79 | " \"directory_tree\": [\n", 80 | " {\n", 81 | " \"level\": 0,\n", 82 | " \"folder_name\": \"BraTS2021_00000\",\n", 83 | " \"sub_folders\": [\n", 84 | " \"BraTS2021_00000_flair\",\n", 85 | " \"BraTS2021_00000_seg\",\n", 86 | " \"BraTS2021_00000_t1\",\n", 87 | " \"BraTS2021_00000_t1ce\",\n", 88 | " \"BraTS2021_00000_t2\"\n", 89 | " ],\n", 90 | " \"file_count\": 0\n", 91 | " },\n", 92 | " {\n", 93 | " \"level\": 1,\n", 94 | " \"folder_name\": \"BraTS2021_00000_flair\",\n", 95 | " \"sub_folders\": [],\n", 96 | " \"file_count\": 1\n", 97 | " },\n", 98 | " {\n", 99 | " \"level\": 1,\n", 100 | " \"folder_name\": \"BraTS2021_00000_seg\",\n", 101 | " \"sub_folders\": [],\n", 102 | " \"file_count\": 1\n", 103 | " },\n", 104 | " {\n", 105 | " \"level\": 1,\n", 106 | " \"folder_name\": \"BraTS2021_00000_t1\",\n", 107 | " \"sub_folders\": [],\n", 108 | " \"file_count\": 1\n", 109 | " },\n", 110 | " {\n", 111 | " \"level\": 1,\n", 112 | " \"folder_name\": \"BraTS2021_00000_t1ce\",\n", 113 | " \"sub_folders\": [],\n", 114 | " \"file_count\": 1\n", 115 | " },\n", 116 | " {\n", 117 | " \"level\": 1,\n", 118 | " \"folder_name\": \"BraTS2021_00000_t2\",\n", 119 | " \"sub_folders\": [],\n", 120 | " \"file_count\": 1\n", 121 | " }\n", 122 | " ],\n", 123 | " \"sampled_files\": [\n", 124 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii\",\n", 125 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii\",\n", 126 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii\",\n", 127 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii\",\n", 128 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii\"\n", 129 | " ]\n", 130 | " },\n", 131 | " {\n", 132 | " \"a_folder_name\": \"BraTS2021_00003\",\n", 133 | " \"directory_tree\": [\n", 134 | " {\n", 135 | " \"level\": 0,\n", 136 | " \"folder_name\": \"BraTS2021_00003\",\n", 137 | " \"sub_folders\": [\n", 138 | " \"BraTS2021_00003_flair\",\n", 139 | " \"BraTS2021_00003_seg\",\n", 140 | " \"BraTS2021_00003_t1\",\n", 141 | " \"BraTS2021_00003_t1ce\",\n", 142 | " \"BraTS2021_00003_t2\"\n", 143 | " ],\n", 144 | " \"file_count\": 0\n", 145 | " },\n", 146 | " {\n", 147 | " \"level\": 1,\n", 148 | " \"folder_name\": \"BraTS2021_00003_flair\",\n", 149 | " \"sub_folders\": [],\n", 150 | " \"file_count\": 1\n", 151 | " },\n", 152 | " {\n", 153 | " \"level\": 1,\n", 154 | " \"folder_name\": \"BraTS2021_00003_seg\",\n", 155 | " \"sub_folders\": [],\n", 156 | " \"file_count\": 1\n", 157 | " },\n", 158 | " {\n", 159 | " \"level\": 1,\n", 160 | " \"folder_name\": \"BraTS2021_00003_t1\",\n", 161 | " \"sub_folders\": [],\n", 162 | " \"file_count\": 1\n", 163 | " },\n", 164 | " {\n", 165 | " 
\"level\": 1,\n", 166 | " \"folder_name\": \"BraTS2021_00003_t1ce\",\n", 167 | " \"sub_folders\": [],\n", 168 | " \"file_count\": 1\n", 169 | " },\n", 170 | " {\n", 171 | " \"level\": 1,\n", 172 | " \"folder_name\": \"BraTS2021_00003_t2\",\n", 173 | " \"sub_folders\": [],\n", 174 | " \"file_count\": 1\n", 175 | " }\n", 176 | " ],\n", 177 | " \"sampled_files\": [\n", 178 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii\",\n", 179 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii\",\n", 180 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii\",\n", 181 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii\",\n", 182 | " \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii\"\n", 183 | " ]\n", 184 | " }\n", 185 | " ]\n", 186 | "}\n", 187 | "\n", 188 | "结果已保存到: directory_analysis.json\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "import os\n", 194 | "import random\n", 195 | "import json\n", 196 | "\n", 197 | "def analyze_directory(root_dir, sample_folder_count=1, sample_file_count=5):\n", 198 | " # 用于存储最终的结果\n", 199 | " result = {\n", 200 | " \"root_directory\": root_dir,\n", 201 | " \"a_level_summary\": {\n", 202 | " \"total_a_folders\": 0,\n", 203 | " \"example_a_folders\": [],\n", 204 | " },\n", 205 | " \"sampled_a_folders\": []\n", 206 | " }\n", 207 | "\n", 208 | " # 1. 计算根目录下的 A 级文件夹数量并打印\n", 209 | " a_level_folders = [f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))]\n", 210 | " result[\"a_level_summary\"][\"total_a_folders\"] = len(a_level_folders)\n", 211 | " result[\"a_level_summary\"][\"example_a_folders\"] = a_level_folders[:5]\n", 212 | "\n", 213 | " # 2. 随机采样 A 级文件夹\n", 214 | " sampled_a_folders = random.sample(a_level_folders, min(sample_folder_count, len(a_level_folders)))\n", 215 | "\n", 216 | " for sampled_a_folder in sampled_a_folders:\n", 217 | " sampled_a_path = os.path.join(root_dir, sampled_a_folder)\n", 218 | " sampled_a_info = {\n", 219 | " \"a_folder_name\": sampled_a_folder,\n", 220 | " \"directory_tree\": [],\n", 221 | " \"sampled_files\": []\n", 222 | " }\n", 223 | "\n", 224 | " # 3. 穷尽 A 级文件夹下的目录树\n", 225 | " file_list = []\n", 226 | " for root, dirs, files in os.walk(sampled_a_path):\n", 227 | " # 获取当前路径的相对路径和层级\n", 228 | " relative_root = os.path.relpath(root, sampled_a_path)\n", 229 | " folder_level = len(relative_root.split(os.sep)) if relative_root != \".\" else 0\n", 230 | "\n", 231 | " # 保存目录树信息\n", 232 | " sampled_a_info[\"directory_tree\"].append({\n", 233 | " \"level\": folder_level,\n", 234 | " \"folder_name\": os.path.basename(root),\n", 235 | " \"sub_folders\": dirs,\n", 236 | " \"file_count\": len(files)\n", 237 | " })\n", 238 | "\n", 239 | " # 收集文件地址\n", 240 | " file_list.extend([os.path.join(root, f) for f in files])\n", 241 | "\n", 242 | " # 4. 
随机采样末端文件\n", 243 | " sampled_files = random.sample(file_list, min(sample_file_count, len(file_list)))\n", 244 | " sampled_a_info[\"sampled_files\"] = sampled_files\n", 245 | "\n", 246 | " # 添加到结果中\n", 247 | " result[\"sampled_a_folders\"].append(sampled_a_info)\n", 248 | "\n", 249 | " # 将结果格式化为 JSON 并打印\n", 250 | " formatted_result = json.dumps(result, indent=4, ensure_ascii=False)\n", 251 | " print(formatted_result)\n", 252 | "\n", 253 | " # 可选择将结果保存到文件\n", 254 | " output_file = \"directory_analysis.json\"\n", 255 | " with open(output_file, \"w\", encoding=\"utf-8\") as f:\n", 256 | " f.write(formatted_result)\n", 257 | " print(f\"\\n结果已保存到: {output_file}\")\n", 258 | "\n", 259 | "# 设置根目录\n", 260 | "root_directory = \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data\"\n", 261 | "analyze_directory(root_directory, sample_file_count=10, sample_folder_count=4)\n" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 24, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "模型输出的原始内容:\n", 274 | "```python\n", 275 | "import os\n", 276 | "import csv\n", 277 | "\n", 278 | "def generate_metadata(root_directory):\n", 279 | " metadata = []\n", 280 | " sample_id = 0\n", 281 | "\n", 282 | " for a_folder in os.listdir(root_directory):\n", 283 | " a_folder_path = os.path.join(root_directory, a_folder)\n", 284 | " if os.path.isdir(a_folder_path):\n", 285 | " flair_path = t1_path = t1ce_path = t2_path = seg_path = \"\"\n", 286 | "\n", 287 | " for sub_folder in os.listdir(a_folder_path):\n", 288 | " sub_folder_path = os.path.join(a_folder_path, sub_folder)\n", 289 | " if os.path.isdir(sub_folder_path):\n", 290 | " if sub_folder.endswith(\"_flair\"):\n", 291 | " flair_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n", 292 | " elif sub_folder.endswith(\"_t1\"):\n", 293 | " t1_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n", 294 | " elif sub_folder.endswith(\"_t1ce\"):\n", 295 | " t1ce_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n", 296 | " elif sub_folder.endswith(\"_t2\"):\n", 297 | " t2_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n", 298 | " elif sub_folder.endswith(\"_seg\"):\n", 299 | " seg_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n", 300 | "\n", 301 | " metadata.append({\n", 302 | " \"sample_id\": sample_id,\n", 303 | " \"flair_path\": flair_path,\n", 304 | " \"t1_path\": t1_path,\n", 305 | " \"t1ce_path\": t1ce_path,\n", 306 | " \"t2_path\": t2_path,\n", 307 | " \"seg_path\": seg_path\n", 308 | " })\n", 309 | " sample_id += 1\n", 310 | "\n", 311 | " # Save metadata to csv\n", 312 | " csv_path = os.path.join(root_directory, \"metadata.csv\")\n", 313 | " with open(csv_path, mode='w', newline='') as file:\n", 314 | " writer = csv.DictWriter(file, fieldnames=[\"sample_id\", \"flair_path\", \"t1_path\", \"t1ce_path\", \"t2_path\", \"seg_path\"])\n", 315 | " writer.writeheader()\n", 316 | " for row in metadata:\n", 317 | " writer.writerow(row)\n", 318 | "\n", 319 | "# Example usage\n", 320 | "root_directory = \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data\"\n", 321 | "generate_metadata(root_directory)\n", 322 | "```\n", 323 | "\n", 324 | "### 代码说明:\n", 325 | "1. **文件命名规律分析**:\n", 326 | " - 文件夹命名中包含 `_flair`, `_t1`, `_t1ce`, `_t2` 的为多模态文件。\n", 327 | " - 文件夹命名中包含 `_seg` 的为掩码文件。\n", 328 | "\n", 329 | "2. 
**生成 metadata.csv**:\n", 330 | " - `sample_id`:使用数字序号。\n", 331 | " - `flair_path`, `t1_path`, `t1ce_path`, `t2_path`:分别对应不同模态的文件路径。\n", 332 | " - `seg_path`:掩码文件路径。\n", 333 | " - 若某模态或掩码文件不存在,则对应路径为空。\n", 334 | "\n", 335 | "3. **代码执行**:\n", 336 | " - 代码会遍历根目录下的所有 A 级文件夹,并提取各模态和掩码文件的路径,最终生成 `metadata.csv` 文件并保存在根目录下。\n", 337 | "LLM 的响应内容已保存到 result.json 文件中。\n", 338 | "生成的 Python 代码已保存到 generate_metadata.py 文件中。\n" 339 | ] 340 | } 341 | ], 342 | "source": [ 343 | "import requests\n", 344 | "import csv\n", 345 | "import re\n", 346 | "import json\n", 347 | "import os\n", 348 | "\n", 349 | "\n", 350 | "\n", 351 | "def generate_metadata(root_directory, your_api_key=None):\n", 352 | " # DeepSeek API 的 URL 和 API 密钥\n", 353 | " DEEPSEEK_API_URL = \"https://api.deepseek.com/v1/chat/completions\"\n", 354 | "\n", 355 | " # 从环境变量中读取 API 密钥\n", 356 | " # 我推荐你到DeepSeek官网注册一个账号,然后在个人中心获取API_KEY,他们会给你你一辈子都用不完的额度\n", 357 | " # 获取之后填写到API = \"你的API\"中\n", 358 | " \n", 359 | " if your_api_key is None:\n", 360 | " if os.path.exists(\"config.py\"):\n", 361 | " from config import API_KEY\n", 362 | " API_KEY = API_KEY\n", 363 | "\n", 364 | " # 读取 JSON 文件\n", 365 | " with open(\"directory_analysis.json\", \"r\") as f:\n", 366 | " json_input = f.read()\n", 367 | "\n", 368 | " # 构建请求数据\n", 369 | " data = {\n", 370 | " \"model\": \"deepseek-chat\",\n", 371 | " \"messages\": [\n", 372 | " {\n", 373 | " \"role\": \"system\",\n", 374 | " \"content\": (\n", 375 | " \"你是一名熟练的数据科学家,善于解析复杂的文件目录并生成元数据表格。\"\n", 376 | " \"你的任务是帮助用户分析医学影像数据集,并根据采样的文件结构生成metadata.csv。\"\n", 377 | " \"你只需要输出带有恰当注释的python代码即可,多余的信息不输出。\"\n", 378 | " )\n", 379 | " },\n", 380 | " {\n", 381 | " \"role\": \"user\",\n", 382 | " \"content\": (\n", 383 | " f\"我正在浏览一个医学影像数据集,它的根目录为:{json.loads(json_input)['root_directory']}。\\n\"\n", 384 | " \"这个数据集包含若干影像文件(可能包括多模态文件、单模态文件和掩码文件)。\\n\"\n", 385 | " \"我采样了一些子文件夹(记为 A 级文件夹)以及其中的 B/C 级文件夹,目录树和采样文件的信息如下:\\n\"\n", 386 | " f\"{json_input}\\n\"\n", 387 | " \"我需要你:\\n\"\n", 388 | " \"1. 分析文件命名的规律,判断是否存在多模态文件或掩码文件。\\n\"\n", 389 | " \"2. 根据这些规律生成构建 metadata.csv 的 Python 代码。\\n\"\n", 390 | " \"3. 
输出的代码应该以根目录为输入,生成的 csv 应保存在根目录下,csv 的列包括 sample_id(若没有明显 id,则直接用数字序号)、各模态的文件地址(如 flair_path, t1_path 等,若不存在则为空,若没有明显的多模态特征那么记为image_path)、以及掩码地址(若不存在则为空)。\"\n", 391 | " )\n", 392 | " }\n", 393 | " ],\n", 394 | " \"stream\": False\n", 395 | " }\n", 396 | "\n", 397 | " # 发送请求\n", 398 | " headers = {\n", 399 | " \"Authorization\": f\"Bearer {API_KEY}\",\n", 400 | " \"Content-Type\": \"application/json\"\n", 401 | " }\n", 402 | " response = requests.post(DEEPSEEK_API_URL, headers=headers, json=data)\n", 403 | "\n", 404 | " # 检查响应状态码\n", 405 | " if response.status_code == 200:\n", 406 | " result = response.json()\n", 407 | " try:\n", 408 | " # 打印模型输出的原始内容\n", 409 | " print(\"模型输出的原始内容:\")\n", 410 | " model_output = result[\"choices\"][0][\"message\"][\"content\"]\n", 411 | " print(model_output)\n", 412 | "\n", 413 | " # 保存 LLM 输出到 result.json\n", 414 | " with open(\"result.json\", \"w\") as f:\n", 415 | " f.write(model_output)\n", 416 | " print(\"LLM 的响应内容已保存到 result.json 文件中。\")\n", 417 | "\n", 418 | " # 尝试从 LLM 的输出中提取生成的代码\n", 419 | " code_match = re.search(r\"```python(.*?)```\", model_output, re.DOTALL)\n", 420 | " if code_match:\n", 421 | " extracted_code = code_match.group(1).strip()\n", 422 | " with open(\"generate_metadata.py\", \"w\") as f:\n", 423 | " f.write(extracted_code)\n", 424 | " print(\"生成的 Python 代码已保存到 generate_metadata.py 文件中。\")\n", 425 | " else:\n", 426 | " print(\"未检测到有效的 Python 代码块,请手动检查 LLM 输出。\")\n", 427 | " except Exception as e:\n", 428 | " print(f\"解析响应内容时发生错误:{e}\")\n", 429 | " else:\n", 430 | " print(f\"请求失败,状态码:{response.status_code}\")\n", 431 | " print(response.text)\n", 432 | "\n", 433 | "\n", 434 | "generate_metadata(root_directory, your_api_key=None)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 25, 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "name": "stdout", 444 | "output_type": "stream", 445 | "text": [ 446 | "正在执行 generate_metadata.py...\n", 447 | "generate_metadata.py 执行成功!\n", 448 | "metadata.csv 文件已生成,路径为:/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/metadata.csv\n", 449 | "metadata.csv 的前 5 行内容:\n", 450 | "['sample_id', 'flair_path', 't1_path', 't1ce_path', 't2_path', 'seg_path']\n", 451 | "['0', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii']\n", 452 | "['1', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii', 
'/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii']\n", 453 | "['2', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii']\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "import os\n", 459 | "import csv\n", 460 | "import subprocess\n", 461 | "\n", 462 | "def execute_metadata_script(root_directory):\n", 463 | " metadata_file = os.path.join(root_directory, \"metadata.csv\")\n", 464 | " script_file = \"generate_metadata.py\"\n", 465 | "\n", 466 | " # 检查 generate_metadata.py 是否存在\n", 467 | " if not os.path.exists(script_file):\n", 468 | " print(f\"脚本 {script_file} 不存在,请确保文件已正确生成。\")\n", 469 | " else:\n", 470 | " # 执行 generate_metadata.py 脚本\n", 471 | " print(f\"正在执行 {script_file}...\")\n", 472 | " result = subprocess.run([\"python\", script_file], capture_output=True, text=True)\n", 473 | "\n", 474 | " # 检查执行结果\n", 475 | " if result.returncode == 0:\n", 476 | " print(f\"{script_file} 执行成功!\")\n", 477 | " else:\n", 478 | " print(f\"{script_file} 执行失败!\")\n", 479 | " print(f\"错误输出:\\n{result.stderr}\")\n", 480 | "\n", 481 | " # 检查 metadata.csv 是否存在\n", 482 | " if os.path.exists(metadata_file):\n", 483 | " print(f\"metadata.csv 文件已生成,路径为:{metadata_file}\")\n", 484 | "\n", 485 | " # 打印 metadata.csv 的前 5 行\n", 486 | " try:\n", 487 | " with open(metadata_file, \"r\") as f:\n", 488 | " reader = csv.reader(f)\n", 489 | " print(\"metadata.csv 的前 5 行内容:\")\n", 490 | " for i, row in enumerate(reader):\n", 491 | " print(row)\n", 492 | " if i == 4: # 打印前 5 行\n", 493 | " break\n", 494 | " except Exception as e:\n", 495 | " print(f\"读取 metadata.csv 时发生错误:{e}\")\n", 496 | " else:\n", 497 | " print(\"metadata.csv 文件未生成,请检查脚本逻辑和根目录路径。\")\n", 498 | "\n", 499 | "execute_metadata_script(root_directory)" 500 | ] 501 | } 502 | ], 503 | "metadata": { 504 | "kernelspec": { 505 | "display_name": "Python 3", 506 | "language": "python", 507 | "name": "python3" 508 | }, 509 | "language_info": { 510 | "codemirror_mode": { 511 | "name": "ipython", 512 | "version": 3 513 | }, 514 | "file_extension": ".py", 515 | "mimetype": "text/x-python", 516 | "name": "python", 517 | "nbconvert_exporter": "python", 518 | "pygments_lexer": "ipython3", 519 | "version": "3.10.11" 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 2 524 | } 525 | -------------------------------------------------------------------------------- /LLM_metadata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import json 4 | import requests 5 | import csv 6 | import re 7 | import json 8 | import os 9 | import os 10 | import csv 11 | import subprocess 12 | 13 | def analyze_directory(root_directory, sample_folder_count=1, sample_file_count=5): 14 | # 用于存储最终的结果 15 | result = { 16 | "root_directory": root_directory, 17 | "a_level_summary": { 18 | "total_a_folders": 0, 19 | "example_a_folders": [], 20 | }, 21 | 
"sampled_a_folders": [] 22 | } 23 | 24 | # 1. 计算根目录下的 A 级文件夹数量并打印 25 | a_level_folders = [f for f in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, f))] 26 | result["a_level_summary"]["total_a_folders"] = len(a_level_folders) 27 | result["a_level_summary"]["example_a_folders"] = a_level_folders[:5] 28 | 29 | # 2. 随机采样 A 级文件夹 30 | sampled_a_folders = random.sample(a_level_folders, min(sample_folder_count, len(a_level_folders))) 31 | 32 | for sampled_a_folder in sampled_a_folders: 33 | sampled_a_path = os.path.join(root_directory, sampled_a_folder) 34 | sampled_a_info = { 35 | "a_folder_name": sampled_a_folder, 36 | "directory_tree": [], 37 | "sampled_files": [] 38 | } 39 | 40 | # 3. 穷尽 A 级文件夹下的目录树 41 | file_list = [] 42 | for root, dirs, files in os.walk(sampled_a_path): 43 | # 获取当前路径的相对路径和层级 44 | relative_root = os.path.relpath(root, sampled_a_path) 45 | folder_level = len(relative_root.split(os.sep)) if relative_root != "." else 0 46 | 47 | # 保存目录树信息 48 | sampled_a_info["directory_tree"].append({ 49 | "level": folder_level, 50 | "folder_name": os.path.basename(root), 51 | "sub_folders": dirs, 52 | "file_count": len(files) 53 | }) 54 | 55 | # 收集文件地址 56 | file_list.extend([os.path.join(root, f) for f in files]) 57 | 58 | # 4. 随机采样末端文件 59 | sampled_files = random.sample(file_list, min(sample_file_count, len(file_list))) 60 | sampled_a_info["sampled_files"] = sampled_files 61 | 62 | # 添加到结果中 63 | result["sampled_a_folders"].append(sampled_a_info) 64 | 65 | # 将结果格式化为 JSON 并打印 66 | formatted_result = json.dumps(result, indent=4, ensure_ascii=False) 67 | print(formatted_result) 68 | 69 | # 可选择将结果保存到文件 70 | output_file = os.path.join(root_directory, "directory_analysis.json") 71 | with open(output_file, "w", encoding="utf-8") as f: 72 | f.write(formatted_result) 73 | print(f"\n结果已保存到: {output_file}") 74 | 75 | 76 | 77 | def generate_metadata(root_directory, your_api_key=None): 78 | # DeepSeek API 的 URL 和 API 密钥 79 | DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions" 80 | llm_api = None 81 | if your_api_key is None: 82 | if os.path.exists(r"D:\REPO\PreProcPipe\config.py"): 83 | from config import API_KEY 84 | llm_api = API_KEY 85 | print("API 密钥已从 config.py 中读取。") 86 | else: 87 | llm_api = your_api_key 88 | 89 | # 读取 JSON 文件 90 | with open(os.path.join(root_directory, "directory_analysis.json"), "r") as f: 91 | json_input = f.read() 92 | print("正在指使LLM生成代码...") 93 | # 构建请求数据 94 | data = { 95 | "model": "deepseek-chat", 96 | "messages": [ 97 | { 98 | "role": "system", 99 | "content": ( 100 | "你是一名熟练的数据科学家,善于解析复杂的文件目录并生成元数据表格。" 101 | "你的任务是帮助用户分析医学影像数据集,并根据采样的文件结构生成metadata.csv。" 102 | "你只需要输出带有恰当注释的python代码即可,多余的信息不输出。" 103 | ) 104 | }, 105 | { 106 | "role": "user", 107 | "content": ( 108 | f"我正在浏览一个医学影像数据集,它的根目录为:{json.loads(json_input)['root_directory']}。\n" 109 | "这个数据集包含若干影像文件(可能包括多模态文件、单模态文件和掩码文件)。\n" 110 | "我采样了一些子文件夹(记为 A 级文件夹)以及其中的 B/C 级文件夹,目录树和采样文件的信息如下:\n" 111 | f"{json_input}\n" 112 | "我需要你:\n" 113 | "1. 分析文件命名的规律,判断是否存在多模态文件或掩码文件,分析出他们之间配对的关系,比如命名可能有相同的地方,或者用后缀区分了图像与掩码。\n" 114 | "2. 根据这些规律生成构建 metadata.csv 的 Python 代码。\n" 115 | "3. 
输出的代码应该以根目录为输入,生成的 csv 应保存在根目录下,csv 的列包括 sample_id(若没有明显 id,则直接用数字序号)、各模态的文件地址(如 flair_path, t1_path 等,若没有明显的多模态特征那么记为image_path)、以及掩码地址(若不存在则为空)。\n" 116 | ) 117 | } 118 | ], 119 | "stream": False 120 | } 121 | 122 | # 发送请求 123 | headers = { 124 | "Authorization": f"Bearer {llm_api}", 125 | "Content-Type": "application/json" 126 | } 127 | response = requests.post(DEEPSEEK_API_URL, headers=headers, json=data) 128 | 129 | # 检查响应状态码 130 | if response.status_code == 200: 131 | result = response.json() 132 | try: 133 | # 打印模型输出的原始内容 134 | print("模型输出的原始内容:") 135 | model_output = result["choices"][0]["message"]["content"] 136 | print(model_output) 137 | 138 | # 你可以保存 LLM 输出到 result.json 139 | # with open("result.json", "w") as f: 140 | # f.write(model_output) 141 | # print("LLM 的响应内容已保存到 result.json 文件中。") 142 | 143 | # 尝试从 LLM 的输出中提取生成的代码 144 | code_match = re.search(r"```python(.*?)```", model_output, re.DOTALL) 145 | if code_match: 146 | extracted_code = code_match.group(1).strip() 147 | with open(os.path.join(root_directory, "generate_metadata.py"), "w") as f: 148 | f.write(extracted_code) 149 | print("生成的 Python 代码已保存到 generate_metadata.py 文件中。") 150 | else: 151 | print("未检测到有效的 Python 代码块,请手动检查 LLM 输出。") 152 | except Exception as e: 153 | print(f"解析响应内容时发生错误:{e}") 154 | else: 155 | print(f"请求失败,状态码:{response.status_code}") 156 | print(response.text) 157 | 158 | 159 | 160 | def execute_metadata_script(root_directory): 161 | metadata_file = os.path.join(root_directory, "metadata.csv") 162 | script_file = os.path.join(root_directory, "generate_metadata.py") 163 | 164 | # 检查 generate_metadata.py 是否存在 165 | if not os.path.exists(script_file): 166 | print(f"脚本 {script_file} 不存在,请确保文件已正确生成。") 167 | else: 168 | # 执行 generate_metadata.py 脚本 169 | print(f"正在执行 {script_file}...") 170 | result = subprocess.run(["python", script_file], capture_output=True, text=True) 171 | 172 | # 检查执行结果 173 | if result.returncode == 0: 174 | print(f"{script_file} 执行成功!") 175 | else: 176 | print(f"{script_file} 执行失败!") 177 | print(f"错误输出:\n{result.stderr}") 178 | 179 | # 检查 metadata.csv 是否存在 180 | if os.path.exists(metadata_file): 181 | print(f"metadata.csv 文件已生成,路径为:{metadata_file}") 182 | 183 | # 打印 metadata.csv 的前 5 行 184 | try: 185 | with open(metadata_file, "r") as f: 186 | reader = csv.reader(f) 187 | print("metadata.csv 的前 5 行内容:") 188 | for i, row in enumerate(reader): 189 | print(row) 190 | if i == 4: # 打印前 5 行 191 | break 192 | except Exception as e: 193 | print(f"读取 metadata.csv 时发生错误:{e}") 194 | else: 195 | print("metadata.csv 文件未生成,请检查脚本逻辑和根目录路径。") 196 | 197 | def metadata_sanity_check(root_directory): 198 | metadata_file = os.path.join(root_directory, "metadata.csv") 199 | 200 | try: 201 | with open(metadata_file, mode='r') as file: 202 | reader = csv.DictReader(file) 203 | for row in reader: 204 | for key, value in row.items(): 205 | if '_path' in key: 206 | if not value: 207 | print(f"空值: {key} 在 sample_id {row['sample_id']} 中为空") 208 | else: 209 | full_path = os.path.join(root_directory, value) 210 | if not os.path.exists(full_path): 211 | print(f"路径无效: {key} 在 sample_id {row['sample_id']} 中指向 {full_path}") 212 | else: 213 | # print(f"路径有效: {key} 在 sample_id {row['sample_id']} 中指向 {full_path}") 214 | pass 215 | except Exception as e: 216 | print(f"读取 metadata.csv 时发生错误: {e}") 217 | print("看起来有错误,你可以手动查看 metadata.csv 是否正确。") 218 | 219 | 220 | 221 | 222 | if __name__ == "__main__": 223 | # 1. 
分析文件目录结构 224 | root_directory = r"D:\REPO\PreProcPipe\BraTS2021_Training_Data" # 填写数据集根目录,一定要是绝对路径 225 | 226 | analyze_directory(root_directory=root_directory, sample_folder_count=5, sample_file_count=10) # 可以通过增加 sample_folder_count 和 sample_file_count 来提高成功率 227 | 228 | # 2. 生成 metadata.csv 的 Python 代码 229 | generate_metadata(root_directory=root_directory, your_api_key=None) # 填写你的API_KEY 230 | # 推荐你去DeepSeek官网注册一个账号,然后在个人中心获取API_KEY,他们会给你一辈子用不完的额度,输入格式为API = "sadasdasdwqeqwe2" 231 | 232 | # 3. 执行生成的代码并检查 metadata.csv 233 | execute_metadata_script(root_directory=root_directory) 234 | 235 | # 4. 检查 metadata.csv 的正确性 236 | metadata_sanity_check(root_directory=root_directory) 237 | 238 | 239 | -------------------------------------------------------------------------------- /__pycache__/config.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/__pycache__/config.cpython-310.pyc -------------------------------------------------------------------------------- /__pycache__/pipeline.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/__pycache__/pipeline.cpython-310.pyc -------------------------------------------------------------------------------- /after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/after.png -------------------------------------------------------------------------------- /before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/before.png -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | import nibabel as nib 2 | import numpy as np 3 | from scipy.ndimage import zoom 4 | from multiprocessing import Pool 5 | import os 6 | import csv 7 | 8 | class SimplePreprocessor: 9 | def __init__(self, target_spacing=[1.0, 1.0, 1.0], normalization_scheme="z-score", target_size=None): 10 | """ 11 | 初始化预处理器。 12 | 13 | 参数: 14 | - target_spacing: 目标体素大小(spacing),默认为 [1.0, 1.0, 1.0]。 15 | - normalization_scheme: 归一化方案,支持 "z-score" 或 "min-max"。 16 | - target_size: 目标尺寸,例如 [256, 256],默认为 None(不调整尺寸)。 17 | """ 18 | self.target_spacing = target_spacing 19 | self.normalization_scheme = normalization_scheme 20 | self.target_size = target_size # 目标大小,例如 [256, 256] 21 | 22 | def read_images(self, image_paths): 23 | """ 24 | 读取多个模态的图像数据 (.nii) 文件,并返回一个列表,每个元素为单独的 NumPy 数组。 25 | """ 26 | print("Step 1: Loading multi-modal image data...") 27 | img_list = [] 28 | for path in image_paths: 29 | img = nib.load(path) 30 | img_data = img.get_fdata() 31 | img_list.append(img_data) 32 | # 假设所有模态具有相同的 spacing 33 | img_spacing = nib.load(image_paths[0]).header.get_zooms() 34 | print() 35 | return img_list, img_spacing 36 | 37 | def read_seg(self, seg_path): 38 | """ 39 | 读取分割数据 (.nii) 文件并转换为 NumPy 数组。 40 | """ 41 | print("Step 1: Loading segmentation data...") 42 | seg = nib.load(seg_path) 43 | seg_data = seg.get_fdata() 44 | print() 45 | return seg_data 46 | 47 | def run_case(self, image_paths, seg_path=None): 48 | """ 49 | 
能够处理多模态图像的预处理流程,但不将它们合并到同一个数组中。 50 | """ 51 | # Step 1: 加载多模态图像数据 52 | data_list, spacing = self.read_images(image_paths) 53 | 54 | if seg_path: 55 | seg = self.read_seg(seg_path) 56 | else: 57 | seg = None 58 | 59 | # 打印原始数据形状 60 | for i, data in enumerate(data_list): 61 | print(f"Original image shape (modality {i}): {data.shape}") 62 | if seg is not None: 63 | print(f"Original segmentation shape: {seg.shape}") 64 | print() 65 | 66 | # Step 2: 根据所有模态数据的非零区域计算裁剪范围 67 | print("Step 2: Cropping to non-zero regions...") 68 | # 将所有模态的非零坐标合并计算公共裁剪区域 69 | data_list, seg, properties = self.crop(data_list, seg) 70 | properties['original_spacing'] = spacing 71 | 72 | # Step 3: 对每个模态独立归一化 73 | print("Step 3: Normalizing image data...") 74 | for i in range(len(data_list)): 75 | data_list[i] = self._normalize_single_modality(data_list[i]) 76 | print() 77 | 78 | # Step 4: 重采样到目标分辨率 79 | print("Step 4: Resampling data to target spacing...") 80 | # 使用第一模态计算 new_shape(假设各模态 spacing 一致) 81 | new_shape = self.compute_new_shape(data_list[0].shape, spacing, self.target_spacing) 82 | data_list = [self.resample_data(d, new_shape, order=3) for d in data_list] 83 | if seg is not None: 84 | seg = self.resample_data(seg, new_shape, order=0) 85 | print() 86 | 87 | # Step 5: 调整到目标尺寸(如果指定) 88 | if self.target_size is not None: 89 | print("Step 5: Resizing data to target size...") 90 | data_list = [self.resize_to_target_size(d, self.target_size, order=3) for d in data_list] 91 | if seg is not None: 92 | seg = self.resize_to_target_size(seg, self.target_size, order=0) 93 | print() 94 | 95 | print("Preprocessing completed.\n") 96 | return data_list, seg, spacing, properties 97 | 98 | 99 | def crop(self, data_list, seg): 100 | """ 101 | 裁剪图像和分割数据在 Z 轴方向的全零区域,返回裁剪后的数据列表和分割数据,以及裁剪属性。 102 | 103 | 参数: 104 | - data_list: 多模态图像数据列表,每个元素为 NumPy 数组。 105 | - seg: 分割数据(NumPy 数组),可以为 None。 106 | 107 | 返回: 108 | - cropped_data_list: 裁剪后的多模态图像数据列表。 109 | - cropped_seg: 裁剪后的分割数据(如果 seg 为 None,则返回 None)。 110 | - properties: 裁剪过程的属性信息,包括裁剪前后的形状和裁剪边界。 111 | """ 112 | print("Step 2: Cropping to non-zero regions along Z-axis...") 113 | 114 | # 获取所有模态在 Z 轴方向的非零范围 115 | nonzero_slices = [] 116 | for data in data_list: 117 | # 沿 Z 轴求和,如果某切片全为零,则和为零 118 | z_nonzero = np.any(data != 0, axis=(0, 1)) 119 | nonzero_slices.append(np.argwhere(z_nonzero).flatten()) 120 | 121 | if len(nonzero_slices) == 0: 122 | # 全部为零,不裁剪 123 | properties = { 124 | 'shape_before_cropping': [d.shape for d in data_list], 125 | 'shape_after_cropping': [d.shape for d in data_list], 126 | 'z_bbox': None 127 | } 128 | return data_list, seg, properties 129 | 130 | # 计算公共 Z 轴范围 131 | z_min = min(s.min() for s in nonzero_slices) 132 | z_max = max(s.max() for s in nonzero_slices) + 1 # 加1表示包含该索引 133 | 134 | print(f"Z-axis cropping range: {z_min} to {z_max}") 135 | 136 | # 裁剪所有模态的 Z 轴范围 137 | cropped_data_list = [d[:, :, z_min:z_max] for d in data_list] 138 | 139 | # 裁剪分割数据的 Z 轴范围 140 | cropped_seg = None 141 | if seg is not None: 142 | cropped_seg = seg[:, :, z_min:z_max] 143 | 144 | # 记录裁剪属性 145 | properties = { 146 | 'shape_before_cropping': [d.shape for d in data_list], 147 | 'shape_after_cropping': [d.shape for d in cropped_data_list], 148 | 'z_bbox': (z_min, z_max) 149 | } 150 | 151 | print(f"Shapes before cropping: {[d.shape for d in data_list]}") 152 | print(f"Shapes after cropping: {[d.shape for d in cropped_data_list]}") 153 | if seg is not None: 154 | print(f"Segmentation shape after cropping: {cropped_seg.shape}") 155 | 156 | return cropped_data_list, cropped_seg, 
properties 157 | 158 | 159 | # def _normalize(self, data, seg=None): 160 | # """ 161 | # 归一化图像数据。 162 | # """ 163 | # if self.normalization_scheme == "z-score": 164 | # mean_val = np.mean(data[data > 0]) 165 | # std_val = np.std(data[data > 0]) 166 | # data = (data - mean_val) / (std_val + 1e-8) 167 | # elif self.normalization_scheme == "min-max": 168 | # min_val = np.min(data[data > 0]) 169 | # max_val = np.max(data[data > 0]) 170 | # data = (data - min_val) / (max_val - min_val + 1e-8) 171 | # else: 172 | # raise ValueError(f"Unknown normalization scheme: {self.normalization_scheme}") 173 | # return data 174 | 175 | # 新增一个专门处理单个模态归一化的方法 176 | def _normalize_single_modality(self, data): 177 | """ 178 | 对单个模态数据进行归一化。 179 | """ 180 | mask = data > 0 181 | if self.normalization_scheme == "z-score": 182 | mean_val = np.mean(data[mask]) if np.any(mask) else 0.0 183 | std_val = np.std(data[mask]) if np.any(mask) else 1.0 184 | data = (data - mean_val) / (std_val + 1e-8) 185 | elif self.normalization_scheme == "min-max": 186 | min_val = np.min(data[mask]) if np.any(mask) else 0.0 187 | max_val = np.max(data[mask]) if np.any(mask) else 1.0 188 | data = (data - min_val) / (max_val - min_val + 1e-8) 189 | else: 190 | raise ValueError(f"Unknown normalization scheme: {self.normalization_scheme}") 191 | return data 192 | 193 | def compute_new_shape(self, old_shape, old_spacing, new_spacing): 194 | """ 195 | 根据原始分辨率和目标分辨率计算新的形状。 196 | """ 197 | resize_factor = [old_spacing[i] / new_spacing[i] for i in range(len(old_spacing))] 198 | print(f"Computed resize factors: {resize_factor}") 199 | new_shape = [int(np.round(old_shape[i] * resize_factor[i])) for i in range(len(old_shape))] 200 | print(f"Computed new shape: {new_shape}") 201 | return new_shape 202 | 203 | def resample_data(self, data, new_shape, order=3): 204 | """ 205 | 根据新的形状进行重采样。 206 | """ 207 | print("Resampling data...") 208 | zoom_factors = [new_shape[i] / data.shape[i] for i in range(len(data.shape))] 209 | resampled_data = zoom(data, zoom_factors, order=order) 210 | print(f"Data resampled to shape: {resampled_data.shape}") 211 | return resampled_data 212 | 213 | def resize_to_target_size(self, data, target_size, order=3): 214 | """ 215 | 将图像或分割数据调整到目标尺寸。 216 | """ 217 | print("Resizing data to target size...") 218 | current_shape = data.shape 219 | zoom_factors = [target_size[0] / current_shape[0], # 调整第一个维度(Y 轴,高度) 220 | target_size[1] / current_shape[1], # 调整第二个维度(X 轴,宽度) 221 | 1.0] # Z 轴(深度)保持不变 222 | resized_data = zoom(data, zoom_factors, order=order) 223 | print(f"Data resized to shape: {resized_data.shape}") 224 | return resized_data 225 | 226 | 227 | def process_case(args): 228 | """ 229 | 多进程调用的函数,用于处理单个病例。 230 | 231 | 参数: 232 | - args: (sample_id, image_paths, seg_path, preprocessor, output_root) 233 | """ 234 | sample_id, image_paths, seg_path, preprocessor, output_root = args 235 | # 调用预处理器的 run_case 方法处理多模态图像 236 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path) 237 | 238 | # 创建样本目录(在output_root下) 239 | # 检查sample_id的类型 240 | if isinstance(sample_id, int): 241 | sample_id = str(sample_id) 242 | sample_dir = os.path.join(output_root, sample_id) 243 | os.makedirs(sample_dir, exist_ok=True) 244 | 245 | # 推断各模态名称(使用文件名去除扩展名作为模态名称) 246 | modality_names = [os.path.splitext(os.path.basename(p))[0] for p in image_paths] 247 | 248 | # 保存各模态数据 249 | modality_paths = [] 250 | for modality_name, modality_data in zip(modality_names, data_list): 251 | save_path = os.path.join(sample_dir, 
f"{modality_name}.npz") 252 | np.savez_compressed(save_path, data=modality_data) 253 | modality_paths.append(save_path) 254 | 255 | seg_path_out = None 256 | # 保存分割数据(如果有分割) 257 | if seg is not None: 258 | seg_save_path = os.path.join(sample_dir, "seg.npz") 259 | np.savez_compressed(seg_save_path, data=seg) 260 | seg_path_out = seg_save_path 261 | 262 | # 保存 spacing 和 properties 为 meta.npz 263 | meta_save_path = os.path.join(sample_dir, "meta.npz") 264 | np.savez_compressed(meta_save_path, spacing=spacing, properties=properties) 265 | 266 | # 返回处理结果及保存的文件路径信息,用于后续生成metadata.csv 267 | return { 268 | "sample_id": sample_id, 269 | "modality_paths": modality_paths, 270 | "seg_path": seg_path_out, 271 | "meta_path": meta_save_path 272 | } 273 | 274 | 275 | def run_in_parallel(preprocessor, cases, output_root, num_workers=4): 276 | """ 277 | 使用多进程并行处理多个病例,并在output_root下存放处理结果为npz文件, 278 | 同时在output_root下生成metadata.csv记录每个sample的npz地址。 279 | 280 | 参数: 281 | - preprocessor: SimplePreprocessor 实例。 282 | - cases: 包含多个病例信息的列表,每个病例是一个字典,格式: 283 | { 284 | "sample_id": "某病例ID字符串", 285 | "image_paths": [模态1路径, 模态2路径, ...], 286 | "seg_path": 分割路径或 None 287 | } 288 | - output_root: 输出结果保存的根目录 289 | - num_workers: 并行进程数,默认为 4。 290 | 291 | 返回: 292 | - results: 包含每个病例保存文件路径信息的列表 293 | """ 294 | os.makedirs(output_root, exist_ok=True) 295 | 296 | args_list = [ 297 | (case["sample_id"], case["image_paths"], case["seg_path"], preprocessor, output_root) for case in cases 298 | ] 299 | 300 | # 使用多进程池并行处理 301 | with Pool(processes=num_workers) as pool: 302 | results = pool.map(process_case, args_list) 303 | 304 | # 生成 metadata.csv 305 | # 文件内容格式示例: 306 | # sample_id,modality_paths,seg_path,meta_path 307 | # BraTS2021_00000,"['output_root/BraTS2021_00000/t1.npz','output_root/BraTS2021_00000/t2.npz']","output_root/BraTS2021_00000/seg.npz","output_root/BraTS2021_00000/meta.npz" 308 | 309 | csv_path = os.path.join(output_root, "metadata.csv") 310 | with open(csv_path, mode='w', newline='', encoding='utf-8') as csvfile: 311 | writer = csv.writer(csvfile) 312 | writer.writerow(["sample_id", "modality_paths", "seg_path", "meta_path"]) 313 | for res in results: 314 | # 将绝对路径转换为相对于output_root的相对路径,便于移植 315 | # 如果需要保留绝对路径,可注释掉此步骤 316 | rel_modality_paths = [os.path.relpath(p, output_root) for p in res["modality_paths"]] 317 | rel_seg_path = os.path.relpath(res["seg_path"], output_root) if res["seg_path"] is not None else None 318 | rel_meta_path = os.path.relpath(res["meta_path"], output_root) 319 | writer.writerow([ 320 | res["sample_id"], 321 | str(rel_modality_paths), 322 | rel_seg_path, 323 | rel_meta_path 324 | ]) 325 | 326 | return results 327 | 328 | 329 | -------------------------------------------------------------------------------- /pipeline_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pipeline import SimplePreprocessor as ppp 3 | from pipeline import run_in_parallel 4 | 5 | 6 | # 定义读取 metadata.csv 并生成 cases 列表的函数 7 | def load_cases_from_metadata(csv_path): 8 | """ 9 | 从 metadata.csv 加载病例信息,并生成 (image_paths, seg_path) 的列表。 10 | 11 | 参数: 12 | - csv_path: metadata.csv 文件路径。 13 | 14 | 返回: 15 | - cases: 包含病例信息的列表,每个元素是一个字典,格式为: 16 | { 17 | "sample_id": 样本ID, 18 | "image_paths": [模态1路径, 模态2路径, ...], 19 | "seg_path": 分割路径或 None 20 | } 21 | """ 22 | df = pd.read_csv(csv_path) 23 | cases = [] 24 | for _, row in df.iterrows(): 25 | # 提取模态路径 26 | image_paths = [row['t1_path'], row['t1ce_path'], row['t2_path'], row['flair_path']] 27 | # 过滤掉空值 28 
| image_paths = [path for path in image_paths if pd.notnull(path)] 29 | # 提取分割路径 30 | seg_path = row['seg_path'] if pd.notnull(row['seg_path']) else None 31 | # 添加到 cases 32 | cases.append({ 33 | "sample_id": row['sample_id'], 34 | "image_paths": image_paths, 35 | "seg_path": seg_path 36 | }) 37 | return cases 38 | 39 | 40 | if __name__ == "__main__": 41 | example_preprocessor = ppp( 42 | target_spacing = [0.5, 0.5, 0.5], 43 | target_size = [256, 256], 44 | normalization_scheme = "min-max", 45 | ) 46 | 47 | cases = load_cases_from_metadata(r"D:\REPO\PreProcPipe\BraTS2021_Training_Data\metadata.csv") 48 | results = run_in_parallel(example_preprocessor, cases, num_workers=8, output_root="preprocessed_data") 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # PreProcPipe: 多模态医学影像预处理框架 2 | 3 | 一个用于CT/MRI等多模态医学影像预处理的高效框架,支持自动化的数据预处理流程和元数据生成。 4 | 5 | ## Choose Language / 选择语言 6 | 7 | - [English](readme_en.md) 8 | - [简体中文](readme.md) 9 | 10 | ![预处理前](before.png) 11 | ![预处理后](after.png) 12 | 13 | ## 目录 14 | 15 | - [主要功能](#主要功能) 16 | - [技术架构](#技术架构) 17 | - [代码实现](#代码实现) 18 | - [使用指南](#使用指南) 19 | - [配置与扩展](#配置与扩展) 20 | - [示例](#示例) 21 | 22 | ## 主要功能 23 | 24 | ### 1. 预处理管道 (pipeline.py) 25 | - 多模态医学影像数据处理 26 | - Z轴方向智能裁剪 27 | - 可配置的数据归一化 28 | - 灵活的图像重采样 29 | - 并行处理支持 30 | - 自动保存处理结果 31 | 32 | ### 2. 元数据自动生成 (LLM_metadata.py) 33 | - 目录结构智能分析 34 | - LLM驱动的元数据规则生成 35 | - 自动验证和错误检测 36 | - DeepSeek API集成 37 | 38 | ## 技术架构 39 | 40 | ### 预处理管道架构 41 | 42 | 预处理管道采用模块化设计,按照以下步骤顺序处理数据: 43 | 44 | 1. **数据输入** → **SimplePreprocessor** 45 | - 接收多模态医学影像数据 46 | - 支持.nii格式文件 47 | 48 | 2. **数据加载** 49 | - 读取多模态图像数据 50 | - 读取分割数据(如果有) 51 | 52 | 3. **Z轴裁剪** 53 | - 智能识别有效区域 54 | - 去除冗余空白区域 55 | 56 | 4. **归一化** 57 | - 支持z-score标准化 58 | - 支持min-max归一化 59 | 60 | 5. **重采样** 61 | - 调整体素间距 62 | - 保持图像质量 63 | 64 | 6. **尺寸调整** 65 | - 统一输出尺寸 66 | - 可选的尺寸配置 67 | 68 | 7. **输出处理后的数据** 69 | - 保存为标准格式 70 | - 生成处理元数据 71 | 72 | ### 元数据生成系统架构 73 | 74 | 元数据生成系统采用LLM驱动的智能分析流程: 75 | 76 | 1. **数据集根目录** → **目录结构分析** 77 | - 扫描文件系统 78 | - 识别文件组织模式 79 | 80 | 2. **随机采样** 81 | - 选取代表性样本 82 | - 分析文件命名规律 83 | 84 | 3. **DeepSeek API调用** 85 | - 发送结构化请求 86 | - 接收AI分析结果 87 | 88 | 4. **生成处理代码** 89 | - 自动生成Python脚本 90 | - 包含数据处理逻辑 91 | 92 | 5. **执行与验证** 93 | - 运行生成的代码 94 | - 检查处理结果 95 | 96 | 6. **生成metadata.csv** 97 | - 记录数据集信息 98 | - 建立索引关系 99 | 100 | ## 代码实现 101 | 102 | ### 核心类:SimplePreprocessor 103 | 104 | ```python 105 | class SimplePreprocessor: 106 | def __init__(self, target_spacing=[1.0, 1.0, 1.0], 107 | normalization_scheme="z-score", 108 | target_size=None): 109 | """ 110 | 初始化预处理器 111 | 112 | 参数: 113 | - target_spacing: 目标体素大小,默认[1.0, 1.0, 1.0] 114 | - normalization_scheme: 归一化方案("z-score"/"min-max") 115 | - target_size: 目标尺寸,如[256, 256] 116 | """ 117 | ``` 118 | 119 | #### 主要方法: 120 | 121 | 1. **数据加载** 122 | ```python 123 | def read_images(self, image_paths): 124 | """加载多模态图像数据""" 125 | 126 | def read_seg(self, seg_path): 127 | """加载分割数据""" 128 | ``` 129 | 130 | 2. 
**数据预处理** 131 | ```python 132 | def crop(self, data_list, seg): 133 | """Z轴智能裁剪""" 134 | 135 | def _normalize_single_modality(self, data): 136 | """单模态数据归一化""" 137 | 138 | def compute_new_shape(self, old_shape, old_spacing, new_spacing): 139 | """计算重采样目标形状""" 140 | 141 | def resample_data(self, data, new_shape, order=3): 142 | """数据重采样""" 143 | 144 | def resize_to_target_size(self, data, target_size, order=3): 145 | """调整数据尺寸""" 146 | ``` 147 | 148 | 3. **处理流程** 149 | ```python 150 | def run_case(self, image_paths, seg_path=None): 151 | """执行完整的预处理流程""" 152 | ``` 153 | 154 | ### 元数据生成系统 155 | 156 | ```python 157 | def analyze_directory(root_directory, sample_folder_count=5, sample_file_count=10): 158 | """分析目录结构并采样""" 159 | 160 | def generate_metadata(root_directory, your_api_key=None): 161 | """使用LLM生成元数据处理代码""" 162 | 163 | def execute_metadata_script(root_directory): 164 | """执行并验证元数据生成""" 165 | ``` 166 | 167 | ## 使用指南 168 | 169 | ### 1. 基础预处理流程 170 | 171 | ```python 172 | from pipeline import SimplePreprocessor 173 | 174 | # 初始化预处理器 175 | preprocessor = SimplePreprocessor( 176 | target_spacing=[1.0, 1.0, 1.0], 177 | normalization_scheme="z-score", 178 | target_size=[256, 256] 179 | ) 180 | 181 | # 准备数据路径 182 | image_paths = [ 183 | "path/to/flair.nii", 184 | "path/to/t1.nii", 185 | "path/to/t1ce.nii", 186 | "path/to/t2.nii" 187 | ] 188 | seg_path = "path/to/seg.nii" 189 | 190 | # 执行预处理 191 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path) 192 | ``` 193 | 194 | ### 2. 批量处理 195 | 196 | ```python 197 | from pipeline import run_in_parallel 198 | 199 | # 准备多个样本 200 | cases = [ 201 | { 202 | "sample_id": "case_001", 203 | "image_paths": ["path/to/case1/flair.nii", ...], 204 | "seg_path": "path/to/case1/seg.nii" 205 | }, 206 | # 更多样本... 207 | ] 208 | 209 | # 并行处理 210 | results = run_in_parallel(preprocessor, cases, "output_dir", num_workers=4) 211 | ``` 212 | 213 | ### 3. 元数据生成 214 | 215 | ```python 216 | from LLM_metadata import analyze_directory, generate_metadata, execute_metadata_script 217 | 218 | # 分析目录结构 219 | analyze_directory(root_directory="dataset_path", 220 | sample_folder_count=5, 221 | sample_file_count=10) 222 | 223 | # 生成元数据代码 224 | generate_metadata(root_directory="dataset_path", 225 | your_api_key="your-deepseek-api-key") 226 | 227 | # 执行生成的代码 228 | execute_metadata_script(root_directory="dataset_path") 229 | ``` 230 | 231 | ## 配置与扩展 232 | 233 | ### 1. 预处理配置 234 | 235 | 可以通过修改SimplePreprocessor的初始化参数来自定义预处理行为: 236 | 237 | - `target_spacing`: 调整目标体素大小 238 | - `normalization_scheme`: 选择归一化方案 239 | - `target_size`: 设置输出尺寸 240 | 241 | ### 2. 扩展功能 242 | 243 | #### 添加新的归一化方法: 244 | 245 | ```python 246 | def _normalize_custom(self, data): 247 | """ 248 | 自定义归一化方法 249 | """ 250 | # 实现你的归一化逻辑 251 | return normalized_data 252 | 253 | # 在SimplePreprocessor中添加 254 | if self.normalization_scheme == "custom": 255 | data = self._normalize_custom(data) 256 | ``` 257 | 258 | #### 添加新的预处理步骤: 259 | 260 | ```python 261 | def new_preprocessing_step(self, data): 262 | """ 263 | 新的预处理步骤 264 | """ 265 | # 实现新的预处理逻辑 266 | return processed_data 267 | 268 | # 在run_case方法中添加 269 | data_list = [self.new_preprocessing_step(d) for d in data_list] 270 | ``` 271 | 272 | ## 示例 273 | 274 | ### 1. 
单模态CT图像预处理 275 | 276 | ```python 277 | # 初始化预处理器 278 | preprocessor = SimplePreprocessor( 279 | target_spacing=[1.0, 1.0, 1.0], 280 | normalization_scheme="min-max" 281 | ) 282 | 283 | # 处理单个CT图像 284 | image_paths = ["path/to/ct.nii"] 285 | data_list, _, spacing, properties = preprocessor.run_case(image_paths) 286 | ``` 287 | 288 | ### 2. 多模态MRI数据处理 289 | 290 | ```python 291 | # 初始化预处理器 292 | preprocessor = SimplePreprocessor( 293 | target_spacing=[1.0, 1.0, 1.0], 294 | normalization_scheme="z-score", 295 | target_size=[256, 256] 296 | ) 297 | 298 | # 处理多模态MRI数据 299 | image_paths = [ 300 | "path/to/flair.nii", 301 | "path/to/t1.nii", 302 | "path/to/t1ce.nii", 303 | "path/to/t2.nii" 304 | ] 305 | seg_path = "path/to/seg.nii" 306 | 307 | # 执行预处理 308 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path) 309 | ``` 310 | 311 | ### 3. 自动生成数据集元数据 312 | 313 | ```python 314 | # 配置参数 315 | root_dir = "path/to/dataset" 316 | api_key = "your-deepseek-api-key" 317 | 318 | # 执行完整的元数据生成流程 319 | analyze_directory(root_dir, sample_folder_count=5) 320 | generate_metadata(root_dir, api_key) 321 | execute_metadata_script(root_dir) 322 | ``` 323 | 324 | ## Jupyter Notebook 教程 325 | 326 | 项目提供两个详细的Jupyter Notebook教程: 327 | 328 | ### 1. tutorial.ipynb 329 | 330 | 这是项目的主要教程,包含: 331 | - 完整的预处理管道使用示例 332 | - 各个参数的详细说明 333 | - 常见使用场景的演示 334 | - 处理结果的可视化 335 | 336 | 推荐新用户首先阅读此教程,它将帮助你快速上手预处理管道的使用。 337 | 338 | ### 2. How_I_Use_LLM_to_DIY_metadata.ipynb 339 | 340 | 这是关于如何使用LLM生成元数据的详细教程,包含: 341 | - LLM元数据生成的完整工作流程 342 | - DeepSeek API的配置和使用 343 | - 目录结构分析的实际案例 344 | - 常见问题的解决方案 345 | 346 | 如果你需要使用元数据自动生成功能,建议详细阅读此教程。 347 | 348 | ## 注意事项 349 | 350 | 1. 确保输入数据格式正确(支持.nii格式) 351 | 2. 检查磁盘空间是否充足(预处理后的数据可能较大) 352 | 3. 监控内存使用(处理大型数据集时) 353 | 4. 合理设置并行处理的进程数 354 | 5. 备份原始数据 355 | -------------------------------------------------------------------------------- /readme_en.md: -------------------------------------------------------------------------------- 1 | # PreProcPipe: Multimodal Medical Image Preprocessing Framework 2 | 3 | An efficient framework for preprocessing CT/MRI and other multimodal medical images, supporting automated data preprocessing and metadata generation. 4 | 5 | ![Before Processing](before.png) 6 | ![After Processing](after.png) 7 | 8 | ## Choose Language / 选择语言 9 | 10 | - [English](readme_en.md) 11 | - [简体中文](readme.md) 12 | 13 | ## Table of Contents 14 | 15 | - [Main Features](#main-features) 16 | - [Technical Architecture](#technical-architecture) 17 | - [Code Implementation](#code-implementation) 18 | - [User Guide](#user-guide) 19 | - [Configuration & Extension](#configuration--extension) 20 | - [Examples](#examples) 21 | 22 | ## Main Features 23 | 24 | ### 1. Preprocessing Pipeline (pipeline.py) 25 | - Multimodal medical image data processing 26 | - Intelligent Z-axis cropping 27 | - Configurable data normalization 28 | - Flexible image resampling 29 | - Parallel processing support 30 | - Automatic result saving 31 | 32 | ### 2. Metadata Auto-generation (LLM_metadata.py) 33 | - Intelligent directory structure analysis 34 | - LLM-driven metadata rule generation 35 | - Automatic validation and error detection 36 | - DeepSeek API integration 37 | 38 | ## Technical Architecture 39 | 40 | ### Preprocessing Pipeline Architecture 41 | 42 | The preprocessing pipeline adopts a modular design, processing data in the following sequence: 43 | 44 | 1. **Data Input** → **SimplePreprocessor** 45 | - Receives multimodal medical image data 46 | - Supports .nii format files 47 | 48 | 2. 
**Data Loading** 49 | - Reads multimodal image data 50 | - Reads segmentation data (if available) 51 | 52 | 3. **Z-axis Cropping** 53 | - Intelligently identifies effective regions 54 | - Removes redundant blank areas 55 | 56 | 4. **Normalization** 57 | - Supports z-score standardization 58 | - Supports min-max normalization 59 | 60 | 5. **Resampling** 61 | - Adjusts voxel spacing 62 | - Maintains image quality 63 | 64 | 6. **Size Adjustment** 65 | - Unifies output dimensions 66 | - Optional size configuration 67 | 68 | 7. **Output Processed Data** 69 | - Saves in standard format 70 | - Generates processing metadata 71 | 72 | ### Metadata Generation System Architecture 73 | 74 | The metadata generation system uses an LLM-driven intelligent analysis process: 75 | 76 | 1. **Dataset Root Directory** → **Directory Structure Analysis** 77 | - Scans file system 78 | - Identifies file organization patterns 79 | 80 | 2. **Random Sampling** 81 | - Selects representative samples 82 | - Analyzes file naming patterns 83 | 84 | 3. **DeepSeek API Call** 85 | - Sends structured requests 86 | - Receives AI analysis results 87 | 88 | 4. **Generate Processing Code** 89 | - Automatically generates Python scripts 90 | - Contains data processing logic 91 | 92 | 5. **Execution and Validation** 93 | - Runs generated code 94 | - Checks processing results 95 | 96 | 6. **Generate metadata.csv** 97 | - Records dataset information 98 | - Establishes index relationships 99 | 100 | ## Code Implementation 101 | 102 | ### Core Class: SimplePreprocessor 103 | 104 | ```python 105 | class SimplePreprocessor: 106 | def __init__(self, target_spacing=[1.0, 1.0, 1.0], 107 | normalization_scheme="z-score", 108 | target_size=None): 109 | """ 110 | Initialize preprocessor 111 | 112 | Parameters: 113 | - target_spacing: Target voxel size, default [1.0, 1.0, 1.0] 114 | - normalization_scheme: Normalization scheme ("z-score"/"min-max") 115 | - target_size: Target size, e.g., [256, 256] 116 | """ 117 | ``` 118 | 119 | #### Main Methods: 120 | 121 | 1. **Data Loading** 122 | ```python 123 | def read_images(self, image_paths): 124 | """Load multimodal image data""" 125 | 126 | def read_seg(self, seg_path): 127 | """Load segmentation data""" 128 | ``` 129 | 130 | 2. **Data Preprocessing** 131 | ```python 132 | def crop(self, data_list, seg): 133 | """Z-axis intelligent cropping""" 134 | 135 | def _normalize_single_modality(self, data): 136 | """Single modality data normalization""" 137 | 138 | def compute_new_shape(self, old_shape, old_spacing, new_spacing): 139 | """Calculate resampling target shape""" 140 | 141 | def resample_data(self, data, new_shape, order=3): 142 | """Data resampling""" 143 | 144 | def resize_to_target_size(self, data, target_size, order=3): 145 | """Adjust data size""" 146 | ``` 147 | 148 | 3. **Processing Flow** 149 | ```python 150 | def run_case(self, image_paths, seg_path=None): 151 | """Execute complete preprocessing workflow""" 152 | ``` 153 | 154 | ### Metadata Generation System 155 | 156 | ```python 157 | def analyze_directory(root_directory, sample_folder_count=5, sample_file_count=10): 158 | """Analyze directory structure and sample""" 159 | 160 | def generate_metadata(root_directory, your_api_key=None): 161 | """Use LLM to generate metadata processing code""" 162 | 163 | def execute_metadata_script(root_directory): 164 | """Execute and validate metadata generation""" 165 | ``` 166 | 167 | ## User Guide 168 | 169 | ### 1. 
Basic Preprocessing Flow 170 | 171 | ```python 172 | from pipeline import SimplePreprocessor 173 | 174 | # Initialize preprocessor 175 | preprocessor = SimplePreprocessor( 176 | target_spacing=[1.0, 1.0, 1.0], 177 | normalization_scheme="z-score", 178 | target_size=[256, 256] 179 | ) 180 | 181 | # Prepare data paths 182 | image_paths = [ 183 | "path/to/flair.nii", 184 | "path/to/t1.nii", 185 | "path/to/t1ce.nii", 186 | "path/to/t2.nii" 187 | ] 188 | seg_path = "path/to/seg.nii" 189 | 190 | # Execute preprocessing 191 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path) 192 | ``` 193 | 194 | ### 2. Batch Processing 195 | 196 | ```python 197 | from pipeline import run_in_parallel 198 | 199 | # Prepare multiple samples 200 | cases = [ 201 | { 202 | "sample_id": "case_001", 203 | "image_paths": ["path/to/case1/flair.nii", ...], 204 | "seg_path": "path/to/case1/seg.nii" 205 | }, 206 | # More samples... 207 | ] 208 | 209 | # Parallel processing 210 | results = run_in_parallel(preprocessor, cases, "output_dir", num_workers=4) 211 | ``` 212 | 213 | ### 3. Metadata Generation 214 | 215 | ```python 216 | from LLM_metadata import analyze_directory, generate_metadata, execute_metadata_script 217 | 218 | # Analyze directory structure 219 | analyze_directory(root_directory="dataset_path", 220 | sample_folder_count=5, 221 | sample_file_count=10) 222 | 223 | # Generate metadata code 224 | generate_metadata(root_directory="dataset_path", 225 | your_api_key="your-deepseek-api-key") 226 | 227 | # Execute generated code 228 | execute_metadata_script(root_directory="dataset_path") 229 | ``` 230 | 231 | ## Configuration & Extension 232 | 233 | ### 1. Preprocessing Configuration 234 | 235 | Customize preprocessing behavior by modifying SimplePreprocessor initialization parameters: 236 | 237 | - `target_spacing`: Adjust target voxel size 238 | - `normalization_scheme`: Choose normalization scheme 239 | - `target_size`: Set output dimensions 240 | 241 | ### 2. Extension Features 242 | 243 | #### Add New Normalization Method: 244 | 245 | ```python 246 | def _normalize_custom(self, data): 247 | """ 248 | Custom normalization method 249 | """ 250 | # Implement your normalization logic 251 | return normalized_data 252 | 253 | # Add in SimplePreprocessor 254 | if self.normalization_scheme == "custom": 255 | data = self._normalize_custom(data) 256 | ``` 257 | 258 | #### Add New Preprocessing Step: 259 | 260 | ```python 261 | def new_preprocessing_step(self, data): 262 | """ 263 | New preprocessing step 264 | """ 265 | # Implement new preprocessing logic 266 | return processed_data 267 | 268 | # Add in run_case method 269 | data_list = [self.new_preprocessing_step(d) for d in data_list] 270 | ``` 271 | 272 | ## Examples 273 | 274 | ### 1. Single Modality CT Image Preprocessing 275 | 276 | ```python 277 | # Initialize preprocessor 278 | preprocessor = SimplePreprocessor( 279 | target_spacing=[1.0, 1.0, 1.0], 280 | normalization_scheme="min-max" 281 | ) 282 | 283 | # Process single CT image 284 | image_paths = ["path/to/ct.nii"] 285 | data_list, _, spacing, properties = preprocessor.run_case(image_paths) 286 | ``` 287 | 288 | ### 2. 
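Following on from the single-modality CT example, the returned arrays can be cached to disk so preprocessing does not have to be repeated for every experiment. The snippet below is only a sketch: the file names and the `data` key are illustrative choices (the batch pipeline writes its own `.npz` files when you use `run_in_parallel`), and `data_list`/`spacing` are simply the values returned by `run_case` in the example above.

```python
import numpy as np

# Continue from the single-modality CT example: persist the preprocessed volume
# so it does not need to be recomputed for every training run.
# Sketch only -- file names and the "data" key are illustrative, not part of the pipeline.
np.savez_compressed("ct_preprocessed.npz", data=data_list[0])
np.savez_compressed("ct_meta.npz", spacing=spacing, shape=data_list[0].shape)

# Reload and sanity-check the stored volume.
restored = np.load("ct_preprocessed.npz")["data"]
print(restored.shape, restored.min(), restored.max())
```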
Multimodal MRI Data Processing 289 | 290 | ```python 291 | # Initialize preprocessor 292 | preprocessor = SimplePreprocessor( 293 | target_spacing=[1.0, 1.0, 1.0], 294 | normalization_scheme="z-score", 295 | target_size=[256, 256] 296 | ) 297 | 298 | # Process multimodal MRI data 299 | image_paths = [ 300 | "path/to/flair.nii", 301 | "path/to/t1.nii", 302 | "path/to/t1ce.nii", 303 | "path/to/t2.nii" 304 | ] 305 | seg_path = "path/to/seg.nii" 306 | 307 | # Execute preprocessing 308 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path) 309 | ``` 310 | 311 | ### 3. Auto-generate Dataset Metadata 312 | 313 | ```python 314 | # Configure parameters 315 | root_dir = "path/to/dataset" 316 | api_key = "your-deepseek-api-key" 317 | 318 | # Execute complete metadata generation workflow 319 | analyze_directory(root_dir, sample_folder_count=5) 320 | generate_metadata(root_dir, api_key) 321 | execute_metadata_script(root_dir) 322 | ``` 323 | 324 | ## Jupyter Notebook Tutorials 325 | 326 | The project provides two detailed Jupyter Notebook tutorials: 327 | 328 | ### 1. tutorial.ipynb 329 | 330 | This is the main tutorial of the project, including: 331 | - Complete preprocessing pipeline usage examples 332 | - Detailed parameter explanations 333 | - Common use case demonstrations 334 | - Visualization of processing results 335 | 336 | New users are recommended to read this tutorial first, as it will help you quickly get started with the preprocessing pipeline. 337 | 338 | ### 2. How_I_Use_LLM_to_DIY_metadata.ipynb 339 | 340 | This is a detailed tutorial on how to use LLM for metadata generation, including: 341 | - Complete workflow of LLM metadata generation 342 | - DeepSeek API configuration and usage 343 | - Real cases of directory structure analysis 344 | - Solutions to common issues 345 | 346 | If you need to use the automatic metadata generation feature, it is recommended to read this tutorial thoroughly. 347 | 348 | ## Important Notes 349 | 350 | 1. Ensure correct input data format (supports .nii format) 351 | 2. Check sufficient disk space (processed data may be large) 352 | 3. Monitor memory usage (when processing large datasets) 353 | 4. Set appropriate parallel processing worker count 354 | 5. 
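These notes (including the last one, backing up the original data) are straightforward to turn into a small pre-flight check before a large run. The sketch below is hypothetical and not part of this project; it uses only the standard library, and the threshold, paths, and helper name are placeholders to adapt to your own setup.

```python
import os
import shutil

def preflight_check(dataset_dir, output_dir, min_free_gb=50, backup_dir=None):
    """Hypothetical pre-flight check covering the notes above before a large run."""
    # Note 1: input format -- make sure the dataset actually contains .nii files.
    nii_files = [os.path.join(root, f)
                 for root, _, files in os.walk(dataset_dir)
                 for f in files if f.endswith(".nii")]
    if not nii_files:
        raise FileNotFoundError(f"No .nii files found under {dataset_dir}")

    # Note 2: disk space -- preprocessed output can be larger than the input.
    os.makedirs(output_dir, exist_ok=True)
    free_gb = shutil.disk_usage(output_dir).free / 1e9
    if free_gb < min_free_gb:
        raise RuntimeError(f"Only {free_gb:.1f} GB free, expected >= {min_free_gb} GB")

    # Notes 3 and 4: memory and worker count -- leave one core idle by default.
    num_workers = max(1, (os.cpu_count() or 2) - 1)

    # Note 5: keep an untouched copy of the originals if a backup location is given.
    if backup_dir is not None and not os.path.exists(backup_dir):
        shutil.copytree(dataset_dir, backup_dir)

    return num_workers

# Example usage with the paths used elsewhere in this repository (sketch):
# num_workers = preflight_check("BraTS2021_Training_Data", "preprocessed_data")
# results = run_in_parallel(preprocessor, cases, "preprocessed_data", num_workers=num_workers)
```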
Backup original data 355 | -------------------------------------------------------------------------------- /tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "'/teamspace/studios/this_studio/PreProcPipe'" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "%load_ext autoreload\n", 21 | "%autoreload 2\n", 22 | "\n", 23 | "import os\n", 24 | "if os.getcwd().split('/')[-1] != 'PreProcPipe':\n", 25 | " os.chdir('/teamspace/studios/this_studio/PreProcPipe')\n", 26 | "os.getcwd()\n", 27 | "\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "在这个教程中我将以一个小数据集作为例子。" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "你可以看到我的repo中有[PreProcPipe/BraTS2021_Training_Data]()文件夹,里面有一些nii文件。" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "['BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii',\n", 53 | " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii',\n", 54 | " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii',\n", 55 | " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii',\n", 56 | " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii']" 57 | ] 58 | }, 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "\n", 66 | "# load nii files\n", 67 | "img_paths = []\n", 68 | "root_dir = 'BraTS2021_Training_Data/BraTS2021_00000'\n", 69 | "for folder in os.listdir(root_dir):\n", 70 | " folder_path = os.path.join(root_dir, folder)\n", 71 | " if os.path.isdir(folder_path): # 检查是否为目录\n", 72 | " # 遍历子目录中的文件\n", 73 | " for file in os.listdir(folder_path):\n", 74 | " if file.endswith('.nii'): # 检查是否为 .nii 文件\n", 75 | " img_paths.append(os.path.join(folder_path, file)) # 追加完整路径\n", 76 | "\n", 77 | "img_paths\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "(240, 240, 155)\n", 90 | "0.0 2934.0\n", 91 | "\n", 92 | "(240, 240, 155)\n", 93 | "0.0 4.0\n", 94 | "\n", 95 | "(240, 240, 155)\n", 96 | "0.0 2023.0\n", 97 | "\n", 98 | "(240, 240, 155)\n", 99 | "0.0 12343.0\n", 100 | "\n", 101 | "(240, 240, 155)\n", 102 | "0.0 2421.0\n", 103 | "\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "import nibabel as nib\n", 109 | "import numpy as np\n", 110 | "\n", 111 | "for img_path in img_paths:\n", 112 | " img = nib.load(img_path)\n", 113 | " img_data = img.get_fdata()\n", 114 | " print(img_data.shape)\n", 115 | " print(np.min(img_data), np.max(img_data))\n", 116 | " print()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "可视化检查一下" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 4, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "application/vnd.jupyter.widget-view+json": { 134 | "model_id": "df2a2b204e684225915836ef52905a0f", 135 | "version_major": 2, 
136 | "version_minor": 0 137 | }, 138 | "text/plain": [ 139 | "interactive(children=(IntSlider(value=77, description='z', max=154), Output()), _dom_classes=('widget-interact…" 140 | ] 141 | }, 142 | "metadata": {}, 143 | "output_type": "display_data" 144 | }, 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "(z)>" 149 | ] 150 | }, 151 | "execution_count": 4, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "import nibabel as nib\n", 158 | "import numpy as np\n", 159 | "import matplotlib.pyplot as plt\n", 160 | "from ipywidgets import interact, IntSlider\n", 161 | "\n", 162 | "# 从 img_paths 加载所有图像数据\n", 163 | "def load_images(img_paths):\n", 164 | " images = []\n", 165 | " for path in img_paths:\n", 166 | " img = nib.load(path).get_fdata() # 加载数据\n", 167 | " images.append(img)\n", 168 | " return images\n", 169 | "\n", 170 | "# 显示某一 z 切片的函数\n", 171 | "def show_slices(images, z):\n", 172 | " num_images = len(images)\n", 173 | " fig, axes = plt.subplots(1, num_images, figsize=(5 * num_images, 5))\n", 174 | " if num_images == 1:\n", 175 | " axes = [axes]\n", 176 | "\n", 177 | " for i, img in enumerate(images):\n", 178 | " axes[i].imshow(img[:, :, z], cmap=\"gray\")\n", 179 | " axes[i].set_title(f\"Image {i+1} - Z: {z}\")\n", 180 | " plt.show()\n", 181 | "\n", 182 | "images = load_images(img_paths)\n", 183 | "\n", 184 | "# 使用 ipywidgets 创建滑块交互\n", 185 | "z_max = images[0].shape[2] - 1\n", 186 | "interact(lambda z: show_slices(images, z), z=IntSlider(min=0, max=z_max, step=1, value=z_max // 2))\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "万事俱备,我们开始使用`PreProcPipe`吧!" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "# PreProcPipe使用范例" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "为了将文件输入给PPP,我建议用一个像csv这样的元数据文件来记录文件路径,然后规范地输入。" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "很遗憾,这个制作csv的方法是因人而异的,因为每个数据集长得都不一样,不是吗😀" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 5, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "Metadata saved to BraTS2021_Training_Data/metadata.csv\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "import os\n", 232 | "import pandas as pd\n", 233 | "\n", 234 | "def collect_metadata(root_dir):\n", 235 | " \"\"\"\n", 236 | " Traverse the root_dir and collect the paths of multimodal images and segmentation for each sample.\n", 237 | " Return a list containing sample data, with each row corresponding to a sample and its file paths.\n", 238 | " \"\"\"\n", 239 | " metadata = []\n", 240 | " for sample_folder in os.listdir(root_dir):\n", 241 | " sample_path = os.path.join(root_dir, sample_folder)\n", 242 | " if os.path.isdir(sample_path): # Ensure it is a directory\n", 243 | " # Initialize a dictionary to store paths\n", 244 | " sample_data = {\n", 245 | " \"sample_id\": sample_folder,\n", 246 | " \"t1\": None,\n", 247 | " \"t1ce\": None,\n", 248 | " \"t2\": None,\n", 249 | " \"flair\": None,\n", 250 | " \"seg\": None\n", 251 | " }\n", 252 | " for modality_folder in os.listdir(sample_path):\n", 253 | " modality_path = os.path.join(sample_path, modality_folder)\n", 254 | " if os.path.isdir(modality_path): # Ensure it is a modality subdirectory\n", 255 | " for file in os.listdir(modality_path):\n", 256 | " 
if file.endswith('.nii'): # Ensure it is a .nii file\n", 257 | " # Classify based on modality\n", 258 | " if \"t1.nii\" in file and \"ce\" not in file:\n", 259 | " sample_data[\"t1\"] = os.path.join(modality_path, file)\n", 260 | " elif \"t1ce\" in file:\n", 261 | " sample_data[\"t1ce\"] = os.path.join(modality_path, file)\n", 262 | " elif \"t2\" in file:\n", 263 | " sample_data[\"t2\"] = os.path.join(modality_path, file)\n", 264 | " elif \"flair\" in file:\n", 265 | " sample_data[\"flair\"] = os.path.join(modality_path, file)\n", 266 | " elif \"seg\" in file:\n", 267 | " sample_data[\"seg\"] = os.path.join(modality_path, file)\n", 268 | " metadata.append(sample_data)\n", 269 | " return metadata\n", 270 | "\n", 271 | "# Root directory path\n", 272 | "root_dir = \"BraTS2021_Training_Data\"\n", 273 | "\n", 274 | "# Collect metadata\n", 275 | "metadata = collect_metadata(root_dir)\n", 276 | "\n", 277 | "# Convert to DataFrame\n", 278 | "df = pd.DataFrame(metadata)\n", 279 | "\n", 280 | "# Save as CSV file\n", 281 | "output_csv = f\"{root_dir}/metadata.csv\"\n", 282 | "df.to_csv(output_csv, index=False)\n", 283 | "\n", 284 | "print(f\"Metadata saved to {output_csv}\")" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 7, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "Step 1: Loading multi-modal image data...Step 1: Loading multi-modal image data...\n", 297 | "Step 1: Loading multi-modal image data...\n", 298 | "\n", 299 | "\n", 300 | "Step 1: Loading segmentation data...\n", 301 | "\n", 302 | "Original image shape (modality 0): (240, 240, 155)\n", 303 | "Original image shape (modality 1): (240, 240, 155)\n", 304 | "Original image shape (modality 2): (240, 240, 155)\n", 305 | "Original image shape (modality 3): (240, 240, 155)\n", 306 | "Original segmentation shape: (240, 240, 155)\n", 307 | "\n", 308 | "Step 2: Cropping to non-zero regions...\n", 309 | "\n", 310 | "Step 1: Loading segmentation data...\n", 311 | "Step 2: Cropping to non-zero regions along Z-axis...\n", 312 | "\n", 313 | "Step 1: Loading segmentation data...\n", 314 | "Original image shape (modality 0): (240, 240, 155)\n", 315 | "\n", 316 | "Original image shape (modality 1): (240, 240, 155)Z-axis cropping range: 0 to 140\n", 317 | "\n", 318 | "Original image shape (modality 2): (240, 240, 155)\n", 319 | "\n", 320 | "Original image shape (modality 3): (240, 240, 155)\n", 321 | "Shapes before cropping: [(240, 240, 155), (240, 240, 155), (240, 240, 155), (240, 240, 155)]Original image shape (modality 0): (240, 240, 155)\n", 322 | "Original segmentation shape: (240, 240, 155)\n", 323 | "Shapes after cropping: [(240, 240, 140), (240, 240, 140), (240, 240, 140), (240, 240, 140)]Original image shape (modality 1): (240, 240, 155)\n", 324 | "\n", 325 | "\n", 326 | "\n", 327 | "Original image shape (modality 2): (240, 240, 155)Step 2: Cropping to non-zero regions...\n", 328 | "Segmentation shape after cropping: (240, 240, 140)\n", 329 | "Original image shape (modality 3): (240, 240, 155)\n", 330 | "Step 2: Cropping to non-zero regions along Z-axis...\n", 331 | "\n", 332 | "Step 3: Normalizing image data...Original segmentation shape: (240, 240, 155)\n", 333 | "\n", 334 | "Step 2: Cropping to non-zero regions...\n", 335 | "Step 2: Cropping to non-zero regions along Z-axis...\n", 336 | "\n", 337 | "Z-axis cropping range: 4 to 150\n", 338 | "Shapes before cropping: [(240, 240, 155), (240, 240, 155), (240, 240, 155), (240, 240, 
155)]\n", 339 | "Shapes after cropping: [(240, 240, 146), (240, 240, 146), (240, 240, 146), (240, 240, 146)]Z-axis cropping range: 10 to 145\n", 340 | "Segmentation shape after cropping: (240, 240, 146)\n", 341 | "Shapes before cropping: [(240, 240, 155), (240, 240, 155), (240, 240, 155), (240, 240, 155)]\n", 342 | "Shapes after cropping: [(240, 240, 135), (240, 240, 135), (240, 240, 135), (240, 240, 135)]\n", 343 | "Segmentation shape after cropping: (240, 240, 135)\n", 344 | "\n", 345 | "Step 3: Normalizing image data...\n", 346 | "Step 3: Normalizing image data..." 347 | ] 348 | }, 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "\n", 354 | "\n", 355 | "Step 4: Resampling data to target spacing...\n", 356 | "Computed resize factors: [1.0, 1.0, 1.0]\n", 357 | "Computed new shape: [240, 240, 140]\n", 358 | "Resampling data...\n", 359 | "\n", 360 | "Step 4: Resampling data to target spacing...\n", 361 | "Computed resize factors: [1.0, 1.0, 1.0]\n", 362 | "Computed new shape: [240, 240, 135]\n", 363 | "Resampling data...\n", 364 | "\n", 365 | "Step 4: Resampling data to target spacing...\n", 366 | "Computed resize factors: [1.0, 1.0, 1.0]\n", 367 | "Computed new shape: [240, 240, 146]\n", 368 | "Resampling data...\n", 369 | "Data resampled to shape: (240, 240, 140)\n", 370 | "Resampling data...\n", 371 | "Data resampled to shape: (240, 240, 135)\n", 372 | "Resampling data...\n", 373 | "Data resampled to shape: (240, 240, 146)\n", 374 | "Resampling data...\n", 375 | "Data resampled to shape: (240, 240, 140)\n", 376 | "Resampling data...\n", 377 | "Data resampled to shape: (240, 240, 135)\n", 378 | "\n", 379 | "Resampling data...Data resampled to shape: (240, 240, 146)\n", 380 | "Resampling data...\n", 381 | "Data resampled to shape: (240, 240, 140)\n", 382 | "Resampling data...\n", 383 | "Data resampled to shape: (240, 240, 135)\n", 384 | "Resampling data...\n", 385 | "Data resampled to shape: (240, 240, 146)\n", 386 | "Resampling data...\n", 387 | "Data resampled to shape: (240, 240, 140)\n", 388 | "Resampling data...\n", 389 | "Data resampled to shape: (240, 240, 140)\n", 390 | "\n", 391 | "Step 5: Resizing data to target size...\n", 392 | "Resizing data to target size...\n", 393 | "Data resampled to shape: (240, 240, 135)\n", 394 | "Resampling data...\n", 395 | "Data resampled to shape: (240, 240, 135)\n", 396 | "\n", 397 | "Step 5: Resizing data to target size...\n", 398 | "Resizing data to target size...\n", 399 | "Data resampled to shape: (240, 240, 146)\n", 400 | "Resampling data...\n", 401 | "Data resampled to shape: (240, 240, 146)\n", 402 | "\n", 403 | "Step 5: Resizing data to target size...\n", 404 | "Resizing data to target size...\n", 405 | "Data resized to shape: (256, 256, 140)\n", 406 | "Resizing data to target size...\n", 407 | "Data resized to shape: (256, 256, 135)\n", 408 | "Resizing data to target size...\n", 409 | "Data resized to shape: (256, 256, 146)\n", 410 | "Resizing data to target size...\n", 411 | "Data resized to shape: (256, 256, 140)\n", 412 | "Resizing data to target size...\n", 413 | "Data resized to shape: (256, 256, 135)\n", 414 | "Resizing data to target size...\n", 415 | "Data resized to shape: (256, 256, 146)\n", 416 | "Resizing data to target size...\n", 417 | "Data resized to shape: (256, 256, 140)\n", 418 | "Resizing data to target size...\n", 419 | "Data resized to shape: (256, 256, 135)\n", 420 | "Resizing data to target size...\n", 421 | "Data resized to shape: (256, 256, 146)\n", 422 | "Resizing data to 
target size...\n", 423 | "Data resized to shape: (256, 256, 140)\n", 424 | "Resizing data to target size...\n", 425 | "Data resized to shape: (256, 256, 135)\n", 426 | "Data resized to shape: (256, 256, 140)\n", 427 | "\n", 428 | "Preprocessing completed.\n", 429 | "\n", 430 | "Resizing data to target size...\n", 431 | "Data resized to shape: (256, 256, 135)\n", 432 | "\n", 433 | "Preprocessing completed.\n", 434 | "\n", 435 | "Data resized to shape: (256, 256, 146)\n", 436 | "Resizing data to target size...\n", 437 | "Data resized to shape: (256, 256, 146)\n", 438 | "\n", 439 | "Preprocessing completed.\n", 440 | "\n" 441 | ] 442 | } 443 | ], 444 | "source": [ 445 | "import pandas as pd\n", 446 | "from pipeline import SimplePreprocessor as ppp\n", 447 | "from pipeline import run_in_parallel\n", 448 | "\n", 449 | "# 定义读取 metadata.csv 并生成 cases 列表的函数\n", 450 | "def load_cases_from_metadata(csv_path):\n", 451 | " \"\"\"\n", 452 | " 从 metadata.csv 加载病例信息,并生成 (image_paths, seg_path) 的列表。\n", 453 | " \n", 454 | " 参数:\n", 455 | " - csv_path: metadata.csv 文件路径。\n", 456 | " \n", 457 | " 返回:\n", 458 | " - cases: 包含病例信息的列表,每个元素是一个字典,格式为:\n", 459 | " {\n", 460 | " \"sample_id\": 样本ID,\n", 461 | " \"image_paths\": [模态1路径, 模态2路径, ...],\n", 462 | " \"seg_path\": 分割路径或 None\n", 463 | " }\n", 464 | " \"\"\"\n", 465 | " df = pd.read_csv(csv_path)\n", 466 | " cases = []\n", 467 | " for _, row in df.iterrows():\n", 468 | " # 提取模态路径\n", 469 | " image_paths = [row['t1'], row['t1ce'], row['t2'], row['flair']]\n", 470 | " # 过滤掉空值\n", 471 | " image_paths = [path for path in image_paths if pd.notnull(path)]\n", 472 | " # 提取分割路径\n", 473 | " seg_path = row['seg'] if pd.notnull(row['seg']) else None\n", 474 | " # 添加到 cases\n", 475 | " cases.append({\n", 476 | " \"sample_id\": row['sample_id'],\n", 477 | " \"image_paths\": image_paths,\n", 478 | " \"seg_path\": seg_path\n", 479 | " })\n", 480 | " return cases\n", 481 | "\n", 482 | "# 加载 metadata.csv\n", 483 | "metadata_csv_path = \"BraTS2021_Training_Data/metadata.csv\"\n", 484 | "cases = load_cases_from_metadata(metadata_csv_path)\n", 485 | "\n", 486 | "# 初始化预处理器\n", 487 | "preprocessor = ppp(\n", 488 | " target_spacing=[1.0, 1.0, 1.0], \n", 489 | " normalization_scheme=\"z-score\", \n", 490 | " target_size=[256, 256]\n", 491 | ")\n", 492 | "\n", 493 | "# 使用多进程运行预处理\n", 494 | "num_workers = 4 # 设置进程数\n", 495 | "results = run_in_parallel(preprocessor, cases, num_workers=num_workers, output_root=\"preprocessed_data\")\n", 496 | "\n" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 8, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "text/plain": [ 507 | "[{'sample_id': 'BraTS2021_00000',\n", 508 | " 'modality_paths': ['preprocessed_data/BraTS2021_00000/00000057_brain_t1.npz',\n", 509 | " 'preprocessed_data/BraTS2021_00000/00000057_brain_t1ce.npz',\n", 510 | " 'preprocessed_data/BraTS2021_00000/00000057_brain_t2.npz',\n", 511 | " 'preprocessed_data/BraTS2021_00000/00000057_brain_flair.npz'],\n", 512 | " 'seg_path': 'preprocessed_data/BraTS2021_00000/seg.npz',\n", 513 | " 'meta_path': 'preprocessed_data/BraTS2021_00000/meta.npz'},\n", 514 | " {'sample_id': 'BraTS2021_00002',\n", 515 | " 'modality_paths': ['preprocessed_data/BraTS2021_00002/00000014_brain_t1.npz',\n", 516 | " 'preprocessed_data/BraTS2021_00002/00000014_brain_t1ce.npz',\n", 517 | " 'preprocessed_data/BraTS2021_00002/00000014_brain_t2.npz',\n", 518 | " 'preprocessed_data/BraTS2021_00002/00000014_brain_flair.npz'],\n", 519 | " 'seg_path': 
'preprocessed_data/BraTS2021_00002/seg.npz',\n", 520 | " 'meta_path': 'preprocessed_data/BraTS2021_00002/meta.npz'},\n", 521 | " {'sample_id': 'BraTS2021_00003',\n", 522 | " 'modality_paths': ['preprocessed_data/BraTS2021_00003/00000017_brain_t1.npz',\n", 523 | " 'preprocessed_data/BraTS2021_00003/00000017_brain_t1ce.npz',\n", 524 | " 'preprocessed_data/BraTS2021_00003/00000017_brain_t2.npz',\n", 525 | " 'preprocessed_data/BraTS2021_00003/00000017_brain_flair.npz'],\n", 526 | " 'seg_path': 'preprocessed_data/BraTS2021_00003/seg.npz',\n", 527 | " 'meta_path': 'preprocessed_data/BraTS2021_00003/meta.npz'}]" 528 | ] 529 | }, 530 | "execution_count": 8, 531 | "metadata": {}, 532 | "output_type": "execute_result" 533 | } 534 | ], 535 | "source": [ 536 | "results" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 10, 542 | "metadata": {}, 543 | "outputs": [ 544 | { 545 | "name": "stdout", 546 | "output_type": "stream", 547 | "text": [ 548 | "Modality paths: ['preprocessed_data/BraTS2021_00002/00000014_brain_t1.npz', 'preprocessed_data/BraTS2021_00002/00000014_brain_t1ce.npz', 'preprocessed_data/BraTS2021_00002/00000014_brain_t2.npz', 'preprocessed_data/BraTS2021_00002/00000014_brain_flair.npz']\n", 549 | "Seg path: preprocessed_data/BraTS2021_00002/seg.npz\n" 550 | ] 551 | }, 552 | { 553 | "data": { 554 | "application/vnd.jupyter.widget-view+json": { 555 | "model_id": "fc3eeee8f53342ee802db7860ca1364a", 556 | "version_major": 2, 557 | "version_minor": 0 558 | }, 559 | "text/plain": [ 560 | "interactive(children=(IntSlider(value=67, description='z_idx', max=134), Output()), _dom_classes=('widget-inte…" 561 | ] 562 | }, 563 | "metadata": {}, 564 | "output_type": "display_data" 565 | } 566 | ], 567 | "source": [ 568 | "import numpy as np\n", 569 | "import matplotlib.pyplot as plt\n", 570 | "from ipywidgets import interact, IntSlider\n", 571 | "import os\n", 572 | "\n", 573 | "import pandas as pd\n", 574 | "import ast\n", 575 | "import os\n", 576 | "\n", 577 | "metadata_csv = \"preprocessed_data/metadata.csv\"\n", 578 | "\n", 579 | "# 目标 sample_id\n", 580 | "target_sample_id = \"BraTS2021_00002\"\n", 581 | "\n", 582 | "# 读取 CSV\n", 583 | "df = pd.read_csv(metadata_csv)\n", 584 | "\n", 585 | "# 查找目标样本行\n", 586 | "row = df.loc[df['sample_id'] == target_sample_id].iloc[0]\n", 587 | "\n", 588 | "# 解析 image_paths 列(它是一个字符串表示的列表)\n", 589 | "image_paths = ast.literal_eval(row['modality_paths'])\n", 590 | "\n", 591 | "# 分割路径\n", 592 | "seg_path = row['seg_path'] if pd.notnull(row['seg_path']) else None\n", 593 | "\n", 594 | "# 根据需要,也可以将这些路径与根目录拼接\n", 595 | "# 如果 metadata.csv 中的路径已经是相对于 output_root 的相对路径\n", 596 | "# 且 output_root 为 \"preprocessed_data\"\n", 597 | "output_root = \"preprocessed_data\"\n", 598 | "image_paths = [os.path.join(output_root, p) for p in image_paths]\n", 599 | "if seg_path is not None:\n", 600 | " seg_path = os.path.join(output_root, seg_path)\n", 601 | "\n", 602 | "# 此时,image_paths 和 seg_path 就是从 metadata 中获得的对应文件路径列表和分割路径\n", 603 | "print(\"Modality paths:\", image_paths)\n", 604 | "print(\"Seg path:\", seg_path)\n", 605 | "\n", 606 | "\n", 607 | "# 假设所有文件都在 \"preprocessed_data\" 目录下\n", 608 | "# image_paths = [os.path.join(\"preprocessed_data\", p) for p in image_paths]\n", 609 | "# seg_path = os.path.join(\"preprocessed_data\", seg_path)\n", 610 | "\n", 611 | "# 加载图像数据\n", 612 | "modality_data = []\n", 613 | "for path in image_paths:\n", 614 | " data = np.load(path)[\"data\"] # npz 的 key 是data\n", 615 | " modality_data.append(data)\n", 616 | "\n", 617 
| "# 将模态合并为多通道数据 (H, W, D, C)\n", 618 | "multi_modal_data = np.stack(modality_data, axis=-1) # (H, W, D, C)\n", 619 | "\n", 620 | "# 加载分割数据\n", 621 | "seg_data = np.load(seg_path)[\"data\"] # (H, W, D)\n", 622 | "\n", 623 | "# 获取数据形状和 D 轴大小\n", 624 | "H, W, D, C = multi_modal_data.shape\n", 625 | "\n", 626 | "def display_all_modalities(z_idx):\n", 627 | " \"\"\"\n", 628 | " 显示给定z轴索引下的所有模态图像及对应的分割mask叠加结果。\n", 629 | " \"\"\"\n", 630 | " fig, axes = plt.subplots(1, C, figsize=(4*C, 4))\n", 631 | " \n", 632 | " # 遍历每个模态\n", 633 | " for i in range(C):\n", 634 | " img_slice = multi_modal_data[..., i][..., z_idx]\n", 635 | " print(f\"min&max: {np.min(multi_modal_data[..., i]), np.max(multi_modal_data[..., i])}\") \n", 636 | "\n", 637 | " seg_slice = seg_data[..., z_idx]\n", 638 | " \n", 639 | " axes[i].imshow(img_slice, cmap='gray')\n", 640 | " \n", 641 | " # 使用 alpha 叠加 seg\n", 642 | " seg_mask = np.ma.masked_where(seg_slice == 0, seg_slice)\n", 643 | " axes[i].imshow(seg_mask, cmap='jet', alpha=0.5)\n", 644 | " \n", 645 | " axes[i].set_title(f\"Modality {i}, Z={z_idx}\")\n", 646 | " axes[i].axis('off')\n", 647 | " \n", 648 | " plt.tight_layout()\n", 649 | " plt.show()\n", 650 | "\n", 651 | "# 使用交互式滑块:只需要控制 z_idx 即可\n", 652 | "interact(\n", 653 | " display_all_modalities, \n", 654 | " z_idx=IntSlider(min=0, max=D-1, step=1, value=D//2)\n", 655 | ");\n" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [ 662 | "大功告成噜💅" 663 | ] 664 | } 665 | ], 666 | "metadata": { 667 | "kernelspec": { 668 | "display_name": "cloudspace", 669 | "language": "python", 670 | "name": "python3" 671 | }, 672 | "language_info": { 673 | "codemirror_mode": { 674 | "name": "ipython", 675 | "version": 3 676 | }, 677 | "file_extension": ".py", 678 | "mimetype": "text/x-python", 679 | "name": "python", 680 | "nbconvert_exporter": "python", 681 | "pygments_lexer": "ipython3", 682 | "version": "3.10.10" 683 | } 684 | }, 685 | "nbformat": 4, 686 | "nbformat_minor": 2 687 | } 688 | --------------------------------------------------------------------------------