├── .gitignore
├── BraTS2021_Training_Data
    ├── BraTS2021_00000
    │   ├── BraTS2021_00000_flair
    │   │   └── 00000057_brain_flair.nii
    │   ├── BraTS2021_00000_seg
    │   │   └── 00000057_final_seg.nii
    │   ├── BraTS2021_00000_t1
    │   │   └── 00000057_brain_t1.nii
    │   ├── BraTS2021_00000_t1ce
    │   │   └── 00000057_brain_t1ce.nii
    │   └── BraTS2021_00000_t2
    │   │   └── 00000057_brain_t2.nii
    ├── BraTS2021_00002
    │   ├── BraTS2021_00002_flair
    │   │   └── 00000014_brain_flair.nii
    │   ├── BraTS2021_00002_seg
    │   │   └── BraTS2021_00002_seg_new.nii
    │   ├── BraTS2021_00002_t1
    │   │   └── 00000014_brain_t1.nii
    │   ├── BraTS2021_00002_t1ce
    │   │   └── 00000014_brain_t1ce.nii
    │   └── BraTS2021_00002_t2
    │   │   └── 00000014_brain_t2.nii
    └── BraTS2021_00003
    │   ├── BraTS2021_00003_flair
    │       └── 00000017_brain_flair.nii
    │   ├── BraTS2021_00003_seg
    │       └── BraTS2021_00003_seg_new.nii
    │   ├── BraTS2021_00003_t1
    │       └── 00000017_brain_t1.nii
    │   ├── BraTS2021_00003_t1ce
    │       └── 00000017_brain_t1ce.nii
    │   └── BraTS2021_00003_t2
    │       └── 00000017_brain_t2.nii
├── How_I_Use_LLM_to_DIY_metadata.ipynb
├── LLM_metadata.py
├── __pycache__
    ├── config.cpython-310.pyc
    └── pipeline.cpython-310.pyc
├── after.png
├── before.png
├── pipeline.py
├── pipeline_example.py
├── readme.md
├── readme_en.md
└── tutorial.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | config.py
2 | BraTS2021_Training_Data
3 | result.json
4 | preprocessed_data
5 | __pycache__
6 | .vscode


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii


--------------------------------------------------------------------------------
/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii


--------------------------------------------------------------------------------
/How_I_Use_LLM_to_DIY_metadata.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 17,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "{\n",
 13 |       "    \"root_directory\": \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data\",\n",
 14 |       "    \"a_level_summary\": {\n",
 15 |       "        \"total_a_folders\": 3,\n",
 16 |       "        \"example_a_folders\": [\n",
 17 |       "            \"BraTS2021_00000\",\n",
 18 |       "            \"BraTS2021_00002\",\n",
 19 |       "            \"BraTS2021_00003\"\n",
 20 |       "        ]\n",
 21 |       "    },\n",
 22 |       "    \"sampled_a_folders\": [\n",
 23 |       "        {\n",
 24 |       "            \"a_folder_name\": \"BraTS2021_00002\",\n",
 25 |       "            \"directory_tree\": [\n",
 26 |       "                {\n",
 27 |       "                    \"level\": 0,\n",
 28 |       "                    \"folder_name\": \"BraTS2021_00002\",\n",
 29 |       "                    \"sub_folders\": [\n",
 30 |       "                        \"BraTS2021_00002_flair\",\n",
 31 |       "                        \"BraTS2021_00002_seg\",\n",
 32 |       "                        \"BraTS2021_00002_t1\",\n",
 33 |       "                        \"BraTS2021_00002_t1ce\",\n",
 34 |       "                        \"BraTS2021_00002_t2\"\n",
 35 |       "                    ],\n",
 36 |       "                    \"file_count\": 0\n",
 37 |       "                },\n",
 38 |       "                {\n",
 39 |       "                    \"level\": 1,\n",
 40 |       "                    \"folder_name\": \"BraTS2021_00002_flair\",\n",
 41 |       "                    \"sub_folders\": [],\n",
 42 |       "                    \"file_count\": 1\n",
 43 |       "                },\n",
 44 |       "                {\n",
 45 |       "                    \"level\": 1,\n",
 46 |       "                    \"folder_name\": \"BraTS2021_00002_seg\",\n",
 47 |       "                    \"sub_folders\": [],\n",
 48 |       "                    \"file_count\": 1\n",
 49 |       "                },\n",
 50 |       "                {\n",
 51 |       "                    \"level\": 1,\n",
 52 |       "                    \"folder_name\": \"BraTS2021_00002_t1\",\n",
 53 |       "                    \"sub_folders\": [],\n",
 54 |       "                    \"file_count\": 1\n",
 55 |       "                },\n",
 56 |       "                {\n",
 57 |       "                    \"level\": 1,\n",
 58 |       "                    \"folder_name\": \"BraTS2021_00002_t1ce\",\n",
 59 |       "                    \"sub_folders\": [],\n",
 60 |       "                    \"file_count\": 1\n",
 61 |       "                },\n",
 62 |       "                {\n",
 63 |       "                    \"level\": 1,\n",
 64 |       "                    \"folder_name\": \"BraTS2021_00002_t2\",\n",
 65 |       "                    \"sub_folders\": [],\n",
 66 |       "                    \"file_count\": 1\n",
 67 |       "                }\n",
 68 |       "            ],\n",
 69 |       "            \"sampled_files\": [\n",
 70 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii\",\n",
 71 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii\",\n",
 72 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii\",\n",
 73 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii\",\n",
 74 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii\"\n",
 75 |       "            ]\n",
 76 |       "        },\n",
 77 |       "        {\n",
 78 |       "            \"a_folder_name\": \"BraTS2021_00000\",\n",
 79 |       "            \"directory_tree\": [\n",
 80 |       "                {\n",
 81 |       "                    \"level\": 0,\n",
 82 |       "                    \"folder_name\": \"BraTS2021_00000\",\n",
 83 |       "                    \"sub_folders\": [\n",
 84 |       "                        \"BraTS2021_00000_flair\",\n",
 85 |       "                        \"BraTS2021_00000_seg\",\n",
 86 |       "                        \"BraTS2021_00000_t1\",\n",
 87 |       "                        \"BraTS2021_00000_t1ce\",\n",
 88 |       "                        \"BraTS2021_00000_t2\"\n",
 89 |       "                    ],\n",
 90 |       "                    \"file_count\": 0\n",
 91 |       "                },\n",
 92 |       "                {\n",
 93 |       "                    \"level\": 1,\n",
 94 |       "                    \"folder_name\": \"BraTS2021_00000_flair\",\n",
 95 |       "                    \"sub_folders\": [],\n",
 96 |       "                    \"file_count\": 1\n",
 97 |       "                },\n",
 98 |       "                {\n",
 99 |       "                    \"level\": 1,\n",
100 |       "                    \"folder_name\": \"BraTS2021_00000_seg\",\n",
101 |       "                    \"sub_folders\": [],\n",
102 |       "                    \"file_count\": 1\n",
103 |       "                },\n",
104 |       "                {\n",
105 |       "                    \"level\": 1,\n",
106 |       "                    \"folder_name\": \"BraTS2021_00000_t1\",\n",
107 |       "                    \"sub_folders\": [],\n",
108 |       "                    \"file_count\": 1\n",
109 |       "                },\n",
110 |       "                {\n",
111 |       "                    \"level\": 1,\n",
112 |       "                    \"folder_name\": \"BraTS2021_00000_t1ce\",\n",
113 |       "                    \"sub_folders\": [],\n",
114 |       "                    \"file_count\": 1\n",
115 |       "                },\n",
116 |       "                {\n",
117 |       "                    \"level\": 1,\n",
118 |       "                    \"folder_name\": \"BraTS2021_00000_t2\",\n",
119 |       "                    \"sub_folders\": [],\n",
120 |       "                    \"file_count\": 1\n",
121 |       "                }\n",
122 |       "            ],\n",
123 |       "            \"sampled_files\": [\n",
124 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii\",\n",
125 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii\",\n",
126 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii\",\n",
127 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii\",\n",
128 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii\"\n",
129 |       "            ]\n",
130 |       "        },\n",
131 |       "        {\n",
132 |       "            \"a_folder_name\": \"BraTS2021_00003\",\n",
133 |       "            \"directory_tree\": [\n",
134 |       "                {\n",
135 |       "                    \"level\": 0,\n",
136 |       "                    \"folder_name\": \"BraTS2021_00003\",\n",
137 |       "                    \"sub_folders\": [\n",
138 |       "                        \"BraTS2021_00003_flair\",\n",
139 |       "                        \"BraTS2021_00003_seg\",\n",
140 |       "                        \"BraTS2021_00003_t1\",\n",
141 |       "                        \"BraTS2021_00003_t1ce\",\n",
142 |       "                        \"BraTS2021_00003_t2\"\n",
143 |       "                    ],\n",
144 |       "                    \"file_count\": 0\n",
145 |       "                },\n",
146 |       "                {\n",
147 |       "                    \"level\": 1,\n",
148 |       "                    \"folder_name\": \"BraTS2021_00003_flair\",\n",
149 |       "                    \"sub_folders\": [],\n",
150 |       "                    \"file_count\": 1\n",
151 |       "                },\n",
152 |       "                {\n",
153 |       "                    \"level\": 1,\n",
154 |       "                    \"folder_name\": \"BraTS2021_00003_seg\",\n",
155 |       "                    \"sub_folders\": [],\n",
156 |       "                    \"file_count\": 1\n",
157 |       "                },\n",
158 |       "                {\n",
159 |       "                    \"level\": 1,\n",
160 |       "                    \"folder_name\": \"BraTS2021_00003_t1\",\n",
161 |       "                    \"sub_folders\": [],\n",
162 |       "                    \"file_count\": 1\n",
163 |       "                },\n",
164 |       "                {\n",
165 |       "                    \"level\": 1,\n",
166 |       "                    \"folder_name\": \"BraTS2021_00003_t1ce\",\n",
167 |       "                    \"sub_folders\": [],\n",
168 |       "                    \"file_count\": 1\n",
169 |       "                },\n",
170 |       "                {\n",
171 |       "                    \"level\": 1,\n",
172 |       "                    \"folder_name\": \"BraTS2021_00003_t2\",\n",
173 |       "                    \"sub_folders\": [],\n",
174 |       "                    \"file_count\": 1\n",
175 |       "                }\n",
176 |       "            ],\n",
177 |       "            \"sampled_files\": [\n",
178 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii\",\n",
179 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii\",\n",
180 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii\",\n",
181 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii\",\n",
182 |       "                \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii\"\n",
183 |       "            ]\n",
184 |       "        }\n",
185 |       "    ]\n",
186 |       "}\n",
187 |       "\n",
188 |       "结果已保存到: directory_analysis.json\n"
189 |      ]
190 |     }
191 |    ],
192 |    "source": [
193 |     "import os\n",
194 |     "import random\n",
195 |     "import json\n",
196 |     "\n",
197 |     "def analyze_directory(root_dir, sample_folder_count=1, sample_file_count=5):\n",
198 |     "    # 用于存储最终的结果\n",
199 |     "    result = {\n",
200 |     "        \"root_directory\": root_dir,\n",
201 |     "        \"a_level_summary\": {\n",
202 |     "            \"total_a_folders\": 0,\n",
203 |     "            \"example_a_folders\": [],\n",
204 |     "        },\n",
205 |     "        \"sampled_a_folders\": []\n",
206 |     "    }\n",
207 |     "\n",
208 |     "    # 1. 计算根目录下的 A 级文件夹数量并打印\n",
209 |     "    a_level_folders = [f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))]\n",
210 |     "    result[\"a_level_summary\"][\"total_a_folders\"] = len(a_level_folders)\n",
211 |     "    result[\"a_level_summary\"][\"example_a_folders\"] = a_level_folders[:5]\n",
212 |     "\n",
213 |     "    # 2. 随机采样 A 级文件夹\n",
214 |     "    sampled_a_folders = random.sample(a_level_folders, min(sample_folder_count, len(a_level_folders)))\n",
215 |     "\n",
216 |     "    for sampled_a_folder in sampled_a_folders:\n",
217 |     "        sampled_a_path = os.path.join(root_dir, sampled_a_folder)\n",
218 |     "        sampled_a_info = {\n",
219 |     "            \"a_folder_name\": sampled_a_folder,\n",
220 |     "            \"directory_tree\": [],\n",
221 |     "            \"sampled_files\": []\n",
222 |     "        }\n",
223 |     "\n",
224 |     "        # 3. 穷尽 A 级文件夹下的目录树\n",
225 |     "        file_list = []\n",
226 |     "        for root, dirs, files in os.walk(sampled_a_path):\n",
227 |     "            # 获取当前路径的相对路径和层级\n",
228 |     "            relative_root = os.path.relpath(root, sampled_a_path)\n",
229 |     "            folder_level = len(relative_root.split(os.sep)) if relative_root != \".\" else 0\n",
230 |     "\n",
231 |     "            # 保存目录树信息\n",
232 |     "            sampled_a_info[\"directory_tree\"].append({\n",
233 |     "                \"level\": folder_level,\n",
234 |     "                \"folder_name\": os.path.basename(root),\n",
235 |     "                \"sub_folders\": dirs,\n",
236 |     "                \"file_count\": len(files)\n",
237 |     "            })\n",
238 |     "\n",
239 |     "            # 收集文件地址\n",
240 |     "            file_list.extend([os.path.join(root, f) for f in files])\n",
241 |     "\n",
242 |     "        # 4. 随机采样末端文件\n",
243 |     "        sampled_files = random.sample(file_list, min(sample_file_count, len(file_list)))\n",
244 |     "        sampled_a_info[\"sampled_files\"] = sampled_files\n",
245 |     "\n",
246 |     "        # 添加到结果中\n",
247 |     "        result[\"sampled_a_folders\"].append(sampled_a_info)\n",
248 |     "\n",
249 |     "    # 将结果格式化为 JSON 并打印\n",
250 |     "    formatted_result = json.dumps(result, indent=4, ensure_ascii=False)\n",
251 |     "    print(formatted_result)\n",
252 |     "\n",
253 |     "    # 可选择将结果保存到文件\n",
254 |     "    output_file = \"directory_analysis.json\"\n",
255 |     "    with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
256 |     "        f.write(formatted_result)\n",
257 |     "    print(f\"\\n结果已保存到: {output_file}\")\n",
258 |     "\n",
259 |     "# 设置根目录\n",
260 |     "root_directory = \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data\"\n",
261 |     "analyze_directory(root_directory, sample_file_count=10, sample_folder_count=4)\n"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 24,
267 |    "metadata": {},
268 |    "outputs": [
269 |     {
270 |      "name": "stdout",
271 |      "output_type": "stream",
272 |      "text": [
273 |       "模型输出的原始内容：\n",
274 |       "```python\n",
275 |       "import os\n",
276 |       "import csv\n",
277 |       "\n",
278 |       "def generate_metadata(root_directory):\n",
279 |       "    metadata = []\n",
280 |       "    sample_id = 0\n",
281 |       "\n",
282 |       "    for a_folder in os.listdir(root_directory):\n",
283 |       "        a_folder_path = os.path.join(root_directory, a_folder)\n",
284 |       "        if os.path.isdir(a_folder_path):\n",
285 |       "            flair_path = t1_path = t1ce_path = t2_path = seg_path = \"\"\n",
286 |       "\n",
287 |       "            for sub_folder in os.listdir(a_folder_path):\n",
288 |       "                sub_folder_path = os.path.join(a_folder_path, sub_folder)\n",
289 |       "                if os.path.isdir(sub_folder_path):\n",
290 |       "                    if sub_folder.endswith(\"_flair\"):\n",
291 |       "                        flair_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n",
292 |       "                    elif sub_folder.endswith(\"_t1\"):\n",
293 |       "                        t1_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n",
294 |       "                    elif sub_folder.endswith(\"_t1ce\"):\n",
295 |       "                        t1ce_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n",
296 |       "                    elif sub_folder.endswith(\"_t2\"):\n",
297 |       "                        t2_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n",
298 |       "                    elif sub_folder.endswith(\"_seg\"):\n",
299 |       "                        seg_path = os.path.join(sub_folder_path, os.listdir(sub_folder_path)[0])\n",
300 |       "\n",
301 |       "            metadata.append({\n",
302 |       "                \"sample_id\": sample_id,\n",
303 |       "                \"flair_path\": flair_path,\n",
304 |       "                \"t1_path\": t1_path,\n",
305 |       "                \"t1ce_path\": t1ce_path,\n",
306 |       "                \"t2_path\": t2_path,\n",
307 |       "                \"seg_path\": seg_path\n",
308 |       "            })\n",
309 |       "            sample_id += 1\n",
310 |       "\n",
311 |       "    # Save metadata to csv\n",
312 |       "    csv_path = os.path.join(root_directory, \"metadata.csv\")\n",
313 |       "    with open(csv_path, mode='w', newline='') as file:\n",
314 |       "        writer = csv.DictWriter(file, fieldnames=[\"sample_id\", \"flair_path\", \"t1_path\", \"t1ce_path\", \"t2_path\", \"seg_path\"])\n",
315 |       "        writer.writeheader()\n",
316 |       "        for row in metadata:\n",
317 |       "            writer.writerow(row)\n",
318 |       "\n",
319 |       "# Example usage\n",
320 |       "root_directory = \"/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data\"\n",
321 |       "generate_metadata(root_directory)\n",
322 |       "```\n",
323 |       "\n",
324 |       "### 代码说明：\n",
325 |       "1. **文件命名规律分析**：\n",
326 |       "   - 文件夹命名中包含 `_flair`, `_t1`, `_t1ce`, `_t2` 的为多模态文件。\n",
327 |       "   - 文件夹命名中包含 `_seg` 的为掩码文件。\n",
328 |       "\n",
329 |       "2. **生成 metadata.csv**：\n",
330 |       "   - `sample_id`：使用数字序号。\n",
331 |       "   - `flair_path`, `t1_path`, `t1ce_path`, `t2_path`：分别对应不同模态的文件路径。\n",
332 |       "   - `seg_path`：掩码文件路径。\n",
333 |       "   - 若某模态或掩码文件不存在，则对应路径为空。\n",
334 |       "\n",
335 |       "3. **代码执行**：\n",
336 |       "   - 代码会遍历根目录下的所有 A 级文件夹，并提取各模态和掩码文件的路径，最终生成 `metadata.csv` 文件并保存在根目录下。\n",
337 |       "LLM 的响应内容已保存到 result.json 文件中。\n",
338 |       "生成的 Python 代码已保存到 generate_metadata.py 文件中。\n"
339 |      ]
340 |     }
341 |    ],
342 |    "source": [
343 |     "import requests\n",
344 |     "import csv\n",
345 |     "import re\n",
346 |     "import json\n",
347 |     "import os\n",
348 |     "\n",
349 |     "\n",
350 |     "\n",
351 |     "def generate_metadata(root_directory, your_api_key=None):\n",
352 |     "    # DeepSeek API 的 URL 和 API 密钥\n",
353 |     "    DEEPSEEK_API_URL = \"https://api.deepseek.com/v1/chat/completions\"\n",
354 |     "\n",
355 |     "    # 从环境变量中读取 API 密钥\n",
356 |     "    # 我推荐你到DeepSeek官网注册一个账号，然后在个人中心获取API_KEY，他们会给你你一辈子都用不完的额度\n",
357 |     "    # 获取之后填写到API = \"你的API\"中\n",
358 |     "    \n",
359 |     "    if your_api_key is None:\n",
360 |     "        if os.path.exists(\"config.py\"):\n",
361 |     "            from config import API_KEY\n",
362 |     "            API_KEY = API_KEY\n",
363 |     "\n",
364 |     "    # 读取 JSON 文件\n",
365 |     "    with open(\"directory_analysis.json\", \"r\") as f:\n",
366 |     "        json_input = f.read()\n",
367 |     "\n",
368 |     "    # 构建请求数据\n",
369 |     "    data = {\n",
370 |     "        \"model\": \"deepseek-chat\",\n",
371 |     "        \"messages\": [\n",
372 |     "            {\n",
373 |     "                \"role\": \"system\",\n",
374 |     "                \"content\": (\n",
375 |     "                    \"你是一名熟练的数据科学家，善于解析复杂的文件目录并生成元数据表格。\"\n",
376 |     "                    \"你的任务是帮助用户分析医学影像数据集，并根据采样的文件结构生成metadata.csv。\"\n",
377 |     "                    \"你只需要输出带有恰当注释的python代码即可，多余的信息不输出。\"\n",
378 |     "                )\n",
379 |     "            },\n",
380 |     "            {\n",
381 |     "                \"role\": \"user\",\n",
382 |     "                \"content\": (\n",
383 |     "                    f\"我正在浏览一个医学影像数据集，它的根目录为：{json.loads(json_input)['root_directory']}。\\n\"\n",
384 |     "                    \"这个数据集包含若干影像文件（可能包括多模态文件、单模态文件和掩码文件）。\\n\"\n",
385 |     "                    \"我采样了一些子文件夹（记为 A 级文件夹）以及其中的 B/C 级文件夹，目录树和采样文件的信息如下：\\n\"\n",
386 |     "                    f\"{json_input}\\n\"\n",
387 |     "                    \"我需要你：\\n\"\n",
388 |     "                    \"1. 分析文件命名的规律，判断是否存在多模态文件或掩码文件。\\n\"\n",
389 |     "                    \"2. 根据这些规律生成构建 metadata.csv 的 Python 代码。\\n\"\n",
390 |     "                    \"3. 输出的代码应该以根目录为输入，生成的 csv 应保存在根目录下，csv 的列包括 sample_id（若没有明显 id，则直接用数字序号）、各模态的文件地址（如 flair_path, t1_path 等，若不存在则为空，若没有明显的多模态特征那么记为image_path）、以及掩码地址（若不存在则为空）。\"\n",
391 |     "                )\n",
392 |     "            }\n",
393 |     "        ],\n",
394 |     "        \"stream\": False\n",
395 |     "    }\n",
396 |     "\n",
397 |     "    # 发送请求\n",
398 |     "    headers = {\n",
399 |     "        \"Authorization\": f\"Bearer {API_KEY}\",\n",
400 |     "        \"Content-Type\": \"application/json\"\n",
401 |     "    }\n",
402 |     "    response = requests.post(DEEPSEEK_API_URL, headers=headers, json=data)\n",
403 |     "\n",
404 |     "    # 检查响应状态码\n",
405 |     "    if response.status_code == 200:\n",
406 |     "        result = response.json()\n",
407 |     "        try:\n",
408 |     "            # 打印模型输出的原始内容\n",
409 |     "            print(\"模型输出的原始内容：\")\n",
410 |     "            model_output = result[\"choices\"][0][\"message\"][\"content\"]\n",
411 |     "            print(model_output)\n",
412 |     "\n",
413 |     "            # 保存 LLM 输出到 result.json\n",
414 |     "            with open(\"result.json\", \"w\") as f:\n",
415 |     "                f.write(model_output)\n",
416 |     "            print(\"LLM 的响应内容已保存到 result.json 文件中。\")\n",
417 |     "\n",
418 |     "            # 尝试从 LLM 的输出中提取生成的代码\n",
419 |     "            code_match = re.search(r\"```python(.*?)```\", model_output, re.DOTALL)\n",
420 |     "            if code_match:\n",
421 |     "                extracted_code = code_match.group(1).strip()\n",
422 |     "                with open(\"generate_metadata.py\", \"w\") as f:\n",
423 |     "                    f.write(extracted_code)\n",
424 |     "                print(\"生成的 Python 代码已保存到 generate_metadata.py 文件中。\")\n",
425 |     "            else:\n",
426 |     "                print(\"未检测到有效的 Python 代码块，请手动检查 LLM 输出。\")\n",
427 |     "        except Exception as e:\n",
428 |     "            print(f\"解析响应内容时发生错误：{e}\")\n",
429 |     "    else:\n",
430 |     "        print(f\"请求失败，状态码：{response.status_code}\")\n",
431 |     "        print(response.text)\n",
432 |     "\n",
433 |     "\n",
434 |     "generate_metadata(root_directory, your_api_key=None)"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": 25,
440 |    "metadata": {},
441 |    "outputs": [
442 |     {
443 |      "name": "stdout",
444 |      "output_type": "stream",
445 |      "text": [
446 |       "正在执行 generate_metadata.py...\n",
447 |       "generate_metadata.py 执行成功！\n",
448 |       "metadata.csv 文件已生成，路径为：/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/metadata.csv\n",
449 |       "metadata.csv 的前 5 行内容：\n",
450 |       "['sample_id', 'flair_path', 't1_path', 't1ce_path', 't2_path', 'seg_path']\n",
451 |       "['0', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii']\n",
452 |       "['1', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_flair/00000014_brain_flair.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1/00000014_brain_t1.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t1ce/00000014_brain_t1ce.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_t2/00000014_brain_t2.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00002/BraTS2021_00002_seg/BraTS2021_00002_seg_new.nii']\n",
453 |       "['2', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_flair/00000017_brain_flair.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1/00000017_brain_t1.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t1ce/00000017_brain_t1ce.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_t2/00000017_brain_t2.nii', '/teamspace/studios/this_studio/PreProcPipe/BraTS2021_Training_Data/BraTS2021_00003/BraTS2021_00003_seg/BraTS2021_00003_seg_new.nii']\n"
454 |      ]
455 |     }
456 |    ],
457 |    "source": [
458 |     "import os\n",
459 |     "import csv\n",
460 |     "import subprocess\n",
461 |     "\n",
462 |     "def execute_metadata_script(root_directory):\n",
463 |     "    metadata_file = os.path.join(root_directory, \"metadata.csv\")\n",
464 |     "    script_file = \"generate_metadata.py\"\n",
465 |     "\n",
466 |     "    # 检查 generate_metadata.py 是否存在\n",
467 |     "    if not os.path.exists(script_file):\n",
468 |     "        print(f\"脚本 {script_file} 不存在，请确保文件已正确生成。\")\n",
469 |     "    else:\n",
470 |     "        # 执行 generate_metadata.py 脚本\n",
471 |     "        print(f\"正在执行 {script_file}...\")\n",
472 |     "        result = subprocess.run([\"python\", script_file], capture_output=True, text=True)\n",
473 |     "\n",
474 |     "        # 检查执行结果\n",
475 |     "        if result.returncode == 0:\n",
476 |     "            print(f\"{script_file} 执行成功！\")\n",
477 |     "        else:\n",
478 |     "            print(f\"{script_file} 执行失败！\")\n",
479 |     "            print(f\"错误输出：\\n{result.stderr}\")\n",
480 |     "\n",
481 |     "        # 检查 metadata.csv 是否存在\n",
482 |     "        if os.path.exists(metadata_file):\n",
483 |     "            print(f\"metadata.csv 文件已生成，路径为：{metadata_file}\")\n",
484 |     "\n",
485 |     "            # 打印 metadata.csv 的前 5 行\n",
486 |     "            try:\n",
487 |     "                with open(metadata_file, \"r\") as f:\n",
488 |     "                    reader = csv.reader(f)\n",
489 |     "                    print(\"metadata.csv 的前 5 行内容：\")\n",
490 |     "                    for i, row in enumerate(reader):\n",
491 |     "                        print(row)\n",
492 |     "                        if i == 4:  # 打印前 5 行\n",
493 |     "                            break\n",
494 |     "            except Exception as e:\n",
495 |     "                print(f\"读取 metadata.csv 时发生错误：{e}\")\n",
496 |     "        else:\n",
497 |     "            print(\"metadata.csv 文件未生成，请检查脚本逻辑和根目录路径。\")\n",
498 |     "\n",
499 |     "execute_metadata_script(root_directory)"
500 |    ]
501 |   }
502 |  ],
503 |  "metadata": {
504 |   "kernelspec": {
505 |    "display_name": "Python 3",
506 |    "language": "python",
507 |    "name": "python3"
508 |   },
509 |   "language_info": {
510 |    "codemirror_mode": {
511 |     "name": "ipython",
512 |     "version": 3
513 |    },
514 |    "file_extension": ".py",
515 |    "mimetype": "text/x-python",
516 |    "name": "python",
517 |    "nbconvert_exporter": "python",
518 |    "pygments_lexer": "ipython3",
519 |    "version": "3.10.11"
520 |   }
521 |  },
522 |  "nbformat": 4,
523 |  "nbformat_minor": 2
524 | }
525 | 


--------------------------------------------------------------------------------
/LLM_metadata.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import json
  4 | import requests
  5 | import csv
  6 | import re
  7 | import json
  8 | import os
  9 | import os
 10 | import csv
 11 | import subprocess
 12 | 
 13 | def analyze_directory(root_directory, sample_folder_count=1, sample_file_count=5):
 14 |     # 用于存储最终的结果
 15 |     result = {
 16 |         "root_directory": root_directory,
 17 |         "a_level_summary": {
 18 |             "total_a_folders": 0,
 19 |             "example_a_folders": [],
 20 |         },
 21 |         "sampled_a_folders": []
 22 |     }
 23 | 
 24 |     # 1. 计算根目录下的 A 级文件夹数量并打印
 25 |     a_level_folders = [f for f in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory, f))]
 26 |     result["a_level_summary"]["total_a_folders"] = len(a_level_folders)
 27 |     result["a_level_summary"]["example_a_folders"] = a_level_folders[:5]
 28 | 
 29 |     # 2. 随机采样 A 级文件夹
 30 |     sampled_a_folders = random.sample(a_level_folders, min(sample_folder_count, len(a_level_folders)))
 31 | 
 32 |     for sampled_a_folder in sampled_a_folders:
 33 |         sampled_a_path = os.path.join(root_directory, sampled_a_folder)
 34 |         sampled_a_info = {
 35 |             "a_folder_name": sampled_a_folder,
 36 |             "directory_tree": [],
 37 |             "sampled_files": []
 38 |         }
 39 | 
 40 |         # 3. 穷尽 A 级文件夹下的目录树
 41 |         file_list = []
 42 |         for root, dirs, files in os.walk(sampled_a_path):
 43 |             # 获取当前路径的相对路径和层级
 44 |             relative_root = os.path.relpath(root, sampled_a_path)
 45 |             folder_level = len(relative_root.split(os.sep)) if relative_root != "." else 0
 46 | 
 47 |             # 保存目录树信息
 48 |             sampled_a_info["directory_tree"].append({
 49 |                 "level": folder_level,
 50 |                 "folder_name": os.path.basename(root),
 51 |                 "sub_folders": dirs,
 52 |                 "file_count": len(files)
 53 |             })
 54 | 
 55 |             # 收集文件地址
 56 |             file_list.extend([os.path.join(root, f) for f in files])
 57 | 
 58 |         # 4. 随机采样末端文件
 59 |         sampled_files = random.sample(file_list, min(sample_file_count, len(file_list)))
 60 |         sampled_a_info["sampled_files"] = sampled_files
 61 | 
 62 |         # 添加到结果中
 63 |         result["sampled_a_folders"].append(sampled_a_info)
 64 | 
 65 |     # 将结果格式化为 JSON 并打印
 66 |     formatted_result = json.dumps(result, indent=4, ensure_ascii=False)
 67 |     print(formatted_result)
 68 | 
 69 |     # 可选择将结果保存到文件
 70 |     output_file = os.path.join(root_directory, "directory_analysis.json")
 71 |     with open(output_file, "w", encoding="utf-8") as f:
 72 |         f.write(formatted_result)
 73 |     print(f"\n结果已保存到: {output_file}")
 74 | 
 75 | 
 76 | 
 77 | def generate_metadata(root_directory, your_api_key=None):
 78 |     # DeepSeek API 的 URL 和 API 密钥
 79 |     DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
 80 |     llm_api = None
 81 |     if your_api_key is None:
 82 |         if os.path.exists(r"D:\REPO\PreProcPipe\config.py"):
 83 |             from config import API_KEY
 84 |             llm_api = API_KEY
 85 |             print("API 密钥已从 config.py 中读取。")
 86 |     else:
 87 |         llm_api = your_api_key
 88 | 
 89 |     # 读取 JSON 文件
 90 |     with open(os.path.join(root_directory, "directory_analysis.json"), "r") as f:
 91 |         json_input = f.read()
 92 |     print("正在指使LLM生成代码...")
 93 |     # 构建请求数据
 94 |     data = {
 95 |         "model": "deepseek-chat",
 96 |         "messages": [
 97 |             {
 98 |                 "role": "system",
 99 |                 "content": (
100 |                     "你是一名熟练的数据科学家，善于解析复杂的文件目录并生成元数据表格。"
101 |                     "你的任务是帮助用户分析医学影像数据集，并根据采样的文件结构生成metadata.csv。"
102 |                     "你只需要输出带有恰当注释的python代码即可，多余的信息不输出。"
103 |                 )
104 |             },
105 |             {
106 |                 "role": "user",
107 |                 "content": (
108 |                     f"我正在浏览一个医学影像数据集，它的根目录为：{json.loads(json_input)['root_directory']}。\n"
109 |                     "这个数据集包含若干影像文件（可能包括多模态文件、单模态文件和掩码文件）。\n"
110 |                     "我采样了一些子文件夹（记为 A 级文件夹）以及其中的 B/C 级文件夹，目录树和采样文件的信息如下：\n"
111 |                     f"{json_input}\n"
112 |                     "我需要你：\n"
113 |                     "1. 分析文件命名的规律，判断是否存在多模态文件或掩码文件，分析出他们之间配对的关系，比如命名可能有相同的地方，或者用后缀区分了图像与掩码。\n"
114 |                     "2. 根据这些规律生成构建 metadata.csv 的 Python 代码。\n"
115 |                     "3. 输出的代码应该以根目录为输入，生成的 csv 应保存在根目录下，csv 的列包括 sample_id（若没有明显 id，则直接用数字序号）、各模态的文件地址（如 flair_path, t1_path 等，若没有明显的多模态特征那么记为image_path）、以及掩码地址（若不存在则为空）。\n"
116 |                 )
117 |             }
118 |         ],
119 |         "stream": False
120 |     }
121 | 
122 |     # 发送请求
123 |     headers = {
124 |         "Authorization": f"Bearer {llm_api}",
125 |         "Content-Type": "application/json"
126 |     }
127 |     response = requests.post(DEEPSEEK_API_URL, headers=headers, json=data)
128 | 
129 |     # 检查响应状态码
130 |     if response.status_code == 200:
131 |         result = response.json()
132 |         try:
133 |             # 打印模型输出的原始内容
134 |             print("模型输出的原始内容：")
135 |             model_output = result["choices"][0]["message"]["content"]
136 |             print(model_output)
137 | 
138 |             # 你可以保存 LLM 输出到 result.json
139 |             # with open("result.json", "w") as f:
140 |             #     f.write(model_output)
141 |             # print("LLM 的响应内容已保存到 result.json 文件中。")
142 | 
143 |             # 尝试从 LLM 的输出中提取生成的代码
144 |             code_match = re.search(r"```python(.*?)```", model_output, re.DOTALL)
145 |             if code_match:
146 |                 extracted_code = code_match.group(1).strip()
147 |                 with open(os.path.join(root_directory, "generate_metadata.py"), "w") as f:
148 |                     f.write(extracted_code)
149 |                 print("生成的 Python 代码已保存到 generate_metadata.py 文件中。")
150 |             else:
151 |                 print("未检测到有效的 Python 代码块，请手动检查 LLM 输出。")
152 |         except Exception as e:
153 |             print(f"解析响应内容时发生错误：{e}")
154 |     else:
155 |         print(f"请求失败，状态码：{response.status_code}")
156 |         print(response.text)
157 | 
158 | 
159 | 
160 | def execute_metadata_script(root_directory):
161 |     metadata_file = os.path.join(root_directory, "metadata.csv")
162 |     script_file = os.path.join(root_directory, "generate_metadata.py")
163 | 
164 |     # 检查 generate_metadata.py 是否存在
165 |     if not os.path.exists(script_file):
166 |         print(f"脚本 {script_file} 不存在，请确保文件已正确生成。")
167 |     else:
168 |         # 执行 generate_metadata.py 脚本
169 |         print(f"正在执行 {script_file}...")
170 |         result = subprocess.run(["python", script_file], capture_output=True, text=True)
171 | 
172 |         # 检查执行结果
173 |         if result.returncode == 0:
174 |             print(f"{script_file} 执行成功！")
175 |         else:
176 |             print(f"{script_file} 执行失败！")
177 |             print(f"错误输出：\n{result.stderr}")
178 | 
179 |         # 检查 metadata.csv 是否存在
180 |         if os.path.exists(metadata_file):
181 |             print(f"metadata.csv 文件已生成，路径为：{metadata_file}")
182 | 
183 |             # 打印 metadata.csv 的前 5 行
184 |             try:
185 |                 with open(metadata_file, "r") as f:
186 |                     reader = csv.reader(f)
187 |                     print("metadata.csv 的前 5 行内容：")
188 |                     for i, row in enumerate(reader):
189 |                         print(row)
190 |                         if i == 4:  # 打印前 5 行
191 |                             break
192 |             except Exception as e:
193 |                 print(f"读取 metadata.csv 时发生错误：{e}")
194 |         else:
195 |             print("metadata.csv 文件未生成，请检查脚本逻辑和根目录路径。")
196 | 
197 | def metadata_sanity_check(root_directory):
198 |     metadata_file = os.path.join(root_directory, "metadata.csv")
199 |     
200 |     try:
201 |         with open(metadata_file, mode='r') as file:
202 |             reader = csv.DictReader(file)
203 |             for row in reader:
204 |                 for key, value in row.items():
205 |                     if '_path' in key:
206 |                         if not value:
207 |                             print(f"空值: {key} 在 sample_id {row['sample_id']} 中为空")
208 |                         else:
209 |                             full_path = os.path.join(root_directory, value)
210 |                             if not os.path.exists(full_path):
211 |                                 print(f"路径无效: {key} 在 sample_id {row['sample_id']} 中指向 {full_path}")
212 |                             else:
213 |                                 # print(f"路径有效: {key} 在 sample_id {row['sample_id']} 中指向 {full_path}")
214 |                                 pass
215 |     except Exception as e:
216 |         print(f"读取 metadata.csv 时发生错误: {e}")
217 |         print("看起来有错误，你可以手动查看 metadata.csv 是否正确。")
218 | 
219 |     
220 | 
221 | 
222 | if __name__ == "__main__":
223 |     # 1. 分析文件目录结构
224 |     root_directory = r"D:\REPO\PreProcPipe\BraTS2021_Training_Data" # 填写数据集根目录，一定要是绝对路径
225 | 
226 |     analyze_directory(root_directory=root_directory, sample_folder_count=5, sample_file_count=10) # 可以通过增加 sample_folder_count 和 sample_file_count 来提高成功率
227 | 
228 |     # 2. 生成 metadata.csv 的 Python 代码
229 |     generate_metadata(root_directory=root_directory, your_api_key=None) # 填写你的API_KEY
230 |     # 推荐你去DeepSeek官网注册一个账号，然后在个人中心获取API_KEY，他们会给你一辈子用不完的额度，输入格式为API = "sadasdasdwqeqwe2"
231 | 
232 |     # 3. 执行生成的代码并检查 metadata.csv
233 |     execute_metadata_script(root_directory=root_directory)
234 | 
235 |     # 4. 检查 metadata.csv 的正确性
236 |     metadata_sanity_check(root_directory=root_directory)
237 | 
238 | 
239 | 


--------------------------------------------------------------------------------
/__pycache__/config.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/__pycache__/config.cpython-310.pyc


--------------------------------------------------------------------------------
/__pycache__/pipeline.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/__pycache__/pipeline.cpython-310.pyc


--------------------------------------------------------------------------------
/after.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/after.png


--------------------------------------------------------------------------------
/before.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgy112112/PreProcPipe/57e6024ddcf1eb96a8e62ee75dc9e6a9f906d9cf/before.png


--------------------------------------------------------------------------------
/pipeline.py:
--------------------------------------------------------------------------------
  1 | import nibabel as nib
  2 | import numpy as np
  3 | from scipy.ndimage import zoom
  4 | from multiprocessing import Pool
  5 | import os
  6 | import csv
  7 | 
  8 | class SimplePreprocessor:
  9 |     def __init__(self, target_spacing=[1.0, 1.0, 1.0], normalization_scheme="z-score", target_size=None):
 10 |         """
 11 |         初始化预处理器。
 12 | 
 13 |         参数：
 14 |         - target_spacing: 目标体素大小（spacing），默认为 [1.0, 1.0, 1.0]。
 15 |         - normalization_scheme: 归一化方案，支持 "z-score" 或 "min-max"。
 16 |         - target_size: 目标尺寸，例如 [256, 256]，默认为 None（不调整尺寸）。
 17 |         """
 18 |         self.target_spacing = target_spacing
 19 |         self.normalization_scheme = normalization_scheme
 20 |         self.target_size = target_size  # 目标大小，例如 [256, 256]
 21 | 
 22 |     def read_images(self, image_paths):
 23 |         """
 24 |         读取多个模态的图像数据 (.nii) 文件，并返回一个列表，每个元素为单独的 NumPy 数组。
 25 |         """
 26 |         print("Step 1: Loading multi-modal image data...")
 27 |         img_list = []
 28 |         for path in image_paths:
 29 |             img = nib.load(path)
 30 |             img_data = img.get_fdata()
 31 |             img_list.append(img_data)
 32 |         # 假设所有模态具有相同的 spacing
 33 |         img_spacing = nib.load(image_paths[0]).header.get_zooms()
 34 |         print()
 35 |         return img_list, img_spacing
 36 | 
 37 |     def read_seg(self, seg_path):
 38 |         """
 39 |         读取分割数据 (.nii) 文件并转换为 NumPy 数组。
 40 |         """
 41 |         print("Step 1: Loading segmentation data...")
 42 |         seg = nib.load(seg_path)
 43 |         seg_data = seg.get_fdata()
 44 |         print()
 45 |         return seg_data
 46 | 
 47 |     def run_case(self, image_paths, seg_path=None):
 48 |         """
 49 |         能够处理多模态图像的预处理流程，但不将它们合并到同一个数组中。
 50 |         """
 51 |         # Step 1: 加载多模态图像数据
 52 |         data_list, spacing = self.read_images(image_paths)
 53 | 
 54 |         if seg_path:
 55 |             seg = self.read_seg(seg_path)
 56 |         else:
 57 |             seg = None
 58 | 
 59 |         # 打印原始数据形状
 60 |         for i, data in enumerate(data_list):
 61 |             print(f"Original image shape (modality {i}): {data.shape}")
 62 |         if seg is not None:
 63 |             print(f"Original segmentation shape: {seg.shape}")
 64 |         print()
 65 | 
 66 |         # Step 2: 根据所有模态数据的非零区域计算裁剪范围
 67 |         print("Step 2: Cropping to non-zero regions...")
 68 |         # 将所有模态的非零坐标合并计算公共裁剪区域
 69 |         data_list, seg, properties = self.crop(data_list, seg)
 70 |         properties['original_spacing'] = spacing
 71 | 
 72 |         # Step 3: 对每个模态独立归一化
 73 |         print("Step 3: Normalizing image data...")
 74 |         for i in range(len(data_list)):
 75 |             data_list[i] = self._normalize_single_modality(data_list[i])
 76 |         print()
 77 | 
 78 |         # Step 4: 重采样到目标分辨率
 79 |         print("Step 4: Resampling data to target spacing...")
 80 |         # 使用第一模态计算 new_shape（假设各模态 spacing 一致）
 81 |         new_shape = self.compute_new_shape(data_list[0].shape, spacing, self.target_spacing)
 82 |         data_list = [self.resample_data(d, new_shape, order=3) for d in data_list]
 83 |         if seg is not None:
 84 |             seg = self.resample_data(seg, new_shape, order=0)
 85 |         print()
 86 | 
 87 |         # Step 5: 调整到目标尺寸（如果指定）
 88 |         if self.target_size is not None:
 89 |             print("Step 5: Resizing data to target size...")
 90 |             data_list = [self.resize_to_target_size(d, self.target_size, order=3) for d in data_list]
 91 |             if seg is not None:
 92 |                 seg = self.resize_to_target_size(seg, self.target_size, order=0)
 93 |             print()
 94 | 
 95 |         print("Preprocessing completed.\n")
 96 |         return data_list, seg, spacing, properties
 97 |         
 98 | 
 99 |     def crop(self, data_list, seg):
100 |         """
101 |         裁剪图像和分割数据在 Z 轴方向的全零区域，返回裁剪后的数据列表和分割数据，以及裁剪属性。
102 |         
103 |         参数：
104 |         - data_list: 多模态图像数据列表，每个元素为 NumPy 数组。
105 |         - seg: 分割数据（NumPy 数组），可以为 None。
106 |         
107 |         返回：
108 |         - cropped_data_list: 裁剪后的多模态图像数据列表。
109 |         - cropped_seg: 裁剪后的分割数据（如果 seg 为 None，则返回 None）。
110 |         - properties: 裁剪过程的属性信息，包括裁剪前后的形状和裁剪边界。
111 |         """
112 |         print("Step 2: Cropping to non-zero regions along Z-axis...")
113 | 
114 |         # 获取所有模态在 Z 轴方向的非零范围
115 |         nonzero_slices = []
116 |         for data in data_list:
117 |             # 沿 Z 轴求和，如果某切片全为零，则和为零
118 |             z_nonzero = np.any(data != 0, axis=(0, 1))
119 |             nonzero_slices.append(np.argwhere(z_nonzero).flatten())
120 | 
121 |         if len(nonzero_slices) == 0:
122 |             # 全部为零，不裁剪
123 |             properties = {
124 |                 'shape_before_cropping': [d.shape for d in data_list],
125 |                 'shape_after_cropping': [d.shape for d in data_list],
126 |                 'z_bbox': None
127 |             }
128 |             return data_list, seg, properties
129 | 
130 |         # 计算公共 Z 轴范围
131 |         z_min = min(s.min() for s in nonzero_slices)
132 |         z_max = max(s.max() for s in nonzero_slices) + 1  # 加1表示包含该索引
133 | 
134 |         print(f"Z-axis cropping range: {z_min} to {z_max}")
135 | 
136 |         # 裁剪所有模态的 Z 轴范围
137 |         cropped_data_list = [d[:, :, z_min:z_max] for d in data_list]
138 | 
139 |         # 裁剪分割数据的 Z 轴范围
140 |         cropped_seg = None
141 |         if seg is not None:
142 |             cropped_seg = seg[:, :, z_min:z_max]
143 | 
144 |         # 记录裁剪属性
145 |         properties = {
146 |             'shape_before_cropping': [d.shape for d in data_list],
147 |             'shape_after_cropping': [d.shape for d in cropped_data_list],
148 |             'z_bbox': (z_min, z_max)
149 |         }
150 | 
151 |         print(f"Shapes before cropping: {[d.shape for d in data_list]}")
152 |         print(f"Shapes after cropping: {[d.shape for d in cropped_data_list]}")
153 |         if seg is not None:
154 |             print(f"Segmentation shape after cropping: {cropped_seg.shape}")
155 | 
156 |         return cropped_data_list, cropped_seg, properties
157 | 
158 | 
159 |     # def _normalize(self, data, seg=None):
160 |     #     """
161 |     #     归一化图像数据。
162 |     #     """
163 |     #     if self.normalization_scheme == "z-score":
164 |     #         mean_val = np.mean(data[data > 0])
165 |     #         std_val = np.std(data[data > 0])
166 |     #         data = (data - mean_val) / (std_val + 1e-8)
167 |     #     elif self.normalization_scheme == "min-max":
168 |     #         min_val = np.min(data[data > 0])
169 |     #         max_val = np.max(data[data > 0])
170 |     #         data = (data - min_val) / (max_val - min_val + 1e-8)
171 |     #     else:
172 |     #         raise ValueError(f"Unknown normalization scheme: {self.normalization_scheme}")
173 |     #     return data
174 | 
175 |     # 新增一个专门处理单个模态归一化的方法
176 |     def _normalize_single_modality(self, data):
177 |         """
178 |         对单个模态数据进行归一化。
179 |         """
180 |         mask = data > 0
181 |         if self.normalization_scheme == "z-score":
182 |             mean_val = np.mean(data[mask]) if np.any(mask) else 0.0
183 |             std_val = np.std(data[mask]) if np.any(mask) else 1.0
184 |             data = (data - mean_val) / (std_val + 1e-8)
185 |         elif self.normalization_scheme == "min-max":
186 |             min_val = np.min(data[mask]) if np.any(mask) else 0.0
187 |             max_val = np.max(data[mask]) if np.any(mask) else 1.0
188 |             data = (data - min_val) / (max_val - min_val + 1e-8)
189 |         else:
190 |             raise ValueError(f"Unknown normalization scheme: {self.normalization_scheme}")
191 |         return data
192 | 
193 |     def compute_new_shape(self, old_shape, old_spacing, new_spacing):
194 |         """
195 |         根据原始分辨率和目标分辨率计算新的形状。
196 |         """
197 |         resize_factor = [old_spacing[i] / new_spacing[i] for i in range(len(old_spacing))]
198 |         print(f"Computed resize factors: {resize_factor}")
199 |         new_shape = [int(np.round(old_shape[i] * resize_factor[i])) for i in range(len(old_shape))]
200 |         print(f"Computed new shape: {new_shape}")
201 |         return new_shape
202 | 
203 |     def resample_data(self, data, new_shape, order=3):
204 |         """
205 |         根据新的形状进行重采样。
206 |         """
207 |         print("Resampling data...")
208 |         zoom_factors = [new_shape[i] / data.shape[i] for i in range(len(data.shape))]
209 |         resampled_data = zoom(data, zoom_factors, order=order)
210 |         print(f"Data resampled to shape: {resampled_data.shape}")
211 |         return resampled_data
212 | 
213 |     def resize_to_target_size(self, data, target_size, order=3):
214 |         """
215 |         将图像或分割数据调整到目标尺寸。
216 |         """
217 |         print("Resizing data to target size...")
218 |         current_shape = data.shape
219 |         zoom_factors = [target_size[0] / current_shape[0],  # 调整第一个维度（Y 轴，高度）
220 |                         target_size[1] / current_shape[1],  # 调整第二个维度（X 轴，宽度）
221 |                         1.0]  # Z 轴（深度）保持不变
222 |         resized_data = zoom(data, zoom_factors, order=order)
223 |         print(f"Data resized to shape: {resized_data.shape}")
224 |         return resized_data
225 | 
226 | 
227 | def process_case(args):
228 |     """
229 |     多进程调用的函数，用于处理单个病例。
230 | 
231 |     参数：
232 |     - args: (sample_id, image_paths, seg_path, preprocessor, output_root)
233 |     """
234 |     sample_id, image_paths, seg_path, preprocessor, output_root = args
235 |     # 调用预处理器的 run_case 方法处理多模态图像
236 |     data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path)
237 | 
238 |     # 创建样本目录（在output_root下）
239 |     # 检查sample_id的类型
240 |     if isinstance(sample_id, int):
241 |         sample_id = str(sample_id)
242 |     sample_dir = os.path.join(output_root, sample_id)
243 |     os.makedirs(sample_dir, exist_ok=True)
244 | 
245 |     # 推断各模态名称（使用文件名去除扩展名作为模态名称）
246 |     modality_names = [os.path.splitext(os.path.basename(p))[0] for p in image_paths]
247 | 
248 |     # 保存各模态数据
249 |     modality_paths = []
250 |     for modality_name, modality_data in zip(modality_names, data_list):
251 |         save_path = os.path.join(sample_dir, f"{modality_name}.npz")
252 |         np.savez_compressed(save_path, data=modality_data)
253 |         modality_paths.append(save_path)
254 | 
255 |     seg_path_out = None
256 |     # 保存分割数据（如果有分割）
257 |     if seg is not None:
258 |         seg_save_path = os.path.join(sample_dir, "seg.npz")
259 |         np.savez_compressed(seg_save_path, data=seg)
260 |         seg_path_out = seg_save_path
261 | 
262 |     # 保存 spacing 和 properties 为 meta.npz
263 |     meta_save_path = os.path.join(sample_dir, "meta.npz")
264 |     np.savez_compressed(meta_save_path, spacing=spacing, properties=properties)
265 | 
266 |     # 返回处理结果及保存的文件路径信息，用于后续生成metadata.csv
267 |     return {
268 |         "sample_id": sample_id,
269 |         "modality_paths": modality_paths,
270 |         "seg_path": seg_path_out,
271 |         "meta_path": meta_save_path
272 |     }
273 | 
274 | 
275 | def run_in_parallel(preprocessor, cases, output_root, num_workers=4):
276 |     """
277 |     使用多进程并行处理多个病例，并在output_root下存放处理结果为npz文件，
278 |     同时在output_root下生成metadata.csv记录每个sample的npz地址。
279 | 
280 |     参数：
281 |     - preprocessor: SimplePreprocessor 实例。
282 |     - cases: 包含多个病例信息的列表，每个病例是一个字典，格式：
283 |         {
284 |             "sample_id": "某病例ID字符串",
285 |             "image_paths": [模态1路径, 模态2路径, ...],
286 |             "seg_path": 分割路径或 None
287 |         }
288 |     - output_root: 输出结果保存的根目录
289 |     - num_workers: 并行进程数，默认为 4。
290 | 
291 |     返回：
292 |     - results: 包含每个病例保存文件路径信息的列表
293 |     """
294 |     os.makedirs(output_root, exist_ok=True)
295 | 
296 |     args_list = [
297 |         (case["sample_id"], case["image_paths"], case["seg_path"], preprocessor, output_root) for case in cases
298 |     ]
299 | 
300 |     # 使用多进程池并行处理
301 |     with Pool(processes=num_workers) as pool:
302 |         results = pool.map(process_case, args_list)
303 | 
304 |     # 生成 metadata.csv
305 |     # 文件内容格式示例：
306 |     # sample_id,modality_paths,seg_path,meta_path
307 |     # BraTS2021_00000,"['output_root/BraTS2021_00000/t1.npz','output_root/BraTS2021_00000/t2.npz']","output_root/BraTS2021_00000/seg.npz","output_root/BraTS2021_00000/meta.npz"
308 | 
309 |     csv_path = os.path.join(output_root, "metadata.csv")
310 |     with open(csv_path, mode='w', newline='', encoding='utf-8') as csvfile:
311 |         writer = csv.writer(csvfile)
312 |         writer.writerow(["sample_id", "modality_paths", "seg_path", "meta_path"])
313 |         for res in results:
314 |             # 将绝对路径转换为相对于output_root的相对路径，便于移植
315 |             # 如果需要保留绝对路径，可注释掉此步骤
316 |             rel_modality_paths = [os.path.relpath(p, output_root) for p in res["modality_paths"]]
317 |             rel_seg_path = os.path.relpath(res["seg_path"], output_root) if res["seg_path"] is not None else None
318 |             rel_meta_path = os.path.relpath(res["meta_path"], output_root)
319 |             writer.writerow([
320 |                 res["sample_id"],
321 |                 str(rel_modality_paths),
322 |                 rel_seg_path,
323 |                 rel_meta_path
324 |             ])
325 | 
326 |     return results
327 | 
328 | 
329 | 


--------------------------------------------------------------------------------
/pipeline_example.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from pipeline import SimplePreprocessor as ppp
 3 | from pipeline import run_in_parallel
 4 | 
 5 | 
 6 | # 定义读取 metadata.csv 并生成 cases 列表的函数
 7 | def load_cases_from_metadata(csv_path):
 8 |     """
 9 |     从 metadata.csv 加载病例信息，并生成 (image_paths, seg_path) 的列表。
10 |     
11 |     参数：
12 |     - csv_path: metadata.csv 文件路径。
13 |     
14 |     返回：
15 |     - cases: 包含病例信息的列表，每个元素是一个字典，格式为：
16 |       {
17 |           "sample_id": 样本ID,
18 |           "image_paths": [模态1路径, 模态2路径, ...],
19 |           "seg_path": 分割路径或 None
20 |       }
21 |     """
22 |     df = pd.read_csv(csv_path)
23 |     cases = []
24 |     for _, row in df.iterrows():
25 |         # 提取模态路径
26 |         image_paths = [row['t1_path'], row['t1ce_path'], row['t2_path'], row['flair_path']]
27 |         # 过滤掉空值
28 |         image_paths = [path for path in image_paths if pd.notnull(path)]
29 |         # 提取分割路径
30 |         seg_path = row['seg_path'] if pd.notnull(row['seg_path']) else None
31 |         # 添加到 cases
32 |         cases.append({
33 |             "sample_id": row['sample_id'],
34 |             "image_paths": image_paths,
35 |             "seg_path": seg_path
36 |         })
37 |     return cases
38 | 
39 | 
40 | if __name__ == "__main__":
41 |     example_preprocessor = ppp(
42 |         target_spacing = [0.5, 0.5, 0.5],
43 |         target_size = [256, 256],
44 |         normalization_scheme = "min-max",
45 |     )
46 | 
47 |     cases = load_cases_from_metadata(r"D:\REPO\PreProcPipe\BraTS2021_Training_Data\metadata.csv")
48 |     results = run_in_parallel(example_preprocessor, cases, num_workers=8, output_root="preprocessed_data")
49 | 
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # PreProcPipe: 多模态医学影像预处理框架
  2 | 
  3 | 一个用于CT/MRI等多模态医学影像预处理的高效框架，支持自动化的数据预处理流程和元数据生成。
  4 | 
  5 | ## Choose Language / 选择语言
  6 | 
  7 | - [English](readme_en.md)
  8 | - [简体中文](readme.md)
  9 | 
 10 | ![预处理前](before.png)
 11 | ![预处理后](after.png)
 12 | 
 13 | ## 目录
 14 | 
 15 | - [主要功能](#主要功能)
 16 | - [技术架构](#技术架构)
 17 | - [代码实现](#代码实现)
 18 | - [使用指南](#使用指南)
 19 | - [配置与扩展](#配置与扩展)
 20 | - [示例](#示例)
 21 | 
 22 | ## 主要功能
 23 | 
 24 | ### 1. 预处理管道 (pipeline.py)
 25 | - 多模态医学影像数据处理
 26 | - Z轴方向智能裁剪
 27 | - 可配置的数据归一化
 28 | - 灵活的图像重采样
 29 | - 并行处理支持
 30 | - 自动保存处理结果
 31 | 
 32 | ### 2. 元数据自动生成 (LLM_metadata.py)
 33 | - 目录结构智能分析
 34 | - LLM驱动的元数据规则生成
 35 | - 自动验证和错误检测
 36 | - DeepSeek API集成
 37 | 
 38 | ## 技术架构
 39 | 
 40 | ### 预处理管道架构
 41 | 
 42 | 预处理管道采用模块化设计，按照以下步骤顺序处理数据：
 43 | 
 44 | 1. **数据输入** → **SimplePreprocessor**
 45 |    - 接收多模态医学影像数据
 46 |    - 支持.nii格式文件
 47 | 
 48 | 2. **数据加载**
 49 |    - 读取多模态图像数据
 50 |    - 读取分割数据（如果有）
 51 | 
 52 | 3. **Z轴裁剪**
 53 |    - 智能识别有效区域
 54 |    - 去除冗余空白区域
 55 | 
 56 | 4. **归一化**
 57 |    - 支持z-score标准化
 58 |    - 支持min-max归一化
 59 | 
 60 | 5. **重采样**
 61 |    - 调整体素间距
 62 |    - 保持图像质量
 63 | 
 64 | 6. **尺寸调整**
 65 |    - 统一输出尺寸
 66 |    - 可选的尺寸配置
 67 | 
 68 | 7. **输出处理后的数据**
 69 |    - 保存为标准格式
 70 |    - 生成处理元数据
 71 | 
 72 | ### 元数据生成系统架构
 73 | 
 74 | 元数据生成系统采用LLM驱动的智能分析流程：
 75 | 
 76 | 1. **数据集根目录** → **目录结构分析**
 77 |    - 扫描文件系统
 78 |    - 识别文件组织模式
 79 | 
 80 | 2. **随机采样**
 81 |    - 选取代表性样本
 82 |    - 分析文件命名规律
 83 | 
 84 | 3. **DeepSeek API调用**
 85 |    - 发送结构化请求
 86 |    - 接收AI分析结果
 87 | 
 88 | 4. **生成处理代码**
 89 |    - 自动生成Python脚本
 90 |    - 包含数据处理逻辑
 91 | 
 92 | 5. **执行与验证**
 93 |    - 运行生成的代码
 94 |    - 检查处理结果
 95 | 
 96 | 6. **生成metadata.csv**
 97 |    - 记录数据集信息
 98 |    - 建立索引关系
 99 | 
100 | ## 代码实现
101 | 
102 | ### 核心类：SimplePreprocessor
103 | 
104 | ```python
105 | class SimplePreprocessor:
106 |     def __init__(self, target_spacing=[1.0, 1.0, 1.0], 
107 |                  normalization_scheme="z-score", 
108 |                  target_size=None):
109 |         """
110 |         初始化预处理器
111 |         
112 |         参数：
113 |         - target_spacing: 目标体素大小，默认[1.0, 1.0, 1.0]
114 |         - normalization_scheme: 归一化方案("z-score"/"min-max")
115 |         - target_size: 目标尺寸，如[256, 256]
116 |         """
117 | ```
118 | 
119 | #### 主要方法：
120 | 
121 | 1. **数据加载**
122 | ```python
123 | def read_images(self, image_paths):
124 |     """加载多模态图像数据"""
125 | 
126 | def read_seg(self, seg_path):
127 |     """加载分割数据"""
128 | ```
129 | 
130 | 2. **数据预处理**
131 | ```python
132 | def crop(self, data_list, seg):
133 |     """Z轴智能裁剪"""
134 | 
135 | def _normalize_single_modality(self, data):
136 |     """单模态数据归一化"""
137 | 
138 | def compute_new_shape(self, old_shape, old_spacing, new_spacing):
139 |     """计算重采样目标形状"""
140 | 
141 | def resample_data(self, data, new_shape, order=3):
142 |     """数据重采样"""
143 | 
144 | def resize_to_target_size(self, data, target_size, order=3):
145 |     """调整数据尺寸"""
146 | ```
147 | 
148 | 3. **处理流程**
149 | ```python
150 | def run_case(self, image_paths, seg_path=None):
151 |     """执行完整的预处理流程"""
152 | ```
153 | 
154 | ### 元数据生成系统
155 | 
156 | ```python
157 | def analyze_directory(root_directory, sample_folder_count=5, sample_file_count=10):
158 |     """分析目录结构并采样"""
159 | 
160 | def generate_metadata(root_directory, your_api_key=None):
161 |     """使用LLM生成元数据处理代码"""
162 | 
163 | def execute_metadata_script(root_directory):
164 |     """执行并验证元数据生成"""
165 | ```
166 | 
167 | ## 使用指南
168 | 
169 | ### 1. 基础预处理流程
170 | 
171 | ```python
172 | from pipeline import SimplePreprocessor
173 | 
174 | # 初始化预处理器
175 | preprocessor = SimplePreprocessor(
176 |     target_spacing=[1.0, 1.0, 1.0],
177 |     normalization_scheme="z-score",
178 |     target_size=[256, 256]
179 | )
180 | 
181 | # 准备数据路径
182 | image_paths = [
183 |     "path/to/flair.nii",
184 |     "path/to/t1.nii",
185 |     "path/to/t1ce.nii",
186 |     "path/to/t2.nii"
187 | ]
188 | seg_path = "path/to/seg.nii"
189 | 
190 | # 执行预处理
191 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path)
192 | ```
193 | 
194 | ### 2. 批量处理
195 | 
196 | ```python
197 | from pipeline import run_in_parallel
198 | 
199 | # 准备多个样本
200 | cases = [
201 |     {
202 |         "sample_id": "case_001",
203 |         "image_paths": ["path/to/case1/flair.nii", ...],
204 |         "seg_path": "path/to/case1/seg.nii"
205 |     },
206 |     # 更多样本...
207 | ]
208 | 
209 | # 并行处理
210 | results = run_in_parallel(preprocessor, cases, "output_dir", num_workers=4)
211 | ```
212 | 
213 | ### 3. 元数据生成
214 | 
215 | ```python
216 | from LLM_metadata import analyze_directory, generate_metadata, execute_metadata_script
217 | 
218 | # 分析目录结构
219 | analyze_directory(root_directory="dataset_path", 
220 |                  sample_folder_count=5, 
221 |                  sample_file_count=10)
222 | 
223 | # 生成元数据代码
224 | generate_metadata(root_directory="dataset_path", 
225 |                  your_api_key="your-deepseek-api-key")
226 | 
227 | # 执行生成的代码
228 | execute_metadata_script(root_directory="dataset_path")
229 | ```
230 | 
231 | ## 配置与扩展
232 | 
233 | ### 1. 预处理配置
234 | 
235 | 可以通过修改SimplePreprocessor的初始化参数来自定义预处理行为：
236 | 
237 | - `target_spacing`: 调整目标体素大小
238 | - `normalization_scheme`: 选择归一化方案
239 | - `target_size`: 设置输出尺寸
240 | 
241 | ### 2. 扩展功能
242 | 
243 | #### 添加新的归一化方法：
244 | 
245 | ```python
246 | def _normalize_custom(self, data):
247 |     """
248 |     自定义归一化方法
249 |     """
250 |     # 实现你的归一化逻辑
251 |     return normalized_data
252 | 
253 | # 在SimplePreprocessor中添加
254 | if self.normalization_scheme == "custom":
255 |     data = self._normalize_custom(data)
256 | ```
257 | 
258 | #### 添加新的预处理步骤：
259 | 
260 | ```python
261 | def new_preprocessing_step(self, data):
262 |     """
263 |     新的预处理步骤
264 |     """
265 |     # 实现新的预处理逻辑
266 |     return processed_data
267 | 
268 | # 在run_case方法中添加
269 | data_list = [self.new_preprocessing_step(d) for d in data_list]
270 | ```
271 | 
272 | ## 示例
273 | 
274 | ### 1. 单模态CT图像预处理
275 | 
276 | ```python
277 | # 初始化预处理器
278 | preprocessor = SimplePreprocessor(
279 |     target_spacing=[1.0, 1.0, 1.0],
280 |     normalization_scheme="min-max"
281 | )
282 | 
283 | # 处理单个CT图像
284 | image_paths = ["path/to/ct.nii"]
285 | data_list, _, spacing, properties = preprocessor.run_case(image_paths)
286 | ```
287 | 
288 | ### 2. 多模态MRI数据处理
289 | 
290 | ```python
291 | # 初始化预处理器
292 | preprocessor = SimplePreprocessor(
293 |     target_spacing=[1.0, 1.0, 1.0],
294 |     normalization_scheme="z-score",
295 |     target_size=[256, 256]
296 | )
297 | 
298 | # 处理多模态MRI数据
299 | image_paths = [
300 |     "path/to/flair.nii",
301 |     "path/to/t1.nii",
302 |     "path/to/t1ce.nii",
303 |     "path/to/t2.nii"
304 | ]
305 | seg_path = "path/to/seg.nii"
306 | 
307 | # 执行预处理
308 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path)
309 | ```
310 | 
311 | ### 3. 自动生成数据集元数据
312 | 
313 | ```python
314 | # 配置参数
315 | root_dir = "path/to/dataset"
316 | api_key = "your-deepseek-api-key"
317 | 
318 | # 执行完整的元数据生成流程
319 | analyze_directory(root_dir, sample_folder_count=5)
320 | generate_metadata(root_dir, api_key)
321 | execute_metadata_script(root_dir)
322 | ```
323 | 
324 | ## Jupyter Notebook 教程
325 | 
326 | 项目提供两个详细的Jupyter Notebook教程：
327 | 
328 | ### 1. tutorial.ipynb
329 | 
330 | 这是项目的主要教程，包含：
331 | - 完整的预处理管道使用示例
332 | - 各个参数的详细说明
333 | - 常见使用场景的演示
334 | - 处理结果的可视化
335 | 
336 | 推荐新用户首先阅读此教程，它将帮助你快速上手预处理管道的使用。
337 | 
338 | ### 2. How_I_Use_LLM_to_DIY_metadata.ipynb
339 | 
340 | 这是关于如何使用LLM生成元数据的详细教程，包含：
341 | - LLM元数据生成的完整工作流程
342 | - DeepSeek API的配置和使用
343 | - 目录结构分析的实际案例
344 | - 常见问题的解决方案
345 | 
346 | 如果你需要使用元数据自动生成功能，建议详细阅读此教程。
347 | 
348 | ## 注意事项
349 | 
350 | 1. 确保输入数据格式正确（支持.nii格式）
351 | 2. 检查磁盘空间是否充足（预处理后的数据可能较大）
352 | 3. 监控内存使用（处理大型数据集时）
353 | 4. 合理设置并行处理的进程数
354 | 5. 备份原始数据
355 | 


--------------------------------------------------------------------------------
/readme_en.md:
--------------------------------------------------------------------------------
  1 | # PreProcPipe: Multimodal Medical Image Preprocessing Framework
  2 | 
  3 | An efficient framework for preprocessing CT/MRI and other multimodal medical images, supporting automated data preprocessing and metadata generation.
  4 | 
  5 | ![Before Processing](before.png)
  6 | ![After Processing](after.png)
  7 | 
  8 | ## Choose Language / 选择语言
  9 | 
 10 | - [English](readme_en.md)
 11 | - [简体中文](readme.md)
 12 | 
 13 | ## Table of Contents
 14 | 
 15 | - [Main Features](#main-features)
 16 | - [Technical Architecture](#technical-architecture)
 17 | - [Code Implementation](#code-implementation)
 18 | - [User Guide](#user-guide)
 19 | - [Configuration & Extension](#configuration--extension)
 20 | - [Examples](#examples)
 21 | 
 22 | ## Main Features
 23 | 
 24 | ### 1. Preprocessing Pipeline (pipeline.py)
 25 | - Multimodal medical image data processing
 26 | - Intelligent Z-axis cropping
 27 | - Configurable data normalization
 28 | - Flexible image resampling
 29 | - Parallel processing support
 30 | - Automatic result saving
 31 | 
 32 | ### 2. Metadata Auto-generation (LLM_metadata.py)
 33 | - Intelligent directory structure analysis
 34 | - LLM-driven metadata rule generation
 35 | - Automatic validation and error detection
 36 | - DeepSeek API integration
 37 | 
 38 | ## Technical Architecture
 39 | 
 40 | ### Preprocessing Pipeline Architecture
 41 | 
 42 | The preprocessing pipeline adopts a modular design, processing data in the following sequence:
 43 | 
 44 | 1. **Data Input** → **SimplePreprocessor**
 45 |    - Receives multimodal medical image data
 46 |    - Supports .nii format files
 47 | 
 48 | 2. **Data Loading**
 49 |    - Reads multimodal image data
 50 |    - Reads segmentation data (if available)
 51 | 
 52 | 3. **Z-axis Cropping**
 53 |    - Intelligently identifies effective regions
 54 |    - Removes redundant blank areas
 55 | 
 56 | 4. **Normalization**
 57 |    - Supports z-score standardization
 58 |    - Supports min-max normalization
 59 | 
 60 | 5. **Resampling**
 61 |    - Adjusts voxel spacing
 62 |    - Maintains image quality
 63 | 
 64 | 6. **Size Adjustment**
 65 |    - Unifies output dimensions
 66 |    - Optional size configuration
 67 | 
 68 | 7. **Output Processed Data**
 69 |    - Saves in standard format
 70 |    - Generates processing metadata
 71 | 
 72 | ### Metadata Generation System Architecture
 73 | 
 74 | The metadata generation system uses an LLM-driven intelligent analysis process:
 75 | 
 76 | 1. **Dataset Root Directory** → **Directory Structure Analysis**
 77 |    - Scans file system
 78 |    - Identifies file organization patterns
 79 | 
 80 | 2. **Random Sampling**
 81 |    - Selects representative samples
 82 |    - Analyzes file naming patterns
 83 | 
 84 | 3. **DeepSeek API Call**
 85 |    - Sends structured requests
 86 |    - Receives AI analysis results
 87 | 
 88 | 4. **Generate Processing Code**
 89 |    - Automatically generates Python scripts
 90 |    - Contains data processing logic
 91 | 
 92 | 5. **Execution and Validation**
 93 |    - Runs generated code
 94 |    - Checks processing results
 95 | 
 96 | 6. **Generate metadata.csv**
 97 |    - Records dataset information
 98 |    - Establishes index relationships
 99 | 
100 | ## Code Implementation
101 | 
102 | ### Core Class: SimplePreprocessor
103 | 
104 | ```python
105 | class SimplePreprocessor:
106 |     def __init__(self, target_spacing=[1.0, 1.0, 1.0], 
107 |                  normalization_scheme="z-score", 
108 |                  target_size=None):
109 |         """
110 |         Initialize preprocessor
111 |         
112 |         Parameters:
113 |         - target_spacing: Target voxel size, default [1.0, 1.0, 1.0]
114 |         - normalization_scheme: Normalization scheme ("z-score"/"min-max")
115 |         - target_size: Target size, e.g., [256, 256]
116 |         """
117 | ```
118 | 
119 | #### Main Methods:
120 | 
121 | 1. **Data Loading**
122 | ```python
123 | def read_images(self, image_paths):
124 |     """Load multimodal image data"""
125 | 
126 | def read_seg(self, seg_path):
127 |     """Load segmentation data"""
128 | ```
129 | 
130 | 2. **Data Preprocessing**
131 | ```python
132 | def crop(self, data_list, seg):
133 |     """Z-axis intelligent cropping"""
134 | 
135 | def _normalize_single_modality(self, data):
136 |     """Single modality data normalization"""
137 | 
138 | def compute_new_shape(self, old_shape, old_spacing, new_spacing):
139 |     """Calculate resampling target shape"""
140 | 
141 | def resample_data(self, data, new_shape, order=3):
142 |     """Data resampling"""
143 | 
144 | def resize_to_target_size(self, data, target_size, order=3):
145 |     """Adjust data size"""
146 | ```
147 | 
148 | 3. **Processing Flow**
149 | ```python
150 | def run_case(self, image_paths, seg_path=None):
151 |     """Execute complete preprocessing workflow"""
152 | ```
153 | 
154 | ### Metadata Generation System
155 | 
156 | ```python
157 | def analyze_directory(root_directory, sample_folder_count=5, sample_file_count=10):
158 |     """Analyze directory structure and sample"""
159 | 
160 | def generate_metadata(root_directory, your_api_key=None):
161 |     """Use LLM to generate metadata processing code"""
162 | 
163 | def execute_metadata_script(root_directory):
164 |     """Execute and validate metadata generation"""
165 | ```
166 | 
167 | ## User Guide
168 | 
169 | ### 1. Basic Preprocessing Flow
170 | 
171 | ```python
172 | from pipeline import SimplePreprocessor
173 | 
174 | # Initialize preprocessor
175 | preprocessor = SimplePreprocessor(
176 |     target_spacing=[1.0, 1.0, 1.0],
177 |     normalization_scheme="z-score",
178 |     target_size=[256, 256]
179 | )
180 | 
181 | # Prepare data paths
182 | image_paths = [
183 |     "path/to/flair.nii",
184 |     "path/to/t1.nii",
185 |     "path/to/t1ce.nii",
186 |     "path/to/t2.nii"
187 | ]
188 | seg_path = "path/to/seg.nii"
189 | 
190 | # Execute preprocessing
191 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path)
192 | ```
193 | 
194 | ### 2. Batch Processing
195 | 
196 | ```python
197 | from pipeline import run_in_parallel
198 | 
199 | # Prepare multiple samples
200 | cases = [
201 |     {
202 |         "sample_id": "case_001",
203 |         "image_paths": ["path/to/case1/flair.nii", ...],
204 |         "seg_path": "path/to/case1/seg.nii"
205 |     },
206 |     # More samples...
207 | ]
208 | 
209 | # Parallel processing
210 | results = run_in_parallel(preprocessor, cases, "output_dir", num_workers=4)
211 | ```
212 | 
213 | ### 3. Metadata Generation
214 | 
215 | ```python
216 | from LLM_metadata import analyze_directory, generate_metadata, execute_metadata_script
217 | 
218 | # Analyze directory structure
219 | analyze_directory(root_directory="dataset_path", 
220 |                  sample_folder_count=5, 
221 |                  sample_file_count=10)
222 | 
223 | # Generate metadata code
224 | generate_metadata(root_directory="dataset_path", 
225 |                  your_api_key="your-deepseek-api-key")
226 | 
227 | # Execute generated code
228 | execute_metadata_script(root_directory="dataset_path")
229 | ```
230 | 
231 | ## Configuration & Extension
232 | 
233 | ### 1. Preprocessing Configuration
234 | 
235 | Customize preprocessing behavior by modifying SimplePreprocessor initialization parameters:
236 | 
237 | - `target_spacing`: Adjust target voxel size
238 | - `normalization_scheme`: Choose normalization scheme
239 | - `target_size`: Set output dimensions
240 | 
241 | ### 2. Extension Features
242 | 
243 | #### Add New Normalization Method:
244 | 
245 | ```python
246 | def _normalize_custom(self, data):
247 |     """
248 |     Custom normalization method
249 |     """
250 |     # Implement your normalization logic
251 |     return normalized_data
252 | 
253 | # Add in SimplePreprocessor
254 | if self.normalization_scheme == "custom":
255 |     data = self._normalize_custom(data)
256 | ```
257 | 
258 | #### Add New Preprocessing Step:
259 | 
260 | ```python
261 | def new_preprocessing_step(self, data):
262 |     """
263 |     New preprocessing step
264 |     """
265 |     # Implement new preprocessing logic
266 |     return processed_data
267 | 
268 | # Add in run_case method
269 | data_list = [self.new_preprocessing_step(d) for d in data_list]
270 | ```
271 | 
272 | ## Examples
273 | 
274 | ### 1. Single Modality CT Image Preprocessing
275 | 
276 | ```python
277 | # Initialize preprocessor
278 | preprocessor = SimplePreprocessor(
279 |     target_spacing=[1.0, 1.0, 1.0],
280 |     normalization_scheme="min-max"
281 | )
282 | 
283 | # Process single CT image
284 | image_paths = ["path/to/ct.nii"]
285 | data_list, _, spacing, properties = preprocessor.run_case(image_paths)
286 | ```
287 | 
288 | ### 2. Multimodal MRI Data Processing
289 | 
290 | ```python
291 | # Initialize preprocessor
292 | preprocessor = SimplePreprocessor(
293 |     target_spacing=[1.0, 1.0, 1.0],
294 |     normalization_scheme="z-score",
295 |     target_size=[256, 256]
296 | )
297 | 
298 | # Process multimodal MRI data
299 | image_paths = [
300 |     "path/to/flair.nii",
301 |     "path/to/t1.nii",
302 |     "path/to/t1ce.nii",
303 |     "path/to/t2.nii"
304 | ]
305 | seg_path = "path/to/seg.nii"
306 | 
307 | # Execute preprocessing
308 | data_list, seg, spacing, properties = preprocessor.run_case(image_paths, seg_path)
309 | ```
310 | 
311 | ### 3. Auto-generate Dataset Metadata
312 | 
313 | ```python
314 | # Configure parameters
315 | root_dir = "path/to/dataset"
316 | api_key = "your-deepseek-api-key"
317 | 
318 | # Execute complete metadata generation workflow
319 | analyze_directory(root_dir, sample_folder_count=5)
320 | generate_metadata(root_dir, api_key)
321 | execute_metadata_script(root_dir)
322 | ```
323 | 
324 | ## Jupyter Notebook Tutorials
325 | 
326 | The project provides two detailed Jupyter Notebook tutorials:
327 | 
328 | ### 1. tutorial.ipynb
329 | 
330 | This is the main tutorial of the project, including:
331 | - Complete preprocessing pipeline usage examples
332 | - Detailed parameter explanations
333 | - Common use case demonstrations
334 | - Visualization of processing results
335 | 
336 | New users are recommended to read this tutorial first, as it will help you quickly get started with the preprocessing pipeline.
337 | 
338 | ### 2. How_I_Use_LLM_to_DIY_metadata.ipynb
339 | 
340 | This is a detailed tutorial on how to use LLM for metadata generation, including:
341 | - Complete workflow of LLM metadata generation
342 | - DeepSeek API configuration and usage
343 | - Real cases of directory structure analysis
344 | - Solutions to common issues
345 | 
346 | If you need to use the automatic metadata generation feature, it is recommended to read this tutorial thoroughly.
347 | 
348 | ## Important Notes
349 | 
350 | 1. Ensure correct input data format (supports .nii format)
351 | 2. Check sufficient disk space (processed data may be large)
352 | 3. Monitor memory usage (when processing large datasets)
353 | 4. Set appropriate parallel processing worker count
354 | 5. Backup original data
355 | 


--------------------------------------------------------------------------------
/tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/plain": [
 11 |        "'/teamspace/studios/this_studio/PreProcPipe'"
 12 |       ]
 13 |      },
 14 |      "execution_count": 1,
 15 |      "metadata": {},
 16 |      "output_type": "execute_result"
 17 |     }
 18 |    ],
 19 |    "source": [
 20 |     "%load_ext autoreload\n",
 21 |     "%autoreload 2\n",
 22 |     "\n",
 23 |     "import os\n",
 24 |     "if os.getcwd().split('/')[-1] != 'PreProcPipe':\n",
 25 |     "    os.chdir('/teamspace/studios/this_studio/PreProcPipe')\n",
 26 |     "os.getcwd()\n",
 27 |     "\n"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "在这个教程中我将以一个小数据集作为例子。"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "你可以看到我的repo中有[PreProcPipe/BraTS2021_Training_Data]()文件夹，里面有一些nii文件。"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "data": {
 51 |       "text/plain": [
 52 |        "['BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_flair/00000057_brain_flair.nii',\n",
 53 |        " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_seg/00000057_final_seg.nii',\n",
 54 |        " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1/00000057_brain_t1.nii',\n",
 55 |        " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t1ce/00000057_brain_t1ce.nii',\n",
 56 |        " 'BraTS2021_Training_Data/BraTS2021_00000/BraTS2021_00000_t2/00000057_brain_t2.nii']"
 57 |       ]
 58 |      },
 59 |      "execution_count": 2,
 60 |      "metadata": {},
 61 |      "output_type": "execute_result"
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "\n",
 66 |     "# load nii files\n",
 67 |     "img_paths = []\n",
 68 |     "root_dir = 'BraTS2021_Training_Data/BraTS2021_00000'\n",
 69 |     "for folder in os.listdir(root_dir):\n",
 70 |     "    folder_path = os.path.join(root_dir, folder)\n",
 71 |     "    if os.path.isdir(folder_path):  # 检查是否为目录\n",
 72 |     "        # 遍历子目录中的文件\n",
 73 |     "        for file in os.listdir(folder_path):\n",
 74 |     "            if file.endswith('.nii'):  # 检查是否为 .nii 文件\n",
 75 |     "                img_paths.append(os.path.join(folder_path, file))  # 追加完整路径\n",
 76 |     "\n",
 77 |     "img_paths\n"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 3,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stdout",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "(240, 240, 155)\n",
 90 |       "0.0 2934.0\n",
 91 |       "\n",
 92 |       "(240, 240, 155)\n",
 93 |       "0.0 4.0\n",
 94 |       "\n",
 95 |       "(240, 240, 155)\n",
 96 |       "0.0 2023.0\n",
 97 |       "\n",
 98 |       "(240, 240, 155)\n",
 99 |       "0.0 12343.0\n",
100 |       "\n",
101 |       "(240, 240, 155)\n",
102 |       "0.0 2421.0\n",
103 |       "\n"
104 |      ]
105 |     }
106 |    ],
107 |    "source": [
108 |     "import nibabel as nib\n",
109 |     "import numpy as np\n",
110 |     "\n",
111 |     "for img_path in img_paths:\n",
112 |     "    img = nib.load(img_path)\n",
113 |     "    img_data = img.get_fdata()\n",
114 |     "    print(img_data.shape)\n",
115 |     "    print(np.min(img_data), np.max(img_data))\n",
116 |     "    print()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "可视化检查一下"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 4,
129 |    "metadata": {},
130 |    "outputs": [
131 |     {
132 |      "data": {
133 |       "application/vnd.jupyter.widget-view+json": {
134 |        "model_id": "df2a2b204e684225915836ef52905a0f",
135 |        "version_major": 2,
136 |        "version_minor": 0
137 |       },
138 |       "text/plain": [
139 |        "interactive(children=(IntSlider(value=77, description='z', max=154), Output()), _dom_classes=('widget-interact…"
140 |       ]
141 |      },
142 |      "metadata": {},
143 |      "output_type": "display_data"
144 |     },
145 |     {
146 |      "data": {
147 |       "text/plain": [
148 |        "<function __main__.<lambda>(z)>"
149 |       ]
150 |      },
151 |      "execution_count": 4,
152 |      "metadata": {},
153 |      "output_type": "execute_result"
154 |     }
155 |    ],
156 |    "source": [
157 |     "import nibabel as nib\n",
158 |     "import numpy as np\n",
159 |     "import matplotlib.pyplot as plt\n",
160 |     "from ipywidgets import interact, IntSlider\n",
161 |     "\n",
162 |     "# 从 img_paths 加载所有图像数据\n",
163 |     "def load_images(img_paths):\n",
164 |     "    images = []\n",
165 |     "    for path in img_paths:\n",
166 |     "        img = nib.load(path).get_fdata()  # 加载数据\n",
167 |     "        images.append(img)\n",
168 |     "    return images\n",
169 |     "\n",
170 |     "# 显示某一 z 切片的函数\n",
171 |     "def show_slices(images, z):\n",
172 |     "    num_images = len(images)\n",
173 |     "    fig, axes = plt.subplots(1, num_images, figsize=(5 * num_images, 5))\n",
174 |     "    if num_images == 1:\n",
175 |     "        axes = [axes]\n",
176 |     "\n",
177 |     "    for i, img in enumerate(images):\n",
178 |     "        axes[i].imshow(img[:, :, z], cmap=\"gray\")\n",
179 |     "        axes[i].set_title(f\"Image {i+1} - Z: {z}\")\n",
180 |     "    plt.show()\n",
181 |     "\n",
182 |     "images = load_images(img_paths)\n",
183 |     "\n",
184 |     "# 使用 ipywidgets 创建滑块交互\n",
185 |     "z_max = images[0].shape[2] - 1\n",
186 |     "interact(lambda z: show_slices(images, z), z=IntSlider(min=0, max=z_max, step=1, value=z_max // 2))\n"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "markdown",
191 |    "metadata": {},
192 |    "source": [
193 |     "万事俱备，我们开始使用`PreProcPipe`吧！"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "markdown",
198 |    "metadata": {},
199 |    "source": [
200 |     "# PreProcPipe使用范例"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "为了将文件输入给PPP，我建议用一个像csv这样的元数据文件来记录文件路径，然后规范地输入。"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "很遗憾，这个制作csv的方法是因人而异的，因为每个数据集长得都不一样，不是吗😀"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 5,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "name": "stdout",
224 |      "output_type": "stream",
225 |      "text": [
226 |       "Metadata saved to BraTS2021_Training_Data/metadata.csv\n"
227 |      ]
228 |     }
229 |    ],
230 |    "source": [
231 |     "import os\n",
232 |     "import pandas as pd\n",
233 |     "\n",
234 |     "def collect_metadata(root_dir):\n",
235 |     "    \"\"\"\n",
236 |     "    Traverse the root_dir and collect the paths of multimodal images and segmentation for each sample.\n",
237 |     "    Return a list containing sample data, with each row corresponding to a sample and its file paths.\n",
238 |     "    \"\"\"\n",
239 |     "    metadata = []\n",
240 |     "    for sample_folder in os.listdir(root_dir):\n",
241 |     "        sample_path = os.path.join(root_dir, sample_folder)\n",
242 |     "        if os.path.isdir(sample_path):  # Ensure it is a directory\n",
243 |     "            # Initialize a dictionary to store paths\n",
244 |     "            sample_data = {\n",
245 |     "                \"sample_id\": sample_folder,\n",
246 |     "                \"t1\": None,\n",
247 |     "                \"t1ce\": None,\n",
248 |     "                \"t2\": None,\n",
249 |     "                \"flair\": None,\n",
250 |     "                \"seg\": None\n",
251 |     "            }\n",
252 |     "            for modality_folder in os.listdir(sample_path):\n",
253 |     "                modality_path = os.path.join(sample_path, modality_folder)\n",
254 |     "                if os.path.isdir(modality_path):  # Ensure it is a modality subdirectory\n",
255 |     "                    for file in os.listdir(modality_path):\n",
256 |     "                        if file.endswith('.nii'):  # Ensure it is a .nii file\n",
257 |     "                            # Classify based on modality\n",
258 |     "                            if \"t1.nii\" in file and \"ce\" not in file:\n",
259 |     "                                sample_data[\"t1\"] = os.path.join(modality_path, file)\n",
260 |     "                            elif \"t1ce\" in file:\n",
261 |     "                                sample_data[\"t1ce\"] = os.path.join(modality_path, file)\n",
262 |     "                            elif \"t2\" in file:\n",
263 |     "                                sample_data[\"t2\"] = os.path.join(modality_path, file)\n",
264 |     "                            elif \"flair\" in file:\n",
265 |     "                                sample_data[\"flair\"] = os.path.join(modality_path, file)\n",
266 |     "                            elif \"seg\" in file:\n",
267 |     "                                sample_data[\"seg\"] = os.path.join(modality_path, file)\n",
268 |     "            metadata.append(sample_data)\n",
269 |     "    return metadata\n",
270 |     "\n",
271 |     "# Root directory path\n",
272 |     "root_dir = \"BraTS2021_Training_Data\"\n",
273 |     "\n",
274 |     "# Collect metadata\n",
275 |     "metadata = collect_metadata(root_dir)\n",
276 |     "\n",
277 |     "# Convert to DataFrame\n",
278 |     "df = pd.DataFrame(metadata)\n",
279 |     "\n",
280 |     "# Save as CSV file\n",
281 |     "output_csv = f\"{root_dir}/metadata.csv\"\n",
282 |     "df.to_csv(output_csv, index=False)\n",
283 |     "\n",
284 |     "print(f\"Metadata saved to {output_csv}\")"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 7,
290 |    "metadata": {},
291 |    "outputs": [
292 |     {
293 |      "name": "stdout",
294 |      "output_type": "stream",
295 |      "text": [
296 |       "Step 1: Loading multi-modal image data...Step 1: Loading multi-modal image data...\n",
297 |       "Step 1: Loading multi-modal image data...\n",
298 |       "\n",
299 |       "\n",
300 |       "Step 1: Loading segmentation data...\n",
301 |       "\n",
302 |       "Original image shape (modality 0): (240, 240, 155)\n",
303 |       "Original image shape (modality 1): (240, 240, 155)\n",
304 |       "Original image shape (modality 2): (240, 240, 155)\n",
305 |       "Original image shape (modality 3): (240, 240, 155)\n",
306 |       "Original segmentation shape: (240, 240, 155)\n",
307 |       "\n",
308 |       "Step 2: Cropping to non-zero regions...\n",
309 |       "\n",
310 |       "Step 1: Loading segmentation data...\n",
311 |       "Step 2: Cropping to non-zero regions along Z-axis...\n",
312 |       "\n",
313 |       "Step 1: Loading segmentation data...\n",
314 |       "Original image shape (modality 0): (240, 240, 155)\n",
315 |       "\n",
316 |       "Original image shape (modality 1): (240, 240, 155)Z-axis cropping range: 0 to 140\n",
317 |       "\n",
318 |       "Original image shape (modality 2): (240, 240, 155)\n",
319 |       "\n",
320 |       "Original image shape (modality 3): (240, 240, 155)\n",
321 |       "Shapes before cropping: [(240, 240, 155), (240, 240, 155), (240, 240, 155), (240, 240, 155)]Original image shape (modality 0): (240, 240, 155)\n",
322 |       "Original segmentation shape: (240, 240, 155)\n",
323 |       "Shapes after cropping: [(240, 240, 140), (240, 240, 140), (240, 240, 140), (240, 240, 140)]Original image shape (modality 1): (240, 240, 155)\n",
324 |       "\n",
325 |       "\n",
326 |       "\n",
327 |       "Original image shape (modality 2): (240, 240, 155)Step 2: Cropping to non-zero regions...\n",
328 |       "Segmentation shape after cropping: (240, 240, 140)\n",
329 |       "Original image shape (modality 3): (240, 240, 155)\n",
330 |       "Step 2: Cropping to non-zero regions along Z-axis...\n",
331 |       "\n",
332 |       "Step 3: Normalizing image data...Original segmentation shape: (240, 240, 155)\n",
333 |       "\n",
334 |       "Step 2: Cropping to non-zero regions...\n",
335 |       "Step 2: Cropping to non-zero regions along Z-axis...\n",
336 |       "\n",
337 |       "Z-axis cropping range: 4 to 150\n",
338 |       "Shapes before cropping: [(240, 240, 155), (240, 240, 155), (240, 240, 155), (240, 240, 155)]\n",
339 |       "Shapes after cropping: [(240, 240, 146), (240, 240, 146), (240, 240, 146), (240, 240, 146)]Z-axis cropping range: 10 to 145\n",
340 |       "Segmentation shape after cropping: (240, 240, 146)\n",
341 |       "Shapes before cropping: [(240, 240, 155), (240, 240, 155), (240, 240, 155), (240, 240, 155)]\n",
342 |       "Shapes after cropping: [(240, 240, 135), (240, 240, 135), (240, 240, 135), (240, 240, 135)]\n",
343 |       "Segmentation shape after cropping: (240, 240, 135)\n",
344 |       "\n",
345 |       "Step 3: Normalizing image data...\n",
346 |       "Step 3: Normalizing image data..."
347 |      ]
348 |     },
349 |     {
350 |      "name": "stdout",
351 |      "output_type": "stream",
352 |      "text": [
353 |       "\n",
354 |       "\n",
355 |       "Step 4: Resampling data to target spacing...\n",
356 |       "Computed resize factors: [1.0, 1.0, 1.0]\n",
357 |       "Computed new shape: [240, 240, 140]\n",
358 |       "Resampling data...\n",
359 |       "\n",
360 |       "Step 4: Resampling data to target spacing...\n",
361 |       "Computed resize factors: [1.0, 1.0, 1.0]\n",
362 |       "Computed new shape: [240, 240, 135]\n",
363 |       "Resampling data...\n",
364 |       "\n",
365 |       "Step 4: Resampling data to target spacing...\n",
366 |       "Computed resize factors: [1.0, 1.0, 1.0]\n",
367 |       "Computed new shape: [240, 240, 146]\n",
368 |       "Resampling data...\n",
369 |       "Data resampled to shape: (240, 240, 140)\n",
370 |       "Resampling data...\n",
371 |       "Data resampled to shape: (240, 240, 135)\n",
372 |       "Resampling data...\n",
373 |       "Data resampled to shape: (240, 240, 146)\n",
374 |       "Resampling data...\n",
375 |       "Data resampled to shape: (240, 240, 140)\n",
376 |       "Resampling data...\n",
377 |       "Data resampled to shape: (240, 240, 135)\n",
378 |       "\n",
379 |       "Resampling data...Data resampled to shape: (240, 240, 146)\n",
380 |       "Resampling data...\n",
381 |       "Data resampled to shape: (240, 240, 140)\n",
382 |       "Resampling data...\n",
383 |       "Data resampled to shape: (240, 240, 135)\n",
384 |       "Resampling data...\n",
385 |       "Data resampled to shape: (240, 240, 146)\n",
386 |       "Resampling data...\n",
387 |       "Data resampled to shape: (240, 240, 140)\n",
388 |       "Resampling data...\n",
389 |       "Data resampled to shape: (240, 240, 140)\n",
390 |       "\n",
391 |       "Step 5: Resizing data to target size...\n",
392 |       "Resizing data to target size...\n",
393 |       "Data resampled to shape: (240, 240, 135)\n",
394 |       "Resampling data...\n",
395 |       "Data resampled to shape: (240, 240, 135)\n",
396 |       "\n",
397 |       "Step 5: Resizing data to target size...\n",
398 |       "Resizing data to target size...\n",
399 |       "Data resampled to shape: (240, 240, 146)\n",
400 |       "Resampling data...\n",
401 |       "Data resampled to shape: (240, 240, 146)\n",
402 |       "\n",
403 |       "Step 5: Resizing data to target size...\n",
404 |       "Resizing data to target size...\n",
405 |       "Data resized to shape: (256, 256, 140)\n",
406 |       "Resizing data to target size...\n",
407 |       "Data resized to shape: (256, 256, 135)\n",
408 |       "Resizing data to target size...\n",
409 |       "Data resized to shape: (256, 256, 146)\n",
410 |       "Resizing data to target size...\n",
411 |       "Data resized to shape: (256, 256, 140)\n",
412 |       "Resizing data to target size...\n",
413 |       "Data resized to shape: (256, 256, 135)\n",
414 |       "Resizing data to target size...\n",
415 |       "Data resized to shape: (256, 256, 146)\n",
416 |       "Resizing data to target size...\n",
417 |       "Data resized to shape: (256, 256, 140)\n",
418 |       "Resizing data to target size...\n",
419 |       "Data resized to shape: (256, 256, 135)\n",
420 |       "Resizing data to target size...\n",
421 |       "Data resized to shape: (256, 256, 146)\n",
422 |       "Resizing data to target size...\n",
423 |       "Data resized to shape: (256, 256, 140)\n",
424 |       "Resizing data to target size...\n",
425 |       "Data resized to shape: (256, 256, 135)\n",
426 |       "Data resized to shape: (256, 256, 140)\n",
427 |       "\n",
428 |       "Preprocessing completed.\n",
429 |       "\n",
430 |       "Resizing data to target size...\n",
431 |       "Data resized to shape: (256, 256, 135)\n",
432 |       "\n",
433 |       "Preprocessing completed.\n",
434 |       "\n",
435 |       "Data resized to shape: (256, 256, 146)\n",
436 |       "Resizing data to target size...\n",
437 |       "Data resized to shape: (256, 256, 146)\n",
438 |       "\n",
439 |       "Preprocessing completed.\n",
440 |       "\n"
441 |      ]
442 |     }
443 |    ],
444 |    "source": [
445 |     "import pandas as pd\n",
446 |     "from pipeline import SimplePreprocessor as ppp\n",
447 |     "from pipeline import run_in_parallel\n",
448 |     "\n",
449 |     "# 定义读取 metadata.csv 并生成 cases 列表的函数\n",
450 |     "def load_cases_from_metadata(csv_path):\n",
451 |     "    \"\"\"\n",
452 |     "    从 metadata.csv 加载病例信息，并生成 (image_paths, seg_path) 的列表。\n",
453 |     "    \n",
454 |     "    参数：\n",
455 |     "    - csv_path: metadata.csv 文件路径。\n",
456 |     "    \n",
457 |     "    返回：\n",
458 |     "    - cases: 包含病例信息的列表，每个元素是一个字典，格式为：\n",
459 |     "      {\n",
460 |     "          \"sample_id\": 样本ID,\n",
461 |     "          \"image_paths\": [模态1路径, 模态2路径, ...],\n",
462 |     "          \"seg_path\": 分割路径或 None\n",
463 |     "      }\n",
464 |     "    \"\"\"\n",
465 |     "    df = pd.read_csv(csv_path)\n",
466 |     "    cases = []\n",
467 |     "    for _, row in df.iterrows():\n",
468 |     "        # 提取模态路径\n",
469 |     "        image_paths = [row['t1'], row['t1ce'], row['t2'], row['flair']]\n",
470 |     "        # 过滤掉空值\n",
471 |     "        image_paths = [path for path in image_paths if pd.notnull(path)]\n",
472 |     "        # 提取分割路径\n",
473 |     "        seg_path = row['seg'] if pd.notnull(row['seg']) else None\n",
474 |     "        # 添加到 cases\n",
475 |     "        cases.append({\n",
476 |     "            \"sample_id\": row['sample_id'],\n",
477 |     "            \"image_paths\": image_paths,\n",
478 |     "            \"seg_path\": seg_path\n",
479 |     "        })\n",
480 |     "    return cases\n",
481 |     "\n",
482 |     "# 加载 metadata.csv\n",
483 |     "metadata_csv_path = \"BraTS2021_Training_Data/metadata.csv\"\n",
484 |     "cases = load_cases_from_metadata(metadata_csv_path)\n",
485 |     "\n",
486 |     "# 初始化预处理器\n",
487 |     "preprocessor = ppp(\n",
488 |     "    target_spacing=[1.0, 1.0, 1.0], \n",
489 |     "    normalization_scheme=\"z-score\", \n",
490 |     "    target_size=[256, 256]\n",
491 |     ")\n",
492 |     "\n",
493 |     "# 使用多进程运行预处理\n",
494 |     "num_workers = 4  # 设置进程数\n",
495 |     "results = run_in_parallel(preprocessor, cases, num_workers=num_workers, output_root=\"preprocessed_data\")\n",
496 |     "\n"
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": 8,
502 |    "metadata": {},
503 |    "outputs": [
504 |     {
505 |      "data": {
506 |       "text/plain": [
507 |        "[{'sample_id': 'BraTS2021_00000',\n",
508 |        "  'modality_paths': ['preprocessed_data/BraTS2021_00000/00000057_brain_t1.npz',\n",
509 |        "   'preprocessed_data/BraTS2021_00000/00000057_brain_t1ce.npz',\n",
510 |        "   'preprocessed_data/BraTS2021_00000/00000057_brain_t2.npz',\n",
511 |        "   'preprocessed_data/BraTS2021_00000/00000057_brain_flair.npz'],\n",
512 |        "  'seg_path': 'preprocessed_data/BraTS2021_00000/seg.npz',\n",
513 |        "  'meta_path': 'preprocessed_data/BraTS2021_00000/meta.npz'},\n",
514 |        " {'sample_id': 'BraTS2021_00002',\n",
515 |        "  'modality_paths': ['preprocessed_data/BraTS2021_00002/00000014_brain_t1.npz',\n",
516 |        "   'preprocessed_data/BraTS2021_00002/00000014_brain_t1ce.npz',\n",
517 |        "   'preprocessed_data/BraTS2021_00002/00000014_brain_t2.npz',\n",
518 |        "   'preprocessed_data/BraTS2021_00002/00000014_brain_flair.npz'],\n",
519 |        "  'seg_path': 'preprocessed_data/BraTS2021_00002/seg.npz',\n",
520 |        "  'meta_path': 'preprocessed_data/BraTS2021_00002/meta.npz'},\n",
521 |        " {'sample_id': 'BraTS2021_00003',\n",
522 |        "  'modality_paths': ['preprocessed_data/BraTS2021_00003/00000017_brain_t1.npz',\n",
523 |        "   'preprocessed_data/BraTS2021_00003/00000017_brain_t1ce.npz',\n",
524 |        "   'preprocessed_data/BraTS2021_00003/00000017_brain_t2.npz',\n",
525 |        "   'preprocessed_data/BraTS2021_00003/00000017_brain_flair.npz'],\n",
526 |        "  'seg_path': 'preprocessed_data/BraTS2021_00003/seg.npz',\n",
527 |        "  'meta_path': 'preprocessed_data/BraTS2021_00003/meta.npz'}]"
528 |       ]
529 |      },
530 |      "execution_count": 8,
531 |      "metadata": {},
532 |      "output_type": "execute_result"
533 |     }
534 |    ],
535 |    "source": [
536 |     "results"
537 |    ]
538 |   },
539 |   {
540 |    "cell_type": "code",
541 |    "execution_count": 10,
542 |    "metadata": {},
543 |    "outputs": [
544 |     {
545 |      "name": "stdout",
546 |      "output_type": "stream",
547 |      "text": [
548 |       "Modality paths: ['preprocessed_data/BraTS2021_00002/00000014_brain_t1.npz', 'preprocessed_data/BraTS2021_00002/00000014_brain_t1ce.npz', 'preprocessed_data/BraTS2021_00002/00000014_brain_t2.npz', 'preprocessed_data/BraTS2021_00002/00000014_brain_flair.npz']\n",
549 |       "Seg path: preprocessed_data/BraTS2021_00002/seg.npz\n"
550 |      ]
551 |     },
552 |     {
553 |      "data": {
554 |       "application/vnd.jupyter.widget-view+json": {
555 |        "model_id": "fc3eeee8f53342ee802db7860ca1364a",
556 |        "version_major": 2,
557 |        "version_minor": 0
558 |       },
559 |       "text/plain": [
560 |        "interactive(children=(IntSlider(value=67, description='z_idx', max=134), Output()), _dom_classes=('widget-inte…"
561 |       ]
562 |      },
563 |      "metadata": {},
564 |      "output_type": "display_data"
565 |     }
566 |    ],
567 |    "source": [
568 |     "import numpy as np\n",
569 |     "import matplotlib.pyplot as plt\n",
570 |     "from ipywidgets import interact, IntSlider\n",
571 |     "import os\n",
572 |     "\n",
573 |     "import pandas as pd\n",
574 |     "import ast\n",
575 |     "import os\n",
576 |     "\n",
577 |     "metadata_csv = \"preprocessed_data/metadata.csv\"\n",
578 |     "\n",
579 |     "# 目标 sample_id\n",
580 |     "target_sample_id = \"BraTS2021_00002\"\n",
581 |     "\n",
582 |     "# 读取 CSV\n",
583 |     "df = pd.read_csv(metadata_csv)\n",
584 |     "\n",
585 |     "# 查找目标样本行\n",
586 |     "row = df.loc[df['sample_id'] == target_sample_id].iloc[0]\n",
587 |     "\n",
588 |     "# 解析 image_paths 列（它是一个字符串表示的列表）\n",
589 |     "image_paths = ast.literal_eval(row['modality_paths'])\n",
590 |     "\n",
591 |     "# 分割路径\n",
592 |     "seg_path = row['seg_path'] if pd.notnull(row['seg_path']) else None\n",
593 |     "\n",
594 |     "# 根据需要，也可以将这些路径与根目录拼接\n",
595 |     "# 如果 metadata.csv 中的路径已经是相对于 output_root 的相对路径\n",
596 |     "# 且 output_root 为 \"preprocessed_data\"\n",
597 |     "output_root = \"preprocessed_data\"\n",
598 |     "image_paths = [os.path.join(output_root, p) for p in image_paths]\n",
599 |     "if seg_path is not None:\n",
600 |     "    seg_path = os.path.join(output_root, seg_path)\n",
601 |     "\n",
602 |     "# 此时，image_paths 和 seg_path 就是从 metadata 中获得的对应文件路径列表和分割路径\n",
603 |     "print(\"Modality paths:\", image_paths)\n",
604 |     "print(\"Seg path:\", seg_path)\n",
605 |     "\n",
606 |     "\n",
607 |     "# 假设所有文件都在 \"preprocessed_data\" 目录下\n",
608 |     "# image_paths = [os.path.join(\"preprocessed_data\", p) for p in image_paths]\n",
609 |     "# seg_path = os.path.join(\"preprocessed_data\", seg_path)\n",
610 |     "\n",
611 |     "# 加载图像数据\n",
612 |     "modality_data = []\n",
613 |     "for path in image_paths:\n",
614 |     "    data = np.load(path)[\"data\"] # npz 的 key 是data\n",
615 |     "    modality_data.append(data)\n",
616 |     "\n",
617 |     "# 将模态合并为多通道数据 (H, W, D, C)\n",
618 |     "multi_modal_data = np.stack(modality_data, axis=-1)  # (H, W, D, C)\n",
619 |     "\n",
620 |     "# 加载分割数据\n",
621 |     "seg_data = np.load(seg_path)[\"data\"]  # (H, W, D)\n",
622 |     "\n",
623 |     "# 获取数据形状和 D 轴大小\n",
624 |     "H, W, D, C = multi_modal_data.shape\n",
625 |     "\n",
626 |     "def display_all_modalities(z_idx):\n",
627 |     "    \"\"\"\n",
628 |     "    显示给定z轴索引下的所有模态图像及对应的分割mask叠加结果。\n",
629 |     "    \"\"\"\n",
630 |     "    fig, axes = plt.subplots(1, C, figsize=(4*C, 4))\n",
631 |     "    \n",
632 |     "    # 遍历每个模态\n",
633 |     "    for i in range(C):\n",
634 |     "        img_slice = multi_modal_data[..., i][..., z_idx]\n",
635 |     "        print(f\"min&max: {np.min(multi_modal_data[..., i]), np.max(multi_modal_data[..., i])}\")  \n",
636 |     "\n",
637 |     "        seg_slice = seg_data[..., z_idx]\n",
638 |     "        \n",
639 |     "        axes[i].imshow(img_slice, cmap='gray')\n",
640 |     "        \n",
641 |     "        # 使用 alpha 叠加 seg\n",
642 |     "        seg_mask = np.ma.masked_where(seg_slice == 0, seg_slice)\n",
643 |     "        axes[i].imshow(seg_mask, cmap='jet', alpha=0.5)\n",
644 |     "        \n",
645 |     "        axes[i].set_title(f\"Modality {i}, Z={z_idx}\")\n",
646 |     "        axes[i].axis('off')\n",
647 |     "    \n",
648 |     "    plt.tight_layout()\n",
649 |     "    plt.show()\n",
650 |     "\n",
651 |     "# 使用交互式滑块：只需要控制 z_idx 即可\n",
652 |     "interact(\n",
653 |     "    display_all_modalities, \n",
654 |     "    z_idx=IntSlider(min=0, max=D-1, step=1, value=D//2)\n",
655 |     ");\n"
656 |    ]
657 |   },
658 |   {
659 |    "cell_type": "markdown",
660 |    "metadata": {},
661 |    "source": [
662 |     "大功告成噜💅"
663 |    ]
664 |   }
665 |  ],
666 |  "metadata": {
667 |   "kernelspec": {
668 |    "display_name": "cloudspace",
669 |    "language": "python",
670 |    "name": "python3"
671 |   },
672 |   "language_info": {
673 |    "codemirror_mode": {
674 |     "name": "ipython",
675 |     "version": 3
676 |    },
677 |    "file_extension": ".py",
678 |    "mimetype": "text/x-python",
679 |    "name": "python",
680 |    "nbconvert_exporter": "python",
681 |    "pygments_lexer": "ipython3",
682 |    "version": "3.10.10"
683 |   }
684 |  },
685 |  "nbformat": 4,
686 |  "nbformat_minor": 2
687 | }
688 | 


--------------------------------------------------------------------------------