├── README.md
└── dvh_analysis.py


/README.md:
--------------------------------------------------------------------------------
 1 | # DICOM dose parameter extraction
 2 | Script for automated extraction of dosimetric parameters from multiple RT DICOM files.
 3 | Note: The code is not optimised for speed and the directory structure must be as follows:
 4 | 
 5 | - Parent directory
 6 |      - Patient Folder 1
 7 |          - RS file
 8 |          - RD file
 9 |          - RP file
10 |      - Patient Folder 2
11 |          - RS file
12 |          - RD file
13 |          - RP file....
14 | 
15 | ## Instructions for use
16 | Contained within this repository is the Python module "dvh_analysis.py" which can be run on a directory to extract dosimetric parameters from DICOM RT files.
17 | Dependencies are:
18 | - pandas
19 | - numpy
20 | - tkinter
21 | - dicompyler-core
22 | 
23 | 
24 | 
25 | ## Example usage
26 | ```python
27 | # import the module
28 | import dvh_analysis
29 | 
30 | # generate and store results
31 | my_res = dvh_analysis.dicom_dvh_stats_multi(stats='import', # allow import of file with list of dosimetric statistics to extract
32 |                                             include_body=False, # excludes the body contour from DVH statistic generation (for speed)
33 |                                             struct_labels=True,# allow import of file containing user-defined stucture labels to add to output
34 |                                             save_df=True, # automatically save the generated pandas dataframe
35 |                                             user_structures=None) # list of structures to analyse. None ==> analyses all structures.
36 | ```
37 | 


--------------------------------------------------------------------------------
/dvh_analysis.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # ## Process developed to analyse multiple DICOM files and extract DVH information
  5 | # All files must be stored as follows:
  6 | # - Parent directory
  7 | #     - Patient Folder 1
  8 | #         - RS file
  9 | #         - RD file
 10 | #         - RP file
 11 | #     - Patient Folder 2
 12 | #         - RS file
 13 | #         - RD file
 14 | #         - RP file....
 15 | # 
# The DVH analysis is based on the dicompyler-core module (which is based on pydicom).
 17 | # Notebook with examples: https://github.com/bastula/dicom-notebooks/blob/master/dicompyler-core_usage.ipynb
 18 | # 
 19 | # - The user can specify the parent directory and then all sub directories are analysed.
 20 | # - The user must specify which DVH statistics are required (Can be from an excel/text/csv file).
# - Additional structure labels can be added to allow simpler comparison between different plans which may have different original structure names. *Note that all structure names are converted to lowercase*.
# - Results are output in CSV format and contain patient id, plan name, structure name and the DVH statistics requested.
 23 | # 
 24 | 
 25 | # In[1]:
 26 | 
 27 | import pandas as pd
 28 | import numpy as np
 29 | import matplotlib.pyplot as plt
 30 | #get_ipython().magic('matplotlib inline')
 31 | 
 32 | import itertools as it
 33 | from dicompylercore import dicomparser, dvhcalc#, dvh
 34 | import os
 35 | import pydicom
 36 | 
 37 | ## for choosing a directory
 38 | import tkinter as tk
 39 | from tkinter import filedialog
 40 | 
 41 | ## for exiting on error (no oncentra dose grid)
 42 | import sys
 43 | 
 44 | 
 45 | # In[2]:
 46 | 
def dir_open(title = 'Select Directory...'):
    """Ask the user to pick a directory with the OS file chooser dialog.

    title: text shown in the dialog's title bar.

    Returns whatever ``filedialog.askdirectory`` returns (the chosen path;
    presumably an empty value if the dialog is cancelled -- TODO confirm).

    A new Tk root window is created for every call and only withdrawn
    (hidden) afterwards; it is not destroyed here.
    """
    root = tk.Tk()
    my_dir =  filedialog.askdirectory(title = title)
    #print (root.directory)
    root.withdraw() ## for some reason it freezes if the empty window is not open
    return my_dir
 54 | 
def file_open(title = 'Select File...'):
    """Ask the user to pick an existing file with the OS file chooser dialog.

    title: text shown in the dialog's title bar.

    Returns whatever ``filedialog.askopenfilename`` returns (the chosen path;
    presumably an empty value if the dialog is cancelled -- TODO confirm).

    A new Tk root window is created for every call and only withdrawn
    (hidden) after the dialog has closed; it is not destroyed here.
    """
    root = tk.Tk()
    my_file =  filedialog.askopenfilename(title = title)
    #print (root.filename)
    root.withdraw()
    return my_file
 62 | 
 63 | def file_save(title = 'Save File As...',filetypes = (("csv","*.csv"),("all files","*.*")), initialfile=''):
 64 |     root = tk.Tk()
 65 |     ftypes = [('CSV', '.csv'), ('All files', '*')]
 66 |     my_file =  filedialog.asksaveasfilename(title = title, filetypes = ftypes, defaultextension='.csv',initialfile=initialfile)
 67 |     root.withdraw()
 68 |     return my_file
 69 | 
 70 | 
 71 | # In[3]:
 72 | 
 73 | def save_df_to_csv(df,prompt=False,save_dir='',save_name = None):
 74 |     """Function to save the dataframe as csv.
 75 |     Option to prompt the user where to save.
 76 |     Defualt save location is within the parent directory of analysis"""
 77 |     
 78 |     if save_name == None:
 79 |         the_date = pd.Timestamp("now").strftime("%Y%m%d-%H%M") ## no need to inport date module as can use pandas
 80 |         default_save_name = 'results-' + str(the_date) + '.csv'
 81 |     
 82 |     if prompt==True:
 83 |         save_name = file_save(title = 'Save Results As...', initialfile = default_save_name)
 84 |         #print(save_name)
 85 |         if save_name == '': ## user closed before saving
 86 |             return
 87 |     else:
 88 |         save_name = os.path.join(save_dir,default_save_name)
 89 |         
 90 |     df.to_csv(save_name)
 91 |     print('Results saved:', save_name)
 92 | 
 93 | 
 94 | # In[4]:
 95 | 
 96 | def swap_dict(the_dict):
 97 |     """Swaps key,value pairs in dict. Must be unique values to work"""
 98 |     return {y:x for x,y in the_dict.items()}
 99 | 
100 | 
101 | # In[5]:
102 | 
def dicom_type(file):
    """Return the DICOM file type reported by dicompyler-core.

    file: path of the DICOM file.

    The value is whatever ``DicomParser.GetSOPClassUID()`` returns; the rest
    of this module compares it against 'rtss', 'rtdose' and 'rtplan'.
    """
    parser = dicomparser.DicomParser(file)
    return parser.GetSOPClassUID()
107 | 
108 | 
109 | # In[6]:
110 | 
def get_rt_files(directory):
    """Find the RT DICOM files in a directory.

    directory: path of the folder to search (first level only).

    Returns a dict mapping 'rtss', 'rtdose' and 'rtplan' to the full path of
    the matching file. A key is absent when no file of that type was found.
    If several files of the same type exist, the last one listed is kept.

    Sub-directories are skipped rather than handed to the DICOM reader,
    which cannot open them.
    """
    wanted_types = ('rtss', 'rtdose', 'rtplan')
    pt_rt_file_dict = {}
    for entry in os.listdir(directory):
        full_file_dir = os.path.join(directory,entry)
        if not os.path.isfile(full_file_dir):
            continue ## only regular files can be DICOM files
        d_type = dicom_type(full_file_dir)
        if d_type in wanted_types:
            pt_rt_file_dict[d_type] = full_file_dir

    return pt_rt_file_dict
131 | 
132 | 
133 | # In[7]:
134 | 
def structure_dict(file):
    """Map structure names to structure ids for an RT structure set file.

    file: full path of the (rtss) DICOM file.

    Returns {structure_name: structure_id} with every name converted to
    lowercase. If the file is not of type 'rtss' a message is printed and
    None is returned.
    """
    ## check the file type first so non structure-set files are never parsed for structures
    if dicom_type(file) != 'rtss':
        print('File is not an RT Structure set:',file)
        return None

    ## GetStructures() gives a dict whose values are per-structure dicts holding at least 'name' and 'id'
    raw_structures = dicomparser.DicomParser(file).GetStructures()
    return {entry['name'].lower(): entry['id'] for entry in raw_structures.values()}
160 | 
161 | 
162 | # In[8]:
163 | 
def get_dicom_plan_info(file):
    """Collect general information from a DICOM RT plan file.

    file: path of the DICOM file; it must be of type 'rtplan'.

    Returns a dict with the keys 'study_date', 'study_time',
    'study_description', 'patient_id', 'patient_name', 'plan_label',
    'plan_date', 'plan_time' and 'manufacturer', or None (after printing a
    message) when the file is not an RT plan.

    'study_description' is None when the file has no StudyDescription
    element. The previous check looked for the non-existent keyword
    'study_description', so the description was always reported as None.
    """
    if dicom_type(file)!= 'rtplan':
        print('Should provide an RTPLAN filetype')
        return None

    ## dcmread is the current name of the reader (read_file was its deprecated alias)
    ds = pydicom.dcmread(file)

    ## Keep this function manufacturer independent; use separate functions for vendor specific info.
    ## Elements are accessed by pydicom keyword, which pydicom resolves to the DICOM tag.
    dicom_info = {}

    dicom_info['study_date'] = ds.StudyDate ## study is the CT info
    dicom_info['study_time'] = ds.StudyTime

    ## not every system writes a study description, so fall back to None
    dicom_info['study_description'] = ds.get('StudyDescription', None)

    dicom_info['patient_id'] = ds.PatientID
    dicom_info['patient_name'] = ds.PatientName
    dicom_info['plan_label'] = ds.RTPlanLabel ## plan name
    dicom_info['plan_date'] = ds.RTPlanDate
    dicom_info['plan_time'] = ds.RTPlanTime

    ## values can also be read by DICOM tag - .value retrieves the actual value
    dicom_info['manufacturer'] = ds[0x0008, 0x0070].value

    return dicom_info
201 | 
202 | 
203 | # In[9]:
204 | 
def get_pt_id(file):
    """Return the patient id stored in a DICOM file.

    file: path of the DICOM file.

    The id is the 'id' entry of the dict returned by
    ``DicomParser.GetDemographics()``.
    """
    demographics = dicomparser.DicomParser(file).GetDemographics()
    patient_id = demographics['id']
    return patient_id
209 | 
210 | 
211 | # In[10]:
212 | 
def all_equal(iterable):
    """Return True when every element of ``iterable`` equals the others.

    An empty iterable also gives True. Used to check that the patient ids of
    the DICOM files of one patient agree.
    Based on the itertools recipe:
    https://docs.python.org/3/library/itertools.html#recipes
    """
    runs = it.groupby(iterable)
    next(runs, None) ## skip the first run of equal elements (if there is one)
    ## any further run means at least two different values were present
    return next(runs, None) is None
219 | 
220 | 
221 | # In[11]:
222 | 
def all_rt_files_exist(directory):
    """Check that a directory holds a matching set of RT DICOM files.

    directory: path of the folder to check.

    The folder should contain an rtss, an rtdose and an rtplan file, and the
    patient ids of the files which are present must all be equal. Missing
    file types and mismatched ids are reported with print.

    Returns (all_exist_and_match, rt_files) where the first item is a bool
    and rt_files is the dict produced by ``get_rt_files``.
    """
    rt_files = get_rt_files(directory)
    required_types = ('rtss', 'rtdose', 'rtplan')

    ## sort the required types into those found and those missing
    rt_exist = [rt_type for rt_type in required_types if rt_type in rt_files]
    rt_non_exist = [rt_type for rt_type in required_types if rt_type not in rt_files]

    all_exist = not rt_non_exist
    if rt_non_exist:
        print('The following filetypes do not exist:',rt_non_exist)

    ## patient id of every file which was found
    rt_files_exist_pt_ids = [get_pt_id(rt_files[rt_type]) for rt_type in rt_exist]

    all_ids_match = all_equal(rt_files_exist_pt_ids)
    if not all_ids_match:
        print('Mis-matched patient IDs identified:',rt_files_exist_pt_ids)

    return (all_exist and all_ids_match),rt_files
275 | 
276 | 
277 | # In[12]:
278 | 
def get_dicom_dose(file):
    """Return the prescribed dose from an RT plan file (needed for DVH analysis).

    file: path of the DICOM file.

    For a non 'rtplan' file a message is printed and None is returned.
    For files identified by ``is_oncentra`` the dose comes from
    ``get_oncentra_dose``. Otherwise the 'rxdose' value reported by
    dicompyler-core is divided by 100 (per the original author, to give Gy).
    """
    if dicom_type(file) != 'rtplan':
        print('Prescription cannot be obtained from non RTPlan DICOM file. Returned None.')
        return None

    ## Oncentra does not seem to store the prescription in the same way, so it is handled separately
    if is_oncentra(file):
        print('Oncentra File - Attempting to determine prescription...')
        return get_oncentra_dose(file)

    plan = dicomparser.DicomParser(file).GetPlan()
    return plan['rxdose']/100
293 | 
294 | 
295 | # In[13]:
296 | 
297 | # In[14]:
298 | 
def check_oncentra_dose_exists(file):
    """Report whether the file contains the dose grid scaling element.

    file: path of the DICOM file.

    Returns True when tag (3004,000e) can be read with
    ``get_dicom_tag_info`` and False when that lookup raises KeyError.
    The module uses this to decide whether an Oncentra file holds dose.
    """
    try:
        ## only the presence of the element matters, its value is not used
        get_dicom_tag_info(file,[0x3004,0x000e])
    except KeyError:
        return False
    return True
309 | 
310 | 
311 | # In[15]:
312 | 
def get_dicom_tag_info(file,tag):
    """Return the value of one element of a DICOM file.

    file: path of the DICOM file.
    tag: the DICOM tag, e.g. a tuple such as (0x0000,0x0000).

    Raises KeyError when the element is not present in the file (callers in
    this module rely on that).

    The whole file is read on every call, which is not efficient, so avoid
    calling this repeatedly for the same file.
    """
    ## dcmread is the current name of the reader (read_file was its deprecated alias)
    dataset = pydicom.dcmread(file)
    return dataset[tag].value
318 | 
319 | 
320 | # In[16]:
321 | 
def is_oncentra(rt_plan):
    """Report whether an RT plan file comes from Oncentra.

    rt_plan: path of the RTPLAN DICOM file.

    Returns True when element (0008,1090) Manufacturer's Model Name equals
    'Oncentra', otherwise False. A file without that element is treated as
    not being from Oncentra (previously this raised KeyError).
    Used to decide whether the prescription must be read from a private tag.
    """
    ## dcmread is the current name of the reader (read_file was its deprecated alias)
    dataset = pydicom.dcmread(rt_plan)
    ## ManufacturerModelName is the pydicom keyword for tag (0008,1090)
    model_name = getattr(dataset, 'ManufacturerModelName', None)
    return model_name == 'Oncentra'
331 | 
def get_oncentra_dose(my_file):
    """Return the dose stored in the private Oncentra tag (3007, 1000).

    my_file: path of the DICOM file.

    The tag is read first; the program then exits through ``sys.exit`` with
    a message if ``check_oncentra_dose_exists`` reports that the file has no
    dose grid.
    """
    prescription = get_dicom_tag_info(my_file,(0x3007,0x1000))
    if not check_oncentra_dose_exists(my_file):
        ## without a 3D dose grid no DVH can be calculated, so stop here
        sys.exit('No Dose Grid within Oncentra File - do 3D Dose Grid Calculation. Aborting Calculation')
    return prescription
341 | 
342 | 
343 | 
344 | # In[18]:
345 | 
def calc_dvh(rtss,rtdose,structure_id,px):
    """Calculate the DVH of one structure and attach the prescription to it.

    rtss: the structure set file, passed to ``dvhcalc.get_dvh``.
    rtdose: the dose file, passed to ``dvhcalc.get_dvh``.
    structure_id: id of the structure to calculate the DVH for.
    px: prescription, stored on the result as ``rx_dose``.

    Returns the object produced by ``dvhcalc.get_dvh``.
    """
    structure_dvh = dvhcalc.get_dvh(rtss,rtdose,structure_id)
    structure_dvh.rx_dose = px
    return structure_dvh
352 | 
353 | 
354 | # In[19]:
355 | 
def non_int_stat(dvh,stat_string):
    """Return a DVH statistic for integer and non-integer values.

    dvh: object providing ``volume_constraint(value, units)`` and
        ``dose_constraint(value, units)``.
    stat_string: statistic such as 'D90', 'V20.5Gy' or 'D2cc' (case is
        ignored). The first letter selects the type (V or D), any trailing
        letters are the units ('cc' or 'gy') and the part in between is the
        numeric value.

    A 'v' statistic is answered by ``dvh.volume_constraint`` and a 'd'
    statistic by ``dvh.dose_constraint``; units are passed in lowercase, or
    as None when the string has no trailing letters.
    The program exits through ``sys.exit`` for an unknown type or unit.
    """
    text = stat_string.lower()

    kind = text[0]
    if kind not in ['v','d']:
        sys.exit('Unknown dose statistic type: ' + kind)

    ## walk back from the end of the string over the trailing letters (the units)
    cut = len(text)
    while cut > 0 and text[cut - 1].isalpha():
        cut -= 1
    suffix = text[cut:]

    units = suffix if suffix else None
    if units is not None and units not in ['cc','gy']:
        sys.exit('Unknown dose statistic units: ' + units)

    ## what remains between the type letter and the units is the numeric value
    amount = float(text[1:cut])

    if kind == 'v':
        return dvh.volume_constraint(amount, units)
    return dvh.dose_constraint(amount, units)
386 | 
387 | 
388 | # In[20]:
389 | 
def get_statistics(dvh,stat_list):
    """Get several statistics from a DVH and return them in a dict.

    dvh: DVH object; ``volume``, ``volume_units``, ``dose_units``, ``mean``,
        ``max``, ``min`` and ``statistic`` are read from it.
    stat_list: names of the statistics to extract. 'mean', 'max', 'min' and
        'median' (any case) are read directly from the DVH ('median' is
        taken from ``dvh.statistic('D50')``); every other name is resolved
        by ``non_int_stat``.

    Returns a dict with the key 'volume' -> (volume, volume units) plus one
    entry per requested statistic -> (value, units). When the volume is 0 or
    NaN every requested statistic is stored as None.

    Each statistic is only evaluated when it is requested; previously the
    D50 lookup was repeated twice for every entry of ``stat_list``.
    """
    volume = dvh.volume
    dvh_stats = {'volume': (volume, dvh.volume_units)}

    ## statistics are meaningless for a structure without a valid volume
    valid_vol = not (volume == 0 or np.isnan(volume))

    def _median():
        d50 = dvh.statistic('D50')
        return (d50.value, d50.units)

    ## callables so that nothing is computed unless the statistic is asked for
    special_stats = {'mean': lambda: (dvh.mean, dvh.dose_units),
                     'max': lambda: (dvh.max, dvh.dose_units),
                     'min': lambda: (dvh.min, dvh.dose_units),
                     'median': _median}

    for stat in stat_list:
        if not valid_vol:
            dvh_stats[stat] = None
            continue

        getter = special_stats.get(stat.lower())
        if getter is not None:
            dvh_stats[stat] = getter()
        else:
            dvh_stat = non_int_stat(dvh,stat)
            dvh_stats[stat] = (dvh_stat.value,dvh_stat.units)
    return dvh_stats
424 | 
425 | 
426 | # In[65]:
427 | 
def dicom_dvh_stats_single(directory=None, stats=None, structures=None, output='df', include_body=True,verbose=True,save_df=False,user_structures=None):
    """Get DVH statistics for the structures of the DICOM files in one directory.

    directory: path of the patient folder. When None the user is prompted.
    stats: list of statistics, e.g. ['D90', 'V100']. When None a default
        list is used.
    structures: list of structure ids, e.g. [1, 2, 5]. When None all
        structures left after filtering are analysed. Every id must be one
        of those structures, otherwise a KeyError is raised.
    output: 'df' returns a dataframe (required when processing several
        directories); anything else returns the raw results dict
        {(sub directory, patient id, plan name): {structure: stats}}.
    include_body: unless True, the 'body' structure is left out.
    verbose: when True, progress is printed for every structure.
    save_df: when True (and output is 'df') the dataframe is pickled next to
        the directory as '<directory name>.pkl'.
    user_structures: None, a list of structure names to restrict the
        analysis to, or 'mapping' to have the user pick a file holding that
        list.

    The structures 'couchsurface' and 'couchinterior' and any structure with
    'pseudo' in its name are always left out. Structure names are compared
    in lowercase.

    The pickle path is now built with os.path.join; it used a hard-coded
    backslash which only produced a valid path on Windows.
    """
    if directory is None:
        directory = dir_open()

    ## the default has to be None in the signature as the multi patient version calls this too
    if stats is None:
        stats = ['D98','D90','V100','V10', 'V20', 'D2cc', 'max', 'min', 'mean', 'median']

    ## get the DICOM file paths; problems (missing files, mismatched ids) are printed by the
    ## helper, the returned flag itself is not acted upon here
    all_exist_and_match, dir_files = all_rt_files_exist(directory)

    ## {name: id} of the structures in the structure set
    structure_names = structure_dict(dir_files['rtss'])

    if include_body != True:
        structure_names.pop('body', None)

    ## couch structures are never analysed
    for the_struct in ['couchsurface','couchinterior']:
        structure_names.pop(the_struct, None)

    ## remove any obvious pseudo structures
    structure_names = {k:v for k,v in structure_names.items() if 'pseudo' not in k}

    ## limit to the structure names supplied by the user (if any)
    print('***')
    print(structure_names)
    if user_structures == 'mapping':
        the_structs = import_structure_list(file_open(title='Select File Containing Structure List...'))
        the_structs = [i.lower() for i in the_structs]
        structure_names =  {k:v for k,v in structure_names.items() if k.lower() in the_structs}
    elif user_structures is not None:
        the_structs = [i.lower() for i in user_structures]
        structure_names =  {k:v for k,v in structure_names.items() if k.lower() in the_structs}
    print(structure_names)
    print('*********')

    ## swap the structure dict to give {id: name}
    all_structures = swap_dict(structure_names)

    ## if structure ids are not specified, analyse every remaining structure
    if structures is None:
        structures = list(all_structures.keys())

    pt_id = get_pt_id(dir_files['rtplan'])
    plan_name = get_dicom_plan_info(dir_files['rtplan'])['plan_label']
    prescription = get_dicom_dose(dir_files['rtplan'])

    print('Patient id:',pt_id, '\nPlan name:',plan_name, '\nGetting DVH statistics', stats,)

    ## calculate the DVH of each structure and extract the requested statistics
    all_dvh_stats = {}
    tot_structs = len(structures)
    for struct_num, structure_id in enumerate(structures, start=1):
        structure_name = all_structures[structure_id]
        if verbose == True:
            print('Patient id: ' + str(pt_id) + ', Plan name: ' + str(plan_name) +
                  ', Processing structure:' + str(struct_num) + ' of ' + str(tot_structs) + ' (' + structure_name + ')')
        struct_dvh = calc_dvh(dir_files['rtss'],dir_files['rtdose'],structure_id,prescription)
        all_dvh_stats[structure_name] = get_statistics(struct_dvh,stats)

    print('Patient Completed')

    parent_dir, dir_string_output = os.path.split(directory)

    results_dict = {(dir_string_output,pt_id,plan_name): all_dvh_stats}

    if output == 'df':
        results_output = pt_stats_df(results_dict)
        if save_df == True:
            ## join with the platform separator so the path is valid on every OS
            pickle_save_dir = os.path.join(parent_dir, dir_string_output + '.pkl')
            print(pickle_save_dir)
            results_output.to_pickle(pickle_save_dir)
    else:
        results_output = results_dict

    return results_output
545 | 
546 | 
547 | # In[66]:
548 | 
def pt_stats_df(results_dict):
    """Turn the results dict of one patient into a user friendly dataframe.

    results_dict: {(sub_dir, patient_id, plan_name): {structure: {stat: (value, unit)}}}.
        Only the first key of the dict is used.

    Returns a dataframe with one row per structure. The columns 'sub_dir',
    'patient_id', 'plan_name' and 'structure' come first, followed by one
    column per statistic; for every statistic the extra columns
    '<stat>_val' and '<stat>_unit' are appended. Patient id, plan name and
    structure are stripped of surrounding whitespace and lowercased, and all
    column names are lowercased.
    """
    first_key = list(results_dict.keys())[0]
    id_columns = ['sub_dir','structure', 'patient_id', 'plan_name']

    ## one row per structure, with the structure name moved from the index into a normal column
    results_df = pd.DataFrame.from_dict(results_dict[first_key],orient='index')
    results_df = results_df.rename_axis('structure').reset_index(drop=False)

    ## identification columns go at the front of the dataframe
    results_df.insert(0,'sub_dir',first_key[0])
    results_df.insert(1,'patient_id',first_key[1])
    results_df.insert(2,'plan_name',first_key[2])

    ## split each (value, unit) result into two columns for simpler analysis/formatting
    stat_columns = [heading for heading in results_df.columns if heading not in id_columns]
    for heading in stat_columns:
        new_columns = [heading + '_val', heading + '_unit']
        results_df[new_columns] = results_df[heading].apply(pd.Series)

    ## normalise the text columns
    for text_column in ('patient_id', 'plan_name', 'structure'):
        results_df[text_column] = results_df[text_column].str.strip().str.lower()

    results_df.columns = results_df.columns.str.lower()

    return results_df
582 | 
583 | 
584 | # In[67]:
585 | 
def get_sub_dirs(parent_dir=None):
    """List the first-level sub-directories of a directory.

    parent_dir: path of the directory to inspect. When None the user is
        prompted to choose one.

    Returns (path, sub_dirs) where path is the inspected directory as
    reported by ``os.walk`` and sub_dirs is a list of strings of the form
    path + '/' + sub-directory name.
    """
    if parent_dir is None:
        parent_dir = dir_open("Select Directory Containing Subdirectories to Analyse...")

    ## only the first item produced by os.walk is needed: the top level of the tree
    top_path, dir_names, _file_names = next(os.walk(parent_dir))

    sub_dirs = ['/'.join((top_path, name)) for name in dir_names]
    return (top_path,sub_dirs)
599 | 
600 | 
601 | # In[68]:
602 | 
def dicom_dvh_stats_multi(parent_directory=None, stats='prompt', structures=None,
                           save_results=True, save_prompt=False, include_body=True, limit=None,
                           struct_labels=False, verbose=True,save_df=False,user_structures=None):
    """Analyse every sub-directory of a parent directory and combine the results.

    parent_directory: directory holding one folder per patient. When None
        the user is prompted.
    stats: 'prompt' asks the user to type the statistics, 'import' reads
        them from a file the user selects, otherwise the value is passed on
        unchanged (None gives the defaults of ``dicom_dvh_stats_single``).
    structures: list of structure ids passed on to every directory.
    save_results: when True the combined results are saved as CSV.
    save_prompt: when True the user is asked where to save the CSV,
        otherwise it is written to the parent directory.
    include_body: passed on; unless True the 'body' structure is left out.
    limit: maximum number of sub-directories to process, e.g. limit=2.
        None means no limit.
    struct_labels: when True the user selects a file with structure labels
        which are added to the results; otherwise the 'struct_label' column
        is filled with NaN.
    verbose: when True the structures being processed are shown.
    save_df: passed on; pickles the dataframe of every directory.
    user_structures: None, a list of structure names to analyse, or
        'mapping' to have the user select a file holding that list.

    A list given in ``user_structures`` is now passed on to the analysis;
    it used to be discarded, so all structures were analysed regardless.

    Returns the combined dataframe.
    """
    ## get_sub_dirs prompts for the directory itself when given None
    path, sub_dirs = get_sub_dirs(parent_dir = parent_directory)

    ## can get list of pseudo structure labels from user to add to results
    if struct_labels == True:
        struct_label_file = file_open(title='Select File Containing Structure Labels...')

    ## can limit number of directories checked. Default is no limit.
    if limit is not None:
        sub_dirs = sub_dirs[:limit]

    ## get the DVH stats from the user if required
    if stats == 'import':
        stats = import_dvh_stats(file_open(title='Select File Containing DVH Stats to Return...'))
    if stats == 'prompt':
        stats = prompt_dvh_stat_list()

    ## the structure list is read once here so the user is not asked again for every directory
    if user_structures == 'mapping':
        user_struct_list = import_structure_list(file_open(title='Select File Containing Structure List...'))
    else:
        user_struct_list = user_structures

    ## get the results of each directory; the list then becomes one dataframe
    all_data_list = []
    num_sub_dirs = len(sub_dirs)
    for sub_dir_count, sub_dir in enumerate(sub_dirs, start=1):
        print('Processing Directory ' + str(sub_dir_count) + ' of ' + str(num_sub_dirs) + ', ' + sub_dir)
        sub_dir_results = dicom_dvh_stats_single(directory=sub_dir, stats=stats, structures=structures, output='df',
                                           include_body=include_body, verbose=verbose,save_df=save_df,user_structures=user_struct_list)
        all_data_list.append(sub_dir_results)

    all_results = pd.concat(all_data_list,axis=0)
    print('Dosimetric data extracted from all directories')
    if struct_labels == True:
        all_results = add_struct_label_plan_id(all_results, struct_label_file)
        print('Adding struct_label structure names')
    else:
        all_results['struct_label'] = np.nan

    ## move the struct_label column to a better location (fourth column) in the dataframe
    cols = all_results.columns.tolist()
    cols.insert(3, cols.pop(cols.index('struct_label')))
    all_results = all_results.reindex(columns= cols)

    if save_results == True:
        save_df_to_csv(all_results,prompt=save_prompt,save_dir=path)

    return all_results
665 | 
666 |     
667 | #%%
668 | 
def prompt_dvh_stat_list(prompt="Enter DVH Stats Seperate by a comma. e.g. D90,V100,D2cc: "):
    """Ask the user to type the DVH statistics and return them as a list.

    prompt: text shown to the user.

    A bell character is printed first. All spaces are removed from the typed
    text, which is then split at the commas.
    """
    print('\a')
    typed = input(prompt)
    without_spaces = ''.join(typed.split(' '))
    return without_spaces.split(',')
673 | 
674 | # In[69]:
675 | 
def import_dvh_stats(file):
    """Import the DVH stats to calculate and return a list of these.

    The stats should be on separate rows in the first column of the file,
    with no header row.

    Parameters
    ----------
    file : str
        Path to an excel (.xlsx, .xls, .xlm, .xlsm), .csv or .txt file.
        The extension is matched case-insensitively.

    Returns
    -------
    list
        The values of the first column, in file order.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported types.
    """

    ## get file extension to determine import method (".CSV" is treated like ".csv")
    filetype = os.path.splitext(file)[1].lower()

    ## header=None so the first row is read as data and the column is labelled 0
    if filetype in ['.xlsx', '.xls', '.xlm', '.xlsm']:
        df = pd.read_excel(file, index_col=None, header=None)
    elif filetype in ['.csv']:
        df = pd.read_csv(file, index_col=False, header=None)
    elif filetype in ['.txt']:
        df = pd.read_table(file, index_col=False, header=None)
    else:
        ## stop here with a clear message instead of failing later on an undefined df
        raise ValueError('Can currently only import .csv, .txt, or excel filetypes for DVH stats')

    ## convert the first column of the df to a list
    dvh_stat_list = list(df[0].values)

    return dvh_stat_list
697 | # %%
def import_structure_list(file):
    """Import the structure names to analyse and return them as a list.

    The structure names should be on separate rows in the first column of
    the file, with no header row.

    Parameters
    ----------
    file : str
        Path to an excel (.xlsx, .xls, .xlm, .xlsm), .csv or .txt file.
        The extension is matched case-insensitively.

    Returns
    -------
    list
        The values of the first column, in file order.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported types.
    """

    ## get file extension to determine import method (".CSV" is treated like ".csv")
    filetype = os.path.splitext(file)[1].lower()

    ## header=None so the first row is read as data and the column is labelled 0
    if filetype in ['.xlsx', '.xls', '.xlm', '.xlsm']:
        df = pd.read_excel(file, index_col=None, header=None)
    elif filetype in ['.csv']:
        df = pd.read_csv(file, index_col=False, header=None)
    elif filetype in ['.txt']:
        df = pd.read_table(file, index_col=False, header=None)
    else:
        ## stop here with a clear message instead of failing later on an undefined df
        raise ValueError('Can currently only import .csv, .txt, or excel filetypes for structure names')

    ## convert the first column of the df to a list
    structure_list = list(df[0].values)

    return structure_list
719 | 
720 | 
721 | # In[70]:
722 | 
def import_struct_label_file(file):
    """Import the structure label file and return it as a tidied dataframe.

    Supported filetypes are: excel, csv, txt. The first row of the file is
    used as the column names. The 'patient_id' and 'structure' columns are
    required; 'plan_name' is optional.

    Parameters
    ----------
    file : str
        Path to an excel (.xlsx, .xls, .xlm, .xlsm), .csv or .txt file.
        The extension is matched case-insensitively.

    Returns
    -------
    pandas.DataFrame
        The file contents with 'patient_id' converted to strings, and
        'patient_id', 'structure' and (if present) 'plan_name' stripped of
        surrounding whitespace and lower-cased.

    Raises
    ------
    ValueError
        If the file extension is not one of the supported types.
    KeyError
        If the 'patient_id' or 'structure' column is missing from the file.
    """

    ## get file extension to determine import method (".CSV" is treated like ".csv")
    filetype = os.path.splitext(file)[1].lower()

    ## read in the struct_label structure file
    if filetype in ['.xlsx', '.xls', '.xlm', '.xlsm']:
        df = pd.read_excel(file, index_col=None)
    elif filetype in ['.csv']:
        df = pd.read_csv(file, index_col=False)
    elif filetype in ['.txt']:
        df = pd.read_table(file, index_col=False)
    else:
        ## stop here with a clear message instead of failing later on an undefined df
        raise ValueError('Can currently only import .csv, .txt, or excel filetypes for structure labels')

    ## do some tidying - lower case and remove whitespace.
    ## patient IDs are cast to str first so purely numeric IDs are handled as text
    df['patient_id'] = df['patient_id'].astype(str).str.strip().str.lower()
    df['structure'] = df['structure'].str.strip().str.lower()

    ## plan_name might not exist if all unique patients
    if 'plan_name' in df.columns:
        df['plan_name'] = df['plan_name'].str.strip().str.lower()

    return df
749 | 
750 | 
751 | # In[71]:
752 | 
def add_struct_label_plan_id(results_df,struct_label_file):
    """Merge the user-defined structure labels from a file onto the results.

    The label file is read with import_struct_label_file and restricted to
    the patients present in results_df. If the file has no 'plan_name'
    column, the plan name of each patient is taken from results_df, which is
    only possible when every patient in the results has exactly one plan name.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Results to add the labels to. The columns 'patient_id', 'plan_name',
        'structure' and 'sub_dir' are used.
    struct_label_file : str
        Path of the structure label file.

    Returns
    -------
    pandas.DataFrame
        The results merged with the label file on
        ['patient_id', 'plan_name', 'structure']. Rows that exist only in the
        label file (null 'sub_dir') are removed.

    Raises
    ------
    ValueError
        If plan names must be taken from the results but at least one patient
        has more than one plan name.
    """

    ## read in the struct_label structure file (patient IDs come back as lower-case strings)
    df_structs = import_struct_label_file(struct_label_file)
    unique_pts = results_df['patient_id'].unique() ## get patient ids from results
    ## remove structure mapping info from unneeded pts.
    ## .copy() so the column assignments below act on an independent frame, not on a slice
    df_structs = df_structs[df_structs['patient_id'].isin(unique_pts)].copy()

    ## if plan names do not exist, then add them from the results if they are unique for each patient
    if 'plan_name' not in df_structs.columns:
        print('Determing plan names from results df')

        ## store the plan names in a dict {pt_id: array of plan names} for each patient
        plan_names_dict = {}
        for patient in unique_pts:
            pt_plan_names = results_df[results_df['patient_id']==patient]['plan_name'].unique()
            plan_names_dict[patient] = pt_plan_names

        ## every patient should have exactly 1 plan name. Warn the user about each one that does not.
        ambiguous_pts = [pt for pt, names in plan_names_dict.items() if len(names) != 1]
        for patient in ambiguous_pts:
            print('Patient', patient,
                  'does not have unique plan names. These must be specified within the struct_label structures file.')

        ## without a plan_name column the merge below cannot work, so stop with a clear error
        if ambiguous_pts:
            raise ValueError('Plan names are not unique for patient(s): '
                             + ', '.join(str(pt) for pt in ambiguous_pts)
                             + '. Add a plan_name column to the struct_label structures file.')

        ## all unique, so add the single plan name of each patient to the label rows
        df_structs['plan_name'] = [plan_names_dict[patient][0] for patient in df_structs['patient_id'].values]
        if len(df_structs) != 0: ## if nothing matches then cant set values which dont exist
            df_structs['plan_name'] = df_structs['plan_name'].str.strip().str.lower() ## lower case and strip whitespace
            print('Unique plan names added to struct_label structure data for use')
        else:
            df_structs['plan_name'] = 'No matching plan names'
            print('No matching plan names to add from structure mapping file')

    ## check struct_label df only contains unique rows to ensure no ambiguity in results
    if df_structs.duplicated().any():
        print('struct_label Structure Dataframe contains duplicates. Results may be ambiguous.')

    ## add the struct_label names to the results using the merge functionality.
    ## NOTE(review): the label keys are stripped and lower-cased, so this presumably relies on
    ## results_df holding patient_id / plan_name / structure in the same form - confirm against caller
    merged_df = results_df.merge(df_structs, how='outer', on=['patient_id', 'plan_name', 'structure'])
    ## the outer merge also keeps label rows with no matching result; those have no sub_dir, so drop them
    merged_df = merged_df[~merged_df['sub_dir'].isnull()]

    return merged_df
811 | 
812 | 
813 | # In[72]:
814 | 
815 | #my_results = dicom_dvh_stats_multi(save_results=True, save_prompt=True, stats='import',
816 | #                                    limit=2 ,struct_labels=True, verbose=True, include_body=False)
817 | 
818 | 
819 | # In[74]:
820 | 
821 | #my_results.head()
822 | 
823 | 
824 | # In[59]:
825 | 
826 | #simple_results = dicom_dvh_stats_multi()
827 | 
828 | 
829 | # ## Analysis of Results.
830 | # - Make use of pandas to demonstrate simplicity of extracting data.
831 | 
832 | # In[75]:
833 | 
834 | ## produce a boxplot of each measured dvh statistic
835 | 
836 | ## produce a list of stats to plot (only want the values, and not volume)
837 | #cols = list(my_results.columns)
838 | ## keep only items with '_val' at the end
839 | #cols = [x for x in cols if '_val' in x]
840 | ## remove any other unnecessary things from the list
841 | #to_remove = ['volume_val']
842 | #cols = [x for x in cols if x not in to_remove]
843 | #print(cols)
844 | 
845 | #my_results[cols].plot.box(figsize=(10,4))
846 | #plt.show()
847 | 
848 | 
849 | # ## Get non-integer stats
850 | # e.g D0.1cc
851 | # - probably best to check the string passed and see if there is a decimal.
852 | #     - Then should be able to determine the method to use: https://groups.google.com/d/msg/dicompyler/EMnyhcEg4_Y/4P1wIcJ3AQAJ
853 | 
854 | # In[265]:
855 | 
856 | 
857 | # In[268]:
858 | 
859 | #abc = calc_dvh(ss,dose,3,get_dicom_dose(plan))
860 | 
861 | 
862 | # In[351]:
863 | 
864 | ## D90 (D90Gy is odd...? Does it even mean anything? Have I ever used it...?)
865 | #print(abc.D9)
866 | #print(abc.statistic('D9'))
867 | #print(abc.dose_constraint(9.6))
868 | #print('-')
869 | 
870 | #print(abc.D2cc)
871 | #print(abc.statistic('D2cc'))
872 | #print(abc.dose_constraint(2.2, volume_units='cc'))
873 | #print('-')
874 | 
875 | #print(abc.V50)
876 | #print(abc.statistic('V50'))
877 | #print(abc.volume_constraint(50.3))
878 | #print('-')
879 | 
880 | #print(abc.V50Gy)
881 | #print(abc.statistic('V50Gy'))
882 | #print(abc.volume_constraint(50, dose_units='Gy'))
883 | #print('-')
884 | 
885 | 
886 | # In[ ]:
887 | 
888 | #Dx = dose_constraint(x)
889 | #Dxcc = dose_constraint(x, volume_units='cc')
890 | #Vx = volume_constraint(x)
891 | #VxGy = volume_constraint(x, dose_units='Gy')
892 | 
893 | 
894 | # In[447]:
895 | 
896 | #abc.statistic('V50Gy')
897 | 
898 | 
899 | # In[ ]:
900 | 
901 | 
902 | 
903 | 


--------------------------------------------------------------------------------