├── requirements.txt ├── datasets └── toy_data.zip ├── source ├── begin_DTMIL.py ├── batch_visualization.py ├── dataset_formatter_cl.py ├── feature_ranking_analysis.py ├── requirements.txt ├── dtmil │ ├── configuration │ │ ├── DTMIL_config_dir.json │ │ ├── DTMIL_config_default.json │ │ └── config_dtmil.py │ ├── prediction_data.py │ ├── feature_ranking │ │ ├── ranking_window.py │ │ └── feature_ranking.py │ ├── utilities.py │ ├── data_container.py │ ├── model_container.py │ └── visualizations.py ├── simple_visualization.py └── guis │ ├── parameter_selector.py │ └── dataset_formatter.py ├── ADOPT Corporate CLA.pdf ├── ADOPT Individual CLA.pdf ├── documentation ├── readme.txt ├── Usage Guide.pdf ├── ADOPT Corporate CLA.pdf ├── ADOPT Individual CLA.pdf ├── feature_ranking_example.pdf ├── parameter_graph_example.pdf ├── ADOPT NASA Open Source Agreement.pdf ├── Anomalous_Test_ranking_data_0193_precursor_event_1.pdf └── config_readme.rtf ├── ADOPT NASA Open Source Agreement.pdf └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | matplotlib 3 | h5py 4 | numpy 5 | Keras 6 | tensorflow -------------------------------------------------------------------------------- /datasets/toy_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/datasets/toy_data.zip -------------------------------------------------------------------------------- /source/begin_DTMIL.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/begin_DTMIL.py -------------------------------------------------------------------------------- /ADOPT Corporate CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/ADOPT Corporate CLA.pdf -------------------------------------------------------------------------------- /ADOPT Individual CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/ADOPT Individual CLA.pdf -------------------------------------------------------------------------------- /documentation/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/readme.txt -------------------------------------------------------------------------------- /documentation/Usage Guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/Usage Guide.pdf -------------------------------------------------------------------------------- /source/batch_visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/batch_visualization.py -------------------------------------------------------------------------------- /source/dataset_formatter_cl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/dataset_formatter_cl.py -------------------------------------------------------------------------------- /ADOPT NASA Open Source Agreement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/ADOPT 
NASA Open Source Agreement.pdf -------------------------------------------------------------------------------- /source/feature_ranking_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/feature_ranking_analysis.py -------------------------------------------------------------------------------- /documentation/ADOPT Corporate CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/ADOPT Corporate CLA.pdf -------------------------------------------------------------------------------- /documentation/ADOPT Individual CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/ADOPT Individual CLA.pdf -------------------------------------------------------------------------------- /documentation/feature_ranking_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/feature_ranking_example.pdf -------------------------------------------------------------------------------- /documentation/parameter_graph_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/parameter_graph_example.pdf -------------------------------------------------------------------------------- /documentation/ADOPT NASA Open Source Agreement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/ADOPT NASA Open Source Agreement.pdf -------------------------------------------------------------------------------- /source/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.1.1 2 | pandas==0.25.1 3 | numpy==1.16.4 4 | Keras==2.3.0 5 | scipy==1.3.1 6 | h5py==2.9.0 7 | scikit_learn==0.21.3 8 | typing==3.7.4.1 9 | -------------------------------------------------------------------------------- /documentation/Anomalous_Test_ranking_data_0193_precursor_event_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/Anomalous_Test_ranking_data_0193_precursor_event_1.pdf -------------------------------------------------------------------------------- /source/dtmil/configuration/DTMIL_config_dir.json: -------------------------------------------------------------------------------- 1 | { 2 | "cache_file": "data/cached_data.h5", 3 | "datasets_directory": "../../../datasets/", 4 | "parameters_directory": "data/parameters/", 5 | "model_archive_directory": "misc/model_archive/", 6 | "model_storage_directory": "model_saves/", 7 | "raw_data_directory": "data/raw_data/", 8 | "selected_dataset":"selected_dataset.txt", 9 | "model_output_directory":"output/" 10 | } -------------------------------------------------------------------------------- /source/dtmil/configuration/DTMIL_config_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name":"run", 3 | "config_id":0, 4 | "id_hold":false, 5 | 6 | "training": 7 | { 8 | "batch_size": 32, 9 | "dr": 0, 10 | "epochs": 200, 11 | "lam": 0.01, 12 | "lr": 0.001, 13 | "nhd": 500, 14 | "nhr": 5, 15 | "seed": 0, 
16 | "pre_trained_model":"", 17 | "train_flag": 1, 18 | "pre_trained_json":"" 19 | }, 20 | 21 | "importing": 22 | { 23 | "nominal_filename":"filelist_nominal.txt", 24 | "adverse_filename": "filelist_adverse.txt", 25 | "holdout_percent": 0.1, 26 | "validation_percent":0.4, 27 | "state_cache": 0, 28 | "time_splice": 1 29 | }, 30 | 31 | "preprocessing": 32 | { 33 | "set_sample_length":null, 34 | "redundant_parameters": [], 35 | "drop_parameters":[], 36 | "all_parameter_names":[] 37 | }, 38 | 39 | "model_io": 40 | { 41 | "model_filename":"keras_model.h5", 42 | "model_container_filename":"model_container.pkl", 43 | "data_container_filename":"data.pkl" 44 | }, 45 | 46 | "visualization": 47 | { 48 | "binary_parameters":[], 49 | "guideline_type": 1, 50 | "precursor_threshold":0.5 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /source/simple_visualization.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | from enum import Enum 3 | from batch_visualization import Batch_Visualizer 4 | 5 | num_columns = 4 6 | 7 | def visualize_event(event_types,i_bad,i_good): 8 | if (Event_Type.Nominal in event_types): 9 | print("visualizing nominal") 10 | viz.save_sample_parameters(i_good,num_columns=num_columns) 11 | 12 | if (Event_Type.Anomalous in event_types): 13 | print("visualizing adverse") 14 | 15 | viz.save_sample_parameters(i_bad,num_columns=num_columns) 16 | 17 | 18 | class Dataset_Type(Enum): 19 | Train = 1 20 | Validation = 2 21 | Test = 3 22 | 23 | class Event_Type(Enum): 24 | Nominal = 1 25 | Anomalous = 2 26 | 27 | 28 | print(argv) 29 | #%% user defined variables 30 | 31 | if (len(argv) > 1): 32 | dataset_input = argv[1] 33 | 34 | else: 35 | dataset_input = input("Input the path of the dataset:\n") 36 | 37 | 38 | viz = Batch_Visualizer(dataset_input) 39 | input_val = "" 40 | 41 | while(input_val == ""): 42 | set_types = {1:"Train",2:"Validation",3:"Test"} 43 | 44 | 45 | prompt = ("Which part of the dataset would you like to visualize? If multiple, input the numbers separated by commas.\n\n" 46 | "1. Training Set\n" 47 | "2. Validation Set\n" 48 | "3. Test Set\n" 49 | "\n") 50 | 51 | set_nums = input(prompt) 52 | input_val = set_nums 53 | 54 | if (input_val == ""): 55 | print("no input selected, try again, or press control-c to exit\n") 56 | 57 | else: 58 | sets_list = [int(num) for num in set_nums.split(',')] 59 | 60 | dataset_types_list = [Dataset_Type(num) for num in sets_list] 61 | 62 | event_prompt = ("Which event would you like to visualize? If multiple, input the numbers separated by commas.\n\n" 63 | "1. Nominal\n" 64 | "2. 
Adverse\n" 65 | "\n") 66 | event_nums = input(event_prompt) 67 | event_list = [int(num) for num in event_nums.split(',')] 68 | event_types = [Event_Type(num) for num in event_list] 69 | 70 | 71 | 72 | print(dataset_types_list) 73 | if(Dataset_Type.Train in dataset_types_list): 74 | 75 | visualize_event(event_types,viz.myData.I_bad,viz.myData.I_opt) 76 | 77 | if(Dataset_Type.Validation in dataset_types_list): 78 | visualize_event(event_types,viz.myData.I_bad_valid,viz.myData.I_opt_valid) 79 | 80 | if(Dataset_Type.Test in dataset_types_list): 81 | visualize_event(event_types,viz.myData.I_bad_ho,viz.myData.I_opt_ho) 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /source/dtmil/prediction_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Apr 2 13:55:10 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | 11 | import numpy as np, matplotlib.pyplot as plt 12 | from keras import backend as T 13 | import time 14 | import os 15 | from dtmil.utilities import flat_avg 16 | from dtmil.model_container import ModelContainer 17 | from dtmil.data_container import DataContainer 18 | 19 | #%%class def 20 | 21 | 22 | class Prediction_Data: 23 | 24 | def __init__(self,myData:DataContainer,myModel:ModelContainer,sample_id:int = None, data_padding:bool = False, input_window = None): 25 | 26 | self.myData = myData 27 | self.myModel = myModel 28 | self.current_sample = sample_id 29 | 30 | #FIXME: Figure out what shape the input window will be. For now, it just assumes the same shape as the data sample (two indeces: [time,feature]) 31 | if input_window is not None: 32 | self.data_sample = input_window 33 | else: 34 | #TODO: make states and states_orig have the same "shape order" 35 | #both the arrays below are the same shape 36 | if sample_id is None: 37 | sample_id = 0 38 | print(f"no value provided for sample_id, setting to default value of {sample_id}") 39 | self.data_sample = myData.states[sample_id,:,:] 40 | 41 | self.data_length = len(self.data_sample) 42 | self.visualization_sample = myData.states_orig[:,sample_id,:] 43 | 44 | inst_layer_output_fn = T.function([myModel.model.layers[0].input],[myModel.model.layers[-2].output]) 45 | self.instance_layer_output_function = inst_layer_output_fn 46 | 47 | if(data_padding): 48 | self.pad_data() 49 | 50 | #self.pad_original_precursor_score() 51 | 52 | else: 53 | self.data_window = self.data_sample 54 | self.visualization_window = self.visualization_sample 55 | 56 | self.padded_sample = None 57 | self.padded_vis_sample = None 58 | 59 | self.start_index = 0 60 | self.end_index = self.data_length - 1 61 | 62 | self.update_predictions() 63 | 64 | def update_predictions(self): 65 | 66 | data_window = self.data_window 67 | data_length = len(data_window) 68 | num_features = len(data_window[0]) 69 | 70 | #TODO: get the states from myData if there isn't another type of input 71 | input_values=np.reshape(data_window,(1,data_length,num_features)) 72 | self.input_values = input_values 73 | 74 | # get instance probabilities (precursor score) 75 | L=self.instance_layer_output_function([input_values])[0] 76 | self.L = L 77 | 78 | self.precursor_score = L[0,:,0] 79 | 80 | # get precursor indeces 81 | #FIXME: Make this work with updating visualization params, or let the visualization module take it 82 | self.precursor_threshold = self.myData.json_data['visualization']["precursor_threshold"] 83 | 
self.precursor_indeces=np.where(self.precursor_score>self.precursor_threshold)[0] 84 | 85 | 86 | #This is only until we get actual streaming working 87 | def update_data_window(self,step_size = 1): 88 | 89 | new_start_index = self.start_index + step_size 90 | end_index = new_start_index + self.data_length 91 | 92 | if end_index >= len(self.padded_sample): 93 | #array would be out of bounds so we set it to the last value 94 | end_index = len(self.padded_sample) 95 | #new_start_index = end_index - self.data_length +1 96 | new_start_index = end_index - self.data_length 97 | 98 | 99 | self.start_index = new_start_index 100 | self.data_window = self.padded_sample[new_start_index:end_index] 101 | self.visualization_window = self.padded_vis_sample[new_start_index:end_index] 102 | 103 | #self.orig_prec_score_window = self.padded_orig_prec_score[new_start_index:end_index] 104 | 105 | self.update_predictions() 106 | 107 | #####TODO: Remove once demos are done 108 | 109 | def pad_data(self): 110 | 111 | data_sample = self.data_sample 112 | vis_sample = self.visualization_sample 113 | self.padded_sample, self.data_window = self.pad_sample(data_sample) 114 | self.padded_vis_sample, self.visualization_window = self.pad_sample(vis_sample) 115 | 116 | self.start_index = 0 117 | 118 | def pad_sample(self, sample): 119 | data_length = self.data_length 120 | pad_left = np.stack([sample[0]]*data_length) 121 | pad_right = np.stack([sample[-1]]*data_length) 122 | 123 | padded_sample = np.concatenate((pad_left,sample,pad_right)) 124 | start_index = 0 125 | #end_index = data_dlength - 1 126 | end_index = data_length 127 | 128 | data_window = padded_sample[start_index:end_index] 129 | 130 | return padded_sample, data_window 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ADOPT 2 | 3 | Although aviation accidents are rare, safety incidents occur more frequently and require a careful analysis to detect and mitigate risks in a timely manner. Analyzing safety incidents using operational data and producing event-based explanations is invaluable to airline companies as well as to governing organizations such as the Federal Aviation Administration (FAA) in the United States. However, this task is challenging because of the complexity involved in mining multi-dimensional heterogeneous time series data, the lack of time-step-wise annotation of events in a flight, and the lack of scalable tools to perform analysis over a large number of events. We propose a precursor mining algorithm: Automatic Discovery of Precursors in Time series data (ADOPT) that identifies events in the multidimensional time series that are correlated with the safety incident. Precursors are valuable to systems health and safety monitoring and in explaining and forecasting safety incidents. Current methods suffer from poor scalability to high dimensional time series data and are inefficient in capturing temporal behavior. We propose an approach by combining multiple-instance learning (MIL) and deep recurrent neural networks (DRNN) to take advantage of MIL's ability to learn using weakly supervised data and DRNN's ability to model temporal behavior. 4 | 5 | 6 | The objective of this project is to automate the analysis of flight safety incidents in a way that scales well and offers explanations. These explanations include: 7 | 8 | * When the degraded states start to appear? 
9 | * What are the degraded states? 10 | * What is the likelihood that the event will occur? 11 | * What corrective actions can be taken? 12 | 13 | This project aims to: 14 | 15 | * Create a novel deep temporal multiple-instance learning (DT-MIL) framework that combines multiple-instance learning with deep recurrent neural networks suitable for weakly-supervised learning problems involving time series or sequential data. 16 | * Provide a novel approach to explaining safety incidents using precursors mined from data. 17 | * Deliver a detailed evaluation of the DT-MIL model using real-world aviation data and a comparison with baseline models. 18 | * Perform a precursor analysis and explanation of a high-speed exceedance safety incident using flight data from a commercial airline. 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | This repository contains the following files in its top-level directory: 27 | 28 | * [source](source) 29 | The source code of the repository. This includes the ADOPT model, GUI configuration tools, and a command-line program that utilizes the model. 30 | 31 | * [documentation](documentation) 32 | Documents describing how to configure and run the program, as well as how to interpret the results. 33 | 34 | * [datasets](datasets) 35 | A directory containing a sample dataset. Other datasets may also be added here by the user. 36 | 37 | * [requirements.txt](requirements.txt) 38 | General module requirements for the program. A more specific requirements.txt can be found in [source](source). 39 | 40 | 41 | * [ADOPT NASA Open Source Agreement.pdf](ADOPT%20NASA%20Open%20Source%20Agreement.pdf) 42 | Licensing for ADOPT 43 | * [ADOPT Individual CLA.pdf](ADOPT%20Individual%20CLA.pdf) 44 | NASA Individual Contributor License Agreement 45 | * [ADOPT Corporate CLA.pdf](ADOPT%20Corporate%20CLA.pdf) 46 | NASA Corporate Contributor License Agreement 47 | 48 | 49 | 50 | 51 | ## Contact Info 52 | 53 | NASA Point of contact: Nikunj Oza, Data Science Group Lead. 54 | 55 | For questions regarding the research and development of the algorithm, please contact Bryan Matthews, Senior Research Engineer. 56 | 57 | For questions regarding the source code, please contact Daniel Weckler, Software Engineer. 58 | 59 | 60 | ## Copyright and Notices 61 | 62 | Notices: 63 | 64 | Copyright © 2019 United States Government as represented by the Administrator of the National Aeronautics and Space Administration. All Rights Reserved. 65 | 66 | Disclaimers 67 | 68 | No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION, IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE, AND DISTRIBUTES IT "AS IS." 69 | 70 | Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. 
IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT. 71 | 72 | -------------------------------------------------------------------------------- /source/dtmil/feature_ranking/ranking_window.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Apr 16 19:42:33 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | from typing import List 11 | import numpy as np 12 | from dtmil.prediction_data import Prediction_Data 13 | 14 | class Parameter_Score_Window: 15 | 16 | def __init__(self, start_idx:List[int],end_idx:List[int], parent_group, disturbed_parameter:int): 17 | self.prediction_data:Prediction_Data = parent_group.prediction_data 18 | self.sd_disturbances:List[int] = parent_group.parent.standard_deviation_disturbances 19 | self.disturbed_parameter:int = disturbed_parameter 20 | self.start_indeces:List[int] = start_idx 21 | self.end_indeces:List[int] = end_idx 22 | self.modified_precursor_scores:List[float] 23 | self.subwindows:List[Precursor_Event_Window] 24 | self.parent_group = parent_group 25 | 26 | precursor_score = self.prediction_data.precursor_score 27 | window_count = len(start_idx) 28 | 29 | if window_count == 0: 30 | self.modified_precursor_scores = [self.prediction_data.precursor_score] 31 | window = Precursor_Event_Window(precursor_score,None,self) 32 | self.subwindows = [window] 33 | 34 | else: 35 | self.__disturb_parameters() 36 | 37 | subwindows = [] 38 | for i in range(window_count): 39 | start = start_idx[i] 40 | end = end_idx[i] 41 | score_window = self.prediction_data.precursor_score[start:end] 42 | modified_windows = [window[start:end] for window in self.modified_precursor_scores] 43 | subwindows.append(Precursor_Event_Window(score_window,modified_windows,self)) 44 | 45 | self.subwindows = subwindows 46 | 47 | 48 | def __disturb_parameters(self): 49 | 50 | param_list = self.prediction_data.myData.parameter_selection.tolist() 51 | self.modified_precursor_scores = [] 52 | 53 | for standard_deviation_scale in self.sd_disturbances: 54 | modified_input_data = np.copy(self.prediction_data.input_values) 55 | 56 | i = param_list.index(self.disturbed_parameter) 57 | 58 | singleFeature = modified_input_data[:,:,i] 59 | standard_dev = np.std(singleFeature) * standard_deviation_scale 60 | singleFeature += standard_dev 61 | modified_input_data[:,:,i] = singleFeature 62 | 63 | L=self.prediction_data.instance_layer_output_function([modified_input_data])[0] 64 | modified_precursor_score = L[0,:,0].tolist() 65 | self.modified_precursor_scores.append(modified_precursor_score) 66 | 67 | @property 68 | def most_negative_percent_differences(self): 69 | return [abs(sw.most_negative_percent_diff) for sw in self.subwindows] 70 | 71 | @property 72 | def most_important_sd_responses(self): 73 | return [abs(sw.most_important_sd_response) for sw in self.subwindows] 74 | 75 | 76 | 77 | class Precursor_Event_Window: 78 | 79 | def __init__(self,precursor_score_window:List[float],modified_score_windows:List[List[float]], 
parent_window:Parameter_Score_Window): 80 | self.precursor_score_window = precursor_score_window 81 | self.modified_score_windows = modified_score_windows 82 | self.parent_window = parent_window 83 | 84 | self.most_negative_percent_diff:float 85 | 86 | if modified_score_windows is None: 87 | self.most_negative_percent_diff = 0 88 | self.most_important_sd_response = None 89 | 90 | else: 91 | self.__compare_precursor_scores() 92 | 93 | #compare all the scores with each SD disturbance to see which suppresses the precursor score the most 94 | def __compare_precursor_scores(self): 95 | 96 | percent_differences = [] 97 | precursor_window = self.precursor_score_window 98 | 99 | if len(precursor_window) == 1: 100 | integrate = np.mean 101 | else: 102 | integrate = np.trapz 103 | 104 | avgDefault = integrate(precursor_window) 105 | 106 | for modified_window in self.modified_score_windows: 107 | 108 | avgCurrent = integrate(modified_window) 109 | 110 | diff_percent = (avgDefault-avgCurrent)/(avgDefault)*100 111 | percent_differences.append(diff_percent) 112 | 113 | most_negative_diff = 0 114 | most_important_sd_response = None 115 | 116 | 117 | for i,percent_diff in enumerate(percent_differences): 118 | if percent_diff > most_negative_diff: 119 | most_negative_diff = percent_diff 120 | most_important_sd_response = self.parent_window.sd_disturbances[i] 121 | 122 | self.most_negative_percent_diff = most_negative_diff 123 | self.most_important_sd_response = most_important_sd_response 124 | 125 | 126 | @property 127 | def ranking_score(self): 128 | return self.most_negative_percent_diff 129 | 130 | @property 131 | def attribute_index(self): 132 | return self.parent_window.disturbed_parameter 133 | 134 | @property 135 | def attribute_label(self): 136 | return self.parent_window.prediction_data.myData.param_index_to_label(self.attribute_index) 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /source/dtmil/utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 19 12:34:58 2018 5 | 6 | @author: dweckler 7 | """ 8 | ''' 9 | @author: vjanakir 10 | This is the code for deep temporal multiple instance learning (DTMIL). This is the version of ADOPT that is based on deep learning. 11 | The code assumes Keras with a Theano or TensorFlow backend. 12 | It uses an Anaconda virtual environment with Python 2.7 and Keras. It should also work in Python 3.x, but this has not been tested. 13 | ''' 14 | # load python libraries 15 | import numpy as np, math 16 | from keras.engine.topology import Layer, InputSpec 17 | from keras import backend as T 18 | import pickle 19 | import os 20 | from scipy.integrate import trapz 21 | 22 | 23 | 24 | source_path = os.path.dirname(os.path.realpath(__file__)) 25 | 26 | 27 | #%% custom model functions 28 | 29 | def sigmoid(x,decay,bias): 30 | a = [] 31 | for item in x: 32 | a.append(1/(1+math.exp(-decay*(item-bias)))) 33 | return a 34 | 35 | def get_weight_fn(maxlen): 36 | temp=0.1+np.array(sigmoid(np.arange(maxlen).tolist(),decay=0.1,bias=70)) 37 | temp=temp/np.sum(temp) 38 | return temp 39 | #plt.plot(get_weight_fn(100)) 40 | 41 | class aggregationLayer(Layer): 42 | """ 43 | This is a custom Keras layer. This pooling layer accepts the temporal 44 | sequence output by a recurrent layer and performs multiple instance pooling, 45 | looking at only the non-masked portion of the sequence. 
The pooling 46 | layer converts the instance probabilities (same length as input sequence) into a bag-level probability. 47 | 48 | input shape: (nb_samples, nb_timesteps, nb_features) 49 | output shape: (nb_samples, 1) 50 | """ 51 | def __init__(self, **kwargs): 52 | super(aggregationLayer, self).__init__(**kwargs) 53 | self.supports_masking = True 54 | self.input_spec = [InputSpec(ndim=3)] 55 | 56 | def get_output_shape_for(self, input_shape): 57 | return (input_shape[0], input_shape[2]) 58 | 59 | def call(self, x, mask=None): 60 | if mask is None: 61 | mask = T.mean(T.ones_like(x), axis=-1) 62 | mask = T.cast(mask,T.floatx()) 63 | 64 | dr_perc=0.5 65 | mask1=T.dropout(mask,level=dr_perc) 66 | mask1=T.clip(mask1, 0, 1) 67 | 68 | mod_smax=T.max(x[:,:,0]*mask1,axis=1).dimshuffle(0,'x') 69 | smax = T.max(x[:,:,0]*mask,axis=1).dimshuffle(0,'x') #(nb_samples, np_features) 70 | smin = T.min(x[:,:,0]*mask,axis=1).dimshuffle(0,'x') #(nb_samples, np_features) 71 | 72 | # mod_smax=T.expand_dims(T.max(x[:,:,0]*mask1,axis=1), 1) 73 | # smax = T.expand_dims(T.max(x[:,:,0]*mask,axis=1), 1) #(nb_samples, np_features) 74 | # smin = T.expand_dims(T.min(x[:,:,0]*mask,axis=1), 1) #(nb_samples, np_features) 75 | 76 | x_rounded=x[:,:,0]*mask 77 | sum_unmasked=T.batch_dot(x_rounded,mask,axes=1) # (nb_samples,np_features) 78 | 79 | ssum = T.sum(x,axis=-2) #(nb_samples, np_features) 80 | rcnt = T.sum(mask,axis=-1,keepdims=True) #(nb_samples) # number of unmasked samples in each record 81 | bag_label=sum_unmasked/rcnt 82 | smean=ssum/rcnt 83 | 84 | # # sigmoid weighted mean: 85 | # weight_fn=T.reshape(T.transpose(T.tile(T.reshape(T.variable(get_weight_fn(100)),(100,1)),T.shape(x)[0])),(T.shape(x)[0],T.shape(x)[1],1)) 86 | # weighted_x=weight_fn*x 87 | # wsum=T.sum(weighted_x,axis=-2) #(nb_samples, np_features) 88 | ## weight_sum=T.reshape(T.batch_dot(T.ones_like(x),weight_fn,axes=1),T.shape(rcnt)) # used T.ones_like(x) instead of x to check if I am seeing the outputs..which helped me debug 89 | # wmean=wsum # because the weights are normalized 90 | 91 | # sofmax=(1/largeNum)*T.log(T.sum(T.exp())) 92 | 93 | # return bag_label 94 | return smax # max voting 95 | # return smin # min voting 96 | # return smean # temporal mean pooling 97 | # return wmean # sigmoid weighted mean 98 | # return sofmax 99 | # return mod_smax 100 | 101 | def compute_mask(self, input, mask): 102 | return None 103 | 104 | 105 | 106 | def get_auc(ytest, ytest_prob): 107 | tau_mat=np.arange(0,1.01,0.01) 108 | TPR=np.zeros(len(tau_mat),) 109 | FPR=np.ones(len(tau_mat),) 110 | for i in np.arange(len(tau_mat)): 111 | tau=tau_mat[i] 112 | ytest_pred=np.zeros(ytest_prob.shape) 113 | ytest_pred[ytest_prob>tau]=1 114 | posIdx=np.where(ytest==1)[0] 115 | TPR[i]=len(np.where(ytest_pred[posIdx]==1)[0])/float(len(posIdx)) 116 | negIdx=np.where(ytest==0)[0] 117 | FPR[i]=len(np.where(ytest_pred[negIdx]==1)[0])/float(len(negIdx)) 118 | auc_bag=abs(trapz(TPR,FPR)) 119 | return auc_bag 120 | 121 | 122 | #save a file to a specified directory 123 | def save_something(stuffToSave,filename): 124 | with open ('{}'.format(filename),'wb') as output: 125 | pickle.dump(stuffToSave,output, pickle.HIGHEST_PROTOCOL) 126 | 127 | #load a file from a specified directory 128 | def load_something(filename): 129 | with open ('{}'.format(filename),'rb') as inFile: 130 | return pickle.load(inFile) 131 | 132 | #grab labels from indeces 133 | def get_labels_from_indeces(label_indeces,label_strings): 134 | ordered_label_strings = np.asarray([label_strings[p] for p in label_indeces]) 
135 | 136 | if isinstance(label_indeces, list): 137 | ordered_label_strings = ordered_label_strings.tolist() 138 | 139 | return ordered_label_strings 140 | 141 | #dual option for multi-sort 142 | def dual_sort(myList, side_list,absolute_value = True,reverse = False): 143 | sorted_list, side_lists = multi_sort(myList, [side_list],absolute_value,reverse) 144 | 145 | return sorted_list, side_lists[0] 146 | 147 | 148 | #easily sort multiple arrays at once 149 | def multi_sort(myList,side_lists,absolute_value = True,reverse = False): 150 | #preprocess and get our sort arrays 151 | myArray = np.asarray(myList) 152 | if (absolute_value): 153 | myArray = np.absolute(myArray) 154 | sorted_indeces = np.argsort(myArray) 155 | 156 | ##main array sort 157 | sorted_array = myArray[sorted_indeces] 158 | if (reverse): 159 | sorted_array = np.flip(sorted_array, axis = 0) 160 | 161 | #sort everything else according to main array 162 | sorted_side_arrays = [] 163 | for sList in side_lists: 164 | sorted_arr = np.asarray(sList)[sorted_indeces] 165 | if(reverse): 166 | sorted_arr = np.flip(sorted_arr,axis=0) 167 | 168 | sorted_side_arrays.append(sorted_arr) 169 | 170 | return(sorted_array,sorted_side_arrays) 171 | 172 | 173 | def flat_avg(avg_array): 174 | flat_mean = np.mean(avg_array,axis = 0) 175 | mean_list = [] 176 | arr_size = avg_array.shape[0] 177 | for mean_val in flat_mean: 178 | new_arr = np.full(arr_size,mean_val) 179 | mean_list.append(new_arr) 180 | 181 | avg_guideline = np.array(mean_list) 182 | return avg_guideline.swapaxes(0,1) 183 | 184 | -------------------------------------------------------------------------------- /source/guis/parameter_selector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from tkinter import Tk, Listbox, Grid, Label, END,filedialog, Scrollbar, VERTICAL, RIGHT, Y, EXTENDED 5 | from tkinter.ttk import Frame, Button, Entry, Style 6 | 7 | from tkinter.messagebox import showinfo, showerror 8 | import os 9 | import sys 10 | import json 11 | 12 | sep = os.path.sep 13 | source_path = "..{}dtmil{}configuration{}".format(sep,sep,sep) 14 | sys.path.append(source_path) 15 | #from config_dtmil import get_json_config_data 16 | 17 | def exit_program(): 18 | root.destroy() 19 | exit() 20 | 21 | 22 | def read_lines_from_file(directory,filename): 23 | 24 | with open(os.path.join(directory,filename),'r') as f: 25 | content = f.readlines() 26 | content = [x.strip() for x in content] 27 | 28 | return content 29 | 30 | def write_lines_to_file(list_of_text,directory,filename): 31 | filepath = os.path.join(directory,filename) 32 | # print(f"Filepath: {filepath}") 33 | with open(filepath,'w') as f: 34 | for text in list_of_text: 35 | f.write("{}\n".format(text)) 36 | 37 | 38 | class ParameterSelector: 39 | 40 | def __init__(self,master): 41 | self.master = master 42 | sel_col = 0 43 | hold_col = 1 44 | 45 | lbl_row = 0 46 | lst_row = 1 47 | btn_row = 2 48 | save_row = 3 49 | 50 | self.edited = False 51 | 52 | scale_factor = 0.5 53 | window_width = int(root.winfo_screenwidth()*scale_factor) 54 | window_height = int(root.winfo_screenheight()*scale_factor) 55 | 56 | self.master.geometry(f"{window_width}x{window_height}") 57 | 58 | frame = Frame(master) 59 | 60 | Grid.columnconfigure(master,sel_col,weight=1) 61 | Grid.columnconfigure(master,hold_col,weight=1) 62 | 63 | Grid.rowconfigure(master, lst_row, weight=1) 64 | 65 | frame.grid() 66 | 67 | 68 | self.full_listbox = Listbox(master,selectmode = 
EXTENDED, width = 20) 69 | scrollbar = Scrollbar(self.full_listbox, orient=VERTICAL) 70 | self.full_listbox.config(yscrollcommand=scrollbar.set) 71 | scrollbar.config(command=self.full_listbox.yview) 72 | scrollbar.pack(side=RIGHT, fill=Y) 73 | self.full_listbox.grid(row=lst_row,column=sel_col,padx=(20,20),pady=(5,10),sticky = 'news') 74 | 75 | self.holdout_listbox = Listbox(master,selectmode = EXTENDED, width = 20) 76 | scrollbar = Scrollbar(self.holdout_listbox, orient=VERTICAL) 77 | self.holdout_listbox.config(yscrollcommand = scrollbar.set) 78 | scrollbar.config(command=self.holdout_listbox.yview) 79 | scrollbar.pack(side=RIGHT, fill=Y) 80 | self.holdout_listbox.grid(row=lst_row,column=hold_col,padx=(20,20),pady=(5,10),sticky = 'news') 81 | 82 | 83 | self.selected_param_label = Label(master,text = "Selected Parameters") 84 | self.selected_param_label.grid(row = lbl_row, column = sel_col) 85 | 86 | self.holdout_param_label = Label(master,text = "Holdout Parameters") 87 | self.holdout_param_label.grid(row = lbl_row,column = hold_col) 88 | 89 | self.selected_to_holdout_button = Button(master,text= "->", command = self.move_to_holdout) 90 | self.selected_to_holdout_button.grid(row= btn_row,column = sel_col,padx=(40,40),sticky = 'ew') 91 | 92 | self.holdout_to_selected_button = Button(master, text= "<-", command = self.move_to_selected) 93 | self.holdout_to_selected_button.grid(row= btn_row,column = hold_col,padx=(40,40),pady=(5,10),sticky = 'ew') 94 | 95 | self.save_button = Button(master,text = "Save", command = self.save_lists) 96 | self.save_button.grid(row = save_row, column = sel_col,pady=(5,10)) 97 | 98 | self.reset_button = Button(master,text= "Reset", command = self.config_listboxes) 99 | self.reset_button.grid(row = save_row,column = hold_col) 100 | 101 | self.dataset_dir = "" 102 | self.params_path = "" 103 | 104 | 105 | def select_dataset(self): 106 | #FIXME: Add an exception handler just in case the file isn't found 107 | 108 | #get DTMIL_config_dir.json 109 | directory_config_file = "DTMIL_config_dir.json" 110 | config_file = os.path.join(source_path, directory_config_file) 111 | 112 | with open(config_file, 'r') as dirfile: 113 | self.dir_data = json.load(dirfile) 114 | dirfile.close() 115 | 116 | 117 | #select the dataset directory 118 | self.dataset_dir = filedialog.askdirectory(title = "Choose Dataset Folder") 119 | 120 | if self.dataset_dir == "": 121 | exit_program() 122 | 123 | self.params_path = os.path.join(self.dataset_dir,self.dir_data["parameters_directory"]) 124 | 125 | self.dataset_cfg_filepath = os.path.join(self.dataset_dir,"DTMIL_config.json") 126 | 127 | with open(self.dataset_cfg_filepath) as cfg_file: 128 | 129 | self.dataset_config_file = json.load(cfg_file) 130 | cfg_file.close() 131 | 132 | 133 | def config_listboxes(self): 134 | 135 | self.edited = False 136 | 137 | #get the holdout state names + the selected variables 138 | 139 | ##load here from the config file 140 | preprocessing_parameters = self.dataset_config_file['preprocessing'] 141 | 142 | all_params = set(preprocessing_parameters["all_parameter_names"]) 143 | holdout_params = set(preprocessing_parameters["redundant_parameters"]) 144 | selected_params = all_params - holdout_params 145 | 146 | 147 | 148 | sel_list = sorted(list(selected_params)) 149 | hold_list = sorted(list(holdout_params)) 150 | 151 | #make sure the listboxes are clear 152 | self.full_listbox.delete(0,END) 153 | self.holdout_listbox.delete(0,END) 154 | 155 | self.full_listbox.insert(END,*sel_list) 156 | 
self.holdout_listbox.insert(END,*hold_list) 157 | 158 | #TODO: check with the dataset and verify that all the parameters are the same. 159 | # If there are missing ones, add them, if there is an extra in the set, throw and error, maybe give an option to delete it 160 | 161 | 162 | 163 | def after_startup_setup(self): 164 | self.select_dataset() 165 | self.config_listboxes() 166 | 167 | 168 | 169 | def move_to_holdout(self): 170 | self.move_listbox(self.full_listbox, self.holdout_listbox) 171 | 172 | 173 | def move_to_selected(self): 174 | self.move_listbox(self.holdout_listbox, self.full_listbox) 175 | 176 | 177 | def move_listbox(self,listbox_source, listbox_destination): 178 | self.edited = True 179 | 180 | print(listbox_source.curselection()) 181 | selected_idx = list(listbox_source.curselection()) 182 | 183 | selected_values = [] 184 | for i in selected_idx: 185 | selected_values.append(listbox_source.get(i)) 186 | 187 | for i in selected_idx[::-1]: 188 | listbox_source.delete(i) 189 | 190 | print(selected_values) 191 | listbox_destination.insert(END,*selected_values) 192 | 193 | def save_lists(self): 194 | #output lists to file 195 | if(not self.edited): 196 | showerror(title= "Error",message= "No changes made!") 197 | return 198 | 199 | #TODO: Have it save to the json file instead 200 | 201 | print("saving lists") 202 | selected_param_list = sorted(self.full_listbox.get(0,END)) 203 | holdout_param_list = sorted(self.holdout_listbox.get(0,END)) 204 | print(selected_param_list,holdout_param_list) 205 | 206 | ##write these to the files and overwrite 207 | 208 | 209 | preprocessing_parameters = self.dataset_config_file['preprocessing'] 210 | 211 | preprocessing_parameters["redundant_parameters"] = holdout_param_list 212 | 213 | json_cfg_string = json.dumps(self.dataset_config_file,sort_keys=True, indent=4, separators=(',', ': ')) 214 | 215 | with open(os.path.join(self.dataset_dir,"DTMIL_config.json"),'w') as outfile: 216 | outfile.write(json_cfg_string) 217 | outfile.close() 218 | 219 | showinfo("Info", "Successfully saved files\n" + "Sorting both lists") 220 | self.config_listboxes() 221 | 222 | 223 | 224 | 225 | 226 | root = Tk() 227 | root.title("Parameter Selector") 228 | 229 | param_sel = ParameterSelector(root) 230 | 231 | root.after(10,param_sel.after_startup_setup) 232 | 233 | root.mainloop() 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /source/dtmil/feature_ranking/feature_ranking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Apr 18 11:41:04 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | from dtmil.utilities import dual_sort, get_labels_from_indeces 11 | from dtmil.visualizations import Visualizer 12 | 13 | 14 | from typing import List 15 | import numpy as np 16 | from dtmil.prediction_data import Prediction_Data 17 | from dtmil.data_container import DataContainer 18 | from dtmil.model_container import ModelContainer 19 | from dtmil.feature_ranking.ranking_window import Parameter_Score_Window 20 | import pandas as pd 21 | 22 | #%% Feature Ranking Class 23 | #TODO: ADD Placehodler data window padding options 24 | class Feature_Ranking: 25 | 26 | def __init__(self, data_ID_list:List[int], myData:DataContainer, myModel:ModelContainer, standard_deviation_disturbances:List[int] = [-2,2]): 27 | 28 | self.data_ID_list: List[int] = data_ID_list 29 | self.myData: DataContainer = myData 30 | self.myModel: 
ModelContainer = myModel 31 | self.standard_deviation_disturbances: List[int] = standard_deviation_disturbances 32 | self.ranking_group_list: List[Ranking_Group] = [] 33 | 34 | #create a ranking group for each data ID. 35 | for i,idx in enumerate(data_ID_list): 36 | 37 | print (f" progress:{i+1}/{len(data_ID_list)}", end="\r") 38 | 39 | ranking_group = Ranking_Group(idx,standard_deviation_disturbances,self) 40 | self.ranking_group_list.append(ranking_group) 41 | 42 | print("\n") 43 | 44 | def get_ranking_scores(self, attribute_type = 'label', top_number_of_features:int = None): 45 | feature_scores_list = [] 46 | 47 | for group in self.ranking_group_list: 48 | score_lists = group.all_ranking_scores 49 | #possibly add weights for rankings here 50 | for score_list in score_lists: 51 | feature_scores_list.append(score_list) 52 | 53 | #check to make sure we have at least one array 54 | if len(feature_scores_list) == 0: 55 | print("no feature scores!") 56 | return 57 | 58 | attributeIdx = self.ranking_group_list[0].parameter_list 59 | attributeSum = [sum(x)/len(feature_scores_list) for x in zip(*feature_scores_list)] 60 | sorted_sums, sorted_attributes = dual_sort(attributeSum,attributeIdx,reverse = True ) 61 | 62 | 63 | if attribute_type == 'label': 64 | #TODO: replace this with 65 | sorted_attributes = get_labels_from_indeces(sorted_attributes,self.myData.header) 66 | elif attribute_type == 'index': 67 | pass #since it's set to index by default 68 | else: 69 | print(f"invalid attribute type \"{attribute_type}\" specified, using \"index\" instead") 70 | 71 | if top_number_of_features is not None: 72 | sorted_sums = sorted_sums[:top_number_of_features] 73 | sorted_attributes = sorted_attributes[:top_number_of_features] 74 | 75 | return sorted_sums, sorted_attributes 76 | 77 | #TODO: expand this and the function it calls 78 | def export_graphs(self, top_number_of_features:int = None): 79 | #"default is none" 80 | parameter_selection = None 81 | if top_number_of_features is not None: 82 | sorted_ranking_sums, sorted_ranking_attributes = self.get_ranking_scores("index",top_number_of_features) 83 | parameter_selection = sorted_ranking_attributes 84 | 85 | vis = Visualizer(self.myData,self.myModel) 86 | for feature_group in self.ranking_group_list: 87 | vis.visualize_ranking_data(feature_group, parameter_selection = parameter_selection) 88 | 89 | def batch_output(self): 90 | sorted_ranking_sums, sorted_ranking_attributes = self.get_ranking_scores("index",6) 91 | 92 | for feature_group in self.ranking_group_list: 93 | vis = Visualizer(self.myData,self.myModel,feature_group.data_ID) 94 | 95 | vis.special_ranking_visualization(sorted_ranking_attributes,sorted_ranking_sums) 96 | 97 | 98 | #TODO: implement previous ranking features 99 | class Ranking_Group: 100 | 101 | def __init__(self,data_ID:int,standard_deviation_disturbances:List[int],parent:Feature_Ranking): 102 | 103 | self.data_ID:int = data_ID 104 | self.parent:Feature_Ranking = parent 105 | #self.default_feature:Sample_With_Disturbance = None 106 | self.parameter_list:List[int] = self.parent.myData.parameter_selection.tolist() 107 | self.parameter_windows:List[Parameter_Score_Window] 108 | self.score_weights:List[float] 109 | 110 | self.prediction_data:Prediction_Data = Prediction_Data(self.parent.myData,self.parent.myModel,data_ID) 111 | self.__define_window_region() 112 | self.__generate_parameter_windows() 113 | 114 | 115 | def __define_window_region(self): 116 | precursor_scores = self.prediction_data.precursor_score 117 | 118 | #TODO: apply 
smoothing to the precursor scores for graphs that are not as consistent 119 | threshold_list= np.array([i>0.5 for i in precursor_scores]) 120 | tl_padded = np.r_[False,threshold_list, False] 121 | # Get indices of shifts, which represent the start and stop indices 122 | shift_idx = np.flatnonzero(tl_padded[:-1] != tl_padded[1:]) 123 | 124 | # Get the start and stop indeces for all the windows 125 | 126 | #TODO: end_idx goes out of bounds if the graph ends on the precursor score. This only impacts graphing (and marginally at that), but should be fixed eventually 127 | self.start_idx:List[int] = shift_idx[:-1:2] 128 | self.end_idx:List[int] = shift_idx[1::2] 129 | 130 | def __generate_parameter_windows(self): 131 | 132 | self.parameter_windows = [] 133 | for parameter in self.parameter_list: 134 | windows = Parameter_Score_Window(self.start_idx,self.end_idx,self, parameter) 135 | self.parameter_windows.append(windows) 136 | 137 | 138 | 139 | def display_ranking_scores(self, num_scores= None): 140 | 141 | parameter_response_windows_list = self.ordered_response_windows_list 142 | print(type) 143 | num_windows = len(parameter_response_windows_list) 144 | for index, _response_windows in enumerate(parameter_response_windows_list): 145 | 146 | print("Window {} of {}".format(index+1,num_windows)) 147 | 148 | response_windows = _response_windows[:num_scores] 149 | 150 | scores = [window.ranking_score for window in response_windows] 151 | attribute_labels = [window.attribute_label for window in response_windows] 152 | 153 | df = pd.DataFrame(list(zip(attribute_labels,scores)),columns = ["Attribute", "Score"]) 154 | print(df.to_string(index=False)) 155 | print("\n") 156 | 157 | # @property 158 | # def ranking_scores(self): 159 | # return [abs(window.most_negative_percent_diff) for window in self.parameter_windows] 160 | 161 | def top_response_windows(self,percent_cutoff = None): 162 | 163 | #TODO: add this to the config file 164 | 165 | if percent_cutoff is None: 166 | percent_cutoff = 0.4 167 | 168 | print("percent cutoff",percent_cutoff) 169 | top_response_windows = [] 170 | for response_windows in self.ordered_response_windows_list: 171 | 172 | sorted_scores = [window.ranking_score for window in response_windows] 173 | print("sorted score length",len(sorted_scores)) 174 | 175 | score_sum = np.sum(sorted_scores) 176 | cutoff_sum = percent_cutoff*score_sum 177 | #print(cutoff_sum) 178 | partial_sum = 0 179 | 180 | 181 | cutoff_index = 0 182 | for index, score in enumerate(sorted_scores): 183 | partial_sum += score 184 | if partial_sum >= cutoff_sum: 185 | cutoff_index = index 186 | break 187 | 188 | top_windows = response_windows[:cutoff_index] 189 | top_response_windows.append(top_windows) 190 | 191 | return top_response_windows 192 | 193 | 194 | 195 | @property 196 | def all_ranking_scores(self): 197 | rs = np.array([np.array(window.most_negative_percent_differences) for window in self.parameter_windows]) 198 | 199 | return np.swapaxes(rs,0,1) 200 | 201 | 202 | @property 203 | def all_subwindows(self): 204 | sw = np.array([np.array(window.subwindows) for window in self.parameter_windows]) 205 | #swap the axes so we select by subwindow rather than parameter 206 | return np.swapaxes(sw,0,1) 207 | 208 | 209 | @property 210 | def ordered_response_windows_list(self): 211 | subwindows = self.all_subwindows 212 | 213 | parameter_windows_lists = [] 214 | for param_windows in subwindows: 215 | #sort each set of subwindows by their ranking score 216 | sorted_parameter_windows_list = 
sorted(param_windows,key=lambda subwindow: subwindow.ranking_score,reverse = True) 217 | 218 | parameter_windows_lists.append(sorted_parameter_windows_list) 219 | 220 | #return each subwindow list, ranked by the parameter response 221 | return parameter_windows_lists 222 | 223 | @property 224 | def window_count(self): 225 | return len(self.start_idx) 226 | 227 | 228 | 229 | 230 | 231 | 232 | -------------------------------------------------------------------------------- /source/dtmil/configuration/config_dtmil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 21 22:56:09 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | 11 | import numpy as np, math 12 | from keras.engine.topology import Layer, InputSpec 13 | from keras import backend as T 14 | import json 15 | import pickle 16 | import sys 17 | import os 18 | 19 | 20 | 21 | source_path = os.path.dirname(os.path.realpath(__file__)) 22 | directory_config_filename = "DTMIL_config_dir.json" 23 | 24 | 25 | def get_json_config_data(input_directory = None, new_run = False): 26 | 27 | dir_config_file_path = os.path.join(source_path, directory_config_filename) 28 | 29 | with open(dir_config_file_path, 'r') as dirfile: 30 | dir_data = json.load(dirfile) 31 | 32 | ##make sure the defined separators of the directory config file match that of the os 33 | for key,val in dir_data.items(): 34 | new_val = val.replace("/",os.path.sep) 35 | dir_data[key] = new_val 36 | 37 | ####Load a dataset from a specified directory 38 | if(input_directory): 39 | cfg_data, cfg_file_path, dataset_dir = get_from_input_directory(input_directory,dir_data) 40 | 41 | #get from "datasets" folder 42 | else: 43 | cfg_data, cfg_file_path, dataset_dir = get_from_datasets_directory(dir_data) 44 | 45 | #make sure the json file is up to date 46 | update_JSON(cfg_data,cfg_file_path) 47 | 48 | 49 | cfg_id_label = "config_id" 50 | name_label = "config_name" 51 | 52 | config_name = cfg_data[name_label] 53 | id_hold = cfg_data["id_hold"] 54 | config_id = cfg_data[cfg_id_label] 55 | 56 | if (new_run and not id_hold): 57 | #increment for this new run 58 | config_id += 1 59 | cfg_data[cfg_id_label] = config_id 60 | save_JSON(cfg_data,cfg_file_path) 61 | 62 | 63 | full_config_name = "{}_{}".format(config_name,config_id) 64 | 65 | 66 | #set up the model storage and model output directories and check to see if the directory was there before 67 | directory_existed_already = __get_directory_with_ID(dir_data,dataset_dir,"model_storage_directory",full_config_name) 68 | __get_directory_with_ID(dir_data,dataset_dir,"model_output_directory",full_config_name) 69 | 70 | #TODO: determine if we want this warning if hold is set 71 | #check to make sure we don't overwrite an existing run 72 | if(new_run and directory_existed_already): 73 | 74 | if(id_hold): 75 | print("ID hold is ON") 76 | 77 | choice = input("Existing run (\"{}\") already found, do you wish to overwrite? 
(y/n)\n".format(full_config_name)) 78 | 79 | if choice == 'y': 80 | print("Overwriting existing run") 81 | 82 | else: 83 | if choice != 'n': 84 | print("Invalid input") 85 | 86 | output_string = ("If you do not wish to overwrite the existing run, change the \"{}\" field in the JSON file " 87 | "to a number greater than or equal to the current run.\n" 88 | "Alternatively, change the \"{}\" field to a name that doesn't conflict\n".format(cfg_id_label,name_label)) 89 | 90 | print(output_string) 91 | 92 | print("Exiting program...") 93 | sys.exit(0) 94 | 95 | 96 | 97 | return dir_data, cfg_data, dataset_dir 98 | 99 | 100 | 101 | def get_from_input_directory(input_directory,dir_data): 102 | 103 | if(input_directory.endswith(".json")): 104 | json_name = os.path.basename(input_directory) 105 | cfg_file_path = input_directory 106 | dataset_dir = os.path.dirname(input_directory) 107 | 108 | else: 109 | json_name = 'DTMIL_config.json' 110 | cfg_file_path = os.path.join(input_directory,json_name) 111 | dataset_dir = input_directory 112 | 113 | with open(cfg_file_path) as cfgfile: 114 | cfg_data = json.load(cfgfile) 115 | 116 | 117 | datasets_dir = os.path.abspath(os.path.join(source_path, dir_data["datasets_directory"])) 118 | filename = dir_data["selected_dataset"] 119 | file_dir = os.path.join(datasets_dir, filename) 120 | 121 | 122 | with open (file_dir, 'w') as selected_dataset_file: 123 | selected_dataset_file.write(dataset_dir) 124 | selected_dataset_file.close() 125 | 126 | 127 | return cfg_data, cfg_file_path, dataset_dir 128 | 129 | 130 | 131 | def get_from_datasets_directory(dir_data): 132 | 133 | datasets_dir = os.path.abspath(os.path.join(source_path, dir_data["datasets_directory"])) 134 | filename = dir_data["selected_dataset"] 135 | file_dir = os.path.join(datasets_dir, filename) 136 | 137 | prior_selected_dataset_file = False 138 | 139 | #check for selected dataset file 140 | try: 141 | selected_dataset_file = open(file_dir,'r') 142 | dataset_name = selected_dataset_file.readline() 143 | #print(dataset_name) 144 | prior_selected_dataset_file = True 145 | selected_dataset_file.close() 146 | 147 | #if it's not found, create it 148 | except IOError: 149 | dataset_name = input("{} not found, type the path of the dataset you wish to open\n".format(filename)) 150 | selected_dataset_file = open(file_dir,'w') 151 | selected_dataset_file.write(dataset_name) 152 | selected_dataset_file.close() 153 | 154 | #open the dataset 155 | if(prior_selected_dataset_file): 156 | new_name = input("Type the path of the dataset you wish to open, or press enter to open '{}'\n".format(dataset_name)) 157 | if(new_name != ""): 158 | dataset_name = new_name 159 | os.remove(file_dir) 160 | 161 | 162 | selected_dataset_file = open(file_dir,'w') 163 | selected_dataset_file.write(dataset_name) 164 | selected_dataset_file.close() 165 | 166 | 167 | if os.path.exists(dataset_name): 168 | dataset_dir = dataset_name 169 | 170 | else: 171 | dataset_dir = os.path.join(datasets_dir,dataset_name) 172 | 173 | 174 | #check to see if the file is there, if not, then clear the selected dataset and exit the program 175 | try: 176 | cfg_file_name ="DTMIL_config.json" 177 | cfg_file_path = os.path.join(dataset_dir,cfg_file_name) 178 | 179 | cfgfile = open(cfg_file_path) 180 | cfg_data = json.load(cfgfile) 181 | cfgfile.close() 182 | 183 | except IOError as e: 184 | print("{}".format(e)) 185 | print("config file(s) not found. The dataset and/or config files may not exist in the specified directory. 
Clearing {}".format(filename)) 186 | 187 | os.remove(file_dir) 188 | 189 | sys.exit() 190 | 191 | return cfg_data, cfg_file_path, dataset_dir 192 | 193 | 194 | 195 | 196 | def find_missing_keys(d,old_d): 197 | # old_subdict = getFromDict(old_dict,map_list) 198 | added_keys = [] 199 | for key,val in d.items(): 200 | #print(key) 201 | if key not in old_d: 202 | old_d[key] = val 203 | added_keys.append(key) 204 | if isinstance(val,dict): 205 | old_subdict= old_d[key] 206 | added_keys = added_keys + find_missing_keys(val,old_subdict) 207 | 208 | return added_keys 209 | 210 | def delete_extra_keys(old_d,orig_dict): 211 | keys_to_pop = [] 212 | all_popped_keys = [] 213 | for key,val in old_d.items(): 214 | if key not in orig_dict: 215 | keys_to_pop.append(key) 216 | #print("pop attempted") 217 | if isinstance(val,dict): 218 | if key not in keys_to_pop: 219 | orig_subdict = orig_dict[key] 220 | all_popped_keys = all_popped_keys + delete_extra_keys(val,orig_subdict) 221 | 222 | for key in keys_to_pop: 223 | old_d.pop(key) 224 | 225 | return keys_to_pop + all_popped_keys 226 | 227 | 228 | def update_JSON(json_to_change,json_to_change_filepath): 229 | 230 | config_path = os.path.join(source_path,"DTMIL_config_default.json") 231 | with open(config_path) as json_file: 232 | orig_json = json.load(json_file) 233 | 234 | added_stuff = find_missing_keys(orig_json,json_to_change) 235 | deleted_stuff = delete_extra_keys(json_to_change,orig_json) 236 | stuff_added = len(added_stuff)>0 237 | stuff_removed = len(deleted_stuff)>0 238 | 239 | if stuff_added or stuff_removed: 240 | print("The current json file is outdated") 241 | 242 | if (stuff_removed): 243 | print(f"Entries that will be removed: {deleted_stuff}") 244 | if (stuff_added): 245 | print(f"Entries that will be added (program will crash otherwise): {added_stuff}") 246 | 247 | choice = input("Do you wish to overwrite the current JSON file? 
(y/n)\n") 248 | 249 | if choice == 'y': 250 | save_JSON(json_to_change,json_to_change_filepath) 251 | 252 | else: 253 | if choice != 'n': 254 | print("Invalid choice") 255 | 256 | print("Exiting program") 257 | sys.exit() 258 | 259 | def save_JSON(json_data_to_save,json_to_save_filepath): 260 | print("Writing to JSON file") 261 | json_cfg_string = json.dumps(json_data_to_save,sort_keys=True, indent=4, separators=(',', ': ')) 262 | with open(os.path.join(json_to_save_filepath),'w') as outfile: 263 | outfile.write(json_cfg_string) 264 | print("Write successful") 265 | outfile.close() 266 | 267 | 268 | 269 | 270 | def __get_directory_with_ID(dir_data, dataset_dir, directory_string,ID): 271 | 272 | #folder_name = 'run' 273 | 274 | updated_dir = os.path.join(dir_data[directory_string], "{}".format(ID)) 275 | dir_data[directory_string] = updated_dir 276 | full_dir_data = os.path.join(dataset_dir,dir_data[directory_string]) 277 | 278 | directory_exists = os.path.exists(full_dir_data) 279 | if not directory_exists: 280 | print("Creating:", full_dir_data) 281 | os.makedirs(full_dir_data) 282 | 283 | return directory_exists -------------------------------------------------------------------------------- /documentation/config_readme.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf600 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fswiss\fcharset0 Helvetica-Oblique;\f2\fswiss\fcharset0 Helvetica-BoldOblique; 3 | \f3\fswiss\fcharset0 Helvetica-Bold;} 4 | {\colortbl;\red255\green255\blue255;} 5 | {\*\expandedcolortbl;;} 6 | \margl1440\margr1440\vieww28600\viewh15240\viewkind0 7 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 8 | 9 | \f0\fs28 \cf0 Usage\ 10 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 11 | 12 | \fs24 \cf0 Place your properly formatted dataset in the \'93datasets\'94 directory. Run begin_DTMIL.py from the 13 | \f1\i source 14 | \f0\i0 directory. To edit parameters, for each run, change the values in DTMIL_config.json. 15 | \fs28 \ 16 | \ 17 | \ 18 | \ 19 | DTMIL_config.json\ 20 | 21 | \f1\i\fs24 Dataset specific parameters. Each dataset has one of these files describing it\ 22 | \ 23 | 24 | \f0\i0 \ul Config_ID:\ 25 | \ulnone The ID for the dataset used in model_storage and model_output. 
This is used so we can have multiple config files to run different parameters/models in parallel 26 | \f1\i \ 27 | \ 28 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 29 | 30 | \f0\i0 \cf0 \ul \ulc0 Training\ulnone \ 31 | \ 32 | batch_size\ 33 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 34 | 35 | \f1\i \cf0 Number of samples to be propagated through the network\ 36 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 37 | 38 | \f0\i0 \cf0 \ 39 | dr\ 40 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 41 | 42 | \f1\i \cf0 Dropout rate (between 0 and 1)\ 43 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 44 | 45 | \f0\i0 \cf0 \ 46 | epochs\ 47 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 48 | 49 | \f1\i \cf0 Number of training passes through the data 50 | \f0\i0 \ 51 | \ 52 | lam\ 53 | 54 | \f1\i Regularization parameter (lambda) 55 | \f0\i0 \ 56 | \ 57 | lr\ 58 | 59 | \f1\i Specified learning rate 60 | \f0\i0 \ 61 | \ 62 | nhd\ 63 | 64 | \f1\i Number of hidden units in the fully connected layer 65 | \f0\i0 \ 66 | \ 67 | nhr\ 68 | 69 | \f1\i Number of units in the recurrent layer 70 | \f0\i0 \ 71 | \ 72 | seed\ 73 | 74 | \f1\i Random seed\ 75 | \pard\pardeftab720\sl280\partightenfactor0 76 | \cf0 \ 77 | \pard\pardeftab720\sl280\partightenfactor0 78 | 79 | \f0\i0 \cf0 pre_trained_model (string filepath)\ 80 | \pard\pardeftab720\sl280\partightenfactor0 81 | 82 | \f1\i \cf0 Location of a pre-trained model to load if train_flag is set to 0. Loads from the default file path (model_saves/[run_id]/[model_filename] if set to a blank string (\'93\'94) ( 83 | \f2\b The \'93default file path\'94 is currently buggy, so if you want to load a new model, specify the file path here) 84 | \f1\b0 \ 85 | \ 86 | \pard\pardeftab720\sl280\partightenfactor0 87 | 88 | \f0\i0 \cf0 pre_trained_json (string filepath): 89 | \f1\i \ 90 | Location of a json file to load a pre-trained model from if the model is split into a model and json. 
Set to a blank string (\'93\'94) to ignore this parameter 91 | \f2\b (currently only works with a full file path, will be fixed in the future) 92 | \f1\b0 \ 93 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 94 | \cf0 \ 95 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 96 | 97 | \f0\i0 \cf0 train_flag\ 98 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 99 | 100 | \f1\i \cf0 Set to 0 to use a specified pre-trained model, 1 to create a new model \ 101 | \ 102 | \ 103 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 104 | 105 | \f0\i0 \cf0 \ul Importing\ 106 | \ 107 | \ulnone nominal_filename:\ul \ 108 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 109 | 110 | \f1\i \cf0 \ulnone Filename for the nominal data\ 111 | \ 112 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 113 | 114 | \f0\i0 \cf0 adverse_filename:\ 115 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 116 | 117 | \f1\i \cf0 Filename for the adverse data\ 118 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 119 | 120 | \f0\i0 \cf0 \ul \ 121 | \ulnone validation percent:\ 122 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 123 | 124 | \f1\i \cf0 Percent of samples to be used for the validation set (after the test set has been held out)\ 125 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 126 | 127 | \f0\i0 \cf0 \ 128 | holdout_percent\ 129 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 130 | 131 | \f1\i \cf0 Percent of samples to be held out for the test set\ 132 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 133 | 134 | \f0\i0 \cf0 \ul \ 135 | \ulnone state_cache\ 136 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 137 | 138 | \f1\i \cf0 # 0 to load states from a saved cache, 1 to load from the original CSV files. If a cache does not exist, the program will load from the CSV files, then create one. The cache is updated every time the program loads from CSV files. 139 | \f0\i0 \ul \ 140 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 141 | 142 | \fs28 \cf0 \ulnone \ 143 | \pard\pardeftab720\sl280\partightenfactor0 144 | 145 | \fs24 \cf0 time_splice:\ 146 | \pard\pardeftab720\sl280\partightenfactor0 147 | 148 | \f1\i \cf0 The percentage of the time window to use. Set to 1 to use the whole window (recommend). 
Example: 0.6 will only use the first 60 % of the time steps in the window\ 149 | \pard\pardeftab720\sl280\partightenfactor0 150 | 151 | \f0\i0 \cf0 \ul \ 152 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 153 | \cf0 \ 154 | Preprocessing\ 155 | \ulnone \ 156 | all_parameter_names (list of strings):\ 157 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 158 | 159 | \f1\i \cf0 List of all parameter names for the dataset\ 160 | \ 161 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 162 | 163 | \f0\i0 \cf0 set_sample_length (int):\ 164 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 165 | 166 | \f1\i \cf0 Number defining the length of each time series. Time series above this number will be truncated, ones below this number will be dropped. If this value is set to null, the length will be inferred from the dataset 167 | \f0\i0 \ 168 | \ 169 | redundant_parameters (int or string list): \ 170 | 171 | \f1\i List of parameters to hold out when training. This lets you manually define parameters to be held out (rather than the automatic process above)\ 172 | \ 173 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 174 | 175 | \f0\i0 \cf0 drop_parameters (int or string list):\ 176 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 177 | 178 | \f1\i \cf0 Parameters to be dropped and not included in the visualization (the redundant and held out parameters are still included in the visualization) 179 | \f0\i0 \ 180 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 181 | \cf0 \ul \ 182 | \ 183 | Model_IO:\ 184 | \ulnone model_filename:\ 185 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 186 | 187 | \f1\i \cf0 Filename for the keras model\ 188 | \ 189 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 190 | 191 | \f0\i0 \cf0 model_container_filename\ 192 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 193 | 194 | \f1\i \cf0 Filename for the model container, but this doesn\'92t include the keras model itself (trained or untrained)since that can\'92t be pickled\ 195 | \ 196 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 197 | 198 | \f0\i0 \cf0 data_container_filename:\ 199 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 200 | 201 | \f1\i \cf0 Filename for the data container where the dataset is stored 202 | \f0\i0 \ul \ 203 | \ 204 | Visualization\ 205 | \ 206 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 207 | \cf0 \ulnone precursor_threshold\ 208 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 209 | 210 | \f1\i \cf0 The precursor score threshold from which we define an adverse event 
(default 0.5)\ 211 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 212 | 213 | \f0\i0 \cf0 \ 214 | guideline_type:\ 215 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 216 | 217 | \f1\i \cf0 This sets the guidelines (dashed lines) in the visualization process. 218 | \f0\i0 \ 219 | \ 220 | \pard\pardeftab720\sl280\partightenfactor0 221 | \cf0 binary_parameters (int list)\ 222 | \pard\pardeftab720\sl280\partightenfactor0 223 | 224 | \f1\i \cf0 Tells the visualization code which parameters are binary 225 | \f0\i0 \ 226 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 227 | 228 | \fs28 \cf0 \ 229 | DTMIL_config_dir.json 230 | \fs24 \ 231 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 232 | 233 | \f1\i \cf0 This file contains the file directories needed to help the program run. These do not need to be changed unless the user wishes to customize the file structure of the program. The \'93datasets\'94 directory assumes the \'93source\'94 directory is used as a base. All the other directories assume that the datasets directory + the selected dataset (defined by the user) is used as a base.\ 234 | \ 235 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 236 | 237 | \f0\i0 \cf0 cache_file\ 238 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 239 | 240 | \f1\i \cf0 Save location for the parameters that are cached from a previous run of the program. 
This is used so large datasets don\'92t need to be imported from .csv files every time the program is run\ 241 | \ 242 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 243 | 244 | \f0\i0 \cf0 datasets_directory\ 245 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 246 | 247 | \f1\i \cf0 The directory where all the datasets are located\ 248 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 249 | 250 | \f0\i0 \cf0 \ 251 | parameters_directory\ 252 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 253 | 254 | \f1\i \cf0 This is where the file lists and parameters are stored\ 255 | \ 256 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 257 | 258 | \f0\i0 \cf0 model_archive_directory\ 259 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 260 | 261 | \f1\i \cf0 This is where backups from each run of the model are stored\ 262 | \ 263 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 264 | 265 | \f0\i0 \cf0 model_storage_directory 266 | \f3\b (Currently labeled as \'93model saves\'94) 267 | \f0\b0 \ 268 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 269 | 270 | \f1\i \cf0 Contains a separate save of the model and final parameters (used to visualize without having to run the whole program/train the whole model again).\ 271 | \ 272 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 273 | 274 | \f0\i0 \cf0 model_output_directory 275 | \f3\b (Currently \'93output\'94) 276 | \f0\b0 \ 277 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 278 | 279 | \f1\i \cf0 This is where output files like graphs are stored.\ 280 | \ 281 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 282 | 283 | \f0\i0 \cf0 selected_dataset 284 | \f1\i \ 285 | This is a text file that determines which dataset that you will be using. It\'92s auto-generated by the program, but you can edit it manually. You may also delete the file, which will prompt the user to define a new dataset\ 286 | \ 287 | 288 | \f0\i0 raw_data_directory\ 289 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 290 | 291 | \f1\i \cf0 This is where the raw data (in csv format) is stored. 292 | \f0\i0 \ 293 | \ 294 | } -------------------------------------------------------------------------------- /source/dtmil/data_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | @author: vjanakir 6 | This is the code for deep temporal multiple instance learning (DTMIL). This is the version of ADOPT that is based on deep learning. 7 | The code assumes Keras with Theano or Tensorflow backend. 8 | uses Anaconda virtual env with Python 2.7 and keras. It should also work in Python 3.x but not tested. 
9 | 10 | Created on Tue Jun 19 14:44:18 2018 11 | 12 | @author: dweckler 13 | 14 | """ 15 | 16 | from __future__ import print_function 17 | 18 | import os 19 | import sys 20 | import numpy as np 21 | import time 22 | import h5py 23 | import random 24 | import pandas as pd 25 | from pathlib import Path 26 | 27 | ##FIXME: This does not work properly for values other than 10%, the cause is currently unknown 28 | 29 | def holdout_split(indeces, holdout_percent=10): 30 | 31 | full_arr = np.asarray(indeces) 32 | 33 | if holdout_percent == 0: 34 | full_arr = np.asarray(indeces) 35 | full_arr.shape = (1,len(indeces)) 36 | 37 | return full_arr, np.asarray([]) 38 | 39 | holdout_num = int(len(indeces)*holdout_percent) 40 | ho_idx = indeces[-holdout_num:] 41 | main_idx = indeces[:len(indeces)-holdout_num] 42 | 43 | main_arr = np.asarray(main_idx) 44 | main_arr.shape = (1,len(main_idx)) 45 | 46 | ho_arr = np.asarray(ho_idx) 47 | ho_arr.shape = (1,len(ho_idx)) 48 | 49 | return main_arr[0], ho_arr[0] 50 | 51 | 52 | class DataContainer: 53 | 54 | def __init__(self, json_config_data, state_cache = False): 55 | 56 | 57 | 58 | self.json_dir_data, self.json_data,self.dataset_dir = json_config_data 59 | self.preprocessing_params = self.json_data["preprocessing"] 60 | self.importing_params = self.json_data["importing"] 61 | 62 | self.parameters_directory = os.path.join(self.dataset_dir,self.json_dir_data['parameters_directory']) 63 | self.raw_data_directory = os.path.join(self.dataset_dir, self.json_dir_data['raw_data_directory']) 64 | 65 | self.load_data(state_cache) 66 | 67 | 68 | 69 | def load_data(self,state_cache): 70 | 71 | # print('Loading data from {} CSV files...'.format(maxItemsInList)) 72 | time_start = time.time() 73 | self.sample_list = [] 74 | self.dropped_sample_filenames = [] 75 | self.seqlabels = [] 76 | 77 | nominal_filename = self.importing_params["nominal_filename"] 78 | adverse_filename = self.importing_params["adverse_filename"] 79 | 80 | #read file list 81 | read_lines_from_file = self.read_lines_from_file 82 | nominal_file_list = read_lines_from_file(nominal_filename) 83 | adverse_file_list = read_lines_from_file(adverse_filename) 84 | 85 | #the csv from which we get our default header 86 | default_csv_filename = os.path.join(self.raw_data_directory,nominal_file_list[0]) 87 | df = pd.read_csv(default_csv_filename) 88 | parameter_list = list(df.columns.values) 89 | self.header = parameter_list 90 | self.all_parameter_names = parameter_list 91 | 92 | self.preprocessing_params["all_parameter_names"] = parameter_list 93 | set_sample_length = self.preprocessing_params["set_sample_length"] 94 | if set_sample_length == None: 95 | self.max_seqlen = len(df) 96 | print("No sample length specified, assuming all samples are of equal length {}".format(self.max_seqlen)) 97 | 98 | else: 99 | self.max_seqlen = set_sample_length 100 | 101 | 102 | self.mismatched_files = [] 103 | 104 | I_opt_idx,nominal_imported_csvs = self.__import_sample_list(nominal_file_list,label = 0) 105 | I_bad_idx,adverse_imported_csvs = self.__import_sample_list(adverse_file_list, label = 1) 106 | 107 | 108 | if len(self.mismatched_files)> 0: 109 | out = ("{}/{} labels don't match the default csv header. This will either cause a shaping error (and subsequent crash), " 110 | "or cause some parameters to be labeled incorrectly (possibly leading to nonsensical data). 
Make sure to double check the headers " 111 | "for your csv files to make sure they all match".format(len(self.mismatched_files),len(I_bad_idx)+len(I_opt_idx))) 112 | print("\n\nMismatched CSV headers found!") 113 | 114 | print("Default CSV (used for comparison):",default_csv_filename) 115 | print("{}/{} mismatched csv files (for reference):".format(len(self.mismatched_files[:5]),len(self.mismatched_files))) 116 | print(self.mismatched_files[:5]) 117 | #save to model output 118 | 119 | 120 | print(out) 121 | 122 | choice = input("Are you sure you want to continue? (y/n)\n") 123 | 124 | if choice == 'y': 125 | pass 126 | else: 127 | sys.exit(0) 128 | 129 | 130 | 131 | 132 | random.Random(42).shuffle(I_opt_idx) 133 | random.Random(42).shuffle(I_bad_idx) 134 | 135 | #FIXME:this shouldn't need processing here, do it in the holdout split function 136 | holdout_percent = self.importing_params['holdout_percent'] 137 | 138 | #self.temp_I_opt_idx = I_opt_idx 139 | self.I_bad,self.I_bad_ho = holdout_split(I_bad_idx, holdout_percent) 140 | self.I_opt,self.I_opt_ho = holdout_split(I_opt_idx, holdout_percent) 141 | 142 | 143 | print("Dropped {} files that were too short".format(len(self.dropped_sample_filenames))) 144 | 145 | #this is just in case we come up with an algorithm that can handle differing sequence lengths 146 | self.seqLabels = np.asarray([self.seqlabels])[0] 147 | 148 | finalList = np.asarray(nominal_imported_csvs + adverse_imported_csvs) 149 | finalArray = np.swapaxes(finalList,0,1) 150 | del finalList 151 | 152 | #splice percentage 153 | time_splice = self.importing_params["time_splice"] 154 | self.time_splice = time_splice 155 | 156 | if ((time_splice > 0) and (time_splice<1)): 157 | 158 | sample_index = 0 159 | set_slice = int(finalArray.shape[sample_index]*time_splice) 160 | #self.seqlen[0,:] = set_slice 161 | finalArray = finalArray[0:set_slice,:,:] 162 | self.time_splice = time_splice 163 | else: 164 | self.time_splice = None 165 | 166 | print("saving sample_list") 167 | self.states_orig = finalArray 168 | self.states = finalArray 169 | self.save_to_cache() 170 | 171 | print("Time to load: {} seconds".format(time.time()-time_start)) 172 | 173 | 174 | def __import_sample_list(self,X_file_list,label): 175 | I_X_idx = [] 176 | 177 | imported_csv_list = [] 178 | for filename in X_file_list: 179 | #TODO load,verify, and filter data here 180 | #throw an error/exception if lengths do not match and filtering is not set in dtmil_config 181 | 182 | imported_csv = self.import_sample(filename) 183 | 184 | if imported_csv is not None: 185 | imported_csv_list.append(imported_csv) 186 | self.sample_list.append(filename) 187 | I_X_idx.append(len(self.seqlabels)) 188 | self.seqlabels.append(label) 189 | 190 | return I_X_idx,imported_csv_list 191 | 192 | 193 | def import_sample(self,filename): 194 | 195 | filepath = os.path.join(self.raw_data_directory,filename) 196 | 197 | 198 | df = pd.read_csv(filepath) 199 | header = list(df.columns.values) 200 | 201 | 202 | if (header != self.header): 203 | self.mismatched_files.append(filename) 204 | 205 | imported_csv = df.values[-self.max_seqlen:] 206 | 207 | if len(imported_csv) != self.max_seqlen: 208 | self.dropped_sample_filenames.append(filename) 209 | return None 210 | 211 | return imported_csv 212 | 213 | 214 | #FIXME: Make this actually work, maybe skip the whole my_data creation process? 
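#Illustration of the on-disk layout the importer above expects (file names here
#are hypothetical): the nominal/adverse file lists live in parameters_directory
#and hold one CSV path per line, relative to raw_data_directory, e.g.
#    nominal/flight_0001.csv
#    adverse/flight_0107.csv
#Every CSV should share the header of the first nominal file (mismatches are
#collected in self.mismatched_files); rows are taken from the end of each file
#(df.values[-max_seqlen:]), and files shorter than set_sample_length are
#dropped rather than padded.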
215 | #load files from cache with CSV as backup if the cache isn't there 216 | def load_from_cache(self,backup_sample_list): 217 | print('Loading states from cache...') 218 | cache_dir = os.path.join(self.dataset_dir,self.json_dir_data['cache_file']) 219 | 220 | try: 221 | with h5py.File(cache_dir, 'r') as hf: 222 | self.states_orig = hf['states_orig'][:] 223 | self.states = hf['states_orig'][:] 224 | 225 | #TODO: have this in a different place? Also check to see if loading from cache breaks anything 226 | #splice percentage 227 | time_splice = self.importing_params["time_splice"] 228 | self.time_splice = time_splice 229 | 230 | except EnvironmentError: 231 | print('cache file not found, loading from CSV files instead') 232 | self.import_all_samples(backup_sample_list) 233 | 234 | 235 | def save_to_cache(self): 236 | print('saving states to cache...') 237 | cache_dir = os.path.join(self.dataset_dir,self.json_dir_data['cache_file']) 238 | 239 | with h5py.File(cache_dir, 'w') as hf: 240 | hf.create_dataset('states_orig', data = self.states_orig) 241 | 242 | 243 | 244 | 245 | def reshape_and_process(self): 246 | # after loading, the variables have the following shape 247 | # states_orig is of shape (T, N, D) where T is max length of sample, N is the total number of samples, D is the number of time series in each sample. 248 | # The length of sample i is given by seqlen[0,i]. 249 | # If length is less than T, the sample data is prepended with NAN to make it length T. 250 | # A sample i belongs to "opt" (or "bad") if i belongs to array I_opt (or I_bad). 251 | # I_opt_ho and I_bad_ho are hold-out "test" sets. 252 | # I_bad, I_opt, I_bad_ho, I_opt_ho are one-dimensional arrays 253 | # seqlen is of shape (1, N). 254 | # header is a list of D feature names 255 | # seqLabels is of shape (N,). sample i has a label seqLabels[i] - 1 if sample i has adverse event and 0 otherwise. 256 | # removing variables which are correlated with target (to avoid finding trivial precursors) 257 | 258 | correlated_states = self.preprocessing_params["redundant_parameters"] 259 | correlated_states = [self.decode_parameter_label(i) for i in correlated_states] 260 | 261 | #convert to a Numpy array that avoids redundant choices (just in case something was mistakenly added) 262 | self.correlated_states =np.unique(np.array(correlated_states)) 263 | 264 | dropped_states = self.preprocessing_params["drop_parameters"] 265 | dropped_states = [self.decode_parameter_label(i) for i in dropped_states] 266 | self.dropped_states = np.unique(np.array(dropped_states)) 267 | 268 | #make sure not to delete the same state twiceo 269 | states_to_remove = np.unique(np.array(correlated_states + dropped_states)) 270 | 271 | self.parameter_selection=np.delete(np.arange(self.states.shape[2]),states_to_remove,0) 272 | self.states=self.states[:,:,self.parameter_selection] 273 | 274 | # get max length of trjectories 275 | self.maxlen = np.shape(self.states)[0] 276 | 277 | # get total number of trajectories 278 | # Ntraj= np.shape(self.states)[1] 279 | 280 | # number of features (time series variables) 281 | self.nfeat=np.shape(self.states)[-1] 282 | 283 | # center the data - subtract mean and divide by STD. 
If variable is constant, remove it from analysis 284 | temp=np.reshape(self.states,(np.shape(self.states)[0]*np.shape(self.states)[1],np.shape(self.states)[2])) 285 | 286 | mean=np.nanmean(temp,0) 287 | std=np.nanstd(temp,0) 288 | elimidx=np.where(std<1E-5)[0] 289 | if elimidx.shape[0]>0: 290 | selidx=np.array(list(set(np.arange(self.nfeat).tolist()).difference(elimidx))) 291 | self.states=self.states[:,:,selidx] 292 | mean=mean[selidx] 293 | std=std[selidx] 294 | temp=temp[:,selidx] 295 | self.parameter_selection=self.parameter_selection[selidx] 296 | temp=(temp-mean)/std 297 | self.states=np.reshape(temp,(np.shape(self.states)[0],np.shape(self.states)[1],np.shape(self.states)[2])) 298 | del temp 299 | self.nfeat=np.shape(self.states)[-1] 300 | 301 | # Replace NAN by an arbitrary mask_val 302 | mask_val=int(np.nanmax(self.states)+1000) 303 | self.states[np.isnan(self.states)]=mask_val 304 | 305 | # reshape to match keras' definitions 306 | self.states=np.transpose(self.states,(1,0,2)) 307 | 308 | 309 | def train_test_split(self): 310 | # Split train data into train (60%) and validation (40%) sets 311 | #FIXME: maybe change this to use a more traditional validation set approach. The numbers don't match the output for some reason 312 | 313 | validation_percent = self.importing_params["validation_percent"] 314 | self.validation_percent = validation_percent *100 315 | 316 | nvalid=int(validation_percent*len(self.I_bad)) 317 | self.I_bad_valid=self.I_bad[len(self.I_bad)-nvalid:] 318 | self.I_bad=self.I_bad[:len(self.I_bad)-nvalid] 319 | self.I_opt_valid=self.I_opt[len(self.I_opt)-nvalid:] 320 | self.I_opt=self.I_opt[:len(self.I_opt)-nvalid] 321 | 322 | print(self.states.shape) 323 | temp=np.array([self.I_opt.tolist()+self.I_bad.tolist()])[0] 324 | 325 | self.xtrain=self.states[temp,:,:] 326 | self.ytrain=self.seqLabels[temp] 327 | temp=np.array([self.I_opt_valid.tolist()+self.I_bad_valid.tolist()])[0] 328 | self.xvalid=self.states[temp,:,:] 329 | self.yvalid=self.seqLabels[temp] 330 | del temp 331 | 332 | self.ytrain = np.expand_dims(np.expand_dims(self.ytrain,-1),-1) 333 | self.yvalid = np.expand_dims(np.expand_dims(self.yvalid,-1),-1) 334 | 335 | # currently data is balanced. If there is an imbalance, adjust this parameter. 
336 | self.class_weight = {0 : 1,1: 1} 337 | 338 | def preprocess(self): 339 | self.reshape_and_process() 340 | self.train_test_split() 341 | 342 | def get_grouping(self, num): 343 | 344 | label = "" 345 | dataset = "" 346 | 347 | if num in np.concatenate([self.I_bad,self.I_bad_ho,self.I_bad_valid]): 348 | label = "Anomalous" 349 | elif num in np.concatenate([self.I_opt, self.I_opt_valid,self.I_opt_ho]): 350 | label = "Nominal" 351 | else: 352 | print("index doesn't exist in the dataset") 353 | 354 | if num in np.concatenate([self.I_bad, self.I_opt]): 355 | dataset = "Train" 356 | elif num in np.concatenate([self.I_bad_valid,self.I_opt_valid]): 357 | dataset = "Validation" 358 | elif num in np.concatenate([self.I_bad_ho, self.I_opt_ho]): 359 | dataset = "Test" 360 | else: 361 | print("Invalid dataset id") 362 | 363 | return label, dataset 364 | 365 | 366 | 367 | 368 | def get_filename(self, index): 369 | 370 | return Path(self.sample_list[index]).stem 371 | 372 | 373 | def read_lines_from_file(self,filename): 374 | 375 | with open(os.path.join(self.parameters_directory,filename),'r') as f: 376 | content = f.readlines() 377 | content = [x.strip() for x in content] 378 | 379 | return content 380 | 381 | 382 | ##TODO: have a decode type argument 383 | def decode_parameter_label(self,param): 384 | if (isinstance(param,int)): 385 | return param 386 | 387 | else: 388 | return self.all_parameter_names.index(param) 389 | 390 | 391 | def param_index_to_label(self, param_index): 392 | return self.all_parameter_names[param_index] 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | -------------------------------------------------------------------------------- /source/guis/dataset_formatter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Feb 11 00:09:03 2019 5 | @author: dweckler 6 | """ 7 | from tkinter import Tk, Label, Button, Entry, mainloop, filedialog, messagebox, Grid, Checkbutton, IntVar, END 8 | import os 9 | #import tkSimpleDialog as simpledialog 10 | import matplotlib 11 | matplotlib.use('qt5agg') 12 | 13 | 14 | 15 | from shutil import copyfile, move,copytree,copy 16 | import pandas as pd 17 | import json 18 | import glob 19 | from pathlib import Path 20 | 21 | import errno 22 | 23 | #if nominal filelist and adverse filelist don't exist, generate a file list from the inputted folders 24 | #ask via a popup if you want to do this. "directories were found instead of file lists. 25 | #Would you like to use the inputted directories to generate their respective file lists?" 
26 | #use listdir to get the lists, make sure to parse out everything but the relative file path 27 | 28 | 29 | 30 | def copy_files_or_directories(source, dest): 31 | try: 32 | copytree(source, dest) 33 | except OSError as exception: 34 | if exception.errno == errno.ENOTDIR: 35 | try: 36 | copy(source, dest) 37 | except: 38 | print("Error copying file") 39 | else: 40 | raise 41 | 42 | 43 | #recursively list visible files 44 | def list_visible_files(path): 45 | filepath = os.path.join(path,'*') 46 | 47 | all_files = [os.path.basename(f) for f in glob.glob(filepath)] 48 | 49 | return all_files 50 | 51 | def set_entry_text(entry,text): 52 | entry.delete(0,END) 53 | entry.insert(0,text) 54 | 55 | 56 | class DatasetFormatter: 57 | 58 | def __init__(self, master): 59 | 60 | data_row = 0 61 | nominal_flist_row = 1 62 | adverse_flist_row = 2 63 | last_row = 3 64 | 65 | label_column = 0 66 | filelist_column = 1 67 | button_column = 2 68 | 69 | self.copy_or_move = IntVar() 70 | self.copy_or_move.set(1) 71 | self.checkbutton = Checkbutton(master,text="Copy",variable = self.copy_or_move).grid(row=last_row) 72 | 73 | 74 | self.use_filelists = IntVar() 75 | self.use_filelists.set(0) 76 | self.filelist_check = Checkbutton(master,text="Filelists", command = self.list_swap, variable = self.use_filelists) 77 | self.filelist_check.grid(row = last_row,column = button_column) 78 | 79 | self.default_adverse_filelist_path = "" 80 | self.default_nominal_filelist_path = "" 81 | self.default_adverse_folder_path = "" 82 | self.default_nominal_folder_path = "" 83 | 84 | 85 | 86 | 87 | self.folder_label = Label(master, text="Data Folder:") 88 | self.folder_label.grid(row=data_row) 89 | 90 | self.nominal_filelist_label = Label(master, text="Nominal Directory:") 91 | self.nominal_filelist_label.grid(row=nominal_flist_row,column = label_column) 92 | 93 | self.adverse_filelist_label = Label(master, text="Adverse Directory:") 94 | self.adverse_filelist_label.grid(row=adverse_flist_row,column = label_column) 95 | 96 | #Define the entry fields 97 | self.data_folder_entry = Entry(master) 98 | self.nominal_path_entry = Entry(master) 99 | self.adverse_path_entry = Entry(master) 100 | 101 | Grid.columnconfigure(master,filelist_column,weight=1) 102 | 103 | self.data_folder_entry.grid(row=data_row, column=filelist_column,sticky = 'we') 104 | self.nominal_path_entry.grid(row=nominal_flist_row, column=filelist_column,sticky = 'we') 105 | self.adverse_path_entry.grid(row = adverse_flist_row, column = filelist_column,sticky = 'we') 106 | 107 | #define the "choose file" button 108 | self.data_folder_button = Button(master,text="Choose Folder", command = self.get_the_folder) 109 | self.nominal_filelist_button = Button(master,text="Choose Folder", command = self.get_nominal_filelist) 110 | self.adverse_filelist_button = Button(master,text="Choose Folder", command = self.get_adverse_filelist) 111 | 112 | self.data_folder_button.grid(row=data_row,column=button_column) 113 | self.nominal_filelist_button.grid(row=nominal_flist_row,column=button_column) 114 | self.adverse_filelist_button.grid(row=adverse_flist_row,column=button_column) 115 | 116 | self.save_button = Button(master,text="Generate Folder Hierarchy",fg ="#8b0000" , command = self.generate_folder_structure) 117 | self.save_button.grid(row = last_row,column = filelist_column) 118 | 119 | 120 | 121 | 122 | def get_the_folder(self): 123 | 124 | home_dir = str(Path.home()) 125 | print("Choosing folder") 126 | filename = filedialog.askdirectory(initialdir =home_dir) 127 | 128 | 
self.data_folder_entry.delete(0, 'end') 129 | self.data_folder_entry.insert(0,filename) 130 | 131 | def get_nominal_filelist(self): 132 | print("Choosing nominal") 133 | self.get_filelist(self.nominal_path_entry) 134 | 135 | 136 | def get_adverse_filelist(self): 137 | print("Choosing adverse") 138 | self.get_filelist(self.adverse_path_entry) 139 | 140 | 141 | def get_filelist(self,my_filelist_entry): 142 | 143 | dataset_folder_path = self.data_folder_entry.get() 144 | if dataset_folder_path == '': 145 | initial_directory = str(Path.home()) 146 | else: 147 | initial_directory = dataset_folder_path 148 | 149 | 150 | if (self.use_filelists.get()): 151 | filename = filedialog.askopenfilename(initialdir = initial_directory) 152 | 153 | else: 154 | filename = filedialog.askdirectory(initialdir = initial_directory) 155 | 156 | my_filelist_entry.delete(0, 'end') 157 | my_filelist_entry.insert(0,filename) 158 | 159 | 160 | def list_swap(self): 161 | 162 | 163 | if not (self.use_filelists.get()): 164 | the_label = "Directory" 165 | the_button_label = "Choose Folder" 166 | 167 | self.default_nominal_filelist_path = self.nominal_path_entry.get() 168 | set_entry_text(self.nominal_path_entry,self.default_nominal_folder_path) 169 | 170 | self.default_adverse_filelist_path = self.adverse_path_entry.get() 171 | set_entry_text(self.adverse_path_entry,self.default_adverse_folder_path) 172 | 173 | 174 | else: 175 | the_label = "Filelist" 176 | the_button_label = "Choose File" 177 | 178 | self.default_nominal_folder_path = self.nominal_path_entry.get() 179 | set_entry_text(self.nominal_path_entry,self.default_nominal_filelist_path) 180 | 181 | 182 | self.default_adverse_folder_path = self.adverse_path_entry.get() 183 | set_entry_text(self.adverse_path_entry,self.default_adverse_filelist_path) 184 | 185 | 186 | self.nominal_filelist_button.config(text = the_button_label) 187 | self.adverse_filelist_button.config(text = the_button_label) 188 | 189 | 190 | self.nominal_filelist_label.config(text = "Nominal {}".format(the_label)) 191 | self.adverse_filelist_label.config(text = "Adverse {}".format(the_label)) 192 | 193 | 194 | def generate_file_list(self,directory,dataset_folder_path): 195 | dir_name = os.path.basename(directory) 196 | 197 | file_list = glob.glob(os.path.join(directory,"*.csv")) 198 | file_list = [os.path.basename(pth) for pth in file_list] 199 | file_list = [os.path.join(dir_name,file) for file in file_list] 200 | 201 | 202 | file_list_txt = os.path.join(dataset_folder_path,f"{dir_name}.txt") 203 | 204 | with open(file_list_txt,'w') as f: 205 | for filename in file_list: 206 | f.write(filename) 207 | f.write('\n') 208 | 209 | return file_list_txt 210 | 211 | 212 | 213 | def generate_folder_structure(self): 214 | 215 | 216 | nominal_entry_path = self.nominal_path_entry.get() 217 | adverse_entry_path = self.adverse_path_entry.get() 218 | dataset_folder_path = self.data_folder_entry.get() 219 | 220 | 221 | blank_path = "" 222 | 223 | if (dataset_folder_path == blank_path): 224 | messagebox.showerror("Error",'Please enter a valid dataset folder path') 225 | return 226 | 227 | 228 | #TODO: make this generate the filelists paths if it's a directory (indicated by the checkmark) 229 | #TODO: add errors if the directory isn't where it's supposed to be 230 | 231 | if(self.use_filelists.get()): 232 | 233 | 234 | if (not (nominal_entry_path.endswith('.txt')) or nominal_entry_path == ""): 235 | messagebox.showerror("Error",'Please enter a valid path for the nominal file list') 236 | return 237 | 238 | 239 | if 
not (adverse_entry_path.endswith('.txt')): 240 | messagebox.showerror("Error",'Please enter a valid path for the adverse file list') 241 | return 242 | 243 | else: 244 | if((nominal_entry_path == blank_path) or (adverse_entry_path == blank_path)): 245 | messagebox.showerror("Error","Please enter a valid directory path") 246 | return 247 | 248 | nominal_entry_path = self.generate_file_list(nominal_entry_path,dataset_folder_path) 249 | adverse_entry_path = self.generate_file_list(adverse_entry_path,dataset_folder_path) 250 | 251 | 252 | #Ask for name of dataset + entry 253 | #dataset_name = simpledialog.askstring("Input", "Please enter the name of the dataset") 254 | 255 | 256 | 257 | print("Attempting to read filepath from nominal filelist") 258 | data_filenames = [] 259 | with open('{}'.format(nominal_entry_path),'r') as f: 260 | data_filenames = f.readlines() 261 | data_filenames = [x.strip() for x in data_filenames] 262 | 263 | 264 | 265 | #test to see if the files are there before doing anything 266 | test_file_path = data_filenames[0] 267 | csv_filepath = os.path.join(dataset_folder_path,test_file_path) 268 | 269 | try: 270 | df = pd.read_csv(csv_filepath) 271 | parameter_list = list(df.columns.values) 272 | 273 | except: 274 | 275 | messagebox.showerror("Error", f"Could not find any data files in the specified Data Folder path:\n{dataset_folder_path}\n\nAttempted to open file:\n{csv_filepath}") 276 | return 277 | 278 | 279 | #TODO: set initial directory to current dir initialdir = os.path.sep 280 | home_dir = str(Path.home()) 281 | dataset_path = filedialog.asksaveasfilename(initialdir = home_dir) 282 | if dataset_path is None or dataset_path == "": 283 | messagebox.showerror("Error", "No name entered!") 284 | return 285 | 286 | 287 | else: 288 | print("Generating folder structure for dataset {}".format(dataset_path)) 289 | 290 | 291 | #create new dataset directory (if one doesn't exist) 292 | make_directory(dataset_path) 293 | 294 | #create data folder 295 | data_path = os.path.join(dataset_path,'data') 296 | make_directory(data_path) 297 | 298 | #within data folder, create parameters and raw_data folder 299 | 300 | parameters_path = os.path.join(data_path, "parameters") 301 | make_directory(parameters_path) 302 | 303 | raw_data_path = os.path.join(data_path,'raw_data') 304 | make_directory(raw_data_path) 305 | 306 | #create misc, model_storage, and model_output folders 307 | model_storage_path = os.path.join(dataset_path,'model_saves') 308 | make_directory(model_storage_path) 309 | 310 | model_output_path = os.path.join(dataset_path, "output") 311 | make_directory(model_output_path) 312 | 313 | misc_path = os.path.join(dataset_path,"misc") 314 | make_directory(misc_path) 315 | #create model_saves within the misc folder 316 | 317 | model_saves_path = os.path.join(misc_path,"model_archive") 318 | make_directory(model_saves_path) 319 | 320 | 321 | print("Directory creation process complete") 322 | 323 | 324 | sep = os.path.sep 325 | adverse_filename = adverse_entry_path.split(sep)[-1] 326 | nominal_filename = nominal_entry_path.split(sep)[-1] 327 | 328 | #place the specified adverse and nominal filelists inside the parameters folder 329 | copyfile(adverse_entry_path,os.path.join(parameters_path,adverse_filename)) 330 | copyfile(nominal_entry_path,os.path.join(parameters_path,nominal_filename)) 331 | 332 | #move the directory of the dataset folder 333 | 334 | files = list_visible_files(dataset_folder_path) 335 | 336 | if (self.copy_or_move.get() == 1): 337 | print("copying") 338 | 339 | 
files = list_visible_files(dataset_folder_path) 340 | 341 | 342 | for f in files: 343 | 344 | 345 | if f not in [adverse_filename, nominal_filename]: 346 | #copytree(os.path.join(dataset_folder_path,f),raw_data_path) 347 | copy_files_or_directories(os.path.join(dataset_folder_path,f),os.path.join(raw_data_path,f)) 348 | 349 | 350 | 351 | elif(self.copy_or_move.get() == 0): 352 | print("Moving all files from the dataset directory to our raw data directory") 353 | for f in files: 354 | 355 | if f not in [adverse_filename, nominal_filename]: 356 | #copytree(os.path.join(dataset_folder_path,f),raw_data_path) 357 | move(os.path.join(dataset_folder_path,f),raw_data_path) 358 | 359 | else: 360 | print("this shouldn't happen") 361 | 362 | 363 | 364 | #generate parameter_names.txt 365 | #import one file list, grab the header, then make the files from said header 366 | 367 | 368 | with open(os.path.join(parameters_path,"parameter_names.txt"),'w') as f: 369 | for parameter in parameter_list: 370 | f.write("{}\n".format(parameter)) 371 | 372 | 373 | #generate DTMIL_config.json and add the path 374 | #grab code from the json generating ipynb 375 | 376 | export_json_cfg(directory = dataset_path, 377 | fl_nominal=nominal_filename, 378 | fl_adverse=adverse_filename, 379 | param_names=parameter_list) 380 | 381 | 382 | messagebox.showinfo(title = "Info", message="Dataset Formatting Process Completed") 383 | 384 | #TODO: Also maybe generate DTMIL_config_dir.json 385 | def make_directory(folder_path): 386 | 387 | try: 388 | os.mkdir(folder_path) 389 | except OSError: 390 | print ("Creation of the dataset folder %s failed" % folder_path) 391 | 392 | #TODO: add a menu option to just export this by itself 393 | def export_json_cfg(directory = "", json_filename = "DTMIL_config.json", fl_nominal = "filelist_nominal.txt", fl_adverse = "filelist_adverse.txt",param_names = []): 394 | 395 | 396 | sep = os.path.sep 397 | default_config_path = "..{}dtmil{}configuration{}".format(sep,sep,sep) 398 | 399 | 400 | 401 | with open(os.path.join(default_config_path,"DTMIL_config_default.json")) as jsonfile: 402 | json_data = json.load(jsonfile) 403 | 404 | json_data["importing"]["nominal_filename"] = fl_nominal 405 | json_data["importing"]["adverse_filename"] = fl_adverse 406 | json_data["preprocessing"]["all_parameter_names"] = param_names 407 | 408 | 409 | 410 | json_cfg_string = json.dumps(json_data,sort_keys=True, indent=4, separators=(',', ': ')) 411 | 412 | with open(os.path.join(directory,json_filename),'w') as outfile: 413 | outfile.write(json_cfg_string) 414 | outfile.close() 415 | 416 | 417 | 418 | master = Tk() 419 | master.title("Dataset Formatter") 420 | 421 | data_formatter = DatasetFormatter(master) 422 | #master.minsize(width = 100,height = 50) 423 | 424 | 425 | mainloop( ) 426 | 427 | 428 | -------------------------------------------------------------------------------- /source/dtmil/model_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 19 15:47:39 2018 5 | 6 | @author: dweckler 7 | 8 | @author: vjanakir 9 | This is the code for deep temporal multiple instance learning (DTMIL). This is the version of ADOPT that is based on deep learning. 10 | The code assumes Keras with Theano or Tensorflow backend. 11 | uses Anaconda virtual env with Python 2.7 and keras. It should also work in Python 3.x but not tested. 
12 | ''' 13 | """ 14 | 15 | 16 | import os, numpy as np, time 17 | import datetime 18 | 19 | from keras.layers.core import Dense, Dropout 20 | from keras.layers import MaxPooling1D 21 | from keras.layers.recurrent import GRU 22 | from keras.layers.wrappers import TimeDistributed 23 | from keras.models import Sequential, model_from_json 24 | from keras.regularizers import l2 25 | from keras.callbacks import ModelCheckpoint 26 | from keras.optimizers import Nadam 27 | from keras.models import load_model 28 | import json 29 | 30 | from sklearn.metrics import precision_recall_fscore_support 31 | 32 | 33 | from dtmil.configuration.config_dtmil import get_json_config_data 34 | from dtmil.utilities import aggregationLayer 35 | from dtmil.utilities import save_something 36 | from dtmil.utilities import get_auc 37 | 38 | from dtmil.utilities import load_something 39 | 40 | 41 | 42 | ## model parameters 43 | #batch_size = 32 # mini-batch size (number of samples) 44 | #epochs=100 # number of training passes through data 45 | #nhr=5 # number of units in recurrent layer 46 | #nhd=500 # number of hidden units in fully connected layer 47 | #lr = 0.001 # Specify learning rate lr. 48 | #optim=Nadam(lr=lr) # ADAM optimizer with nestrov momentum (see keras documentation). 49 | #dr=0 # dropout rate (0-1) 50 | #lam=0.01 # regularization 51 | 52 | # path to data. 53 | 54 | 55 | class ModelContainer: 56 | 57 | #TODO: Update data_path and model_path whenever a model is reloaded, perhaps have a "reload" initializer 58 | def __init__(self,data_container): 59 | self.myData = data_container 60 | self.load_config_data() 61 | 62 | model_io_data = data_container.json_data['model_io'] 63 | model_filename = model_io_data["model_filename"] 64 | model_container_filename = model_io_data["model_container_filename"] 65 | data_container_filename = model_io_data["data_container_filename"] 66 | 67 | model_archive_directory = os.path.join(data_container.dataset_dir,data_container.json_dir_data['model_archive_directory']) 68 | self.model_archive_directory = model_archive_directory 69 | 70 | model_output_directory = os.path.join(self.myData.dataset_dir,self.myData.json_dir_data['model_output_directory']) 71 | self.model_output_directory = model_output_directory 72 | 73 | model_storage_directory = os.path.join(self.myData.dataset_dir,self.myData.json_dir_data['model_storage_directory']) 74 | self.model_storage_directory = model_storage_directory 75 | self.model_path = os.path.join(model_storage_directory,model_filename) 76 | self.model_container_path = os.path.join(model_storage_directory,model_container_filename) 77 | self.data_path = os.path.join(model_storage_directory,data_container_filename) 78 | 79 | optim = Nadam(lr=self.lr) 80 | pars = "_".join([str(k) for k in [self.batch_size, self.epochs, self.nhr, self.nhd, self.dr, self.lam, optim.__class__.__name__, self.lr]]) 81 | 82 | 83 | fname_add=model_archive_directory+"temporary".split(os.path.sep)[-1].split('.')[0]+"_"+pars+'_' 84 | self.model_fname=fname_add+"bestModel-{epoch:02d}-{val_acc:.4f}.hdf5" 85 | self.json_fname=fname_add+'.json' 86 | if not os.path.exists(model_archive_directory): 87 | os.makedirs(model_archive_directory) 88 | 89 | 90 | print(self.model_path) 91 | 92 | 93 | @classmethod 94 | def reload_all_data(cls,dataset_dir, json_data_block = None): 95 | 96 | print("reloading model and data") 97 | if json_data_block == None: 98 | json_data_block = get_json_config_data(dataset_dir) 99 | 100 | json_dir_data, json_group_data,dataset_dir = json_data_block 101 | 
model_storage_directory = os.path.join(dataset_dir,json_dir_data['model_storage_directory']) 102 | model_container_path = os.path.join(model_storage_directory, json_group_data['model_io']["model_container_filename"]) 103 | 104 | myModel = load_something(model_container_path) 105 | myModel.update_paths(model_container_path,dataset_dir) 106 | 107 | myData = myModel.myData 108 | 109 | myData.dataset_dir = dataset_dir 110 | model = load_model(myModel.model_path) 111 | 112 | myModel.model = model 113 | 114 | return myModel 115 | 116 | 117 | def update_paths(self,model_container_path, new_dataset_dir = None): 118 | 119 | if new_dataset_dir is not None: 120 | dataset_dir = new_dataset_dir 121 | self.myData.dataset_dir = new_dataset_dir 122 | 123 | else: 124 | dataset_dir= self.myData.dataset_dir 125 | 126 | 127 | model_io_data = self.myData.json_data['model_io'] 128 | model_filename = model_io_data["model_filename"] 129 | model_container_filename = model_io_data["model_container_filename"] 130 | data_container_filename = model_io_data["data_container_filename"] 131 | 132 | model_archive_directory = os.path.join(dataset_dir,self.myData.json_dir_data['model_archive_directory']) 133 | self.model_archive_directory = model_archive_directory 134 | 135 | model_output_directory = os.path.join(dataset_dir,self.myData.json_dir_data['model_output_directory']) 136 | self.model_output_directory = model_output_directory 137 | 138 | model_storage_directory = os.path.join(dataset_dir,self.myData.json_dir_data['model_storage_directory']) 139 | self.model_storage_directory = model_storage_directory 140 | self.model_path = os.path.join(model_storage_directory,model_filename) 141 | self.model_container_path = os.path.join(model_storage_directory,model_container_filename) 142 | self.data_path = os.path.join(model_storage_directory,data_container_filename) 143 | 144 | 145 | 146 | 147 | 148 | 149 | def load_config_data(self): 150 | data = self.myData.json_data["training"] 151 | self.epochs = data['epochs'] 152 | self.batch_size = data['batch_size'] 153 | self.nhr = data['nhr'] 154 | self.nhd = data['nhd'] 155 | self.lr = data['lr'] 156 | self.dr = data['dr'] 157 | self.lam = data['lam'] 158 | 159 | def configure_model(self,train_flag, pre_trained_model = None, pre_trained_json = None): 160 | # create model configuration 161 | myData = self.myData 162 | self.train_flag = train_flag 163 | 164 | self.pre_trained_model = pre_trained_model 165 | self.pre_trained_json = pre_trained_json 166 | 167 | 168 | if train_flag: 169 | 170 | # standard sequential model in Keras where layers can be added. 171 | model = Sequential() 172 | 173 | # masking layer to make sure masked time-steps are not considered in the gradient calculations 174 | # model.add(Masking(mask_value=mask_val, input_shape=(maxlen, nfeat))) 175 | lam = self.lam 176 | dr = self.dr 177 | optim = Nadam(lr=self.lr) 178 | 179 | # GRU layer (RNN) 180 | model.add(GRU( 181 | input_shape=(myData.maxlen, myData.nfeat), 182 | units=self.nhr, 183 | return_sequences=True, 184 | stateful=False, 185 | unroll=False, 186 | implementation='gpu', 187 | activation='tanh', 188 | kernel_regularizer=l2(lam), 189 | recurrent_regularizer=l2(lam), 190 | bias_regularizer=l2(lam))) 191 | model.add(Dropout(dr)) 192 | 193 | # fully connected layer - note the timedistributed type which processes data at every time step. 
194 | model.add(TimeDistributed(Dense(units=self.nhd, 195 | activation='tanh', 196 | kernel_regularizer=l2(lam), 197 | bias_regularizer=l2(lam), 198 | kernel_constraint = None))) 199 | model.add(Dropout(dr)) 200 | 201 | # logistic layer (the output of this layer gives instance probabilities) 202 | model.add(TimeDistributed(Dense(units=1, 203 | activation='sigmoid', 204 | kernel_regularizer=l2(lam), 205 | bias_regularizer=l2(lam), 206 | kernel_constraint = None),name="inst_prob")) 207 | model.add(Dropout(0)) 208 | 209 | # multiple-instance aggregation layer 210 | # model.add(aggregationLayer(name="mil_layer")) 211 | model.add(MaxPooling1D(pool_size=myData.maxlen)) 212 | start = time.time() 213 | 214 | # compile model 215 | model.compile(loss="binary_crossentropy", optimizer=optim, metrics=['accuracy']) 216 | print("Compilation Time : ", time.time() - start) 217 | 218 | # serialize (save) model to JSON 219 | model_json = model.to_json() 220 | with open(self.json_fname, "w") as json_file: 221 | json_file.write(model_json) 222 | print('saved model json to disk') 223 | else: 224 | 225 | 226 | ##Check filepath here, if it doesn't exist, load existing model 227 | # load json and create model 228 | 229 | print("Train_Flag set to false, loading pre-trained model") 230 | model = self._load_pretrained_model() 231 | 232 | 233 | # json_file = open(load_jsonName, 'r') 234 | # loaded_model_json = json_file.read() 235 | # json_file.close() 236 | # model = model_from_json(loaded_model_json,{'aggregationLayer':aggregationLayer}) 237 | # 238 | # # load weights into new model 239 | # model.load_weights(load_h5Name) 240 | # model.compile(loss="binary_crossentropy", optimizer=optim, metrics=['accuracy']) 241 | # print("Loaded and compiled model from disk") 242 | 243 | 244 | self.model = model 245 | 246 | #TODO: Raise better errors for pretrained models 247 | def _load_pretrained_model(self): 248 | ##FIXME: Make this have better error handling than "None" 249 | 250 | json_filename = self.pre_trained_json 251 | pre_trained_model_filename = self.pre_trained_model 252 | 253 | 254 | if (pre_trained_model_filename== "") : 255 | print("No filepath specified, attempting to load from the default path") 256 | pre_trained_model_filename = self.model_path 257 | 258 | 259 | #FIXME: This is inconsistent with the above. 
Fix later somehow (probably with yet another JSON argument) 260 | if json_filename == "": 261 | json_filename = None 262 | 263 | if (json_filename): 264 | 265 | json_file = open(json_filename, 'r') 266 | loaded_model_json = json_file.read() 267 | json_file.close() 268 | model = model_from_json(loaded_model_json,{'aggregationLayer':aggregationLayer}) 269 | 270 | weights_filename = pre_trained_model_filename 271 | model.load_weights(weights_filename) 272 | model.compile(loss="binary_crossentropy", optimizer=Nadam(lr=self.lr) , metrics=['accuracy']) 273 | print("Loaded and compiled model from disk") 274 | 275 | else: 276 | model_filename = pre_trained_model_filename 277 | 278 | print("attempting to load: {}".format(model_filename)) 279 | model = load_model(model_filename) 280 | print("Loaded model from disk") 281 | 282 | return model 283 | 284 | 285 | 286 | #%% train model 287 | 288 | def train_model(self,trainNeeded): 289 | myData = self.myData 290 | 291 | if trainNeeded: 292 | try: 293 | # define checkpoint so that model is saved if it is better than previously saved model 294 | checkpoint = ModelCheckpoint(self.model_fname, 295 | monitor='val_accuracy', 296 | verbose=0, 297 | save_best_only=True, 298 | 299 | mode='auto') 300 | 301 | #FIXME: fix the callbacks list bug and model checkpoints 302 | callbacks_list = [checkpoint] 303 | 304 | start = time.time() 305 | self.training_history = self.model.fit(myData.xtrain, 306 | myData.ytrain, 307 | validation_data=(myData.xvalid,myData.yvalid), 308 | batch_size=self.batch_size, 309 | epochs=self.epochs, 310 | validation_split=0.33, 311 | verbose = 1, 312 | #callbacks=callbacks_list, 313 | shuffle=True) 314 | 315 | self.train_time = time.time() - start 316 | print("Train Time : ",self.train_time) 317 | 318 | except KeyboardInterrupt: 319 | print('interrupted') 320 | 321 | #%% evaluate model performance on train set 322 | 323 | def evaluate_model(self): 324 | myData = self.myData 325 | 326 | temp=np.array([myData.I_opt.tolist()+ 327 | myData.I_bad.tolist() 328 | +myData.I_opt_valid.tolist() 329 | +myData.I_bad_valid.tolist()])[0] 330 | # temp=np.array([I_opt_ho.tolist()+I_bad_ho.tolist()])[0] 331 | xval=myData.states[temp,:,:] 332 | yval=myData.seqLabels[temp] 333 | self.xval = xval 334 | 335 | #FIXME: Make this more clear that its evaluating on two different sets of the model 336 | y_pred_prob=self.model.predict_proba(xval)[:,0] 337 | self.yValidation_prob = y_pred_prob 338 | 339 | self.auc_train = get_auc(yval, y_pred_prob) 340 | 341 | #%% evaluate model performance on test set 342 | 343 | temp=np.array([myData.I_opt_ho.tolist()+ 344 | myData.I_bad_ho.tolist()])[0] 345 | # temp=np.array([I_opt_ho.tolist()+I_bad_ho.tolist()])[0] 346 | xtest=myData.states[temp,:,:] 347 | ytest=myData.seqLabels[temp] 348 | self.xtest = xtest 349 | 350 | 351 | y_pred_prob=self.model.predict_proba(xtest)[:,0] 352 | 353 | self.y_pred_prob = y_pred_prob 354 | 355 | self.auc_test = get_auc(ytest, y_pred_prob) 356 | 357 | #TODO: Add threshold definition 358 | 359 | self.precision,self.recall,self.fscore, _ = precision_recall_fscore_support(ytest,y_pred_prob.round(), average='weighted') 360 | 361 | self.xtest = xtest 362 | self.ytest = ytest 363 | 364 | 365 | 366 | self.train_date = datetime.datetime.now() 367 | 368 | self.generate_output_file() 369 | 370 | def save_model(self): 371 | 372 | 373 | #TODO: add this to the m 374 | timestr = time.strftime("%Y%h%d-%H%M%S") 375 | 376 | self.timestamp = timestr 377 | 378 | 379 | self.model.save(self.model_path) 380 | 
save_something(self.myData,self.data_path) 381 | #save model container separately from the model (otherwise pickle doesn't work) 382 | temp = self.model 383 | self.model = None 384 | save_something(self,self.model_container_path) 385 | self.model = temp 386 | 387 | json_cfg_string = json.dumps(self.myData.json_data,sort_keys=True, indent=4, separators=(',', ': ')) 388 | 389 | 390 | 391 | 392 | with open(os.path.join(self.model_storage_directory,"DTMIL_config_{}.json".format(timestr)),'w') as outfile: 393 | outfile.write(json_cfg_string) 394 | outfile.close() 395 | 396 | 397 | 398 | def generate_output_file(self): 399 | print("generating output file...\n\n\n") 400 | 401 | myData = self.myData 402 | model_output_directory = self.model_output_directory 403 | 404 | dataset_header = "Output Summary:" 405 | training_samples = self.__format_sample_output("Training",myData.xtrain,myData.I_opt,myData.I_bad) 406 | validation_samples = self.__format_sample_output("Validation",myData.xvalid, myData.I_opt_valid,myData.I_bad_valid) 407 | test_samples = self.__format_sample_output("Test", self.xtest,myData.I_opt_ho,myData.I_bad_ho) 408 | 409 | auc_train = "AUC Train: {}".format(self.auc_train) 410 | auc_test = "AUC Test: {}".format(self.auc_test) 411 | precision = "Precision: {}".format(self.precision) 412 | recall = "Recall: {}".format(self.recall) 413 | f1_score = "F1 Score: {}".format(self.fscore) 414 | 415 | epochs = "Epochs: {}".format(self.epochs) 416 | batch_size = "Batch Size: {}".format(self.batch_size) 417 | regularization_parameter = "Lambda: {}".format(self.lam) 418 | dropout_rate = "Dropout Rate: {}".format(self.dr) 419 | train_date = "Trained on: {}".format(self.train_date) 420 | number_of_features = "Number of features: {}".format(myData.nfeat) 421 | 422 | dropped_states = myData.correlated_states.tolist() + myData.dropped_states.tolist() 423 | 424 | dropped_parameters = "Dropped Parameters: \n{}".format( dropped_states ) 425 | dropped_parameter_names ="{}".format( [myData.header[p] for p in dropped_states]) 426 | 427 | #Find a better way to express this within keras 428 | if(self.train_flag == False): 429 | train_date = "Reloaded Model" 430 | 431 | 432 | output_string_list = [dataset_header, 433 | number_of_features, 434 | train_date, 435 | "", 436 | training_samples, 437 | validation_samples, 438 | test_samples, 439 | "", 440 | epochs, 441 | regularization_parameter, 442 | dropout_rate, 443 | batch_size, 444 | "", 445 | dropped_parameters, 446 | dropped_parameter_names, 447 | "", 448 | auc_train, 449 | auc_test, 450 | precision, 451 | recall, 452 | f1_score 453 | ] 454 | 455 | 456 | output_string = "\n".join(output_string_list) 457 | print(output_string) 458 | print("\n") 459 | splice = myData.time_splice 460 | if(not splice): 461 | splice = 1 462 | 463 | 464 | #summary_filename = "model_output_summary_{}_percent.txt".format(int(splice*100)) 465 | summary_filename = "model_output_summary.txt" 466 | 467 | with open(os.path.join(model_output_directory,summary_filename),'w') as outfile: 468 | outfile.write(output_string) 469 | 470 | 471 | def __format_sample_output(self,name, total_samples, nominal_samples, adverse_samples): 472 | 473 | total_samples = "{} Samples: {}".format(name,len(total_samples)) 474 | nominal_samples = " - Nominal: {}".format(len(nominal_samples)) 475 | adverse_samples = " - Adverse: {}".format(len(adverse_samples)) 476 | 477 | return "\n".join([total_samples,nominal_samples,adverse_samples]) 478 | 479 | 480 | 481 | 
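# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original source file):
# the layer stack built earlier in this file scores every time step with a
# TimeDistributed sigmoid ("instance probability") and then collapses the
# sequence into a single bag-level probability with
# MaxPooling1D(pool_size=maxlen). The small NumPy example below reproduces
# only that aggregation step; the window length and probability values are
# made up for illustration.

import numpy as np

toy_instance_probs = np.array([[0.05, 0.10, 0.80, 0.30, 0.20]])  # shape (batch, maxlen)
toy_bag_prob = toy_instance_probs.max(axis=1)  # max over time == MaxPooling1D over the whole window
print(toy_bag_prob)  # [0.8] -> the sequence is flagged by its most anomalous instant
# ---------------------------------------------------------------------------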
-------------------------------------------------------------------------------- /source/dtmil/visualizations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 20 15:55:46 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | import numpy as np, matplotlib.pyplot as plt 11 | from keras import backend as T 12 | import time 13 | import os 14 | from .utilities import flat_avg 15 | from dtmil.configuration.config_dtmil import get_json_config_data 16 | from .prediction_data import Prediction_Data 17 | import math 18 | 19 | #%%class def 20 | 21 | class Visualizer: 22 | 23 | #TODO: Redesign this to work with multiple sources without depending on having all the data at once 24 | def __init__(self, myData, myModel, sample_idx = None, guidelines = True, prediction_data = None, dataset_dir = None, input_json_data = None): 25 | 26 | self.myData = myData 27 | self.myModel = myModel 28 | self._current_sample = sample_idx 29 | 30 | ##FIXME: make this update the visualization parameters every run (grab location of config file from myData?) 31 | 32 | if (input_json_data is not None): 33 | json_data = input_json_data 34 | 35 | else: 36 | _, json_data, _ = get_json_config_data(dataset_dir) 37 | 38 | 39 | 40 | self.visualization_params = json_data['visualization'] 41 | 42 | ##FIXME: Make this more able to be manually defined 43 | sf = 0.25 44 | self.xvec_scale_factor = sf 45 | 46 | self.xvec_timeline=np.arange((self.myData.maxlen-1)*sf,-sf,-sf) 47 | 48 | #this is to account for the extra value in the start and end indeces. Will be best practice to fix in the future 49 | self.xvec_temp_time_lookup = np.copy(self.xvec_timeline) 50 | self.xvec_temp_time_lookup = np.append(self.xvec_temp_time_lookup,self.xvec_timeline[-1]) 51 | 52 | 53 | 54 | if sample_idx == None: 55 | print(f"sample index is set to None, using default value") 56 | sample_idx = 0 57 | 58 | if prediction_data: 59 | self.prediction_data = prediction_data 60 | else: 61 | self.prediction_data = Prediction_Data(myData,myModel,sample_idx) 62 | 63 | self.guidelines = guidelines 64 | if (guidelines): 65 | self.get_guidelines() 66 | 67 | @classmethod 68 | def frompredictiondata(cls, prediction_data, guidelines = True): 69 | #initialize from preditcion data 70 | 71 | return cls(prediction_data.myData, prediction_data.myModel, prediction_data.current_sample, prediction_data = prediction_data) 72 | 73 | #%%plot sample timeline function 74 | 75 | @property 76 | def current_sample(self): 77 | return self._current_sample 78 | 79 | @current_sample.setter 80 | def current_sample(self,value): 81 | self._current_sample = value 82 | self.prediction_data = Prediction_Data(self.myData,self.myModel,value) 83 | 84 | def plot_sample_timeline(self, figure_size = None, saveFig = True): 85 | 86 | myModel = self.myModel 87 | model_output_directory = myModel.model_output_directory 88 | xtest = myModel.xtest 89 | 90 | if (saveFig): 91 | plt.switch_backend('agg') 92 | 93 | # function to get an intermediate layer's output (instance probabilities) 94 | inst_layer_output_fn = T.function([myModel.model.layers[0].input],[myModel.model.layers[-2].output]) 95 | 96 | temp=xtest 97 | L=inst_layer_output_fn([temp])[0] 98 | nex=int(temp.shape[0]/2) 99 | 100 | plt.figure(figsize=figure_size) 101 | plt.subplot(2,1,1) 102 | plt.plot(np.transpose(L[:nex,:,0]),'g') 103 | plt.ylim([-0.1,1.1]) 104 | #plt.xlabel('Time to adverse event',fontsize=14) 105 | #plt.xlabel('Sample 
timeline',fontsize=14) 106 | plt.ylabel('Probability of \n adverse event',fontsize=14) 107 | # plt.xticks([0,10,20],['1000 ft \n altitude', '10 mi', '20 mi'],rotation=0) 108 | #plt.gca().invert_xaxis() 109 | plt.subplot(2,1,2) 110 | plt.plot(np.transpose(L[nex:,:,0]),'r') 111 | plt.ylim([-0.1,1.1]) 112 | #plt.gca().invert_xaxis() 113 | plt.xlabel('sample timeline',fontsize=14) 114 | #plt.xticks([0,10,20],['1000 ft \n altitude', '10 mi', '20 mi'],rotation=0) 115 | plt.ylabel('Probability of \n adverse event',fontsize=14) 116 | 117 | temp=self.myData.xvalid 118 | L=inst_layer_output_fn([temp])[0] 119 | nex=int(temp.shape[0]/2) 120 | np.where(L[nex:,80:,0]>0.5)[0][:10] 121 | 122 | if(saveFig): 123 | plt.savefig(os.path.join(model_output_directory,"timeline.png")) 124 | 125 | #%%batch visualization function 126 | #FIXME: text sizing 127 | def visualize_sample_parameters(self,figure_size = None, saveFig = False, file_output_dir = "",file_output_type = "pdf",num_columns = 5, subplot_aspect_ratio = (1,1), subplot_size = 3.6): 128 | myData = self.myData 129 | # myModel = self.myModel 130 | 131 | 132 | if (saveFig): 133 | plt.switch_backend('agg') 134 | 135 | #specify the variables to be included in the plot 136 | correlated_states = myData.correlated_states.tolist() 137 | trained_states = myData.parameter_selection.tolist() 138 | parameters_to_plot=correlated_states + trained_states 139 | correlated_indeces = len(correlated_states) 140 | 141 | num_plots = len(parameters_to_plot) + 1 142 | num_rows = math.ceil(float(num_plots)/float(num_columns)) 143 | 144 | if figure_size is None: 145 | width = 4*num_columns 146 | height = num_rows * 3.5 147 | 148 | figure_size = (width,height) 149 | 150 | 151 | fig, axs = plt.subplots(num_rows,num_columns, figsize= figure_size) 152 | axs=axs.ravel() 153 | 154 | starting_index = -1-myData.maxlen+1 155 | 156 | for pltIdx in np.arange(len(parameters_to_plot)): 157 | selected_parameter = parameters_to_plot[pltIdx] 158 | 159 | plot_title = "{}".format(myData.header[selected_parameter]) 160 | #add holdout to the title if it's within the correlated indeces 161 | if (pltIdx < correlated_indeces): 162 | plot_title = plot_title + "(H/O)" 163 | 164 | self.plot_parameter(selected_parameter,axs[pltIdx],starting_index, plot_title = plot_title) 165 | 166 | # plot precursor score in a separate subplot 167 | pltIdx=pltIdx+1 168 | self.plot_precursor_score(axs[pltIdx],'Precursor Score') 169 | fig.tight_layout() 170 | 171 | # save figure if needed 172 | if saveFig: 173 | 174 | suffix = "_{}".format(self.myData.get_filename(self.current_sample)) 175 | 176 | file_label, file_dataset_type = self.myData.get_grouping(self.current_sample) 177 | 178 | filename = "{}_{}".format(file_label,file_dataset_type) 179 | 180 | save_figure(self.myModel,suffix,fig,file_output_dir,filename,file_output_type = 'pdf') 181 | #self.save_figure(fig,file_output_dir) 182 | 183 | 184 | 185 | def special_ranking_visualization(self, states_to_visualize,sorted_ranking_sums,figure_size = (10,10), saveFig = False, file_output_dir = "",file_output_type = "pdf"): 186 | myData = self.myData 187 | 188 | fig, axs = plt.subplots(3,3, figsize= figure_size) 189 | axs=axs.ravel() 190 | 191 | self.plot_precursor_score(axs[1],'Precursor Score') 192 | 193 | for i in range(6): 194 | selected_parameter = states_to_visualize[i] 195 | 196 | plot_title = "{} ({})".format(myData.header[selected_parameter],sorted_ranking_sums[i]) 197 | #add holdout to the title if it's within the correlated indeces 198 | 199 | 
self.plot_parameter(selected_parameter,axs[i+3],0, plot_title = plot_title) 200 | 201 | 202 | 203 | #TODO: same as below except ordered ranking parameters with a variable number of columns and such 204 | #output with values of ranking 205 | #figure out what the values mean to report to bryan tomorrow 206 | def visualize_top_ranking_parameters(self,ranking_group,feature_num_limit=None,num_columns = 4,displayfig = False): 207 | 208 | file_output_dir = "feature_ranking" 209 | myData = self.myData 210 | 211 | if (not displayfig): 212 | plt.switch_backend('agg') 213 | 214 | #get as many as we can 215 | #score_pair_lists = ranking_group.top_ranking_scores(1) 216 | 217 | #response_windows_lists = ranking_group.top_response_windows(1) 218 | response_windows_lists = ranking_group.ordered_response_windows_list 219 | 220 | if(feature_num_limit is not None): 221 | if len(response_windows_lists[0])> feature_num_limit: 222 | response_windows_lists = [lst[0:feature_num_limit] for lst in response_windows_lists] 223 | 224 | num_windows = len(response_windows_lists) 225 | #print(feature_num_limit,len(response_windows_lists[0]),len(response_windows_lists[1])) 226 | 227 | for idx,response_windows in enumerate(response_windows_lists): 228 | 229 | parameter_selection = [window.attribute_index for window in response_windows] 230 | 231 | # print([window.ranking_score for window in response_windows]) 232 | # print([window.most_important_sd_response for window in response_windows]) 233 | score_list = [round(window.ranking_score,3) for window in response_windows] 234 | 235 | sd_response_list = [] 236 | for window in response_windows: 237 | most_important_response = window.most_important_sd_response 238 | if most_important_response is not None: 239 | sd_response_list.append(str(most_important_response)) 240 | else: 241 | sd_response_list.append("n/a") 242 | 243 | #sd_response_list = [round(window.most_important_sd_response,3) for window in response_windows] 244 | 245 | 246 | num_plots = len(response_windows) + 1 247 | num_rows = math.ceil(float(num_plots)/float(num_columns)) 248 | 249 | width = 4*num_columns 250 | height = num_rows * 3.5 251 | 252 | figsize = (width,height) 253 | fig, axs = plt.subplots(num_rows,num_columns, figsize= figsize) 254 | 255 | axs=axs.ravel() 256 | fig.tight_layout() 257 | 258 | xvec_timeline = self.xvec_timeline 259 | plot_idx = 0 260 | 261 | axs[plot_idx].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r',linewidth=2,label = "Default") 262 | axs[plot_idx].set_title("Precursor Score",fontsize=10) 263 | axs[plot_idx].set_ylim([0,1]) 264 | axs[plot_idx].invert_xaxis() 265 | 266 | if(self.guidelines): 267 | axs[plot_idx].plot(self.xvec_timeline,self.precursor_score_guideline,'k--') 268 | 269 | graph_colors = ['b','g','k','y','c','m','k','w'] 270 | color_idx = 0 271 | 272 | sd_disturbances = ranking_group.parent.standard_deviation_disturbances 273 | 274 | #TODO: condense everything below into one function (rather than writing the same code twice) 275 | parameter_window_indeces = [ranking_group.parameter_list.index(i) for i in parameter_selection] 276 | parameter_windows = [ranking_group.parameter_windows[i] for i in parameter_window_indeces] 277 | 278 | #if this process isn't behind an if statement, the algorithm will output blank graphs 279 | #furthermore, it will cause some of the following graphs to come out blank as well 280 | #the cause of this is unknown, but may be useful to investigate in the future 281 | if len(parameter_windows)>0: 282 | 283 | #TODO: Figure out why 
this conditional became necessary and the one above stopped working? (maybe some revisions impacted it?) 284 | if len(parameter_windows[0].start_indeces)>0: 285 | 286 | start_index = parameter_windows[0].start_indeces[idx] 287 | end_index = parameter_windows[0].end_indeces[idx] 288 | 289 | window_start_idx = self.xvec_temp_time_lookup[start_index] 290 | window_end_idx = self.xvec_temp_time_lookup[end_index] 291 | 292 | axs[plot_idx].axvspan(window_start_idx, window_end_idx, alpha=0.1, color='k') 293 | for index,window in enumerate(parameter_windows): 294 | color_idx = 0 295 | plot_idx = index+1 296 | 297 | axs[plot_idx].invert_xaxis() 298 | #axs[plot_idx].set(adjustable='box', aspect=1) 299 | axs[plot_idx].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r', label = "Default",linewidth=2) 300 | axs[plot_idx].axvspan(window_start_idx, window_end_idx, alpha=0.1, color='k') 301 | 302 | 303 | for precursor_score in window.modified_precursor_scores: 304 | selected_parameter = parameter_selection[index] 305 | 306 | disturbance = sd_disturbances[color_idx] 307 | 308 | if disturbance > 0: 309 | label = "+ {} σ response".format(disturbance) 310 | 311 | else: 312 | label = "- {} σ response".format(abs(disturbance)) 313 | 314 | 315 | axs[plot_idx].plot(xvec_timeline,precursor_score,graph_colors[color_idx],linewidth=2,label = label) 316 | axs[plot_idx].set_title("{} \n({}, {} σ response)".format(myData.header[selected_parameter],score_list[index],sd_response_list[index]),fontsize=10) 317 | axs[plot_idx].set_ylim([0,1]) 318 | if(self.guidelines): 319 | axs[plot_idx].plot(self.xvec_timeline,self.precursor_score_guideline,'k--') 320 | color_idx += 1 321 | 322 | if(plot_idx>1): 323 | handles, labels = axs[plot_idx].get_legend_handles_labels() 324 | fig.legend(handles, labels, loc='lower right') 325 | 326 | #save the figure 327 | plt.tight_layout() 328 | 329 | file_label, file_dataset_type = self.myData.get_grouping(ranking_group.data_ID) 330 | filename = "{}_{}_ranking".format(file_label,file_dataset_type) 331 | 332 | suffix = "_{}".format(self.myData.get_filename(ranking_group.data_ID)) 333 | 334 | if num_windows > 1: 335 | suffix = "{}_precursor_event_{}".format(suffix,idx) 336 | 337 | save_figure(self.myModel,suffix,fig,file_output_dir,filename,output_time = False) 338 | 339 | else: 340 | #TODO: 341 | print("Precursor score for {} does not cross threshold?".format(self.myData.get_filename(ranking_group.data_ID))) 342 | 343 | else: 344 | print("Precursor score for {} does not cross threshold!".format(self.myData.get_filename(ranking_group.data_ID))) 345 | 346 | 347 | # def visualize_ranking_data(self,ranking_group, output_file = None, parameter_selection = None, num_columns = 7, subplot_aspect_ratio = (1,1), subplot_size = 3.6): 348 | # myData = self.myData 349 | # print("generating ranking data plot") 350 | # 351 | # if parameter_selection is None: 352 | # parameter_selection = myData.parameter_selection.tolist() 353 | # 354 | # #all the paramaeters plus the precursor score in its own plot 355 | # num_plots = len(parameter_selection) + 1 356 | # num_rows = math.ceil(float(num_plots)/float(num_columns)) 357 | # dx, dy = subplot_aspect_ratio 358 | # figsize = plt.figaspect(float(dy * num_rows) / float(dx * num_columns)) * subplot_size 359 | # 360 | # fig, axs = plt.subplots(num_rows,num_columns, figsize= figsize) 361 | # #fig, axs = plt.subplots(numRows,numColumns) 362 | # axs=axs.ravel() 363 | # fig.tight_layout() 364 | # #xvec_timeline=np.arange((myData.maxlen-1)*0.25,-0.25,-0.25) 365 
| # 366 | # xvec_timeline = self.xvec_timeline 367 | # 368 | # axs[0].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r',linewidth=2) 369 | # axs[0].set_title("Normal",fontsize=10) 370 | # axs[0].set_ylim([0,1]) 371 | # axs[0].invert_xaxis() 372 | # 373 | # graph_colors = ['b','g','k','y'] 374 | # color_idx = 0 375 | # 376 | # parameter_window_indeces = [ranking_group.parameter_list.index(i) for i in parameter_selection] 377 | # parameter_windows = [ranking_group.parameter_windows[i] for i in parameter_window_indeces] 378 | # 379 | # for index,window in enumerate(parameter_windows): 380 | # color_idx = 0 381 | # plot_idx = index+1 382 | # axs[plot_idx].invert_xaxis() 383 | # 384 | # for precursor_score in window.modified_precursor_scores: 385 | # selected_parameter = parameter_selection[index] 386 | # 387 | # axs[plot_idx].plot(xvec_timeline,precursor_score,graph_colors[color_idx],linewidth=2) 388 | # axs[plot_idx].set_title("{} ({})".format(myData.header[selected_parameter],selected_parameter),fontsize=10) 389 | # axs[plot_idx].set_ylim([0,1]) 390 | # axs[plot_idx].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r',linewidth=1) 391 | # color_idx += 1 392 | 393 | 394 | #%%save figure 395 | 396 | def save_figure(self, fig,file_output_dir,file_output_type = 'pdf'): 397 | 398 | save_figure(self.myModel,self.current_sample,fig,file_output_dir,"parameters_graph",file_output_type = 'pdf') 399 | 400 | 401 | #%%plot precursor score 402 | 403 | def plot_precursor_score(self, plot_axis, plot_title = "Precursor Score", start_index = None, end_index = None): 404 | precursor_score = self.prediction_data.precursor_score 405 | plot_axis.plot(self.xvec_timeline[start_index:end_index], precursor_score[start_index:end_index],'r',linewidth=2) 406 | 407 | if(self.guidelines): 408 | plot_axis.plot(self.xvec_timeline[start_index:end_index],self.precursor_score_guideline[start_index:end_index],'k--') 409 | 410 | plot_axis.invert_xaxis() 411 | plot_axis.set_title(plot_title,fontsize=10) 412 | plot_axis.set_ylim([0,1]) 413 | 414 | 415 | #%%plot indivudual parameter 416 | 417 | def plot_parameter(self, selected_parameter, plot_axis,starting_index = 0,end_index = None,plot_title = "", precIdx = None): 418 | 419 | ##FIXME: Make this more able to be manually defined 420 | xvec_timeline=self.xvec_timeline 421 | 422 | #FIXME: Make Prediction Data update states_orig ("visualization_sample") 423 | parameter_values = self.prediction_data.visualization_window[starting_index:end_index,selected_parameter] 424 | 425 | # plot time series variable 426 | plot_axis.plot(xvec_timeline[starting_index:end_index],parameter_values,linewidth=2) 427 | 428 | ##plot the guidelines 429 | # if discrete variable, use discrete nominal data as guideline, else use continuous nominal data 430 | if selected_parameter in self.visualization_params["binary_parameters"]: 431 | plot_axis.plot(xvec_timeline[starting_index:end_index],self.discrete_nominal_guideline[starting_index:end_index,selected_parameter],'k--',linewidth=2) 432 | plot_axis.set_ylim([-0.1,1.1]) 433 | else: 434 | plot_axis.plot(xvec_timeline[starting_index:end_index],self.nominal_guideline[0,starting_index:end_index,selected_parameter],'k--',linewidth=2) 435 | plot_axis.plot(xvec_timeline[starting_index:end_index],self.nominal_guideline[1,starting_index:end_index,selected_parameter],'k--',linewidth=2) 436 | 437 | ##use this if we are dealing with multiple precursor score predictions, otherwise use the one genereated upon class initialization 438 | if 
(precIdx): 439 | precursor_indeces = precIdx 440 | else: 441 | precursor_indeces = self.prediction_data.precursor_indeces 442 | 443 | # plot precursor time instants as an overlay 444 | if len(precursor_indeces)>0: 445 | 446 | precursor_overlay_values = self.prediction_data.visualization_window[precursor_indeces,selected_parameter] 447 | 448 | self.precursor_overlay_values = precursor_overlay_values 449 | if(end_index): 450 | if end_index >= precursor_indeces[0]: 451 | precursor_end_index = (np.abs(precursor_indeces - (end_index))).argmin() 452 | print(precursor_end_index,end_index) 453 | 454 | plot_axis.plot(xvec_timeline[precursor_indeces][0:precursor_end_index],precursor_overlay_values[0:precursor_end_index],'ro', alpha = 0.4) 455 | else: 456 | plot_axis.plot(xvec_timeline[precursor_indeces],precursor_overlay_values,'ro', alpha = 0.4) 457 | 458 | # 459 | if plot_title == "": 460 | plot_title = "{} ({})".format(self.myData.header[selected_parameter],selected_parameter) 461 | 462 | plot_axis.set_title(plot_title,fontsize=10) 463 | 464 | # # invert x-axis so that distance to touchdown reduces as we go towards rightside of the plot 465 | plot_axis.invert_xaxis() 466 | 467 | #%%get guidelines 468 | 469 | def get_guidelines(self): 470 | myData = self.myData 471 | optimal_values=myData.states_orig[:,np.concatenate((myData.I_opt,myData.I_opt_valid),axis=0)] 472 | #determine guidelines 473 | guideline_type = self.visualization_params["guideline_type"] 474 | if guideline_type == 1: 475 | optimal_standard_dev = np.std(optimal_values, axis=1) 476 | optimal_mean = np.mean(optimal_values,axis = 1) 477 | 478 | avg_guideline =flat_avg(optimal_mean) 479 | sdev_guideline = flat_avg(optimal_standard_dev) 480 | 481 | sdev_scale = 2.5 482 | upper_guideline = avg_guideline + sdev_scale * sdev_guideline 483 | lower_guideline = avg_guideline - sdev_scale * sdev_guideline 484 | nominal_guideline = np.array([lower_guideline, upper_guideline]) 485 | else: 486 | # get nominal percentiles for plotting 487 | nominal_guideline=np.percentile(optimal_values,[10,90],axis=1) 488 | 489 | self.nominal_guideline = nominal_guideline 490 | # Get nominal values assuming binary (note that we will only use this if the variable is binary) 491 | self.discrete_nominal_guideline=np.mean(optimal_values,axis=1) 492 | self.precursor_score_guideline = np.full(optimal_values.shape[0],self.prediction_data.precursor_threshold) 493 | 494 | 495 | 496 | 497 | def save_figure(myModel, figure_suffix, fig,file_output_dir,filename,file_output_type = 'pdf', output_time = True): 498 | time_start = time.time() 499 | print("Saving figure: {}".format(figure_suffix)) 500 | model_output_directory = myModel.model_output_directory 501 | 502 | if model_output_directory != "": 503 | model_output_directory = os.path.join(model_output_directory,file_output_dir) 504 | if not os.path.exists(model_output_directory): 505 | print(f"creating directory {model_output_directory}") 506 | os.makedirs(model_output_directory) 507 | 508 | 509 | 510 | filename = "{}{}.{}".format(filename,figure_suffix,"pdf") 511 | filepath = os.path.join(model_output_directory,filename) 512 | 513 | #print("Saving figure: {}".format(filepath)) 514 | 515 | fig.savefig(filepath,format= file_output_type) 516 | 517 | # if(output_time): 518 | # print("Total time to save figure: {}".format(time.time()-time_start)) 519 | 520 | def visualize(myData, myModel,sample_idx = 0, savefig = False): 521 | 522 | vis = Visualizer(myData,myModel,sample_idx) 523 | 524 | vis.plot_sample_timeline(figure_size = 
(8,6), saveFig = savefig) 525 | 526 | print("Visualizing Sample {}".format(sample_idx)) 527 | vis.visualize_sample_parameters(figure_size=(32,24),saveFig = savefig) 528 | 529 | 530 | 531 | --------------------------------------------------------------------------------
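# ---------------------------------------------------------------------------
# Editor's note (illustrative usage sketch, not part of the original source):
# the module-level visualize() helper defined above wraps the two most common
# plots (sample timeline + per-parameter grid). This sketch assumes a data
# container (myData) and a trained model container (myModel) have already
# been produced by the training pipeline; how they are reloaded from disk is
# not shown in visualizations.py.

from dtmil.visualizations import Visualizer, visualize

# one-call convenience wrapper for a single sample
visualize(myData, myModel, sample_idx=0, savefig=True)

# or drive the Visualizer class directly and step through several samples;
# assigning current_sample recomputes the Prediction_Data for the new sample
vis = Visualizer(myData, myModel, sample_idx=0)
for idx in range(3):
    vis.current_sample = idx
    vis.visualize_sample_parameters(saveFig=True, file_output_type="pdf")
# ---------------------------------------------------------------------------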