├── requirements.txt ├── datasets └── toy_data.zip ├── source ├── begin_DTMIL.py ├── batch_visualization.py ├── dataset_formatter_cl.py ├── feature_ranking_analysis.py ├── requirements.txt ├── dtmil │ ├── configuration │ │ ├── DTMIL_config_dir.json │ │ ├── DTMIL_config_default.json │ │ └── config_dtmil.py │ ├── prediction_data.py │ ├── feature_ranking │ │ ├── ranking_window.py │ │ └── feature_ranking.py │ ├── utilities.py │ ├── data_container.py │ ├── model_container.py │ └── visualizations.py ├── simple_visualization.py └── guis │ ├── parameter_selector.py │ └── dataset_formatter.py ├── ADOPT Corporate CLA.pdf ├── ADOPT Individual CLA.pdf ├── documentation ├── readme.txt ├── Usage Guide.pdf ├── ADOPT Corporate CLA.pdf ├── ADOPT Individual CLA.pdf ├── feature_ranking_example.pdf ├── parameter_graph_example.pdf ├── ADOPT NASA Open Source Agreement.pdf ├── Anomalous_Test_ranking_data_0193_precursor_event_1.pdf └── config_readme.rtf ├── ADOPT NASA Open Source Agreement.pdf └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | scipy 2 | matplotlib 3 | h5py 4 | numpy 5 | Keras 6 | tensorflow -------------------------------------------------------------------------------- /datasets/toy_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/datasets/toy_data.zip -------------------------------------------------------------------------------- /source/begin_DTMIL.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/begin_DTMIL.py -------------------------------------------------------------------------------- /ADOPT Corporate CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/ADOPT Corporate CLA.pdf -------------------------------------------------------------------------------- /ADOPT Individual CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/ADOPT Individual CLA.pdf -------------------------------------------------------------------------------- /documentation/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/readme.txt -------------------------------------------------------------------------------- /documentation/Usage Guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/Usage Guide.pdf -------------------------------------------------------------------------------- /source/batch_visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/batch_visualization.py -------------------------------------------------------------------------------- /source/dataset_formatter_cl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/dataset_formatter_cl.py -------------------------------------------------------------------------------- /ADOPT NASA Open Source Agreement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/ADOPT 
NASA Open Source Agreement.pdf -------------------------------------------------------------------------------- /source/feature_ranking_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/source/feature_ranking_analysis.py -------------------------------------------------------------------------------- /documentation/ADOPT Corporate CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/ADOPT Corporate CLA.pdf -------------------------------------------------------------------------------- /documentation/ADOPT Individual CLA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/ADOPT Individual CLA.pdf -------------------------------------------------------------------------------- /documentation/feature_ranking_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/feature_ranking_example.pdf -------------------------------------------------------------------------------- /documentation/parameter_graph_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/parameter_graph_example.pdf -------------------------------------------------------------------------------- /documentation/ADOPT NASA Open Source Agreement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/ADOPT NASA Open Source Agreement.pdf -------------------------------------------------------------------------------- /source/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.1.1 2 | pandas==0.25.1 3 | numpy==1.16.4 4 | Keras==2.3.0 5 | scipy==1.3.1 6 | h5py==2.9.0 7 | scikit_learn==0.21.3 8 | typing==3.7.4.1 9 | -------------------------------------------------------------------------------- /documentation/Anomalous_Test_ranking_data_0193_precursor_event_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nasa/ADOPT/HEAD/documentation/Anomalous_Test_ranking_data_0193_precursor_event_1.pdf -------------------------------------------------------------------------------- /source/dtmil/configuration/DTMIL_config_dir.json: -------------------------------------------------------------------------------- 1 | { 2 | "cache_file": "data/cached_data.h5", 3 | "datasets_directory": "../../../datasets/", 4 | "parameters_directory": "data/parameters/", 5 | "model_archive_directory": "misc/model_archive/", 6 | "model_storage_directory": "model_saves/", 7 | "raw_data_directory": "data/raw_data/", 8 | "selected_dataset":"selected_dataset.txt", 9 | "model_output_directory":"output/" 10 | } -------------------------------------------------------------------------------- /source/dtmil/configuration/DTMIL_config_default.json: -------------------------------------------------------------------------------- 1 | { 2 | "config_name":"run", 3 | "config_id":0, 4 | "id_hold":false, 5 | 6 | "training": 7 | { 8 | "batch_size": 32, 9 | "dr": 0, 10 | "epochs": 200, 11 | "lam": 0.01, 12 | "lr": 0.001, 13 | "nhd": 500, 14 | "nhr": 5, 15 | "seed": 0, 
16 | "pre_trained_model":"", 17 | "train_flag": 1, 18 | "pre_trained_json":"" 19 | }, 20 | 21 | "importing": 22 | { 23 | "nominal_filename":"filelist_nominal.txt", 24 | "adverse_filename": "filelist_adverse.txt", 25 | "holdout_percent": 0.1, 26 | "validation_percent":0.4, 27 | "state_cache": 0, 28 | "time_splice": 1 29 | }, 30 | 31 | "preprocessing": 32 | { 33 | "set_sample_length":null, 34 | "redundant_parameters": [], 35 | "drop_parameters":[], 36 | "all_parameter_names":[] 37 | }, 38 | 39 | "model_io": 40 | { 41 | "model_filename":"keras_model.h5", 42 | "model_container_filename":"model_container.pkl", 43 | "data_container_filename":"data.pkl" 44 | }, 45 | 46 | "visualization": 47 | { 48 | "binary_parameters":[], 49 | "guideline_type": 1, 50 | "precursor_threshold":0.5 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /source/simple_visualization.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | from enum import Enum 3 | from batch_visualization import Batch_Visualizer 4 | 5 | num_columns = 4 6 | 7 | def visualize_event(event_types,i_bad,i_good): 8 | if (Event_Type.Nominal in event_types): 9 | print("visualizing nominal") 10 | viz.save_sample_parameters(i_good,num_columns=num_columns) 11 | 12 | if (Event_Type.Anomalous in event_types): 13 | print("visualizing adverse") 14 | 15 | viz.save_sample_parameters(i_bad,num_columns=num_columns) 16 | 17 | 18 | class Dataset_Type(Enum): 19 | Train = 1 20 | Validation = 2 21 | Test = 3 22 | 23 | class Event_Type(Enum): 24 | Nominal = 1 25 | Anomalous = 2 26 | 27 | 28 | print(argv) 29 | #%% user defined variables 30 | 31 | if (len(argv) > 1): 32 | dataset_input = argv[1] 33 | 34 | else: 35 | dataset_input = input("Input the path of the dataset:\n") 36 | 37 | 38 | viz = Batch_Visualizer(dataset_input) 39 | input_val = "" 40 | 41 | while(input_val == ""): 42 | set_types = {1:"Train",2:"Validation",3:"Test"} 43 | 44 | 45 | prompt = ("Which part of the dataset would you like to visualize? If multiple, input the numbers separated by commas.\n\n" 46 | "1. Training Set\n" 47 | "2. Validation Set\n" 48 | "3. Test Set\n" 49 | "\n") 50 | 51 | set_nums = input(prompt) 52 | input_val = set_nums 53 | 54 | if (input_val == ""): 55 | print("no input selected, try again, or press control-c to exit\n") 56 | 57 | else: 58 | sets_list = [int(num) for num in set_nums.split(',')] 59 | 60 | dataset_types_list = [Dataset_Type(num) for num in sets_list] 61 | 62 | event_prompt = ("Which event would you like to visualize? If multiple, input the numbers separated by commas.\n\n" 63 | "1. Nominal\n" 64 | "2. 
Adverse\n" 65 | "\n") 66 | event_nums = input(event_prompt) 67 | event_list = [int(num) for num in event_nums.split(',')] 68 | event_types = [Event_Type(num) for num in event_list] 69 | 70 | 71 | 72 | print(dataset_types_list) 73 | if(Dataset_Type.Train in dataset_types_list): 74 | 75 | visualize_event(event_types,viz.myData.I_bad,viz.myData.I_opt) 76 | 77 | if(Dataset_Type.Validation in dataset_types_list): 78 | visualize_event(event_types,viz.myData.I_bad_valid,viz.myData.I_opt_valid) 79 | 80 | if(Dataset_Type.Test in dataset_types_list): 81 | visualize_event(event_types,viz.myData.I_bad_ho,viz.myData.I_opt_ho) 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /source/dtmil/prediction_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Apr 2 13:55:10 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | 11 | import numpy as np, matplotlib.pyplot as plt 12 | from keras import backend as T 13 | import time 14 | import os 15 | from dtmil.utilities import flat_avg 16 | from dtmil.model_container import ModelContainer 17 | from dtmil.data_container import DataContainer 18 | 19 | #%%class def 20 | 21 | 22 | class Prediction_Data: 23 | 24 | def __init__(self,myData:DataContainer,myModel:ModelContainer,sample_id:int = None, data_padding:bool = False, input_window = None): 25 | 26 | self.myData = myData 27 | self.myModel = myModel 28 | self.current_sample = sample_id 29 | 30 | #FIXME: Figure out what shape the input window will be. For now, it just assumes the same shape as the data sample (two indeces: [time,feature]) 31 | if input_window is not None: 32 | self.data_sample = input_window 33 | else: 34 | #TODO: make states and states_orig have the same "shape order" 35 | #both the arrays below are the same shape 36 | if sample_id is None: 37 | sample_id = 0 38 | print(f"no value provided for sample_id, setting to default value of {sample_id}") 39 | self.data_sample = myData.states[sample_id,:,:] 40 | 41 | self.data_length = len(self.data_sample) 42 | self.visualization_sample = myData.states_orig[:,sample_id,:] 43 | 44 | inst_layer_output_fn = T.function([myModel.model.layers[0].input],[myModel.model.layers[-2].output]) 45 | self.instance_layer_output_function = inst_layer_output_fn 46 | 47 | if(data_padding): 48 | self.pad_data() 49 | 50 | #self.pad_original_precursor_score() 51 | 52 | else: 53 | self.data_window = self.data_sample 54 | self.visualization_window = self.visualization_sample 55 | 56 | self.padded_sample = None 57 | self.padded_vis_sample = None 58 | 59 | self.start_index = 0 60 | self.end_index = self.data_length - 1 61 | 62 | self.update_predictions() 63 | 64 | def update_predictions(self): 65 | 66 | data_window = self.data_window 67 | data_length = len(data_window) 68 | num_features = len(data_window[0]) 69 | 70 | #TODO: get the states from myData if there isn't another type of input 71 | input_values=np.reshape(data_window,(1,data_length,num_features)) 72 | self.input_values = input_values 73 | 74 | # get instance probabilities (precursor score) 75 | L=self.instance_layer_output_function([input_values])[0] 76 | self.L = L 77 | 78 | self.precursor_score = L[0,:,0] 79 | 80 | # get precursor indeces 81 | #FIXME: Make this work with updating visualization params, or let the visualization module take it 82 | self.precursor_threshold = self.myData.json_data['visualization']["precursor_threshold"] 83 | 
self.precursor_indeces=np.where(self.precursor_score>self.precursor_threshold)[0] 84 | 85 | 86 | #This is only until we get actual streaming working 87 | def update_data_window(self,step_size = 1): 88 | 89 | new_start_index = self.start_index + step_size 90 | end_index = new_start_index + self.data_length 91 | 92 | if end_index >= len(self.padded_sample): 93 | #array would be out of bounds so we set it to the last value 94 | end_index = len(self.padded_sample) 95 | #new_start_index = end_index - self.data_length +1 96 | new_start_index = end_index - self.data_length 97 | 98 | 99 | self.start_index = new_start_index 100 | self.data_window = self.padded_sample[new_start_index:end_index] 101 | self.visualization_window = self.padded_vis_sample[new_start_index:end_index] 102 | 103 | #self.orig_prec_score_window = self.padded_orig_prec_score[new_start_index:end_index] 104 | 105 | self.update_predictions() 106 | 107 | #####TODO: Remove once demos are done 108 | 109 | def pad_data(self): 110 | 111 | data_sample = self.data_sample 112 | vis_sample = self.visualization_sample 113 | self.padded_sample, self.data_window = self.pad_sample(data_sample) 114 | self.padded_vis_sample, self.visualization_window = self.pad_sample(vis_sample) 115 | 116 | self.start_index = 0 117 | 118 | def pad_sample(self, sample): 119 | data_length = self.data_length 120 | pad_left = np.stack([sample[0]]*data_length) 121 | pad_right = np.stack([sample[-1]]*data_length) 122 | 123 | padded_sample = np.concatenate((pad_left,sample,pad_right)) 124 | start_index = 0 125 | #end_index = data_dlength - 1 126 | end_index = data_length 127 | 128 | data_window = padded_sample[start_index:end_index] 129 | 130 | return padded_sample, data_window 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ADOPT 2 | 3 | Although aviation accidents are rare, safety incidents occur more frequently and require a careful analysis to detect and mitigate risks in a timely manner. Analyzing safety incidents using operational data and producing event-based explanations is invaluable to airline companies as well as to governing organizations such as the Federal Aviation Administration (FAA) in the United States. However, this task is challenging because of the complexity involved in mining multi-dimensional heterogeneous time series data, the lack of time-step-wise annotation of events in a flight, and the lack of scalable tools to perform analysis over a large number of events. We propose a precursor mining algorithm: Automatic Discovery of Precursors in Time series data (ADOPT) that identifies events in the multidimensional time series that are correlated with the safety incident. Precursors are valuable to systems health and safety monitoring and in explaining and forecasting safety incidents. Current methods suffer from poor scalability to high dimensional time series data and are inefficient in capturing temporal behavior. We propose an approach by combining multiple-instance learning (MIL) and deep recurrent neural networks (DRNN) to take advantage of MIL's ability to learn using weakly supervised data and DRNN's ability to model temporal behavior. 4 | 5 | 6 | The objective of this project is to automate the analysis of flight safety incidents in a way that scales well and offers explanations. These explanations include: 7 | 8 | * When the degraded states start to appear? 
9 | * What are the degraded states? 10 | * What is the likelihood that the event will occur? 11 | * What corrective actions can be taken? 12 | 13 | This project aims to: 14 | 15 | * Create a novel deep temporal multiple-instance learning (DT-MIL) framework that combines multiple-instance learning with deep recurrent neural networks suitable for weakly-supervised learning problems involving time series or sequential data. 16 | * Provide a novel approach to explaining safety incidents using precursors mined from data. 17 | * Deliver a detailed evaluation of the DT-MIL model using real-world aviation data and a comparison with baseline models. 18 | * Perform a precursor analysis and explanation of a high-speed exceedance safety incident using flight data from a commercial airline. 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | This repository contains the following files in its top-level directory: 27 | 28 | * [source](source) 29 | The source code of the repository. This includes the ADOPT model, GUI configuration tools, and a command-line program that utilizes the model. 30 | 31 | * [documentation](documentation) 32 | Documents describing how to configure and run the program, as well as how to interpret the results. 33 | 34 | * [datasets](datasets) 35 | A directory containing a sample dataset. Other datasets may also be added here by the user. 36 | 37 | * [requirements.txt](requirements.txt) 38 | General module requirements for the program. A more specific requirements.txt can be found in [source](source). 39 | 40 | 41 | * [ADOPT NASA Open Source Agreement.pdf](ADOPT%20NASA%20Open%20Source%20Agreement.pdf) 42 | Licensing for ADOPT 43 | * [ADOPT Individual CLA.pdf](ADOPT%20Individual%20CLA.pdf) 44 | NASA Individual Contributor License Agreement 45 | * [ADOPT Corporate CLA.pdf](ADOPT%20Corporate%20CLA.pdf) 46 | NASA Corporate Contributor License Agreement 47 | 48 | 49 | 50 | 51 | ## Contact Info 52 | 53 | NASA Point of contact: Nikunj Oza, Data Science Group Lead. 54 | 55 | For questions regarding the research and development of the algorithm, please contact Bryan Matthews, Senior Research Engineer. 56 | 57 | For questions regarding the source code, please contact Daniel Weckler, Software Engineer. 58 | 59 | 60 | ## Copyright and Notices 61 | 62 | Notices: 63 | 64 | Copyright © 2019 United States Government as represented by the Administrator of the National Aeronautics and Space Administration. All Rights Reserved. 65 | 66 | Disclaimers 67 | 68 | No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION, IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE, AND DISTRIBUTES IT "AS IS." 69 | 70 | Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. 
IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT. 71 | 72 | -------------------------------------------------------------------------------- /source/dtmil/feature_ranking/ranking_window.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Apr 16 19:42:33 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | from typing import List 11 | import numpy as np 12 | from dtmil.prediction_data import Prediction_Data 13 | 14 | class Parameter_Score_Window: 15 | 16 | def __init__(self, start_idx:List[int],end_idx:List[int], parent_group, disturbed_parameter:int): 17 | self.prediction_data:Prediction_Data = parent_group.prediction_data 18 | self.sd_disturbances:List[int] = parent_group.parent.standard_deviation_disturbances 19 | self.disturbed_parameter:int = disturbed_parameter 20 | self.start_indeces:List[int] = start_idx 21 | self.end_indeces:List[int] = end_idx 22 | self.modified_precursor_scores:List[float] 23 | self.subwindows:List[Precursor_Event_Window] 24 | self.parent_group = parent_group 25 | 26 | precursor_score = self.prediction_data.precursor_score 27 | window_count = len(start_idx) 28 | 29 | if window_count == 0: 30 | self.modified_precursor_scores = [self.prediction_data.precursor_score] 31 | window = Precursor_Event_Window(precursor_score,None,self) 32 | self.subwindows = [window] 33 | 34 | else: 35 | self.__disturb_parameters() 36 | 37 | subwindows = [] 38 | for i in range(window_count): 39 | start = start_idx[i] 40 | end = end_idx[i] 41 | score_window = self.prediction_data.precursor_score[start:end] 42 | modified_windows = [window[start:end] for window in self.modified_precursor_scores] 43 | subwindows.append(Precursor_Event_Window(score_window,modified_windows,self)) 44 | 45 | self.subwindows = subwindows 46 | 47 | 48 | def __disturb_parameters(self): 49 | 50 | param_list = self.prediction_data.myData.parameter_selection.tolist() 51 | self.modified_precursor_scores = [] 52 | 53 | for standard_deviation_scale in self.sd_disturbances: 54 | modified_input_data = np.copy(self.prediction_data.input_values) 55 | 56 | i = param_list.index(self.disturbed_parameter) 57 | 58 | singleFeature = modified_input_data[:,:,i] 59 | standard_dev = np.std(singleFeature) * standard_deviation_scale 60 | singleFeature += standard_dev 61 | modified_input_data[:,:,i] = singleFeature 62 | 63 | L=self.prediction_data.instance_layer_output_function([modified_input_data])[0] 64 | modified_precursor_score = L[0,:,0].tolist() 65 | self.modified_precursor_scores.append(modified_precursor_score) 66 | 67 | @property 68 | def most_negative_percent_differences(self): 69 | return [abs(sw.most_negative_percent_diff) for sw in self.subwindows] 70 | 71 | @property 72 | def most_important_sd_responses(self): 73 | return [abs(sw.most_important_sd_response) for sw in self.subwindows] 74 | 75 | 76 | 77 | class Precursor_Event_Window: 78 | 79 | def __init__(self,precursor_score_window:List[float],modified_score_windows:List[List[float]], 
parent_window:Parameter_Score_Window): 80 | self.precursor_score_window = precursor_score_window 81 | self.modified_score_windows = modified_score_windows 82 | self.parent_window = parent_window 83 | 84 | self.most_negative_percent_diff:float 85 | 86 | if modified_score_windows is None: 87 | self.most_negative_percent_diff = 0 88 | self.most_important_sd_response = None 89 | 90 | else: 91 | self.__compare_precursor_scores() 92 | 93 | #compare all the scores with each SD disturbance to see which suppresses the precursor score the most 94 | def __compare_precursor_scores(self): 95 | 96 | percent_differences = [] 97 | precursor_window = self.precursor_score_window 98 | 99 | if len(precursor_window) == 1: 100 | integrate = np.mean 101 | else: 102 | integrate = np.trapz 103 | 104 | avgDefault = integrate(precursor_window) 105 | 106 | for modified_window in self.modified_score_windows: 107 | 108 | avgCurrent = integrate(modified_window) 109 | 110 | diff_percent = (avgDefault-avgCurrent)/(avgDefault)*100 111 | percent_differences.append(diff_percent) 112 | 113 | most_negative_diff = 0 114 | most_important_sd_response = None 115 | 116 | 117 | for i,percent_diff in enumerate(percent_differences): 118 | if percent_diff > most_negative_diff: 119 | most_negative_diff = percent_diff 120 | most_important_sd_response = self.parent_window.sd_disturbances[i] 121 | 122 | self.most_negative_percent_diff = most_negative_diff 123 | self.most_important_sd_response = most_important_sd_response 124 | 125 | 126 | @property 127 | def ranking_score(self): 128 | return self.most_negative_percent_diff 129 | 130 | @property 131 | def attribute_index(self): 132 | return self.parent_window.disturbed_parameter 133 | 134 | @property 135 | def attribute_label(self): 136 | return self.parent_window.prediction_data.myData.param_index_to_label(self.attribute_index) 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /source/dtmil/utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 19 12:34:58 2018 5 | 6 | @author: dweckler 7 | """ 8 | ''' 9 | @author: vjanakir 10 | This is the code for deep temporal multiple instance learning (DTMIL). This is the version of ADOPT that is based on deep learning. 11 | The code assumes Keras with a Theano or TensorFlow backend. 12 | It uses an Anaconda virtual environment with Python 2.7 and Keras. It should also work in Python 3.x, but this has not been tested. 13 | ''' 14 | # load python libraries 15 | import numpy as np, math 16 | from keras.engine.topology import Layer, InputSpec 17 | from keras import backend as T 18 | import pickle 19 | import os 20 | from scipy.integrate import trapz 21 | 22 | 23 | 24 | source_path = os.path.dirname(os.path.realpath(__file__)) 25 | 26 | 27 | #%% custom model functions 28 | 29 | def sigmoid(x,decay,bias): 30 | a = [] 31 | for item in x: 32 | a.append(1/(1+math.exp(-decay*(item-bias)))) 33 | return a 34 | 35 | def get_weight_fn(maxlen): 36 | temp=0.1+np.array(sigmoid(np.arange(maxlen).tolist(),decay=0.1,bias=70)) 37 | temp=temp/np.sum(temp) 38 | return temp 39 | #plt.plot(get_weight_fn(100)) 40 | 41 | class aggregationLayer(Layer): 42 | """ 43 | This is a custom Keras layer. This pooling layer accepts the temporal 44 | sequence output by a recurrent layer and performs multiple instance pooling, 45 | looking at only the non-masked portion of the sequence. 
The pooling 46 | layer converts the instance probabilities (same length as input sequence) into a bag-level probability. 47 | 48 | input shape: (nb_samples, nb_timesteps, nb_features) 49 | output shape: (nb_samples, 1) 50 | """ 51 | def __init__(self, **kwargs): 52 | super(aggregationLayer, self).__init__(**kwargs) 53 | self.supports_masking = True 54 | self.input_spec = [InputSpec(ndim=3)] 55 | 56 | def get_output_shape_for(self, input_shape): 57 | return (input_shape[0], input_shape[2]) 58 | 59 | def call(self, x, mask=None): 60 | if mask is None: 61 | mask = T.mean(T.ones_like(x), axis=-1) 62 | mask = T.cast(mask,T.floatx()) 63 | 64 | dr_perc=0.5 65 | mask1=T.dropout(mask,level=dr_perc) 66 | mask1=T.clip(mask1, 0, 1) 67 | 68 | mod_smax=T.max(x[:,:,0]*mask1,axis=1).dimshuffle(0,'x') 69 | smax = T.max(x[:,:,0]*mask,axis=1).dimshuffle(0,'x') #(nb_samples, np_features) 70 | smin = T.min(x[:,:,0]*mask,axis=1).dimshuffle(0,'x') #(nb_samples, np_features) 71 | 72 | # mod_smax=T.expand_dims(T.max(x[:,:,0]*mask1,axis=1), 1) 73 | # smax = T.expand_dims(T.max(x[:,:,0]*mask,axis=1), 1) #(nb_samples, np_features) 74 | # smin = T.expand_dims(T.min(x[:,:,0]*mask,axis=1), 1) #(nb_samples, np_features) 75 | 76 | x_rounded=x[:,:,0]*mask 77 | sum_unmasked=T.batch_dot(x_rounded,mask,axes=1) # (nb_samples,np_features) 78 | 79 | ssum = T.sum(x,axis=-2) #(nb_samples, np_features) 80 | rcnt = T.sum(mask,axis=-1,keepdims=True) #(nb_samples) # number of unmasked samples in each record 81 | bag_label=sum_unmasked/rcnt 82 | smean=ssum/rcnt 83 | 84 | # # sigmoid weighted mean: 85 | # weight_fn=T.reshape(T.transpose(T.tile(T.reshape(T.variable(get_weight_fn(100)),(100,1)),T.shape(x)[0])),(T.shape(x)[0],T.shape(x)[1],1)) 86 | # weighted_x=weight_fn*x 87 | # wsum=T.sum(weighted_x,axis=-2) #(nb_samples, np_features) 88 | ## weight_sum=T.reshape(T.batch_dot(T.ones_like(x),weight_fn,axes=1),T.shape(rcnt)) # used T.ones_like(x) instead of x to check if I am seeing the outputs..which helped me debug 89 | # wmean=wsum # because the weights are normalized 90 | 91 | # sofmax=(1/largeNum)*T.log(T.sum(T.exp())) 92 | 93 | # return bag_label 94 | return smax # max voting 95 | # return smin # min voting 96 | # return smean # temporal mean pooling 97 | # return wmean # sigmoid weighted mean 98 | # return sofmax 99 | # return mod_smax 100 | 101 | def compute_mask(self, input, mask): 102 | return None 103 | 104 | 105 | 106 | def get_auc(ytest, ytest_prob): 107 | tau_mat=np.arange(0,1.01,0.01) 108 | TPR=np.zeros(len(tau_mat),) 109 | FPR=np.ones(len(tau_mat),) 110 | for i in np.arange(len(tau_mat)): 111 | tau=tau_mat[i] 112 | ytest_pred=np.zeros(ytest_prob.shape) 113 | ytest_pred[ytest_prob>tau]=1 114 | posIdx=np.where(ytest==1)[0] 115 | TPR[i]=len(np.where(ytest_pred[posIdx]==1)[0])/float(len(posIdx)) 116 | negIdx=np.where(ytest==0)[0] 117 | FPR[i]=len(np.where(ytest_pred[negIdx]==1)[0])/float(len(negIdx)) 118 | auc_bag=abs(trapz(TPR,FPR)) 119 | return auc_bag 120 | 121 | 122 | #save a file to a specified directory 123 | def save_something(stuffToSave,filename): 124 | with open ('{}'.format(filename),'wb') as output: 125 | pickle.dump(stuffToSave,output, pickle.HIGHEST_PROTOCOL) 126 | 127 | #load a file from a specified directory 128 | def load_something(filename): 129 | with open ('{}'.format(filename),'rb') as inFile: 130 | return pickle.load(inFile) 131 | 132 | #grab labels from indeces 133 | def get_labels_from_indeces(label_indeces,label_strings): 134 | ordered_label_strings = np.asarray([label_strings[p] for p in label_indeces]) 
135 | 136 | if isinstance(label_indeces, list): 137 | ordered_label_strings = ordered_label_strings.tolist() 138 | 139 | return ordered_label_strings 140 | 141 | #dual option for multi-sort 142 | def dual_sort(myList, side_list,absolute_value = True,reverse = False): 143 | sorted_list, side_lists = multi_sort(myList, [side_list],absolute_value,reverse) 144 | 145 | return sorted_list, side_lists[0] 146 | 147 | 148 | #easily sort multiple arrays at once 149 | def multi_sort(myList,side_lists,absolute_value = True,reverse = False): 150 | #preprocess and get our sort arrays 151 | myArray = np.asarray(myList) 152 | if (absolute_value): 153 | myArray = np.absolute(myArray) 154 | sorted_indeces = np.argsort(myArray) 155 | 156 | ##main array sort 157 | sorted_array = myArray[sorted_indeces] 158 | if (reverse): 159 | sorted_array = np.flip(sorted_array, axis = 0) 160 | 161 | #sort everything else according to main array 162 | sorted_side_arrays = [] 163 | for sList in side_lists: 164 | sorted_arr = np.asarray(sList)[sorted_indeces] 165 | if(reverse): 166 | sorted_arr = np.flip(sorted_arr,axis=0) 167 | 168 | sorted_side_arrays.append(sorted_arr) 169 | 170 | return(sorted_array,sorted_side_arrays) 171 | 172 | 173 | def flat_avg(avg_array): 174 | flat_mean = np.mean(avg_array,axis = 0) 175 | mean_list = [] 176 | arr_size = avg_array.shape[0] 177 | for mean_val in flat_mean: 178 | new_arr = np.full(arr_size,mean_val) 179 | mean_list.append(new_arr) 180 | 181 | avg_guideline = np.array(mean_list) 182 | return avg_guideline.swapaxes(0,1) 183 | 184 | -------------------------------------------------------------------------------- /source/guis/parameter_selector.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from tkinter import Tk, Listbox, Grid, Label, END,filedialog, Scrollbar, VERTICAL, RIGHT, Y, EXTENDED 5 | from tkinter.ttk import Frame, Button, Entry, Style 6 | 7 | from tkinter.messagebox import showinfo, showerror 8 | import os 9 | import sys 10 | import json 11 | 12 | sep = os.path.sep 13 | source_path = "..{}dtmil{}configuration{}".format(sep,sep,sep) 14 | sys.path.append(source_path) 15 | #from config_dtmil import get_json_config_data 16 | 17 | def exit_program(): 18 | root.destroy() 19 | exit() 20 | 21 | 22 | def read_lines_from_file(directory,filename): 23 | 24 | with open(os.path.join(directory,filename),'r') as f: 25 | content = f.readlines() 26 | content = [x.strip() for x in content] 27 | 28 | return content 29 | 30 | def write_lines_to_file(list_of_text,directory,filename): 31 | filepath = os.path.join(directory,filename) 32 | # print(f"Filepath: {filepath}") 33 | with open(filepath,'w') as f: 34 | for text in list_of_text: 35 | f.write("{}\n".format(text)) 36 | 37 | 38 | class ParameterSelector: 39 | 40 | def __init__(self,master): 41 | self.master = master 42 | sel_col = 0 43 | hold_col = 1 44 | 45 | lbl_row = 0 46 | lst_row = 1 47 | btn_row = 2 48 | save_row = 3 49 | 50 | self.edited = False 51 | 52 | scale_factor = 0.5 53 | window_width = int(root.winfo_screenwidth()*scale_factor) 54 | window_height = int(root.winfo_screenheight()*scale_factor) 55 | 56 | self.master.geometry(f"{window_width}x{window_height}") 57 | 58 | frame = Frame(master) 59 | 60 | Grid.columnconfigure(master,sel_col,weight=1) 61 | Grid.columnconfigure(master,hold_col,weight=1) 62 | 63 | Grid.rowconfigure(master, lst_row, weight=1) 64 | 65 | frame.grid() 66 | 67 | 68 | self.full_listbox = Listbox(master,selectmode = 
EXTENDED, width = 20) 69 | scrollbar = Scrollbar(self.full_listbox, orient=VERTICAL) 70 | self.full_listbox.config(yscrollcommand=scrollbar.set) 71 | scrollbar.config(command=self.full_listbox.yview) 72 | scrollbar.pack(side=RIGHT, fill=Y) 73 | self.full_listbox.grid(row=lst_row,column=sel_col,padx=(20,20),pady=(5,10),sticky = 'news') 74 | 75 | self.holdout_listbox = Listbox(master,selectmode = EXTENDED, width = 20) 76 | scrollbar = Scrollbar(self.holdout_listbox, orient=VERTICAL) 77 | self.holdout_listbox.config(yscrollcommand = scrollbar.set) 78 | scrollbar.config(command=self.holdout_listbox.yview) 79 | scrollbar.pack(side=RIGHT, fill=Y) 80 | self.holdout_listbox.grid(row=lst_row,column=hold_col,padx=(20,20),pady=(5,10),sticky = 'news') 81 | 82 | 83 | self.selected_param_label = Label(master,text = "Selected Parameters") 84 | self.selected_param_label.grid(row = lbl_row, column = sel_col) 85 | 86 | self.holdout_param_label = Label(master,text = "Holdout Parameters") 87 | self.holdout_param_label.grid(row = lbl_row,column = hold_col) 88 | 89 | self.selected_to_holdout_button = Button(master,text= "->", command = self.move_to_holdout) 90 | self.selected_to_holdout_button.grid(row= btn_row,column = sel_col,padx=(40,40),sticky = 'ew') 91 | 92 | self.holdout_to_selected_button = Button(master, text= "<-", command = self.move_to_selected) 93 | self.holdout_to_selected_button.grid(row= btn_row,column = hold_col,padx=(40,40),pady=(5,10),sticky = 'ew') 94 | 95 | self.save_button = Button(master,text = "Save", command = self.save_lists) 96 | self.save_button.grid(row = save_row, column = sel_col,pady=(5,10)) 97 | 98 | self.reset_button = Button(master,text= "Reset", command = self.config_listboxes) 99 | self.reset_button.grid(row = save_row,column = hold_col) 100 | 101 | self.dataset_dir = "" 102 | self.params_path = "" 103 | 104 | 105 | def select_dataset(self): 106 | #FIXME: Add an exception handler just in case the file isn't found 107 | 108 | #get DTMIL_config_dir.json 109 | directory_config_file = "DTMIL_config_dir.json" 110 | config_file = os.path.join(source_path, directory_config_file) 111 | 112 | with open(config_file, 'r') as dirfile: 113 | self.dir_data = json.load(dirfile) 114 | dirfile.close() 115 | 116 | 117 | #select the dataset directory 118 | self.dataset_dir = filedialog.askdirectory(title = "Choose Dataset Folder") 119 | 120 | if self.dataset_dir == "": 121 | exit_program() 122 | 123 | self.params_path = os.path.join(self.dataset_dir,self.dir_data["parameters_directory"]) 124 | 125 | self.dataset_cfg_filepath = os.path.join(self.dataset_dir,"DTMIL_config.json") 126 | 127 | with open(self.dataset_cfg_filepath) as cfg_file: 128 | 129 | self.dataset_config_file = json.load(cfg_file) 130 | cfg_file.close() 131 | 132 | 133 | def config_listboxes(self): 134 | 135 | self.edited = False 136 | 137 | #get the holdout state names + the selected variables 138 | 139 | ##load here from the config file 140 | preprocessing_parameters = self.dataset_config_file['preprocessing'] 141 | 142 | all_params = set(preprocessing_parameters["all_parameter_names"]) 143 | holdout_params = set(preprocessing_parameters["redundant_parameters"]) 144 | selected_params = all_params - holdout_params 145 | 146 | 147 | 148 | sel_list = sorted(list(selected_params)) 149 | hold_list = sorted(list(holdout_params)) 150 | 151 | #make sure the listboxes are clear 152 | self.full_listbox.delete(0,END) 153 | self.holdout_listbox.delete(0,END) 154 | 155 | self.full_listbox.insert(END,*sel_list) 156 | 
self.holdout_listbox.insert(END,*hold_list) 157 | 158 | #TODO: check with the dataset and verify that all the parameters are the same. 159 | # If there are missing ones, add them, if there is an extra in the set, throw and error, maybe give an option to delete it 160 | 161 | 162 | 163 | def after_startup_setup(self): 164 | self.select_dataset() 165 | self.config_listboxes() 166 | 167 | 168 | 169 | def move_to_holdout(self): 170 | self.move_listbox(self.full_listbox, self.holdout_listbox) 171 | 172 | 173 | def move_to_selected(self): 174 | self.move_listbox(self.holdout_listbox, self.full_listbox) 175 | 176 | 177 | def move_listbox(self,listbox_source, listbox_destination): 178 | self.edited = True 179 | 180 | print(listbox_source.curselection()) 181 | selected_idx = list(listbox_source.curselection()) 182 | 183 | selected_values = [] 184 | for i in selected_idx: 185 | selected_values.append(listbox_source.get(i)) 186 | 187 | for i in selected_idx[::-1]: 188 | listbox_source.delete(i) 189 | 190 | print(selected_values) 191 | listbox_destination.insert(END,*selected_values) 192 | 193 | def save_lists(self): 194 | #output lists to file 195 | if(not self.edited): 196 | showerror(title= "Error",message= "No changes made!") 197 | return 198 | 199 | #TODO: Have it save to the json file instead 200 | 201 | print("saving lists") 202 | selected_param_list = sorted(self.full_listbox.get(0,END)) 203 | holdout_param_list = sorted(self.holdout_listbox.get(0,END)) 204 | print(selected_param_list,holdout_param_list) 205 | 206 | ##write these to the files and overwrite 207 | 208 | 209 | preprocessing_parameters = self.dataset_config_file['preprocessing'] 210 | 211 | preprocessing_parameters["redundant_parameters"] = holdout_param_list 212 | 213 | json_cfg_string = json.dumps(self.dataset_config_file,sort_keys=True, indent=4, separators=(',', ': ')) 214 | 215 | with open(os.path.join(self.dataset_dir,"DTMIL_config.json"),'w') as outfile: 216 | outfile.write(json_cfg_string) 217 | outfile.close() 218 | 219 | showinfo("Info", "Successfully saved files\n" + "Sorting both lists") 220 | self.config_listboxes() 221 | 222 | 223 | 224 | 225 | 226 | root = Tk() 227 | root.title("Parameter Selector") 228 | 229 | param_sel = ParameterSelector(root) 230 | 231 | root.after(10,param_sel.after_startup_setup) 232 | 233 | root.mainloop() 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /source/dtmil/feature_ranking/feature_ranking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Thu Apr 18 11:41:04 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | from dtmil.utilities import dual_sort, get_labels_from_indeces 11 | from dtmil.visualizations import Visualizer 12 | 13 | 14 | from typing import List 15 | import numpy as np 16 | from dtmil.prediction_data import Prediction_Data 17 | from dtmil.data_container import DataContainer 18 | from dtmil.model_container import ModelContainer 19 | from dtmil.feature_ranking.ranking_window import Parameter_Score_Window 20 | import pandas as pd 21 | 22 | #%% Feature Ranking Class 23 | #TODO: ADD Placehodler data window padding options 24 | class Feature_Ranking: 25 | 26 | def __init__(self, data_ID_list:List[int], myData:DataContainer, myModel:ModelContainer, standard_deviation_disturbances:List[int] = [-2,2]): 27 | 28 | self.data_ID_list: List[int] = data_ID_list 29 | self.myData: DataContainer = myData 30 | self.myModel: 
ModelContainer = myModel 31 | self.standard_deviation_disturbances: List[int] = standard_deviation_disturbances 32 | self.ranking_group_list: List[Ranking_Group] = [] 33 | 34 | #create a ranking group for each data ID. 35 | for i,idx in enumerate(data_ID_list): 36 | 37 | print (f" progress:{i+1}/{len(data_ID_list)}", end="\r") 38 | 39 | ranking_group = Ranking_Group(idx,standard_deviation_disturbances,self) 40 | self.ranking_group_list.append(ranking_group) 41 | 42 | print("\n") 43 | 44 | def get_ranking_scores(self, attribute_type = 'label', top_number_of_features:int = None): 45 | feature_scores_list = [] 46 | 47 | for group in self.ranking_group_list: 48 | score_lists = group.all_ranking_scores 49 | #possibly add weights for rankings here 50 | for score_list in score_lists: 51 | feature_scores_list.append(score_list) 52 | 53 | #check to make sure we have at least one array 54 | if len(feature_scores_list) == 0: 55 | print("no feature scores!") 56 | return 57 | 58 | attributeIdx = self.ranking_group_list[0].parameter_list 59 | attributeSum = [sum(x)/len(feature_scores_list) for x in zip(*feature_scores_list)] 60 | sorted_sums, sorted_attributes = dual_sort(attributeSum,attributeIdx,reverse = True ) 61 | 62 | 63 | if attribute_type == 'label': 64 | #TODO: replace this with 65 | sorted_attributes = get_labels_from_indeces(sorted_attributes,self.myData.header) 66 | elif attribute_type == 'index': 67 | pass #since it's set to index by default 68 | else: 69 | print(f"invalid attribute type \"{attribute_type}\" specified, using \"index\" instead") 70 | 71 | if top_number_of_features is not None: 72 | sorted_sums = sorted_sums[:top_number_of_features] 73 | sorted_attributes = sorted_attributes[:top_number_of_features] 74 | 75 | return sorted_sums, sorted_attributes 76 | 77 | #TODO: expand this and the function it calls 78 | def export_graphs(self, top_number_of_features:int = None): 79 | #"default is none" 80 | parameter_selection = None 81 | if top_number_of_features is not None: 82 | sorted_ranking_sums, sorted_ranking_attributes = self.get_ranking_scores("index",top_number_of_features) 83 | parameter_selection = sorted_ranking_attributes 84 | 85 | vis = Visualizer(self.myData,self.myModel) 86 | for feature_group in self.ranking_group_list: 87 | vis.visualize_ranking_data(feature_group, parameter_selection = parameter_selection) 88 | 89 | def batch_output(self): 90 | sorted_ranking_sums, sorted_ranking_attributes = self.get_ranking_scores("index",6) 91 | 92 | for feature_group in self.ranking_group_list: 93 | vis = Visualizer(self.myData,self.myModel,feature_group.data_ID) 94 | 95 | vis.special_ranking_visualization(sorted_ranking_attributes,sorted_ranking_sums) 96 | 97 | 98 | #TODO: implement previous ranking features 99 | class Ranking_Group: 100 | 101 | def __init__(self,data_ID:int,standard_deviation_disturbances:List[int],parent:Feature_Ranking): 102 | 103 | self.data_ID:int = data_ID 104 | self.parent:Feature_Ranking = parent 105 | #self.default_feature:Sample_With_Disturbance = None 106 | self.parameter_list:List[int] = self.parent.myData.parameter_selection.tolist() 107 | self.parameter_windows:List[Parameter_Score_Window] 108 | self.score_weights:List[float] 109 | 110 | self.prediction_data:Prediction_Data = Prediction_Data(self.parent.myData,self.parent.myModel,data_ID) 111 | self.__define_window_region() 112 | self.__generate_parameter_windows() 113 | 114 | 115 | def __define_window_region(self): 116 | precursor_scores = self.prediction_data.precursor_score 117 | 118 | #TODO: apply 
smoothing to the precursor scores for graphs that are not as consistent 119 | threshold_list= np.array([i>0.5 for i in precursor_scores]) 120 | tl_padded = np.r_[False,threshold_list, False] 121 | # Get indices of shifts, which represent the start and stop indices 122 | shift_idx = np.flatnonzero(tl_padded[:-1] != tl_padded[1:]) 123 | 124 | # Get the start and stop indeces for all the windows 125 | 126 | #TODO: end_idx goes out of bounds if the graph ends on the precursor score. This only impacts graphing (and marginally at that), but should be fixed eventually 127 | self.start_idx:List[int] = shift_idx[:-1:2] 128 | self.end_idx:List[int] = shift_idx[1::2] 129 | 130 | def __generate_parameter_windows(self): 131 | 132 | self.parameter_windows = [] 133 | for parameter in self.parameter_list: 134 | windows = Parameter_Score_Window(self.start_idx,self.end_idx,self, parameter) 135 | self.parameter_windows.append(windows) 136 | 137 | 138 | 139 | def display_ranking_scores(self, num_scores= None): 140 | 141 | parameter_response_windows_list = self.ordered_response_windows_list 142 | print(type) 143 | num_windows = len(parameter_response_windows_list) 144 | for index, _response_windows in enumerate(parameter_response_windows_list): 145 | 146 | print("Window {} of {}".format(index+1,num_windows)) 147 | 148 | response_windows = _response_windows[:num_scores] 149 | 150 | scores = [window.ranking_score for window in response_windows] 151 | attribute_labels = [window.attribute_label for window in response_windows] 152 | 153 | df = pd.DataFrame(list(zip(attribute_labels,scores)),columns = ["Attribute", "Score"]) 154 | print(df.to_string(index=False)) 155 | print("\n") 156 | 157 | # @property 158 | # def ranking_scores(self): 159 | # return [abs(window.most_negative_percent_diff) for window in self.parameter_windows] 160 | 161 | def top_response_windows(self,percent_cutoff = None): 162 | 163 | #TODO: add this to the config file 164 | 165 | if percent_cutoff is None: 166 | percent_cutoff = 0.4 167 | 168 | print("percent cutoff",percent_cutoff) 169 | top_response_windows = [] 170 | for response_windows in self.ordered_response_windows_list: 171 | 172 | sorted_scores = [window.ranking_score for window in response_windows] 173 | print("sorted score length",len(sorted_scores)) 174 | 175 | score_sum = np.sum(sorted_scores) 176 | cutoff_sum = percent_cutoff*score_sum 177 | #print(cutoff_sum) 178 | partial_sum = 0 179 | 180 | 181 | cutoff_index = 0 182 | for index, score in enumerate(sorted_scores): 183 | partial_sum += score 184 | if partial_sum >= cutoff_sum: 185 | cutoff_index = index 186 | break 187 | 188 | top_windows = response_windows[:cutoff_index] 189 | top_response_windows.append(top_windows) 190 | 191 | return top_response_windows 192 | 193 | 194 | 195 | @property 196 | def all_ranking_scores(self): 197 | rs = np.array([np.array(window.most_negative_percent_differences) for window in self.parameter_windows]) 198 | 199 | return np.swapaxes(rs,0,1) 200 | 201 | 202 | @property 203 | def all_subwindows(self): 204 | sw = np.array([np.array(window.subwindows) for window in self.parameter_windows]) 205 | #swap the axes so we select by subwindow rather than parameter 206 | return np.swapaxes(sw,0,1) 207 | 208 | 209 | @property 210 | def ordered_response_windows_list(self): 211 | subwindows = self.all_subwindows 212 | 213 | parameter_windows_lists = [] 214 | for param_windows in subwindows: 215 | #sort each set of subwindows by their ranking score 216 | sorted_parameter_windows_list = 
sorted(param_windows,key=lambda subwindow: subwindow.ranking_score,reverse = True) 217 | 218 | parameter_windows_lists.append(sorted_parameter_windows_list) 219 | 220 | #return each subwindow list, ranked by the parameter response 221 | return parameter_windows_lists 222 | 223 | @property 224 | def window_count(self): 225 | return len(self.start_idx) 226 | 227 | 228 | 229 | 230 | 231 | 232 | -------------------------------------------------------------------------------- /source/dtmil/configuration/config_dtmil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Sun Jul 21 22:56:09 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | 11 | import numpy as np, math 12 | from keras.engine.topology import Layer, InputSpec 13 | from keras import backend as T 14 | import json 15 | import pickle 16 | import sys 17 | import os 18 | 19 | 20 | 21 | source_path = os.path.dirname(os.path.realpath(__file__)) 22 | directory_config_filename = "DTMIL_config_dir.json" 23 | 24 | 25 | def get_json_config_data(input_directory = None, new_run = False): 26 | 27 | dir_config_file_path = os.path.join(source_path, directory_config_filename) 28 | 29 | with open(dir_config_file_path, 'r') as dirfile: 30 | dir_data = json.load(dirfile) 31 | 32 | ##make sure the defined separators of the directory config file match that of the os 33 | for key,val in dir_data.items(): 34 | new_val = val.replace("/",os.path.sep) 35 | dir_data[key] = new_val 36 | 37 | ####Load a dataset from a specified directory 38 | if(input_directory): 39 | cfg_data, cfg_file_path, dataset_dir = get_from_input_directory(input_directory,dir_data) 40 | 41 | #get from "datasets" folder 42 | else: 43 | cfg_data, cfg_file_path, dataset_dir = get_from_datasets_directory(dir_data) 44 | 45 | #make sure the json file is up to date 46 | update_JSON(cfg_data,cfg_file_path) 47 | 48 | 49 | cfg_id_label = "config_id" 50 | name_label = "config_name" 51 | 52 | config_name = cfg_data[name_label] 53 | id_hold = cfg_data["id_hold"] 54 | config_id = cfg_data[cfg_id_label] 55 | 56 | if (new_run and not id_hold): 57 | #increment for this new run 58 | config_id += 1 59 | cfg_data[cfg_id_label] = config_id 60 | save_JSON(cfg_data,cfg_file_path) 61 | 62 | 63 | full_config_name = "{}_{}".format(config_name,config_id) 64 | 65 | 66 | #set up the model storage and model output directories and check to see if the directory was there before 67 | directory_existed_already = __get_directory_with_ID(dir_data,dataset_dir,"model_storage_directory",full_config_name) 68 | __get_directory_with_ID(dir_data,dataset_dir,"model_output_directory",full_config_name) 69 | 70 | #TODO: determine if we want this warning if hold is set 71 | #check to make sure we don't overwrite an existing run 72 | if(new_run and directory_existed_already): 73 | 74 | if(id_hold): 75 | print("ID hold is ON") 76 | 77 | choice = input("Existing run (\"{}\") already found, do you wish to overwrite? 
(y/n)\n".format(full_config_name)) 78 | 79 | if choice == 'y': 80 | print("Overwriting existing run") 81 | 82 | else: 83 | if choice != 'n': 84 | print("Invalid input") 85 | 86 | output_string = ("If you do not wish to overwrite the existing run, change the \"{}\" field in the JSON file " 87 | "to a number greater than or equal to the current run.\n" 88 | "Alternatively, change the \"{}\" field to a name that doesn't conflict\n".format(cfg_id_label,name_label)) 89 | 90 | print(output_string) 91 | 92 | print("Exiting program...") 93 | sys.exit(0) 94 | 95 | 96 | 97 | return dir_data, cfg_data, dataset_dir 98 | 99 | 100 | 101 | def get_from_input_directory(input_directory,dir_data): 102 | 103 | if(input_directory.endswith(".json")): 104 | json_name = os.path.basename(input_directory) 105 | cfg_file_path = input_directory 106 | dataset_dir = os.path.dirname(input_directory) 107 | 108 | else: 109 | json_name = 'DTMIL_config.json' 110 | cfg_file_path = os.path.join(input_directory,json_name) 111 | dataset_dir = input_directory 112 | 113 | with open(cfg_file_path) as cfgfile: 114 | cfg_data = json.load(cfgfile) 115 | 116 | 117 | datasets_dir = os.path.abspath(os.path.join(source_path, dir_data["datasets_directory"])) 118 | filename = dir_data["selected_dataset"] 119 | file_dir = os.path.join(datasets_dir, filename) 120 | 121 | 122 | with open (file_dir, 'w') as selected_dataset_file: 123 | selected_dataset_file.write(dataset_dir) 124 | selected_dataset_file.close() 125 | 126 | 127 | return cfg_data, cfg_file_path, dataset_dir 128 | 129 | 130 | 131 | def get_from_datasets_directory(dir_data): 132 | 133 | datasets_dir = os.path.abspath(os.path.join(source_path, dir_data["datasets_directory"])) 134 | filename = dir_data["selected_dataset"] 135 | file_dir = os.path.join(datasets_dir, filename) 136 | 137 | prior_selected_dataset_file = False 138 | 139 | #check for selected dataset file 140 | try: 141 | selected_dataset_file = open(file_dir,'r') 142 | dataset_name = selected_dataset_file.readline() 143 | #print(dataset_name) 144 | prior_selected_dataset_file = True 145 | selected_dataset_file.close() 146 | 147 | #if it's not found, create it 148 | except IOError: 149 | dataset_name = input("{} not found, type the path of the dataset you wish to open\n".format(filename)) 150 | selected_dataset_file = open(file_dir,'w') 151 | selected_dataset_file.write(dataset_name) 152 | selected_dataset_file.close() 153 | 154 | #open the dataset 155 | if(prior_selected_dataset_file): 156 | new_name = input("Type the path of the dataset you wish to open, or press enter to open '{}'\n".format(dataset_name)) 157 | if(new_name != ""): 158 | dataset_name = new_name 159 | os.remove(file_dir) 160 | 161 | 162 | selected_dataset_file = open(file_dir,'w') 163 | selected_dataset_file.write(dataset_name) 164 | selected_dataset_file.close() 165 | 166 | 167 | if os.path.exists(dataset_name): 168 | dataset_dir = dataset_name 169 | 170 | else: 171 | dataset_dir = os.path.join(datasets_dir,dataset_name) 172 | 173 | 174 | #check to see if the file is there, if not, then clear the selected dataset and exit the program 175 | try: 176 | cfg_file_name ="DTMIL_config.json" 177 | cfg_file_path = os.path.join(dataset_dir,cfg_file_name) 178 | 179 | cfgfile = open(cfg_file_path) 180 | cfg_data = json.load(cfgfile) 181 | cfgfile.close() 182 | 183 | except IOError as e: 184 | print("{}".format(e)) 185 | print("config file(s) not found. The dataset and/or config files may not exist in the specified directory. 
Clearing {}".format(filename)) 186 | 187 | os.remove(file_dir) 188 | 189 | sys.exit() 190 | 191 | return cfg_data, cfg_file_path, dataset_dir 192 | 193 | 194 | 195 | 196 | def find_missing_keys(d,old_d): 197 | # old_subdict = getFromDict(old_dict,map_list) 198 | added_keys = [] 199 | for key,val in d.items(): 200 | #print(key) 201 | if key not in old_d: 202 | old_d[key] = val 203 | added_keys.append(key) 204 | if isinstance(val,dict): 205 | old_subdict= old_d[key] 206 | added_keys = added_keys + find_missing_keys(val,old_subdict) 207 | 208 | return added_keys 209 | 210 | def delete_extra_keys(old_d,orig_dict): 211 | keys_to_pop = [] 212 | all_popped_keys = [] 213 | for key,val in old_d.items(): 214 | if key not in orig_dict: 215 | keys_to_pop.append(key) 216 | #print("pop attempted") 217 | if isinstance(val,dict): 218 | if key not in keys_to_pop: 219 | orig_subdict = orig_dict[key] 220 | all_popped_keys = all_popped_keys + delete_extra_keys(val,orig_subdict) 221 | 222 | for key in keys_to_pop: 223 | old_d.pop(key) 224 | 225 | return keys_to_pop + all_popped_keys 226 | 227 | 228 | def update_JSON(json_to_change,json_to_change_filepath): 229 | 230 | config_path = os.path.join(source_path,"DTMIL_config_default.json") 231 | with open(config_path) as json_file: 232 | orig_json = json.load(json_file) 233 | 234 | added_stuff = find_missing_keys(orig_json,json_to_change) 235 | deleted_stuff = delete_extra_keys(json_to_change,orig_json) 236 | stuff_added = len(added_stuff)>0 237 | stuff_removed = len(deleted_stuff)>0 238 | 239 | if stuff_added or stuff_removed: 240 | print("The current json file is outdated") 241 | 242 | if (stuff_removed): 243 | print(f"Entries that will be removed: {deleted_stuff}") 244 | if (stuff_added): 245 | print(f"Entries that will be added (program will crash otherwise): {added_stuff}") 246 | 247 | choice = input("Do you wish to overwrite the current JSON file? 
(y/n)\n") 248 | 249 | if choice == 'y': 250 | save_JSON(json_to_change,json_to_change_filepath) 251 | 252 | else: 253 | if choice != 'n': 254 | print("Invalid choice") 255 | 256 | print("Exiting program") 257 | sys.exit() 258 | 259 | def save_JSON(json_data_to_save,json_to_save_filepath): 260 | print("Writing to JSON file") 261 | json_cfg_string = json.dumps(json_data_to_save,sort_keys=True, indent=4, separators=(',', ': ')) 262 | with open(os.path.join(json_to_save_filepath),'w') as outfile: 263 | outfile.write(json_cfg_string) 264 | print("Write successful") 265 | outfile.close() 266 | 267 | 268 | 269 | 270 | def __get_directory_with_ID(dir_data, dataset_dir, directory_string,ID): 271 | 272 | #folder_name = 'run' 273 | 274 | updated_dir = os.path.join(dir_data[directory_string], "{}".format(ID)) 275 | dir_data[directory_string] = updated_dir 276 | full_dir_data = os.path.join(dataset_dir,dir_data[directory_string]) 277 | 278 | directory_exists = os.path.exists(full_dir_data) 279 | if not directory_exists: 280 | print("Creating:", full_dir_data) 281 | os.makedirs(full_dir_data) 282 | 283 | return directory_exists -------------------------------------------------------------------------------- /documentation/config_readme.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf600 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fswiss\fcharset0 Helvetica-Oblique;\f2\fswiss\fcharset0 Helvetica-BoldOblique; 3 | \f3\fswiss\fcharset0 Helvetica-Bold;} 4 | {\colortbl;\red255\green255\blue255;} 5 | {\*\expandedcolortbl;;} 6 | \margl1440\margr1440\vieww28600\viewh15240\viewkind0 7 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 8 | 9 | \f0\fs28 \cf0 Usage\ 10 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 11 | 12 | \fs24 \cf0 Place your properly formatted dataset in the \'93datasets\'94 directory. Run begin_DTMIL.py from the 13 | \f1\i source 14 | \f0\i0 directory. To edit parameters, for each run, change the values in DTMIL_config.json. 15 | \fs28 \ 16 | \ 17 | \ 18 | \ 19 | DTMIL_config.json\ 20 | 21 | \f1\i\fs24 Dataset specific parameters. Each dataset has one of these files describing it\ 22 | \ 23 | 24 | \f0\i0 \ul Config_ID:\ 25 | \ulnone The ID for the dataset used in model_storage and model_output. 
This is used so we can have multiple config files to run different parameters/models in parallel 26 | \f1\i \ 27 | \ 28 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 29 | 30 | \f0\i0 \cf0 \ul \ulc0 Training\ulnone \ 31 | \ 32 | batch_size\ 33 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 34 | 35 | \f1\i \cf0 Number of samples to be propagated through the network\ 36 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 37 | 38 | \f0\i0 \cf0 \ 39 | dr\ 40 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 41 | 42 | \f1\i \cf0 Dropout rate (between 0 and 1)\ 43 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 44 | 45 | \f0\i0 \cf0 \ 46 | epochs\ 47 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 48 | 49 | \f1\i \cf0 Number of training passes through the data 50 | \f0\i0 \ 51 | \ 52 | lam\ 53 | 54 | \f1\i Regularization parameter (lambda) 55 | \f0\i0 \ 56 | \ 57 | lr\ 58 | 59 | \f1\i Specified learning rate 60 | \f0\i0 \ 61 | \ 62 | nhd\ 63 | 64 | \f1\i Number of hidden units in the fully connected layer 65 | \f0\i0 \ 66 | \ 67 | nhr\ 68 | 69 | \f1\i Number of units in the recurrent layer 70 | \f0\i0 \ 71 | \ 72 | seed\ 73 | 74 | \f1\i Random seed\ 75 | \pard\pardeftab720\sl280\partightenfactor0 76 | \cf0 \ 77 | \pard\pardeftab720\sl280\partightenfactor0 78 | 79 | \f0\i0 \cf0 pre_trained_model (string filepath)\ 80 | \pard\pardeftab720\sl280\partightenfactor0 81 | 82 | \f1\i \cf0 Location of a pre-trained model to load if train_flag is set to 0. Loads from the default file path (model_saves/[run_id]/[model_filename] if set to a blank string (\'93\'94) ( 83 | \f2\b The \'93default file path\'94 is currently buggy, so if you want to load a new model, specify the file path here) 84 | \f1\b0 \ 85 | \ 86 | \pard\pardeftab720\sl280\partightenfactor0 87 | 88 | \f0\i0 \cf0 pre_trained_json (string filepath): 89 | \f1\i \ 90 | Location of a json file to load a pre-trained model from if the model is split into a model and json. 
Set to a blank string (\'93\'94) to ignore this parameter 91 | \f2\b (currently only works with a full file path, will be fixed in the future) 92 | \f1\b0 \ 93 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 94 | \cf0 \ 95 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 96 | 97 | \f0\i0 \cf0 train_flag\ 98 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 99 | 100 | \f1\i \cf0 Set to 0 to use a specified pre-trained model, 1 to create a new model \ 101 | \ 102 | \ 103 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 104 | 105 | \f0\i0 \cf0 \ul Importing\ 106 | \ 107 | \ulnone nominal_filename:\ul \ 108 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 109 | 110 | \f1\i \cf0 \ulnone Filename for the nominal data\ 111 | \ 112 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 113 | 114 | \f0\i0 \cf0 adverse_filename:\ 115 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 116 | 117 | \f1\i \cf0 Filename for the adverse data\ 118 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 119 | 120 | \f0\i0 \cf0 \ul \ 121 | \ulnone validation percent:\ 122 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 123 | 124 | \f1\i \cf0 Percent of samples to be used for the validation set (after the test set has been held out)\ 125 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 126 | 127 | \f0\i0 \cf0 \ 128 | holdout_percent\ 129 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 130 | 131 | \f1\i \cf0 Percent of samples to be held out for the test set\ 132 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 133 | 134 | \f0\i0 \cf0 \ul \ 135 | \ulnone state_cache\ 136 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 137 | 138 | \f1\i \cf0 # 0 to load states from a saved cache, 1 to load from the original CSV files. If a cache does not exist, the program will load from the CSV files, then create one. The cache is updated every time the program loads from CSV files. 139 | \f0\i0 \ul \ 140 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 141 | 142 | \fs28 \cf0 \ulnone \ 143 | \pard\pardeftab720\sl280\partightenfactor0 144 | 145 | \fs24 \cf0 time_splice:\ 146 | \pard\pardeftab720\sl280\partightenfactor0 147 | 148 | \f1\i \cf0 The percentage of the time window to use. Set to 1 to use the whole window (recommend). 
Example: 0.6 will only use the first 60 % of the time steps in the window\ 149 | \pard\pardeftab720\sl280\partightenfactor0 150 | 151 | \f0\i0 \cf0 \ul \ 152 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 153 | \cf0 \ 154 | Preprocessing\ 155 | \ulnone \ 156 | all_parameter_names (list of strings):\ 157 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 158 | 159 | \f1\i \cf0 List of all parameter names for the dataset\ 160 | \ 161 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 162 | 163 | \f0\i0 \cf0 set_sample_length (int):\ 164 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 165 | 166 | \f1\i \cf0 Number defining the length of each time series. Time series above this number will be truncated, ones below this number will be dropped. If this value is set to null, the length will be inferred from the dataset 167 | \f0\i0 \ 168 | \ 169 | redundant_parameters (int or string list): \ 170 | 171 | \f1\i List of parameters to hold out when training. This lets you manually define parameters to be held out (rather than the automatic process above)\ 172 | \ 173 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 174 | 175 | \f0\i0 \cf0 drop_parameters (int or string list):\ 176 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 177 | 178 | \f1\i \cf0 Parameters to be dropped and not included in the visualization (the redundant and held out parameters are still included in the visualization) 179 | \f0\i0 \ 180 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 181 | \cf0 \ul \ 182 | \ 183 | Model_IO:\ 184 | \ulnone model_filename:\ 185 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 186 | 187 | \f1\i \cf0 Filename for the keras model\ 188 | \ 189 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 190 | 191 | \f0\i0 \cf0 model_container_filename\ 192 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 193 | 194 | \f1\i \cf0 Filename for the model container, but this doesn\'92t include the keras model itself (trained or untrained)since that can\'92t be pickled\ 195 | \ 196 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 197 | 198 | \f0\i0 \cf0 data_container_filename:\ 199 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 200 | 201 | \f1\i \cf0 Filename for the data container where the dataset is stored 202 | \f0\i0 \ul \ 203 | \ 204 | Visualization\ 205 | \ 206 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 207 | \cf0 \ulnone precursor_threshold\ 208 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 209 | 210 | \f1\i \cf0 The precursor score threshold from which we define an adverse event 
(default 0.5)\ 211 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 212 | 213 | \f0\i0 \cf0 \ 214 | guideline_type:\ 215 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 216 | 217 | \f1\i \cf0 This sets the guidelines (dashed lines) in the visualization process. 218 | \f0\i0 \ 219 | \ 220 | \pard\pardeftab720\sl280\partightenfactor0 221 | \cf0 binary_parameters (int list)\ 222 | \pard\pardeftab720\sl280\partightenfactor0 223 | 224 | \f1\i \cf0 Tells the visualization code which parameters are binary 225 | \f0\i0 \ 226 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 227 | 228 | \fs28 \cf0 \ 229 | DTMIL_config_dir.json 230 | \fs24 \ 231 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 232 | 233 | \f1\i \cf0 This file contains the file directories needed to help the program run. These do not need to be changed unless the user wishes to customize the file structure of the program. The \'93datasets\'94 directory assumes the \'93source\'94 directory is used as a base. All the other directories assume that the datasets directory + the selected dataset (defined by the user) is used as a base.\ 234 | \ 235 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 236 | 237 | \f0\i0 \cf0 cache_file\ 238 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 239 | 240 | \f1\i \cf0 Save location for the parameters that are cached from a previous run of the program. 
This is used so large datasets don\'92t need to be imported from .csv files every time the program is run\ 241 | \ 242 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 243 | 244 | \f0\i0 \cf0 datasets_directory\ 245 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 246 | 247 | \f1\i \cf0 The directory where all the datasets are located\ 248 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 249 | 250 | \f0\i0 \cf0 \ 251 | parameters_directory\ 252 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 253 | 254 | \f1\i \cf0 This is where the file lists and parameters are stored\ 255 | \ 256 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 257 | 258 | \f0\i0 \cf0 model_archive_directory\ 259 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 260 | 261 | \f1\i \cf0 This is where backups from each run of the model are stored\ 262 | \ 263 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 264 | 265 | \f0\i0 \cf0 model_storage_directory 266 | \f3\b (Currently labeled as \'93model saves\'94) 267 | \f0\b0 \ 268 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 269 | 270 | \f1\i \cf0 Contains a separate save of the model and final parameters (used to visualize without having to run the whole program/train the whole model again).\ 271 | \ 272 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 273 | 274 | \f0\i0 \cf0 model_output_directory 275 | \f3\b (Currently \'93output\'94) 276 | \f0\b0 \ 277 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 278 | 279 | \f1\i \cf0 This is where output files like graphs are stored.\ 280 | \ 281 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 282 | 283 | \f0\i0 \cf0 selected_dataset 284 | \f1\i \ 285 | This is a text file that determines which dataset that you will be using. It\'92s auto-generated by the program, but you can edit it manually. You may also delete the file, which will prompt the user to define a new dataset\ 286 | \ 287 | 288 | \f0\i0 raw_data_directory\ 289 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 290 | 291 | \f1\i \cf0 This is where the raw data (in csv format) is stored. 292 | \f0\i0 \ 293 | \ 294 | } -------------------------------------------------------------------------------- /source/dtmil/data_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 5 | @author: vjanakir 6 | This is the code for deep temporal multiple instance learning (DTMIL). This is the version of ADOPT that is based on deep learning. 7 | The code assumes Keras with Theano or Tensorflow backend. 8 | uses Anaconda virtual env with Python 2.7 and keras. It should also work in Python 3.x but not tested. 
9 | 10 | Created on Tue Jun 19 14:44:18 2018 11 | 12 | @author: dweckler 13 | 14 | """ 15 | 16 | from __future__ import print_function 17 | 18 | import os 19 | import sys 20 | import numpy as np 21 | import time 22 | import h5py 23 | import random 24 | import pandas as pd 25 | from pathlib import Path 26 | 27 | ##FIXME: This does not work properly for values other than 10%, the cause is currently unknown 28 | 29 | def holdout_split(indeces, holdout_percent=10): 30 | 31 | full_arr = np.asarray(indeces) 32 | 33 | if holdout_percent == 0: 34 | full_arr = np.asarray(indeces) 35 | full_arr.shape = (1,len(indeces)) 36 | 37 | return full_arr, np.asarray([]) 38 | 39 | holdout_num = int(len(indeces)*holdout_percent) 40 | ho_idx = indeces[-holdout_num:] 41 | main_idx = indeces[:len(indeces)-holdout_num] 42 | 43 | main_arr = np.asarray(main_idx) 44 | main_arr.shape = (1,len(main_idx)) 45 | 46 | ho_arr = np.asarray(ho_idx) 47 | ho_arr.shape = (1,len(ho_idx)) 48 | 49 | return main_arr[0], ho_arr[0] 50 | 51 | 52 | class DataContainer: 53 | 54 | def __init__(self, json_config_data, state_cache = False): 55 | 56 | 57 | 58 | self.json_dir_data, self.json_data,self.dataset_dir = json_config_data 59 | self.preprocessing_params = self.json_data["preprocessing"] 60 | self.importing_params = self.json_data["importing"] 61 | 62 | self.parameters_directory = os.path.join(self.dataset_dir,self.json_dir_data['parameters_directory']) 63 | self.raw_data_directory = os.path.join(self.dataset_dir, self.json_dir_data['raw_data_directory']) 64 | 65 | self.load_data(state_cache) 66 | 67 | 68 | 69 | def load_data(self,state_cache): 70 | 71 | # print('Loading data from {} CSV files...'.format(maxItemsInList)) 72 | time_start = time.time() 73 | self.sample_list = [] 74 | self.dropped_sample_filenames = [] 75 | self.seqlabels = [] 76 | 77 | nominal_filename = self.importing_params["nominal_filename"] 78 | adverse_filename = self.importing_params["adverse_filename"] 79 | 80 | #read file list 81 | read_lines_from_file = self.read_lines_from_file 82 | nominal_file_list = read_lines_from_file(nominal_filename) 83 | adverse_file_list = read_lines_from_file(adverse_filename) 84 | 85 | #the csv from which we get our default header 86 | default_csv_filename = os.path.join(self.raw_data_directory,nominal_file_list[0]) 87 | df = pd.read_csv(default_csv_filename) 88 | parameter_list = list(df.columns.values) 89 | self.header = parameter_list 90 | self.all_parameter_names = parameter_list 91 | 92 | self.preprocessing_params["all_parameter_names"] = parameter_list 93 | set_sample_length = self.preprocessing_params["set_sample_length"] 94 | if set_sample_length == None: 95 | self.max_seqlen = len(df) 96 | print("No sample length specified, assuming all samples are of equal length {}".format(self.max_seqlen)) 97 | 98 | else: 99 | self.max_seqlen = set_sample_length 100 | 101 | 102 | self.mismatched_files = [] 103 | 104 | I_opt_idx,nominal_imported_csvs = self.__import_sample_list(nominal_file_list,label = 0) 105 | I_bad_idx,adverse_imported_csvs = self.__import_sample_list(adverse_file_list, label = 1) 106 | 107 | 108 | if len(self.mismatched_files)> 0: 109 | out = ("{}/{} labels don't match the default csv header. This will either cause a shaping error (and subsequent crash), " 110 | "or cause some parameters to be labeled incorrectly (possibly leading to nonsensical data). 
Make sure to double check the headers " 111 | "for your csv files to make sure they all match".format(len(self.mismatched_files),len(I_bad_idx)+len(I_opt_idx))) 112 | print("\n\nMismatched CSV headers found!") 113 | 114 | print("Default CSV (used for comparison):",default_csv_filename) 115 | print("{}/{} mismatched csv files (for reference):".format(len(self.mismatched_files[:5]),len(self.mismatched_files))) 116 | print(self.mismatched_files[:5]) 117 | #save to model output 118 | 119 | 120 | print(out) 121 | 122 | choice = input("Are you sure you want to continue? (y/n)\n") 123 | 124 | if choice == 'y': 125 | pass 126 | else: 127 | sys.exit(0) 128 | 129 | 130 | 131 | 132 | random.Random(42).shuffle(I_opt_idx) 133 | random.Random(42).shuffle(I_bad_idx) 134 | 135 | #FIXME:this shouldn't need processing here, do it in the holdout split function 136 | holdout_percent = self.importing_params['holdout_percent'] 137 | 138 | #self.temp_I_opt_idx = I_opt_idx 139 | self.I_bad,self.I_bad_ho = holdout_split(I_bad_idx, holdout_percent) 140 | self.I_opt,self.I_opt_ho = holdout_split(I_opt_idx, holdout_percent) 141 | 142 | 143 | print("Dropped {} files that were too short".format(len(self.dropped_sample_filenames))) 144 | 145 | #this is just in case we come up with an algorithm that can handle differing sequence lengths 146 | self.seqLabels = np.asarray([self.seqlabels])[0] 147 | 148 | finalList = np.asarray(nominal_imported_csvs + adverse_imported_csvs) 149 | finalArray = np.swapaxes(finalList,0,1) 150 | del finalList 151 | 152 | #splice percentage 153 | time_splice = self.importing_params["time_splice"] 154 | self.time_splice = time_splice 155 | 156 | if ((time_splice > 0) and (time_splice<1)): 157 | 158 | sample_index = 0 159 | set_slice = int(finalArray.shape[sample_index]*time_splice) 160 | #self.seqlen[0,:] = set_slice 161 | finalArray = finalArray[0:set_slice,:,:] 162 | self.time_splice = time_splice 163 | else: 164 | self.time_splice = None 165 | 166 | print("saving sample_list") 167 | self.states_orig = finalArray 168 | self.states = finalArray 169 | self.save_to_cache() 170 | 171 | print("Time to load: {} seconds".format(time.time()-time_start)) 172 | 173 | 174 | def __import_sample_list(self,X_file_list,label): 175 | I_X_idx = [] 176 | 177 | imported_csv_list = [] 178 | for filename in X_file_list: 179 | #TODO load,verify, and filter data here 180 | #throw an error/exception if lengths do not match and filtering is not set in dtmil_config 181 | 182 | imported_csv = self.import_sample(filename) 183 | 184 | if imported_csv is not None: 185 | imported_csv_list.append(imported_csv) 186 | self.sample_list.append(filename) 187 | I_X_idx.append(len(self.seqlabels)) 188 | self.seqlabels.append(label) 189 | 190 | return I_X_idx,imported_csv_list 191 | 192 | 193 | def import_sample(self,filename): 194 | 195 | filepath = os.path.join(self.raw_data_directory,filename) 196 | 197 | 198 | df = pd.read_csv(filepath) 199 | header = list(df.columns.values) 200 | 201 | 202 | if (header != self.header): 203 | self.mismatched_files.append(filename) 204 | 205 | imported_csv = df.values[-self.max_seqlen:] 206 | 207 | if len(imported_csv) != self.max_seqlen: 208 | self.dropped_sample_filenames.append(filename) 209 | return None 210 | 211 | return imported_csv 212 | 213 | 214 | #FIXME: Make this actually work, maybe skip the whole my_data creation process? 
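#Illustration of the on-disk layout the importer above expects (file names here
#are hypothetical): the nominal/adverse file lists live in parameters_directory
#and hold one CSV path per line, relative to raw_data_directory, e.g.
#    nominal/flight_0001.csv
#    adverse/flight_0107.csv
#Every CSV should share the header of the first nominal file (mismatches are
#collected in self.mismatched_files); rows are taken from the end of each file
#(df.values[-max_seqlen:]), and files shorter than set_sample_length are
#dropped rather than padded.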
215 | #load files from cache with CSV as backup if the cache isn't there 216 | def load_from_cache(self,backup_sample_list): 217 | print('Loading states from cache...') 218 | cache_dir = os.path.join(self.dataset_dir,self.json_dir_data['cache_file']) 219 | 220 | try: 221 | with h5py.File(cache_dir, 'r') as hf: 222 | self.states_orig = hf['states_orig'][:] 223 | self.states = hf['states_orig'][:] 224 | 225 | #TODO: have this in a different place? Also check to see if loading from cache breaks anything 226 | #splice percentage 227 | time_splice = self.importing_params["time_splice"] 228 | self.time_splice = time_splice 229 | 230 | except EnvironmentError: 231 | print('cache file not found, loading from CSV files instead') 232 | self.import_all_samples(backup_sample_list) 233 | 234 | 235 | def save_to_cache(self): 236 | print('saving states to cache...') 237 | cache_dir = os.path.join(self.dataset_dir,self.json_dir_data['cache_file']) 238 | 239 | with h5py.File(cache_dir, 'w') as hf: 240 | hf.create_dataset('states_orig', data = self.states_orig) 241 | 242 | 243 | 244 | 245 | def reshape_and_process(self): 246 | # after loading, the variables have the following shape 247 | # states_orig is of shape (T, N, D) where T is max length of sample, N is the total number of samples, D is the number of time series in each sample. 248 | # The length of sample i is given by seqlen[0,i]. 249 | # If length is less than T, the sample data is prepended with NAN to make it length T. 250 | # A sample i belongs to "opt" (or "bad") if i belongs to array I_opt (or I_bad). 251 | # I_opt_ho and I_bad_ho are hold-out "test" sets. 252 | # I_bad, I_opt, I_bad_ho, I_opt_ho are one-dimensional arrays 253 | # seqlen is of shape (1, N). 254 | # header is a list of D feature names 255 | # seqLabels is of shape (N,). sample i has a label seqLabels[i] - 1 if sample i has adverse event and 0 otherwise. 256 | # removing variables which are correlated with target (to avoid finding trivial precursors) 257 | 258 | correlated_states = self.preprocessing_params["redundant_parameters"] 259 | correlated_states = [self.decode_parameter_label(i) for i in correlated_states] 260 | 261 | #convert to a Numpy array that avoids redundant choices (just in case something was mistakenly added) 262 | self.correlated_states =np.unique(np.array(correlated_states)) 263 | 264 | dropped_states = self.preprocessing_params["drop_parameters"] 265 | dropped_states = [self.decode_parameter_label(i) for i in dropped_states] 266 | self.dropped_states = np.unique(np.array(dropped_states)) 267 | 268 | #make sure not to delete the same state twiceo 269 | states_to_remove = np.unique(np.array(correlated_states + dropped_states)) 270 | 271 | self.parameter_selection=np.delete(np.arange(self.states.shape[2]),states_to_remove,0) 272 | self.states=self.states[:,:,self.parameter_selection] 273 | 274 | # get max length of trjectories 275 | self.maxlen = np.shape(self.states)[0] 276 | 277 | # get total number of trajectories 278 | # Ntraj= np.shape(self.states)[1] 279 | 280 | # number of features (time series variables) 281 | self.nfeat=np.shape(self.states)[-1] 282 | 283 | # center the data - subtract mean and divide by STD. 
If variable is constant, remove it from analysis 284 | temp=np.reshape(self.states,(np.shape(self.states)[0]*np.shape(self.states)[1],np.shape(self.states)[2])) 285 | 286 | mean=np.nanmean(temp,0) 287 | std=np.nanstd(temp,0) 288 | elimidx=np.where(std<1E-5)[0] 289 | if elimidx.shape[0]>0: 290 | selidx=np.array(list(set(np.arange(self.nfeat).tolist()).difference(elimidx))) 291 | self.states=self.states[:,:,selidx] 292 | mean=mean[selidx] 293 | std=std[selidx] 294 | temp=temp[:,selidx] 295 | self.parameter_selection=self.parameter_selection[selidx] 296 | temp=(temp-mean)/std 297 | self.states=np.reshape(temp,(np.shape(self.states)[0],np.shape(self.states)[1],np.shape(self.states)[2])) 298 | del temp 299 | self.nfeat=np.shape(self.states)[-1] 300 | 301 | # Replace NAN by an arbitrary mask_val 302 | mask_val=int(np.nanmax(self.states)+1000) 303 | self.states[np.isnan(self.states)]=mask_val 304 | 305 | # reshape to match keras' definitions 306 | self.states=np.transpose(self.states,(1,0,2)) 307 | 308 | 309 | def train_test_split(self): 310 | # Split train data into train (60%) and validation (40%) sets 311 | #FIXME: maybe change this to use a more traditional validation set approach. The numbers don't match the output for some reason 312 | 313 | validation_percent = self.importing_params["validation_percent"] 314 | self.validation_percent = validation_percent *100 315 | 316 | nvalid=int(validation_percent*len(self.I_bad)) 317 | self.I_bad_valid=self.I_bad[len(self.I_bad)-nvalid:] 318 | self.I_bad=self.I_bad[:len(self.I_bad)-nvalid] 319 | self.I_opt_valid=self.I_opt[len(self.I_opt)-nvalid:] 320 | self.I_opt=self.I_opt[:len(self.I_opt)-nvalid] 321 | 322 | print(self.states.shape) 323 | temp=np.array([self.I_opt.tolist()+self.I_bad.tolist()])[0] 324 | 325 | self.xtrain=self.states[temp,:,:] 326 | self.ytrain=self.seqLabels[temp] 327 | temp=np.array([self.I_opt_valid.tolist()+self.I_bad_valid.tolist()])[0] 328 | self.xvalid=self.states[temp,:,:] 329 | self.yvalid=self.seqLabels[temp] 330 | del temp 331 | 332 | self.ytrain = np.expand_dims(np.expand_dims(self.ytrain,-1),-1) 333 | self.yvalid = np.expand_dims(np.expand_dims(self.yvalid,-1),-1) 334 | 335 | # currently data is balanced. If there is an imbalance, adjust this parameter. 
336 | self.class_weight = {0 : 1,1: 1} 337 | 338 | def preprocess(self): 339 | self.reshape_and_process() 340 | self.train_test_split() 341 | 342 | def get_grouping(self, num): 343 | 344 | label = "" 345 | dataset = "" 346 | 347 | if num in np.concatenate([self.I_bad,self.I_bad_ho,self.I_bad_valid]): 348 | label = "Anomalous" 349 | elif num in np.concatenate([self.I_opt, self.I_opt_valid,self.I_opt_ho]): 350 | label = "Nominal" 351 | else: 352 | print("index doesn't exist in the dataset") 353 | 354 | if num in np.concatenate([self.I_bad, self.I_opt]): 355 | dataset = "Train" 356 | elif num in np.concatenate([self.I_bad_valid,self.I_opt_valid]): 357 | dataset = "Validation" 358 | elif num in np.concatenate([self.I_bad_ho, self.I_opt_ho]): 359 | dataset = "Test" 360 | else: 361 | print("Invalid dataset id") 362 | 363 | return label, dataset 364 | 365 | 366 | 367 | 368 | def get_filename(self, index): 369 | 370 | return Path(self.sample_list[index]).stem 371 | 372 | 373 | def read_lines_from_file(self,filename): 374 | 375 | with open(os.path.join(self.parameters_directory,filename),'r') as f: 376 | content = f.readlines() 377 | content = [x.strip() for x in content] 378 | 379 | return content 380 | 381 | 382 | ##TODO: have a decode type argument 383 | def decode_parameter_label(self,param): 384 | if (isinstance(param,int)): 385 | return param 386 | 387 | else: 388 | return self.all_parameter_names.index(param) 389 | 390 | 391 | def param_index_to_label(self, param_index): 392 | return self.all_parameter_names[param_index] 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | -------------------------------------------------------------------------------- /source/guis/dataset_formatter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon Feb 11 00:09:03 2019 5 | @author: dweckler 6 | """ 7 | from tkinter import Tk, Label, Button, Entry, mainloop, filedialog, messagebox, Grid, Checkbutton, IntVar, END 8 | import os 9 | #import tkSimpleDialog as simpledialog 10 | import matplotlib 11 | matplotlib.use('qt5agg') 12 | 13 | 14 | 15 | from shutil import copyfile, move,copytree,copy 16 | import pandas as pd 17 | import json 18 | import glob 19 | from pathlib import Path 20 | 21 | import errno 22 | 23 | #if nominal filelist and adverse filelist don't exist, generate a file list from the inputted folders 24 | #ask via a popup if you want to do this. "directories were found instead of file lists. 25 | #Would you like to use the inputted directories to generate their respective file lists?" 
26 | #use listdir to get the lists, make sure to parse out everything but the relative file path 27 | 28 | 29 | 30 | def copy_files_or_directories(source, dest): 31 | try: 32 | copytree(source, dest) 33 | except OSError as exception: 34 | if exception.errno == errno.ENOTDIR: 35 | try: 36 | copy(source, dest) 37 | except: 38 | print("Error copying file") 39 | else: 40 | raise 41 | 42 | 43 | #recursively list visible files 44 | def list_visible_files(path): 45 | filepath = os.path.join(path,'*') 46 | 47 | all_files = [os.path.basename(f) for f in glob.glob(filepath)] 48 | 49 | return all_files 50 | 51 | def set_entry_text(entry,text): 52 | entry.delete(0,END) 53 | entry.insert(0,text) 54 | 55 | 56 | class DatasetFormatter: 57 | 58 | def __init__(self, master): 59 | 60 | data_row = 0 61 | nominal_flist_row = 1 62 | adverse_flist_row = 2 63 | last_row = 3 64 | 65 | label_column = 0 66 | filelist_column = 1 67 | button_column = 2 68 | 69 | self.copy_or_move = IntVar() 70 | self.copy_or_move.set(1) 71 | self.checkbutton = Checkbutton(master,text="Copy",variable = self.copy_or_move).grid(row=last_row) 72 | 73 | 74 | self.use_filelists = IntVar() 75 | self.use_filelists.set(0) 76 | self.filelist_check = Checkbutton(master,text="Filelists", command = self.list_swap, variable = self.use_filelists) 77 | self.filelist_check.grid(row = last_row,column = button_column) 78 | 79 | self.default_adverse_filelist_path = "" 80 | self.default_nominal_filelist_path = "" 81 | self.default_adverse_folder_path = "" 82 | self.default_nominal_folder_path = "" 83 | 84 | 85 | 86 | 87 | self.folder_label = Label(master, text="Data Folder:") 88 | self.folder_label.grid(row=data_row) 89 | 90 | self.nominal_filelist_label = Label(master, text="Nominal Directory:") 91 | self.nominal_filelist_label.grid(row=nominal_flist_row,column = label_column) 92 | 93 | self.adverse_filelist_label = Label(master, text="Adverse Directory:") 94 | self.adverse_filelist_label.grid(row=adverse_flist_row,column = label_column) 95 | 96 | #Define the entry fields 97 | self.data_folder_entry = Entry(master) 98 | self.nominal_path_entry = Entry(master) 99 | self.adverse_path_entry = Entry(master) 100 | 101 | Grid.columnconfigure(master,filelist_column,weight=1) 102 | 103 | self.data_folder_entry.grid(row=data_row, column=filelist_column,sticky = 'we') 104 | self.nominal_path_entry.grid(row=nominal_flist_row, column=filelist_column,sticky = 'we') 105 | self.adverse_path_entry.grid(row = adverse_flist_row, column = filelist_column,sticky = 'we') 106 | 107 | #define the "choose file" button 108 | self.data_folder_button = Button(master,text="Choose Folder", command = self.get_the_folder) 109 | self.nominal_filelist_button = Button(master,text="Choose Folder", command = self.get_nominal_filelist) 110 | self.adverse_filelist_button = Button(master,text="Choose Folder", command = self.get_adverse_filelist) 111 | 112 | self.data_folder_button.grid(row=data_row,column=button_column) 113 | self.nominal_filelist_button.grid(row=nominal_flist_row,column=button_column) 114 | self.adverse_filelist_button.grid(row=adverse_flist_row,column=button_column) 115 | 116 | self.save_button = Button(master,text="Generate Folder Hierarchy",fg ="#8b0000" , command = self.generate_folder_structure) 117 | self.save_button.grid(row = last_row,column = filelist_column) 118 | 119 | 120 | 121 | 122 | def get_the_folder(self): 123 | 124 | home_dir = str(Path.home()) 125 | print("Choosing folder") 126 | filename = filedialog.askdirectory(initialdir =home_dir) 127 | 128 | 
self.data_folder_entry.delete(0, 'end') 129 | self.data_folder_entry.insert(0,filename) 130 | 131 | def get_nominal_filelist(self): 132 | print("Choosing nominal") 133 | self.get_filelist(self.nominal_path_entry) 134 | 135 | 136 | def get_adverse_filelist(self): 137 | print("Choosing adverse") 138 | self.get_filelist(self.adverse_path_entry) 139 | 140 | 141 | def get_filelist(self,my_filelist_entry): 142 | 143 | dataset_folder_path = self.data_folder_entry.get() 144 | if dataset_folder_path == '': 145 | initial_directory = str(Path.home()) 146 | else: 147 | initial_directory = dataset_folder_path 148 | 149 | 150 | if (self.use_filelists.get()): 151 | filename = filedialog.askopenfilename(initialdir = initial_directory) 152 | 153 | else: 154 | filename = filedialog.askdirectory(initialdir = initial_directory) 155 | 156 | my_filelist_entry.delete(0, 'end') 157 | my_filelist_entry.insert(0,filename) 158 | 159 | 160 | def list_swap(self): 161 | 162 | 163 | if not (self.use_filelists.get()): 164 | the_label = "Directory" 165 | the_button_label = "Choose Folder" 166 | 167 | self.default_nominal_filelist_path = self.nominal_path_entry.get() 168 | set_entry_text(self.nominal_path_entry,self.default_nominal_folder_path) 169 | 170 | self.default_adverse_filelist_path = self.adverse_path_entry.get() 171 | set_entry_text(self.adverse_path_entry,self.default_adverse_folder_path) 172 | 173 | 174 | else: 175 | the_label = "Filelist" 176 | the_button_label = "Choose File" 177 | 178 | self.default_nominal_folder_path = self.nominal_path_entry.get() 179 | set_entry_text(self.nominal_path_entry,self.default_nominal_filelist_path) 180 | 181 | 182 | self.default_adverse_folder_path = self.adverse_path_entry.get() 183 | set_entry_text(self.adverse_path_entry,self.default_adverse_filelist_path) 184 | 185 | 186 | self.nominal_filelist_button.config(text = the_button_label) 187 | self.adverse_filelist_button.config(text = the_button_label) 188 | 189 | 190 | self.nominal_filelist_label.config(text = "Nominal {}".format(the_label)) 191 | self.adverse_filelist_label.config(text = "Adverse {}".format(the_label)) 192 | 193 | 194 | def generate_file_list(self,directory,dataset_folder_path): 195 | dir_name = os.path.basename(directory) 196 | 197 | file_list = glob.glob(os.path.join(directory,"*.csv")) 198 | file_list = [os.path.basename(pth) for pth in file_list] 199 | file_list = [os.path.join(dir_name,file) for file in file_list] 200 | 201 | 202 | file_list_txt = os.path.join(dataset_folder_path,f"{dir_name}.txt") 203 | 204 | with open(file_list_txt,'w') as f: 205 | for filename in file_list: 206 | f.write(filename) 207 | f.write('\n') 208 | 209 | return file_list_txt 210 | 211 | 212 | 213 | def generate_folder_structure(self): 214 | 215 | 216 | nominal_entry_path = self.nominal_path_entry.get() 217 | adverse_entry_path = self.adverse_path_entry.get() 218 | dataset_folder_path = self.data_folder_entry.get() 219 | 220 | 221 | blank_path = "" 222 | 223 | if (dataset_folder_path == blank_path): 224 | messagebox.showerror("Error",'Please enter a valid dataset folder path') 225 | return 226 | 227 | 228 | #TODO: make this generate the filelists paths if it's a directory (indicated by the checkmark) 229 | #TODO: add errors if the directory isn't where it's supposed to be 230 | 231 | if(self.use_filelists.get()): 232 | 233 | 234 | if (not (nominal_entry_path.endswith('.txt')) or nominal_entry_path == ""): 235 | messagebox.showerror("Error",'Please enter a valid path for the nominal file list') 236 | return 237 | 238 | 239 | if 
not (adverse_entry_path.endswith('.txt')): 240 | messagebox.showerror("Error",'Please enter a valid path for the adverse file list') 241 | return 242 | 243 | else: 244 | if((nominal_entry_path == blank_path) or (adverse_entry_path == blank_path)): 245 | messagebox.showerror("Error","Please enter a valid directory path") 246 | return 247 | 248 | nominal_entry_path = self.generate_file_list(nominal_entry_path,dataset_folder_path) 249 | adverse_entry_path = self.generate_file_list(adverse_entry_path,dataset_folder_path) 250 | 251 | 252 | #Ask for name of dataset + entry 253 | #dataset_name = simpledialog.askstring("Input", "Please enter the name of the dataset") 254 | 255 | 256 | 257 | print("Attempting to read filepath from nominal filelist") 258 | data_filenames = [] 259 | with open('{}'.format(nominal_entry_path),'r') as f: 260 | data_filenames = f.readlines() 261 | data_filenames = [x.strip() for x in data_filenames] 262 | 263 | 264 | 265 | #test to see if the files are there before doing anything 266 | test_file_path = data_filenames[0] 267 | csv_filepath = os.path.join(dataset_folder_path,test_file_path) 268 | 269 | try: 270 | df = pd.read_csv(csv_filepath) 271 | parameter_list = list(df.columns.values) 272 | 273 | except: 274 | 275 | messagebox.showerror("Error", f"Could not find any data files in the specified Data Folder path:\n{dataset_folder_path}\n\nAttempted to open file:\n{csv_filepath}") 276 | return 277 | 278 | 279 | #TODO: set initial directory to current dir initialdir = os.path.sep 280 | home_dir = str(Path.home()) 281 | dataset_path = filedialog.asksaveasfilename(initialdir = home_dir) 282 | if dataset_path is None or dataset_path == "": 283 | messagebox.showerror("Error", "No name entered!") 284 | return 285 | 286 | 287 | else: 288 | print("Generating folder structure for dataset {}".format(dataset_path)) 289 | 290 | 291 | #create new dataset directory (if one doesn't exist) 292 | make_directory(dataset_path) 293 | 294 | #create data folder 295 | data_path = os.path.join(dataset_path,'data') 296 | make_directory(data_path) 297 | 298 | #within data folder, create parameters and raw_data folder 299 | 300 | parameters_path = os.path.join(data_path, "parameters") 301 | make_directory(parameters_path) 302 | 303 | raw_data_path = os.path.join(data_path,'raw_data') 304 | make_directory(raw_data_path) 305 | 306 | #create misc, model_storage, and model_output folders 307 | model_storage_path = os.path.join(dataset_path,'model_saves') 308 | make_directory(model_storage_path) 309 | 310 | model_output_path = os.path.join(dataset_path, "output") 311 | make_directory(model_output_path) 312 | 313 | misc_path = os.path.join(dataset_path,"misc") 314 | make_directory(misc_path) 315 | #create model_saves within the misc folder 316 | 317 | model_saves_path = os.path.join(misc_path,"model_archive") 318 | make_directory(model_saves_path) 319 | 320 | 321 | print("Directory creation process complete") 322 | 323 | 324 | sep = os.path.sep 325 | adverse_filename = adverse_entry_path.split(sep)[-1] 326 | nominal_filename = nominal_entry_path.split(sep)[-1] 327 | 328 | #place the specified adverse and nominal filelists inside the parameters folder 329 | copyfile(adverse_entry_path,os.path.join(parameters_path,adverse_filename)) 330 | copyfile(nominal_entry_path,os.path.join(parameters_path,nominal_filename)) 331 | 332 | #move the directory of the dataset folder 333 | 334 | files = list_visible_files(dataset_folder_path) 335 | 336 | if (self.copy_or_move.get() == 1): 337 | print("copying") 338 | 339 | 
files = list_visible_files(dataset_folder_path) 340 | 341 | 342 | for f in files: 343 | 344 | 345 | if f not in [adverse_filename, nominal_filename]: 346 | #copytree(os.path.join(dataset_folder_path,f),raw_data_path) 347 | copy_files_or_directories(os.path.join(dataset_folder_path,f),os.path.join(raw_data_path,f)) 348 | 349 | 350 | 351 | elif(self.copy_or_move.get() == 0): 352 | print("Moving all files from the dataset directory to our raw data directory") 353 | for f in files: 354 | 355 | if f not in [adverse_filename, nominal_filename]: 356 | #copytree(os.path.join(dataset_folder_path,f),raw_data_path) 357 | move(os.path.join(dataset_folder_path,f),raw_data_path) 358 | 359 | else: 360 | print("this shouldn't happen") 361 | 362 | 363 | 364 | #generate parameter_names.txt 365 | #import one file list, grab the header, then make the files from said header 366 | 367 | 368 | with open(os.path.join(parameters_path,"parameter_names.txt"),'w') as f: 369 | for parameter in parameter_list: 370 | f.write("{}\n".format(parameter)) 371 | 372 | 373 | #generate DTMIL_config.json and add the path 374 | #grab code from the json generating ipynb 375 | 376 | export_json_cfg(directory = dataset_path, 377 | fl_nominal=nominal_filename, 378 | fl_adverse=adverse_filename, 379 | param_names=parameter_list) 380 | 381 | 382 | messagebox.showinfo(title = "Info", message="Dataset Formatting Process Completed") 383 | 384 | #TODO: Also maybe generate DTMIL_config_dir.json 385 | def make_directory(folder_path): 386 | 387 | try: 388 | os.mkdir(folder_path) 389 | except OSError: 390 | print ("Creation of the dataset folder %s failed" % folder_path) 391 | 392 | #TODO: add a menu option to just export this by itself 393 | def export_json_cfg(directory = "", json_filename = "DTMIL_config.json", fl_nominal = "filelist_nominal.txt", fl_adverse = "filelist_adverse.txt",param_names = []): 394 | 395 | 396 | sep = os.path.sep 397 | default_config_path = "..{}dtmil{}configuration{}".format(sep,sep,sep) 398 | 399 | 400 | 401 | with open(os.path.join(default_config_path,"DTMIL_config_default.json")) as jsonfile: 402 | json_data = json.load(jsonfile) 403 | 404 | json_data["importing"]["nominal_filename"] = fl_nominal 405 | json_data["importing"]["adverse_filename"] = fl_adverse 406 | json_data["preprocessing"]["all_parameter_names"] = param_names 407 | 408 | 409 | 410 | json_cfg_string = json.dumps(json_data,sort_keys=True, indent=4, separators=(',', ': ')) 411 | 412 | with open(os.path.join(directory,json_filename),'w') as outfile: 413 | outfile.write(json_cfg_string) 414 | outfile.close() 415 | 416 | 417 | 418 | master = Tk() 419 | master.title("Dataset Formatter") 420 | 421 | data_formatter = DatasetFormatter(master) 422 | #master.minsize(width = 100,height = 50) 423 | 424 | 425 | mainloop( ) 426 | 427 | 428 | -------------------------------------------------------------------------------- /source/dtmil/model_container.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Tue Jun 19 15:47:39 2018 5 | 6 | @author: dweckler 7 | 8 | @author: vjanakir 9 | This is the code for deep temporal multiple instance learning (DTMIL). This is the version of ADOPT that is based on deep learning. 10 | The code assumes Keras with Theano or Tensorflow backend. 11 | uses Anaconda virtual env with Python 2.7 and keras. It should also work in Python 3.x but not tested. 
12 | ''' 13 | """ 14 | 15 | 16 | import os, numpy as np, time 17 | import datetime 18 | 19 | from keras.layers.core import Dense, Dropout 20 | from keras.layers import MaxPooling1D 21 | from keras.layers.recurrent import GRU 22 | from keras.layers.wrappers import TimeDistributed 23 | from keras.models import Sequential, model_from_json 24 | from keras.regularizers import l2 25 | from keras.callbacks import ModelCheckpoint 26 | from keras.optimizers import Nadam 27 | from keras.models import load_model 28 | import json 29 | 30 | from sklearn.metrics import precision_recall_fscore_support 31 | 32 | 33 | from dtmil.configuration.config_dtmil import get_json_config_data 34 | from dtmil.utilities import aggregationLayer 35 | from dtmil.utilities import save_something 36 | from dtmil.utilities import get_auc 37 | 38 | from dtmil.utilities import load_something 39 | 40 | 41 | 42 | ## model parameters 43 | #batch_size = 32 # mini-batch size (number of samples) 44 | #epochs=100 # number of training passes through data 45 | #nhr=5 # number of units in recurrent layer 46 | #nhd=500 # number of hidden units in fully connected layer 47 | #lr = 0.001 # Specify learning rate lr. 48 | #optim=Nadam(lr=lr) # ADAM optimizer with nestrov momentum (see keras documentation). 49 | #dr=0 # dropout rate (0-1) 50 | #lam=0.01 # regularization 51 | 52 | # path to data. 53 | 54 | 55 | class ModelContainer: 56 | 57 | #TODO: Update data_path and model_path whenever a model is reloaded, perhaps have a "reload" initializer 58 | def __init__(self,data_container): 59 | self.myData = data_container 60 | self.load_config_data() 61 | 62 | model_io_data = data_container.json_data['model_io'] 63 | model_filename = model_io_data["model_filename"] 64 | model_container_filename = model_io_data["model_container_filename"] 65 | data_container_filename = model_io_data["data_container_filename"] 66 | 67 | model_archive_directory = os.path.join(data_container.dataset_dir,data_container.json_dir_data['model_archive_directory']) 68 | self.model_archive_directory = model_archive_directory 69 | 70 | model_output_directory = os.path.join(self.myData.dataset_dir,self.myData.json_dir_data['model_output_directory']) 71 | self.model_output_directory = model_output_directory 72 | 73 | model_storage_directory = os.path.join(self.myData.dataset_dir,self.myData.json_dir_data['model_storage_directory']) 74 | self.model_storage_directory = model_storage_directory 75 | self.model_path = os.path.join(model_storage_directory,model_filename) 76 | self.model_container_path = os.path.join(model_storage_directory,model_container_filename) 77 | self.data_path = os.path.join(model_storage_directory,data_container_filename) 78 | 79 | optim = Nadam(lr=self.lr) 80 | pars = "_".join([str(k) for k in [self.batch_size, self.epochs, self.nhr, self.nhd, self.dr, self.lam, optim.__class__.__name__, self.lr]]) 81 | 82 | 83 | fname_add=model_archive_directory+"temporary".split(os.path.sep)[-1].split('.')[0]+"_"+pars+'_' 84 | self.model_fname=fname_add+"bestModel-{epoch:02d}-{val_acc:.4f}.hdf5" 85 | self.json_fname=fname_add+'.json' 86 | if not os.path.exists(model_archive_directory): 87 | os.makedirs(model_archive_directory) 88 | 89 | 90 | print(self.model_path) 91 | 92 | 93 | @classmethod 94 | def reload_all_data(cls,dataset_dir, json_data_block = None): 95 | 96 | print("reloading model and data") 97 | if json_data_block == None: 98 | json_data_block = get_json_config_data(dataset_dir) 99 | 100 | json_dir_data, json_group_data,dataset_dir = json_data_block 101 | 
model_storage_directory = os.path.join(dataset_dir,json_dir_data['model_storage_directory']) 102 | model_container_path = os.path.join(model_storage_directory, json_group_data['model_io']["model_container_filename"]) 103 | 104 | myModel = load_something(model_container_path) 105 | myModel.update_paths(model_container_path,dataset_dir) 106 | 107 | myData = myModel.myData 108 | 109 | myData.dataset_dir = dataset_dir 110 | model = load_model(myModel.model_path) 111 | 112 | myModel.model = model 113 | 114 | return myModel 115 | 116 | 117 | def update_paths(self,model_container_path, new_dataset_dir = None): 118 | 119 | if new_dataset_dir is not None: 120 | dataset_dir = new_dataset_dir 121 | self.myData.dataset_dir = new_dataset_dir 122 | 123 | else: 124 | dataset_dir= self.myData.dataset_dir 125 | 126 | 127 | model_io_data = self.myData.json_data['model_io'] 128 | model_filename = model_io_data["model_filename"] 129 | model_container_filename = model_io_data["model_container_filename"] 130 | data_container_filename = model_io_data["data_container_filename"] 131 | 132 | model_archive_directory = os.path.join(dataset_dir,self.myData.json_dir_data['model_archive_directory']) 133 | self.model_archive_directory = model_archive_directory 134 | 135 | model_output_directory = os.path.join(dataset_dir,self.myData.json_dir_data['model_output_directory']) 136 | self.model_output_directory = model_output_directory 137 | 138 | model_storage_directory = os.path.join(dataset_dir,self.myData.json_dir_data['model_storage_directory']) 139 | self.model_storage_directory = model_storage_directory 140 | self.model_path = os.path.join(model_storage_directory,model_filename) 141 | self.model_container_path = os.path.join(model_storage_directory,model_container_filename) 142 | self.data_path = os.path.join(model_storage_directory,data_container_filename) 143 | 144 | 145 | 146 | 147 | 148 | 149 | def load_config_data(self): 150 | data = self.myData.json_data["training"] 151 | self.epochs = data['epochs'] 152 | self.batch_size = data['batch_size'] 153 | self.nhr = data['nhr'] 154 | self.nhd = data['nhd'] 155 | self.lr = data['lr'] 156 | self.dr = data['dr'] 157 | self.lam = data['lam'] 158 | 159 | def configure_model(self,train_flag, pre_trained_model = None, pre_trained_json = None): 160 | # create model configuration 161 | myData = self.myData 162 | self.train_flag = train_flag 163 | 164 | self.pre_trained_model = pre_trained_model 165 | self.pre_trained_json = pre_trained_json 166 | 167 | 168 | if train_flag: 169 | 170 | # standard sequential model in Keras where layers can be added. 171 | model = Sequential() 172 | 173 | # masking layer to make sure masked time-steps are not considered in the gradient calculations 174 | # model.add(Masking(mask_value=mask_val, input_shape=(maxlen, nfeat))) 175 | lam = self.lam 176 | dr = self.dr 177 | optim = Nadam(lr=self.lr) 178 | 179 | # GRU layer (RNN) 180 | model.add(GRU( 181 | input_shape=(myData.maxlen, myData.nfeat), 182 | units=self.nhr, 183 | return_sequences=True, 184 | stateful=False, 185 | unroll=False, 186 | implementation='gpu', 187 | activation='tanh', 188 | kernel_regularizer=l2(lam), 189 | recurrent_regularizer=l2(lam), 190 | bias_regularizer=l2(lam))) 191 | model.add(Dropout(dr)) 192 | 193 | # fully connected layer - note the timedistributed type which processes data at every time step. 
194 | model.add(TimeDistributed(Dense(units=self.nhd, 195 | activation='tanh', 196 | kernel_regularizer=l2(lam), 197 | bias_regularizer=l2(lam), 198 | kernel_constraint = None))) 199 | model.add(Dropout(dr)) 200 | 201 | # logistic layer (the output of this layer gives instance probabilities) 202 | model.add(TimeDistributed(Dense(units=1, 203 | activation='sigmoid', 204 | kernel_regularizer=l2(lam), 205 | bias_regularizer=l2(lam), 206 | kernel_constraint = None),name="inst_prob")) 207 | model.add(Dropout(0)) 208 | 209 | # multiple-instance aggregation layer 210 | # model.add(aggregationLayer(name="mil_layer")) 211 | model.add(MaxPooling1D(pool_size=myData.maxlen)) 212 | start = time.time() 213 | 214 | # compile model 215 | model.compile(loss="binary_crossentropy", optimizer=optim, metrics=['accuracy']) 216 | print("Compilation Time : ", time.time() - start) 217 | 218 | # serialize (save) model to JSON 219 | model_json = model.to_json() 220 | with open(self.json_fname, "w") as json_file: 221 | json_file.write(model_json) 222 | print('saved model json to disk') 223 | else: 224 | 225 | 226 | ##Check filepath here, if it doesn't exist, load existing model 227 | # load json and create model 228 | 229 | print("Train_Flag set to false, loading pre-trained model") 230 | model = self._load_pretrained_model() 231 | 232 | 233 | # json_file = open(load_jsonName, 'r') 234 | # loaded_model_json = json_file.read() 235 | # json_file.close() 236 | # model = model_from_json(loaded_model_json,{'aggregationLayer':aggregationLayer}) 237 | # 238 | # # load weights into new model 239 | # model.load_weights(load_h5Name) 240 | # model.compile(loss="binary_crossentropy", optimizer=optim, metrics=['accuracy']) 241 | # print("Loaded and compiled model from disk") 242 | 243 | 244 | self.model = model 245 | 246 | #TODO: Raise better errors for pretrained models 247 | def _load_pretrained_model(self): 248 | ##FIXME: Make this have better error handling than "None" 249 | 250 | json_filename = self.pre_trained_json 251 | pre_trained_model_filename = self.pre_trained_model 252 | 253 | 254 | if (pre_trained_model_filename== "") : 255 | print("No filepath specified, attempting to load from the default path") 256 | pre_trained_model_filename = self.model_path 257 | 258 | 259 | #FIXME: This is inconsistent with the above. 
Fix later somehow (probably with yet another JSON argument) 260 | if json_filename == "": 261 | json_filename = None 262 | 263 | if (json_filename): 264 | 265 | json_file = open(json_filename, 'r') 266 | loaded_model_json = json_file.read() 267 | json_file.close() 268 | model = model_from_json(loaded_model_json,{'aggregationLayer':aggregationLayer}) 269 | 270 | weights_filename = pre_trained_model_filename 271 | model.load_weights(weights_filename) 272 | model.compile(loss="binary_crossentropy", optimizer=Nadam(lr=self.lr) , metrics=['accuracy']) 273 | print("Loaded and compiled model from disk") 274 | 275 | else: 276 | model_filename = pre_trained_model_filename 277 | 278 | print("attempting to load: {}".format(model_filename)) 279 | model = load_model(model_filename) 280 | print("Loaded model from disk") 281 | 282 | return model 283 | 284 | 285 | 286 | #%% train model 287 | 288 | def train_model(self,trainNeeded): 289 | myData = self.myData 290 | 291 | if trainNeeded: 292 | try: 293 | # define checkpoint so that model is saved if it is better than previously saved model 294 | checkpoint = ModelCheckpoint(self.model_fname, 295 | monitor='val_accuracy', 296 | verbose=0, 297 | save_best_only=True, 298 | 299 | mode='auto') 300 | 301 | #FIXME: fix the callbacks list bug and model checkpoints 302 | callbacks_list = [checkpoint] 303 | 304 | start = time.time() 305 | self.training_history = self.model.fit(myData.xtrain, 306 | myData.ytrain, 307 | validation_data=(myData.xvalid,myData.yvalid), 308 | batch_size=self.batch_size, 309 | epochs=self.epochs, 310 | validation_split=0.33, 311 | verbose = 1, 312 | #callbacks=callbacks_list, 313 | shuffle=True) 314 | 315 | self.train_time = time.time() - start 316 | print("Train Time : ",self.train_time) 317 | 318 | except KeyboardInterrupt: 319 | print('interrupted') 320 | 321 | #%% evaluate model performance on train set 322 | 323 | def evaluate_model(self): 324 | myData = self.myData 325 | 326 | temp=np.array([myData.I_opt.tolist()+ 327 | myData.I_bad.tolist() 328 | +myData.I_opt_valid.tolist() 329 | +myData.I_bad_valid.tolist()])[0] 330 | # temp=np.array([I_opt_ho.tolist()+I_bad_ho.tolist()])[0] 331 | xval=myData.states[temp,:,:] 332 | yval=myData.seqLabels[temp] 333 | self.xval = xval 334 | 335 | #FIXME: Make this more clear that its evaluating on two different sets of the model 336 | y_pred_prob=self.model.predict_proba(xval)[:,0] 337 | self.yValidation_prob = y_pred_prob 338 | 339 | self.auc_train = get_auc(yval, y_pred_prob) 340 | 341 | #%% evaluate model performance on test set 342 | 343 | temp=np.array([myData.I_opt_ho.tolist()+ 344 | myData.I_bad_ho.tolist()])[0] 345 | # temp=np.array([I_opt_ho.tolist()+I_bad_ho.tolist()])[0] 346 | xtest=myData.states[temp,:,:] 347 | ytest=myData.seqLabels[temp] 348 | self.xtest = xtest 349 | 350 | 351 | y_pred_prob=self.model.predict_proba(xtest)[:,0] 352 | 353 | self.y_pred_prob = y_pred_prob 354 | 355 | self.auc_test = get_auc(ytest, y_pred_prob) 356 | 357 | #TODO: Add threshold definition 358 | 359 | self.precision,self.recall,self.fscore, _ = precision_recall_fscore_support(ytest,y_pred_prob.round(), average='weighted') 360 | 361 | self.xtest = xtest 362 | self.ytest = ytest 363 | 364 | 365 | 366 | self.train_date = datetime.datetime.now() 367 | 368 | self.generate_output_file() 369 | 370 | def save_model(self): 371 | 372 | 373 | #TODO: add this to the m 374 | timestr = time.strftime("%Y%h%d-%H%M%S") 375 | 376 | self.timestamp = timestr 377 | 378 | 379 | self.model.save(self.model_path) 380 | 
save_something(self.myData,self.data_path) 381 | #save model container separately from the model (otherwise pickle doesn't work) 382 | temp = self.model 383 | self.model = None 384 | save_something(self,self.model_container_path) 385 | self.model = temp 386 | 387 | json_cfg_string = json.dumps(self.myData.json_data,sort_keys=True, indent=4, separators=(',', ': ')) 388 | 389 | 390 | 391 | 392 | with open(os.path.join(self.model_storage_directory,"DTMIL_config_{}.json".format(timestr)),'w') as outfile: 393 | outfile.write(json_cfg_string) 394 | outfile.close() 395 | 396 | 397 | 398 | def generate_output_file(self): 399 | print("generating output file...\n\n\n") 400 | 401 | myData = self.myData 402 | model_output_directory = self.model_output_directory 403 | 404 | dataset_header = "Output Summary:" 405 | training_samples = self.__format_sample_output("Training",myData.xtrain,myData.I_opt,myData.I_bad) 406 | validation_samples = self.__format_sample_output("Validation",myData.xvalid, myData.I_opt_valid,myData.I_bad_valid) 407 | test_samples = self.__format_sample_output("Test", self.xtest,myData.I_opt_ho,myData.I_bad_ho) 408 | 409 | auc_train = "AUC Train: {}".format(self.auc_train) 410 | auc_test = "AUC Test: {}".format(self.auc_test) 411 | precision = "Precision: {}".format(self.precision) 412 | recall = "Recall: {}".format(self.recall) 413 | f1_score = "F1 Score: {}".format(self.fscore) 414 | 415 | epochs = "Epochs: {}".format(self.epochs) 416 | batch_size = "Batch Size: {}".format(self.batch_size) 417 | regularization_parameter = "Lambda: {}".format(self.lam) 418 | dropout_rate = "Dropout Rate: {}".format(self.dr) 419 | train_date = "Trained on: {}".format(self.train_date) 420 | number_of_features = "Number of features: {}".format(myData.nfeat) 421 | 422 | dropped_states = myData.correlated_states.tolist() + myData.dropped_states.tolist() 423 | 424 | dropped_parameters = "Dropped Parameters: \n{}".format( dropped_states ) 425 | dropped_parameter_names ="{}".format( [myData.header[p] for p in dropped_states]) 426 | 427 | #Find a better way to express this within keras 428 | if(self.train_flag == False): 429 | train_date = "Reloaded Model" 430 | 431 | 432 | output_string_list = [dataset_header, 433 | number_of_features, 434 | train_date, 435 | "", 436 | training_samples, 437 | validation_samples, 438 | test_samples, 439 | "", 440 | epochs, 441 | regularization_parameter, 442 | dropout_rate, 443 | batch_size, 444 | "", 445 | dropped_parameters, 446 | dropped_parameter_names, 447 | "", 448 | auc_train, 449 | auc_test, 450 | precision, 451 | recall, 452 | f1_score 453 | ] 454 | 455 | 456 | output_string = "\n".join(output_string_list) 457 | print(output_string) 458 | print("\n") 459 | splice = myData.time_splice 460 | if(not splice): 461 | splice = 1 462 | 463 | 464 | #summary_filename = "model_output_summary_{}_percent.txt".format(int(splice*100)) 465 | summary_filename = "model_output_summary.txt" 466 | 467 | with open(os.path.join(model_output_directory,summary_filename),'w') as outfile: 468 | outfile.write(output_string) 469 | 470 | 471 | def __format_sample_output(self,name, total_samples, nominal_samples, adverse_samples): 472 | 473 | total_samples = "{} Samples: {}".format(name,len(total_samples)) 474 | nominal_samples = " - Nominal: {}".format(len(nominal_samples)) 475 | adverse_samples = " - Adverse: {}".format(len(adverse_samples)) 476 | 477 | return "\n".join([total_samples,nominal_samples,adverse_samples]) 478 | 479 | 480 | 481 | 
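# ---------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original source file):
# the layer stack built earlier in this file scores every time step with a
# TimeDistributed sigmoid ("instance probability") and then collapses the
# sequence into a single bag-level probability with
# MaxPooling1D(pool_size=maxlen). The small NumPy example below reproduces
# only that aggregation step; the window length and probability values are
# made up for illustration.

import numpy as np

toy_instance_probs = np.array([[0.05, 0.10, 0.80, 0.30, 0.20]])  # shape (batch, maxlen)
toy_bag_prob = toy_instance_probs.max(axis=1)  # max over time == MaxPooling1D over the whole window
print(toy_bag_prob)  # [0.8] -> the sequence is flagged by its most anomalous instant
# ---------------------------------------------------------------------------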
-------------------------------------------------------------------------------- /source/dtmil/visualizations.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Wed Mar 20 15:55:46 2019 5 | 6 | @author: dweckler 7 | """ 8 | 9 | 10 | import numpy as np, matplotlib.pyplot as plt 11 | from keras import backend as T 12 | import time 13 | import os 14 | from .utilities import flat_avg 15 | from dtmil.configuration.config_dtmil import get_json_config_data 16 | from .prediction_data import Prediction_Data 17 | import math 18 | 19 | #%%class def 20 | 21 | class Visualizer: 22 | 23 | #TODO: Redesign this to work with multiple sources without depending on having all the data at once 24 | def __init__(self, myData, myModel, sample_idx = None, guidelines = True, prediction_data = None, dataset_dir = None, input_json_data = None): 25 | 26 | self.myData = myData 27 | self.myModel = myModel 28 | self._current_sample = sample_idx 29 | 30 | ##FIXME: make this update the visualization parameters every run (grab location of config file from myData?) 31 | 32 | if (input_json_data is not None): 33 | json_data = input_json_data 34 | 35 | else: 36 | _, json_data, _ = get_json_config_data(dataset_dir) 37 | 38 | 39 | 40 | self.visualization_params = json_data['visualization'] 41 | 42 | ##FIXME: Make this more able to be manually defined 43 | sf = 0.25 44 | self.xvec_scale_factor = sf 45 | 46 | self.xvec_timeline=np.arange((self.myData.maxlen-1)*sf,-sf,-sf) 47 | 48 | #this is to account for the extra value in the start and end indeces. Will be best practice to fix in the future 49 | self.xvec_temp_time_lookup = np.copy(self.xvec_timeline) 50 | self.xvec_temp_time_lookup = np.append(self.xvec_temp_time_lookup,self.xvec_timeline[-1]) 51 | 52 | 53 | 54 | if sample_idx == None: 55 | print(f"sample index is set to None, using default value") 56 | sample_idx = 0 57 | 58 | if prediction_data: 59 | self.prediction_data = prediction_data 60 | else: 61 | self.prediction_data = Prediction_Data(myData,myModel,sample_idx) 62 | 63 | self.guidelines = guidelines 64 | if (guidelines): 65 | self.get_guidelines() 66 | 67 | @classmethod 68 | def frompredictiondata(cls, prediction_data, guidelines = True): 69 | #initialize from preditcion data 70 | 71 | return cls(prediction_data.myData, prediction_data.myModel, prediction_data.current_sample, prediction_data = prediction_data) 72 | 73 | #%%plot sample timeline function 74 | 75 | @property 76 | def current_sample(self): 77 | return self._current_sample 78 | 79 | @current_sample.setter 80 | def current_sample(self,value): 81 | self._current_sample = value 82 | self.prediction_data = Prediction_Data(self.myData,self.myModel,value) 83 | 84 | def plot_sample_timeline(self, figure_size = None, saveFig = True): 85 | 86 | myModel = self.myModel 87 | model_output_directory = myModel.model_output_directory 88 | xtest = myModel.xtest 89 | 90 | if (saveFig): 91 | plt.switch_backend('agg') 92 | 93 | # function to get an intermediate layer's output (instance probabilities) 94 | inst_layer_output_fn = T.function([myModel.model.layers[0].input],[myModel.model.layers[-2].output]) 95 | 96 | temp=xtest 97 | L=inst_layer_output_fn([temp])[0] 98 | nex=int(temp.shape[0]/2) 99 | 100 | plt.figure(figsize=figure_size) 101 | plt.subplot(2,1,1) 102 | plt.plot(np.transpose(L[:nex,:,0]),'g') 103 | plt.ylim([-0.1,1.1]) 104 | #plt.xlabel('Time to adverse event',fontsize=14) 105 | #plt.xlabel('Sample 
timeline',fontsize=14) 106 | plt.ylabel('Probability of \n adverse event',fontsize=14) 107 | # plt.xticks([0,10,20],['1000 ft \n altitude', '10 mi', '20 mi'],rotation=0) 108 | #plt.gca().invert_xaxis() 109 | plt.subplot(2,1,2) 110 | plt.plot(np.transpose(L[nex:,:,0]),'r') 111 | plt.ylim([-0.1,1.1]) 112 | #plt.gca().invert_xaxis() 113 | plt.xlabel('sample timeline',fontsize=14) 114 | #plt.xticks([0,10,20],['1000 ft \n altitude', '10 mi', '20 mi'],rotation=0) 115 | plt.ylabel('Probability of \n adverse event',fontsize=14) 116 | 117 | temp=self.myData.xvalid 118 | L=inst_layer_output_fn([temp])[0] 119 | nex=int(temp.shape[0]/2) 120 | np.where(L[nex:,80:,0]>0.5)[0][:10] 121 | 122 | if(saveFig): 123 | plt.savefig(os.path.join(model_output_directory,"timeline.png")) 124 | 125 | #%%batch visualization function 126 | #FIXME: text sizing 127 | def visualize_sample_parameters(self,figure_size = None, saveFig = False, file_output_dir = "",file_output_type = "pdf",num_columns = 5, subplot_aspect_ratio = (1,1), subplot_size = 3.6): 128 | myData = self.myData 129 | # myModel = self.myModel 130 | 131 | 132 | if (saveFig): 133 | plt.switch_backend('agg') 134 | 135 | #specify the variables to be included in the plot 136 | correlated_states = myData.correlated_states.tolist() 137 | trained_states = myData.parameter_selection.tolist() 138 | parameters_to_plot=correlated_states + trained_states 139 | correlated_indeces = len(correlated_states) 140 | 141 | num_plots = len(parameters_to_plot) + 1 142 | num_rows = math.ceil(float(num_plots)/float(num_columns)) 143 | 144 | if figure_size is None: 145 | width = 4*num_columns 146 | height = num_rows * 3.5 147 | 148 | figure_size = (width,height) 149 | 150 | 151 | fig, axs = plt.subplots(num_rows,num_columns, figsize= figure_size) 152 | axs=axs.ravel() 153 | 154 | starting_index = -1-myData.maxlen+1 155 | 156 | for pltIdx in np.arange(len(parameters_to_plot)): 157 | selected_parameter = parameters_to_plot[pltIdx] 158 | 159 | plot_title = "{}".format(myData.header[selected_parameter]) 160 | #add holdout to the title if it's within the correlated indeces 161 | if (pltIdx < correlated_indeces): 162 | plot_title = plot_title + "(H/O)" 163 | 164 | self.plot_parameter(selected_parameter,axs[pltIdx],starting_index, plot_title = plot_title) 165 | 166 | # plot precursor score in a separate subplot 167 | pltIdx=pltIdx+1 168 | self.plot_precursor_score(axs[pltIdx],'Precursor Score') 169 | fig.tight_layout() 170 | 171 | # save figure if needed 172 | if saveFig: 173 | 174 | suffix = "_{}".format(self.myData.get_filename(self.current_sample)) 175 | 176 | file_label, file_dataset_type = self.myData.get_grouping(self.current_sample) 177 | 178 | filename = "{}_{}".format(file_label,file_dataset_type) 179 | 180 | save_figure(self.myModel,suffix,fig,file_output_dir,filename,file_output_type = 'pdf') 181 | #self.save_figure(fig,file_output_dir) 182 | 183 | 184 | 185 | def special_ranking_visualization(self, states_to_visualize,sorted_ranking_sums,figure_size = (10,10), saveFig = False, file_output_dir = "",file_output_type = "pdf"): 186 | myData = self.myData 187 | 188 | fig, axs = plt.subplots(3,3, figsize= figure_size) 189 | axs=axs.ravel() 190 | 191 | self.plot_precursor_score(axs[1],'Precursor Score') 192 | 193 | for i in range(6): 194 | selected_parameter = states_to_visualize[i] 195 | 196 | plot_title = "{} ({})".format(myData.header[selected_parameter],sorted_ranking_sums[i]) 197 | #add holdout to the title if it's within the correlated indeces 198 | 199 | 
self.plot_parameter(selected_parameter,axs[i+3],0, plot_title = plot_title) 200 | 201 | 202 | 203 | #TODO: same as below except ordered ranking parameters with a variable number of columns and such 204 | #output with values of ranking 205 | #figure out what the values mean to report to bryan tomorrow 206 | def visualize_top_ranking_parameters(self,ranking_group,feature_num_limit=None,num_columns = 4,displayfig = False): 207 | 208 | file_output_dir = "feature_ranking" 209 | myData = self.myData 210 | 211 | if (not displayfig): 212 | plt.switch_backend('agg') 213 | 214 | #get as many as we can 215 | #score_pair_lists = ranking_group.top_ranking_scores(1) 216 | 217 | #response_windows_lists = ranking_group.top_response_windows(1) 218 | response_windows_lists = ranking_group.ordered_response_windows_list 219 | 220 | if(feature_num_limit is not None): 221 | if len(response_windows_lists[0])> feature_num_limit: 222 | response_windows_lists = [lst[0:feature_num_limit] for lst in response_windows_lists] 223 | 224 | num_windows = len(response_windows_lists) 225 | #print(feature_num_limit,len(response_windows_lists[0]),len(response_windows_lists[1])) 226 | 227 | for idx,response_windows in enumerate(response_windows_lists): 228 | 229 | parameter_selection = [window.attribute_index for window in response_windows] 230 | 231 | # print([window.ranking_score for window in response_windows]) 232 | # print([window.most_important_sd_response for window in response_windows]) 233 | score_list = [round(window.ranking_score,3) for window in response_windows] 234 | 235 | sd_response_list = [] 236 | for window in response_windows: 237 | most_important_response = window.most_important_sd_response 238 | if most_important_response is not None: 239 | sd_response_list.append(str(most_important_response)) 240 | else: 241 | sd_response_list.append("n/a") 242 | 243 | #sd_response_list = [round(window.most_important_sd_response,3) for window in response_windows] 244 | 245 | 246 | num_plots = len(response_windows) + 1 247 | num_rows = math.ceil(float(num_plots)/float(num_columns)) 248 | 249 | width = 4*num_columns 250 | height = num_rows * 3.5 251 | 252 | figsize = (width,height) 253 | fig, axs = plt.subplots(num_rows,num_columns, figsize= figsize) 254 | 255 | axs=axs.ravel() 256 | fig.tight_layout() 257 | 258 | xvec_timeline = self.xvec_timeline 259 | plot_idx = 0 260 | 261 | axs[plot_idx].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r',linewidth=2,label = "Default") 262 | axs[plot_idx].set_title("Precursor Score",fontsize=10) 263 | axs[plot_idx].set_ylim([0,1]) 264 | axs[plot_idx].invert_xaxis() 265 | 266 | if(self.guidelines): 267 | axs[plot_idx].plot(self.xvec_timeline,self.precursor_score_guideline,'k--') 268 | 269 | graph_colors = ['b','g','k','y','c','m','k','w'] 270 | color_idx = 0 271 | 272 | sd_disturbances = ranking_group.parent.standard_deviation_disturbances 273 | 274 | #TODO: condense everything below into one function (rather than writing the same code twice) 275 | parameter_window_indeces = [ranking_group.parameter_list.index(i) for i in parameter_selection] 276 | parameter_windows = [ranking_group.parameter_windows[i] for i in parameter_window_indeces] 277 | 278 | #if this process isn't behind an if statement, the algorithm will output blank graphs 279 | #furthermore, it will cause some of the following graphs to come out blank as well 280 | #the cause of this is unknown, but may be useful to investigate in the future 281 | if len(parameter_windows)>0: 282 | 283 | #TODO: Figure out why 
this conditional became necessary and the one above stopped working? (maybe some revisions impacted it?) 284 | if len(parameter_windows[0].start_indeces)>0: 285 | 286 | start_index = parameter_windows[0].start_indeces[idx] 287 | end_index = parameter_windows[0].end_indeces[idx] 288 | 289 | window_start_idx = self.xvec_temp_time_lookup[start_index] 290 | window_end_idx = self.xvec_temp_time_lookup[end_index] 291 | 292 | axs[plot_idx].axvspan(window_start_idx, window_end_idx, alpha=0.1, color='k') 293 | for index,window in enumerate(parameter_windows): 294 | color_idx = 0 295 | plot_idx = index+1 296 | 297 | axs[plot_idx].invert_xaxis() 298 | #axs[plot_idx].set(adjustable='box', aspect=1) 299 | axs[plot_idx].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r', label = "Default",linewidth=2) 300 | axs[plot_idx].axvspan(window_start_idx, window_end_idx, alpha=0.1, color='k') 301 | 302 | 303 | for precursor_score in window.modified_precursor_scores: 304 | selected_parameter = parameter_selection[index] 305 | 306 | disturbance = sd_disturbances[color_idx] 307 | 308 | if disturbance > 0: 309 | label = "+ {} σ response".format(disturbance) 310 | 311 | else: 312 | label = "- {} σ response".format(abs(disturbance)) 313 | 314 | 315 | axs[plot_idx].plot(xvec_timeline,precursor_score,graph_colors[color_idx],linewidth=2,label = label) 316 | axs[plot_idx].set_title("{} \n({}, {} σ response)".format(myData.header[selected_parameter],score_list[index],sd_response_list[index]),fontsize=10) 317 | axs[plot_idx].set_ylim([0,1]) 318 | if(self.guidelines): 319 | axs[plot_idx].plot(self.xvec_timeline,self.precursor_score_guideline,'k--') 320 | color_idx += 1 321 | 322 | if(plot_idx>1): 323 | handles, labels = axs[plot_idx].get_legend_handles_labels() 324 | fig.legend(handles, labels, loc='lower right') 325 | 326 | #save the figure 327 | plt.tight_layout() 328 | 329 | file_label, file_dataset_type = self.myData.get_grouping(ranking_group.data_ID) 330 | filename = "{}_{}_ranking".format(file_label,file_dataset_type) 331 | 332 | suffix = "_{}".format(self.myData.get_filename(ranking_group.data_ID)) 333 | 334 | if num_windows > 1: 335 | suffix = "{}_precursor_event_{}".format(suffix,idx) 336 | 337 | save_figure(self.myModel,suffix,fig,file_output_dir,filename,output_time = False) 338 | 339 | else: 340 | #TODO: 341 | print("Precursor score for {} does not cross threshold?".format(self.myData.get_filename(ranking_group.data_ID))) 342 | 343 | else: 344 | print("Precursor score for {} does not cross threshold!".format(self.myData.get_filename(ranking_group.data_ID))) 345 | 346 | 347 | # def visualize_ranking_data(self,ranking_group, output_file = None, parameter_selection = None, num_columns = 7, subplot_aspect_ratio = (1,1), subplot_size = 3.6): 348 | # myData = self.myData 349 | # print("generating ranking data plot") 350 | # 351 | # if parameter_selection is None: 352 | # parameter_selection = myData.parameter_selection.tolist() 353 | # 354 | # #all the paramaeters plus the precursor score in its own plot 355 | # num_plots = len(parameter_selection) + 1 356 | # num_rows = math.ceil(float(num_plots)/float(num_columns)) 357 | # dx, dy = subplot_aspect_ratio 358 | # figsize = plt.figaspect(float(dy * num_rows) / float(dx * num_columns)) * subplot_size 359 | # 360 | # fig, axs = plt.subplots(num_rows,num_columns, figsize= figsize) 361 | # #fig, axs = plt.subplots(numRows,numColumns) 362 | # axs=axs.ravel() 363 | # fig.tight_layout() 364 | # #xvec_timeline=np.arange((myData.maxlen-1)*0.25,-0.25,-0.25) 365 
| # 366 | # xvec_timeline = self.xvec_timeline 367 | # 368 | # axs[0].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r',linewidth=2) 369 | # axs[0].set_title("Normal",fontsize=10) 370 | # axs[0].set_ylim([0,1]) 371 | # axs[0].invert_xaxis() 372 | # 373 | # graph_colors = ['b','g','k','y'] 374 | # color_idx = 0 375 | # 376 | # parameter_window_indeces = [ranking_group.parameter_list.index(i) for i in parameter_selection] 377 | # parameter_windows = [ranking_group.parameter_windows[i] for i in parameter_window_indeces] 378 | # 379 | # for index,window in enumerate(parameter_windows): 380 | # color_idx = 0 381 | # plot_idx = index+1 382 | # axs[plot_idx].invert_xaxis() 383 | # 384 | # for precursor_score in window.modified_precursor_scores: 385 | # selected_parameter = parameter_selection[index] 386 | # 387 | # axs[plot_idx].plot(xvec_timeline,precursor_score,graph_colors[color_idx],linewidth=2) 388 | # axs[plot_idx].set_title("{} ({})".format(myData.header[selected_parameter],selected_parameter),fontsize=10) 389 | # axs[plot_idx].set_ylim([0,1]) 390 | # axs[plot_idx].plot(xvec_timeline,ranking_group.prediction_data.precursor_score,'r',linewidth=1) 391 | # color_idx += 1 392 | 393 | 394 | #%%save figure 395 | 396 | def save_figure(self, fig,file_output_dir,file_output_type = 'pdf'): 397 | 398 | save_figure(self.myModel,self.current_sample,fig,file_output_dir,"parameters_graph",file_output_type = 'pdf') 399 | 400 | 401 | #%%plot precursor score 402 | 403 | def plot_precursor_score(self, plot_axis, plot_title = "Precursor Score", start_index = None, end_index = None): 404 | precursor_score = self.prediction_data.precursor_score 405 | plot_axis.plot(self.xvec_timeline[start_index:end_index], precursor_score[start_index:end_index],'r',linewidth=2) 406 | 407 | if(self.guidelines): 408 | plot_axis.plot(self.xvec_timeline[start_index:end_index],self.precursor_score_guideline[start_index:end_index],'k--') 409 | 410 | plot_axis.invert_xaxis() 411 | plot_axis.set_title(plot_title,fontsize=10) 412 | plot_axis.set_ylim([0,1]) 413 | 414 | 415 | #%%plot indivudual parameter 416 | 417 | def plot_parameter(self, selected_parameter, plot_axis,starting_index = 0,end_index = None,plot_title = "", precIdx = None): 418 | 419 | ##FIXME: Make this more able to be manually defined 420 | xvec_timeline=self.xvec_timeline 421 | 422 | #FIXME: Make Prediction Data update states_orig ("visualization_sample") 423 | parameter_values = self.prediction_data.visualization_window[starting_index:end_index,selected_parameter] 424 | 425 | # plot time series variable 426 | plot_axis.plot(xvec_timeline[starting_index:end_index],parameter_values,linewidth=2) 427 | 428 | ##plot the guidelines 429 | # if discrete variable, use discrete nominal data as guideline, else use continuous nominal data 430 | if selected_parameter in self.visualization_params["binary_parameters"]: 431 | plot_axis.plot(xvec_timeline[starting_index:end_index],self.discrete_nominal_guideline[starting_index:end_index,selected_parameter],'k--',linewidth=2) 432 | plot_axis.set_ylim([-0.1,1.1]) 433 | else: 434 | plot_axis.plot(xvec_timeline[starting_index:end_index],self.nominal_guideline[0,starting_index:end_index,selected_parameter],'k--',linewidth=2) 435 | plot_axis.plot(xvec_timeline[starting_index:end_index],self.nominal_guideline[1,starting_index:end_index,selected_parameter],'k--',linewidth=2) 436 | 437 | ##use this if we are dealing with multiple precursor score predictions, otherwise use the one genereated upon class initialization 438 | if 
(precIdx): 439 | precursor_indeces = precIdx 440 | else: 441 | precursor_indeces = self.prediction_data.precursor_indeces 442 | 443 | # plot precursor time instants as an overlay 444 | if len(precursor_indeces)>0: 445 | 446 | precursor_overlay_values = self.prediction_data.visualization_window[precursor_indeces,selected_parameter] 447 | 448 | self.precursor_overlay_values = precursor_overlay_values 449 | if(end_index): 450 | if end_index >= precursor_indeces[0]: 451 | precursor_end_index = (np.abs(precursor_indeces - (end_index))).argmin() 452 | print(precursor_end_index,end_index) 453 | 454 | plot_axis.plot(xvec_timeline[precursor_indeces][0:precursor_end_index],precursor_overlay_values[0:precursor_end_index],'ro', alpha = 0.4) 455 | else: 456 | plot_axis.plot(xvec_timeline[precursor_indeces],precursor_overlay_values,'ro', alpha = 0.4) 457 | 458 | # 459 | if plot_title == "": 460 | plot_title = "{} ({})".format(self.myData.header[selected_parameter],selected_parameter) 461 | 462 | plot_axis.set_title(plot_title,fontsize=10) 463 | 464 | # # invert x-axis so that distance to touchdown reduces as we go towards rightside of the plot 465 | plot_axis.invert_xaxis() 466 | 467 | #%%get guidelines 468 | 469 | def get_guidelines(self): 470 | myData = self.myData 471 | optimal_values=myData.states_orig[:,np.concatenate((myData.I_opt,myData.I_opt_valid),axis=0)] 472 | #determine guidelines 473 | guideline_type = self.visualization_params["guideline_type"] 474 | if guideline_type == 1: 475 | optimal_standard_dev = np.std(optimal_values, axis=1) 476 | optimal_mean = np.mean(optimal_values,axis = 1) 477 | 478 | avg_guideline =flat_avg(optimal_mean) 479 | sdev_guideline = flat_avg(optimal_standard_dev) 480 | 481 | sdev_scale = 2.5 482 | upper_guideline = avg_guideline + sdev_scale * sdev_guideline 483 | lower_guideline = avg_guideline - sdev_scale * sdev_guideline 484 | nominal_guideline = np.array([lower_guideline, upper_guideline]) 485 | else: 486 | # get nominal percentiles for plotting 487 | nominal_guideline=np.percentile(optimal_values,[10,90],axis=1) 488 | 489 | self.nominal_guideline = nominal_guideline 490 | # Get nominal values assuming binary (note that we will only use this if the variable is binary) 491 | self.discrete_nominal_guideline=np.mean(optimal_values,axis=1) 492 | self.precursor_score_guideline = np.full(optimal_values.shape[0],self.prediction_data.precursor_threshold) 493 | 494 | 495 | 496 | 497 | def save_figure(myModel, figure_suffix, fig,file_output_dir,filename,file_output_type = 'pdf', output_time = True): 498 | time_start = time.time() 499 | print("Saving figure: {}".format(figure_suffix)) 500 | model_output_directory = myModel.model_output_directory 501 | 502 | if model_output_directory != "": 503 | model_output_directory = os.path.join(model_output_directory,file_output_dir) 504 | if not os.path.exists(model_output_directory): 505 | print(f"creating directory {model_output_directory}") 506 | os.makedirs(model_output_directory) 507 | 508 | 509 | 510 | filename = "{}{}.{}".format(filename,figure_suffix,"pdf") 511 | filepath = os.path.join(model_output_directory,filename) 512 | 513 | #print("Saving figure: {}".format(filepath)) 514 | 515 | fig.savefig(filepath,format= file_output_type) 516 | 517 | # if(output_time): 518 | # print("Total time to save figure: {}".format(time.time()-time_start)) 519 | 520 | def visualize(myData, myModel,sample_idx = 0, savefig = False): 521 | 522 | vis = Visualizer(myData,myModel,sample_idx) 523 | 524 | vis.plot_sample_timeline(figure_size = 
(8,6), saveFig = savefig) 525 | 526 | print("Visualizing Sample {}".format(sample_idx)) 527 | vis.visualize_sample_parameters(figure_size=(32,24),saveFig = savefig) 528 | 529 | 530 | 531 | --------------------------------------------------------------------------------
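# ---------------------------------------------------------------------------
# Editor's note (illustrative usage sketch, not part of the original source):
# the module-level visualize() helper defined above wraps the two most common
# plots (sample timeline + per-parameter grid). This sketch assumes a data
# container (myData) and a trained model container (myModel) have already
# been produced by the training pipeline; how they are reloaded from disk is
# not shown in visualizations.py.

from dtmil.visualizations import Visualizer, visualize

# one-call convenience wrapper for a single sample
visualize(myData, myModel, sample_idx=0, savefig=True)

# or drive the Visualizer class directly and step through several samples;
# assigning current_sample recomputes the Prediction_Data for the new sample
vis = Visualizer(myData, myModel, sample_idx=0)
for idx in range(3):
    vis.current_sample = idx
    vis.visualize_sample_parameters(saveFig=True, file_output_type="pdf")
# ---------------------------------------------------------------------------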