├── MKAD Corporate CLA_2021.pdf
├── MKAD Individual CLA_2021.pdf
├── MKAD NOSA 2019.pdf
├── PythonCode
│   ├── SAX.py
│   ├── config_template.json
│   ├── preprocess_files_multiprocess.py
│   ├── run_mkad.py
│   └── visualization.py
├── README.md
├── documentation
│   └── README.docx
└── kernels
    ├── README.rst
    ├── __init__.py
    ├── setup.py
    └── src
        └── nlcs
            ├── lcs.cpp
            ├── lcs.h
            └── nlcs_wrapper.cpp

--------------------------------------------------------------------------------
/MKAD Corporate CLA_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa/PyMKAD/02d29db9e13ceffc7fdabb188948618da40306a1/MKAD Corporate CLA_2021.pdf

--------------------------------------------------------------------------------
/MKAD Individual CLA_2021.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa/PyMKAD/02d29db9e13ceffc7fdabb188948618da40306a1/MKAD Individual CLA_2021.pdf

--------------------------------------------------------------------------------
/MKAD NOSA 2019.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa/PyMKAD/02d29db9e13ceffc7fdabb188948618da40306a1/MKAD NOSA 2019.pdf

--------------------------------------------------------------------------------
/PythonCode/SAX.py:
--------------------------------------------------------------------------------
1 | '''
2 | @author: Bryan Matthews KBRWyle
3 | Data Science Group
4 | NASA Ames Research Center
5 | 
6 | This code provides helper functions used by both preprocess_files_multiprocess.py and run_mkad.py.
7 | 
8 | Code Updated: 2019-03-08
9 | '''
10 | 
11 | 
12 | import numpy as np
13 | from scipy import signal
14 | import csv
15 | import json
16 | import pickle
17 | from glob import glob
18 | import time
19 | import gzip
20 | import os,sys
21 | import io
22 | import pandas as pd
23 | import nlcs
24 | 
25 | global cut_points
26 | cut_points={ '2': [-np.inf,0],
27 |              '3': [-np.inf,-0.43,0.43],
28 |              '4': [-np.inf,-0.67,0,0.67],
29 |              '5': [-np.inf,-0.84,-0.25,0.25,0.84],
30 |              '6': [-np.inf,-0.97,-0.43,0,0.43,0.97],
31 |              '7': [-np.inf,-1.07,-0.57,-0.18,0.18,0.57,1.07],
32 |              '8': [-np.inf,-1.15,-0.67,-0.32,0,0.32,0.67,1.15],
33 |              '9': [-np.inf,-1.22,-0.76,-0.43,-0.14,0.14,0.43,0.76,1.22],
34 |              '10': [-np.inf,-1.28,-0.84,-0.52,-0.25,0,0.25,0.52,0.84,1.28],
35 |              '11': [-np.inf,-1.34,-0.91,-0.6,-0.35,-0.11,0.11,0.35,0.6,0.91,1.34],
36 |              '12': [-np.inf,-1.38,-0.97,-0.67,-0.43,-0.21,0,0.21,0.43,0.67,0.97,1.38],
37 |              '13': [-np.inf,-1.43,-1.02,-0.74,-0.5,-0.29,-0.1,0.1,0.29,0.5,0.74,1.02,1.43],
38 |              '14': [-np.inf,-1.47,-1.07,-0.79,-0.57,-0.37,-0.18,0,0.18,0.37,0.57,0.79,1.07,1.47],
39 |              '15': [-np.inf,-1.5,-1.11,-0.84,-0.62,-0.43,-0.25,-0.08,0.08,0.25,0.43,0.62,0.84,1.11,1.5],
40 |              '16': [-np.inf,-1.53,-1.15,-0.89,-0.67,-0.49,-0.32,-0.16,0,0.16,0.32,0.49,0.67,0.89,1.15,1.53],
41 |              '17': [-np.inf,-1.56,-1.19,-0.93,-0.72,-0.54,-0.38,-0.22,-0.07,0.07,0.22,0.38,0.54,0.72,0.93,1.19,1.56],
42 |              '18': [-np.inf,-1.59,-1.22,-0.97,-0.76,-0.59,-0.43,-0.28,-0.14,0,0.14,0.28,0.43,0.59,0.76,0.97,1.22,1.59],
43 |              '19': [-np.inf,-1.62,-1.25,-1,-0.8,-0.63,-0.48,-0.34,-0.2,-0.07,0.07,0.2,0.34,0.48,0.63,0.8,1,1.25,1.62],
44 |              '20': [-np.inf,-1.64,-1.28,-1.04,-0.84,-0.67,-0.52,-0.39,-0.25,-0.13,0,0.13,0.25,0.39,0.52,0.67,0.84,1.04,1.28,1.64]}
45 | 
46 | def read_pandas(filename):
47 |     gz = gzip.open(filename, 'rb')
48 |     f = io.BufferedReader(gz)
49 |     data= pd.read_csv(f,low_memory=False).replace('False','0').replace('True','1').replace('DNE','nan')
50 |     f.close()
51 |     gz.close()
52 |     header = np.array(data.keys())
53 |     return(header,data.values.astype(float))
54 | 
55 | 
56 | def quantize_lookup_table(x,alphabet_size):
57 |     global cut_points
58 |     return(alphabet_size-list(np.flipud(np.array(cut_points[str(alphabet_size)],dtype=float)<=x)).index(True))
59 | 
60 | 
61 | def quantize_time_series(Data,params,alphabet,window_size):
62 | 
63 |     quantized_data = np.zeros((int(np.ceil(Data['data'].shape[0]/float(window_size))),len(params['continuous_indx'])),dtype=int)
64 |     for i in range(int(np.ceil(Data['data'].shape[0]/float(window_size)))):
65 |         jj=0
66 |         for j in params['continuous_indx']:
67 |             max_range=min([(i+1)*window_size-1,Data['data'].shape[0]])
68 |             val = np.mean(Data['data'][i*window_size:max_range,j])
69 |             quantized_data[i,jj]=quantize_lookup_table(val,alphabet)
70 |             jj+=1
71 |     return(quantized_data)
72 | 
73 | def convert_disc_2_seq(Data,params):
74 |     if(len(params['discrete_indx'])>0):
75 |         changes=np.diff(Data['data'][:,params['discrete_indx']],axis=0)
76 |         for i in range(changes.shape[1]):
77 |             changes[changes[:,i]==1,i]=(i+1)*2-1
78 |             changes[changes[:,i]==-1,i]=(i+1)*2
79 |         seq=changes.flatten()
80 |         seq=np.append(seq[seq!=0],0)
81 |     else:
82 |         seq=np.array([1])
83 |     return(seq)
84 | 
85 | def output_vector_SVMlight(filename,append,quantized_data,discrete_seq):
86 |     FeatureV=[len(discrete_seq),quantized_data.shape[0]]
87 |     FeatureV.extend(list(discrete_seq.astype(int)))
88 |     FeatureV.extend(list(np.transpose(quantized_data).flatten()))
89 |     if(append):
90 |         fid=open(filename,'a')
91 |     else:
92 |         fid=open(filename,'w')
93 |     fid.write("1 ")
94 |     for i in range(len(FeatureV)):
95 |         fid.write(str(i+1)+":"+str(FeatureV[i])+" ")
96 |     fid.write("\n")
97 |     fid.close()
98 |     return([])
99 | 
100 | 
101 | def find_param_indices(header,params):
102 |     indx=[]
103 |     for p in params:
104 |         indx.append(list(header).index(p))
105 |     return(tuple(indx))
106 | 
107 | def load_FOQA_csv(filename):
108 |     header,data = read_pandas(filename)
109 |     data[0,np.isnan(data[0,:])]=0
110 |     for i,row in enumerate(data[:-1,:]):
111 |         indx_nans = np.isnan(data[i+1,:])
112 |         data[i+1,indx_nans] = data[i,indx_nans]
113 |     return({'header':header,'data':data})
114 | 
115 | #Finds the touchdown point and the descent beginning at the cutoff altitude.
116 | def find_marker(Data,important_params):
117 |     alt_indx=list(Data['header']).index(str(important_params['alt']))
118 |     td_indicator_indx=list(Data['header']).index(str(important_params['td_indicator']))
119 |     middle_indx=list(signal.filtfilt(np.ones((30),dtype=float),np.ones((1),dtype=float),Data['data'][:,alt_indx])/30**2>15000).index(True) #30 sec windowed filter to get rid of startup noise.
120 |     td_indx=list(np.diff(Data['data'][middle_indx:,td_indicator_indx])>0).index(1)+middle_indx
121 |     return({'middle_indx':middle_indx,'td_indx':td_indx,'alt_indx':alt_indx})
122 | 
123 | def get_approach(Data,start_alt,markers):
124 |     ##Adjust altitudes by touchdown altitude##
125 |     Data['data'][:,markers['alt_indx']]=Data['data'][:,markers['alt_indx']]-Data['data'][markers['td_indx'],markers['alt_indx']]
126 |     start_indx=markers['td_indx']-list(np.flipud(Data['data'][markers['middle_indx']:markers['td_indx'],markers['alt_indx']])>start_alt).index(True)
127 |     Data['data']=Data['data'][start_indx:markers['td_indx'],:]
128 |     return(Data)
129 | 
130 | # Keeps track of running first- and second-order statistics (power sums S0, S1, S2) for a streaming mean/std.
131 | def zscore_stream(data,statistics={'dataMean':[],'dataStd':[],'S0':[],'S1':[],'S2':[]}):
132 | 
133 |     if(len(statistics['dataMean'])==0):
134 |         statistics['S0']=np.zeros((data.shape[1]),dtype=int)
135 |         statistics['S1']=np.zeros((data.shape[1]),dtype=int)
136 |         statistics['S2']=np.zeros((data.shape[1]),dtype=int)
137 |         statistics['dataMean']=np.zeros((data.shape[1]),dtype=float)
138 |         statistics['dataStd']=np.zeros((data.shape[1]),dtype=float)
139 | 
140 |     statistics['S0']=statistics['S0']+np.sum(data**0,axis=0)
141 |     statistics['S1']=statistics['S1']+np.sum(data**1,axis=0)
142 |     statistics['S2']=statistics['S2']+np.sum(data**2,axis=0)
143 | 
144 |     for i in range(data.shape[1]):
145 |         statistics['dataMean'][i]=statistics['S1'][i]/statistics['S0'][i]
146 |         statistics['dataStd'][i]=(1.0/statistics['S0'][i])*np.sqrt(np.abs(statistics['S0'][i]*statistics['S2'][i]-statistics['S1'][i]**2))
147 |     statistics['dataStd'][statistics['dataStd']==0]=1
148 |     return(statistics)
149 | 
150 | # Merge reduce function to compute global statistics.
151 | def zscore_stream_merge(statistics1,statistics2):
152 | 
153 |     statistics1['S0']+=statistics2['S0']
154 |     statistics1['S1']+=statistics2['S1']
155 |     statistics1['S2']+=statistics2['S2']
156 | 
157 |     for i in range(statistics1['dataMean'].shape[0]):
158 |         statistics1['dataMean'][i]=statistics1['S1'][i]/statistics1['S0'][i]
159 |         statistics1['dataStd'][i]=(1.0/statistics1['S0'][i])*np.sqrt(np.abs(statistics1['S0'][i]*statistics1['S2'][i]-statistics1['S1'][i]**2))
160 |     statistics1['dataStd'][statistics1['dataStd']==0]=1
161 |     return(statistics1)
162 | 
163 | # Calls nlcs from the C-extension code, compiled separately via the kernels module.
164 | def MKAD_kernel_function(A,B):
165 |     return(nlcs.compute(np.atleast_2d(np.array(A,dtype=np.uint16)),np.atleast_2d(np.array(B,dtype=np.uint16))))
166 | 

--------------------------------------------------------------------------------
/PythonCode/config_template.json:
--------------------------------------------------------------------------------
1 | {
2 | "name":"RUN_ID",
3 | "filelist":"/path/to/filelist_of_gzip_csvs.txt",
4 | "working_dir":"/path/to/working/directory/",
5 | "svmlight_file":"/path/to/svmlight/filename.txt",
6 | "MKAD_folder":"/path/to/mkad/output/folder/",
7 | "important_params":{"alt":"ALTITUDE",
8 |                     "td_indicator":"LANDING GEAR COMPRESSED",
9 |                     "ground_speed": "GRND SPEED"},
10 | "params":{"continuous":"/path/to/continuous/parameter/continuous_parameter_list.txt",
11 |           "discrete":"/path/to/discrete/parameter/discrete_parameter_list.txt"},
12 | "nu":0.1,
13 | "alphabet":10,
14 | "window_size":30,
15 | "starting_alt":6000,
16 | "cluster_eps":0.07,
17 | "save_kernel":true,
18 | "use_existing_kernel":false
19 | }
20 | 
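A note on the quantization above: the cut_points table holds standard-normal breakpoints, and quantize_lookup_table maps each z-scored window mean to the 1-based index of the interval it falls into. The following is a minimal illustrative sketch of that lookup, not part of the repository; the breakpoints shown are the alphabet-10 row of the table and the input values are made up:

```python
# Illustrative sketch of SAX.quantize_lookup_table for alphabet_size=10 (values are made up).
import numpy as np

cuts_10 = [-np.inf, -1.28, -0.84, -0.52, -0.25, 0, 0.25, 0.52, 0.84, 1.28]

def quantize(x, cut_points):
    # Symbol = number of breakpoints at or below x (1..alphabet_size);
    # this is equivalent to the flipud/index(True) arithmetic in SAX.py.
    return int(np.sum(np.array(cut_points, dtype=float) <= x))

# Per-window means of a z-scored signal become SAX symbols:
print([quantize(v, cuts_10) for v in [-1.5, -0.3, 0.1, 0.9, 2.0]])  # -> [1, 4, 6, 9, 10]
```

Because the breakpoints are standard-normal quantiles, each symbol is roughly equally likely for z-scored data, which keeps the discretized alphabet balanced.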
--------------------------------------------------------------------------------
/PythonCode/preprocess_files_multiprocess.py:
--------------------------------------------------------------------------------
1 | #!${HOME}/anaconda3/bin/python
2 | 
3 | # __________________________________________________________________________
4 | #
5 | # Notices:
6 | #
7 | # Copyright 2010, 2019 United States Government as represented by the Administrator of the National Aeronautics and
8 | # Space Administration. All Rights Reserved.
9 | #
10 | # Disclaimers
11 | #
12 | # No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED,
13 | # IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM
14 | # TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM
15 | # FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION,
16 | # IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN
17 | # ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE
18 | # PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY
19 | # DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE,
20 | # AND DISTRIBUTES IT "AS IS."
21 | #
22 | # Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT,
23 | # ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE
24 | # RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY
25 | # DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT
26 | # SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL
27 | # AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL
28 | # BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT.
29 | #
30 | # __________________________________________________________________________
31 | #
32 | #
33 | '''
34 | @author: Bryan Matthews KBRWyle
35 | Data Science Group
36 | NASA Ames Research Center
37 | 
38 | This code is designed to preprocess gzipped CSV files using Symbolic Aggregate approXimation (SAX).
39 | The resulting vectors are stored in an SVMlight-format file. Usage:
40 | $>python preprocess_files_multiprocess.py config.json number_of_processes(optional)
41 | 
42 | Code Updated: 2019-03-08
43 | '''
44 | 
45 | 
46 | 
47 | import numpy as np
48 | from scipy import signal
49 | import csv
50 | import json
51 | import pickle
52 | from glob import glob
53 | import time
54 | import gzip
55 | import sys,os
56 | import SAX
57 | from multiprocessing import Process
58 | from progress.bar import IncrementalBar
59 | import warnings
60 | warnings.simplefilter(action='ignore', category=FutureWarning)
61 | 
62 | # Distributed worker for loading, partitioning, and computing statistics of flights.
63 | def worker(filelist,config,thread_id):
64 |     first_time=True
65 |     continuous_params = np.atleast_1d(np.genfromtxt(config['params']['continuous'],delimiter="\n",comments="@",dtype=str))
66 |     discrete_params = np.atleast_1d(np.genfromtxt(config['params']['discrete'],delimiter="\n",comments="@",dtype=str))
67 |     data_cube = {'continuous':np.zeros((filelist.shape[0], 81, continuous_params.shape[0]), dtype=float),
68 |                  'discrete':np.zeros((filelist.shape[0], 81, discrete_params.shape[0]), dtype=float),
69 |                  'continuous_params':continuous_params,'discrete_params':discrete_params, 'filelist':[]}
70 |     bar = IncrementalBar('Task '+str(100+thread_id)[1:]+': Partitioning Flights...', max=len(filelist))
71 |     for i,f in enumerate(filelist):
72 |         data_cube['filelist'].append(os.path.basename(f).split(".")[0])
73 |         try:
74 |             Data = SAX.load_FOQA_csv(f)
75 |             markers=SAX.find_marker(Data,config['important_params'])
76 |             Data=SAX.get_approach(Data,config['starting_alt'],markers)
77 |         except ValueError:
78 |             continue
79 |         config['params']['continuous_indx']=SAX.find_param_indices(Data['header'],continuous_params)
80 |         config['params']['discrete_indx']=SAX.find_param_indices(Data['header'],discrete_params)
81 |         if(first_time):
82 |             statistics=SAX.zscore_stream(Data['data'][:,config['params']['continuous_indx']])
83 |             first_time=False
84 |         else:
85 |             statistics=SAX.zscore_stream(Data['data'][:,config['params']['continuous_indx']],statistics)
86 |         xvec = np.flipud(np.cumsum(np.flipud(Data['data'][:, np.where(np.array(Data['header']) == config['important_params']['ground_speed'])[0]])) / 3600)
87 |         bins = [np.intersect1d(np.where((xvec >= d)), np.where(xvec < (d + 0.25))) for d in np.linspace(0, 20 - 0.25, 80)] #Create 20 NM to 0 NM vector in 0.25 mile bins.
88 |         bins.append(np.intersect1d(np.where(xvec >= 20.0), np.where(xvec < np.inf)))
89 | 
90 |         data_cube['continuous'][i, :, :] = np.array([np.mean(Data['data'][b,:][:,config['params']['continuous_indx']],axis=0) if len(b)>0 else np.zeros((len(config['params']['continuous_indx'])),dtype=float)*np.nan for b in np.flipud(bins)])
91 |         data_cube['discrete'][i, :, :] = np.array([np.mean(Data['data'][b, :][:, config['params']['discrete_indx']],axis=0) if len(b) > 0 else np.zeros((len(config['params']['discrete_indx'])), dtype=float) * np.nan for b in np.flipud(bins)])
92 |         pickle.dump(Data,open(os.path.join(config['working_dir'],'data',os.path.basename(f).replace('.csv.gz','.pkl')),'wb'))
93 |         bar.next()
94 |     bar.finish()
95 |     pickle.dump(statistics,open(os.path.join(config['working_dir'],'statistics_'+str(thread_id)+'.pkl'),'wb'))
96 |     pickle.dump(data_cube, open(os.path.join(config['working_dir'], 'data_cube_' + str(thread_id) + '.pkl'), 'wb'))
97 |     return()
98 | 
99 | # Distributed worker for applying SAX vectorization to flight data.
100 | def worker_SAX(filelist,config,statistics,thread_id):
101 | 
102 |     good_indx=np.zeros((len(filelist)),dtype=bool)
103 |     continuous_params = np.atleast_1d(np.genfromtxt(config['params']['continuous'],delimiter="\n",comments="@",dtype=str))
104 |     discrete_params = np.atleast_1d(np.genfromtxt(config['params']['discrete'],delimiter="\n",comments="@",dtype=str))
105 |     bar = IncrementalBar('Task '+str(100+thread_id)[1:]+': Creating SAX Vector...', max=len(filelist))
106 |     first_time=True
107 |     for i,f in enumerate(filelist):
108 |         Data=pickle.load(open(f,'rb'))
109 |         config['params']['continuous_indx']=SAX.find_param_indices(Data['header'],continuous_params)
110 |         config['params']['discrete_indx']=SAX.find_param_indices(Data['header'],discrete_params)
111 |         Data['data'][:,config['params']['continuous_indx']]=(Data['data'][:,config['params']['continuous_indx']]-np.tile(statistics['dataMean'],[Data['data'].shape[0],1]))/np.tile(statistics['dataStd'],[Data['data'].shape[0],1])
112 |         Data['data'][np.isnan(Data['data'])]=0
113 |         quantized_data=SAX.quantize_time_series(Data,config['params'],config['alphabet'],config['window_size'])
114 |         good_indx[i]=quantized_data.shape[0]!=0
115 |         if(not good_indx[i]):
116 |             continue
117 |         discrete_seq=SAX.convert_disc_2_seq(Data,config['params'])
118 |         if(first_time):
119 |             first_time=False
120 |             # if(os.path.dirname(config['svmlight_file'])!=""):
121 |             print(os.path.dirname(config['svmlight_file']))
122 |             os.makedirs(os.path.dirname(config['svmlight_file']), exist_ok=True)
123 |             SAX.output_vector_SVMlight(config['svmlight_file']+'_'+str(100+thread_id)[1:],False,quantized_data,discrete_seq)
124 |         else:
125 |             SAX.output_vector_SVMlight(config['svmlight_file']+'_'+str(100+thread_id)[1:],True,quantized_data,discrete_seq)
126 |         bar.next()
127 |     bar.finish()
128 |     np.savetxt(os.path.join(config['working_dir'],"filelist_in_svmlight_file_"+str(100+thread_id)[1:]+".txt"),np.array(filelist)[good_indx],fmt="%s")
129 |     return()
130 | 
131 | def cat_files(filelist,output):
132 |     fid_out = open(output,'w')
133 |     for f in filelist:
134 |         with open(f,'r') as fid:
135 |             data = fid.read()
136 |         fid_out.write(data)
137 |     fid_out.close()
138 | 
139 | if __name__ == '__main__':
140 | 
141 |     if(len(sys.argv)<2):
142 |         print("Usage:")
143 |         print("$>python preprocess_files_multiprocess.py config.json number_of_processes(optional)")
144 |         quit()
145 | 
146 |     skip_partitioning=False #For debugging purposes. Skips the initial partitioning of flights when True.
147 | 
148 |     # Limit multithreading in scientific packages like BLAS to 1 thread to avoid conflicts with our multiprocess preprocessing steps.
149 |     os.environ["OMP_NUM_THREADS"] = "1"
150 | 
151 |     startT = time.time()
152 |     config=json.load(open(sys.argv[1]))
153 |     if(len(sys.argv)<3):
154 |         number_of_processes=1.0
155 |     else:
156 |         number_of_processes=float(sys.argv[2])
157 | 
158 |     os.makedirs(os.path.join(config['working_dir'],'data'), exist_ok=True)
159 |     if not skip_partitioning:
160 |         print("Partitioning flights from "+str(config['starting_alt'])+ " ft to landing...")
161 |         filelist=np.genfromtxt(config['filelist'],delimiter='\n',dtype=str)
162 |         size_per_thread=np.ceil(float(filelist.shape[0])/number_of_processes)
163 |         jobs=[]
164 |         for i in range(int(number_of_processes)):
165 |             p = Process(target=worker, args=(filelist[int((i)*size_per_thread):int(min(int((i+1)*size_per_thread),filelist.shape[0]))],config,i))
166 |             jobs.append(p)
167 |             p.start()
168 |         while len(jobs) > 0:
169 |             jobs = [job for job in jobs if job.is_alive()]
170 |             time.sleep(1)
171 | 
172 |         statistics=pickle.load(open(os.path.join(config['working_dir'],'statistics_0.pkl'),'rb'))
173 |         data_cube = pickle.load(open(os.path.join(config['working_dir'],'data_cube_0.pkl'), 'rb'))
174 |         for i in range(1,int(number_of_processes)):
175 |             statistics2=pickle.load(open(os.path.join(config['working_dir'],'statistics_'+str(i)+'.pkl'),'rb'))
176 |             statistics=SAX.zscore_stream_merge(statistics,statistics2)
177 |             data_cube_tmp = pickle.load(open(os.path.join(config['working_dir'], 'data_cube_' + str(i) + '.pkl'), 'rb'))
178 |             data_cube['continuous'] = np.vstack((data_cube['continuous'],data_cube_tmp['continuous']))
179 |             data_cube['discrete'] = np.vstack((data_cube['discrete'], data_cube_tmp['discrete']))
180 |             data_cube['filelist'].extend(data_cube_tmp['filelist'])
181 | 
182 |         pickle.dump(statistics,open(os.path.join(config['working_dir'],'statistics.pkl'),'wb'))
183 |         pickle.dump(data_cube, open(os.path.join(config['working_dir'],'data_cube.pkl'), 'wb'))
184 |         for i in range(int(number_of_processes)):
185 |             os.remove(os.path.join(config['working_dir'],'statistics_'+str(i)+'.pkl'))
186 |             os.remove(os.path.join(config['working_dir'],'data_cube_' + str(i) + '.pkl'))
187 | 
188 |     first_time=True
189 |     statistics=pickle.load(open(os.path.join(config['working_dir'],'statistics.pkl'),'rb'))
190 |     filelist=np.array(sorted(list(set(glob(os.path.join(config['working_dir'],'data','*.pkl'))))))
191 |     size_per_thread=np.ceil(float(filelist.shape[0])/number_of_processes)
192 |     jobs=[]
193 |     for i in range(int(number_of_processes)):
194 |         p = Process(target=worker_SAX, args=(filelist[int((i)*size_per_thread):int(min(int((i+1)*size_per_thread),filelist.shape[0]))],config,statistics,i))
195 |         jobs.append(p)
196 |         p.start()
197 |     while len(jobs) > 0:
198 |         jobs = [job for job in jobs if job.is_alive()]
199 |         time.sleep(1)
200 | 
201 |     filelist = sorted(glob(os.path.join(config['working_dir'],'filelist_in_svmlight_file_*')))
202 |     cat_files(filelist,os.path.join(config['working_dir'],'filelist_in_svmlight_file.txt'))
203 |     # os.system('cat '+config['working_dir']+'/filelist_in_svmlight_file_* > '+ config['working_dir']+'/filelist_in_svmlight_file.txt')
204 |     [os.remove(f) for f in glob(config['working_dir']+"/filelist_in_svmlight_file_*")]
205 |     filelist = sorted(glob(config['svmlight_file']+'_*'))
206 |     cat_files(filelist,config['svmlight_file'])
207 |     # os.system('cat '+config['svmlight_file']+'_* > '+config['svmlight_file'])
208 |     [os.remove(f) for f in glob(config['svmlight_file']+"_*")]
209 | 
210 |     print("Runtime: " + str(time.time()-startT) + " Seconds")
211 | 
212 | 
213 | 
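For reference when reading run_mkad.py below: output_vector_SVMlight packs each flight as [discrete-sequence length, window count, the discrete event sequence, then the column-major flattened quantized matrix], and run_mkad's parse_SAX_vector inverts exactly that layout. Here is a small round-trip sketch with made-up values, illustrative only and not part of the repository:

```python
# Illustrative round trip of the SVMlight feature layout (values are made up).
import numpy as np

quantized = np.array([[3, 7], [4, 7], [4, 6]])  # 3 windows x 2 continuous params
discrete_seq = np.array([1, 4, 0])              # event sequence, 0-terminated

vec = [len(discrete_seq), quantized.shape[0]]            # header: seq length, window count
vec += list(discrete_seq) + list(quantized.T.flatten())  # sequence, then column-major matrix

# Unpack the same way parse_SAX_vector does:
v = np.atleast_2d(np.array(vec, dtype=float))
seq = v[0, 2:2 + int(v[0, 0])]
num_rows = int(v[0, 1])
num_cols = (v.shape[1] - int(v[0, 0]) - 2) // num_rows
cont_matrix = v[0, 2 + int(v[0, 0]):].reshape((num_cols, num_rows))
assert (seq == discrete_seq).all() and (cont_matrix == quantized.T).all()
```

The column-major flattening is what lets parse_SAX_vector recover one row per continuous parameter, so each parameter's symbol series can be fed to the nLCS kernel separately.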
--------------------------------------------------------------------------------
/PythonCode/run_mkad.py:
--------------------------------------------------------------------------------
1 | #!${HOME}/anaconda3/bin/python
2 | 
3 | #_________________________________________________________________________
4 | #
5 | # Notices:
6 | #
7 | # Copyright 2010, 2019 United States Government as represented by the Administrator of the National Aeronautics and
8 | # Space Administration. All Rights Reserved.
9 | #
10 | # Disclaimers
11 | #
12 | # No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED,
13 | # IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM
14 | # TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM
15 | # FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION,
16 | # IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN
17 | # ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE
18 | # PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY
19 | # DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE,
20 | # AND DISTRIBUTES IT "AS IS."
21 | #
22 | # Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT,
23 | # ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE
24 | # RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY
25 | # DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT
26 | # SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL
27 | # AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL
28 | # BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT.
29 | #
30 | # __________________________________________________________________________
31 | 
32 | '''
33 | @author: Bryan Matthews KBRWyle
34 | Data Science Group
35 | NASA Ames Research Center
36 | 
37 | This code will load the SVMlight file produced by preprocess_files_multiprocess.py and execute the Multiple Kernel Anomaly
38 | Detection (MKAD) algorithm. The output will be saved in a CSV file with decomposed score contributions. Usage:
39 | $>python run_mkad.py config.json number_of_processes(optional)
40 | 
41 | Code Updated: 2019-03-08
42 | '''
43 | 
44 | 
45 | 
46 | import sys,os
47 | import json
48 | import numpy as np
49 | from multiprocessing import Process, Queue
50 | import time
51 | from sklearn.datasets import load_svmlight_file
52 | import SAX
53 | from progress.bar import IncrementalBar
54 | from sklearn.svm import OneClassSVM
55 | import pickle
56 | from sklearn.cluster import DBSCAN
57 | 
58 | 
59 | def parse_SAX_vector(SAX_v):
60 |     seq = SAX_v[0,2:2+int(SAX_v[0,0])]
61 |     num_rows = int(SAX_v[0,1])
62 |     num_cols = int((SAX_v.shape[1]-int(SAX_v[0,0])-2)/int(SAX_v[0,1]))
63 |     cont_matrix = SAX_v[0,2+int(SAX_v[0,0]):].reshape((num_cols,num_rows))
64 |     return([seq,cont_matrix])
65 | 
66 | def worker(index,svmlight_data,thread_id,q):
67 | 
68 |     bar = IncrementalBar('Task '+str(100+thread_id)[1:]+': Computing Kernel...', max=len(index))
69 |     K = np.zeros((len(index),svmlight_data.shape[0]),dtype=float)
70 |     count = 0
71 |     for I,i in enumerate(index):
72 |         seq1,cont_matrix1 = parse_SAX_vector(svmlight_data[i,:svmlight_data.getrow(i).nonzero()[1][-1]+1].todense())
73 |         for j in range(i,svmlight_data.shape[0]):
74 |             seq2,cont_matrix2 = parse_SAX_vector(svmlight_data[j,:svmlight_data.getrow(j).nonzero()[1][-1]+1].todense())
75 |             K[I,j] = 0.5*SAX.MKAD_kernel_function(np.transpose(seq1),np.transpose(seq2))
76 |             for l in range(cont_matrix1.shape[0]):
77 |                 K[I,j] += 0.5*SAX.MKAD_kernel_function(np.transpose(cont_matrix1[l,:]),np.transpose(cont_matrix2[l,:]))/cont_matrix1.shape[0]
78 |             count += 1
79 |         bar.next()
80 |     bar.finish()
81 |     q.put(K)
82 |     return([])
83 | 
84 | def worker_test(alphas,SVs,test,thread_id,q):
85 | 
86 |     _,cont_matrix = parse_SAX_vector(SVs[0,:np.max(SVs[0,:].nonzero()[1])+1].todense())
87 |     num_contin = cont_matrix.shape[0]
88 |     bar = IncrementalBar('Task '+str(100+thread_id)[1:]+': Calculating Decomposed Scores...', max=test.shape[0])
89 | 
90 |     scores_decomposed = np.zeros((test.shape[0],1+num_contin),dtype=float)
91 |     for j in range(test.shape[0]):
92 |         seq2,cont_matrix2 = parse_SAX_vector(test[j,:np.max(test[j,:].nonzero()[1])+1].todense())
93 |         for i in range(SVs.shape[0]):
94 |             seq1,cont_matrix1 = parse_SAX_vector(SVs[i,:np.max(SVs[i,:].nonzero()[1])+1].todense())
95 |             scores_decomposed[j,0] += alphas[i]*SAX.MKAD_kernel_function(np.transpose(seq1),np.transpose(seq2))
96 |             for l in range(num_contin):
97 |                 scores_decomposed[j,1+l] += alphas[i]*SAX.MKAD_kernel_function(np.transpose(cont_matrix1[l,:]),np.transpose(cont_matrix2[l,:]))
98 |         bar.next()
99 |     bar.finish()
100 |     q.put(scores_decomposed)
101 |     return([])
102 | 
103 | 
104 | if __name__ == '__main__':
105 | 
106 |     if(len(sys.argv)<2):
107 |         print("Usage:")
108 |         print("$>python run_mkad.py config.json number_of_processes(optional)")
109 |         quit()
110 | 
111 |     if(len(sys.argv)<3):
112 |         number_of_processes=1.0
113 |     else:
114 |         number_of_processes=float(sys.argv[2])
115 | 
116 |     config=json.load(open(sys.argv[1]))
117 | 
118 |     startT = time.time()
119 | 
120 |     svmlight_data = load_svmlight_file(config['svmlight_file'])[0][:,:]
121 |     nu = config['nu']
122 |     working_dir = config['working_dir']
123 |     params_c = np.genfromtxt(config['params']['continuous'],delimiter="\n",dtype=str)
124 | 
125 |     # Check that the kernel file exists. If not, reset the flags to compute the kernel from the SVMlight file and save it.
126 |     if(not os.path.isfile(os.path.join(config['working_dir'],'kernel_'+config['name']+'.pkl'))):
127 |         print("No existing kernel found...Computing from SVMlight file")
128 |         config['use_existing_kernel'] = False
129 |         config['save_kernel'] = True
130 | 
131 |     os.makedirs(config['MKAD_folder'], exist_ok=True)
132 |     if(not config['use_existing_kernel']):
133 |         totals = np.cumsum(np.arange(svmlight_data.shape[0],1,-1))
134 |         chunk_size = int(totals[-1]/number_of_processes)
135 |         index = [0]
136 |         while np.sum(totals) > 0:
137 |             I = np.argmax(totals>chunk_size)
138 |             if(I==0):
139 |                 index.append(totals.shape[0]+1)
140 |                 break
141 |             index.append(I)
142 |             totals -= totals[index[-1]]
143 |             totals[:index[-1]] = 0
144 | 
145 |         size_per_thread=np.ceil(float(svmlight_data.shape[0])/number_of_processes)
146 |         jobs=[]
147 |         pipe_list = []
148 |         for i in range(int(number_of_processes)):
149 |             if(index[i]==svmlight_data.shape[0]):
150 |                 break
151 |             q = Queue()
152 |             p = Process(target=worker, args=(np.arange(index[i],index[i+1]),svmlight_data,i,q))
153 |             jobs.append(p)
154 |             pipe_list.append(q)
155 |             p.start()
156 | 
157 |         time.sleep(1)
158 | 
159 |         K = np.zeros((svmlight_data.shape[0],svmlight_data.shape[0]),dtype=float)
160 |         indx = 0
161 |         for i,x in enumerate(pipe_list):
162 |             tmp = x.get()
163 |             K[indx:indx+tmp.shape[0],:] = tmp
164 |             indx += tmp.shape[0]
165 | 
166 |         # Copy over the upper to lower triangle
167 |         i_lower = np.tril_indices(K.shape[0],-1)
168 |         K[i_lower] = np.transpose(K)[i_lower] #Keep consistent row-major indexing by transposing and taking the upper triangle.
169 | 
170 |         if(config['save_kernel']):
171 |             pickle.dump(K,open(os.path.join(config['working_dir'],'kernel_'+config['name']+'.pkl'),'wb'))
172 |     if(config['use_existing_kernel']):
173 |         print("Loading Existing Kernel...")
174 |         K=pickle.load(open(os.path.join(config['working_dir'],'kernel_'+config['name']+'.pkl'),'rb'))
175 | 
176 |     # Solve the one-class SVM using the nu value from the config file
177 |     clf = OneClassSVM(kernel='precomputed',nu=nu,tol=1e-12)
178 |     clf.fit(K)
179 |     scores = clf.score_samples(K) - clf.offset_
180 | 
181 |     filelist = np.genfromtxt(working_dir+"/filelist_in_svmlight_file.txt",delimiter="\n",dtype=str)
182 |     filelist = np.array([os.path.basename(f).split(".")[0] for f in filelist])
183 | 
184 |     sorted_indx = np.argsort(scores)
185 |     cutoff_point = np.argmax(scores[sorted_indx]>=0)
186 | 
187 |     # Reduce scores and flights to anomaly list
188 |     filelist_anoms = filelist[sorted_indx][:cutoff_point]
189 |     scores = scores[sorted_indx][:cutoff_point]
190 | 
191 |     # Select data for Support Vectors and anomalies
192 |     SVs = svmlight_data[clf.support_,:]
193 |     anoms = svmlight_data[sorted_indx,:][:cutoff_point,:]
194 |     del(K)
195 | 
196 |     # Normalize alphas to sum to 1
197 |     alphas = clf.dual_coef_[0]/np.sum(clf.dual_coef_[0])
198 | 
199 |     # Get unbounded Support Vectors (used for computing rho)
200 |     SVs_ub = SVs[alphas <= 1/(clf.dual_coef_[0]*svmlight_data.shape[0]),:]
201 | 
202 |     _,cont_matrix1 = parse_SAX_vector(svmlight_data[0,:np.max(svmlight_data[0,:].nonzero()[1])+1].todense()) #get the number of continuous parameters.
203 |     num_contin = cont_matrix1.shape[0]
204 | 
205 | 
206 |     print("\nComputing Decomposed Rho Values...")
207 |     # Decompose the rhos
208 |     rho = np.zeros((1+num_contin),dtype=float)
209 |     for i in range(SVs.shape[0]):
210 |         seq1,cont_matrix1 = parse_SAX_vector(SVs[i,:np.max(SVs[i,:].nonzero()[1])+1].todense())
211 |         for j in range(SVs_ub.shape[0]):
212 |             seq2,cont_matrix2 = parse_SAX_vector(SVs_ub[j,:np.max(SVs_ub[j,:].nonzero()[1])+1].todense())
213 |             rho[0] += alphas[i]*SAX.MKAD_kernel_function(np.transpose(seq1),np.transpose(seq2))
214 |             for l in range(num_contin):
215 |                 rho[1+l] += alphas[i]*SAX.MKAD_kernel_function(np.transpose(cont_matrix1[l,:]),np.transpose(cont_matrix2[l,:]))#/cont_matrix1.shape[0]
216 |     rho /= SVs_ub.shape[0]
217 | 
218 | 
219 |     global_rho = np.sum(rho[1:]*0.5/num_contin)+rho[0]*0.5
220 |     print(global_rho)
221 | 
222 |     print("Decomposing Scores for "+str(anoms.shape[0])+ " Anomalies...")
223 |     size_per_thread=int(np.ceil(float(anoms.shape[0])/number_of_processes))
224 |     jobs=[]
225 |     pipe_list = []
226 |     for i in range(int(number_of_processes)):
227 |         q = Queue()
228 |         p = Process(target=worker_test, args=(alphas,SVs,anoms[int(i)*size_per_thread:int(min(int((i+1)*size_per_thread),anoms.shape[0])),:],i,q))
229 |         jobs.append(p)
230 |         p.start()
231 |         pipe_list.append(q)
232 | 
233 |     scores_decomposed = np.zeros((anoms.shape[0],1+num_contin),dtype=float)
234 |     indx = 0
235 |     for x in pipe_list:
236 |         tmp = x.get()
237 |         scores_decomposed[indx:indx+tmp.shape[0],:] = tmp
238 |         indx += tmp.shape[0]
239 | 
240 |     print("Computing Contributions...")
241 |     # Account for kernel weights and subtract out the decomposed rhos
242 |     scores_decomposed[:,0] -= rho[0]
243 |     scores_decomposed[:,0] *= 0.5
244 |     for l in range(num_contin):
245 |         scores_decomposed[:,1+l] -= rho[1+l]
246 |         scores_decomposed[:,1+l] *= 0.5/num_contin
247 | 
248 |     # Compute the global scores using the normalized alphas
249 |     global_scores = np.sum(scores_decomposed,axis=1)- global_rho
250 | 
251 |     # Compute the percent contribution.
252 |     percent_contribution = np.zeros((anoms.shape[0],1+num_contin),dtype=float)
253 |     for i,s in enumerate(scores_decomposed):
254 |         percent_contribution[i,:] = (s-np.max(s))/np.sum(s-np.max(s))
255 | 
256 |     print("Clustering flights with similar contributions...")
257 |     db = DBSCAN(eps=config['cluster_eps']).fit(percent_contribution)
258 |     print(set(db.labels_))
259 |     print("Number of Clusters: " + str(len(set(db.labels_))))
260 | 
261 |     print("Saving contribution file...\n"+config['MKAD_folder']+'/anomalous_flights_contributions_'+config['name']+'.csv')
262 |     fid=open(config['MKAD_folder']+'/anomalous_flights_contributions_'+config['name']+'.csv','w')
263 |     fid.write('Flight,MKAD_score,Cluster_ID,discrete_contribution,')
264 |     fid.write(",".join(params_c)+"\n")
265 |     for i in range(percent_contribution.shape[0]):
266 |         fid.write(filelist_anoms[i]+","+str(round(global_scores[i],6))+','+str(db.labels_[i])+",")
267 |         np.savetxt(fid,np.expand_dims(percent_contribution[i,:],axis=0),delimiter=",",fmt="%.6f")
268 |     fid.close()
269 |     print("Runtime: " + str(time.time()-startT) + " Seconds")

--------------------------------------------------------------------------------
/PythonCode/visualization.py:
--------------------------------------------------------------------------------
1 | #!${HOME}/anaconda3/bin/python
2 | 
3 | #_________________________________________________________________________
4 | #
5 | # Notices:
6 | #
7 | # Copyright 2010, 2019 United States Government as represented by the Administrator of the National Aeronautics and
8 | # Space Administration. All Rights Reserved.
9 | #
10 | # Disclaimers
11 | #
12 | # No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED,
13 | # IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM
14 | # TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM
15 | # FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION,
16 | # IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN
17 | # ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE
18 | # PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY
19 | # DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE,
20 | # AND DISTRIBUTES IT "AS IS."
21 | #
22 | # Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT,
23 | # ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE
24 | # RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY
25 | # DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT
26 | # SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL
27 | # AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL
28 | # BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT.
29 | #
30 | # __________________________________________________________________________
31 | 
32 | '''
33 | @author: Bryan Matthews KBRWyle
34 | Data Science Group
35 | NASA Ames Research Center
36 | 
37 | This code will take the report and generate visualization plots for each flight, using statistics derived from
38 | all the flights in the data set to determine 10th-90th percentiles and binary state probabilities for each distance
39 | to touchdown. Usage:
40 | $>python visualization.py config.json number_of_processes(optional)
41 | 
42 | Code Updated: 2019-03-08
43 | '''
44 | 
45 | import numpy as np
46 | import sys,os
47 | import time
48 | import pickle
49 | import json
50 | import matplotlib.pyplot as plt
51 | from multiprocessing import Process
52 | 
53 | 
54 | def worker(filelist,data_cube,MKAD_file,config,thread_id):
55 |     """Thread worker function to generate PDF plots."""
56 |     ## Hard coded constants ##
57 |     Ncols = 6.0
58 |     discrete_fuzzy_threshold = 0.30 # determines when a normally off or on discrete is marked abnormal
59 |     xvec = np.linspace(20,0,81)
60 | 
61 |     ptileData=np.percentile(data_cube['continuous'],[10,90],axis=0)
62 |     avg_discrete = np.mean(data_cube['discrete'],axis=0)
63 |     for i,a in enumerate(filelist):
64 |         plot_idx = len(data_cube['continuous_params'])
65 |         Nrows = np.ceil(len(data_cube['continuous_params'])/Ncols)
66 | 
67 |         flight_indx = np.where(a==np.array(data_cube['filelist']))[0][0]
68 | 
69 |         indx_sorted = np.argsort(MKAD_file[i,4:].astype(float))[::-1]
70 |         scores = MKAD_file[i,4+indx_sorted].astype(float)
71 |         indx_most_anomalous_params = indx_sorted[np.where(np.cumsum(scores/np.sum(scores))<0.5)]
72 | 
73 |         #Continuous
74 |         fig, axs = plt.subplots(int(Ncols),int(Nrows),figsize=[12,19])
75 |         fig.subplots_adjust(hspace=.5)
76 |         plt.suptitle(a+" (Continuous Parameters)", fontsize=16)
77 | 
78 |         axs=axs.ravel()
79 |         for pltIdx in np.arange(plot_idx):
80 |             excursions_below = data_cube['continuous'][flight_indx,:,pltIdx] < ptileData[0,:,pltIdx]
81 |             excursions_above = data_cube['continuous'][flight_indx,:,pltIdx] > ptileData[1,:,pltIdx]
82 |             # plot time series variable
83 |             axs[pltIdx].plot(xvec,data_cube['continuous'][flight_indx,:,pltIdx],linewidth=2,label="flight data")
84 |             axs[pltIdx].plot(xvec,ptileData[0,:,pltIdx],'k--',label="10/90 percentile")
85 |             axs[pltIdx].plot(xvec,ptileData[1,:,pltIdx],'k--')
86 |             axs[pltIdx].plot(xvec[excursions_below],data_cube['continuous'][flight_indx,excursions_below,pltIdx],'r.',markersize=10,linewidth=2,label="above|below percentile")
87 |             axs[pltIdx].plot(xvec[excursions_above],data_cube['continuous'][flight_indx,excursions_above,pltIdx],'r.',markersize=10,linewidth=2)
88 |             axs[pltIdx].invert_xaxis()
89 |             if(pltIdx in indx_most_anomalous_params):
90 |                 axs[pltIdx].set_title("{}".format(data_cube['continuous_params'][pltIdx]),fontsize=10,color='red')
91 |             else:
92 |                 axs[pltIdx].set_title("{}".format(data_cube['continuous_params'][pltIdx]),fontsize=10)
93 |             axs[pltIdx].set_xlabel("Distance to Landing (NM)")
94 |             if(pltIdx==1):
95 |                 axs[pltIdx].legend(loc=9, bbox_to_anchor=(0.5, 1.66), ncol=2)
96 |         print("Saving:" + os.path.join(config['MKAD_folder'],'figs', a +'_c.pdf'))
97 |         plt.savefig(os.path.join(config['MKAD_folder'],'figs', a +'_c.pdf'))
98 |         plt.close()
99 | 
100 |         #Discretes
101 |         plot_idx = len(data_cube['discrete_params'])
102 |         Nrows = np.ceil(len(data_cube['discrete_params'])/Ncols)
103 | 
104 |         fig, axs = plt.subplots(int(Ncols),int(Nrows), figsize= [12,19])
105 |         fig.subplots_adjust(hspace=.5)
106 |         plt.suptitle(a+" (Discrete Parameters)", fontsize=16)
107 |         axs=axs.ravel()
108 |         for pltIdx in np.arange(plot_idx):
109 | 
110 |             high_probability_on = avg_discrete[:,pltIdx] > (0.5 + discrete_fuzzy_threshold)
111 |             high_probability_off = avg_discrete[:,pltIdx] < (0.5 - discrete_fuzzy_threshold)
112 | 
113 |             excursions_off = (data_cube['discrete'][flight_indx,:,pltIdx] < (0.5 - discrete_fuzzy_threshold)) & high_probability_on
114 |             excursions_on = (data_cube['discrete'][flight_indx,:,pltIdx] > (0.5 + discrete_fuzzy_threshold)) & high_probability_off
115 | 
116 |             # plot time series variable
117 |             axs[pltIdx].plot(xvec,(data_cube['discrete'][flight_indx,:,pltIdx]>0).astype(float),linewidth=2,label="flight data") #Have to threshold because we took the average over the 1/4 NM bin
118 |             axs[pltIdx].plot(xvec,avg_discrete[:,pltIdx],'k--',label="average state")
119 |             axs[pltIdx].plot(xvec[excursions_off],(data_cube['discrete'][flight_indx,excursions_off,pltIdx]>0).astype(float),'rs',markersize=8,linewidth=2,label="off when nominally on") #Have to threshold because we took the average over the 1/4 NM bin
120 |             axs[pltIdx].plot(xvec[excursions_on],(data_cube['discrete'][flight_indx,excursions_on,pltIdx]>0).astype(float),'go',markersize=8,linewidth=2,label="on when nominally off") #Have to threshold because we took the average over the 1/4 NM bin
121 |             axs[pltIdx].invert_xaxis()
122 |             axs[pltIdx].set_title("{}".format(data_cube['discrete_params'][pltIdx]),fontsize=10)
123 |             axs[pltIdx].set_xlabel("Distance to Landing (NM)")
124 |             axs[pltIdx].set_ylim([-0.1,1.1])
125 |             if(pltIdx==1):
126 |                 axs[pltIdx].legend(loc=9, bbox_to_anchor=(0.5, 1.66), ncol=2)
127 |         print("Saving:" + os.path.join(config['MKAD_folder'],'figs' , a +'_d.pdf'))
128 |         plt.savefig(os.path.join(config['MKAD_folder'],'figs' , a +'_d.pdf'))
129 |         # plt.show()
130 |         plt.close()
131 | 
132 |     print('Process '+str(thread_id) + ' done.')
133 |     return()
134 | 
135 | 
136 | 
137 | if __name__ == '__main__':
138 | 
139 |     if(len(sys.argv)<2):
140 |         print("Usage:")
141 |         print("$>python visualization.py config.json number_of_processes(optional)")
142 |         quit()
143 | 
144 |     config=json.load(open(sys.argv[1]))
145 |     if(len(sys.argv)<3):
146 |         number_of_processes=1.0
147 |     else:
148 |         number_of_processes=float(sys.argv[2])
149 | 
150 |     params_cont = np.genfromtxt(config['params']['continuous'],delimiter="\n",comments="@",dtype=str)
151 |     params_disc = np.genfromtxt(config['params']['discrete'],delimiter="\n",dtype=str)
152 | 
153 |     filelist = np.genfromtxt(os.path.join(config['working_dir'],'filelist_in_svmlight_file.txt'),delimiter="\n",dtype=str)
154 |     MKAD_file = np.genfromtxt(os.path.join(config['MKAD_folder'],'anomalous_flights_contributions_'+config['name']+'.csv'),delimiter=",",comments="@",dtype=str)[1:,:]
155 | 
156 | 
157 |     anomaly_list = np.genfromtxt(os.path.join(config['MKAD_folder'],'anomalous_flights_contributions_'+config['name']+'.csv'),delimiter=",",comments="@",dtype=str)[1:,0]
158 |     data_cube = pickle.load(open(os.path.join(config['working_dir'] , 'data_cube.pkl'),'rb'))
159 | 
160 |     root_good_filelist = [os.path.basename(f).replace('.pkl','') for f in filelist]
161 | 
162 |     good_indx = np.zeros((len(data_cube['filelist'])),dtype=bool)
163 |     for i,a in enumerate(data_cube['filelist']):
164 |         good_indx[i] = a in root_good_filelist
165 | 
166 |     data_cube['continuous'] = data_cube['continuous'][good_indx,:,:]
167 |     data_cube['discrete'] = data_cube['discrete'][good_indx,:,:]
168 |     data_cube['filelist'] = np.array(data_cube['filelist'])[good_indx]
169 | 
170 |     for i in range(data_cube['continuous'].shape[0]):
171 |         if(np.sum(np.isnan(data_cube['continuous'][i,:,:]))>0):
172 |             last_nan = np.max(np.where(np.isnan(data_cube['continuous'][i,:,0])==True))
173 |             data_cube['continuous'][i,:last_nan+1,:] = data_cube['continuous'][i,last_nan+1,:]
174 |             last_nan = np.max(np.where(np.isnan(data_cube['discrete'][i,:,0])==True))
175 |             data_cube['discrete'][i,:last_nan+1,:] = data_cube['discrete'][i,last_nan+1,:]
176 | 
177 |     os.makedirs(os.path.join(config['MKAD_folder'],'figs'), exist_ok=True)
178 |     # os.system('mkdir -p ' + config['MKAD_folder']+'/figs')
179 |     startT = time.time()
180 | 
181 |     size_per_thread=np.ceil(float(anomaly_list.shape[0])/number_of_processes)
182 |     jobs=[]
183 |     for i in range(int(number_of_processes)):
184 |         p = Process(target=worker, args=(anomaly_list[int((i)*size_per_thread):int(min(int((i+1)*size_per_thread),anomaly_list.shape[0]))],data_cube,MKAD_file[int((i)*size_per_thread):int(min(int((i+1)*size_per_thread),anomaly_list.shape[0])),:],config,i))
185 |         jobs.append(p)
186 |         p.start()
187 |     while len(jobs) > 0:
188 |         jobs = [job for job in jobs if job.is_alive()]
189 |         time.sleep(1)
190 | 
191 |     print("Runtime: " + str(time.time() - startT) + " Seconds")
192 | 
193 | 
194 | 
195 | 
196 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyMKAD
2 | 
3 | The world-wide aviation system is one of the most complex dynamical systems ever developed and is generating data at an extremely rapid rate. Most modern commercial aircraft record several hundred flight parameters including information from the guidance, navigation, and control systems, the avionics and propulsion systems, and the pilot inputs into the aircraft. These parameters may be continuous measurements or binary/categorical measurements recorded in one-second intervals for the duration of the flight. Currently, most approaches to aviation safety are reactive, meaning that they are designed to react to an aviation safety incident or accident. PyMKAD is a novel approach based on the theory of multiple kernel learning to detect potential safety anomalies in very large databases of discrete and continuous data from world-wide operations of commercial fleets. This code addresses an anomaly detection problem which includes both discrete and continuous data streams, where we assume that the discrete streams influence the continuous streams. We also assume that atypical sequences of events in the discrete streams can lead to off-nominal system performance.
4 | 
5 | The objective of this project is to automate the analysis of flight safety incidents in a way that combines analysis of both discrete and continuous parameters.
6 | 
7 | This repository contains the following files in its top level directory:
8 | 
9 | * [PythonCode](PythonCode)
10 | The source code of the repository includes preprocessing modules, the main MKAD code, and a post-processing visualization tool. The code uses a command-line interface and a JSON file for configuration.
11 | 
12 | * [documentation](documentation)
13 | Documents describing how to configure and run the program, as well as how to interpret the results.
14 | 
15 | 
16 | * [MKAD NOSA 2019.pdf](MKAD%20NOSA%202019.pdf)
17 | Licensing for MKAD
18 | 
19 | 
20 | 
21 | 
22 | ## Contact Info
23 | 
24 | NASA Point of contact: Nikunj Oza, Data Science Group Lead.
25 | 
26 | For questions regarding the research and development of the algorithm, please contact Bryan Matthews, Senior Research Engineer.
27 | 
28 | 
29 | ## Copyright and Notices
30 | 
31 | Notices:
32 | 
33 | Copyright © 2019 United States Government as represented by the Administrator of the National Aeronautics and Space Administration. All Rights Reserved.
34 | 
35 | Disclaimers
36 | 
37 | No Warranty: THE SUBJECT SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY OF ANY KIND, EITHER EXPRESSED, IMPLIED, OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL CONFORM TO SPECIFICATIONS, ANY IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR FREEDOM FROM INFRINGEMENT, ANY WARRANTY THAT THE SUBJECT SOFTWARE WILL BE ERROR FREE, OR ANY WARRANTY THAT DOCUMENTATION, IF PROVIDED, WILL CONFORM TO THE SUBJECT SOFTWARE. THIS AGREEMENT DOES NOT, IN ANY MANNER, CONSTITUTE AN ENDORSEMENT BY GOVERNMENT AGENCY OR ANY PRIOR RECIPIENT OF ANY RESULTS, RESULTING DESIGNS, HARDWARE, SOFTWARE PRODUCTS OR ANY OTHER APPLICATIONS RESULTING FROM USE OF THE SUBJECT SOFTWARE. FURTHER, GOVERNMENT AGENCY DISCLAIMS ALL WARRANTIES AND LIABILITIES REGARDING THIRD-PARTY SOFTWARE, IF PRESENT IN THE ORIGINAL SOFTWARE, AND DISTRIBUTES IT "AS IS."
38 | 
39 | Waiver and Indemnity: RECIPIENT AGREES TO WAIVE ANY AND ALL CLAIMS AGAINST THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT. IF RECIPIENT'S USE OF THE SUBJECT SOFTWARE RESULTS IN ANY LIABILITIES, DEMANDS, DAMAGES, EXPENSES OR LOSSES ARISING FROM SUCH USE, INCLUDING ANY DAMAGES FROM PRODUCTS BASED ON, OR RESULTING FROM, RECIPIENT'S USE OF THE SUBJECT SOFTWARE, RECIPIENT SHALL INDEMNIFY AND HOLD HARMLESS THE UNITED STATES GOVERNMENT, ITS CONTRACTORS AND SUBCONTRACTORS, AS WELL AS ANY PRIOR RECIPIENT, TO THE EXTENT PERMITTED BY LAW. RECIPIENT'S SOLE REMEDY FOR ANY SUCH MATTER SHALL BE THE IMMEDIATE, UNILATERAL TERMINATION OF THIS AGREEMENT.
40 | 
41 | 

--------------------------------------------------------------------------------
/documentation/README.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa/PyMKAD/02d29db9e13ceffc7fdabb188948618da40306a1/documentation/README.docx

--------------------------------------------------------------------------------
/kernels/README.rst:
--------------------------------------------------------------------------------
1 | kernels
2 | ============
3 | kernels is a Python module with utility functions for data mining to support
4 | the Data Sciences group at NASA Ames
5 | 
6 | 
7 | Dependencies
8 | ============
9 | 
10 | The required dependencies to build the software are
11 | Python >= 3.7, setuptools,
12 | Numpy >= 1.15.4,
13 | SciPy >= 1.2.0,
14 | scikit-learn >= 0.20.3
15 | and a working C/C++ compiler.
16 | 
17 | 
18 | Install
19 | =======
20 | 
21 | This package uses distutils, which is the default way of installing
22 | python modules. To install in your home directory, use::
23 | 
24 |     python setup.py install --user
25 | 
26 | To install for all users on Unix/Linux::
27 | 
28 |     python setup.py build
29 |     sudo python setup.py install
30 | 
31 | 

--------------------------------------------------------------------------------
/kernels/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nasa/PyMKAD/02d29db9e13ceffc7fdabb188948618da40306a1/kernels/__init__.py

--------------------------------------------------------------------------------
/kernels/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup, Extension
2 | from os.path import join
3 | import os
4 | import numpy as np
5 | 
6 | 
7 | nlcs_sources = [join('src', 'nlcs', 'nlcs_wrapper.cpp'),\
8 |                 join('src', 'nlcs', 'lcs.cpp')]
9 | 
10 | setup(name = 'nlcs', version = '1.0', \
11 |       ext_modules = [Extension('nlcs', nlcs_sources, include_dirs=[join(os.path.split(np.__file__)[0],'core','include')])])

--------------------------------------------------------------------------------
/kernels/src/nlcs/lcs.cpp:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | #include "lcs.h"
6 | 
7 | using namespace std;
8 | 
9 | // enum direction {north, west, nw};
10 | // Direction uses chars {1,2,3};
11 | 
12 | 
13 | LCS::LCS(){}
14 | 
15 | 
16 | float LCS::CalcDist(unsigned short *vec1,unsigned short vec1Size,unsigned short *vec2,unsigned short vec2Size){
17 | 
18 |     unsigned short m, n;   //lengths of the two strings
19 |     unsigned short **c;    // table of LCS lengths
20 |     char **b;              // table of which optimal subprob solution
21 |     unsigned short i, j;
22 |     unsigned short length; // length of LCS of prefixes
23 | 
24 |     m = (unsigned short)vec1Size; // length of X
25 |     n = (unsigned short)vec2Size; // length of Y
26 | 
27 |     // Use two tables, b and c, each with m+1 rows and n+1 columns.
28 |     // Initialize the c table to all 0. The b table doesn't need to be
29 |     // initialized.
30 |     c =(unsigned short**) calloc(m+1, sizeof(unsigned short *));
31 |     for (i = 0; i <= m; i++)
32 |     {
33 |         c[i] =(unsigned short*) calloc(n+1, sizeof(unsigned short));
34 |     }
35 | 
36 |     b = (char**)calloc(m+1, sizeof(char *));
37 |     for (i = 0; i <= m; i++)
38 |     {
39 |         b[i] = (char*)calloc(n+1, sizeof(char));
40 |     }
41 | 
42 | 
43 |     // Now run through the main loop of the LCS-Length algorithm on p.353.
44 |     for (i = 1; i <= m; i++)
45 |     {
46 |         for (j = 1; j <= n; j++)
47 |         {
48 |             if(vec1[i-1]==vec2[j-1])
49 |             {
50 |                 // Extending the LCS of X[1..i-1] and Y[1..j-1] by one character.
51 |                 c[i][j] = c[i-1][j-1] + 1;
52 |                 b[i][j] = 3; //NorthWest
53 |             }
54 |             else if (c[i-1][j] >= c[i][j-1])
55 |             {
56 |                 // Using LCS of X[1..i-1] and Y[1..j].
57 |                 c[i][j] = c[i-1][j];
58 |                 b[i][j] = 1; //North
59 |             }
60 |             else
61 |             {
62 |                 // Using LCS of X[1..i] and Y[1..j-1].
63 |                 c[i][j] = c[i][j-1];
64 |                 b[i][j] = 2; //West
65 |             }
66 |         }
67 |     }
68 | 
69 |     //The tables are all filled in. Print out the LCS found.
70 |     //print_LCS also returns the length of the LCS found.
71 |     length = print_LCS(b, m, n);
72 |     // printf("\nlength = %d\n", length);
73 |     for (i = 0; i <= m; i++)
74 |     {
75 |         free(c[i]);
76 |     }
77 |     for (i = 0; i <= m; i++)
78 |     {
79 |         free(b[i]);
80 |     }
81 |     free(c);
82 |     free(b);
83 |     return (float)(length)/sqrt((float)(m)*(float)(n));
84 | 
85 | }
86 | 
87 | // Print an LCS of X[1..i] and Y[1..j], assuming that the b table has
88 | // already been filled in. Based on the Print-LCS procedure of p.355.
89 | // int print_LCS(enum direction **b, char *X, int i, int j)
90 | //int LCS::print_LCS(enum direction **b,int i, int j)
91 | unsigned short LCS::print_LCS(char **b,unsigned short i, unsigned short j)
92 | {
93 |     if (i == 0 || j == 0) // is either string empty?
94 |         return 0;
95 |     if (b[i][j] == 3) //NorthWest
96 |     {
97 |         // We extended X[1..i-1] and Y[1..j-1] by one character, which is X[i].
98 |         // Print the LCS of X[1..i-1] and Y[1..j-1] and then print X[i].
99 |         unsigned short length = print_LCS(b,i-1, j-1);
100 |         return length+1;
101 |     }
102 |     else if (b[i][j] == 1) //North
103 |     {
104 |         return print_LCS(b,i-1, j); // used LCS of X[1..i-1] and Y[1..j]
105 |     }
106 |     else
107 |     {
108 |         return print_LCS(b,i, j-1); // used LCS of X[1..i] and Y[1..j-1]
109 |     }
110 | 
111 | }
112 | 
113 | 

--------------------------------------------------------------------------------
/kernels/src/nlcs/lcs.h:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | 
5 | using namespace std;
6 | 
7 | /*enum direction {north, west, nw};*/
8 | 
9 | class LCS
10 | {
11 |     private:
12 |         unsigned short print_LCS(char **b, unsigned short i, unsigned short j);
13 | 
14 |     public:
15 |         LCS(void);
16 |         float CalcDist(unsigned short *vec1,unsigned short vec1Size,unsigned short *str2,unsigned short vec2Size);
17 | 
18 | };

--------------------------------------------------------------------------------
/kernels/src/nlcs/nlcs_wrapper.cpp:
--------------------------------------------------------------------------------
1 | #include <Python.h>
2 | #include "lcs.h"
3 | #include "numpy/arrayobject.h"
4 | #include <string.h>
5 | 
6 | // Compatible with Python 3.7.
7 | 
8 | // Function: compute nLCS
9 | static PyObject* compute(PyObject* self, PyObject* args)
10 | {
11 | 
12 |     PyArrayObject *input1,*input2;
13 | 
14 |     if (!PyArg_ParseTuple(args, "OO",&input1,&input2))
15 |         return NULL;
16 | 
17 | 
18 |     unsigned short *list1 = (unsigned short *)malloc(input1->dimensions[0]*sizeof(unsigned short));
19 |     unsigned short *list2 = (unsigned short *)malloc(input2->dimensions[0]*sizeof(unsigned short));
20 | 
21 | 
22 |     if(input1->dimensions[1]!=input2->dimensions[1]){
23 |         fprintf(stderr,"Error: dimensions mismatch\n %ld!=%ld\n",(long)input1->dimensions[1],(long)input2->dimensions[1]);
24 |         return Py_BuildValue("f",-1.0);
25 |     }
26 | 
27 |     LCS LCSObj;
28 |     float d=0;
29 |     for (int i=0;i<input1->dimensions[1];i++){
30 |         for (int j=0;j<input1->dimensions[0];j++){
31 |             memcpy(list1+j,input1->data+(i+j*input1->dimensions[1])*sizeof(unsigned short),sizeof(unsigned short));
32 |         }
33 |         for (int j=0;j<input2->dimensions[0];j++){
34 |             memcpy(list2+j,input2->data+(i+j*input2->dimensions[1])*sizeof(unsigned short),sizeof(unsigned short));
35 |         }
36 |         d += LCSObj.CalcDist(list1,input1->dimensions[0],list2,input2->dimensions[0]);
37 |     }
38 | 
39 |     free(list1);
40 |     free(list2);
41 | 
42 |     return Py_BuildValue("f",d/((float)input1->dimensions[1]));
43 | }
44 | 
45 | // Module's Function Definition struct
46 | // We require this `NULL` to signal the end of our method
47 | // definition
48 | static PyMethodDef myMethods[] = {
49 |     { "compute", compute, METH_VARARGS, "Computes nLCS similarity" },
50 |     { NULL, NULL, 0, NULL }
51 | };
52 | 
53 | // Module Definition struct
54 | static struct PyModuleDef nlcs = {
55 |     PyModuleDef_HEAD_INIT,
56 |     "nlcs",
57 |     "Normalized Longest Common Subsequence Calculation",
58 |     -1,
59 |     myMethods
60 | };
61 | 
62 | // Initializes our module using our above struct
63 | PyMODINIT_FUNC PyInit_nlcs(void)
64 | {
65 |     return PyModule_Create(&nlcs);
66 | }
67 | 

--------------------------------------------------------------------------------
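After building the extension (python setup.py install --user inside kernels/), a quick sanity check is possible: nlcs.compute treats each column of its 2-D uint16 inputs as a sequence and returns the normalized LCS averaged over the columns, which is why SAX.MKAD_kernel_function passes transposed (column-vector) arguments. An illustrative sketch with made-up sequences, not part of the repository:

```python
# Illustrative check of the nlcs extension (sequences are made up).
import numpy as np
import nlcs

a = np.array([1, 4, 2, 2, 0], dtype=np.uint16).reshape(-1, 1)  # one column = one sequence
b = np.array([1, 2, 2, 3, 0], dtype=np.uint16).reshape(-1, 1)

# LCS([1,4,2,2,0], [1,2,2,3,0]) = [1,2,2,0], so nLCS = 4/sqrt(5*5) = 0.8
print(nlcs.compute(a, b))
```

The normalization by the geometric mean of the two sequence lengths keeps the kernel value in [0, 1] even when the compared sequences have different lengths.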