├── trajclus ├── __init__.py ├── spark │ ├── BUILD │ └── __init__.py ├── apps │ ├── __init__.py │ ├── traffic_density_plot.py │ ├── extract_data_to_airport.py │ ├── convert_flight_format.py │ ├── db_clustering.py │ └── lsh_clustering.py └── lib │ ├── __init__.py │ ├── config.py │ ├── common_utils.py │ ├── preprocessing_lib.py │ ├── lsh_lib.py │ ├── plot_utils.py │ └── geometric_utils.py ├── .gitignore ├── _img ├── traffic_density.png └── xxx_flights_clusters.png ├── README.md └── requirements.txt /trajclus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trajclus/spark/BUILD: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trajclus/apps/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trajclus/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /trajclus/spark/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tmp/* 2 | .idea/* 3 | *.pyc 4 | */tmp/* 5 | -------------------------------------------------------------------------------- /_img/traffic_density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tanthml/atc/HEAD/_img/traffic_density.png -------------------------------------------------------------------------------- /_img/xxx_flights_clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tanthml/atc/HEAD/_img/xxx_flights_clusters.png -------------------------------------------------------------------------------- /trajclus/lib/config.py: -------------------------------------------------------------------------------- 1 | LAT_FIELD = 'Latitude' 2 | LON_FIELD = 'Longitude' 3 | FLIGHT_ID_FIELD = 'Flight_ID' 4 | -------------------------------------------------------------------------------- /trajclus/lib/common_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def gen_log_file(path_to_file): 5 | """ 6 | Generate log file 7 | 8 | Args: 9 | path_to_file (str): path to file 10 | 11 | Returns: 12 | 13 | """ 14 | 15 | logger = logging.getLogger(__name__) 16 | logger.setLevel(logging.INFO) 17 | 18 | # create a file handler 19 | handler = logging.FileHandler(path_to_file) 20 | handler.setLevel(logging.INFO) 21 | 22 | # create a logging format 23 | formatter = logging.Formatter( 24 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s') 25 | handler.setFormatter(formatter) 26 | 27 | # add the handlers to the logger 28 | logger.addHandler(handler) 29 | 30 | return logger 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # atc 2 | Master Thesis - Trajectories Analysis 3 | 4 | How to run 5 | 6 | ``` 7 | 8 | change to absolute path to atc repo on your local 9 | export PYTHONPATH="$PYTHONPATH:/atc/" 10 | ``` 11 | 12 | ``` 13 | cd atc/trajclus 14 | 15 | To plot density traffic toward the airport 16 | 17 | python apps/traffic_density_plot.py --input_path ~/data/tracks_2016_09.csv --airport_code WSSS 18 | 19 | ``` 20 | ![alt text](_img/traffic_density.png "Traffic Density Plot") 21 | ``` 22 | 23 | To cluster traffic toward the airport 24 | 25 | python apps/lsh_clustering.py --input_path ~/data/tracks_2016_09.csv --airport_code WSSS --filter_date 2016-09-29 26 | 27 | 28 | ``` 29 | 30 | 31 | ![alt text](_img/xxx_flights_clusters.png "Traffic Clusters Plot") 32 | 33 | Logs and results will be save under atc/tmp directory 34 | 35 | 36 | Dependencies 37 | 38 | ``` 39 | 40 | pip install -r requirements.txt 41 | 42 | # skip this, old for Python2 43 | pip install -U git+https://github.com/maikol-solis/trajectory_distance.git 44 | 45 | ``` -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Requirements automatically generated by pigar. 2 | # https://github.com/damnever/pigar 3 | 4 | # trajclus/apps/lsh_clustering.py: 6 5 | Shapely == 1.6.4.post2 6 | 7 | # trajclus/apps/convert_flight_format.py: 4 8 | # trajclus/apps/db_clustering.py: 3 9 | # trajclus/apps/lsh_clustering.py: 1 10 | # trajclus/apps/traffic_density_plot.py: 3 11 | click == 6.7 12 | 13 | # trajclus/lib/lsh_lib.py: 4 14 | datasketch == 1.5.0 15 | 16 | # trajclus/apps/lsh_clustering.py: 5 17 | geopy == 1.20.0 18 | 19 | # trajclus/lib/plot_utils.py: 6 20 | matplotlib == 3.1.1 21 | 22 | # trajclus/apps/lsh_clustering.py: 3 23 | # trajclus/lib/geometric_utils.py: 1 24 | # trajclus/lib/plot_utils.py: 3 25 | numpy == 1.17.4 26 | 27 | # trajclus/apps/db_clustering.py: 4 28 | # trajclus/apps/lsh_clustering.py: 2 29 | # trajclus/apps/traffic_density_plot.py: 4 30 | # trajclus/lib/preprocessing_lib.py: 1 31 | pandas == 0.25.0 32 | 33 | # trajclus/lib/geometric_utils.py: 2 34 | scipy == 1.3.1 35 | 36 | # trajclus/lib/plot_utils.py: 5 37 | seaborn == 0.9.0 38 | 39 | # trajclus/lib/geometric_utils.py: 3 40 | simplification == 0.4.2 41 | 42 | # trajclus/apps/db_clustering.py: 5,6 43 | # trajclus/apps/lsh_clustering.py: 4,7,8 44 | # trajclus/lib/preprocessing_lib.py: 2 45 | sklearn == 0.0 46 | -------------------------------------------------------------------------------- /trajclus/apps/traffic_density_plot.py: -------------------------------------------------------------------------------- 1 | from time import gmtime, strftime 2 | 3 | import click 4 | import pandas as pd 5 | 6 | from trajclus.lib.common_utils import gen_log_file 7 | from trajclus.lib.preprocessing_lib import filter_by_airport 8 | from trajclus.lib.plot_utils import traffic_density_plot 9 | 10 | 11 | logger = gen_log_file(path_to_file='../tmp/traffic_density.log') 12 | 13 | 14 | def main(input_path, airport_code, date): 15 | history = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_") 16 | logger.info("=============================================") 17 | logger.info("================ DATETIME {} ================".format(history)) 18 | df = pd.read_csv(input_path) 19 | logger.info(df.head()) 20 | file_name = input_path.split("/")[-1].replace(".csv", "") 21 | 22 | # get fixed 23 | flights_to_airport = filter_by_airport( 24 | df=df, 25 | airport_code=airport_code, 26 | min_dr=0.1, 27 | max_dr=5.0 28 | ) 29 | file_path = "../tmp/{file_name}_{airport_code}_traffic_density.png".format( 30 | file_name=file_name, 31 | airport_code=airport_code 32 | ) 33 | traffic_density_plot( 34 | lat=flights_to_airport['Latitude'], 35 | lon=flights_to_airport['Longitude'], 36 | file_path=file_path, 37 | length_cutoff=600 38 | ) 39 | 40 | logger.info("Encoding flight ID ...") 41 | flight_ids = flights_to_airport['Flight_ID'].unique().tolist() 42 | logger.info("Total # flight ID {} {}".format(len(flight_ids), airport_code)) 43 | print("Total # flight ID {} {}".format(len(flight_ids), airport_code)) 44 | 45 | 46 | @click.command() 47 | @click.option( 48 | '--input_path', 49 | type=str, 50 | required=True, 51 | help='Full path to the trajectory file in CSV format') 52 | @click.option( 53 | '--airport_code', 54 | type=str, 55 | default='WSSS,VTBS,WMKK', 56 | help='Air Port Codename') 57 | @click.option( 58 | '--date', 59 | type=str, 60 | default='2016-09-01', 61 | help='Arrival date') 62 | def main_cli(input_path, airport_code, date): 63 | airports = airport_code.split(",") 64 | for airport in airports: 65 | main(input_path=input_path, airport_code=airport, date=date) 66 | 67 | 68 | if __name__ == '__main__': 69 | main_cli() 70 | -------------------------------------------------------------------------------- /trajclus/lib/preprocessing_lib.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn import preprocessing 3 | 4 | from trajclus.lib.geometric_utils import simplify_coordinator 5 | 6 | 7 | def flight_id_encoder(unique_id): 8 | """ 9 | Encoding flight id to integer number 10 | Args: 11 | unique_id (list[str]): list flight id 12 | 13 | Returns: 14 | le (LabelEncoder): 15 | 16 | """ 17 | le = preprocessing.LabelEncoder() 18 | le.fit(unique_id) 19 | return le 20 | 21 | 22 | def filter_by_airport(df, airport_code, min_dr=0.01, max_dr=2.0): 23 | """ 24 | Filter data-frame by airport-code and radius 25 | Args: 26 | df (pd.DataFrame): 27 | airport_code (str): Destination code name 28 | min_dr (float): min value of DRemains 29 | max_dr (float): max value of DRemains 30 | 31 | Returns: 32 | flights_to_airport (pd.DataFrame): 33 | """ 34 | one_airport = df[(df['Destination'] == airport_code)] 35 | # get fixed 36 | flights_to_airport = one_airport[ 37 | (min_dr < one_airport['DRemains']) & (one_airport['DRemains'] < max_dr) 38 | ] 39 | 40 | return flights_to_airport 41 | 42 | 43 | def build_flight_trajectory_df(flights_to_airport, label_encoder, flight_ids, 44 | max_flights=1000, epsilon=None): 45 | """ 46 | build data-frame contains flight-ID and coordinators of flight trajectories 47 | Args: 48 | flights_to_airport (pd.DataFrame): 49 | label_encoder (LabelEncoder): 50 | flight_ids (list[str]): 51 | max_flights (int): 52 | is_simplify (bool): 53 | 54 | Returns: 55 | flight_df (pd.DataFrame) 56 | """ 57 | encoded_idx = [] 58 | trajectories = [] 59 | flight_dicts = {} 60 | 61 | for fid in flight_ids[:max_flights]: 62 | df_min = flights_to_airport[flights_to_airport['Flight_ID'] == fid] 63 | df_min = df_min.sort_values(by='DRemains', ascending=False) 64 | encode_id = label_encoder.transform([fid])[0] 65 | encoded_idx.append(encode_id) 66 | coords = df_min[['Latitude', 'Longitude']].values 67 | flight_dicts[encode_id] = coords 68 | if epsilon: 69 | coords = simplify_coordinator(coords, epsilon=epsilon) 70 | trajectories.append(coords) 71 | 72 | 73 | flight_df = pd.DataFrame() 74 | flight_df['idx'] = encoded_idx 75 | flight_df['flight_id'] = flight_ids[:max_flights] 76 | flight_df['trajectory'] = trajectories 77 | print("Total extracted flights %s" % len(flight_df)) 78 | 79 | return flight_df, flight_dicts 80 | -------------------------------------------------------------------------------- /trajclus/apps/extract_data_to_airport.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pandas as pd 3 | 4 | from trajclus.lib.common_utils import gen_log_file 5 | from trajclus.lib.preprocessing_lib import filter_by_airport 6 | 7 | 8 | def filter_by_date(datetime, filter_date): 9 | """ 10 | Filter date 11 | Args: 12 | datetime (str): yyyy-mm-dd format 13 | filter_date (str): yyyy-mm-dd format 14 | 15 | Returns: 16 | (bool) 17 | """ 18 | str_date = str(datetime).split(' ')[0] 19 | if str_date == str(filter_date): 20 | return True 21 | return False 22 | 23 | 24 | def main( 25 | input_path, 26 | airport_code='WSSS', 27 | min_dr=0.0, 28 | max_dr=5.0, 29 | filter_date='', 30 | ): 31 | # load raw-data from csv 32 | logger = gen_log_file(path_to_file='../tmp/extract_data_to_airport{}.log'.format(filter_date)) 33 | df = pd.read_csv(input_path) 34 | file_name = input_path.split("/")[-1].replace(".csv", "") 35 | 36 | if filter_date != '': 37 | print("before filtering %s" % len(df)) 38 | df['filtered'] = df['Actual_Arrival_Time_(UTC)'].apply( 39 | lambda x: filter_by_date(datetime=x, filter_date=filter_date) 40 | ) 41 | df = df[df['filtered']] 42 | file_name = filter_date 43 | print("after filtering %s" % len(df)) 44 | 45 | # filter data by airport code-name 46 | flights_to_airport = filter_by_airport( 47 | df=df, 48 | airport_code=airport_code, 49 | min_dr=min_dr, 50 | max_dr=max_dr 51 | ) 52 | 53 | flights_to_airport.to_csv( 54 | "{}_{}.csv".format(file_name, airport_code), index=False 55 | ) 56 | 57 | logger.info("Encoding flight ID ... %s" % airport_code) 58 | flight_ids = flights_to_airport['Flight_ID'].unique().tolist() 59 | logger.info("Total # flight ID {}".format(len(flight_ids))) 60 | 61 | 62 | @click.command() 63 | @click.option( 64 | '--input_path', 65 | type=str, 66 | required=True, 67 | help='Full path to the trajectory file in CSV format') 68 | @click.option( 69 | '--airport_code', 70 | type=str, 71 | default='WSSS,VTBS,WMKK', 72 | # default='WSSS', 73 | help='Air Port Codename') 74 | @click.option( 75 | '--dr_range', 76 | type=str, 77 | default='0.0,5.0', 78 | help='distance remains in radius') 79 | @click.option( 80 | '--filter_date', 81 | type=str, 82 | default='', 83 | help='Filter by date example 2016-09-29') 84 | def main_cli(input_path, airport_code, dr_range, filter_date): 85 | airports = airport_code.split(",") 86 | dr_ranges = [float(i) for i in dr_range.split(",")] 87 | for airport in airports: 88 | main( 89 | input_path=input_path, 90 | airport_code=airport, 91 | min_dr=dr_ranges[0], 92 | max_dr=dr_ranges[1], 93 | filter_date=filter_date, 94 | ) 95 | 96 | 97 | if __name__ == '__main__': 98 | main_cli() 99 | -------------------------------------------------------------------------------- /trajclus/apps/convert_flight_format.py: -------------------------------------------------------------------------------- 1 | import json 2 | from copy import deepcopy 3 | 4 | import click 5 | import csv 6 | 7 | from trajclus.lib.common_utils import gen_log_file 8 | logger = gen_log_file(path_to_file='../tmp/convert_flight_format.log') 9 | 10 | 11 | def get_num_lines_in_file(file_path): 12 | """ 13 | Get number of lines in file 14 | 15 | Args: 16 | file_path (str): file path 17 | 18 | Returns: 19 | (int): number of lines 20 | 21 | """ 22 | from subprocess import check_output 23 | return int(check_output( 24 | ['wc', '-l', file_path]).split(b' ')[0]) 25 | 26 | 27 | @click.command() 28 | @click.option( 29 | '--input_path', 30 | type=str, 31 | required=True, 32 | help='Full path to the trajectory file in json format') 33 | def main(input_path): 34 | logger.info("Filepath : {}".format(input_path)) 35 | flights = [] 36 | flight_keys = [ 37 | "Flight ID", 38 | "Ident", 39 | "Origin", 40 | "Destination", 41 | "Actual Arrival Time (UTC)"] 42 | tract_keys = [ 43 | "DRemains", 44 | "TRemains", 45 | "TTravelled", 46 | "Time (UTC)", 47 | "Latitude", 48 | "Longitude", 49 | "Altitude (ft)", 50 | "Rate", 51 | "Course", 52 | "Direction", 53 | # "Facility Name", 54 | # "Facility Description", 55 | # "Estimated Pos.", 56 | ] 57 | col_order = [ 58 | 'Flight_ID', 59 | 'Ident', 60 | 'Origin', 61 | 'Destination', 62 | 'Actual_Arrival_Time_(UTC)', 63 | 'DRemains', 64 | 'TRemains', 65 | 'TTravelled', 66 | 'Time_(UTC)', 67 | 'Latitude', 68 | 'Longitude', 69 | "Altitude_(ft)", 70 | "Rate", 71 | "Course", 72 | "Direction", 73 | # "Facility_Name", 74 | # "Facility_Description", 75 | # "Estimated_Pos.", 76 | ] 77 | fin = open(input_path.replace('.json', '.csv'), 'w') 78 | writer = csv.DictWriter(fin, fieldnames=col_order) 79 | writer.writeheader() 80 | num_lines = get_num_lines_in_file(input_path) 81 | logger.info("Total {} records ".format(num_lines)) 82 | flights_id = [] 83 | with open(input_path) as fin: 84 | for i, line in enumerate(fin): 85 | # print progress bar 86 | one_flight = (json.loads(line)) 87 | flight_header = {} 88 | if one_flight['flight']['Flight ID'] in flights_id: 89 | print("FlightID overlap: %s" % one_flight['flight']['Flight ID']) 90 | flights_id.append(one_flight['flight']['Flight ID']) 91 | for key in flight_keys: 92 | flight_header[key.replace(' ', '_')] = one_flight['flight'][key] 93 | for tract in one_flight['track']: 94 | flight_tract = deepcopy(flight_header) 95 | for track_key in tract_keys: 96 | flight_tract[track_key.replace(' ', '_')] = tract[track_key] 97 | writer.writerow(flight_tract) 98 | fin.close() 99 | 100 | 101 | if __name__ == '__main__': 102 | main() 103 | -------------------------------------------------------------------------------- /trajclus/lib/lsh_lib.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | 4 | from datasketch import MinHash, MinHashLSH 5 | 6 | 7 | class LSHClusteringLib(object): 8 | """ 9 | Suppose you have a very large collection of sets. Giving a query, which is 10 | also a set, you want to find sets in your collection that have Jaccard 11 | similarities above certain threshold, and you want to do it with many other 12 | queries. To do this efficiently, you can create a MinHash for every set, 13 | and when a query comes, you compute the Jaccard similarities between the 14 | query MinHash and all the MinHash of your collection, and return the sets 15 | that satisfy your threshold. 16 | 17 | *** Read more via : https://ekzhu.github.io/datasketch/lsh.html 18 | 19 | """ 20 | 21 | def __init__(self, threshold=0.9, num_perm=128): 22 | """ 23 | Init 24 | Args: 25 | threshold (float): The Jaccard similarity threshold between 0.0 and 26 | 1.0. The initialized MinHash LSH will be optimized for the threshold 27 | by minizing the false positive and false negative. 28 | num_perm (int): The number of permutation functions used 29 | by the MinHash to be indexed 30 | """ 31 | self.threshold = threshold 32 | self.num_perm = num_perm 33 | self.lsh_server = MinHashLSH(threshold=threshold, num_perm=num_perm) 34 | 35 | def get_lsh_server(self): 36 | return self.lsh_server 37 | 38 | def compute_min_hash_lsh(self, terms): 39 | """ 40 | Compute min hash LSH of a set of tokens 41 | 42 | Args: 43 | terms (set): set of unique terms 44 | 45 | Returns: 46 | (MinHash): min hash LSH value 47 | 48 | """ 49 | m = MinHash(num_perm=self.num_perm) 50 | for e in terms: 51 | m.update(e.encode('utf8')) 52 | return m 53 | 54 | def compute_min_hash_lsh_over_data(self, record_ids, data): 55 | """ 56 | Compute min hash of each document from given record Ids and data 57 | Args: 58 | record_ids (list[int]): list of given record Id 59 | data (list[list[str]]): list of content belonged to record Ids above 60 | 61 | Returns: 62 | lsh_vals (list[MinHash]): list of min hash value 63 | 64 | """ 65 | # make sure docId is unique over the corpus 66 | assert len(set(record_ids)) == len(record_ids) 67 | 68 | # for each record compute the hash 69 | lsh_vals = [ 70 | self.compute_min_hash_lsh(terms=set(terms)) 71 | for terms in data 72 | ] 73 | # TODO: convert to parallel 74 | for record_id, hash_val in zip(record_ids, lsh_vals): 75 | idx = "{}".format(record_id) 76 | # update the hash document to whole corpus 77 | self.lsh_server.insert(idx, hash_val) 78 | return lsh_vals 79 | 80 | def query_duplicated_record(self, query): 81 | """ 82 | Query to LSH corpus for getting duplicated record Id 83 | Args: 84 | query (MinHash): 85 | 86 | Returns: 87 | result (list[int]): record Id 88 | 89 | """ 90 | result = self.lsh_server.query(query) 91 | result = [idx for idx in result] 92 | return sorted(result) 93 | 94 | def clustering(self, df): 95 | """ 96 | Query every document in corpus to find duplicated content 97 | 98 | Args: 99 | lsh_vals (list[MinHash]): list of LSH hash 100 | Returns: 101 | duplicated_ids (list[int]): list of duplicated record Ids 102 | 103 | """ 104 | # duplicated_ids = [] 105 | # for idx in range(len(lsh_vals)): 106 | # result = self.query_duplicated_document(lsh_vals[idx]) 107 | # if len(result) > 1: 108 | # duplicated_ids.append(result) 109 | # 110 | # return duplicated_ids 111 | print(df.head()) 112 | -------------------------------------------------------------------------------- /trajclus/lib/plot_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | # matplotlib.use('Agg') 3 | import seaborn as sns 4 | sns.set() 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | def traffic_density_plot(lat=[], lon=[], file_path=None, length_cutoff=600): 9 | """ 10 | Visualize density of geometric traffic 11 | 12 | Args: 13 | lat (list[float]): 14 | lon (list[float]): 15 | length_cutoff (int): 16 | 17 | Returns: 18 | 19 | """ 20 | 21 | xmin = min(lon) 22 | xmax = max(lon) 23 | ymin = min(lat) 24 | ymax = max(lat) 25 | 26 | fig = plt.figure(frameon=False) 27 | fig.set_size_inches(20, 20) 28 | # To make the content fill the whole figure 29 | # ax = plt.Axes(fig, [0., 0., 1., 1.]) 30 | # ax = plt.Axes(fig, [0.05, 0.05, 0.95, 0.95]) 31 | # fig.add_axes(ax) 32 | ax = fig.add_subplot(1, 1, 1) 33 | 34 | # And a corresponding grid 35 | # ax.grid(which='both') 36 | 37 | # Set a single t value to slice the multidimensional array. 38 | length_cutoff = length_cutoff-100 39 | # With the above note in mind, this may be an exception?? 40 | x1=np.copy(lon) 41 | y1=np.copy(lat) 42 | 43 | # Remove the nans from the array 44 | x1 = x1[~np.isnan(x1)] 45 | y1 = y1[~np.isnan(y1)] 46 | # plt.title(file_path.split("/")[-1].split(".")[0], fontsize=30) 47 | plt.xlabel('Longitude', fontsize=24) 48 | plt.ylabel('Latitude', fontsize=24) 49 | # Log colormap 50 | hb = ax.hexbin( 51 | x1, 52 | y1, 53 | gridsize=1000, 54 | bins='log', 55 | cmap='inferno', 56 | extent=(xmin, xmax, ymin, ymax) 57 | ) 58 | 59 | fig.add_axes(ax) 60 | # ax.axis('equal') 61 | plt.axis('on') 62 | # Setting the axes like this avoid the zero values in 63 | # the preallocated empty array. 64 | ax.axis([xmin, xmax, ymin, ymax]) 65 | 66 | if not file_path: 67 | plt.show() 68 | return 1 69 | else: 70 | # # save figure as png 71 | # 72 | # if not os.path.exists(directory): 73 | # os.makedirs(directory) 74 | # # png1 = BytesIO() 75 | fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0, dpi=300) 76 | return 1 77 | 78 | 79 | def traffic_flight_plot( 80 | flight_ids, clusters, flight_dicts, file_path, group_clusters, info={}): 81 | """ 82 | visualization of clustering result 83 | Args: 84 | flight_ids (list[str]): 85 | clusters (list[]): 86 | flight_dicts: (dict) 87 | group_clusters list[Any]: 88 | file_path (str): 89 | 90 | Returns: 91 | 92 | """ 93 | # TODO: implement the visualization of clustering result 94 | 95 | unique_labels = set(clusters) 96 | 97 | colors = [plt.cm.Spectral(each) 98 | for each in np.linspace(0, 1, len(set(unique_labels)))] 99 | 100 | colors_dict = {} 101 | for idx, uni in enumerate(unique_labels): 102 | colors_dict[uni] = colors[idx] 103 | 104 | plt.style.use('dark_background') 105 | fig = plt.figure(frameon=False) 106 | fig.set_size_inches(20, 20) 107 | # To make the content fill the whole figure 108 | ax = plt.Axes(fig, [0.05, 0.05, 0.95, 0.95]) 109 | # fig.add_axes(ax) 110 | ax = fig.add_subplot(1, 1, 1) 111 | # ax.set_facecolor("grey") 112 | 113 | # And a corresponding grid 114 | ax.grid(False) 115 | # Or if you want different settings for the grids: 116 | # ax.grid(which='minor', alpha=0.2) 117 | # ax.grid(which='major', alpha=0.5) 118 | 119 | for index, code in enumerate(flight_ids): 120 | if clusters[index] == 0: 121 | # logger.info("outlier") 122 | continue 123 | x = flight_dicts[code][:, 1] # lon 124 | y = flight_dicts[code][:, 0] # lat 125 | label = clusters[index] 126 | color = colors_dict[label] 127 | plt.title("{} {}".format(info['airport_code'], info['file_name']), 128 | fontsize=24) 129 | plt.xlabel('Longitude', fontsize=24) 130 | plt.ylabel('Latitude', fontsize=24) 131 | plt.plot(x, y, '-ok', color=color, 132 | markersize=0, linewidth=1, 133 | markerfacecolor='white', 134 | markeredgecolor='gray', 135 | markeredgewidth=1) 136 | plt.legend() 137 | 138 | plt.savefig( 139 | "../tmp/{file_path}".format( 140 | file_path=file_path 141 | ), 142 | dpi=300 143 | ) 144 | -------------------------------------------------------------------------------- /trajclus/lib/geometric_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.spatial import distance 3 | from simplification.cutil import simplify_coords 4 | 5 | 6 | KM_PER_RADIAN = 6371.0088 7 | 8 | 9 | def simplify_coordinator(coord_curve, epsilon=0.0001): 10 | """ 11 | 12 | Args: 13 | coord_curve (list[list[float, float]]): a list of lat, lon coordinates 14 | epsilon (float): 15 | Returns: 16 | list[list[float, float]] 17 | """ 18 | coord_curve = np.asarray(coord_curve, order='C') 19 | return simplify_coords(coord_curve, epsilon) 20 | 21 | 22 | def build_coordinator_dict(df, label_encoder, flight_ids, max_flights=1000, 23 | is_simplify=True): 24 | """ 25 | Extract trajectories and store to dictionary with key is flight id 26 | Args: 27 | df (pandas.DataFrame): 28 | label_encoder (): 29 | flight_ids (list[str]): 30 | max_flights (int): 31 | is_simplify (bool): 32 | 33 | Returns: 34 | (list[str], list[list[float, float]], (dict)): 35 | """ 36 | flight_idx = [] 37 | flight_dicts = {} 38 | coord_list = [] 39 | count = 1 40 | for fid in flight_ids: 41 | if count > max_flights: 42 | break 43 | count += 1 44 | df_min = df[df['Flight_ID'] == fid] 45 | df_min = df_min.sort_values(by='DRemains', ascending=False) 46 | encode_id = label_encoder.transform([fid])[0] 47 | flight_idx.append(encode_id) 48 | coords = df_min.as_matrix(columns=['Latitude', 'Longitude']) 49 | if is_simplify: 50 | coords = simplify_coordinator(coords) 51 | coord_list.append(coords) 52 | flight_dicts[encode_id] = coords 53 | 54 | return flight_idx, coord_list, flight_dicts 55 | 56 | 57 | def compute_distance_between_curves(u, v, algo='directed_hausdorff'): 58 | """ 59 | Compute distance of 2 curve u, v 60 | Args: 61 | u (list[(float,float)]): list of Lat, Lon first curve 62 | v (list[(float,float)]): list of Lat, Lon second curve 63 | algo (str): name of algorithm 64 | 65 | Returns: 66 | 67 | """ 68 | if algo == 'directed_hausdorff': 69 | """ compute symmetric Hausdorff distances of curves """ 70 | D = distance.cdist(u, v, 'euclidean') 71 | # None symmetric Hausdorff distances 72 | H1 = np.max(np.min(D, axis=1)) 73 | H2 = np.max(np.min(D, axis=0)) 74 | return (H1 + H2) / 2. 75 | # import traj_dist.distance as tdist 76 | # Find the general (symmetric) Hausdorff distance between two 2-D arrays of coordinates: 77 | # return max(directed_hausdorff(u, v)[0], directed_hausdorff(v, u)[0]) 78 | # return tdist.hausdorff(u, v) 79 | else: 80 | raise (" Do not support %s" % algo) 81 | 82 | 83 | def build_matrix_distances(coords=[], dist_type='directed_hausdorff'): 84 | """ 85 | Construct the matrix distance between every curves (pair-wise distance) 86 | 87 | Args: 88 | coords (list[list[(float, float)]]): list of Lat, Lon of curves 89 | dist_type (str): the type of distance need to compute 90 | 91 | Returns: 92 | (numpy-array): matrix distance 93 | 94 | """ 95 | if dist_type not in ['directed_hausdorff']: 96 | return False 97 | n_curve = len(coords) 98 | # compute distance matrix 99 | dist_matrix = np.zeros(shape=(n_curve, n_curve)) 100 | for i in range(0, n_curve - 1): 101 | for j in range(i + 1, n_curve): 102 | tmp = compute_distance_between_curves(coords[i], coords[j], dist_type) 103 | dist_matrix[i, j] = tmp 104 | dist_matrix[j, i] = dist_matrix[i, j] 105 | return dist_matrix 106 | 107 | 108 | # Implementation of algorithm from https://stackoverflow.com/a/22640362/6029703 109 | def thresholding_algo(y, lag=30, threshold=5, influence=0): 110 | """ 111 | Detect peak event 112 | 113 | Args: 114 | y: 115 | lag: 116 | threshold: 117 | influence: 118 | 119 | Returns: 120 | 121 | """ 122 | signals = np.zeros(len(y)) 123 | filteredY = np.array(y) 124 | avgFilter = [0]*len(y) 125 | stdFilter = [0]*len(y) 126 | avgFilter[lag - 1] = np.mean(y[0:lag]) 127 | stdFilter[lag - 1] = np.std(y[0:lag]) 128 | for i in range(lag, len(y)): 129 | if abs(y[i] - avgFilter[i-1]) > threshold * stdFilter [i-1]: 130 | if y[i] > avgFilter[i-1]: 131 | signals[i] = 1 132 | else: 133 | signals[i] = -1 134 | 135 | filteredY[i] = influence * y[i] + (1 - influence) * filteredY[i-1] 136 | avgFilter[i] = np.mean(filteredY[(i-lag):i]) 137 | stdFilter[i] = np.std(filteredY[(i-lag):i]) 138 | else: 139 | signals[i] = 0 140 | filteredY[i] = y[i] 141 | avgFilter[i] = np.mean(filteredY[(i-lag):i]) 142 | stdFilter[i] = np.std(filteredY[(i-lag):i]) 143 | 144 | return dict( 145 | signals = np.asarray(signals), 146 | avgFilter = np.asarray(avgFilter), 147 | stdFilter = np.asarray(stdFilter)) 148 | -------------------------------------------------------------------------------- /trajclus/apps/db_clustering.py: -------------------------------------------------------------------------------- 1 | from time import gmtime, strftime 2 | 3 | import click 4 | import pandas as pd 5 | from sklearn.cluster import DBSCAN 6 | from sklearn.metrics import silhouette_score 7 | 8 | from trajclus.lib.common_utils import gen_log_file 9 | from trajclus.lib.preprocessing_lib import filter_by_airport, flight_id_encoder, \ 10 | build_flight_trajectory_df 11 | from trajclus.lib.plot_utils import traffic_flight_plot 12 | from trajclus.lib.geometric_utils import build_matrix_distances, KM_PER_RADIAN, \ 13 | simplify_coordinator 14 | 15 | logger = gen_log_file(path_to_file='../tmp/db_clustering.log') 16 | 17 | 18 | def cluster_trajectories(dist_matrix, epsilon=1, min_samples=1): 19 | """ 20 | Building cluster from distance matrix of all flight 21 | 22 | Args: 23 | dist_matrix (): 24 | epsilon (float): 25 | min_samples (int): 26 | 27 | Returns: 28 | clusters () 29 | labels (list[int]): list of cluster id 30 | 31 | """ 32 | 33 | db = DBSCAN( 34 | eps=epsilon, 35 | min_samples=min_samples, 36 | algorithm='auto', 37 | metric='precomputed' 38 | ) 39 | db.fit(X=dist_matrix) 40 | 41 | labels = db.labels_ 42 | num_clusters = len(set(labels)) 43 | clusters = pd.Series( 44 | [dist_matrix[labels == idx] for idx in range(num_clusters)] 45 | ) 46 | silhouette_val = silhouette_score( 47 | X=dist_matrix, 48 | labels=labels, 49 | metric='precomputed' 50 | ) 51 | 52 | logger.info( 53 | 'Number of trajectory clusters via DBSCAN: {} with Silhouette Coefficient {}'.format( 54 | num_clusters, silhouette_val 55 | ) 56 | ) 57 | return clusters, labels, silhouette_val 58 | 59 | 60 | def main(input_path, airport_code, distance, min_sample, max_flights, min_dr, max_dr, epsilon=0.001): 61 | history = strftime("%Y-%m-%d %H:%M:%S", gmtime()).replace(" ", "_") 62 | logger.info("=============================================") 63 | logger.info("================ DATETIME {} ================".format(history)) 64 | df = pd.read_csv(input_path) 65 | logger.info(df.head()) 66 | file_name = input_path.split("/")[-1].replace(".csv", "") 67 | 68 | # get fixed 69 | flights_to_airport = filter_by_airport( 70 | df=df, 71 | airport_code=airport_code, 72 | min_dr=min_dr, 73 | max_dr=max_dr 74 | ) 75 | 76 | 77 | logger.info("Encoding flight ID ...") 78 | flight_ids = flights_to_airport['Flight_ID'].unique().tolist() 79 | logger.info("Total # flight ID {}".format(len(flight_ids))) 80 | flight_encoder = flight_id_encoder(flight_ids) 81 | 82 | logger.info("Extracting trajectory coordinators and flight id from dataset") 83 | flight_df, flight_dicts = build_flight_trajectory_df( 84 | flights_to_airport=flights_to_airport, 85 | label_encoder=flight_encoder, 86 | flight_ids=flight_ids, 87 | max_flights=max_flights, 88 | epsilon=epsilon 89 | ) 90 | 91 | # prepare data-frame for detect entrance points toward the airport 92 | entrance_to_airport = filter_by_airport( 93 | df=df, 94 | airport_code=airport_code, 95 | min_dr=min_dr, 96 | max_dr=max_dr 97 | ) 98 | entrance_trajectories = [] 99 | for fid in flight_ids[:max_flights]: 100 | tmp_df = entrance_to_airport[entrance_to_airport['Flight_ID'] == fid] 101 | tmp_df = tmp_df.sort_values(by='DRemains', ascending=False) 102 | entrance_trajectories.append(tmp_df[['Latitude', 'Longitude']].values) 103 | simplified_coords = [ 104 | simplify_coordinator(coord_curve=curve, epsilon=epsilon) 105 | for curve in entrance_trajectories 106 | ] 107 | 108 | # create data-frame result 109 | clusters_df = pd.DataFrame() 110 | clusters_df['Flight_ID'] = flight_encoder.inverse_transform(flight_df['idx']) 111 | 112 | logger.info("Building distance matrix - {} ...".format(distance)) 113 | dist_matrix = build_matrix_distances( 114 | coords=simplified_coords, 115 | dist_type=distance 116 | ) 117 | 118 | # prepare grid search for tuning epsilon 119 | alpha = 0.001 120 | upper_bound = max(dist_matrix[0, :]) 121 | lower_bound = min(dist_matrix[0, :]) 122 | step = (upper_bound - lower_bound) * alpha 123 | logger.info( 124 | "upper_bound {}, lower_bound {}, step {}".format( 125 | upper_bound, lower_bound, step) 126 | ) 127 | # eps_list = np.arange(step*1, step*5, step) 128 | eps_list =[max_km / KM_PER_RADIAN /10.0 for max_km in [5, 10, 15, 20]] 129 | print(eps_list) 130 | 131 | last_clusters = None 132 | # for min_sp in range(1, min_sample, 1): 133 | min_sp = min_sample 134 | for eps in eps_list: 135 | epsilon = eps 136 | # epsilon = eps / kms_per_radian 137 | clusters, labels, silhouette = cluster_trajectories( 138 | dist_matrix=dist_matrix, 139 | epsilon=epsilon, 140 | min_samples=min_sp 141 | ) 142 | 143 | # list of cluster id along side with the encoded flight id 144 | last_clusters = clusters 145 | unique_labels = set(labels) 146 | clusters_df['c_{}_eps_{}'.format(len(unique_labels), epsilon)] = labels 147 | 148 | # export images 149 | result_file_name = "../tmp/{}_{}_dbscan_sil_{}_ms_{}_eps_{}.png".format( 150 | file_name, airport_code, silhouette, min_sp, epsilon 151 | ) 152 | traffic_flight_plot( 153 | flight_ids=flight_df['idx'].tolist(), 154 | clusters=labels, 155 | flight_dicts=flight_dicts, 156 | file_path=result_file_name, 157 | info={'file_name': file_name, 'airport_code': airport_code} 158 | ) 159 | if len(last_clusters) <= 2: 160 | break 161 | 162 | # export result 163 | clusters_df.to_csv( 164 | "../tmp/{}_{}_ms_{}.csv".format( 165 | file_name, airport_code, min_sample 166 | ), 167 | index=False 168 | ) 169 | logger.info("\n {}".format(clusters_df.head())) 170 | 171 | 172 | @click.command() 173 | @click.option( 174 | '--input_path', 175 | type=str, 176 | required=True, 177 | help='Full path to the trajectory file in CSV format') 178 | @click.option( 179 | '--airport_code', 180 | type=str, 181 | default='WSSS,VTBS,WMKK', 182 | help='Air Port Codename') 183 | @click.option( 184 | '--max_flights', 185 | type=int, 186 | default=1000, 187 | help='Max number of flights') 188 | @click.option( 189 | '--distance', 190 | type=str, 191 | default='directed_hausdorff', 192 | help='Distance algorithm current support: directed_hausdorff, frechet') 193 | @click.option( 194 | '--min_sample', 195 | type=int, 196 | default=1, 197 | help='Min sample value in DBSCAN') 198 | @click.option( 199 | '--dr_range', 200 | type=str, 201 | default='1.0,5.0', 202 | help='distance remains in radius') 203 | @click.option( 204 | '--epsilon', 205 | type=float, 206 | default=0.0001, 207 | help='epsilon for simplify curve using Douglas Peucker') 208 | def main_cli(input_path, airport_code, distance, min_sample, max_flights, dr_range, epsilon): 209 | airports = airport_code.split(",") 210 | dr_ranges = [float(i) for i in dr_range.split(",")] 211 | for airport in airports: 212 | main( 213 | input_path=input_path, 214 | airport_code=airport, 215 | distance=distance, 216 | min_sample=min_sample, 217 | max_flights=max_flights, 218 | min_dr=dr_ranges[0], 219 | max_dr=dr_ranges[1], 220 | epsilon=epsilon 221 | 222 | ) 223 | 224 | 225 | if __name__ == '__main__': 226 | main_cli() 227 | -------------------------------------------------------------------------------- /trajclus/apps/lsh_clustering.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.cluster import DBSCAN 5 | from geopy.distance import great_circle 6 | from shapely.geometry import MultiPoint 7 | from sklearn.metrics import silhouette_score 8 | from sklearn.cluster import KMeans 9 | 10 | from trajclus.lib.common_utils import gen_log_file 11 | from trajclus.lib.preprocessing_lib import filter_by_airport, \ 12 | build_flight_trajectory_df, flight_id_encoder 13 | from trajclus.lib.geometric_utils import KM_PER_RADIAN, simplify_coordinator, \ 14 | build_matrix_distances 15 | from trajclus.lib.lsh_lib import LSHClusteringLib 16 | from trajclus.lib.plot_utils import traffic_flight_plot 17 | 18 | 19 | def dbscan_clustering(coords, min_sample=1, max_distance=1.0, epsilon=None): 20 | """ 21 | Mapping all points in map to reduced cluster 22 | Args: 23 | coords : 24 | min_sample (int): 25 | max_distance (float): 26 | epsilon (float): 27 | 28 | Returns: 29 | 30 | """ 31 | 32 | """ 33 | The epsilon parameter is the max distance (max_distance) 34 | that points can be from each other to be considered a cluster. 35 | """ 36 | if not epsilon: 37 | epsilon = max_distance / KM_PER_RADIAN 38 | db = DBSCAN(eps=epsilon, min_samples=min_sample, algorithm='ball_tree', 39 | metric='haversine').fit(np.radians(coords)) 40 | cluster_labels = db.labels_ 41 | num_clusters = len(set(cluster_labels)) 42 | clusters = pd.Series([coords[cluster_labels == n] 43 | for n in range(num_clusters)]) 44 | centers = clusters.map(get_centermost_point) 45 | centers = np.array(centers.tolist()) 46 | print('Number of clusters for grouping: {}'.format(num_clusters)) 47 | return centers, db 48 | 49 | 50 | def kmeans_clustering(coords, k_cluster): 51 | kmeans = KMeans(n_clusters=k_cluster, random_state=0).fit(X=coords) 52 | cluster_labels = kmeans.labels_ 53 | num_clusters = len(set(cluster_labels)) 54 | 55 | centers = np.array(kmeans.cluster_centers_) 56 | 57 | print('Number of clusters for grouping: {}'.format(num_clusters)) 58 | return centers, kmeans 59 | 60 | 61 | def get_centermost_point(cluster): 62 | centroid = [MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y] 63 | centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m) 64 | 65 | return centermost_point 66 | 67 | 68 | def compute_silhouette_score(feature_matrix, labels): 69 | silhouette_val = silhouette_score( 70 | X=feature_matrix, 71 | labels=labels, 72 | metric='precomputed' 73 | ) 74 | return silhouette_val 75 | 76 | 77 | def detect_entrance_ways(point_coords, algorithm='k-means', estimated_n_entrance=9): 78 | if algorithm not in ['k-means', 'dbscan']: 79 | return [], False 80 | # auto detect entrance ways 81 | if algorithm == 'k-means': 82 | return kmeans_clustering( 83 | coords=point_coords, 84 | k_cluster=estimated_n_entrance 85 | ) 86 | if algorithm == 'dbscan': 87 | return dbscan_clustering( 88 | coords=point_coords, 89 | min_sample=1, # must be 1 90 | max_distance=15.0 91 | ) 92 | 93 | 94 | def filter_by_date(datetime, filter_date): 95 | """ 96 | Filter date 97 | Args: 98 | datetime (str): yyyy-mm-dd format 99 | filter_date (str): yyyy-mm-dd format 100 | 101 | Returns: 102 | (bool) 103 | """ 104 | str_date = str(datetime).split(' ')[0] 105 | if str_date == str(filter_date): 106 | return True 107 | return False 108 | 109 | 110 | def main( 111 | input_path, 112 | airport_code='WSSS', 113 | max_flights=1000, 114 | estimated_n_entrance=9, 115 | threshold=0.6, 116 | algo='k-means', 117 | min_dr=1.0, 118 | max_dr=2.0, 119 | filter_date='', 120 | epsilon=0.001 121 | ): 122 | # load raw-data from csv 123 | logger = gen_log_file(path_to_file='../tmp/lsh_clustering_{}.log'.format(filter_date)) 124 | df = pd.read_csv(input_path) 125 | file_name = input_path.split("/")[-1].replace(".csv", "") 126 | 127 | if filter_date != '': 128 | print("before filtering %s" % len(df)) 129 | df['filtered'] = df['Actual_Arrival_Time_(UTC)'].apply( 130 | lambda x: filter_by_date(datetime=x, filter_date=filter_date) 131 | ) 132 | df = df[df['filtered']] 133 | file_name = filter_date 134 | print("after filtering %s" % len(df)) 135 | 136 | # filter data by airport code-name 137 | flights_to_airport = filter_by_airport( 138 | df=df, 139 | airport_code=airport_code, 140 | min_dr=0.0, 141 | max_dr=max_dr 142 | ) 143 | 144 | # prepare data-frame for detect entrance points toward the airport 145 | entrance_to_airport = filter_by_airport( 146 | df=df, 147 | airport_code=airport_code, 148 | min_dr=min_dr, 149 | max_dr=max_dr 150 | ) 151 | 152 | logger.info("Encoding flight ID ... %s" % airport_code) 153 | flight_ids = flights_to_airport['Flight_ID'].unique().tolist() 154 | logger.info("Total # flight ID {}".format(len(flight_ids))) 155 | flight_encoder = flight_id_encoder(flight_ids) 156 | 157 | flight_df, flight_dicts = build_flight_trajectory_df( 158 | flights_to_airport=flights_to_airport, 159 | label_encoder=flight_encoder, 160 | flight_ids=flight_ids, 161 | max_flights=max_flights, 162 | epsilon=epsilon 163 | ) 164 | 165 | entrance_trajectories = [] 166 | total_original_points = 0 167 | for fid in flight_ids[:max_flights]: 168 | tmp_df = entrance_to_airport[entrance_to_airport['Flight_ID'] == fid] 169 | tmp_df = tmp_df.sort_values(by='DRemains', ascending=False) 170 | lat_lon_values = tmp_df[['Latitude', 'Longitude']].values 171 | total_original_points += len(lat_lon_values) 172 | entrance_trajectories.append(lat_lon_values) 173 | 174 | simplified_coords = [ 175 | simplify_coordinator(coord_curve=curve, epsilon=epsilon) 176 | for curve in entrance_trajectories 177 | ] 178 | 179 | logger.info("Total original points at entrance %s" % total_original_points) 180 | point_coords = simplified_coords[0] 181 | for item in simplified_coords[1:]: 182 | point_coords = np.concatenate((point_coords, item)) 183 | logger.info("Total points at entrance %s" % len(point_coords)) 184 | 185 | detect_entrance_algo = algo 186 | reduced_groups, classifier = detect_entrance_ways( 187 | point_coords=point_coords, 188 | algorithm=detect_entrance_algo, 189 | estimated_n_entrance=estimated_n_entrance 190 | ) 191 | 192 | # we trick each group label as a term, then each trajectory will contains 193 | # list of terms/tokens 194 | if detect_entrance_algo == 'dbscan': 195 | flight_df['groups'] = [classifier.fit_predict(X=coord) 196 | for coord in entrance_trajectories] 197 | elif detect_entrance_algo == 'k-means': 198 | entrance_groups = [] 199 | for traj in entrance_trajectories: 200 | if len(traj) > 1: 201 | entrance_groups.append(classifier.predict(X=traj)) 202 | else: 203 | entrance_groups.append([-1]) 204 | flight_df['groups'] = entrance_groups 205 | 206 | # convert clustering number to group label, 207 | flight_df['groups'] = flight_df['groups'].apply( 208 | lambda clusters: ["G{}".format(c) for c in clusters]) 209 | 210 | # Now we will apply Jaccard similarity and LSH for theses trajectories 211 | lsh_clustering = LSHClusteringLib( 212 | threshold=threshold, 213 | num_perm=128 214 | ) 215 | flight_df['hash'] = lsh_clustering.compute_min_hash_lsh_over_data( 216 | record_ids=flight_df['idx'].tolist(), 217 | data=flight_df['groups'].tolist() 218 | ) 219 | 220 | flight_df['duplicated'] = flight_df['hash'].apply( 221 | lambda x: lsh_clustering.query_duplicated_record(x) 222 | ) 223 | 224 | flight_df['buckets'] = flight_df['duplicated'].apply( 225 | lambda x: '_'.join(x) 226 | ) 227 | unique_buckets = flight_df['buckets'].unique().tolist() 228 | logger.info("number buckets %s" % len(unique_buckets)) 229 | logger.info(len(flight_df.groupby('buckets').size())) 230 | n_curve_per_bucket = flight_df.groupby('buckets').size().to_dict() 231 | 232 | def convert_to_cluster_number( 233 | bucket_label, unique_buckets, total_buckets, n_curve_per_bucket=None): 234 | # less number in bucket will be consider as outliers , label = -1 235 | if (n_curve_per_bucket[bucket_label] * 100.0 / total_buckets) <= 5.0: 236 | return -1 237 | return unique_buckets.index(bucket_label) 238 | 239 | cluster_labels = [ 240 | convert_to_cluster_number(bucket, unique_buckets, len(flight_df), n_curve_per_bucket) 241 | for bucket in flight_df['buckets'].tolist() 242 | ] 243 | flight_df['cluster'] = cluster_labels 244 | logger.info("Non-outlier cluster number %s" % 245 | len(flight_df[flight_df['cluster'] != -1]['cluster'].unique().tolist()) 246 | ) 247 | logger.info(flight_df[flight_df['cluster'] != -1]['cluster'].unique()) 248 | n_curve_per_cluster = flight_df.groupby('cluster').size() 249 | logger.info(n_curve_per_cluster) 250 | 251 | 252 | # # evaluation 253 | silhouette_val = None 254 | dist_matrix = build_matrix_distances( 255 | coords=flight_df['trajectory'].tolist(), 256 | dist_type='directed_hausdorff' 257 | ) 258 | silhouette_val = compute_silhouette_score( 259 | feature_matrix=dist_matrix, labels=cluster_labels 260 | ) 261 | logger.info("Silhouette Coefficient via LSH %s" % silhouette_val) 262 | 263 | # ### base-line with DBSCAN 264 | # from db_clustering import cluster_trajectories 265 | # alpha = 0.001 266 | # upper_bound = max(dist_matrix[0, :]) 267 | # lower_bound = min(dist_matrix[0, :]) 268 | # step = (upper_bound - lower_bound) * alpha 269 | # logger.info( 270 | # "upper_bound {}, lower_bound {}, step {}".format( 271 | # upper_bound, lower_bound, step) 272 | # ) 273 | # eps_list = np.arange(step*1, step*5, step) 274 | # for eps in eps_list: 275 | # try: 276 | # clusters, labels, silhouette = cluster_trajectories( 277 | # dist_matrix=dist_matrix, 278 | # epsilon=eps, 279 | # min_samples=1 280 | # ) 281 | # except: 282 | # continue 283 | 284 | plot_file_name = "{file_name}_{airport_code}_lsh_{threshold}_{algo}_{n_entrance}_dr_{dr_range}_sil_{silhoette}.png".format( 285 | file_name=file_name, 286 | airport_code="{}_{}_flights".format(airport_code, len(flight_df)), 287 | threshold=threshold, 288 | algo=detect_entrance_algo, 289 | n_entrance=estimated_n_entrance, 290 | dr_range="{}_{}".format(min_dr, max_dr), 291 | silhoette = silhouette_val 292 | 293 | ) 294 | 295 | traffic_flight_plot( 296 | flight_ids=flight_df['idx'].tolist(), 297 | clusters=cluster_labels, 298 | flight_dicts=flight_dicts, 299 | file_path=plot_file_name, 300 | group_clusters=reduced_groups, 301 | info={'file_name': file_name, 'airport_code': airport_code} 302 | ) 303 | 304 | result_file_name = "{file_name}_{airport_code}_lsh_{threshold}_{algo}_{n_entrance}_dr_{dr_range}_sil_{silhoette}.png".format( 305 | file_name=file_name, 306 | airport_code="{}_{}_flights".format(airport_code, len(flight_df)), 307 | threshold=threshold, 308 | algo=detect_entrance_algo, 309 | n_entrance=estimated_n_entrance, 310 | dr_range="{}_{}".format(min_dr, max_dr), 311 | silhoette=silhouette_val 312 | 313 | ) 314 | # export flight id with label of clusters to csv file 315 | flight_df[ 316 | ['flight_id', 'buckets', 'cluster'] 317 | ].to_csv("../tmp/{}.csv".format(result_file_name), index=False) 318 | 319 | 320 | @click.command() 321 | @click.option( 322 | '--input_path', 323 | type=str, 324 | required=True, 325 | help='Full path to the trajectory file in CSV format') 326 | @click.option( 327 | '--airport_code', 328 | type=str, 329 | # default='WSSS,VTBS,WMKK', 330 | default='WSSS', 331 | help='Air Port Codename') 332 | @click.option( 333 | '--max_flights', 334 | type=int, 335 | default=1000, 336 | help='Max number of flights') 337 | @click.option( 338 | '--dr_range', 339 | type=str, 340 | default='1.0,5.0', 341 | help='distance remains in radius') 342 | @click.option( 343 | '--filter_date', 344 | type=str, 345 | default='', 346 | help='Filter by date example 2016-09-29') 347 | @click.option( 348 | '--epsilon', 349 | type=float, 350 | default=0.001, 351 | help='epsilon for simplify curve using Douglas Peucker') 352 | def main_cli(input_path, airport_code, max_flights, dr_range, filter_date, epsilon): 353 | airports = airport_code.split(",") 354 | dr_ranges = [float(i) for i in dr_range.split(",")] 355 | for airport in airports: 356 | main( 357 | input_path=input_path, 358 | airport_code=airport, 359 | max_flights=max_flights, 360 | estimated_n_entrance=30, 361 | threshold=0.5, 362 | algo='k-means', 363 | min_dr=dr_ranges[0], 364 | max_dr=dr_ranges[1], 365 | filter_date=filter_date, 366 | epsilon=epsilon 367 | ) 368 | 369 | 370 | if __name__ == '__main__': 371 | main_cli() 372 | --------------------------------------------------------------------------------