├── code ├── __init__.py ├── readers │ ├── __init__.py │ ├── csv_data_reader.py │ └── pickle_data_reader.py ├── utilities │ ├── __init__.py │ ├── env.py │ ├── tt_utils.py │ └── df_utils.py ├── readme.txt ├── rmr_stn_models_training_file.py ├── cross_validation_file.py ├── rfr_stn_models_training_file.py ├── create_pickle_data.py ├── create_training_data.py ├── known_trains_lms_pred.py └── unknown_trains_lms_pred.py ├── tde_service ├── util │ ├── __init__.py │ └── log.py ├── env.py ├── logs │ └── tde_logs.log ├── app.py └── tde_prediction.py ├── requirements.txt ├── doc ├── TrainDelay-ITSC2018.pdf ├── LongPaper-arxiv-June2018.pdf ├── Summary-TrainDelayPrediction-June2018.pdf ├── Readme.md └── Tutorial.md ├── .gitignore ├── misc ├── read_status.py ├── train_status.py ├── lmr_stn_models_training_file.py ├── explore_data.R └── result_analysis.py ├── trains.txt ├── README.md ├── metadata_setup.sh └── LICENSE /code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/readers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tde_service/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.11 2 | pandas>=0.20.1 3 | scipy>=0.19.0 4 | flask>=0.12.2 5 | scikit-learn>=0.18.1 6 | -------------------------------------------------------------------------------- /doc/TrainDelay-ITSC2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/R-Gaurav/train-delay-estimation/HEAD/doc/TrainDelay-ITSC2018.pdf -------------------------------------------------------------------------------- /doc/LongPaper-arxiv-June2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/R-Gaurav/train-delay-estimation/HEAD/doc/LongPaper-arxiv-June2018.pdf -------------------------------------------------------------------------------- /doc/Summary-TrainDelayPrediction-June2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/R-Gaurav/train-delay-estimation/HEAD/doc/Summary-TrainDelayPrediction-June2018.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore *.pyc files. 
2 | *.pyc
3 | *.DS_Store
4 |
5 | # Ignoring data directory
6 | data/
7 | models/
8 |
9 | # Ignoring config files and logs
10 | code/utilities/env.py
11 | tde_service/env.py
12 | tde_service/logs/tde_logs.log
13 |
--------------------------------------------------------------------------------
/tde_service/env.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # This module sets up the environment for running the TDE Service.
7 | #
8 |
9 | import sys
10 |
11 | # This module should NOT be executed.
12 | assert __name__ != "__main__"
13 |
14 | # Set up the project directory.
15 | project_dir_path = "/Personal/train-delay-estimation/"
16 |
17 | # Insert the project directory path in sys.path, so that subdirectories and code
18 | # files therein are able to access the other (top level) files.
19 | sys.path.insert(0, project_dir_path)
20 |
--------------------------------------------------------------------------------
/doc/Readme.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | ## Video
4 | 1. Train Chatbot: A Train Status Assistant for Indian Railways (Skype Prototype),
5 | https://www.youtube.com/watch?v=I-wtcAYLYr4, Oct 2018.
6 | An earlier version can be found here -
7 | https://www.youtube.com/watch?v=a-ABv29H6XU&feature=youtu.be, Sep 2018 (Emulator Prototype).
8 |
9 | ## Papers
10 | 1. Ramashish Gaurav, Biplav Srivastava, Estimating Train Delays in a Large Rail Network Using a Zero Shot Markov Model, IEEE International Conference on Intelligent Transportation Systems (ITSC). On Arxiv at: https://arxiv.org/abs/1806.02825, April 2018 [Area: Train delay, learning]
11 |
12 | 2. Himadri Mishra, Ramashish Gaurav, Biplav Srivastava, A Train Status Assistant for Indian Railways, On Arxiv at: https://arxiv.org/abs/1809.08509, Sep 2018
13 | [Area: Chatbot, Train delay]
14 |
15 | ----------
16 |
17 |
--------------------------------------------------------------------------------
/misc/read_status.py:
--------------------------------------------------------------------------------
1 | import json
2 | import csv
3 | import pickle
4 | #train_list = pickle.load(open('train_list_MGS.p','rb'))
5 | train_list = ['22308','13010','12307','12801','12802','14055']
6 | for train_num in train_list:
7 | f=open('/home/zerone/python/ogd/train_running_status/Train'+str(train_num)+'.txt','r')
8 | f1=csv.writer(open('/home/zerone/python/ogd/train_running_status_csv/Train'+str(train_num)+'.csv','w')) # Every time this file is run, it writes a new one
9 | f = f.readlines()
10 | f1.writerow(['actarr_date','day','station_code','station_name','scharr_date','scharr','actarr','latemin','status','schdep','actdep','distance','has_departed','has_arrived'])
11 | for line in f:
12 | run_stat = json.loads(line)
13 | run_stat = run_stat['route']
14 | for stat in run_stat:
15 | f1.writerow([stat['actarr_date'],stat['day'],stat['station_']['code'],stat['station_']['name'],stat['scharr_date'],stat['scharr'],stat['actarr'],stat['latemin'],stat['status'],stat['schdep'],stat['actdep'],stat['distance'],stat['has_departed'],stat['has_arrived']])
16 |
--------------------------------------------------------------------------------
/code/utilities/env.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # This module sets up paths for different directories of data and saved models.
7 | # Always import this file at the beginning of each file.
8 | #
9 |
10 | import sys
11 |
12 | # This module should NOT be executed.
13 | assert __name__ != "__main__"
14 |
15 | # Here "train-delay-estimation" is downloaded in "/Personal/projects/".
16 | # Here "data" is downloaded in "/Personal/projects/train-delay-estimation/".
17 | # Here "models" is set in "/Personal/projects/train-delay-estimation/".
18 |
19 | project_dir_path = "/Personal/train-delay-estimation/"
20 |
21 | # Insert the path to the project directory in sys.path so that subdirectories
22 | # and code files are accessible in other files.
23 | sys.path.insert(0, project_dir_path+"code/")
24 |
25 | # Insert the path to readers directory.
26 | sys.path.insert(0, project_dir_path+"code/readers/")
27 |
28 | # Insert the path to the data (input) directory.
29 | # data_path contains all the raw data and pickle data.
30 | data_path = project_dir_path+"data/"
31 |
32 | # Insert the path to the trained models of stations (output) directory.
33 | models_path = project_dir_path+"models/"
34 |
--------------------------------------------------------------------------------
/tde_service/logs/tde_logs.log:
--------------------------------------------------------------------------------
1 | 2018-08-19 23:24:20,460 - util.log - INFO - Completed configuring logger()!
2 | 2018-08-19 23:26:55,831 - util.log - INFO - Train Number: 12307, Station Code: None, Date: None
3 | 2018-08-19 23:26:56,000 - util.log - INFO - Modified Date: 19 Aug 2018, Month: Aug, Weekday: Sunday
4 | 2018-08-19 23:26:56,001 - util.log - INFO - Train: 12307 single journey dataframe modified
5 | 2018-08-19 23:29:52,338 - util.log - INFO - Train Number: 12307, Station Code: None, Date: 2018-07-23
6 | 2018-08-19 23:29:52,470 - util.log - INFO - Modified Date: 23 Jul 2018, Month: Jul, Weekday: Monday
7 | 2018-08-19 23:29:52,471 - util.log - INFO - Train: 12307 single journey dataframe modified
8 | 2018-08-19 23:31:54,013 - util.log - INFO - Train Number: 12307, Station Code: ALD, Date: None
9 | 2018-08-19 23:31:54,149 - util.log - INFO - Modified Date: 19 Aug 2018, Month: Aug, Weekday: Sunday
10 | 2018-08-19 23:31:54,150 - util.log - INFO - Train: 12307 single journey dataframe modified
11 | 2018-08-19 23:34:44,370 - util.log - INFO - Train Number: 12307, Station Code: ALD, Date: 2018-12-09
12 | 2018-08-19 23:34:44,513 - util.log - INFO - Modified Date: 09 Dec 2018, Month: Dec, Weekday: Sunday
13 | 2018-08-19 23:34:44,514 - util.log - INFO - Train: 12307 single journey dataframe modified
14 |
--------------------------------------------------------------------------------
/misc/train_status.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from requests.auth import HTTPBasicAuth
3 | import json
4 | import pickle
5 | import time
6 | header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
7 |
8 | ################################################################################
9 | train_list = pickle.load(open('train_list_MGS.p','rb'))
10 |
11 | i=0
12 | train_list = train_list[i:]
13 |
14 | ################################################################################
15 | for train_num in train_list:
16 |
17 | url="http://api.railwayapi.com/live/train/"+str(train_num)+"/doj/20160328/apikey//"
18 | response = requests.get(url,headers=header,proxies=None)
19 | if response.status_code == 200:
20 | #print response.text
21 | status = json.loads(response.text)
22 | stat = status['response_code']
23 | if stat == 200:
24 | f = open('/home/zerone/python/ogd/train_running_status/Train'+str(train_num)+'.txt','a')
25 | f.write(response.text)
26 | f.write('\n')
27 | print 'Success',train_num, 'index', i
28 | else:
29 | print 'Fail',train_num, 'index', i
30 |
31 | else:
32 | print response.status_code,'Error',train_num, 'index', i
33 |
34 | time.sleep(2)
35 | i=i+1
36 |
--------------------------------------------------------------------------------
/tde_service/util/log.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Log file utilities.
7 | #
8 |
9 | from os import path, remove
10 |
11 | import logging
12 | import traceback
13 |
14 | # Name of the log file where all messages would be logged.
15 | LOG_FILE = "logs/tde_logs.log"
16 |
17 | # If applicable, delete the existing log file to generate a fresh log file
18 | # during each execution.
19 | if path.isfile(LOG_FILE):
20 | remove(LOG_FILE)
21 |
22 | # Create the logger
23 | logger = logging.getLogger(__name__)
24 | # Set the logging level to DEBUG, such that all level messages are logged.
25 | logger.setLevel(logging.DEBUG)
26 |
27 | # Create handler for logging the messages to a log file.
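# Sketch (an illustration, not part of the original service): a console handler
# could be attached the same way as the file handler configured below, e.g.:
#   console_handler = logging.StreamHandler()
#   console_handler.setLevel(logging.INFO)
#   console_handler.setFormatter(log_formatter)
#   logger.addHandler(console_handler)
# so that messages are echoed to stderr in addition to LOG_FILE.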
28 | log_handler = logging.FileHandler(LOG_FILE)
29 | log_handler.setLevel(logging.DEBUG)
30 |
31 | # Set the format of the log.
32 | log_formatter = logging.Formatter(
33 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
34 |
35 | # Add the Formatter to the Handler
36 | log_handler.setFormatter(log_formatter)
37 |
38 | # Add the Handler to the Logger
39 | logger.addHandler(log_handler)
40 | logger.info('Completed configuring logger()!')
41 |
42 | def INFO(msg):
43 | logger.info(msg)
44 |
45 | def WARN(msg):
46 | logger.warning(msg)
47 | logger.warning(traceback.format_exc())
48 |
49 | def ERROR(msg):
50 | logger.error(msg)
51 | logger.error(traceback.format_exc())
52 |
--------------------------------------------------------------------------------
/trains.txt:
--------------------------------------------------------------------------------
1 | 12307
2 | 12331
3 | 12801
4 | 12802
5 | 12815
6 | 12816
7 | 12875
8 | 12876
9 | 13010
10 | 13050
11 | 13119
12 | 13131
13 | 13133
14 | 13151
15 | 13238
16 | 13483
17 | 14055
18 | 18612
19 | 22911
20 | 12178
21 | 12318
22 | 12327
23 | 12354
24 | 12361
25 | 12362
26 | 12372
27 | 12395
28 | 12569
29 | 12818
30 | 12942
31 | 14003
32 | 15632
33 | 15635
34 | 15636
35 | 22811
36 | 22812
37 | 22824
38 | 12305
39 | 12326
40 | 12424
41 | 12444
42 | 12578
43 | 12937
44 | 22409
45 | 09012
46 | 09305
47 | 12149
48 | 12282
49 | 12333
50 | 12335
51 | 12382
52 | 13239
53 | 01660
54 | 02050
55 | 02265
56 | 02397
57 | 03209
58 | 03210
59 | 03291
60 | 03563
61 | 03564
62 | 04039
63 | 04040
64 | 04401
65 | 04405
66 | 04406
67 | 04821
68 | 05066
69 | 06032
70 | 12141
71 | 12150
72 | 12175
73 | 12295
74 | 12296
75 | 12301
76 | 12302
77 | 12304
78 | 12308
79 | 12309
80 | 12312
81 | 12313
82 | 12317
83 | 12319
84 | 12320
85 | 12322
86 | 12325
87 | 12328
88 | 12332
89 | 12334
90 | 12356
91 | 12369
92 | 12381
93 | 12392
94 | 12397
95 | 12398
96 | 12401
97 | 12423
98 | 12439
99 | 12454
100 | 12495
101 | 12496
102 | 12506
103 | 12741
104 | 12817
105 | 12826
106 | 12947
107 | 12948
108 | 12987
109 | 12988
110 | 13005
111 | 13006
112 | 13008
113 | 13009
114 | 13049
115 | 13202
116 | 13240
117 | 13255
118 | 13307
119 | 13308
120 | 13414
121 | 15022
122 | 15483
123 | 15645
124 | 15668
125 | 18103
126 | 18104
127 | 18311
128 | 18609
129 | 18631
130 | 19313
131 | 22308
132 | 22405
133 | 22406
134 | 22488
135 | 25631
136 |
--------------------------------------------------------------------------------
/code/readme.txt:
--------------------------------------------------------------------------------
1 | This Train Delay Estimation project aims to find a pattern in delays at stations
2 | during the journeys of trains in India. A set of 135 trains is considered, out of
3 | which 52 trains' journey data are used for training various prediction models,
4 | which are then tested on another set of 83 trains. Prediction of near-accurate
5 | late minutes proves the existence of such a pattern, and marks our success in capturing it.
6 |
7 | For more information on the algorithm, data collection and data division, please
8 | refer to the paper.
9 |
10 | Here, a description of the files in this repository is given.
11 |
12 | /readers:
13 | > Contains the helper code to read data from csv files and pickle files.
14 |
15 | /utilities:
16 | > Contains the helper code to generate data frames and to build our train-test
17 | algorithm.
18 |
19 | /create_pickle_data.py
20 | > Creates and saves the required data in pickle format.
21 |
22 | /create_training_data.py:
23 | > Code to create training data, i.e. a Training Data Frame for each Known
24 | Station from each Known Train.
25 |
26 | /cross_validation_file.py:
27 | > Code to evaluate the trained models.
28 |
29 | /rfr_stn_models_training_file.py:
30 | > Code to train Random Forest Regressors on Training Data Frame.
31 |
32 | /rmr_stn_models_training_file.py
33 | > Code to train Ridge Model Regressor on Training Data Frame.
34 |
35 | /known_trains_lms_pred.py:
36 | > Implementation of N-Order Late Minutes Prediction Framework for Known Trains.
37 |
38 | /unknown_trains_lms_pred.py:
39 | > Implementation of N-Order Late Minutes Prediction Framework for Unknown
40 | Trains.
41 |
42 | One can train the various n-previous-station models of Random Forest, Linear
43 | Regressors and Neural Network Regressors by executing the corresponding files as
44 | mentioned above. The total size of saved models exceeds 120 GB, with at least 40 GB
45 | for each setting of Random Forest models. In case you need the pre-trained
46 | models, contact me.
--------------------------------------------------------------------------------
/misc/lmr_stn_models_training_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # author: gaurav.ramashish@gmail.com
5 | #
6 | # Desc: This file trains linear models for 596 known stations. But the linear
7 | # models trained were not found to be robust during evaluation and
8 | # prediction.
9 | # Therefore, not an important file from the point of view of training and
10 | # testing the late minutes prediction framework.
11 | #
12 | # To run this file execute:
13 | # python lmr_stn_models_training_file.py 1
14 | #
15 | # where the numeral "1" can be changed to <1|2|3|4|5> depending on the "n"
16 | # in n-previous-station models.
17 | #
18 |
19 | import joblib
20 | import pickle
21 | import sys
22 |
23 | from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
24 | from sklearn.metrics import mean_squared_error
25 |
26 | from utilities.tt_utils import TrainingTestUtils as TTU
27 |
28 | if __name__ == "__main__":
29 | n = int(sys.argv[1])
30 | ttu = TTU()
31 | stns = ttu._pdr.get_all_52trains_stations()
32 | stns_having_model = []
33 | for s in stns:
34 | df = ttu._cdr.get_n_prev_station_csv_df(s, "complete_training", n)
35 | df = ttu._get_labenc_station_df(df, n)
36 |
37 | if not df.empty:
38 | stns_having_model.append(s)
39 | target_late_mins = df.pop("crnt_stn_late_mins")
40 |
41 | # Remove unwanted columns from the data frame
42 | df = ttu.remove_unwanted_columns_df(df, n)
43 |
44 | model = LinearRegression(n_jobs=-1)
45 | model.fit(df, target_late_mins)
46 | pred_lms = model.predict(df)
47 | RMSE = mean_squared_error(target_late_mins, pred_lms)**0.5
48 | print "Linear Regression: ", s, RMSE
49 |
50 | joblib.dump(model, ttu._model_path+"lmr_models/"+str(n)+
51 | "ps_lmr_labenc_models_complete_wonps_wdts/"+s+
52 | "_label_encoding_model.sav")
53 | pickle.dump(stns_having_model, open(ttu._pdr._pdpath+"stations_having_"+
54 | str(n)+"ps_lmr_models_complete_wonps_wdts.p", "wb"))
55 |
56 |
--------------------------------------------------------------------------------
/code/rmr_stn_models_training_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file trains ridge models for 596 known stations. But the ridge
7 | # models trained were not found to be robust during evaluation and
8 | # prediction.
9 | # Therefore, not an important file from the point of view of training and
10 | # testing the late minutes prediction framework.
11 | #
12 | # To run this file execute:
13 | # python rmr_stn_models_training_file.py 1
14 | #
15 | # where the numeral "1" can be changed to <1|2|3|4|5> depending on the "n"
16 | # in n-previous-station models.
17 | #
18 |
19 | import joblib
20 | import pickle
21 | import sys
22 |
23 | from sklearn.linear_model import Ridge
24 | from sklearn.metrics import mean_squared_error
25 |
26 | from utilities.tt_utils import TrainingTestUtils as TTU
27 |
28 | if __name__ == "__main__":
29 | n = int(sys.argv[1])
30 | ttu = TTU()
31 | stns = ttu._pdr.get_all_52trains_stations()
32 | stns_having_model = []
33 | for s in stns:
34 | df = ttu._cdr.get_n_prev_station_csv_df(s, "complete_training", n)
35 | df = ttu._get_labenc_station_df(df, n)
36 |
37 | if not df.empty:
38 | stns_having_model.append(s)
39 | target_late_mins = df.pop("crnt_stn_late_mins")
40 |
41 | # Remove unwanted columns from the data frame
42 | df = ttu.remove_unwanted_columns_df(df, n)
43 | alpha_str_list = ["_1e_4", "_1e_2", "_5e_1", "_1", "_3"]
44 | alpha_list = [1e-4, 1e-2, 5e-1, 1, 3]
45 | for i in xrange(5):
46 | model = Ridge(alpha=alpha_list[i], normalize=True)
47 | model.fit(df, target_late_mins)
48 | pred_lms = model.predict(df)
49 | RMSE = mean_squared_error(target_late_mins, pred_lms)**0.5
50 | print "Ridge Regression: ", s, RMSE
51 |
52 | joblib.dump(model, ttu._model_path+"rmr"+alpha_str_list[i]+"_models/"+
53 | str(n)+"ps_rmr"+alpha_str_list[i]+
54 | "_labenc_models/"+s+"_label_encoding_model.sav")
55 | pickle.dump(stns_having_model, open(ttu._pdr._pdpath+"stations_having_"+
56 | str(n)+"ps_rmr_models_complete_wonps_wdts.p", "wb"))
57 |
58 |
--------------------------------------------------------------------------------
/code/cross_validation_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file evaluates the trained Random Forest models for each station.
7 | # Refer to the readme.txt in models/rfr_models/ to get the appropriate columns in
8 | # the data frame, hence the corresponding model ("", "_without_nps_codes",
9 | # "_wonps_wdts"). Change the last pickle-dump line according to the
10 | # columns removed.
11 | # Not an important file from the point of view of training the models and
12 | # testing the late minutes prediction framework.
13 | #
14 | # To run the file, execute:
15 | # python cross_validation_file.py rfr 2
16 | #
17 | # where the "rfr" can be changed to "lmr" and the numeral can be changed to
18 | # <1|2|3|4|5> for different n_prev_station models to be cross-validated
19 | # (not to predict late minutes during journey). "rfr" stands for random
20 | # forest regressor models and "lmr" stands for linear model regressors.
21 | #
22 | # NOTE: This file is only meant to analyse the performance of late mins
23 | # prediction, so the data frame is passed to trained models in a batch
24 | # instead of row-wise (hence no filling of predicted late mins at
25 | # previous stations as done in N-OMLMPF algorithm).
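# For illustration (a sketch, not from the original file): batch evaluation
# here amounts to one call per station over the whole cross-validation frame,
#   pred = model.predict(df)
# whereas the N-OMLMPF journey predictor scores one row at a time, feeding each
# predicted late-minutes value back into the next row's previous-station
# features.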
26 | #
27 | import joblib
28 | import pickle
29 | import sys
30 |
31 | from utilities.tt_utils import TrainingTestUtils as TTU
32 | from sklearn.metrics import mean_squared_error
33 |
34 | if __name__ == "__main__":
35 | mdl = sys.argv[1]
36 | n = int(sys.argv[2])
37 | ttu = TTU()
38 | stns = ttu._pdr.get_all_52trains_stations()
39 | rmse_list = []
40 |
41 | for s in stns:
42 | df = ttu._cdr.get_n_prev_station_csv_df(s, "cross_validation", n)
43 | df = ttu._get_labenc_station_df(df, n)
44 |
45 | if not df.empty:
46 | actual_late_mins = df.pop("crnt_stn_late_mins")
47 |
48 | # Remove unwanted columns from the data frame
49 | df = ttu.remove_unwanted_columns_df(df, n)
50 |
51 | pred_late_mins = ttu.get_predicted_late_mins_list(s, n, df, mdl)
52 | RMSE = mean_squared_error(actual_late_mins, pred_late_mins)**0.5
53 | # Create a list of Station and corresponding RMSE
54 | rmse_list.append([s, RMSE])
55 | print s, RMSE
56 | # Dump the cross validation label encoding rmse list
57 | pickle.dump(
58 | rmse_list, open(ttu._pdr._pdpath+str(n)+"ps_cv_labenc_rmse_list.p", "wb"))
59 |
--------------------------------------------------------------------------------
/code/rfr_stn_models_training_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file trains the Random Forest Regressor models for different
7 | # stations. Refer to the readme.txt in models/rfr_models/ to figure out the
8 | # correct combination of models and corresponding columns removed. Change
9 | # the directory arguments in the line where joblib is used to dump the
10 | # *.sav models.
11 | #
12 | # Prints the RMSE on the data frame on which training was done, to see the fit.
13 | #
14 | # To run this file execute:
15 | # python rfr_stn_models_training_file.py 1
16 | #
17 | # where the numeral "1" can be changed to <1|2|3|4|5> as per the value of
18 | # "n" in n-previous-station models. The output trained models are saved in
19 | # "nps_rfr_labenc_models" directory, where n can be <1|2|3|4|5>.
20 | #
21 | # IMPORTANT NOTE: Make sure to remove the unwanted columns in data frame
22 | # depending on experiments for which you want trained
23 | # models. This can be done in function:
24 | # "remove_unwanted_columns_df()" in "utilities/tt_utils.py".
25 | # 26 | 27 | import joblib 28 | import pickle 29 | import sys 30 | 31 | from sklearn.ensemble import RandomForestRegressor as RFR 32 | from sklearn.metrics import mean_squared_error 33 | 34 | from utilities.tt_utils import TrainingTestUtils as TTU 35 | 36 | if __name__ == "__main__": 37 | n = int(sys.argv[1]) # Get the n in "n previous station" 38 | ttu = TTU() 39 | stns = ttu._pdr.get_all_52trains_stations() 40 | stns_having_model = [] # Stations having n prev stations RFR models 41 | for s in stns: 42 | df = ttu._cdr.get_n_prev_station_csv_df(s, "training", n) 43 | df = ttu._get_labenc_station_df(df, n) 44 | 45 | if not df.empty: 46 | stns_having_model.append(s) 47 | target_late_mins = df.pop("crnt_stn_late_mins") 48 | 49 | # Remove unwanted columns from the data frame 50 | df = ttu.remove_unwanted_columns_df(df, n) 51 | 52 | model = RFR(n_estimators=1000, n_jobs=-1, warm_start=True) 53 | model.fit(df, target_late_mins) 54 | pred_lms = model.predict(df) 55 | RMSE = mean_squared_error(target_late_mins, pred_lms)**0.5 56 | print s, RMSE 57 | 58 | joblib.dump(model, ttu._model_path + "rfr_models/" + str(n) + 59 | "ps_rfr_labenc_models/" + s + "_label_encoding_model.sav") 60 | pickle.dump(stns_having_model, open(ttu._pdr._pdpath+ 61 | "stations_having_"+str(n)+"ps_models.p", "wb")) 62 | -------------------------------------------------------------------------------- /code/readers/csv_data_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Train Delay Estimation Project 3 | # 4 | # Author: Ramashish Gaurav 5 | # 6 | # Desc: This file reads the csv data. 7 | # 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | class CSVDataReader(object): 13 | 14 | def __init__(self, data_path=""): 15 | self._cdpath = data_path 16 | 17 | def get_train_journey_df(self, train_num, setting="training"): 18 | """ 19 | Returns the data frame of the given train. The data frame corresponds to 20 | either training or test setting. 21 | 22 | Args: 23 | train_num : Train number eg. "12307" whose data frame is required 24 | setting : <"training"|"cross_validation"|"known_test"| 25 | "unknown_test"> 26 | """ 27 | tr_grp = ("52_known_" if (setting == "training" or setting == "known_test" 28 | or setting == "cross_validation") else "83_unknown_") 29 | train_df = pd.read_csv( 30 | (self._cdpath+tr_grp+"trains_"+setting+"_folder/Train"+train_num+".csv")) 31 | return train_df 32 | 33 | def get_n_prev_station_csv_df(self, station, setting, n): 34 | """ 35 | Returns the n previous station training data frame of given station 36 | 37 | Args: 38 | station : should be one among 52trains unique stations 39 | setting : <"training"|"cross_validation"> 40 | n : <1|2|3|4|5> 41 | """ 42 | stn_csv = pd.read_csv( 43 | (self._cdpath+"52tr_stations_"+setting+"_data/"+str(n)+ 44 | "ps_"+setting+"_data/Station_"+station+".csv")) 45 | return stn_csv 46 | 47 | def get_jw_pred_late_mins_of_train_df(self, train_num, nps=4, rfr_mdl="", 48 | group="known"): 49 | """ 50 | Returns the data frame of Actual Late Mins and Predicted Late Mins for a 51 | train's cross validation data. 52 | 53 | Args: 54 | train_num : Train number eg. "12307" whose predicted late mins df 55 | is required. 56 | group : <"known"|"unknown"> 57 | nps : number of previous stations considered for prediction. 
58 | rfr_mdl : <""|"_wonps_wdts"|"_without_nps_codes">
59 | """
60 | df = pd.read_csv(self._cdpath+"rfr_model_data/"+"jrny_wise_"+group+"_trains"
61 | +"_lms_"+str(nps)+"ps"+"_labenc"+rfr_mdl+"/"+"Train_"+
62 | train_num+"_jw_lms.csv")
63 | return df
64 |
65 | def get_train_complete_journey_df(self, train_num):
66 | """
67 | Returns a complete data frame of collected data for a train.
68 |
69 | Args:
70 | train_num : Train number eg. "12307" whose complete journey df is
71 | required.
72 | """
73 | df = pd.read_csv(self._cdpath+
74 | "csv_Mar16_Feb18_all_trains_135_months_weekdays/Train"+train_num+".csv")
75 | return df
76 |
--------------------------------------------------------------------------------
/tde_service/app.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file implements a Flask REST API app for Train Delay Estimation.
7 | #
8 | # For multithreaded: http://flask.pocoo.org/docs/deploying/
9 |
10 | import env
11 |
12 | from datetime import datetime
13 | from flask import Flask
14 |
15 | import json
16 | import pandas as pd
17 | import re
18 |
19 | from code.utilities.tt_utils import TrainingTestUtils as TTU
20 | from tde_prediction import TDEPrediction as TDEP
21 |
22 | from util import log
23 |
24 | app = Flask(__name__)
25 | pd.options.mode.chained_assignment = None # Disable "SettingWithCopyWarning".
26 |
27 | DATE_PATTERN = r'^\d{4}-\d{2}-\d{2}$'
28 | # Compile the regex since it is used multiple times in the life time of this app.
29 | DATE_REGEX = re.compile(DATE_PATTERN)
30 |
31 | # Instantiate the following variables and keep them in memory because they are
32 | # not going to change throughout the life time of this app.
33 | ttu = TTU()
34 | pdr = ttu._pdr
35 |
36 | ALL_135_TRAINS = pdr.get_all_trains()
37 |
38 | STNS_WITH_N_MDLS = {
39 | "1ps": pdr.get_stations_having_nps_model_list(nps=1),
40 | ## One can add more deeper models one may have tried. E.g., 2-order in next line
41 | # "2ps": pdr.get_stations_having_nps_model_list(nps=2)
42 | }
43 |
44 |
45 |
46 | # Route when only the train number is passed.
47 | @app.route("/<train_num>", defaults={"station": None, "date": None})
48 | # Route when the train number and a date are passed.
49 | @app.route("/<train_num>/<date>", defaults={"station": None})
50 | # Route when the train number and a station code are passed.
51 | @app.route("/<train_num>/<station>/today", defaults={"date": None})
52 | # Route when the train number, station and date are passed.
53 | @app.route("/<train_num>/<station>/<date>")
54 | def accept_url(train_num, station, date):
55 | log.INFO("Train Number: %s, Station Code: %s, Date: %s"
56 | % (train_num, station, date))
57 |
58 | # Check for the validity of the train number.
59 | if train_num not in ALL_135_TRAINS:
60 | log.ERROR("Train %s not in ALL_135_TRAINS list" % train_num)
61 | return json.dumps({"Error": "Train: %s not accounted by our algorithm"
62 | % train_num, "Result": None})
63 |
64 | if not date:
65 | date = str(datetime.now().date())
66 |
67 | # TODO Check for the validity of date (Also check for 12 months 31 days)
68 | # TODO Check for past dates and error out those as invalid.
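# Sketch for the TODOs above (an assumption, not implemented here): the date
# could be parsed instead of only pattern-matched, e.g.
#   try:
#       datetime.strptime(date, "%Y-%m-%d")
#   except ValueError:
#       return json.dumps({"Error": "Date %s not correct" % date,
#                          "Result": None})
# which also rejects impossible month/day combinations; `datetime` is already
# imported above, so no new dependency would be needed.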
69 | match = DATE_REGEX.match(date)
70 | if not match:
71 | log.ERROR("Date: %s is not valid as per regex" % date)
72 | return json.dumps({"Error": "Date %s not correct" % date, "Result": None})
73 |
74 | lms_stns = TDEP().get_delay(STNS_WITH_N_MDLS, train_num, date, station)
75 | return json.dumps(lms_stns)
76 |
77 | if __name__ == "__main__":
78 | app.run(threaded=True)
79 |
--------------------------------------------------------------------------------
/code/create_pickle_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file creates all the required pickle data (used throughout the code
7 | # as pre-computed data) taking reference from the existing data on the GitHub
8 | # repo. Make sure you have created a "pickle_data" folder under the "data"
9 | # directory.
10 | #
11 |
12 | import numpy as np
13 | import pandas as pd
14 | import pickle
15 |
16 | from utilities.df_utils import TrainDataFrameUtils as TDFU
17 |
18 | class CreatePickleData(object):
19 | def __init__(self):
20 | self._tdfu = TDFU()
21 | self._pdr = self._tdfu._pdr
22 | self._cdr = self._tdfu._cdr
23 |
24 | def create_52trains_unique_stations_pickle(self):
25 | """
26 | Creates a list of unique stations covered by all 52 "training" trains.
27 | It considers the complete journey of known trains (March 2016 to Feb 2018).
28 | """
29 | trains52 = self._pdr.get_all_trains()[:52] # First 52 are Known Trains.
30 | tr52_unique_stations = []
31 | for train in trains52:
32 | df = self._cdr.get_train_complete_journey_df(train)
33 | stations = df["station_code"]
34 | tr_unique_stations = np.unique(stations)
35 | tr52_unique_stations.extend(tr_unique_stations)
36 |
37 | tr52_unique_stations = np.unique(tr52_unique_stations).tolist()
38 | pickle.dump(tr52_unique_stations,
39 | open(self._pdr._pdpath+"52trains_unique_stations.p", "wb"))
40 | print ("52 Known Trains Unique Stations pickle data dumped in pickle_data"
41 | " directory. Number of unique stations: %s"
42 | % len(tr52_unique_stations))
43 | print "-" * 80
44 |
45 | def create_135trains_unique_stations_pickle(self):
46 | """
47 | Creates a list of unique stations covered by all 135 trains (Known + Unknown
48 | trains). It considers the complete journey of trains (March 2016 to Feb 2018).
49 | """
50 | trains135 = self._pdr.get_all_trains()
51 | tr135_unique_stations = [] # To store all the unique stations for all trains.
52 | tr135_inline_stns = {} # To store the stations inline in a train's journey.
53 | for train in trains135:
54 | df = self._cdr.get_train_complete_journey_df(train)
55 | stations = df["station_code"]
56 | tr_unique_stations = np.unique(stations)
57 | tr135_unique_stations.extend(tr_unique_stations)
58 | tr135_inline_stns[train] = tr_unique_stations.tolist()
59 |
60 | tr135_unique_stations = np.unique(tr135_unique_stations).tolist()
61 | pickle.dump(tr135_unique_stations,
62 | open(self._pdr._pdpath+"135trains_unique_stations.p", "wb"))
63 | print ("135 Trains Unique Stations pickle data dumped in pickle_data"
64 | " directory. Number of unique stations: %s"
65 | % len(tr135_unique_stations))
66 | pickle.dump(tr135_inline_stns,
67 | open(self._pdr._pdpath+"trains_inline_stations_dict.p", "wb"))
68 | print "135 Trains inline stations dict dumped in pickle_data directory"
69 | print "-" * 80
70 |
71 | if __name__ == "__main__":
72 | ob = CreatePickleData()
73 | ob.create_52trains_unique_stations_pickle()
74 | ob.create_135trains_unique_stations_pickle()
75 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Train Delay Estimation
2 | This project is a first-of-its-kind attempt to learn the delay trends of
3 | Indian trains at their in-line stations. See the [doc](
4 | https://github.com/R-Gaurav/train-delay-estimation/tree/master/doc) dir for an overview
5 | presentation; papers (ITSC version accepted at [IEEE ITSC 2018](
6 | http://www.ieee-itsc2018.org) and a long version on [Arxiv](
7 | https://arxiv.org/abs/1806.02825)); and a [tutorial](
8 | https://github.com/R-Gaurav/train-delay-estimation/blob/master/doc/Tutorial.md)
9 | on using the code/ model/ data. This project is licensed under GNU GENERAL PUBLIC
10 | LICENSE Version 3.
11 |
12 | ## Team
13 | Ramashish Gaurav (2016 - ),
14 | Himadri Mishra (2018 - ),
15 | Biplav Srivastava (*main contact*).
16 | To request our collected data for research purposes, please fill [this form](https://docs.google.com/forms/d/e/1FAIpQLSc-u619QBL49KO7Lh6UvKOpSF4U1QDD-ZE0VZAqrtv-PlyehQ/viewform?usp=sf_link) and also email at:
17 | my.better.rail@gmail.com.
18 |
19 | ## Description
20 | India runs the fourth largest railway transport network, carrying
21 | over 8 billion passengers per year. However, the travel experience of
22 | passengers is frequently marked by delays, i.e., late arrival of trains at
23 | stations, causing inconvenience. In a first, we study the systemic delays
24 | in train arrivals using n-order Markov frameworks and experiment with two
25 | regression-based models. Using train running-status data collected for two
26 | years, we report on an efficient algorithm for estimating delays at
27 | railway stations with near-accurate results. This work can help
28 | railways to manage their resources, while also helping passengers
29 | and businesses served by them to efficiently plan their activities.
30 |
31 | ## Tutorial to use our code
32 | Please visit the [tutorial](
33 | https://github.com/R-Gaurav/train-delay-estimation/blob/master/doc/Tutorial.md)
34 | to find out the steps for using our code and setting up the experiment locally on
35 | your system. In the tutorial you will also find how to deploy a train delay
36 | estimation service locally on your system. On executing a REST API call, e.g.
37 | *curl http://127.0.0.1:5000/12333* (more to be found in the tutorial) you will get
38 | delay estimates (in minutes) at the in-line stations of Train 12333's journey on
39 | the current date in a JSON format (example below).
40 |
41 | `
42 | {
43 | "Result": {
44 | ..., "ALY": 322.184, "DLN": 81.23, "KIUL": 29.395, ...
45 | },
46 | "Error": null
47 | }
48 | `
49 |
50 | The list of trains for which you can avail this service is mentioned [here](
51 | https://github.com/R-Gaurav/train-delay-estimation/blob/master/trains.txt).
52 |
53 | ## Future work (how you can contribute to it...)
54 | There are many avenues for extending the current work. Please feel free to
55 | contact us for any help.
56 |
57 | - [Scaling] Expand the existing database of 135 trains (819 stations) India-wide.
58 | - [Improving] Improve the accuracy of the existing prediction framework, e.g. via
59 | time-series prediction or neural networks.
60 | - [Improving] The current prediction framework is off-line in approach, i.e. it learns by
61 | batch processing the accumulated data. A realistic prediction framework would be
62 | on-line, i.e. able to keep learning from delays and railway network dynamics
63 | throughout its lifetime.
64 |
65 | In case you decide to contribute, please go through the [PEP8](
66 | https://www.python.org/dev/peps/pep-0008/) coding conventions.
67 | The coding standards in this repository are very much based on that.
68 |
69 | --------
70 |
71 | Suggestions and contributions are welcome.
72 |
--------------------------------------------------------------------------------
/misc/explore_data.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 |
3 | t12307 <- read.csv('Train12307.csv')
4 |
5 | # Exploring the relation between latemin and distance/stations. However there can be a numerical relation between distance and
6 | # latemins, also, on the other hand we can get some latent info about delays at train stations (need to be brainstormed)
7 |
8 | aggregate(latemin~distance,t12307,mean)
9 | #aggregate(latemin~station_code,t12307,mean)
10 | #aggregate(t12307$latemin,by=list(dist=t12307$distance),mean) In place of mean, we can pass functions : sd, max, var, median
11 |
12 | #plot the latemin with distance
13 | stn <- t12307[c(1:26),3]
14 | delay <- aggregate(latemin~distance,t12307,mean)
15 | delay$station <- stn
16 |
17 | # Saving a plot in the working directory
18 | jpeg('Latemin Vs Distance.jpg')
19 | plot(delay$distance,delay$latemin,type="l")
20 | lines(delay$distance,delay$latemin,col="red")
21 | dev.off()
22 |
23 | # Set a linear model on data
24 | lin_reg <- lm(latemin ~ distance+station_code,data=t12307)
25 | summary(lin_reg) # Has R-Squared : 0.7676 (without station_code: 0.6499, so station_code is required)
26 | # We can also include more variables in formula : difference between combination of scharr and schdep with actarr and actdep
27 | # schdep-scharr = total scheduled stop time at stations and so on...
28 |
29 | ### 3rd April ############################################################################################
30 |
31 | time_sarr <- strptime(x=as.character(t12307$scharr), format="%H:%M")
32 | time_aarr <- strptime(x=as.character(t12307$actarr), format="%H:%M")
33 |
34 | time_delay <- (time_aarr-time_sarr)/60 # Divide by 60 to convert from seconds to minutes
35 | # This is same as "latemin"
36 | # However we want to get total delay minutes up to the station in query, and not the delay minutes at that station, i.e. each row
37 | # will have delay minutes up to one station before; for example, for stations A, B, C, and D, B will have delay minutes the train
38 | # got up to station A, for station C -> up to station B, for station D -> up to station C.
39 |
40 | time_delay <- t12307$latemin
41 | time_delay <- time_delay[seq(1,length(time_delay)-1)] # Shift time delay by one station ahead
42 | # prepend 0 to time_delay now...
43 | time_delay <- c(0,time_delay)
44 | # Now for every instance of journey in data, the time_delay at the source station should be 0,
45 | no_of_stations <- length(t12307$station_code)
46 | time_delay[seq(1,length(time_delay),no_of_stations)] <- 0 # Set 0 time_delay at each source station
47 | time_delay[time_delay < 0] <- 0 # Remove negative values from time_delay column
48 | t12307$time_delay<- time_delay # Add the column time_delay to data frame
49 |
50 | # Fit a linear model
51 | lin_reg <- lm(latemin ~ distance+station_code+time_delay, data=t12307)
52 | summary(lin_reg)
53 | # R-squared : 0.9834, Outstanding model, but it was expected, as "latemin" can have a linear relation with "time_delay"
54 | # and it was evident in the summary of the model, since the slope came out to be 1.02 and the statistical importance of "distance" vanished.
55 | # In a real environment we would not have time_delay up to a query station, so we can compute time_delay on the mean of delays
56 | # ("latemin") so far and fit a linear model on it
57 |
58 |
59 | ## Use mean of "latemin" to construct time_delay column
60 | no_of_days <- nrow(t12307)/length(levels(t12307$station_code))
61 | delay <- aggregate(latemin~distance,t12307,mean)
62 | delay <- rep(delay,no_of_days)
63 | delay <- delay[seq(1,length(delay)-1)]
64 |
65 | delay <- c(0,delay)
66 | delay[delay<0] <- 0
67 | delay[seq(1,length(delay),no_of_stations)] <- 0
68 |
69 | lin_reg <- lm(latemin ~ distance+station_code+delay, data = t12307)
70 | summary(lin_reg)
71 | # R-squared : 0.76
72 |
73 | ## Looks like "distance" has no role to play in the presence of "station_code". Also there's an abnormal behaviour if "delay" is used
74 | # instead of "time_delay". Investigate further !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/metadata_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script sets up the directory structure for executing the code.
4 |
5 | echo "######################################"
6 | echo "# #"
7 | echo "# TRAIN DELAY ESTIMATION #"
8 | echo "# #"
9 | echo "######################################"
10 |
11 | echo "Installing the required dependencies in requirements.txt"
12 | pip install -r requirements.txt
13 | echo "All required python libraries installed."
14 | yes '' | sed 5q # Echo 5 blank lines.
15 |
16 | echo "Setting up the metadata (directory structure)..."
17 | yes '' | sed 5q # Echo 5 blank lines.
18 |
19 | # Untarring the tar data file.
20 | echo "Untarring data... Train_Delay_Estimation_Data_March_2016_February_2018.tar"
21 | tar -vxf Train_Delay_Estimation_Data_March_2016_February_2018.tar
22 | echo "Untarring done!"
23 | echo "*************************************************************************"
24 | yes '' | sed 5q # Echo 5 blank lines.
25 |
26 | echo "Renaming 'Train_Delay_Estimation_Data_March_2016_February_2018' to 'data'"
27 | mv Train_Delay_Estimation_Data_March_2016_February_2018 data
28 | echo "Renaming done!"
29 | echo "*************************************************************************"
30 | yes '' | sed 5q # Echo 5 blank lines.
31 |
32 | # Setting up the directory structure where training data would be saved.
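# (Sketch of the resulting layout, per the mkdir loop that follows:)
#   data/52tr_stations_training_data/1ps_training_data
#   ...
#   data/52tr_stations_training_data/5ps_training_data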
33 | echo "Creating a new directory '52tr_stations_training_data' inside 'data'" 34 | echo "directory where Known Station's n-previous station training data-frames" 35 | echo "would be saved, which would be later used to train Random Forest Regressor" 36 | echo "models." 37 | mkdir data/52tr_stations_training_data 38 | 39 | echo "Creating subdirectories inside '52tr_stations_training_data', where" 40 | echo "Known Station's respective n-previous-station data-frames will be saved." 41 | for n in {1..5} 42 | do 43 | echo "Creating '"$n"ps_training_data' to store $n-previous station data-frames." 44 | mkdir data/52tr_stations_training_data/"$n"ps_training_data 45 | echo "-----------------------------------------------------------------------" 46 | done 47 | echo "Setting up the directory structure for saving training data done!" 48 | echo "*************************************************************************" 49 | yes '' | sed 5q # Echo 5 blank lines. 50 | 51 | # Setting up the directory structure where trained models would be saved. 52 | echo "Creating a new directory 'models' where your trained Random Forest" 53 | echo "Regressor (RFR) models would be saved." 54 | mkdir models 55 | mkdir models/rfr_models 56 | for n in {1..5} 57 | do 58 | echo "Creating '"$n"ps_rfr_labenc_models' to store the RFR models trained" 59 | echo "from $n-previous station training data-frames." 60 | mkdir models/rfr_models/"$n"ps_rfr_labenc_models 61 | echo "-----------------------------------------------------------------------" 62 | done 63 | echo "Setting up the directory structure for saving RFR trained models done!" 64 | echo "*************************************************************************" 65 | yes '' | sed 5q # Echo 5 blank lines. 66 | 67 | # Setting up the directory structure for saving predicted late minutes and 68 | # correspoding RMSEs of test journey data. 69 | echo "Creating a subdirectory 'rfr_model_data' inside 'data' directory to save" 70 | echo "the predicted late minutes of test journey data." 71 | mkdir data/rfr_model_data 72 | for n in {1..5} 73 | do 74 | echo "Creating 'jrny_wise_known_trains_lms_"$n"ps_labenc' to store journey" 75 | echo "wise predicted late-minutes of Known Train's test data with "$n"-OMLMPF." 76 | mkdir data/rfr_model_data/jrny_wise_known_trains_lms_"$n"ps_labenc 77 | echo "-----------------------------------------------------------------------" 78 | echo "Creating 'jrny_wise_unknown_trains_lms_"$n"ps_labenc' to store journey" 79 | echo "wise predicted late-minutes of Unknown Train's test data with "$n"-OMLMPF." 80 | mkdir data/rfr_model_data/jrny_wise_unknown_trains_lms_"$n"ps_labenc 81 | echo "-----------------------------------------------------------------------" 82 | done 83 | echo "Setting up the directory structure for saving predicted late-minutes done!" 84 | echo "*************************************************************************" 85 | yes '' | sed 5q # Echo 5 blank lines. 86 | 87 | echo "Creating a subdirectory 'rfr_model_pickle_data' inside 'data/pickle_data'" 88 | echo "to save RMSEs of predicted late-minutes in pickle format." 89 | mkdir -p data/pickle_data/rfr_model_pickle_data 90 | for n in {1..5} 91 | do 92 | echo "Creating 'rmse_of_jrny_wise_lms_pred_known_trains_"$n"ps' to store RMSE" 93 | echo "of journey wise predicted late minutes from "$n"-OMLMPF algorithm for" 94 | echo "Known Trains in pickle format." 
95 | mkdir -p data/pickle_data/rfr_model_pickle_data/rmse_of_jrny_wise_lms_pred_known_trains_"$n"ps
96 | echo "-----------------------------------------------------------------------"
97 | echo "Creating 'rmse_of_jrny_wise_lms_pred_unknown_trains_"$n"ps' to store RMSE"
98 | echo "of journey wise predicted late minutes from "$n"-OMLMPF algorithm for"
99 | echo "Unknown Trains in pickle format."
100 | mkdir -p data/pickle_data/rfr_model_pickle_data/rmse_of_jrny_wise_lms_pred_unknown_trains_"$n"ps
101 | echo "-----------------------------------------------------------------------"
102 | done
103 | echo "Setting up the directory structure for saving RMSEs in pickle format done!"
104 | echo "*************************************************************************"
105 | yes '' | sed 5q # Echo 5 blank lines.
106 |
107 | echo "#########################################################################"
108 | echo "# You are all set up to run the codes as per your convenience. #"
109 | echo "# It is advised to go through the above output messages to understand #"
110 | echo "# the overall directory structure. #"
111 | echo "#########################################################################"
112 |
--------------------------------------------------------------------------------
/code/readers/pickle_data_reader.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file reads the pickle data.
7 | #
8 |
9 | import pickle
10 | import numpy as np
11 |
12 |
13 | class PickleDataReader(object):
14 |
15 | def __init__(self, data_path=""):
16 | self._pdpath = data_path+"pickle_data/"
17 |
18 | def get_all_trains(self):
19 | """
20 | Returns a list of all 135 trains' train numbers.
21 | First 52 trains in list are Known Trains. Next 83 trains are Unknown Trains.
22 | """
23 | all_trains = pickle.load(open(self._pdpath+"all_trains135.p", "rb"))
24 | return all_trains
25 |
26 | def get_all_52trains_stations(self):
27 | """
28 | Returns a list of all 596 Known Stations of Known Trains.
29 | """
30 | stations_52trains = pickle.load(
31 | open(self._pdpath+"52trains_unique_stations.p", "rb"))
32 | return stations_52trains
33 |
34 | def get_all_135trains_stations(self):
35 | """
36 | Returns a list of all 799 Known Stations + Unknown Stations of all Known
37 | Trains and Unknown Trains.
38 | """
39 | stations_135trains = pickle.load(
40 | open(self._pdpath+"135trains_unique_stations.p", "rb"))
41 | return stations_135trains
42 |
43 | def get_labenc_train_type_dict(self):
44 | """
45 | Returns a dictionary of train type (key) vs numeric label (value).
46 | """
47 | train_type_dict = pickle.load(
48 | open(self._pdpath+
49 | "label_encodings/all_train_types_label_encoding_dict.p", "rb"))
50 | return train_type_dict
51 |
52 | def get_labenc_zone_dict(self):
53 | """
54 | Returns a dictionary of zone (key) vs numeric label (value).
55 | """
56 | zone_dict = pickle.load(
57 | open(self._pdpath+
58 | "label_encodings/all_zones_label_encoding_dict.p", "rb"))
59 | return zone_dict
60 |
61 | def get_labenc_month_dict(self):
62 | """
63 | Returns a dictionary of month (key) vs numeric label (value).
64 | """
65 | month_dict = pickle.load(
66 | open(self._pdpath+
67 | "label_encodings/all_months_label_encoding_dict.p", "rb"))
68 | return month_dict
69 |
70 | def get_labenc_weekday_dict(self):
71 | """
72 | Returns a dictionary of weekday (key) vs numeric label (value).
73 | """
74 | weekday_dict = pickle.load(
75 | open(self._pdpath+
76 | "label_encodings/all_weekdays_label_encoding_dict.p", "rb"))
77 | return weekday_dict
78 |
79 | def get_labenc_station_dict(self):
80 | """
81 | Returns a dict of station (key) vs numeric label (value).
82 | It is supposed to be the universal set of all 4359 stations in India, for which
83 | numeric labels are assigned randomly.
84 | """
85 | station_dict = pickle.load(
86 | open(self._pdpath+
87 | "label_encodings/all_stations_label_encoding_dict.p", "rb"))
88 | return station_dict
89 |
90 | def get_station_degree_strength_dict(self):
91 | """
92 | Returns a dictionary of station (key) vs degree strength (value).
93 | This dictionary contains info about only 799 Known and Unknown Stations.
94 | """
95 | stn_deg_strength = pickle.load(
96 | open(self._pdpath+"station_degree_strength_dict.p", "rb"))
97 | return stn_deg_strength
98 |
99 | def get_station_traffic_strength_dict(self):
100 | """
101 | Returns a dictionary of station (key) vs traffic strength (value).
102 | This dictionary contains info about only 799 Known and Unknown Stations.
103 | """
104 | stn_tfc_strength = pickle.load(
105 | open(self._pdpath+"station_traffic_strength_dict.p", "rb"))
106 | return stn_tfc_strength
107 |
108 | def get_station_coordinates_dict(self):
109 | """
110 | Returns a dictionary of station (key) vs a tuple of latitude and longitude
111 | of station (value).
112 | This dictionary contains info about only 799 Known and Unknown Stations.
113 | """
114 | stn_coordinate = pickle.load(
115 | open(self._pdpath+"station_to_lat_lng_dict.p", "rb"))
116 | return stn_coordinate
117 |
118 | def get_known_596_stations_features_df(self):
119 | """
120 | Returns a pandas DataFrame of station features of all 596 Known Stations.
121 | Valid Known Stations depending on their presence in `stations_having_nps_
122 | models.p` would be chosen from here to perform kNN on them to find the nearest
123 | Known Station for an Unknown Station.
124 | """
125 | stn_ftrs_df = pickle.load(
126 | open(self._pdpath+"known_596_stations_features_df.p", "rb"))
127 | return stn_ftrs_df
128 |
129 | def get_stations_having_nps_model_list(self, nps):
130 | """
131 | Returns a list of stations which have an n_previous_station model.
132 | Args:
133 | nps : n in n_previous_stations models
134 | """
135 | stns_hvng_nps_mdls = pickle.load(
136 | open(self._pdpath+"stations_having_"+str(nps)+"ps_models.p", "rb"))
137 | return stns_hvng_nps_mdls
138 |
139 | def get_rmse_of_journey_wise_lms_pred_list(self, n, group, train, rfr_mdl=""):
140 | """
141 | Returns a list of RMSEs of different journeys undertaken by a train in
142 | given group and rfr_mdl with N-OMLMPF (depending on the value of n).
143 |
144 | Args:
145 | n : <1|2|3|4|5>
146 | group : <"known"|"unknown">
147 | rfr_mdl : <""|"_wonps_wdts">
148 | train : A five digit train number eg. "12307"
149 | """
150 | rmse_list = pickle.load(
151 | open(self._pdpath+"rfr_model_pickle_data/rmse_of_jrny_wise_lms_pred"+
152 | "_"+group+"_trains_"+str(n)+"ps"+rfr_mdl+"/Train_"+train+"_jw_rmse.p",
153 | "rb"))
154 | return rmse_list
155 |
156 | def get_all_trains_inline_stations_dict(self):
157 | """
158 | Returns a dict with the train number as key and the list of stations
159 | inline in its journey as value.
160 | """
161 | train_stns_dict = pickle.load(
162 | open(self._pdpath+"trains_inline_stations_dict.p", "rb"))
163 | return train_stns_dict
164 |
--------------------------------------------------------------------------------
/code/create_training_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file creates data for training and evaluating the different models.
7 | #
8 | # To create training data to train models, make sure that the running
9 | # status data of trains are in "data/trains_training_file" and similarly
10 | # for test data of trains.
11 | #
12 | # Output station wise data frames are stored in
13 | # "data/52tr_stations_training_data/<n>ps_training_data/" depending on the
14 | # value of n.
15 | #
16 | # To run this file execute the following command (for both the functions).
17 | #
18 | # python create_training_data.py training 3
19 | #
20 | # It creates data frames of "training" setting for a current station with
21 | # 3 previous stations.
22 | #
23 | # This file also has a function to generate the known 596 stations
24 | # features data frame.
25 | # Station Features DF: ["Station", "latitude", "longitude",
26 | # "traffic_strength", "degree_strength"]
27 | #
28 | # To run a specific function, uncomment it in __main__ section.
29 | #
30 |
31 | import sys
32 | import pandas as pd
33 | import pickle
34 |
35 | from joblib import Parallel, delayed
36 |
37 | from utilities.df_utils import TrainDataFrameUtils as TDFU
38 |
39 | def generate_known_current_station_df(
40 | tdfu, current_station, setting="complete_training", n=3):
41 | """
42 | Returns a data frame of the current station. The data frame consists of n
43 | previous station features to the current station. If setting is set to
44 | "cross_validation", it generates data frames similar to "training" ones from
45 | Known Trains, used only to evaluate the models, not to cross-validate the
46 | late minutes prediction algorithm.
47 |
48 | Args:
49 | tdfu : An object of TrainDataFrameUtils
50 | current_station : A known station name which should be one among
51 | the stations of 52 trains. eg. "CNB"
52 | setting : <"training"|"cross_validation"|"complete_training">
53 | n : Number of previous stations to the current station preferred
54 | <1|2|3|4|5>, default value is 3
55 | """
56 | station_df = []
57 | column_names_list = tdfu._get_column_names_list(n)
58 | trains = tdfu._pdr.get_all_trains()
59 | trains52 = trains[:52] # Choose the first 52 trains which are Known Trains
60 | # Rest 83 trains in list are Unknown Trains.
61 |
62 | # Iterate over all trains.
63 | # Get the complete df of each train.
64 | # Get the single journey df out of a complete df of each train.
65 | # For each single journey df find the station which is the current station
66 | # and append the n previous stations info to the station_df.
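# (Illustrative sketch of one appended row, inferred from the feature_list
# construction in the loop below; for n = 3 it is roughly:)
#   [train_type, zone, is_superfast, month, weekday,
#    3 previous station codes, 3 previous-station late minutes,
#    3 inter-station distances, 3 distances from source,
#    3 traffic strengths, 3 degree strengths,
#    crnt_stn_tfc, crnt_stn_deg, crnt_stn_dist_frm_src, crnt_stn_late_mins]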
67 |   for train_num in trains52:
68 |     train_df = tdfu._cdr.get_train_journey_df(train_num, setting)
69 | 
70 |     # Get all the source station rows of each journey
71 |     source_rows = train_df[train_df.scharr=="Source"].index.tolist()
72 |     for i in range(len(source_rows)):
73 |       sj_df = tdfu._generate_single_journey_df(train_df, i, source_rows)
74 | 
75 |       # Choose the required columns
76 |       sj_df = sj_df[["station_code", "distance",
77 |                      "month", "weekday", "latemin"]]
78 |       station_list = sj_df["station_code"].tolist() # Obtain the station list
79 | 
80 |       # Check if the sj_df is wrong due to an extended journey
81 |       if station_list != sj_df.station_code.unique().tolist():
82 |         print "Repeated stations found, Wrong DF, Check Train: ", train_num
83 |         print "Obtained stations: ", station_list
84 |         print "Actual stations: ", sj_df.station_code.unique().tolist()
85 |         return
86 |       else:
87 |         for j in range(n+source_rows[i], len(station_list)+source_rows[i]):
88 |           station = station_list[j-source_rows[i]]
89 |           if station == current_station:
90 |             # train_type, zone, is_superfast, month, weekday
91 |             feature_list = [tdfu._generate_train_type_str(train_num),
92 |                             tdfu._generate_zone_str(train_num),
93 |                             tdfu._is_superfast_str(train_num),
94 |                             tdfu._generate_month_str(sj_df, j),
95 |                             tdfu._generate_weekday_str(sj_df, j)]
96 |             # n_prev_station
97 |             feature_list.extend(
98 |                 tdfu._generate_n_prev_station_codes_list(sj_df, j, n))
99 |             # n_ps_late_mins
100 |             feature_list.extend(
101 |                 tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, n))
102 |             # dist_bwn_stn_n-1_n
103 |             feature_list.extend(
104 |                 tdfu._generate_n_prev_dist_bwn_stn_list(sj_df, j, n))
105 |             # stn_n_dist_frm_src
106 |             feature_list.extend(
107 |                 tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, n))
108 |             # tfc_of_stn_n
109 |             feature_list.extend(
110 |                 tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, n))
111 |             # deg_of_stn_n
112 |             feature_list.extend(
113 |                 tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, n))
114 |             # crnt_stn_tfc, set n = 0
115 |             feature_list.extend(
116 |                 tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, 0))
117 |             # crnt_stn_deg, set n = 0
118 |             feature_list.extend(
119 |                 tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, 0))
120 |             # crnt_stn_dist_frm_src, set n = 0
121 |             feature_list.extend(
122 |                 tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, 0))
123 |             # crnt_stn_late_mins, set n = 0
124 |             feature_list.extend(
125 |                 tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, 0))
126 | 
127 |             station_df.append(feature_list)
128 | 
129 |   station_df = pd.DataFrame(station_df, columns = column_names_list)
130 |   station_df.to_csv((tdfu._cdr._cdpath + "52tr_stations_" + setting+"_data/" +
131 |                      str(n) + "ps_" + setting + "_data/Station_" +
132 |                      current_station + ".csv"), index=False)
133 |   print "Station: ", current_station, " Done!"
134 |   return station_df
135 | 
136 | def generate_known_stations_features_df(pdr):
137 |   """
138 |   This function generates the known stations' features data frame, helpful in
139 |   projecting unknown stations onto known stations.
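  The resulting frame has one row per Known Station, carrying its code,
  latitude, longitude, degree strength and traffic strength.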
140 | 
141 |   Args:
142 |     pdr : A Pickle Data Reader object
143 |   """
144 |   known_stations = pdr.get_all_52trains_stations()
145 |   stn_ftrs_df = []
146 |   columns = ["Station", "Latitude", "Longitude", "Degree_Strength",
147 |              "Traffic_Strength"]
148 |   geo_crdnates = pdr.get_station_coordinates_dict()
149 |   deg_strength = pdr.get_station_degree_strength_dict()
150 |   tfc_strength = pdr.get_station_traffic_strength_dict()
151 |   for stn in known_stations:
152 |     stn_ftrs_df.append([stn, geo_crdnates[stn][0], geo_crdnates[stn][1],
153 |                         deg_strength[stn], tfc_strength[stn]])
154 |   stn_ftrs_df = pd.DataFrame(stn_ftrs_df, columns=columns)
155 |   pickle.dump(stn_ftrs_df,
156 |               open(pdr._pdpath + "known_596_stations_features_df.p", "wb"))
157 | 
158 | if __name__ == '__main__':
159 |   setting = sys.argv[1]
160 |   n = int(sys.argv[2])
161 |   tdfu = TDFU()
162 |   pdr = tdfu._pdr
163 |   stns_of_52trains = pdr.get_all_52trains_stations() # Get all Known Stations.
164 | ################################################################################
165 |   # To create training or cross-validation data; runs in parallel on all cores (n_jobs=-1).
166 |   Parallel(n_jobs=-1)(delayed(generate_known_current_station_df)(tdfu, stn,
167 |       setting, n) for stn in stns_of_52trains)
168 | ################################################################################
169 |   # To create the stations' features data frame.
170 |   # generate_known_stations_features_df(pdr)
171 | ################################################################################
172 | 
--------------------------------------------------------------------------------
/code/known_trains_lms_pred.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file predicts the journey late minutes of known trains (52 trains).
7 | #       Prints the RMSE of each journey.
8 | #
9 | #       The path to the saved trained models, which would be loaded and employed
10 | #       to predict late mins, must be specified in function
11 | #       "get_predicted_late_mins_list()" in file: "utilities/tt_utils.py".
12 | #
13 | #       N-Order Markov Late Minutes Prediction Framework for Known Trains.
14 | #
15 | #       To run this file execute:
16 | #
17 | #       `python known_trains_lms_pred.py rfr 1`
18 | #
19 | #       to run Random Forest Regressor station models to predict late minutes
20 | #       considering 1 previous station, and store the late mins predictions in
21 | #       the "jrny_wise_known_trains_lms_1ps_labenc" directory and corresponding
22 | #       RMSEs in the "rmse_of_jrny_wise_lms_pred_known_trains_1ps" directory.
23 | #
24 | #       IMPORTANT NOTE: Make sure to remove the unwanted columns in the data
25 | #                       frame depending on the experiments. This can be done in
26 | #                       function: "remove_unwanted_columns_df()" defined in
27 | #                       "utilities/tt_utils.py", which eventually gets called in
28 | #                       "get_predicted_late_mins_at_station_float()".
29 | #
30 | #
31 | 
32 | import pickle
33 | import pandas as pd
34 | import sys
35 | 
36 | from sklearn.metrics import mean_squared_error
37 | 
38 | from utilities.tt_utils import TrainingTestUtils as TTU
39 | 
40 | def get_journey_wise_late_mins_of_known_trains(
41 |     ttu, train_num, setting, mdl, n, exp_lms_output_dir, exp_rmse_output_dir):
42 |   """
43 |   Finds the journey-wise late minutes of Known Trains, i.e. the first 52 trains
44 |   of all 135 trains whose data has been collected so far.
45 | 
46 |   The data of the first 52 trains has been used for training the station models.
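  For each journey, the station-wise actual and predicted late minutes are
  written to a CSV file, and the per-journey RMSEs are pickled.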
47 | 
48 |   Args:
49 |     ttu : An object of TrainingTestUtils
50 |     train_num : A five digit train number string eg. "12307"
51 |     setting : <"training"|"cross_validation"|"known_test">
52 |     mdl : <"rfr"|"lmr">
53 |           "rfr": Random Forest Regressor Models
54 |           "lmr": Linear Model Regressor Models
55 |     n : Value of n in n-prev-station or n-OMLMPF.
56 |     exp_lms_output_dir : <"jrny_wise_known_trains_lms_1ps_labenc" | ..>
57 |                          Desired output directory of predicted latemins.
58 |     exp_rmse_output_dir : <"rmse_of_jrny_wise_lms_pred_known_trains_1ps"
59 |                           |..>
60 |                           Desired output directory of predicted latemins
61 |                           RMSEs. Make sure it stays aligned with
62 |                           `exp_lms_output_dir`.
63 | 
64 |   """
65 |   pred_lms_df = [] # To capture the predicted late mins for each journey
66 |   pred_lms_rmse = [] # Late Minutes RMSE for each journey
67 |   columns = ["Stations", "ActualLateMins", "PredictedLateMins"]
68 |   train_df = ttu._cdr.get_train_journey_df(train_num, setting)
69 | 
70 |   # Get all the source station rows of each journey in train_df
71 |   source_rows = train_df[train_df.scharr=="Source"].index.tolist()
72 | 
73 |   for i in range(len(source_rows)):
74 |     # Obtain the single journey dataframe
75 |     sj_df = ttu._tdfu._generate_single_journey_df(train_df, i, source_rows)
76 |     sj_df = sj_df[["station_code", "distance", "month", "weekday", "latemin"]]
77 | 
78 |     # Obtain the current single journey station list
79 |     stn_list_sj = sj_df["station_code"].tolist()
80 |     actual_late_mins_sj = sj_df["latemin"]
81 |     pred_late_mins_sj = [0] # Assuming 0 late mins for the source station
82 | 
83 |     # Uncomment the following lines in the `if else` cases accordingly, as per
84 |     # the value of N in N-OMLMPF. If N is chosen to be 3, it implies we will
85 |     # consider only 3-previous-station models of suitable stations to predict
86 |     # the late minutes. Here, the value of N is chosen as 1, so the other `else`
87 |     # parts of the code are commented out. Uncomment to generate the desired results for different N.
88 |     #
89 |     # Depending on the Experiment you choose, make sure to remove the unwanted
90 |     # columns in the call of function "get_predicted_late_mins_at_station_float".
91 | 
92 |     for j in range(1, len(stn_list_sj)):
93 |       try: # Try to predict the late minutes for this station in the single journey.
94 |         if (j == 1 or n == 1): # valid for only 1 previous station.
95 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
96 |               j+source_rows[i], 1, stn_list_sj[j], pred_late_mins_sj, j, mdl)
97 |           pred_late_mins_sj.append(plm)
98 |           continue
99 |         if (j == 2 or n == 2): # valid for only 2 previous stations.
100 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
101 |               j+source_rows[i], 2, stn_list_sj[j], pred_late_mins_sj, j, mdl)
102 |           pred_late_mins_sj.append(plm)
103 |           continue
104 |         if (j == 3 or n == 3): # valid for only 3 previous stations.
105 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
106 |               j+source_rows[i], 3, stn_list_sj[j], pred_late_mins_sj, j, mdl)
107 |           pred_late_mins_sj.append(plm)
108 |           continue
109 |         if (j == 4 or n == 4): # valid for only 4 previous stations.
110 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
111 |               j+source_rows[i], 4, stn_list_sj[j], pred_late_mins_sj, j, mdl)
112 |           pred_late_mins_sj.append(plm)
113 |           continue
114 |         if (j == 5 or n == 5): # the rest of the stations are valid for 5 previous stations.
115 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
116 |               j+source_rows[i], 5, stn_list_sj[j], pred_late_mins_sj, j, mdl)
117 |           pred_late_mins_sj.append(plm)
118 |           continue
119 | 
120 |       # Case when a new station comes whose trained model does not exist.
121 |       except KeyError as e:
122 |         # A KeyError is obtained while creating the row data frame for a station
123 |         # when the previous station is not present in the station-to-index dict.
124 |         # Hence, set the late minutes at the current station as those at the previous one.
125 |         pred_late_mins_sj.append(pred_late_mins_sj[j-1])
126 |       except Exception as e:
127 |         # Set the late minutes at the station for which no trained model exists
128 |         # as the late minutes at the immediately previous station.
129 |         print e
130 |         pred_late_mins_sj.append(pred_late_mins_sj[j-1])
131 | 
132 |     # Construct the data frame of Station Code, Actual Late Mins and
133 |     # Predicted Late Mins for each journey
134 |     for ele in zip(zip(stn_list_sj, actual_late_mins_sj), pred_late_mins_sj):
135 |       pred_lms_df.append([ele[0][0], ele[0][1], ele[1]])
136 | 
137 |     # Mark the end of the current journey
138 |     pred_lms_df.append(["JRNY END", "-------", "-------"])
139 |     # Store the RMSE of each journey for a train
140 |     rmse = mean_squared_error(actual_late_mins_sj, pred_late_mins_sj)**0.5
141 |     pred_lms_rmse.append(rmse)
142 |     # Print the RMSE of each journey for the given train "train_num"
143 |     print "Train Number:", train_num, "RMSE:", rmse
144 | 
145 |   pred_lms_df = pd.DataFrame(pred_lms_df, columns=columns)
146 |   pred_lms_df.to_csv(ttu._cdr._cdpath+mdl+"_model_data/" + exp_lms_output_dir +
147 |                      "/Train_" + train_num + "_jw_lms.csv", index=False)
148 |   pickle.dump(pred_lms_rmse, open(ttu._pdr._pdpath+mdl+"_model_pickle_data/" +
149 |       exp_rmse_output_dir + "/Train_" + train_num + "_jw_rmse.p", "wb"))
150 | 
151 | 
152 | if __name__ == "__main__":
153 |   mdl = sys.argv[1] # Accept <"rfr"|"lmr">.
154 |   n = sys.argv[2] # Accept the n in n-OMLMPF (n-prev-stns to consider).
155 |   # Create this directory to store the predicted late minutes in each experiment
156 |   # for different values of n in nps.
157 |   exp_lms_output_dir = "jrny_wise_known_trains_lms_%sps_labenc" % n
158 | 
159 |   # Create this directory to store the RMSEs for the predicted late minutes in
160 |   # each experiment for different values of n. Make sure it stays aligned with
161 |   # exp_lms_output_dir.
162 |   exp_rmse_output_dir = "rmse_of_jrny_wise_lms_pred_known_trains_%sps" % n
163 | 
164 |   ttu = TTU()
165 |   trains52 = ttu._pdr.get_all_trains()[:52] # Choose the first 52 trains, which
166 |                                             # are Known Trains.
167 |   for train in trains52:
168 |     get_journey_wise_late_mins_of_known_trains(
169 |         ttu, train, "known_test", mdl, int(n), exp_lms_output_dir,
170 |         exp_rmse_output_dir)
171 | 
--------------------------------------------------------------------------------
/doc/Tutorial.md:
--------------------------------------------------------------------------------
1 | # Tutorial: Using Train Delay Estimation
2 | 
3 | ## Background
4 | We collected Train Running Status data (in the same format as shown in [NTES](
5 | https://enquiry.indianrail.gov.in/mntes/)) for a period of two years, from March 2016
6 | to February 2018, for 135 trains that pass through MGS (Mughalsarai Station, one of
7 | the busiest stations in India). After the required preprocessing of the data, we
8 | found that delays at stations depend on the month during which the journey is made,
9 | as well as on the stations previous to the current station (at which we
10 | seek the predicted delay). As learning algorithms, we
11 | used Random Forest Regressors and Ridge Regressors to
12 | devise a zero-shot competent, scalable and train-agnostic late minutes prediction
13 | framework inspired by Markov Processes. We name our prediction framework the
14 | *N*-Order Markov Late Minutes Prediction Framework (*N*-OMLMPF).
15 | 
16 | The *N*-OMLMPF inputs a train number, its journey route information (i.e.
17 | stations along its journey route, distances of stations from the source etc. - for
18 | more information, please see our papers in the doc directory), the station at
19 | which the user wishes to know the expected delay, and a date. It then
20 | outputs the expected delay at that particular station.
21 | 
22 | The above presents only the gist of our work; it is highly recommended to go
23 | through our papers mentioned above before proceeding ahead.
24 | 
25 | The code is highly commented with function docstrings. Please let us know if you
26 | need help understanding them or setting up the experiments. The best way to set up
27 | an experiment environment on your system is to download and install [Anaconda](
28 | https://www.anaconda.com/download/).
29 | 
30 | ## How to Use Code and Other Artifacts
31 | Upon being contacted, we will share the raw data but not the pre-trained models. Each
32 | saved pre-trained model is approximately 40MB or more, hence it is not feasible to
33 | share all of them. However, with the help of the following simple steps, one can easily
34 | train the prediction models and use them for predicting delays at railway stations.
35 | 
36 | The tutorial below details the steps to train Random Forest Regressor models (the
37 | most effective ones compared to Ridge Regressor models) on n-prev-stns training
38 | data frames of Known Stations, with station codes removed from them (refer to the
39 | Experiments and Result Analysis section in our paper - Exp 3, Exp 4).
40 | 
41 | To set up an experimental environment, follow the steps below precisely, in the
42 | same order as mentioned. The preferred environment is Linux.
43 | 
44 | **Note**: The code works with Python 2.7
45 | 
46 | ### Setting up the directory structure.
47 | 1> Clone this repo on your local system by executing the below command.\
48 | `git clone https://github.com/R-Gaurav/train-delay-estimation.git`
49 | 
50 | 2> After you download the tar file of data:
51 | `Train_Delay_Estimation_Data_March_2016_February_2018.tar`, move it inside the
52 | `train-delay-estimation` directory. You can download this data by contacting us
53 | at my.better.rail@gmail.com.
54 | 
55 | 3> From inside the `train-delay-estimation` directory execute:
56 | `./metadata_setup.sh` to set up the required directory structure along
57 | with automatically installing the dependencies in **requirements.txt**.
58 | 
59 | ### Setting up the environment variables in file **env.py**
60 | 1> Navigate to the directory **train-delay-estimation/code/utilities**.
61 | 
62 | 2> Open **env.py**.
63 | 
64 | 3> Set the `project_dir_path` variable to the location where you have downloaded
65 | the **train-delay-estimation** directory.
66 | 
67 | 4> Save **env.py**.
68 | 
69 | ### Creating pickle data
70 | 1> Move to the **code** directory.
71 | 
72 | 2> We have already provided some data in pickle format, which were either manually
73 | created or collected from the internet via REST APIs. However, you still need to
74 | create a few more data files in pickle format.
75 | 
76 | To do this, just execute `python create_pickle_data.py`.
77 | 
78 | ### Creating the training data (Table III in paper) to train the models
79 | 1> Move to the **code** directory.
80 | 
81 | 2> Execute: `python create_training_data.py training 1` to create the training
82 | data for 1-previous-station data frames; similarly, replace `1` with <`2`,`3`,`4`,`5`> to
83 | create the data with respect to that many previous stations.
84 | 
85 | On a system with 4 logical i5 cores (you can get the number of logical cores on
86 | your system by executing `htop` or `top` (followed by pressing the `1` key)), it
87 | takes nearly 7 hours to prepare the 1-prev-stn data frames. For the 2-prev-stn data
88 | frames it takes 9 hours, so expect the time to keep increasing for higher numbers
89 | of previous stations.
90 | 
91 | NOTE: The data frames are created in parallel; computation is done on all cores.
92 | 
93 | For more information, go through the description mentioned in the file:
94 | `create_training_data.py`.
95 | 
96 | ### Training the regression models
97 | 1> Move to the **code** directory.
98 | 
99 | 2> Execute `python rfr_stn_models_training_file.py 1` to train 1-prev-stn Random
100 | Forest Regressor (RFR) models. Similarly, change `1` to <`2`,`3`,`4`,`5`>
101 | to train the other models. However, you would be required to prepare the training
102 | data for them first.
103 | 
104 | On executing the above command, you will see a continuous output on the command
105 | prompt:
106 | 
107 |     .
108 |     .
109 |     .
110 |     .
111 |     CAR 6.60625167783
112 |     CBH 1.71117789831
113 |     CBJ 17.4222160169
114 |     CCK 3.79114575446
115 |     CD 3.31220839301
116 |     CDMR 5.39912244203
117 |     CGR 8.08489734899
118 |     CH 10.4774022913
119 |     CHL 5.99947173966
120 |     CHTI 67.8594204912
121 |     CKDL 12.6303575828
122 |     CKTD 5.57649677578
123 |     CLG 4.48826310353
124 |     CNB 62.4855739456
125 |     .
126 |     .
127 |     .
128 |     .
129 | 
130 | where "CAR" and "CBH" are station codes and the floating-point numbers beside them
131 | are RMSEs, which evaluate the fit of the models on the training data itself.
132 | 
133 | On a system with 4 logical cores it takes nearly an hour to train the 1-prev-stn
134 | RFR models; the other n-prev-stn models take nearly the same time.
135 | 
136 | ### Predicting delays on trains' test data
137 | 1> Move to the **code** directory.
138 | 
139 | 2> Execute `python known_trains_lms_pred.py rfr 1`.
140 | The output on the shell is similar to the following:
141 | 
142 | 
143 |     .
144 |     .
145 |     .
146 |     .
147 |     Train Number: 12307 RMSE: 39.7547311759
148 |     Train Number: 12307 RMSE: 27.7902472271
149 |     Train Number: 12307 RMSE: 69.2035611394
150 |     Train Number: 12307 RMSE: 90.8565136872
151 |     Train Number: 12307 RMSE: 56.4806884838
152 |     Train Number: 12307 RMSE: 50.1364333031
153 |     Train Number: 12307 RMSE: 34.8328977349
154 |     Train Number: 12307 RMSE: 16.3028024387
155 |     Train Number: 12307 RMSE: 24.3166122244
156 |     Train Number: 12307 RMSE: 26.6479429784
157 |     Train Number: 12307 RMSE: 67.5090362829
158 |     Train Number: 12307 RMSE: 29.016842432
159 |     Train Number: 12307 RMSE: 23.8403707468
160 |     .
161 |     .
162 |     .
163 |     .
164 | 
165 | where each row corresponds to one journey of a train and the corresponding RMSE
166 | obtained on the test data for that journey.
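As a quick sanity check, each printed RMSE is simply the root mean squared error
between the actual and the predicted late minutes over a single journey, exactly
as computed in `known_trains_lms_pred.py`. A minimal sketch of that computation
(the late minutes below are made-up values, not real data):

    from sklearn.metrics import mean_squared_error

    actual_late_mins = [0, 12, 18, 25]  # observed delays along one journey
    pred_late_mins = [0, 10, 20, 30]    # delays predicted by the N-OMLMPF
    rmse = mean_squared_error(actual_late_mins, pred_late_mins) ** 0.5
    print rmse  # ~2.87 minutes for this toy journey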
167 | 
168 | For Unknown Trains' late minutes prediction, execute:
169 | `python unknown_trains_lms_pred.py rfr 10 1`
170 | 
171 | This command will predict late minutes for unknown trains by using RFR
172 | models and will consider 10 Nearest Neighbors for a station. It will
173 | consider 1 previous station, i.e. n = 1 in n-OMLMPF.
174 | 
175 | ### Deploying the Train Delay Estimation Service on your local machine
176 | Make sure that you have all the trained Random Forest Regressor models up to N =
177 | 5.
178 | 
179 | 1> Move to the **tde_service** directory.
180 | 
181 | 2> In the `env.py` file, set the `project_dir_path` to the location where you have
182 | downloaded the `train-delay-estimation` directory.
183 | 
184 | 3> Execute: `python app.py`. The Flask server runs by default on the
185 | loopback address 127.0.0.1 at port 5000.
186 | 
187 | From another terminal:
188 | 
189 | 4> Execute: `curl http://127.0.0.1:5000/12307` to get the predicted late minutes
190 | at all the in-line stations of train 12307 on the current date.
191 | 
192 | 5> Execute: `curl http://127.0.0.1:5000/12307/2018-07-23` to get the predicted late
193 | minutes at all the in-line stations of train 12307 on 23rd July 2018.
194 | 
195 | 6> Execute `curl http://127.0.0.1:5000/12307/ALD/today` to get the predicted late
196 | minutes of train 12307 at station ALD (Allahabad) on the current date.
197 | 
198 | 7> Execute `curl http://127.0.0.1:5000/12307/ALD/2018-12-09` to get the predicted
199 | late minutes for train 12307 at station ALD on 9th Dec 2018.
200 | 
201 | The logs can be found in the `train-delay-estimation/tde_service/logs/tde_logs.log`
202 | file.
203 | 
204 | ----------
205 | 
206 | 
--------------------------------------------------------------------------------
/tde_service/tde_prediction.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # This file implements the algorithm used to predict delays for a train at a
7 | # particular station on a particular date.
8 | #
9 | import env
10 | 
11 | from datetime import datetime
12 | 
13 | from code.utilities.tt_utils import TrainingTestUtils as TTU
14 | 
15 | from util import log
16 | 
17 | class TDEPrediction(object):
18 |   def __init__(self):
19 |     self._ttu = TTU()
20 |     self._cdr = self._ttu._cdr
21 |     self._tdfu = self._ttu._tdfu
22 |     self._month_dict = {"01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
23 |                         "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
24 |                         "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec"}
25 |     self._week_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday",
26 |                        4: "Friday", 5: "Saturday", 6: "Sunday"}
27 | 
28 |   def _get_modified_date_month_week_tuple(self, date):
29 |     """
30 |     Returns the modified date, month and weekday derived from the date.
31 | 
32 |     Args:
33 |       date : A valid date string in "YYYY-MM-DD" format,
34 |              e.g. "2018-08-09".
35 | 
36 |     Returns:
37 |       (str, str, str) i.e. (modified_date, month, weekday)
38 |     """
39 |     date = date.split("-") #"2018-08-09" -> ['2018', '08', '09']
40 |     weekday = self._week_dict[
41 |         datetime(int(date[0]), int(date[1]), int(date[2])).date().weekday()]
42 |     month = self._month_dict[date[1]]
43 |     mod_date = date[2]+" "+month+" "+date[0]
44 |     log.INFO("Modified Date: %s, Month: %s, Weekday: %s"
45 |              % (mod_date, month, weekday))
46 |     return (mod_date, month, weekday)
47 | 
48 |   def _get_trains_modified_journey_dataframe(self, train_num, date):
49 |     """
50 |     Returns a dataframe of the train `train_num` such that it has all the
51 |     required journey information and modified date, month and weekday columns.
52 | 
53 |     Args:
54 |       train_num : A five digit train number e.g. "12307".
55 |     """
56 |     # TODO: Save some time here by having an updated latest journey dataframe.
57 |     # Get the train's all-journeys data frame.
58 |     train_df = self._cdr.get_train_complete_journey_df(train_num)
59 |     # Extract the latest single journey data frame.
60 |     source_rows = train_df[train_df.scharr == "Source"].index.tolist()
61 |     train_latest_sj_df = self._tdfu._generate_single_journey_df(
62 |         train_df, len(source_rows)-1, source_rows)
63 |     num_rows_sj_df = train_latest_sj_df.shape[0]
64 | 
65 |     # TODO: Get a more accurate dataframe by incorporating the actual previous
66 |     # dates to the current queried date for a train which takes multiple days to
67 |     # complete its journey. The "day" column of the dataframe might help.
68 |     mod_date, month, weekday = self._get_modified_date_month_week_tuple(date)
69 |     mod_date = [mod_date for _ in xrange(num_rows_sj_df)]
70 |     month = [month for _ in xrange(num_rows_sj_df)]
71 |     weekday = [weekday for _ in xrange(num_rows_sj_df)]
72 | 
73 |     # Modify the date columns.
74 |     train_latest_sj_df["actarr_date"] = mod_date
75 |     train_latest_sj_df["scharr_date"] = mod_date
76 | 
77 |     # Modify the month column.
78 |     train_latest_sj_df["month"] = month
79 | 
80 |     # Modify the weekday column.
81 |     train_latest_sj_df["weekday"] = weekday
82 | 
83 |     train_latest_sj_df = train_latest_sj_df.reset_index(drop=True)
84 |     log.INFO("Train: %s single journey dataframe modified" % train_num)
85 |     return train_latest_sj_df
86 | 
87 |   def get_delay(self, STNS_WITH_N_MDLS, train_num, date, station=None, nn=10,
88 |                 mdl="rfr", n=2):
89 |     """
90 |     Gets the delay for train `train_num` at station `station` on date `date`.
91 | 
92 |     Args:
93 |       STNS_WITH_N_MDLS : A dict having values as lists of stations with
94 |                          n-prev-stns models.
95 |       train_num : A five digit train number e.g. "12307".
96 |       date : A date on which the delays at stations are required,
97 |              e.g. "2018-07-08" in "YYYY-MM-DD" format.
98 |       station : A station code, e.g. "CNB".
99 |       nn : Number of nearest neighbours to be considered if the current
100 |            station does not have n-prev-station models.
101 |       mdl : "rfr" for Random Forest Regressor models.
102 |       n : N in N-OMLMPF, i.e. the number of previous stations to consider.
103 | 
104 |     Returns:
105 |       dict:
106 | 
107 |       {
108 |        "Error": <None> or <error message>,
109 |        "Result": : An object of TrainingTestUtils
47 |     train_num : A five digit train number string eg. "12307"
48 |     setting : <"test">
49 |     nn : Number of nearest neighbors of unknown stations
50 |     mdl : <"rfr"> # For Random Forest Regressor Models
51 |           <"lmr"> # For Linear Model Regressor Models
52 |     n : value of n in n-OMLMPF (n-prev-stns to consider)
53 |     exp_lms_output_dir : <
54 |         "jrny_wise_unknown_trains_lms_1ps_labenc_wonps_wdts"
55 |         | ..> Desired output directory where the predicted late
56 |         minutes are to be saved.
57 |     exp_rmse_output_dir : <
58 |         "rmse_of_jrny_wise_lms_pred_unknown_trains_1ps_wonps_wdts" | ..> Desired
59 |         output directory where the RMSEs of the predicted late minutes are saved
60 |         in pickle format. Make sure it stays aligned with `exp_lms_output_dir`.
61 |   """
62 |   pred_lms_df = [] # To capture the predicted late mins for each journey
63 |   pred_lms_rmse = [] # Late Minutes RMSE for each journey
64 |   columns = ["Stations", "ActualLateMins", "PredictedLateMins"]
65 |   train_df = ttu._cdr.get_train_journey_df(train_num, setting)
66 | 
67 |   # Get all the source station rows of each journey in train_df
68 |   source_rows = train_df[train_df.scharr=="Source"].index.tolist()
69 | 
70 |   for i in range(len(source_rows)):
71 |     # Obtain the single journey data frame
72 |     sj_df = ttu._tdfu._generate_single_journey_df(train_df, i, source_rows)
73 |     sj_df = sj_df[["station_code", "distance", "month", "weekday", "latemin"]]
74 | 
75 |     # Obtain the current single journey station list
76 |     stn_list_sj = sj_df["station_code"].tolist()
77 |     actual_late_mins_sj = sj_df["latemin"]
78 |     pred_late_mins_sj = [0] # Assuming 0 late mins for the source station
79 |     num_of_unknown_stns = 0
80 |     # Uncomment the following lines in the `if else` cases accordingly, as per
81 |     # the value of N in N-OMLMPF. If N is chosen to be 3, it implies we will
82 |     # consider only 3-previous-station models of suitable stations to predict
83 |     # the late minutes. Here, the value of N is chosen as 1, so the other `else`
84 |     # parts of the code are commented out. Uncomment to generate the desired results for different N.
85 |     for j in range(1, len(stn_list_sj)):
86 |       try:
87 |         stn = stn_list_sj[j]
88 | 
89 |         if (j == 1 or n == 1): # valid for only 1 previous station
90 |           stns_hvng_1ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=1)
91 |           if stn not in stns_hvng_1ps_model:
92 |             num_of_unknown_stns += 1
93 |             # Get the nn nearest neighbors of station "stn"
94 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 1, nn)
95 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
96 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
97 |               j+source_rows[i], 1, stn, pred_late_mins_sj, j, mdl)
98 |           pred_late_mins_sj.append(plm)
99 |           continue
100 |         if (j == 2 or n == 2): # valid for only 2 previous stations
101 |           stns_hvng_2ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=2)
102 |           if stn not in stns_hvng_2ps_model:
103 |             num_of_unknown_stns += 1
104 |             # Get the nn nearest neighbors of station "stn"
105 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 2, nn)
106 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
107 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
108 |               j+source_rows[i], 2, stn, pred_late_mins_sj, j, mdl)
109 |           pred_late_mins_sj.append(plm)
110 |           continue
111 |         if (j == 3 or n == 3): # valid for only 3 previous stations
112 |           stns_hvng_3ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=3)
113 |           if stn not in stns_hvng_3ps_model:
114 |             num_of_unknown_stns += 1
115 |             # Get the nn nearest neighbors of station "stn"
116 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 3, nn)
117 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
118 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
119 |               j+source_rows[i], 3, stn, pred_late_mins_sj, j, mdl)
120 |           pred_late_mins_sj.append(plm)
121 |           continue
122 |         if (j == 4 or n == 4): # valid for only 4 previous stations
123 |           stns_hvng_4ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=4)
124 |           if stn not in stns_hvng_4ps_model:
125 |             num_of_unknown_stns += 1
126 |             # Get the nn nearest neighbors of station "stn"
127 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 4, nn)
128 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
129 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
130 |               j+source_rows[i], 4, stn, pred_late_mins_sj, j, mdl)
131 |           pred_late_mins_sj.append(plm)
132 |           continue
133 |         if (j == 5 or n == 5): # the rest of the stations are valid for 5 previous stations
134 |           stns_hvng_5ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=5)
135 |           if stn not in stns_hvng_5ps_model:
136 |             num_of_unknown_stns += 1
137 |             # Get the nn nearest neighbors of station "stn"
138 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 5, nn)
139 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
140 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
141 |               j+source_rows[i], 5, stn, pred_late_mins_sj, j, mdl)
142 |           pred_late_mins_sj.append(plm)
143 |           continue
144 | 
145 |       except Exception as e:
146 |         print e
147 |         pred_late_mins_sj.append(pred_late_mins_sj[j-1])
148 | 
149 |     # Construct the data frame of Station Code, Actual Late Mins and
150 |     # Predicted Late Mins for each journey
151 |     for ele in zip(zip(stn_list_sj, actual_late_mins_sj), pred_late_mins_sj):
152 |       pred_lms_df.append([ele[0][0], ele[0][1], ele[1]])
153 | 
154 |     # Mark the end of the current journey
155 |     pred_lms_df.append(["JRNY END", "-------", "-------"])
156 |     # Calculate the RMSE of each journey for a train
157 |     rmse = mean_squared_error(actual_late_mins_sj, pred_late_mins_sj)**0.5
158 |     # Store the Number of Unknown Stations and the RMSE of each journey of a train
159 |     pred_lms_rmse.append((num_of_unknown_stns, rmse))
160 |     # Print the RMSE of each journey for the given train "train_num"
161 |     print ("Train Number:", train_num,
162 |            "Number of Unknown Stations: ", num_of_unknown_stns, "RMSE: ", rmse)
163 | 
164 |   pred_lms_df = pd.DataFrame(pred_lms_df, columns=columns)
165 |   pred_lms_df.to_csv(ttu._cdr._cdpath+mdl+"_model_data/" + exp_lms_output_dir +
166 |                      "/Train_" + train_num + "_jw_lms.csv", index=False)
167 |   pickle.dump(pred_lms_rmse, open(ttu._pdr._pdpath+mdl + "_model_pickle_data/" +
168 |       exp_rmse_output_dir + "/Train_" + train_num + "_jw_rmse.p", "wb"))
169 | 
170 | if __name__ == "__main__":
171 |   mdl = sys.argv[1] # Get the model <"rfr"|"lmr">
172 |   nn = int(sys.argv[2]) # Get the number of nearest neighbors
173 |   n = sys.argv[3] # Get the n in n-OMLMPF (n-prev-stns to consider).
174 |   # Create this directory to store the predicted late minutes in each experiment
175 |   # for different values of n in nps.
176 |   exp_lms_output_dir = "jrny_wise_unknown_trains_lms_%sps_labenc" % n
177 | 
178 |   # Create this directory to store the RMSEs for the predicted late minutes in
179 |   # each experiment for different values of n. Make sure it stays aligned with
180 |   # exp_lms_output_dir.
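  # (E.g. n = "3" resolves to "rmse_of_jrny_wise_lms_pred_unknown_trains_3ps".)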
181 |   exp_rmse_output_dir = "rmse_of_jrny_wise_lms_pred_unknown_trains_%sps" % n
182 | 
183 |   ttu = TTU()
184 |   trains83 = ttu._pdr.get_all_trains()[52:] # Choose the remaining 83 Unknown Trains
185 |   for train in trains83:
186 |     get_journey_wise_late_mins_of_unknown_trains(
187 |         ttu, train, "unknown_test", nn, mdl, int(n), exp_lms_output_dir,
188 |         exp_rmse_output_dir)
189 | 
--------------------------------------------------------------------------------
/code/utilities/tt_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file provides the basic utility functions for training and testing
7 | #       the models.
8 | #
9 | 
10 | from env import * # Import it first as it imports data_path and models_path.
11 | import joblib
12 | import numpy as np
13 | import pandas as pd
14 | import pickle
15 | 
16 | from sklearn.metrics import mean_squared_error
17 | from sklearn.neighbors import NearestNeighbors as NN
18 | 
19 | from df_utils import TrainDataFrameUtils as TDFU
20 | from readers.pickle_data_reader import PickleDataReader as PDR
21 | from readers.csv_data_reader import CSVDataReader as CDR
22 | 
23 | 
24 | class TrainingTestUtils(object):
25 | 
26 |   def __init__(self):
27 |     self._tdfu = TDFU()
28 |     self._pdr = PDR(data_path)
29 |     self._cdr = CDR(data_path)
30 |     self._model_path = models_path
31 |     self._stn_geo_crdnates = self._pdr.get_station_coordinates_dict()
32 |     self._stn_deg_strength = self._pdr.get_station_degree_strength_dict()
33 |     self._stn_tfc_strength = self._pdr.get_station_traffic_strength_dict()
34 | 
35 |   def _get_labenc_of_cat_var_df(self, df, cat_var, cat_var_dict):
36 |     """
37 |     Returns the station data frame where the "cat_var" column is the label
38 |     encoding of the categorical variables in the passed station data frame "df".
39 | 
40 |     Args:
41 |       df : The data frame whose categorical variables are to
42 |            be encoded.
43 |       cat_var : The column name of the categorical variables to be
44 |                 encoded eg. "train_type".
45 |       cat_var_dict : A python dictionary to provide label encodings for
46 |                      categorical variables.
47 |     """
48 |     l = []
49 |     cat_var_clmn = df[cat_var]
50 |     for ele in cat_var_clmn:
51 |       l.append(cat_var_dict[ele])
52 |     l = pd.DataFrame(l, columns=[cat_var])
53 |     temp = df.pop(cat_var)
54 |     df = pd.concat([l, df], axis=1)
55 |     return df
56 | 
57 |   def _get_labenc_station_df(self, df, n):
58 |     """
59 |     Returns the complete training data frame of a station where all its
60 |     categorical variables are encoded.
61 | 
62 |     Args:
63 |       df : The data frame whose categorical variables are to
64 |            be label encoded.
65 |       n : The n in "n previous stations" data frame.
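      E.g. with n = 2, the train_type, zone, month, weekday, 1_prev_station and
      2_prev_station columns are all replaced by their integer labels.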
66 | """ 67 | # Encode Train Type 68 | train_type_dict = self._pdr.get_labenc_train_type_dict() 69 | df = self._get_labenc_of_cat_var_df(df, "train_type", train_type_dict) 70 | 71 | # Encode zone 72 | zone_dict = self._pdr.get_labenc_zone_dict() 73 | df = self._get_labenc_of_cat_var_df(df, "zone", zone_dict) 74 | 75 | # Encode month 76 | month_dict = self._pdr.get_labenc_month_dict() 77 | df = self._get_labenc_of_cat_var_df(df, "month", month_dict) 78 | 79 | # Encode weekday 80 | weekday_dict = self._pdr.get_labenc_weekday_dict() 81 | df = self._get_labenc_of_cat_var_df(df, "weekday", weekday_dict) 82 | 83 | # Encode n previous stations 84 | station_dict = self._pdr.get_labenc_station_dict() 85 | for i in range(n): 86 | df = self._get_labenc_of_cat_var_df( 87 | df, str(i+1)+"_prev_station", station_dict) 88 | 89 | return df 90 | 91 | def generate_row_df(self, train_num, sj_df, j, n): 92 | """ 93 | Returns a single row data frame info to test the late minutes prediction 94 | algorithm. 95 | 96 | Args: 97 | train_num : A five digit train number eg. "12307". 98 | sj_df : A single journey data frame from which row data 99 | frame is to be obtained. 100 | j : The row index of the current station in sj_df whose n previous 101 | stations' info is required. 102 | n : Number of previous stations. 103 | """ 104 | column_names_list = self._tdfu._get_column_names_list(n) 105 | 106 | # train_type. zone. is_superfast, month, weekday 107 | feature_list = [self._tdfu._generate_train_type_str(train_num), 108 | self._tdfu._generate_zone_str(train_num), 109 | self._tdfu._is_superfast_str(train_num), 110 | self._tdfu._generate_month_str(sj_df, j), 111 | self._tdfu._generate_weekday_str(sj_df, j)] 112 | # n_prev_station 113 | feature_list.extend( 114 | self._tdfu._generate_n_prev_station_codes_list(sj_df, j, n)) 115 | # n_ps_late_mins 116 | feature_list.extend( 117 | self._tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, n)) 118 | # dist_bwn_stn_n-1_n 119 | feature_list.extend( 120 | self._tdfu._generate_n_prev_dist_bwn_stn_list(sj_df, j, n)) 121 | # stn_n_dist_frm_src 122 | feature_list.extend( 123 | self._tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, n)) 124 | # tfc_of_stn_n 125 | feature_list.extend( 126 | self._tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, n)) 127 | # deg_of_stn_n 128 | feature_list.extend( 129 | self._tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, n)) 130 | # crnt_stn_tfc 131 | feature_list.extend( 132 | self._tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, 0)) 133 | # crnt_stn_deg 134 | feature_list.extend( 135 | self._tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, 0)) 136 | # crnt_stn_dist_frm_src 137 | feature_list.extend( 138 | self._tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, 0)) 139 | # crnt_stn_late_mins 140 | feature_list.extend( 141 | self._tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, 0)) 142 | 143 | feature_list_df = pd.DataFrame([feature_list], columns=column_names_list) 144 | # Obtain the label encoded feature_list_df 145 | feature_list_df = self._get_labenc_station_df(feature_list_df, n) 146 | return feature_list_df 147 | 148 | def remove_unwanted_columns_df(self, df, n): 149 | """ 150 | Returns the passed data frame after removal of unwanted columns from it. 151 | 152 | Args: 153 | df : The data frame from which columns are to be removed. 154 | n : Number of previous stations to the current station. 
155 | """ 156 | # Remove "stn_n_dist_frm_src" 157 | # Remove "tfc_of_stn_n" 158 | # Remove "deg_of_stn_n" 159 | # Remove "n_prev_station" 160 | for k in range(n): 161 | #temp = df.pop("stn_"+str(k+1)+"_dist_frm_src") 162 | #temp = df.pop("tfc_of_stn_"+str(k+1)) 163 | #temp = df.pop("deg_of_stn_"+str(k+1)) 164 | temp = df.pop(str(k+1)+"_prev_station") # Remove station code names. 165 | return df 166 | 167 | def get_predicted_late_mins_list(self, current_station, n, df, mdl): 168 | """ 169 | Returns the predicted late mins at the current_station. 170 | 171 | Args: 172 | current_station : Station Code for the station in question 173 | eg. "CNB", used to choose the RFR model. 174 | n : Number of previous station to the current_station to choose the 175 | RFR model. 176 | df : The data frame of current_station to predict late 177 | minutes at it. 178 | mdl : <"rfr"|"lmr"|"nnr"> 179 | "rfr": Random Forest Regressor Models. 180 | "lmr": Linear Model Regressor Models (not reliable). 181 | "nnr": Neural Network Regressor Models (not converged). 182 | """ 183 | model = joblib.load(self._model_path + mdl + "_models/" + str(n) + 184 | "ps_" + mdl + "_labenc_models/" + current_station + 185 | "_label_encoding_model.sav") 186 | pred_late_mins = model.predict(df) 187 | return pred_late_mins 188 | 189 | def _get_selected_stations_df(self, stn_index_list, df): 190 | """ 191 | Returns a station features data frame of selected stations. 192 | 193 | Args: 194 | stn_index_list : A list of stations indices for which station 195 | features data frame is to be constructed. 196 | df : A Complete DataFrame of 596 known station features 197 | """ 198 | selected_station_df = df.iloc[stn_index_list] 199 | return selected_station_df 200 | 201 | def get_station_nearest_neighbors_list(self, station, nps, n): 202 | """ 203 | Returns the n nearest neighbors stations to given station among the stations 204 | in passed data frame "df". 205 | 206 | Args: 207 | station : The station code for which nearest neighbors are needed. 208 | nps : Number of previous stations to choose stations having nps model. 209 | n : Number of nearest neighbors needed. 210 | """ 211 | # Choose the stations who have the respective nps models. 212 | # If the unknown station occurs as 3rd station in the complete journey, then 213 | # the nearest known station should have a 3 previous station model and so on. 214 | stns_hvng_nps_mdls = self._pdr.get_stations_having_nps_model_list(nps) 215 | 216 | # Get the station features data frame for known stations having nps models 217 | df = self._pdr.get_known_596_stations_features_df() 218 | df = df[df.Station.isin(stns_hvng_nps_mdls)] 219 | 220 | query_stn_feature = [[self._stn_geo_crdnates[station][0], 221 | self._stn_geo_crdnates[station][1], 222 | self._stn_deg_strength[station], 223 | self._stn_tfc_strength[station]]] 224 | # First choose neighbors which are geographically closer 225 | lat_lon_df = df[["Latitude", "Longitude"]] 226 | 227 | lat_lon_query_stn_ftr = [[self._stn_geo_crdnates[station][0], 228 | self._stn_geo_crdnates[station][1]]] 229 | ll_nbrs = NN(n_neighbors=n, algorithm="auto").fit(lat_lon_df) 230 | # ll_indices are directly indexed corresponding to stns_hvng_nps_mdls 231 | ll_distances, ll_indices = ll_nbrs.kneighbors(lat_lon_query_stn_ftr) 232 | 233 | # Subselect the chosen stations features from the complete station 234 | # features df. 
235 |     selected_station_fts_df = self._get_selected_stations_df(ll_indices[0], df)
236 | 
237 |     # Then choose neighbors based on degree and traffic strength among the
238 |     # above chosen geographically closer stations.
239 |     deg_tfc_df = selected_station_fts_df[["Degree_Strength", "Traffic_Strength"]]
240 |     deg_tfc_query_stn_ftr = [[self._stn_deg_strength[station],
241 |                               self._stn_tfc_strength[station]]]
242 |     dt_nbrs = NN(n_neighbors=n, algorithm="auto").fit(deg_tfc_df)
243 |     # dt_indices are indexed from 0, so not directly related to
244 |     # stns_hvng_nps_mdls
245 |     dt_distances, dt_indices = dt_nbrs.kneighbors(deg_tfc_query_stn_ftr)
246 | 
247 |     # Once the dt_indices are obtained (with the stations arranged by increasing
248 |     # distance in the degree and traffic strength features), get the
249 |     # station codes from the df at those indices. The dt_indices are indexed
250 |     # from 0 onwards with respect to the ll_indices, hence the following
251 |     # code; the ll_indices themselves are with respect to the df.
252 |     final_nearest_neighbors_stns_list = [df.iloc[ll_indices[0][idx]].Station
253 |                                          for idx in dt_indices[0]]
254 |     return final_nearest_neighbors_stns_list
255 | 
256 |   def get_predicted_late_mins_at_station_float(self, train_num, sj_df, idxof_stn,
257 |                                                n, station, pred_lms_sj, j, mdl):
258 |     """
259 |     Returns the predicted late minutes at the given "station".
260 | 
261 |     Args:
262 |       train_num : A five digit train number eg. "12307".
263 |       sj_df : A single journey data frame.
264 |       idxof_stn : Index of the current station in the single journey data frame.
265 |       n : N in the number of previous stations.
266 |       station : Station Code at which the late mins are to be predicted.
267 |       pred_lms_sj : Predicted Late Minutes list.
268 |       j : The current station index in the station list of sj_df.
269 |       mdl : <"rfr"> # For the random forest regressor model.
270 |     """
271 |     row_df_nps = self.generate_row_df(train_num, sj_df, idxof_stn, n)
272 |     temp = row_df_nps.pop("crnt_stn_late_mins")
273 |     # Remove the unwanted columns from the row data frame
274 |     row_df_nps = self.remove_unwanted_columns_df(row_df_nps, n)
275 | 
276 |     # Set the late minutes at the n previous stations as the predicted ones
277 |     for i in range(n):
278 |       row_df_nps[str(i+1)+"_ps_late_mins"] = pred_lms_sj[j-(i+1)]
279 | 
280 |     plm = self.get_predicted_late_mins_list(station, n, row_df_nps, mdl)
281 |     return plm[0]
282 | 
--------------------------------------------------------------------------------
/misc/result_analysis.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # author: gaurav.ramashish@gmail.com
5 | #
6 | # Desc: This file analyses the results.
7 | #
8 | 
9 | import joblib
10 | import numpy as np
11 | import os
12 | import pandas as pd
13 | from scipy import stats
14 | 
15 | from utilities.tt_utils import TrainingTestUtils as TTU
16 | 
17 | class ResultAnalysis(object):
18 |   def __init__(self):
19 |     self._ttu = TTU()
20 |     self._cdr = self._ttu._cdr
21 |     self._pdr = self._ttu._pdr
22 | 
23 |   def get_confidence_intervals_train_stations_df_dict(self, train_num,
24 |                                                       ci_prob=0.95):
25 |     """
26 |     Returns a dict of station vs a stats data frame of monthly CIs for the
27 |     given train; the train's complete journey data is considered.
28 | 
29 |     Args:
30 |       train_num : Train number eg. "12307"
31 |       ci_prob : The Confidence Interval probability. <0..1>
<0..1> 32 | """ 33 | stns_stats_dict = {} 34 | train_df = self._cdr.get_train_complete_journey_df(train_num) 35 | # Get all the unique stations whose CI has to be calculated monthly. 36 | unq_stns = train_df.station_code.unique() 37 | 38 | # Get all the unique months for whom CI for a station would be calculated. 39 | unq_mnts = train_df.month.unique() 40 | 41 | stn_stats_cols = ["month", "mean_lms", "std", "ci"] 42 | # Calculate CI for each stations. 43 | for stn in unq_stns: 44 | # Select the data frame for the current station 45 | stn_df = train_df[train_df.station_code == stn] 46 | stn_stats_df = [] 47 | # Calculate for every month. 48 | for mnt in unq_mnts: 49 | # Select the data frame out of stn_df for the current month. 50 | stn_mnt_df = stn_df[stn_df.month == mnt] 51 | if not stn_mnt_df.empty: 52 | # If the stn_mnt_df is not empty, remove outliers by Tukeys Rule. 53 | first_q = np.percentile(stn_mnt_df.latemin, 25) 54 | third_q = np.percentile(stn_mnt_df.latemin, 75) 55 | iqr = third_q - first_q 56 | upr_threshold = third_q + 1.5*iqr # Factor of 1.5 can be changed to 3. 57 | # Not calculating lower threshold since trains can arrive on time at 58 | # the stations in best cases. 59 | # Select cleaned stn_mnt_df by removing outliers (outlier late mins 60 | # due to trains being late at the source). 61 | cln_stn_mnt_df = stn_mnt_df[stn_mnt_df.latemin <= upr_threshold] 62 | mean_lms = cln_stn_mnt_df.latemin.mean() 63 | std = cln_stn_mnt_df.latemin.std() 64 | # Calculate length of late minutes list for which mean is calculated. 65 | len_lml = len((cln_stn_mnt_df.latemin.tolist())) 66 | ci = stats.t.interval(ci_prob, len_lml-1, loc=mean_lms, 67 | scale=std/np.sqrt(len_lml)) 68 | else: # If the stn_mnt_df is empty. 69 | mean_lms = -1 70 | std = -1 71 | ci = (-1, -1) 72 | stn_stats_df.append([mnt, mean_lms, std, ci]) 73 | stn_stats_df = pd.DataFrame(stn_stats_df, columns = stn_stats_cols) 74 | stns_stats_dict[stn] = stn_stats_df 75 | return stns_stats_dict 76 | 77 | def find_ci_probability_of_pred_lms_df(self, train_num, ci_prob=0.95, nps=4, 78 | rfr_mdl="", group="known"): 79 | """ 80 | Returns total number of predictions and number of predictions of 81 | late minutes at stations within CI ci_prob. 82 | 83 | Args: 84 | train_num : Train number eg. "12307". 85 | ci_prob : The Confidence Interval probability. <0..1> 86 | nps : number of previous stations 87 | rfr_mdl : <""|"without_nps_codes"|"wonps_wdts"> 88 | group : <"known"|"unknown"> 89 | """ 90 | # If the predicted late minutes for a station falls in its CI of fixed 91 | # ci_prob say 0.95, then there is 95% chance that the train will get delayed 92 | # by that many predicted late minutes at the chosen station. 93 | 94 | # Get the train's data frame for cross validation. 95 | train_df = self._cdr.get_train_journey_df(train_num, "unknown_test") 96 | # Get the predicted late minutes for the train's cross validation data. 97 | pred_lms_df = self._cdr.get_jw_pred_late_mins_of_train_df(train_num, nps, 98 | rfr_mdl, group) 99 | # Remove "JRNY END" rows from pred_lms_df. 100 | pred_lms_df = pred_lms_df.loc[~pred_lms_df.Stations.isin(["JRNY END"])] 101 | # Get the CI for the train's stations. 102 | stns_stats_dict = self.get_confidence_intervals_train_stations_df_dict( 103 | train_num, ci_prob) 104 | 105 | total_predictions = len(train_df) 106 | num_of_ci_prob_preds = 0 107 | 108 | for i in range(total_predictions): 109 | # Select the month in which late mins is predicted for the station. 
110 |       mnt = train_df.iloc[i].month
111 |       stn = train_df.iloc[i].station_code
112 |       pred_lms = float(pred_lms_df.iloc[i].PredictedLateMins)
113 | 
114 |       # Select the station's CI from stns_stats_dict.
115 |       try:
116 |         stn_stats = stns_stats_dict[stn]
117 |       except:
118 |         continue
119 |       stn_month_stats = stn_stats[stn_stats.month==mnt]
120 |       stn_month_stats_ci = tuple(stn_month_stats["ci"])
121 |       try:
122 |         if (pred_lms >= stn_month_stats_ci[0][0] and
123 |             pred_lms <= stn_month_stats_ci[0][1]):
124 |           num_of_ci_prob_preds += 1
125 |       except:
126 |         print stn_month_stats, pred_lms
127 | 
128 |     return total_predictions, num_of_ci_prob_preds
129 | 
130 |   def calculate_diff_of_af_df_and_nf_df(self, ci_prob):
131 |     """
132 |     Calculates the difference of "%_preds_within_ci" between the additional
133 |     features (af) data frame and the normal features (nf) data frame, and saves it in CSV files.
134 | 
135 |     Args:
136 |       ci_prob : The confidence interval probability. [0..1]
137 |     """
138 |     file_path = self._cdr._cdpath + "analysed_data/known_trains/"
139 | 
140 |     files = os.listdir(file_path)
141 |     CI = str(int(ci_prob * 100))
142 |     diff_df = []
143 |     diff_df_cols = ["train_number", "1ps", "2ps", "3ps", "4ps", "5ps"]
144 |     nf_df = pd.DataFrame()
145 |     af_df = pd.DataFrame()
146 | 
147 |     for f in files:
148 |       if f.startswith("CI"+CI):
149 |         df = pd.read_csv(file_path+f)
150 |         temp_df = pd.DataFrame()
151 |         temp_df[f[13:16]] = df["%_preds_within_ci"]
152 |         if f.endswith("model.csv"): # Info corresponding to normal features df.
153 |           nf_df = pd.concat([nf_df, temp_df], axis=1)
154 |         else: # Info corresponding to additional features df.
155 |           af_df = pd.concat([af_df, temp_df], axis=1)
156 | 
157 |     diff_df = af_df - nf_df
158 |     diff_df["train_number"] = pd.read_csv(file_path+files[0])["train_number"]
159 |     diff_df = diff_df[["train_number", "1ps", "2ps", "3ps", "4ps", "5ps"]]
160 |     desc = diff_df.describe()
161 |     diff_df.to_csv(file_path+"CI"+CI+"diff_bwn_af_nf_results.csv")
162 |     desc.to_csv(file_path+"CI"+CI+"diff_stats.csv")
163 | 
164 |   def calculate_AIC_or_BIC_float(self, train_num, nps, rfr_mdl="", group=""):
165 |     """
166 |     Calculates the AIC or BIC value of the model determined by nps for different test settings.
167 |     http://www.stat.wisc.edu/courses/st572-larget/Spring2007/handouts09-4.pdf
168 |     Uncomment the formula for calculating either AIC or BIC accordingly.
169 | 
170 |     Args:
171 |       train_num : Train Number for which the AIC/BIC is needed. eg. "12307"
172 |       nps : Number of previous stations
173 |       rfr_mdl : <""|"_wonps_wdts">
174 |       group : <"known"|"unknown">
175 |     """
176 |     jw_lms_df = self._cdr.get_jw_pred_late_mins_of_train_df(train_num, nps,
177 |                                                             rfr_mdl, group)
178 |     # Load any nps model to get the number of parameters or features
179 |     model = joblib.load(self._ttu._model_path+"rfr_models/"+str(nps)+"ps_rfr"+
180 |         "_labenc_models_complete"+rfr_mdl+"/CNB_label_encoding_model.sav")
181 | 
182 |     # Remove "JRNY END" rows from jw_lms_df.
183 |     jw_lms_df = jw_lms_df.loc[~jw_lms_df.Stations.isin(["JRNY END"])]
184 |     # Calculate the Residual Sum of Squares (also known as the Sum of Squared Errors)
185 |     actual_lms = jw_lms_df.ActualLateMins.astype(dtype=float)
186 |     pred_lms = jw_lms_df.PredictedLateMins.astype(dtype=float)
187 |     error = actual_lms - pred_lms
188 |     sqrd_error = error ** 2
189 |     rss = np.sum(sqrd_error) # Calculate RSS
190 |     num_of_obsrs = jw_lms_df.shape[0] # Calculate the Number of Observations
191 |     num_of_parms = model.n_features_
192 | 
193 |     #BIC = (num_of_obsrs * np.log((rss * 1.0)/num_of_obsrs) +
194 |     #       num_of_parms * np.log(num_of_obsrs))
195 |     #return BIC
196 | 
197 |     AIC = (num_of_obsrs * np.log((rss * 1.0)/num_of_obsrs) +
198 |            num_of_parms * 2)
199 |     return AIC
200 | 
201 |   def save_bic_df_and_calc_nps_with_minimum_bic_int(
202 |       self, rfr_mdl="", group=""):
203 |     """
204 |     Saves the BIC lists into a df and calculates the value of nps which has the
205 |     minimum BIC. This function is generic with respect to the value (either AIC
206 |     or BIC) returned by the function calculate_AIC_or_BIC_float; despite its
207 |     name, it does not save only BIC data frames.
208 |     Change the name of the saved data frame file in the last line accordingly.
209 | 
210 |     Args:
211 |       rfr_mdl : <""|"_wonps_wdts">
212 |       group : <"known"|"unknown">
213 |     """
214 |     df = []
215 |     columns = ["TrainNum", "1OR", "2OR", "3OR", "4OR", "5OR", "Min_n"]
216 |     all_trains = self._pdr.get_all_trains()[52:] # All Unknown Trains
217 |     for train in all_trains:
218 |       bic_list = [train]
219 |       min_i = 1
220 |       min_b = None
221 |       for i in range(1,6):
222 |         BIC = self.calculate_AIC_or_BIC_float(train, i, rfr_mdl, group)
223 |         bic_list.append(BIC)
224 |         if i == 1:
225 |           min_i = 1
226 |           min_b = BIC
227 |         else:
228 |           if BIC < min_b:
229 |             min_i = i
230 |             min_b = BIC
231 |       bic_list.append(min_i)
232 |       df.append(bic_list)
233 |     df = pd.DataFrame(df, columns=columns)
234 |     df.to_csv(self._cdr._cdpath+"analysed_data/"+group+"_trains/aic_analysis/"+
235 |               group+rfr_mdl+".csv", index=False)
236 | 
237 |   def calculate_sum_of_rmses_for_n_omlmpf_df(self, group, rfr_mdl=""):
238 |     """
239 |     Calculates and saves the total RMSE for all trains in different rfr_mdl
240 |     settings for different values of n in different groups. This gives an
241 |     overall measure of the performance of the different N-OMLMPFs, pointing out
242 |     the one with the minimum overall RMSE, which is thus used in production mode.
243 | 
244 |     Args:
245 |       group : <"known">
246 |       rfr_mdl : <""|"_wonps_wdts">
247 |     """
248 |     df = []
249 |     columns = [
250 |         "Train", "1-OMLMPF", "2-OMLMPF", "3-OMLMPF", "4-OMLMPF", "5-OMLMPF"]
251 |     trains = self._pdr.get_all_trains()[:52]
252 |     for train in trains:
253 |       train_wise_sum_rmse = [train]
254 |       for i in range(5):
255 |         rmse_list = self._pdr.get_rmse_of_journey_wise_lms_pred_list(
256 |             i+1, group, train, rfr_mdl=rfr_mdl)
257 |         train_wise_sum_rmse.append(sum(rmse_list))
258 |       df.append(train_wise_sum_rmse)
259 |     df = pd.DataFrame(df, columns=columns)
260 |     df.to_csv(self._cdr._cdpath+"analysed_data/"+group+"_trains/sum_rmse_of_"+
261 |               group+"_trains_"+rfr_mdl+".csv", index=False)
262 | 
263 | if __name__ == "__main__":
264 | 
265 |   # Uncomment the code in the different blocks below to run any specific data
266 |   # analysis code.
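  # (E.g. the first block below writes per-train CI-coverage CSVs for ci_prob
  # in {0.68, 0.95, 0.99}; the currently active call saves the AIC/BIC
  # analysis; the last block sums the journey-wise RMSEs per train.)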
267 |   ra = ResultAnalysis()
268 | 
269 | ##############################################################################
270 |   """
271 |   group = "test_unknown"
272 |   nps = 5
273 |   rfr_mdl = "_wonps_wdts"
274 |   all_trains = ra._pdr.get_all_trains()[52:] # Unknown Trains
275 |   train_ci_df_cols = ["train_number", "#_preds", "#_preds_within_ci",
276 |                       "%_preds_within_ci"]
277 | 
278 |   for ci_prob in [0.68, 0.95, 0.99]:
279 |     train_ci_df = []
280 |     for train in all_trains:
281 |       (total_predictions, num_of_ci_prob_preds) = (
282 |           ra.find_ci_probability_of_pred_lms_df(train, ci_prob, nps, rfr_mdl,
283 |                                                 group))
284 |       try:
285 |         train_ci_df.append([train, total_predictions, num_of_ci_prob_preds,
286 |                             num_of_ci_prob_preds*100.0/total_predictions])
287 |       except:
288 |         print train, total_predictions, num_of_ci_prob_preds
289 |     train_ci_df = pd.DataFrame(train_ci_df, columns = train_ci_df_cols)
290 |     train_ci_df.to_csv(ra._cdr._cdpath+"rmr_analysed_data/"+group+"_trains/CI"+
291 |         str(int(ci_prob * 100))+"_results_"+str(nps)+"ps_rmr_5e_1_model"+
292 |         rfr_mdl+".csv", index=False)
293 |   """
294 | ##############################################################################
295 |   """
296 |   ra.calculate_diff_of_af_df_and_nf_df(0.99)
297 |   """
298 | ##############################################################################
299 | 
300 |   ra.save_bic_df_and_calc_nps_with_minimum_bic_int("_wonps_wdts", "test_unknown")
301 | 
302 | ##############################################################################
303 |   """
304 |   ra.calculate_sum_of_rmses_for_n_omlmpf_df("known", "_wonps_wdts")
305 |   """
306 | 
--------------------------------------------------------------------------------
/code/utilities/df_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: Provides necessary utilities helpful for DataFrame creation
7 | #
8 | 
9 | # Import it first, as it imports data_path and models_path and paths to other
10 | # modules, namely pickle_data_reader and csv_data_reader.
11 | from env import *
12 | 
13 | import pickle
14 | import pandas as pd
15 | 
16 | from pickle_data_reader import PickleDataReader as PDR
17 | from csv_data_reader import CSVDataReader as CDR
18 | 
19 | class TrainDataFrameUtils(object):
20 | 
21 |   def __init__(self):
22 |     """
23 |     Initializes the pickle and CSV data readers used by the utility methods
24 |     below, which derive the train type <"EXPRESS"|..>, is_superfast and zone
25 |     <"CR"|"ECR"|..> information from a train number.
26 | 
27 | 
28 |     """
29 |     self._pdr = PDR(data_path)
30 |     self._cdr = CDR(data_path)
31 | 
32 |   def _generate_train_type_str(self, train_num):
33 |     """
34 |     Generates the train type <"SPECIAL"|"EXPRESS"|"OTHER"> based on the first
35 |     digit of the train number.
36 | 
37 |     Args:
38 |       train_num : A five digit train number e.g. "12307"
39 |     """
40 |     if (train_num[0]=='0'):
41 |       return "SPECIAL"
42 |     if (train_num[0]=='1' or train_num[0]=='2'):
43 |       return "EXPRESS"
44 |     return "OTHER"
45 | 
46 |   def _get_train_type_col_name_list(self):
47 |     return ["train_type"]
48 | 
49 |   def _is_superfast_str(self, train_num):
50 |     """
51 |     Returns a boolean depending on the second digit of the train
52 |     number.
53 | 
54 |     Args:
55 |       train_num : A five digit train number e.g. "12307"
"12307" 56 | """ 57 | if train_num[1]=='2': 58 | return True 59 | return False 60 | 61 | def _get_is_superfast_col_name_list(self): 62 | return ["is_superfast"] 63 | 64 | def _generate_zone_str(self, train_num): 65 | """ 66 | Returns zone string <"JS"|"CR"|...> based on the second and third digit of 67 | the train number. 68 | 69 | Args: 70 | train_num : A five digit train number e.g. "12307" 71 | """ 72 | if train_num[1]=='2': 73 | if train_num[2]=='0': 74 | return "JS" # 20 is for Shatabdis and Jan Shatabdis on all zonal railways 75 | if train_num[2]=='1': 76 | return "CR" # 21 is for superfasts on CR and WCR (formerly only CR) 77 | if train_num[2]=='2': 78 | return "NR" # 22 is for superfasts from various zones - 79 | # NR, NCR, NWR (formerly only NR). 80 | if train_num[2]=='3': 81 | return "ER" # 23 is for superfast on ER and ECR 82 | if train_num[2]=='4': 83 | return "NR" # 24 is for superfast on NR, NCR and NWR (formerly only NR) 84 | if train_num[2]=='5': 85 | return "NER" # 25 is for superfast on NER and NFR 86 | if train_num[2]=='6': 87 | return "SR" # 26 is for superfast on SR and SWR (formerly only SR) 88 | if train_num[2]=='7': 89 | return "SCR" # 27 is for superfast on SCR and SWR (formerly only SCR) 90 | if train_num[2]=='8': 91 | return "SER" # 28 is for superfast on SER, SECR and ECoR 92 | # (formerly only SER) 93 | if train_num[2]=='9': 94 | return "WR" # 29 is for superfast on WR, WCR and NWR (formerly only WR) 95 | return "OTHER" 96 | 97 | if train_num[1]=='0': 98 | return "KR" # 0 is for Konkan Railway 99 | if train_num[1]=='1': 100 | return "CR" # 1 is for CR, WCR and NCR(?) 101 | if train_num[1]=='3': 102 | return "ER" # 3 is shared by ER and ECR 103 | if train_num[1]=='4': 104 | return "NR" # 4 is for NR, NCR and NWR 105 | if train_num[1]=='5': 106 | return "NER" # 5 is shared by NER and NFR 107 | if train_num[1]=='6': 108 | return "SR" # 6 is for SR and SWR 109 | if train_num[1]=='7': 110 | return "SCR" # 7 is shared by SCR and SWR 111 | if train_num[1]=='8': 112 | return "SER" # 8 is for SER and ECoR 113 | if train_num[1]=='9': 114 | return "WR" # 9 is for WR, NWR and WCR 115 | return "OTHER" 116 | 117 | def _get_zone_col_name_list(self): 118 | return ["zone"] 119 | 120 | def _generate_month_str(self, sj_df, j): 121 | """ 122 | Returns the month value in the single journey data frame a particula row j. 123 | 124 | Args: 125 | sj_df : A single journey data frame corresponding to one 126 | single journey. 127 | j : The row index in sj_df at which month info is required. 128 | """ 129 | return sj_df["month"][j] 130 | 131 | def _get_month_col_name_list(self): 132 | return ["month"] 133 | 134 | def _generate_weekday_str(self, sj_df, j): 135 | """ 136 | Returns the weekday value in the single journey data frame a particula row j. 137 | 138 | Args: 139 | sj_df : A single journey data frame corresponding to one 140 | single journey. 141 | j : The row index in sj_df at which weekday info is required. 142 | """ 143 | return sj_df["weekday"][j] 144 | 145 | def _get_weekday_col_name_list(self): 146 | return ["weekday"] 147 | 148 | def _generate_n_prev_station_codes_list(self, sj_df, j, n): 149 | """ 150 | Returns a list containing n previous station codes to the current station. 151 | 152 | Args: 153 | sj_df : A single journey data frame. 154 | j : The row index of the current station in sj_df whose n previous 155 | stations codes list is required. 156 | n : Number of previous stations. 
157 |     """
158 |     l = []
159 |     for i in range(n):
160 |       l.append(sj_df["station_code"][j-(i+1)])
161 |     return l
162 | 
163 |   def _get_n_prev_stations_col_names_list(self, n):
164 |     """
165 |     Returns a list ["1_prev_station", "2_prev_station" ...] up to the value of n.
166 | 
167 |     Args:
168 |       n : Number of previous stations.
169 |     """
170 |     return [(str(i+1)+"_prev_station") for i in range(n)]
171 | 
172 | 
173 |   def _generate_n_prev_stn_late_mins_list(self, sj_df, j, n):
174 |     """
175 |     Returns a list containing the n previous stations' late minutes.
176 | 
177 |     Args:
178 |       sj_df : A single journey data frame.
179 |       j : The row index of the current station in sj_df whose n previous
180 |         stations' late minutes list is required.
181 |       n : Number of previous stations.
182 |     """
183 |     l = []
184 |     # If n == 0, return the current station's late minutes.
185 |     if n == 0:
186 |       l.append(sj_df["latemin"][j])
187 |       return l
188 | 
189 |     for i in range(n):
190 |       l.append(sj_df["latemin"][j-(i+1)])
191 |     return l
192 | 
193 |   def _get_n_prev_stn_late_mins_col_names_list(self, n):
194 |     """
195 |     Returns a list ["1_ps_late_mins", "2_ps_late_mins" ...] up to the value of n.
196 | 
197 |     Args:
198 |       n : Number of previous stations.
199 |     """
200 |     return [(str(i+1)+"_ps_late_mins") for i in range(n)]
201 | 
202 |   def _get_crnt_stn_late_mins_col_names_list(self):
203 |     return ["crnt_stn_late_mins"]
204 | 
205 |   def _generate_n_prev_dist_bwn_stn_list(self, sj_df, j, n):
206 |     """
207 |     Returns a list of the distances between each consecutive pair among the current and n previous stations.
208 | 
209 |     Args:
210 |       sj_df : A single journey data frame.
211 |       j : The row index of the current station in sj_df whose n previous
212 |         inter-station distances are required.
213 |       n : Number of previous stations.
214 |     """
215 |     l = []
216 |     for i in range(n):
217 |       l.append(sj_df["distance"][j-i] - sj_df["distance"][j-(i+1)])
218 |     return l
219 | 
220 |   def _get_n_prev_dist_bwn_stn_col_names_list(self, n):
221 |     """
222 |     Returns a list ["dist_bwn_stn_0_1", "dist_bwn_stn_1_2" ...] up to the value of n.
223 | 
224 |     Args:
225 |       n : Number of previous stations.
226 |     """
227 |     return [("dist_bwn_stn_"+str(i)+"_"+str(i+1)) for i in range(n)]
228 | 
229 |   def _generate_n_prev_stn_deg_strength_list(self, sj_df, j, n):
230 |     """
231 |     Returns a list containing the degree strengths of the n stations previous
232 |     to the current station.
233 | 
234 |     Args:
235 |       sj_df : A single journey data frame.
236 |       j : The row index of the current station in sj_df whose n previous
237 |         stations' degree strength list is required.
238 |       n : Number of previous stations.
239 |     """
240 |     l = []
241 |     # If n == 0, return the current station's degree strength.
242 |     if n == 0:
243 |       l.append(
244 |           self._pdr.get_station_degree_strength_dict()[
245 |               sj_df["station_code"][j]])
246 |       return l
247 | 
248 |     for i in range(n):
249 |       l.append(
250 |           self._pdr.get_station_degree_strength_dict()[
251 |               sj_df["station_code"][j-(i+1)]])
252 |     return l
253 | 
254 |   def _get_n_prev_stn_deg_col_names_list(self, n):
255 |     """
256 |     Returns a list ["deg_of_stn_1", "deg_of_stn_2" ...] up to the value of n.
257 | 
258 |     Args:
259 |       n : Number of previous stations.
260 |     """
261 |     return [("deg_of_stn_"+str(i+1)) for i in range(n)]
262 | 
263 |   def _get_crnt_stn_deg_col_names_list(self):
264 |     return ["crnt_stn_deg"]
265 | 
266 |   def _generate_n_prev_stn_tfc_strength_list(self, sj_df, j, n):
267 |     """
268 |     Returns a list containing the traffic strengths of the n stations previous
269 |     to the current station.
270 | 
271 |     Args:
272 |       sj_df : A single journey data frame.
273 |       j : The row index of the current station in sj_df whose n previous
274 |         stations' traffic strength list is required.
275 |       n : Number of previous stations.
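
      Example (illustrative; the station codes and traffic values are
      hypothetical): if sj_df["station_code"] holds ["ALD", "MGS", "BSB"] at
      indices 0..2 and the traffic strength dict maps {"ALD": 310, "MGS": 250},
      then j = 2 with n = 2 returns [250, 310], nearest previous station first.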
276 |     """
277 |     l = []
278 |     # If n == 0, return the current station's traffic strength.
279 |     if n == 0:
280 |       l.append(
281 |           self._pdr.get_station_traffic_strength_dict()[
282 |               sj_df["station_code"][j]])
283 |       return l
284 | 
285 |     for i in range(n):
286 |       l.append(
287 |           self._pdr.get_station_traffic_strength_dict()[
288 |               sj_df["station_code"][j-(i+1)]])
289 |     return l
290 | 
291 |   def _get_n_prev_stn_tfc_col_names_list(self, n):
292 |     """
293 |     Returns a list ["tfc_of_stn_1", "tfc_of_stn_2" ...] up to the value of n.
294 | 
295 |     Args:
296 |       n : Number of previous stations.
297 |     """
298 |     return [("tfc_of_stn_"+str(i+1)) for i in range(n)]
299 | 
300 |   def _get_crnt_stn_tfc_col_names_list(self):
301 |     return ["crnt_stn_tfc"]
302 | 
303 |   def _generate_n_prev_stn_dist_from_source_list(self, sj_df, j, n):
304 |     """
305 |     Returns a list containing the n previous stations' distances from the
306 |     source, for a given current station.
307 | 
308 |     Args:
309 |       sj_df : A single journey data frame.
310 |       j : The row index of the current station in sj_df whose n previous
311 |         stations' distances from the source are required.
312 |       n : Number of previous station codes.
313 |     """
314 |     l = []
315 |     # If n == 0, return the current station's distance from the source.
316 |     if n == 0:
317 |       l.append(sj_df["distance"][j])
318 |       return l
319 | 
320 |     for i in range(n):
321 |       l.append(sj_df["distance"][j-(i+1)])
322 |     return l
323 | 
324 |   def _get_n_prev_stn_dist_frm_src_col_names_list(self, n):
325 |     """
326 |     Returns a list ["stn_1_dist_frm_src", "stn_2_dist_frm_src" ...] up to the
327 |     value of n.
328 | 
329 |     Args:
330 |       n : Number of previous stations.
331 |     """
332 |     return [("stn_"+str(i+1)+"_dist_frm_src") for i in range(n)]
333 | 
334 |   def _get_crnt_stn_dist_frm_src_col_names_list(self):
335 |     return ["crnt_stn_dist_frm_src"]
336 | 
337 |   def _generate_single_journey_df(self, df, i, source_rows):
338 |     """
339 |     Returns the single journey data frame starting at the ith index in the
340 |     source rows, out of the given data frame.
341 | 
342 |     Args:
343 |       df : The complete data frame of a train.
344 |       i : The ith index of source_rows (rows at which journey info starts).
345 |       source_rows <[...]>: The complete list of source rows in a train df, i.e.
346 |         the indices in the df where the source station occurs for
347 |         each journey.
348 |     """
349 |     sj_df = None
350 |     if i == len(source_rows)-1:
351 |       sj_df = df[source_rows[i]:df.shape[0]] # Single Journey DataFrame.
352 |     else:
353 |       sj_df = df[source_rows[i]:source_rows[i+1]] # Single Journey DataFrame.
354 |     return sj_df
355 | 
356 |   def _get_column_names_list(self, n):
357 |     """
358 |     Returns a list of the column headers in a data frame.
359 | 
360 |     Args:
361 |       n : Number of previous stations.
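
      Example: for n = 1 the returned list is ["train_type", "zone",
      "is_superfast", "month", "weekday", "1_prev_station", "1_ps_late_mins",
      "dist_bwn_stn_0_1", "stn_1_dist_frm_src", "tfc_of_stn_1", "deg_of_stn_1",
      "crnt_stn_tfc", "crnt_stn_deg", "crnt_stn_dist_frm_src",
      "crnt_stn_late_mins"], i.e. the journey features followed by the current
      station's late minutes.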
362 |     """
363 |     column_names_list = self._get_train_type_col_name_list()
364 |     column_names_list.extend(self._get_zone_col_name_list())
365 |     column_names_list.extend(self._get_is_superfast_col_name_list())
366 |     column_names_list.extend(self._get_month_col_name_list())
367 |     column_names_list.extend(self._get_weekday_col_name_list())
368 |     column_names_list.extend(self._get_n_prev_stations_col_names_list(n))
369 |     column_names_list.extend(self._get_n_prev_stn_late_mins_col_names_list(n))
370 |     column_names_list.extend(self._get_n_prev_dist_bwn_stn_col_names_list(n))
371 |     column_names_list.extend(self._get_n_prev_stn_dist_frm_src_col_names_list(n))
372 |     column_names_list.extend(self._get_n_prev_stn_tfc_col_names_list(n))
373 |     column_names_list.extend(self._get_n_prev_stn_deg_col_names_list(n))
374 |     column_names_list.extend(self._get_crnt_stn_tfc_col_names_list())
375 |     column_names_list.extend(self._get_crnt_stn_deg_col_names_list())
376 |     column_names_list.extend(self._get_crnt_stn_dist_frm_src_col_names_list())
377 |     column_names_list.extend(self._get_crnt_stn_late_mins_col_names_list())
378 | 
379 |     return column_names_list
380 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 | 
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 | 
8 | Preamble
9 | 
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 | 
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 | 
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 | 
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 | 
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 | 
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. 
If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. 
Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 
234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 
296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 
414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. 
The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 | 15. Disclaimer of Warranty.
590 | 
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 | 16. Limitation of Liability.
601 | 
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 | 17. Interpretation of Sections 15 and 16.
613 | 
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 | END OF TERMS AND CONDITIONS
622 | 
623 | How to Apply These Terms to Your New Programs
624 | 
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year>  <name of author>
636 | 
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 | 
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 | 
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | 
--------------------------------------------------------------------------------