├── code ├── __init__.py ├── readers │ ├── __init__.py │ ├── csv_data_reader.py │ └── pickle_data_reader.py ├── utilities │ ├── __init__.py │ ├── env.py │ ├── tt_utils.py │ └── df_utils.py ├── readme.txt ├── rmr_stn_models_training_file.py ├── cross_validation_file.py ├── rfr_stn_models_training_file.py ├── create_pickle_data.py ├── create_training_data.py ├── known_trains_lms_pred.py └── unknown_trains_lms_pred.py ├── tde_service ├── util │ ├── __init__.py │ └── log.py ├── env.py ├── logs │ └── tde_logs.log ├── app.py └── tde_prediction.py ├── requirements.txt ├── doc ├── TrainDelay-ITSC2018.pdf ├── LongPaper-arxiv-June2018.pdf ├── Summary-TrainDelayPrediction-June2018.pdf ├── Readme.md └── Tutorial.md ├── .gitignore ├── misc ├── read_status.py ├── train_status.py ├── lmr_stn_models_training_file.py ├── explore_data.R └── result_analysis.py ├── trains.txt ├── README.md ├── metadata_setup.sh └── LICENSE /code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/readers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tde_service/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib>=0.11 2 | pandas>=0.20.1 3 | scipy>=0.19.0 4 | flask>=0.12.2 5 | scikit-learn>=0.18.1 6 | -------------------------------------------------------------------------------- /doc/TrainDelay-ITSC2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/R-Gaurav/train-delay-estimation/HEAD/doc/TrainDelay-ITSC2018.pdf -------------------------------------------------------------------------------- /doc/LongPaper-arxiv-June2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/R-Gaurav/train-delay-estimation/HEAD/doc/LongPaper-arxiv-June2018.pdf -------------------------------------------------------------------------------- /doc/Summary-TrainDelayPrediction-June2018.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/R-Gaurav/train-delay-estimation/HEAD/doc/Summary-TrainDelayPrediction-June2018.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore *.pyc files. 
2 | *.pyc
3 | *.DS_Store
4 |
5 | # Ignoring data directory
6 | data/
7 | models/
8 |
9 | # Ignoring config files and logs
10 | code/utilities/env.py
11 | tde_service/env.py
12 | tde_service/logs/tde_logs.log
13 |
--------------------------------------------------------------------------------
/tde_service/env.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # This module sets up the environment for running the TDE Service.
7 | #
8 |
9 | import sys
10 |
11 | # This module should NOT be executed.
12 | assert __name__ != "__main__"
13 |
14 | # Set up the project directory.
15 | project_dir_path = "/Personal/train-delay-estimation/"
16 |
17 | # Insert the project directory path in sys.path, so that subdirectories and code
18 | # files therein are able to access the other (top level) files.
19 | sys.path.insert(0, project_dir_path)
20 |
--------------------------------------------------------------------------------
/doc/Readme.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | ## Video
4 | 1. Train Chatbot: A Train Status Assistant for Indian Railways (Skype Prototype),
5 | https://www.youtube.com/watch?v=I-wtcAYLYr4, Oct 2018.
6 | An earlier version can be found here -
7 | https://www.youtube.com/watch?v=a-ABv29H6XU&feature=youtu.be, Sep 2018 (Emulator Prototype).
8 |
9 | ## Papers
10 | 1. Ramashish Gaurav, Biplav Srivastava, Estimating Train Delays in a Large Rail Network Using a Zero Shot Markov Model, IEEE International Conference on Intelligent Transportation Systems (ITSC). On Arxiv at: https://arxiv.org/abs/1806.02825, April 2018 [Area: Train delay, learning]
11 |
12 | 2. Himadri Mishra, Ramashish Gaurav, Biplav Srivastava, A Train Status Assistant for Indian Railways, On Arxiv at: https://arxiv.org/abs/1809.08509, Sep 2018
13 | [Area: Chatbot, Train delay]
14 |
15 | ----------
16 |
17 |
--------------------------------------------------------------------------------
/misc/read_status.py:
--------------------------------------------------------------------------------
1 | import json
2 | import csv
3 | import pickle
4 | #train_list = pickle.load(open('train_list_MGS.p','rb'))
5 | train_list = ['22308','13010','12307','12801','12802','14055']
6 | for train_num in train_list:
7 | f=open('/home/zerone/python/ogd/train_running_status/Train'+str(train_num)+'.txt','r')
8 | f1=csv.writer(open('/home/zerone/python/ogd/train_running_status_csv/Train'+str(train_num)+'.csv','w')) # Every time this file is run, it writes a new one
9 | f = f.readlines()
10 | f1.writerow(['actarr_date','day','station_code','station_name','scharr_date','scharr','actarr','latemin','status','schdep','actdep','distance','has_departed','has_arrived'])
11 | for line in f:
12 | run_stat = json.loads(line)
13 | run_stat = run_stat['route']
14 | for stat in run_stat:
15 | f1.writerow([stat['actarr_date'],stat['day'],stat['station_']['code'],stat['station_']['name'],stat['scharr_date'],stat['scharr'],stat['actarr'],stat['latemin'],stat['status'],stat['schdep'],stat['actdep'],stat['distance'],stat['has_departed'],stat['has_arrived']])
16 |
--------------------------------------------------------------------------------
/code/utilities/env.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # This module sets up paths for different directories of data and saved models.
7 | # Always import this file at the beginning of each file.
8 | #
9 |
10 | import sys
11 |
12 | # This module should NOT be executed.
13 | assert __name__ != "__main__"
14 |
15 | # Here "train-delay-estimation" is downloaded in "/Personal/projects/".
16 | # Here "data" is downloaded in "/Personal/projects/train-delay-estimation/".
17 | # Here "models" is set in "/Personal/projects/train-delay-estimation/".
18 |
19 | project_dir_path = "/Personal/train-delay-estimation/"
20 |
21 | # Insert the path to the project directory in sys.path so that subdirectories
22 | # and code files are accessible in other files.
23 | sys.path.insert(0, project_dir_path+"code/")
24 |
25 | # Insert the path to readers directory.
26 | sys.path.insert(0, project_dir_path+"code/readers/")
27 |
28 | # Insert the path to the data (input) directory.
29 | # data_path contains all the raw data and pickle data.
30 | data_path = project_dir_path+"data/"
31 |
32 | # Insert the path to the trained models of stations (output) directory.
33 | models_path = project_dir_path+"models/"
34 |
--------------------------------------------------------------------------------
/tde_service/logs/tde_logs.log:
--------------------------------------------------------------------------------
1 | 2018-08-19 23:24:20,460 - util.log - INFO - Completed configuring logger()!
2 | 2018-08-19 23:26:55,831 - util.log - INFO - Train Number: 12307, Station Code: None, Date: None
3 | 2018-08-19 23:26:56,000 - util.log - INFO - Modified Date: 19 Aug 2018, Month: Aug, Weekday: Sunday
4 | 2018-08-19 23:26:56,001 - util.log - INFO - Train: 12307 single journey dataframe modified
5 | 2018-08-19 23:29:52,338 - util.log - INFO - Train Number: 12307, Station Code: None, Date: 2018-07-23
6 | 2018-08-19 23:29:52,470 - util.log - INFO - Modified Date: 23 Jul 2018, Month: Jul, Weekday: Monday
7 | 2018-08-19 23:29:52,471 - util.log - INFO - Train: 12307 single journey dataframe modified
8 | 2018-08-19 23:31:54,013 - util.log - INFO - Train Number: 12307, Station Code: ALD, Date: None
9 | 2018-08-19 23:31:54,149 - util.log - INFO - Modified Date: 19 Aug 2018, Month: Aug, Weekday: Sunday
10 | 2018-08-19 23:31:54,150 - util.log - INFO - Train: 12307 single journey dataframe modified
11 | 2018-08-19 23:34:44,370 - util.log - INFO - Train Number: 12307, Station Code: ALD, Date: 2018-12-09
12 | 2018-08-19 23:34:44,513 - util.log - INFO - Modified Date: 09 Dec 2018, Month: Dec, Weekday: Sunday
13 | 2018-08-19 23:34:44,514 - util.log - INFO - Train: 12307 single journey dataframe modified
14 |
--------------------------------------------------------------------------------
/misc/train_status.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from requests.auth import HTTPBasicAuth
3 | import json
4 | import pickle
5 | import time
6 | header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',}
7 |
8 | ################################################################################
9 | train_list = pickle.load(open('train_list_MGS.p','rb'))
10 |
11 | i=0
12 | train_list = train_list[i:]
13 |
14 | ################################################################################
15 | for train_num in train_list:
16 |
17 | url="http://api.railwayapi.com/live/train/"+str(train_num)+"/doj/20160328/apikey//"
18 | response = requests.get(url,headers=header,proxies=None)
19 | if response.status_code == 200:
20 | #print response.text
21 | status = json.loads(response.text)
22 | stat = status['response_code']
23 | if stat == 200:
24 | f = open('/home/zerone/python/ogd/train_running_status/Train'+str(train_num)+'.txt','a')
25 | f.write(response.text)
26 | f.write('\n')
27 | print 'Success',train_num, 'index', i
28 | else:
29 | print 'Fail',train_num, 'index', i
30 |
31 | else:
32 | print response.status_code,'Error',train_num, 'index', i
33 |
34 | time.sleep(2)
35 | i=i+1
36 |
--------------------------------------------------------------------------------
/tde_service/util/log.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Log file utilities.
7 | #
8 |
9 | from os import path, remove
10 |
11 | import logging
12 | import traceback
13 |
14 | # Name of the log file where all messages would be logged.
15 | LOG_FILE = "logs/tde_logs.log"
16 |
17 | # If applicable, delete the existing log file to generate a fresh log file
18 | # during each execution.
19 | if path.isfile(LOG_FILE):
20 | remove(LOG_FILE)
21 |
22 | # Create the logger
23 | logger = logging.getLogger(__name__)
24 | # Set the logging level to DEBUG, such that all level messages are logged.
25 | logger.setLevel(logging.DEBUG)
26 |
27 | # Create handler for logging the messages to a log file.
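# Sketch (an illustration, not part of the original service): a console handler
# could be attached the same way as the file handler configured below, e.g.:
#   console_handler = logging.StreamHandler()
#   console_handler.setLevel(logging.INFO)
#   console_handler.setFormatter(log_formatter)
#   logger.addHandler(console_handler)
# so that messages are echoed to stderr in addition to LOG_FILE.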
28 | log_handler = logging.FileHandler(LOG_FILE)
29 | log_handler.setLevel(logging.DEBUG)
30 |
31 | # Set the format of the log.
32 | log_formatter = logging.Formatter(
33 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
34 |
35 | # Add the Formatter to the Handler
36 | log_handler.setFormatter(log_formatter)
37 |
38 | # Add the Handler to the Logger
39 | logger.addHandler(log_handler)
40 | logger.info('Completed configuring logger()!')
41 |
42 | def INFO(msg):
43 | logger.info(msg)
44 |
45 | def WARN(msg):
46 | logger.warning(msg)
47 | logger.warning(traceback.format_exc())
48 |
49 | def ERROR(msg):
50 | logger.error(msg)
51 | logger.error(traceback.format_exc())
52 |
--------------------------------------------------------------------------------
/trains.txt:
--------------------------------------------------------------------------------
1 | 12307
2 | 12331
3 | 12801
4 | 12802
5 | 12815
6 | 12816
7 | 12875
8 | 12876
9 | 13010
10 | 13050
11 | 13119
12 | 13131
13 | 13133
14 | 13151
15 | 13238
16 | 13483
17 | 14055
18 | 18612
19 | 22911
20 | 12178
21 | 12318
22 | 12327
23 | 12354
24 | 12361
25 | 12362
26 | 12372
27 | 12395
28 | 12569
29 | 12818
30 | 12942
31 | 14003
32 | 15632
33 | 15635
34 | 15636
35 | 22811
36 | 22812
37 | 22824
38 | 12305
39 | 12326
40 | 12424
41 | 12444
42 | 12578
43 | 12937
44 | 22409
45 | 09012
46 | 09305
47 | 12149
48 | 12282
49 | 12333
50 | 12335
51 | 12382
52 | 13239
53 | 01660
54 | 02050
55 | 02265
56 | 02397
57 | 03209
58 | 03210
59 | 03291
60 | 03563
61 | 03564
62 | 04039
63 | 04040
64 | 04401
65 | 04405
66 | 04406
67 | 04821
68 | 05066
69 | 06032
70 | 12141
71 | 12150
72 | 12175
73 | 12295
74 | 12296
75 | 12301
76 | 12302
77 | 12304
78 | 12308
79 | 12309
80 | 12312
81 | 12313
82 | 12317
83 | 12319
84 | 12320
85 | 12322
86 | 12325
87 | 12328
88 | 12332
89 | 12334
90 | 12356
91 | 12369
92 | 12381
93 | 12392
94 | 12397
95 | 12398
96 | 12401
97 | 12423
98 | 12439
99 | 12454
100 | 12495
101 | 12496
102 | 12506
103 | 12741
104 | 12817
105 | 12826
106 | 12947
107 | 12948
108 | 12987
109 | 12988
110 | 13005
111 | 13006
112 | 13008
113 | 13009
114 | 13049
115 | 13202
116 | 13240
117 | 13255
118 | 13307
119 | 13308
120 | 13414
121 | 15022
122 | 15483
123 | 15645
124 | 15668
125 | 18103
126 | 18104
127 | 18311
128 | 18609
129 | 18631
130 | 19313
131 | 22308
132 | 22405
133 | 22406
134 | 22488
135 | 25631
136 |
--------------------------------------------------------------------------------
/code/readme.txt:
--------------------------------------------------------------------------------
1 | This Train Delay Estimation project aims to find a pattern in delays at stations
2 | during the journeys of trains in India. A set of 135 trains is considered, out of
3 | which 52 trains' journey data are used for training various prediction models,
4 | which are then tested on another set of 83 trains. Prediction of near-accurate
5 | late minutes proves the existence of such a pattern, and marks our success in capturing it.
6 |
7 | For more information on the algorithm, data collection and data division, please
8 | refer to the paper.
9 |
10 | Here, a description of the files in this repository is given.
11 |
12 | /readers:
13 | > Contains the helper code to read data from csv files and pickle files.
14 |
15 | /utilities:
16 | > Contains the helper code to generate data frames and to build our train-test
17 | algorithm.
18 |
19 | /create_pickle_data.py
20 | > Creates and saves the required data in pickle format.
21 |
22 | /create_training_data.py:
23 | > Code to create training data, i.e. a Training Data Frame for each Known
24 | Station from each Known Train.
25 |
26 | /cross_validation_file.py:
27 | > Code to evaluate the trained models.
28 |
29 | /rfr_stn_models_training_file.py:
30 | > Code to train Random Forest Regressors on Training Data Frame.
31 |
32 | /rmr_stn_models_training_file.py
33 | > Code to train Ridge Model Regressor on Training Data Frame.
34 |
35 | /known_trains_lms_pred.py:
36 | > Implementation of N-Order Late Minutes Prediction Framework for Known Trains.
37 |
38 | /unknown_trains_lms_pred.py:
39 | > Implementation of N-Order Late Minutes Prediction Framework for Unknown
40 | Trains.
41 |
42 | One can train the various n-previous-station models of Random Forest, Linear
43 | Regressors and Neural Network Regressors by executing the corresponding files as
44 | mentioned above. The total size of saved models exceeds 120 GB, with at least 40 GB
45 | for each setting of Random Forest models. In case you need the pre-trained
46 | models, contact me.
--------------------------------------------------------------------------------
/misc/lmr_stn_models_training_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # author: gaurav.ramashish@gmail.com
5 | #
6 | # Desc: This file trains linear models for 596 known stations. But the linear
7 | # models trained were not found to be robust during evaluation and
8 | # prediction.
9 | # Therefore, not an important file from the point of view of training and
10 | # testing the late minutes prediction framework.
11 | #
12 | # To run this file execute:
13 | # python lmr_stn_models_training_file.py 1
14 | #
15 | # where the numeral "1" can be changed to <1|2|3|4|5> depending on the "n"
16 | # in n-previous-station models.
17 | #
18 |
19 | import joblib
20 | import pickle
21 | import sys
22 |
23 | from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
24 | from sklearn.metrics import mean_squared_error
25 |
26 | from utilities.tt_utils import TrainingTestUtils as TTU
27 |
28 | if __name__ == "__main__":
29 | n = int(sys.argv[1])
30 | ttu = TTU()
31 | stns = ttu._pdr.get_all_52trains_stations()
32 | stns_having_model = []
33 | for s in stns:
34 | df = ttu._cdr.get_n_prev_station_csv_df(s, "complete_training", n)
35 | df = ttu._get_labenc_station_df(df, n)
36 |
37 | if not df.empty:
38 | stns_having_model.append(s)
39 | target_late_mins = df.pop("crnt_stn_late_mins")
40 |
41 | # Remove unwanted columns from the data frame
42 | df = ttu.remove_unwanted_columns_df(df, n)
43 |
44 | model = LinearRegression(n_jobs=-1)
45 | model.fit(df, target_late_mins)
46 | pred_lms = model.predict(df)
47 | RMSE = mean_squared_error(target_late_mins, pred_lms)**0.5
48 | print "Linear Regression: ", s, RMSE
49 |
50 | joblib.dump(model, ttu._model_path+"lmr_models/"+str(n)+
51 | "ps_lmr_labenc_models_complete_wonps_wdts/"+s+
52 | "_label_encoding_model.sav")
53 | pickle.dump(stns_having_model, open(ttu._pdr._pdpath+"stations_having_"+
54 | str(n)+"ps_lmr_models_complete_wonps_wdts.p", "wb"))
55 |
56 |
--------------------------------------------------------------------------------
/code/rmr_stn_models_training_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file trains ridge models for 596 known stations. But the ridge
7 | # models trained were not found to be robust during evaluation and
8 | # prediction.
9 | # Therefore, not an important file from the point of view of training and
10 | # testing the late minutes prediction framework.
11 | #
12 | # To run this file execute:
13 | # python rmr_stn_models_training_file.py 1
14 | #
15 | # where the numeral "1" can be changed to <1|2|3|4|5> depending on the "n"
16 | # in n-previous-station models.
17 | #
18 |
19 | import joblib
20 | import pickle
21 | import sys
22 |
23 | from sklearn.linear_model import Ridge
24 | from sklearn.metrics import mean_squared_error
25 |
26 | from utilities.tt_utils import TrainingTestUtils as TTU
27 |
28 | if __name__ == "__main__":
29 | n = int(sys.argv[1])
30 | ttu = TTU()
31 | stns = ttu._pdr.get_all_52trains_stations()
32 | stns_having_model = []
33 | for s in stns:
34 | df = ttu._cdr.get_n_prev_station_csv_df(s, "complete_training", n)
35 | df = ttu._get_labenc_station_df(df, n)
36 |
37 | if not df.empty:
38 | stns_having_model.append(s)
39 | target_late_mins = df.pop("crnt_stn_late_mins")
40 |
41 | # Remove unwanted columns from the data frame
42 | df = ttu.remove_unwanted_columns_df(df, n)
43 | alpha_str_list = ["_1e_4", "_1e_2", "_5e_1", "_1", "_3"]
44 | alpha_list = [1e-4, 1e-2, 5e-1, 1, 3]
45 | for i in xrange(5):
46 | model = Ridge(alpha=alpha_list[i], normalize=True)
47 | model.fit(df, target_late_mins)
48 | pred_lms = model.predict(df)
49 | RMSE = mean_squared_error(target_late_mins, pred_lms)**0.5
50 | print "Ridge Regression: ", s, RMSE
51 |
52 | joblib.dump(model, ttu._model_path+"rmr"+alpha_str_list[i]+"_models/"+
53 | str(n)+"ps_rmr"+alpha_str_list[i]+
54 | "_labenc_models/"+s+"_label_encoding_model.sav")
55 | pickle.dump(stns_having_model, open(ttu._pdr._pdpath+"stations_having_"+
56 | str(n)+"ps_rmr_models_complete_wonps_wdts.p", "wb"))
57 |
58 |
--------------------------------------------------------------------------------
/code/cross_validation_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file evaluates the trained Random Forest models for each station.
7 | # Refer to the readme.txt in models/rfr_models/ to get the appropriate columns in
8 | # the data frame, hence the corresponding model ("", "_without_nps_codes",
9 | # "_wonps_wdts"). Change the last pickle-dump line according to the
10 | # columns removed.
11 | # Not an important file from the point of view of training the models and
12 | # testing the late minutes prediction framework.
13 | #
14 | # To run the file, execute:
15 | # python cross_validation_file.py rfr 2
16 | #
17 | # where the "rfr" can be changed to "lmr" and the numeral can be changed to
18 | # <1|2|3|4|5> for different n_prev_station models to be cross-validated
19 | # (not to predict late minutes during journey). "rfr" stands for random
20 | # forest regressor models and "lmr" stands for linear model regressors.
21 | #
22 | # NOTE: This file is only meant to analyse the performance of late mins
23 | # prediction, so the data frame is passed to trained models in a batch
24 | # instead of row-wise (hence no filling of predicted late mins at
25 | # previous stations as done in N-OMLMPF algorithm).
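# For illustration (a sketch, not from the original file): batch evaluation
# here amounts to one call per station over the whole cross-validation frame,
#   pred = model.predict(df)
# whereas the N-OMLMPF journey predictor scores one row at a time, feeding each
# predicted late-minutes value back into the next row's previous-station
# features.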
26 | #
27 | import joblib
28 | import pickle
29 | import sys
30 |
31 | from utilities.tt_utils import TrainingTestUtils as TTU
32 | from sklearn.metrics import mean_squared_error
33 |
34 | if __name__ == "__main__":
35 | mdl = sys.argv[1]
36 | n = int(sys.argv[2])
37 | ttu = TTU()
38 | stns = ttu._pdr.get_all_52trains_stations()
39 | rmse_list = []
40 |
41 | for s in stns:
42 | df = ttu._cdr.get_n_prev_station_csv_df(s, "cross_validation", n)
43 | df = ttu._get_labenc_station_df(df, n)
44 |
45 | if not df.empty:
46 | actual_late_mins = df.pop("crnt_stn_late_mins")
47 |
48 | # Remove unwanted columns from the data frame
49 | df = ttu.remove_unwanted_columns_df(df, n)
50 |
51 | pred_late_mins = ttu.get_predicted_late_mins_list(s, n, df, mdl)
52 | RMSE = mean_squared_error(actual_late_mins, pred_late_mins)**0.5
53 | # Create a list of Station and corresponding RMSE
54 | rmse_list.append([s, RMSE])
55 | print s, RMSE
56 | # Dump the cross validation label encoding rmse list
57 | pickle.dump(
58 | rmse_list, open(ttu._pdr._pdpath+str(n)+"ps_cv_labenc_rmse_list.p", "wb"))
59 |
--------------------------------------------------------------------------------
/code/rfr_stn_models_training_file.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file trains the Random Forest Regressor models for different
7 | # stations. Refer to the readme.txt in models/rfr_models/ to figure out the
8 | # correct combination of models and corresponding columns removed. Change
9 | # the directory arguments in the line where joblib is used to dump the
10 | # *.sav models.
11 | #
12 | # Prints the RMSE on the data frame on which training was done, to see the fit.
13 | #
14 | # To run this file execute:
15 | # python rfr_stn_models_training_file.py 1
16 | #
17 | # where the numeral "1" can be changed to <1|2|3|4|5> as per the value of
18 | # "n" in n-previous-station models. The output trained models are saved in
19 | # "nps_rfr_labenc_models" directory, where n can be <1|2|3|4|5>.
20 | #
21 | # IMPORTANT NOTE: Make sure to remove the unwanted columns in data frame
22 | # depending on experiments for which you want trained
23 | # models. This can be done in function:
24 | # "remove_unwanted_columns_df()" in "utilities/tt_utils.py".
25 | # 26 | 27 | import joblib 28 | import pickle 29 | import sys 30 | 31 | from sklearn.ensemble import RandomForestRegressor as RFR 32 | from sklearn.metrics import mean_squared_error 33 | 34 | from utilities.tt_utils import TrainingTestUtils as TTU 35 | 36 | if __name__ == "__main__": 37 | n = int(sys.argv[1]) # Get the n in "n previous station" 38 | ttu = TTU() 39 | stns = ttu._pdr.get_all_52trains_stations() 40 | stns_having_model = [] # Stations having n prev stations RFR models 41 | for s in stns: 42 | df = ttu._cdr.get_n_prev_station_csv_df(s, "training", n) 43 | df = ttu._get_labenc_station_df(df, n) 44 | 45 | if not df.empty: 46 | stns_having_model.append(s) 47 | target_late_mins = df.pop("crnt_stn_late_mins") 48 | 49 | # Remove unwanted columns from the data frame 50 | df = ttu.remove_unwanted_columns_df(df, n) 51 | 52 | model = RFR(n_estimators=1000, n_jobs=-1, warm_start=True) 53 | model.fit(df, target_late_mins) 54 | pred_lms = model.predict(df) 55 | RMSE = mean_squared_error(target_late_mins, pred_lms)**0.5 56 | print s, RMSE 57 | 58 | joblib.dump(model, ttu._model_path + "rfr_models/" + str(n) + 59 | "ps_rfr_labenc_models/" + s + "_label_encoding_model.sav") 60 | pickle.dump(stns_having_model, open(ttu._pdr._pdpath+ 61 | "stations_having_"+str(n)+"ps_models.p", "wb")) 62 | -------------------------------------------------------------------------------- /code/readers/csv_data_reader.py: -------------------------------------------------------------------------------- 1 | # 2 | # Train Delay Estimation Project 3 | # 4 | # Author: Ramashish Gaurav 5 | # 6 | # Desc: This file reads the csv data. 7 | # 8 | 9 | import numpy as np 10 | import pandas as pd 11 | 12 | class CSVDataReader(object): 13 | 14 | def __init__(self, data_path=""): 15 | self._cdpath = data_path 16 | 17 | def get_train_journey_df(self, train_num, setting="training"): 18 | """ 19 | Returns the data frame of the given train. The data frame corresponds to 20 | either training or test setting. 21 | 22 | Args: 23 | train_num : Train number eg. "12307" whose data frame is required 24 | setting : <"training"|"cross_validation"|"known_test"| 25 | "unknown_test"> 26 | """ 27 | tr_grp = ("52_known_" if (setting == "training" or setting == "known_test" 28 | or setting == "cross_validation") else "83_unknown_") 29 | train_df = pd.read_csv( 30 | (self._cdpath+tr_grp+"trains_"+setting+"_folder/Train"+train_num+".csv")) 31 | return train_df 32 | 33 | def get_n_prev_station_csv_df(self, station, setting, n): 34 | """ 35 | Returns the n previous station training data frame of given station 36 | 37 | Args: 38 | station : should be one among 52trains unique stations 39 | setting : <"training"|"cross_validation"> 40 | n : <1|2|3|4|5> 41 | """ 42 | stn_csv = pd.read_csv( 43 | (self._cdpath+"52tr_stations_"+setting+"_data/"+str(n)+ 44 | "ps_"+setting+"_data/Station_"+station+".csv")) 45 | return stn_csv 46 | 47 | def get_jw_pred_late_mins_of_train_df(self, train_num, nps=4, rfr_mdl="", 48 | group="known"): 49 | """ 50 | Returns the data frame of Actual Late Mins and Predicted Late Mins for a 51 | train's cross validation data. 52 | 53 | Args: 54 | train_num : Train number eg. "12307" whose predicted late mins df 55 | is required. 56 | group : <"known"|"unknown"> 57 | nps : number of previous stations considered for prediction. 
58 | rfr_mdl : <""|"_wonps_wdts"|"_without_nps_codes">
59 | """
60 | df = pd.read_csv(self._cdpath+"rfr_model_data/"+"jrny_wise_"+group+"_trains"
61 | +"_lms_"+str(nps)+"ps"+"_labenc"+rfr_mdl+"/"+"Train_"+
62 | train_num+"_jw_lms.csv")
63 | return df
64 |
65 | def get_train_complete_journey_df(self, train_num):
66 | """
67 | Returns a complete data frame of collected data for a train.
68 |
69 | Args:
70 | train_num : Train number eg. "12307" whose complete journey df is
71 | required.
72 | """
73 | df = pd.read_csv(self._cdpath+
74 | "csv_Mar16_Feb18_all_trains_135_months_weekdays/Train"+train_num+".csv")
75 | return df
76 |
--------------------------------------------------------------------------------
/tde_service/app.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file implements a Flask REST API app for Train Delay Estimation.
7 | #
8 | # For multithreaded: http://flask.pocoo.org/docs/deploying/
9 |
10 | import env
11 |
12 | from datetime import datetime
13 | from flask import Flask
14 |
15 | import json
16 | import pandas as pd
17 | import re
18 |
19 | from code.utilities.tt_utils import TrainingTestUtils as TTU
20 | from tde_prediction import TDEPrediction as TDEP
21 |
22 | from util import log
23 |
24 | app = Flask(__name__)
25 | pd.options.mode.chained_assignment = None # Disable "SettingWithCopyWarning".
26 |
27 | DATE_PATTERN = r'^\d{4}-\d{2}-\d{2}$'
28 | # Compile the regex since it is used multiple times in the life time of this app.
29 | DATE_REGEX = re.compile(DATE_PATTERN)
30 |
31 | # Instantiate the following variables and keep them in memory because they are
32 | # not going to change throughout the life time of this app.
33 | ttu = TTU()
34 | pdr = ttu._pdr
35 |
36 | ALL_135_TRAINS = pdr.get_all_trains()
37 |
38 | STNS_WITH_N_MDLS = {
39 | "1ps": pdr.get_stations_having_nps_model_list(nps=1),
40 | ## One can add more deeper models one may have tried. E.g., 2-order in next line
41 | # "2ps": pdr.get_stations_having_nps_model_list(nps=2)
42 | }
43 |
44 |
45 |
46 | # Route when only the train number is passed.
47 | @app.route("/<train_num>", defaults={"station": None, "date": None})
48 | # Route when the train number and a date are passed.
49 | @app.route("/<train_num>/<date>", defaults={"station": None})
50 | # Route when the train number and a station code are passed.
51 | @app.route("/<train_num>/<station>/today", defaults={"date": None})
52 | # Route when the train number, station and date are passed.
53 | @app.route("/<train_num>/<station>/<date>")
54 | def accept_url(train_num, station, date):
55 | log.INFO("Train Number: %s, Station Code: %s, Date: %s"
56 | % (train_num, station, date))
57 |
58 | # Check for the validity of the train number.
59 | if train_num not in ALL_135_TRAINS:
60 | log.ERROR("Train %s not in ALL_135_TRAINS list" % train_num)
61 | return json.dumps({"Error": "Train: %s not accounted by our algorithm"
62 | % train_num, "Result": None})
63 |
64 | if not date:
65 | date = str(datetime.now().date())
66 |
67 | # TODO Check for the validity of date (Also check for 12 months 31 days)
68 | # TODO Check for past dates and error out those as invalid.
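# Sketch for the TODOs above (an assumption, not implemented here): the date
# could be parsed instead of only pattern-matched, e.g.
#   try:
#       datetime.strptime(date, "%Y-%m-%d")
#   except ValueError:
#       return json.dumps({"Error": "Date %s not correct" % date,
#                          "Result": None})
# which also rejects impossible month/day combinations; `datetime` is already
# imported above, so no new dependency would be needed.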
69 | match = DATE_REGEX.match(date)
70 | if not match:
71 | log.ERROR("Date: %s is not valid as per regex" % date)
72 | return json.dumps({"Error": "Date %s not correct" % date, "Result": None})
73 |
74 | lms_stns = TDEP().get_delay(STNS_WITH_N_MDLS, train_num, date, station)
75 | return json.dumps(lms_stns)
76 |
77 | if __name__ == "__main__":
78 | app.run(threaded=True)
79 |
--------------------------------------------------------------------------------
/code/create_pickle_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file creates all the required pickle data (used throughout the code
7 | # as pre-computed data) taking reference from the existing data on the GitHub
8 | # repo. Make sure you have created a "pickle_data" folder under the "data"
9 | # directory.
10 | #
11 |
12 | import numpy as np
13 | import pandas as pd
14 | import pickle
15 |
16 | from utilities.df_utils import TrainDataFrameUtils as TDFU
17 |
18 | class CreatePickleData(object):
19 | def __init__(self):
20 | self._tdfu = TDFU()
21 | self._pdr = self._tdfu._pdr
22 | self._cdr = self._tdfu._cdr
23 |
24 | def create_52trains_unique_stations_pickle(self):
25 | """
26 | Creates a list of unique stations covered by all 52 "training" trains.
27 | It considers the complete journey of known trains (March 2016 to Feb 2018).
28 | """
29 | trains52 = self._pdr.get_all_trains()[:52] # First 52 are Known Trains.
30 | tr52_unique_stations = []
31 | for train in trains52:
32 | df = self._cdr.get_train_complete_journey_df(train)
33 | stations = df["station_code"]
34 | tr_unique_stations = np.unique(stations)
35 | tr52_unique_stations.extend(tr_unique_stations)
36 |
37 | tr52_unique_stations = np.unique(tr52_unique_stations).tolist()
38 | pickle.dump(tr52_unique_stations,
39 | open(self._pdr._pdpath+"52trains_unique_stations.p", "wb"))
40 | print ("52 Known Trains Unique Stations pickle data dumped in pickle_data"
41 | " directory. Number of unique stations: %s"
42 | % len(tr52_unique_stations))
43 | print "-" * 80
44 |
45 | def create_135trains_unique_stations_pickle(self):
46 | """
47 | Creates a list of unique stations covered by all 135 trains (Known + Unknown
48 | trains). It considers the complete journey of trains (March 2016 to Feb 2018).
49 | """
50 | trains135 = self._pdr.get_all_trains()
51 | tr135_unique_stations = [] # To store all the unique stations for all trains.
52 | tr135_inline_stns = {} # To store the stations inline in a train's journey.
53 | for train in trains135:
54 | df = self._cdr.get_train_complete_journey_df(train)
55 | stations = df["station_code"]
56 | tr_unique_stations = np.unique(stations)
57 | tr135_unique_stations.extend(tr_unique_stations)
58 | tr135_inline_stns[train] = tr_unique_stations.tolist()
59 |
60 | tr135_unique_stations = np.unique(tr135_unique_stations).tolist()
61 | pickle.dump(tr135_unique_stations,
62 | open(self._pdr._pdpath+"135trains_unique_stations.p", "wb"))
63 | print ("135 Trains Unique Stations pickle data dumped in pickle_data"
64 | " directory. Number of unique stations: %s"
65 | % len(tr135_unique_stations))
66 | pickle.dump(tr135_inline_stns,
67 | open(self._pdr._pdpath+"trains_inline_stations_dict.p", "wb"))
68 | print "135 Trains inline stations dict dumped in pickle_data directory"
69 | print "-" * 80
70 |
71 | if __name__ == "__main__":
72 | ob = CreatePickleData()
73 | ob.create_52trains_unique_stations_pickle()
74 | ob.create_135trains_unique_stations_pickle()
75 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Train Delay Estimation
2 | This project is a first-of-its-kind attempt to learn the delay trends of
3 | Indian trains at their in-line stations. See the [doc](
4 | https://github.com/R-Gaurav/train-delay-estimation/tree/master/doc) dir for an overview
5 | presentation; papers (ITSC version accepted at [IEEE ITSC 2018](
6 | http://www.ieee-itsc2018.org) and a long version on [Arxiv](
7 | https://arxiv.org/abs/1806.02825)); and a [tutorial](
8 | https://github.com/R-Gaurav/train-delay-estimation/blob/master/doc/Tutorial.md)
9 | on using the code/ model/ data. This project is licensed under GNU GENERAL PUBLIC
10 | LICENSE Version 3.
11 |
12 | ## Team
13 | Ramashish Gaurav (2016 - ),
14 | Himadri Mishra (2018 - ),
15 | Biplav Srivastava (*main contact*).
16 | To request our collected data for research purposes, please fill [this form](https://docs.google.com/forms/d/e/1FAIpQLSc-u619QBL49KO7Lh6UvKOpSF4U1QDD-ZE0VZAqrtv-PlyehQ/viewform?usp=sf_link) and also email at:
17 | my.better.rail@gmail.com.
18 |
19 | ## Description
20 | India runs the fourth largest railway transport network, carrying
21 | over 8 billion passengers per year. However, the travel experience of
22 | passengers is frequently marked by delays, i.e., late arrival of trains at
23 | stations, causing inconvenience. In a first, we study the systemic delays
24 | in train arrivals using n-order Markov frameworks and experiment with two
25 | regression-based models. Using train running-status data collected for two
26 | years, we report on an efficient algorithm for estimating delays at
27 | railway stations with near-accurate results. This work can help
28 | railways to manage their resources, while also helping passengers
29 | and businesses served by them to efficiently plan their activities.
30 |
31 | ## Tutorial to use our code
32 | Please visit the [tutorial](
33 | https://github.com/R-Gaurav/train-delay-estimation/blob/master/doc/Tutorial.md)
34 | to find out the steps for using our code and setting up the experiment locally on
35 | your system. In the tutorial you will also find how to deploy a train delay
36 | estimation service locally on your system. On executing a REST API call, e.g.
37 | *curl http://127.0.0.1:5000/12333* (more to be found in the tutorial) you will get
38 | delay estimates (in minutes) at the in-line stations of Train 12333's journey on
39 | the current date in a JSON format (example below).
40 |
41 | `
42 | {
43 | "Result": {
44 | ..., "ALY": 322.184, "DLN": 81.23, "KIUL": 29.395, ...
45 | },
46 | "Error": null
47 | }
48 | `
49 |
50 | The list of trains for which you can avail this service is mentioned [here](
51 | https://github.com/R-Gaurav/train-delay-estimation/blob/master/trains.txt).
52 |
53 | ## Future work (how you can contribute to it...)
54 | There are many avenues for extending the current work. Please feel free to
55 | contact us for any help.
56 |
57 | - [Scaling] Expand the existing database of 135 trains (819 stations) India-wide.
58 | - [Improving] Improve the accuracy of the existing prediction framework, e.g. via
59 | time-series prediction or neural networks.
60 | - [Improving] The current prediction framework is off-line in approach, i.e. it learns by
61 | batch processing the accumulated data. A realistic prediction framework would be
62 | on-line, i.e. able to keep learning from delays and railway network dynamics
63 | throughout its lifetime.
64 |
65 | In case you decide to contribute, please go through the [PEP8](
66 | https://www.python.org/dev/peps/pep-0008/) coding conventions.
67 | The coding standards in this repository are very much based on that.
68 |
69 | --------
70 |
71 | Suggestions and contributions are welcome.
72 |
--------------------------------------------------------------------------------
/misc/explore_data.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 |
3 | t12307 <- read.csv('Train12307.csv')
4 |
5 | # Exploring the relation between latemin and distance/stations. However there can be a numerical relation between distance and
6 | # latemins, also, on the other hand we can get some latent info about delays at train stations (need to be brainstormed)
7 |
8 | aggregate(latemin~distance,t12307,mean)
9 | #aggregate(latemin~station_code,t12307,mean)
10 | #aggregate(t12307$latemin,by=list(dist=t12307$distance),mean) In place of mean, we can pass functions : sd, max, var, median
11 |
12 | #plot the latemin with distance
13 | stn <- t12307[c(1:26),3]
14 | delay <- aggregate(latemin~distance,t12307,mean)
15 | delay$station <- stn
16 |
17 | # Saving a plot in the working directory
18 | jpeg('Latemin Vs Distance.jpg')
19 | plot(delay$distance,delay$latemin,type="l")
20 | lines(delay$distance,delay$latemin,col="red")
21 | dev.off()
22 |
23 | # Set a linear model on data
24 | lin_reg <- lm(latemin ~ distance+station_code,data=t12307)
25 | summary(lin_reg) # Has R-Squared : 0.7676 (without station_code: 0.6499, so station_code is required)
26 | # We can also include more variables in formula : difference between combination of scharr and schdep with actarr and actdep
27 | # schdep-scharr = total scheduled stop time at stations and so on...
28 |
29 | ### 3rd April ############################################################################################
30 |
31 | time_sarr <- strptime(x=as.character(t12307$scharr), format="%H:%M")
32 | time_aarr <- strptime(x=as.character(t12307$actarr), format="%H:%M")
33 |
34 | time_delay <- (time_aarr-time_sarr)/60 # Divide by 60 to convert from seconds to minutes
35 | # This is same as "latemin"
36 | # However we want to get total delay minutes up to the station in query, and not the delay minutes at that station, i.e. each row
37 | # will have delay minutes up to one station before; for example, for stations A, B, C, and D, B will have delay minutes the train
38 | # got up to station A, for station C -> up to station B, for station D -> up to station C.
39 |
40 | time_delay <- t12307$latemin
41 | time_delay <- time_delay[seq(1,length(time_delay)-1)] # Shift time delay by one station ahead
42 | # prepend 0 to time_delay now...
43 | time_delay <- c(0,time_delay)
44 | # Now for every instance of journey in data, the time_delay at the source station should be 0,
45 | no_of_stations <- length(t12307$station_code)
46 | time_delay[seq(1,length(time_delay),no_of_stations)] <- 0 # Set 0 time_delay at each source station
47 | time_delay[time_delay < 0] <- 0 # Remove negative values from time_delay column
48 | t12307$time_delay<- time_delay # Add the column time_delay to data frame
49 |
50 | # Fit a linear model
51 | lin_reg <- lm(latemin ~ distance+station_code+time_delay, data=t12307)
52 | summary(lin_reg)
53 | # R-squared : 0.9834, Outstanding model, but it was expected, as "latemin" can have a linear relation with "time_delay"
54 | # and it was evident in the summary of the model, since the slope came out to be 1.02 and the statistical importance of "distance" vanished.
55 | # In a real environment we would not have time_delay up to a query station, so we can compute time_delay on the mean of delays
56 | # ("latemin") so far and fit a linear model on it
57 |
58 |
59 | ## Use mean of "latemin" to construct time_delay column
60 | no_of_days <- nrow(t12307)/length(levels(t12307$station_code))
61 | delay <- aggregate(latemin~distance,t12307,mean)
62 | delay <- rep(delay,no_of_days)
63 | delay <- delay[seq(1,length(delay)-1)]
64 |
65 | delay <- c(0,delay)
66 | delay[delay<0] <- 0
67 | delay[seq(1,length(delay),no_of_stations)] <- 0
68 |
69 | lin_reg <- lm(latemin ~ distance+station_code+delay, data = t12307)
70 | summary(lin_reg)
71 | # R-squared : 0.76
72 |
73 | ## Looks like "distance" has no role to play in the presence of "station_code". Also there's an abnormal behaviour if "delay" is used
74 | # instead of "time_delay". Investigate further !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/metadata_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This script sets up the directory structure for executing the code.
4 |
5 | echo "######################################"
6 | echo "# #"
7 | echo "# TRAIN DELAY ESTIMATION #"
8 | echo "# #"
9 | echo "######################################"
10 |
11 | echo "Installing the required dependencies in requirements.txt"
12 | pip install -r requirements.txt
13 | echo "All required python libraries installed."
14 | yes '' | sed 5q # Echo 5 blank lines.
15 |
16 | echo "Setting up the metadata (directory structure)..."
17 | yes '' | sed 5q # Echo 5 blank lines.
18 |
19 | # Untarring the tar data file.
20 | echo "Untarring data... Train_Delay_Estimation_Data_March_2016_February_2018.tar"
21 | tar -vxf Train_Delay_Estimation_Data_March_2016_February_2018.tar
22 | echo "Untarring done!"
23 | echo "*************************************************************************"
24 | yes '' | sed 5q # Echo 5 blank lines.
25 |
26 | echo "Renaming 'Train_Delay_Estimation_Data_March_2016_February_2018' to 'data'"
27 | mv Train_Delay_Estimation_Data_March_2016_February_2018 data
28 | echo "Renaming done!"
29 | echo "*************************************************************************"
30 | yes '' | sed 5q # Echo 5 blank lines.
31 |
32 | # Setting up the directory structure where training data would be saved.
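# (Sketch of the resulting layout, per the mkdir loop that follows:)
#   data/52tr_stations_training_data/1ps_training_data
#   ...
#   data/52tr_stations_training_data/5ps_training_data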
33 | echo "Creating a new directory '52tr_stations_training_data' inside 'data'" 34 | echo "directory where Known Station's n-previous station training data-frames" 35 | echo "would be saved, which would be later used to train Random Forest Regressor" 36 | echo "models." 37 | mkdir data/52tr_stations_training_data 38 | 39 | echo "Creating subdirectories inside '52tr_stations_training_data', where" 40 | echo "Known Station's respective n-previous-station data-frames will be saved." 41 | for n in {1..5} 42 | do 43 | echo "Creating '"$n"ps_training_data' to store $n-previous station data-frames." 44 | mkdir data/52tr_stations_training_data/"$n"ps_training_data 45 | echo "-----------------------------------------------------------------------" 46 | done 47 | echo "Setting up the directory structure for saving training data done!" 48 | echo "*************************************************************************" 49 | yes '' | sed 5q # Echo 5 blank lines. 50 | 51 | # Setting up the directory structure where trained models would be saved. 52 | echo "Creating a new directory 'models' where your trained Random Forest" 53 | echo "Regressor (RFR) models would be saved." 54 | mkdir models 55 | mkdir models/rfr_models 56 | for n in {1..5} 57 | do 58 | echo "Creating '"$n"ps_rfr_labenc_models' to store the RFR models trained" 59 | echo "from $n-previous station training data-frames." 60 | mkdir models/rfr_models/"$n"ps_rfr_labenc_models 61 | echo "-----------------------------------------------------------------------" 62 | done 63 | echo "Setting up the directory structure for saving RFR trained models done!" 64 | echo "*************************************************************************" 65 | yes '' | sed 5q # Echo 5 blank lines. 66 | 67 | # Setting up the directory structure for saving predicted late minutes and 68 | # correspoding RMSEs of test journey data. 69 | echo "Creating a subdirectory 'rfr_model_data' inside 'data' directory to save" 70 | echo "the predicted late minutes of test journey data." 71 | mkdir data/rfr_model_data 72 | for n in {1..5} 73 | do 74 | echo "Creating 'jrny_wise_known_trains_lms_"$n"ps_labenc' to store journey" 75 | echo "wise predicted late-minutes of Known Train's test data with "$n"-OMLMPF." 76 | mkdir data/rfr_model_data/jrny_wise_known_trains_lms_"$n"ps_labenc 77 | echo "-----------------------------------------------------------------------" 78 | echo "Creating 'jrny_wise_unknown_trains_lms_"$n"ps_labenc' to store journey" 79 | echo "wise predicted late-minutes of Unknown Train's test data with "$n"-OMLMPF." 80 | mkdir data/rfr_model_data/jrny_wise_unknown_trains_lms_"$n"ps_labenc 81 | echo "-----------------------------------------------------------------------" 82 | done 83 | echo "Setting up the directory structure for saving predicted late-minutes done!" 84 | echo "*************************************************************************" 85 | yes '' | sed 5q # Echo 5 blank lines. 86 | 87 | echo "Creating a subdirectory 'rfr_model_pickle_data' inside 'data/pickle_data'" 88 | echo "to save RMSEs of predicted late-minutes in pickle format." 89 | mkdir -p data/pickle_data/rfr_model_pickle_data 90 | for n in {1..5} 91 | do 92 | echo "Creating 'rmse_of_jrny_wise_lms_pred_known_trains_"$n"ps' to store RMSE" 93 | echo "of journey wise predicted late minutes from "$n"-OMLMPF algorithm for" 94 | echo "Known Trains in pickle format." 
95 | mkdir -p data/pickle_data/rfr_model_pickle_data/rmse_of_jrny_wise_lms_pred_known_trains_"$n"ps
96 | echo "-----------------------------------------------------------------------"
97 | echo "Creating 'rmse_of_jrny_wise_lms_pred_unknown_trains_"$n"ps' to store RMSE"
98 | echo "of journey wise predicted late minutes from "$n"-OMLMPF algorithm for"
99 | echo "Unknown Trains in pickle format."
100 | mkdir -p data/pickle_data/rfr_model_pickle_data/rmse_of_jrny_wise_lms_pred_unknown_trains_"$n"ps
101 | echo "-----------------------------------------------------------------------"
102 | done
103 | echo "Setting up the directory structure for saving RMSEs in pickle format done!"
104 | echo "*************************************************************************"
105 | yes '' | sed 5q # Echo 5 blank lines.
106 |
107 | echo "#########################################################################"
108 | echo "# You are all set up to run the codes as per your convenience. #"
109 | echo "# It is advised to go through the above output messages to understand #"
110 | echo "# the overall directory structure. #"
111 | echo "#########################################################################"
112 |
--------------------------------------------------------------------------------
/code/readers/pickle_data_reader.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file reads the pickle data.
7 | #
8 |
9 | import pickle
10 | import numpy as np
11 |
12 |
13 | class PickleDataReader(object):
14 |
15 | def __init__(self, data_path=""):
16 | self._pdpath = data_path+"pickle_data/"
17 |
18 | def get_all_trains(self):
19 | """
20 | Returns a list of all 135 trains' train numbers.
21 | First 52 trains in list are Known Trains. Next 83 trains are Unknown Trains.
22 | """
23 | all_trains = pickle.load(open(self._pdpath+"all_trains135.p", "rb"))
24 | return all_trains
25 |
26 | def get_all_52trains_stations(self):
27 | """
28 | Returns a list of all 596 Known Stations of Known Trains.
29 | """
30 | stations_52trains = pickle.load(
31 | open(self._pdpath+"52trains_unique_stations.p", "rb"))
32 | return stations_52trains
33 |
34 | def get_all_135trains_stations(self):
35 | """
36 | Returns a list of all 799 Known Stations + Unknown Stations of all Known
37 | Trains and Unknown Trains.
38 | """
39 | stations_135trains = pickle.load(
40 | open(self._pdpath+"135trains_unique_stations.p", "rb"))
41 | return stations_135trains
42 |
43 | def get_labenc_train_type_dict(self):
44 | """
45 | Returns a dictionary of train type (key) vs numeric label (value).
46 | """
47 | train_type_dict = pickle.load(
48 | open(self._pdpath+
49 | "label_encodings/all_train_types_label_encoding_dict.p", "rb"))
50 | return train_type_dict
51 |
52 | def get_labenc_zone_dict(self):
53 | """
54 | Returns a dictionary of zone (key) vs numeric label (value).
55 | """
56 | zone_dict = pickle.load(
57 | open(self._pdpath+
58 | "label_encodings/all_zones_label_encoding_dict.p", "rb"))
59 | return zone_dict
60 |
61 | def get_labenc_month_dict(self):
62 | """
63 | Returns a dictionary of month (key) vs numeric label (value).
64 | """
65 | month_dict = pickle.load(
66 | open(self._pdpath+
67 | "label_encodings/all_months_label_encoding_dict.p", "rb"))
68 | return month_dict
69 |
70 | def get_labenc_weekday_dict(self):
71 | """
72 | Returns a dictionary of weekday (key) vs numeric label (value).
73 | """
74 | weekday_dict = pickle.load(
75 | open(self._pdpath+
76 | "label_encodings/all_weekdays_label_encoding_dict.p", "rb"))
77 | return weekday_dict
78 |
79 | def get_labenc_station_dict(self):
80 | """
81 | Returns a dict of station (key) vs numeric label (value).
82 | It is supposed to be the universal set of all 4359 stations in India, for which
83 | numeric labels are assigned randomly.
84 | """
85 | station_dict = pickle.load(
86 | open(self._pdpath+
87 | "label_encodings/all_stations_label_encoding_dict.p", "rb"))
88 | return station_dict
89 |
90 | def get_station_degree_strength_dict(self):
91 | """
92 | Returns a dictionary of station (key) vs degree strength (value).
93 | This dictionary contains info about only 799 Known and Unknown Stations.
94 | """
95 | stn_deg_strength = pickle.load(
96 | open(self._pdpath+"station_degree_strength_dict.p", "rb"))
97 | return stn_deg_strength
98 |
99 | def get_station_traffic_strength_dict(self):
100 | """
101 | Returns a dictionary of station (key) vs traffic strength (value).
102 | This dictionary contains info about only 799 Known and Unknown Stations.
103 | """
104 | stn_tfc_strength = pickle.load(
105 | open(self._pdpath+"station_traffic_strength_dict.p", "rb"))
106 | return stn_tfc_strength
107 |
108 | def get_station_coordinates_dict(self):
109 | """
110 | Returns a dictionary of station (key) vs a tuple of latitude and longitude
111 | of station (value).
112 | This dictionary contains info about only 799 Known and Unknown Stations.
113 | """
114 | stn_coordinate = pickle.load(
115 | open(self._pdpath+"station_to_lat_lng_dict.p", "rb"))
116 | return stn_coordinate
117 |
118 | def get_known_596_stations_features_df(self):
119 | """
120 | Returns a pandas DataFrame of station features of all 596 Known Stations.
121 | Valid Known Stations depending on their presence in `stations_having_nps_
122 | models.p` would be chosen from here to perform kNN on them to find the nearest
123 | Known Station for an Unknown Station.
124 | """
125 | stn_ftrs_df = pickle.load(
126 | open(self._pdpath+"known_596_stations_features_df.p", "rb"))
127 | return stn_ftrs_df
128 |
129 | def get_stations_having_nps_model_list(self, nps):
130 | """
131 | Returns a list of stations which have an n_previous_station model.
132 | Args:
133 | nps : n in n_previous_stations models
134 | """
135 | stns_hvng_nps_mdls = pickle.load(
136 | open(self._pdpath+"stations_having_"+str(nps)+"ps_models.p", "rb"))
137 | return stns_hvng_nps_mdls
138 |
139 | def get_rmse_of_journey_wise_lms_pred_list(self, n, group, train, rfr_mdl=""):
140 | """
141 | Returns a list of RMSEs of different journeys undertaken by a train in
142 | given group and rfr_mdl with N-OMLMPF (depending on the value of n).
143 |
144 | Args:
145 | n : <1|2|3|4|5>
146 | group : <"known"|"unknown">
147 | rfr_mdl : <""|"_wonps_wdts">
148 | train : A five digit train number eg. "12307"
149 | """
150 | rmse_list = pickle.load(
151 | open(self._pdpath+"rfr_model_pickle_data/rmse_of_jrny_wise_lms_pred"+
152 | "_"+group+"_trains_"+str(n)+"ps"+rfr_mdl+"/Train_"+train+"_jw_rmse.p",
153 | "rb"))
154 | return rmse_list
155 |
156 | def get_all_trains_inline_stations_dict(self):
157 | """
158 | Returns a dict with the train number as key and the list of stations
159 | inline in its journey as value.
160 | """
161 | train_stns_dict = pickle.load(
162 | open(self._pdpath+"trains_inline_stations_dict.p", "rb"))
163 | return train_stns_dict
164 |
--------------------------------------------------------------------------------
/code/create_training_data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file creates data for training and evaluating the different models.
7 | #
8 | # To create training data to train models, make sure that the running
9 | # status data of trains are in "data/trains_training_file" and similarly
10 | # for test data of trains.
11 | #
12 | # Output station wise data frames are stored in
13 | # "data/52tr_stations_training_data/<n>ps_training_data/" depending on the
14 | # value of n.
15 | #
16 | # To run this file execute the following command (for both the functions).
17 | #
18 | # python create_training_data.py training 3
19 | #
20 | # It creates data frames of "training" setting for a current station with
21 | # 3 previous stations.
22 | #
23 | # This file also has a function to generate the known 596 stations
24 | # features data frame.
25 | # Station Features DF: ["Station", "latitude", "longitude",
26 | # "traffic_strength", "degree_strength"]
27 | #
28 | # To run a specific function, uncomment it in __main__ section.
29 | #
30 |
31 | import sys
32 | import pandas as pd
33 | import pickle
34 |
35 | from joblib import Parallel, delayed
36 |
37 | from utilities.df_utils import TrainDataFrameUtils as TDFU
38 |
39 | def generate_known_current_station_df(
40 | tdfu, current_station, setting="complete_training", n=3):
41 | """
42 | Returns a data frame of the current station. The data frame consists of n
43 | previous station features to the current station. If setting is set to
44 | "cross_validation", it generates data frames similar to "training" ones from
45 | Known Trains, used only to evaluate the models, not to cross-validate the
46 | late minutes prediction algorithm.
47 |
48 | Args:
49 | tdfu : An object of TrainDataFrameUtils
50 | current_station : A known station name which should be one among
51 | the stations of 52 trains. eg. "CNB"
52 | setting : <"training"|"cross_validation"|"complete_training">
53 | n : Number of previous stations to the current station preferred
54 | <1|2|3|4|5>, default value is 3
55 | """
56 | station_df = []
57 | column_names_list = tdfu._get_column_names_list(n)
58 | trains = tdfu._pdr.get_all_trains()
59 | trains52 = trains[:52] # Choose the first 52 trains which are Known Trains
60 | # Rest 83 trains in list are Unknown Trains.
61 |
62 | # Iterate over all trains.
63 | # Get the complete df of each train.
64 | # Get the single journey df out of a complete df of each train.
65 | # For each single journey df find the station which is the current station
66 | # and append the n previous stations info to the station_df.
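# (Illustrative sketch of one appended row, inferred from the feature_list
# construction in the loop below; for n = 3 it is roughly:)
#   [train_type, zone, is_superfast, month, weekday,
#    3 previous station codes, 3 previous-station late minutes,
#    3 inter-station distances, 3 distances from source,
#    3 traffic strengths, 3 degree strengths,
#    crnt_stn_tfc, crnt_stn_deg, crnt_stn_dist_frm_src, crnt_stn_late_mins]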
67 |   for train_num in trains52:
68 |     train_df = tdfu._cdr.get_train_journey_df(train_num, setting)
69 | 
70 |     # Get all the source station rows of each journey
71 |     source_rows = train_df[train_df.scharr=="Source"].index.tolist()
72 |     for i in range(len(source_rows)):
73 |       sj_df = tdfu._generate_single_journey_df(train_df, i, source_rows)
74 | 
75 |       # Choose the required columns
76 |       sj_df = sj_df[["station_code", "distance",
77 |                      "month", "weekday", "latemin"]]
78 |       station_list = sj_df["station_code"].tolist() # Obtain the station list
79 | 
80 |       # Check if the sj_df is wrong due to an extended journey
81 |       if station_list != sj_df.station_code.unique().tolist():
82 |         print "Repeated stations found, Wrong DF, Check Train: ", train_num
83 |         print "Obtained stations: ", station_list
84 |         print "Actual stations: ", sj_df.station_code.unique().tolist()
85 |         return
86 |       else:
87 |         for j in range(n+source_rows[i], len(station_list)+source_rows[i]):
88 |           station = station_list[j-source_rows[i]]
89 |           if station == current_station:
90 |             # train_type, zone, is_superfast, month, weekday
91 |             feature_list = [tdfu._generate_train_type_str(train_num),
92 |                             tdfu._generate_zone_str(train_num),
93 |                             tdfu._is_superfast_str(train_num),
94 |                             tdfu._generate_month_str(sj_df, j),
95 |                             tdfu._generate_weekday_str(sj_df, j)]
96 |             # n_prev_station
97 |             feature_list.extend(
98 |                 tdfu._generate_n_prev_station_codes_list(sj_df, j, n))
99 |             # n_ps_late_mins
100 |             feature_list.extend(
101 |                 tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, n))
102 |             # dist_bwn_stn_n-1_n
103 |             feature_list.extend(
104 |                 tdfu._generate_n_prev_dist_bwn_stn_list(sj_df, j, n))
105 |             # stn_n_dist_frm_src
106 |             feature_list.extend(
107 |                 tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, n))
108 |             # tfc_of_stn_n
109 |             feature_list.extend(
110 |                 tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, n))
111 |             # deg_of_stn_n
112 |             feature_list.extend(
113 |                 tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, n))
114 |             # crnt_stn_tfc, set n = 0
115 |             feature_list.extend(
116 |                 tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, 0))
117 |             # crnt_stn_deg, set n = 0
118 |             feature_list.extend(
119 |                 tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, 0))
120 |             # crnt_stn_dist_frm_src, set n = 0
121 |             feature_list.extend(
122 |                 tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, 0))
123 |             # crnt_stn_late_mins, set n = 0
124 |             feature_list.extend(
125 |                 tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, 0))
126 | 
127 |             station_df.append(feature_list)
128 | 
129 |   station_df = pd.DataFrame(station_df, columns = column_names_list)
130 |   station_df.to_csv((tdfu._cdr._cdpath + "52tr_stations_" + setting+"_data/" +
131 |                      str(n) + "ps_" + setting + "_data/Station_" +
132 |                      current_station + ".csv"), index=False)
133 |   print "Station: ", current_station, " Done!"
134 |   return station_df
135 | 
136 | def generate_known_stations_features_df(pdr):
137 |   """
138 |   This function generates the known stations' features data frame, helpful in
139 |   projecting unknown stations onto known stations.
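  The resulting frame has one row per Known Station, carrying its code,
  latitude, longitude, degree strength and traffic strength.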
140 | 
141 |   Args:
142 |     pdr : A Pickle Data Reader object
143 |   """
144 |   known_stations = pdr.get_all_52trains_stations()
145 |   stn_ftrs_df = []
146 |   columns = ["Station", "Latitude", "Longitude", "Degree_Strength",
147 |              "Traffic_Strength"]
148 |   geo_crdnates = pdr.get_station_coordinates_dict()
149 |   deg_strength = pdr.get_station_degree_strength_dict()
150 |   tfc_strength = pdr.get_station_traffic_strength_dict()
151 |   for stn in known_stations:
152 |     stn_ftrs_df.append([stn, geo_crdnates[stn][0], geo_crdnates[stn][1],
153 |                         deg_strength[stn], tfc_strength[stn]])
154 |   stn_ftrs_df = pd.DataFrame(stn_ftrs_df, columns=columns)
155 |   pickle.dump(stn_ftrs_df,
156 |               open(pdr._pdpath + "known_596_stations_features_df.p", "wb"))
157 | 
158 | if __name__ == '__main__':
159 |   setting = sys.argv[1]
160 |   n = int(sys.argv[2])
161 |   tdfu = TDFU()
162 |   pdr = tdfu._pdr
163 |   stns_of_52trains = pdr.get_all_52trains_stations() # Get all Known Stations.
164 | ################################################################################
165 |   # To create training or cross-validation data; runs in parallel on all cores (n_jobs=-1).
166 |   Parallel(n_jobs=-1)(delayed(generate_known_current_station_df)(tdfu, stn,
167 |       setting, n) for stn in stns_of_52trains)
168 | ################################################################################
169 |   # To create the stations' features data frame.
170 |   # generate_known_stations_features_df(pdr)
171 | ################################################################################
172 | 
--------------------------------------------------------------------------------
/code/known_trains_lms_pred.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file predicts the journey late minutes of known trains (52 trains).
7 | #       Prints the RMSE of each journey.
8 | #
9 | #       The path to the saved trained models, which would be loaded and employed
10 | #       to predict late mins, must be specified in function
11 | #       "get_predicted_late_mins_list()" in file: "utilities/tt_utils.py".
12 | #
13 | #       N-Order Markov Late Minutes Prediction Framework for Known Trains.
14 | #
15 | #       To run this file execute:
16 | #
17 | #       `python known_trains_lms_pred.py rfr 1`
18 | #
19 | #       to run Random Forest Regressor station models to predict late minutes
20 | #       considering 1 previous station, and store the late mins predictions in
21 | #       the "jrny_wise_known_trains_lms_1ps_labenc" directory and corresponding
22 | #       RMSEs in the "rmse_of_jrny_wise_lms_pred_known_trains_1ps" directory.
23 | #
24 | #       IMPORTANT NOTE: Make sure to remove the unwanted columns in the data
25 | #                       frame depending on the experiments. This can be done in
26 | #                       function: "remove_unwanted_columns_df()" defined in
27 | #                       "utilities/tt_utils.py", which eventually gets called in
28 | #                       "get_predicted_late_mins_at_station_float()".
29 | #
30 | #
31 | 
32 | import pickle
33 | import pandas as pd
34 | import sys
35 | 
36 | from sklearn.metrics import mean_squared_error
37 | 
38 | from utilities.tt_utils import TrainingTestUtils as TTU
39 | 
40 | def get_journey_wise_late_mins_of_known_trains(
41 |     ttu, train_num, setting, mdl, n, exp_lms_output_dir, exp_rmse_output_dir):
42 |   """
43 |   Finds the journey-wise late minutes of Known Trains, i.e. the first 52 trains
44 |   of all 135 trains whose data has been collected so far.
45 | 
46 |   The data of the first 52 trains has been used for training the station models.
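  For each journey, the station-wise actual and predicted late minutes are
  written to a CSV file, and the per-journey RMSEs are pickled.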
47 | 
48 |   Args:
49 |     ttu : An object of TrainingTestUtils
50 |     train_num : A five digit train number string eg. "12307"
51 |     setting : <"training"|"cross_validation"|"known_test">
52 |     mdl : <"rfr"|"lmr">
53 |           "rfr": Random Forest Regressor Models
54 |           "lmr": Linear Model Regressor Models
55 |     n : Value of n in n-prev-station or n-OMLMPF.
56 |     exp_lms_output_dir : <"jrny_wise_known_trains_lms_1ps_labenc" | ..>
57 |                          Desired output directory of predicted latemins.
58 |     exp_rmse_output_dir : <"rmse_of_jrny_wise_lms_pred_known_trains_1ps"
59 |                           |..>
60 |                           Desired output directory of predicted latemins
61 |                           RMSEs. Make sure it stays aligned with
62 |                           `exp_lms_output_dir`.
63 | 
64 |   """
65 |   pred_lms_df = [] # To capture the predicted late mins for each journey
66 |   pred_lms_rmse = [] # Late Minutes RMSE for each journey
67 |   columns = ["Stations", "ActualLateMins", "PredictedLateMins"]
68 |   train_df = ttu._cdr.get_train_journey_df(train_num, setting)
69 | 
70 |   # Get all the source station rows of each journey in train_df
71 |   source_rows = train_df[train_df.scharr=="Source"].index.tolist()
72 | 
73 |   for i in range(len(source_rows)):
74 |     # Obtain the single journey dataframe
75 |     sj_df = ttu._tdfu._generate_single_journey_df(train_df, i, source_rows)
76 |     sj_df = sj_df[["station_code", "distance", "month", "weekday", "latemin"]]
77 | 
78 |     # Obtain the current single journey station list
79 |     stn_list_sj = sj_df["station_code"].tolist()
80 |     actual_late_mins_sj = sj_df["latemin"]
81 |     pred_late_mins_sj = [0] # Assuming 0 late mins for the source station
82 | 
83 |     # Uncomment the following lines in the `if else` cases accordingly, as per
84 |     # the value of N in N-OMLMPF. If N is chosen to be 3, it implies we will
85 |     # consider only 3-previous-station models of suitable stations to predict
86 |     # the late minutes. Here, the value of N is chosen as 1, so the other `else`
87 |     # parts of the code are commented out. Uncomment to generate the desired results for different N.
88 |     #
89 |     # Depending on the Experiment you choose, make sure to remove the unwanted
90 |     # columns in the call of function "get_predicted_late_mins_at_station_float".
91 | 
92 |     for j in range(1, len(stn_list_sj)):
93 |       try: # Try to predict the late minutes for this station in the single journey.
94 |         if (j == 1 or n == 1): # valid for only 1 previous station.
95 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
96 |               j+source_rows[i], 1, stn_list_sj[j], pred_late_mins_sj, j, mdl)
97 |           pred_late_mins_sj.append(plm)
98 |           continue
99 |         if (j == 2 or n == 2): # valid for only 2 previous stations.
100 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
101 |               j+source_rows[i], 2, stn_list_sj[j], pred_late_mins_sj, j, mdl)
102 |           pred_late_mins_sj.append(plm)
103 |           continue
104 |         if (j == 3 or n == 3): # valid for only 3 previous stations.
105 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
106 |               j+source_rows[i], 3, stn_list_sj[j], pred_late_mins_sj, j, mdl)
107 |           pred_late_mins_sj.append(plm)
108 |           continue
109 |         if (j == 4 or n == 4): # valid for only 4 previous stations.
110 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
111 |               j+source_rows[i], 4, stn_list_sj[j], pred_late_mins_sj, j, mdl)
112 |           pred_late_mins_sj.append(plm)
113 |           continue
114 |         if (j == 5 or n == 5): # the rest of the stations are valid for 5 previous stations.
115 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
116 |               j+source_rows[i], 5, stn_list_sj[j], pred_late_mins_sj, j, mdl)
117 |           pred_late_mins_sj.append(plm)
118 |           continue
119 | 
120 |       # Case when a new station comes whose trained model does not exist.
121 |       except KeyError as e:
122 |         # A KeyError is obtained while creating the row data frame for a station
123 |         # when the previous station is not present in the station-to-index dict.
124 |         # Hence, set the late minutes at the current station as those at the previous one.
125 |         pred_late_mins_sj.append(pred_late_mins_sj[j-1])
126 |       except Exception as e:
127 |         # Set the late minutes at the station for which no trained model exists
128 |         # as the late minutes at the immediately previous station.
129 |         print e
130 |         pred_late_mins_sj.append(pred_late_mins_sj[j-1])
131 | 
132 |     # Construct the data frame of Station Code, Actual Late Mins and
133 |     # Predicted Late Mins for each journey
134 |     for ele in zip(zip(stn_list_sj, actual_late_mins_sj), pred_late_mins_sj):
135 |       pred_lms_df.append([ele[0][0], ele[0][1], ele[1]])
136 | 
137 |     # Mark the end of the current journey
138 |     pred_lms_df.append(["JRNY END", "-------", "-------"])
139 |     # Store the RMSE of each journey for a train
140 |     rmse = mean_squared_error(actual_late_mins_sj, pred_late_mins_sj)**0.5
141 |     pred_lms_rmse.append(rmse)
142 |     # Print the RMSE of each journey for the given train "train_num"
143 |     print "Train Number:", train_num, "RMSE:", rmse
144 | 
145 |   pred_lms_df = pd.DataFrame(pred_lms_df, columns=columns)
146 |   pred_lms_df.to_csv(ttu._cdr._cdpath+mdl+"_model_data/" + exp_lms_output_dir +
147 |                      "/Train_" + train_num + "_jw_lms.csv", index=False)
148 |   pickle.dump(pred_lms_rmse, open(ttu._pdr._pdpath+mdl+"_model_pickle_data/" +
149 |       exp_rmse_output_dir + "/Train_" + train_num + "_jw_rmse.p", "wb"))
150 | 
151 | 
152 | if __name__ == "__main__":
153 |   mdl = sys.argv[1] # Accept <"rfr"|"lmr">.
154 |   n = sys.argv[2] # Accept the n in n-OMLMPF (n-prev-stns to consider).
155 |   # Create this directory to store the predicted late minutes in each experiment
156 |   # for different values of n in nps.
157 |   exp_lms_output_dir = "jrny_wise_known_trains_lms_%sps_labenc" % n
158 | 
159 |   # Create this directory to store the RMSEs for the predicted late minutes in
160 |   # each experiment for different values of n. Make sure it stays aligned with
161 |   # exp_lms_output_dir.
162 |   exp_rmse_output_dir = "rmse_of_jrny_wise_lms_pred_known_trains_%sps" % n
163 | 
164 |   ttu = TTU()
165 |   trains52 = ttu._pdr.get_all_trains()[:52] # Choose the first 52 trains, which
166 |                                             # are Known Trains.
167 |   for train in trains52:
168 |     get_journey_wise_late_mins_of_known_trains(
169 |         ttu, train, "known_test", mdl, int(n), exp_lms_output_dir,
170 |         exp_rmse_output_dir)
171 | 
--------------------------------------------------------------------------------
/doc/Tutorial.md:
--------------------------------------------------------------------------------
1 | # Tutorial: Using Train Delay Estimation
2 | 
3 | ## Background
4 | We collected Train Running Status data (in the same format as shown in [NTES](
5 | https://enquiry.indianrail.gov.in/mntes/)) for a period of two years, from March 2016
6 | to February 2018, for 135 trains that pass through MGS (Mughalsarai Station, one of
7 | the busiest stations in India). After the required preprocessing of the data, we
8 | found that delays at stations depend on the month during which the journey is made,
9 | as well as on the stations previous to the current station (at which we
10 | seek the predicted delay). As learning algorithms, we
11 | used Random Forest Regressors and Ridge Regressors to
12 | devise a zero-shot competent, scalable and train-agnostic late minutes prediction
13 | framework inspired by Markov Processes. We name our prediction framework the
14 | *N*-Order Markov Late Minutes Prediction Framework (*N*-OMLMPF).
15 | 
16 | The *N*-OMLMPF inputs a train number, its journey route information (i.e.
17 | stations along its journey route, distances of stations from the source etc. - for
18 | more information, please see our papers in the doc directory), the station at
19 | which the user wishes to know the expected delay, and a date. It then
20 | outputs the expected delay at that particular station.
21 | 
22 | The above presents only the gist of our work; it is highly recommended to go
23 | through our papers mentioned above before proceeding ahead.
24 | 
25 | The code is highly commented with function docstrings. Please let us know if you
26 | need help understanding them or setting up the experiments. The best way to set up
27 | an experiment environment on your system is to download and install [Anaconda](
28 | https://www.anaconda.com/download/).
29 | 
30 | ## How to Use Code and Other Artifacts
31 | Upon being contacted, we will share the raw data but not the pre-trained models. Each
32 | saved pre-trained model is approximately 40MB or more, hence it is not feasible to
33 | share all of them. However, with the help of the following simple steps, one can easily
34 | train the prediction models and use them for predicting delays at railway stations.
35 | 
36 | The tutorial below details the steps to train Random Forest Regressor models (the
37 | most effective ones compared to Ridge Regressor models) on n-prev-stns training
38 | data frames of Known Stations, with station codes removed from them (refer to the
39 | Experiments and Result Analysis section in our paper - Exp 3, Exp 4).
40 | 
41 | To set up an experimental environment, follow the steps below precisely, in the
42 | same order as mentioned. The preferred environment is Linux.
43 | 
44 | **Note**: The code works with Python 2.7
45 | 
46 | ### Setting up the directory structure.
47 | 1> Clone this repo on your local system by executing the below command.\
48 | `git clone https://github.com/R-Gaurav/train-delay-estimation.git`
49 | 
50 | 2> After you download the tar file of data:
51 | `Train_Delay_Estimation_Data_March_2016_February_2018.tar`, move it inside the
52 | `train-delay-estimation` directory. You can download this data by contacting us
53 | at my.better.rail@gmail.com.
54 | 
55 | 3> From inside the `train-delay-estimation` directory execute:
56 | `./metadata_setup.sh` to set up the required directory structure along
57 | with automatically installing the dependencies in **requirements.txt**.
58 | 
59 | ### Setting up the environment variables in file **env.py**
60 | 1> Navigate to the directory **train-delay-estimation/code/utilities**.
61 | 
62 | 2> Open **env.py**.
63 | 
64 | 3> Set the `project_dir_path` variable to the location where you have downloaded
65 | the **train-delay-estimation** directory.
66 | 
67 | 4> Save **env.py**.
68 | 
69 | ### Creating pickle data
70 | 1> Move to the **code** directory.
71 | 
72 | 2> We have already provided some data in pickle format, which were either manually
73 | created or collected from the internet via REST APIs. However, you still need to
74 | create a few more data files in pickle format.
75 | 
76 | To do this, just execute `python create_pickle_data.py`.
77 | 
78 | ### Creating the training data (Table III in paper) to train the models
79 | 1> Move to the **code** directory.
80 | 
81 | 2> Execute: `python create_training_data.py training 1` to create the training
82 | data for 1-previous-station data frames; similarly, replace `1` with <`2`,`3`,`4`,`5`> to
83 | create the data with respect to that many previous stations.
84 | 
85 | On a system with 4 logical i5 cores (you can get the number of logical cores on
86 | your system by executing `htop` or `top` (followed by pressing the `1` key)), it
87 | takes nearly 7 hours to prepare the 1-prev-stn data frames. For the 2-prev-stn data
88 | frames it takes 9 hours, so expect the time to keep increasing for higher numbers
89 | of previous stations.
90 | 
91 | NOTE: The data frames are created in parallel; computation is done on all cores.
92 | 
93 | For more information, go through the description mentioned in the file:
94 | `create_training_data.py`.
95 | 
96 | ### Training the regression models
97 | 1> Move to the **code** directory.
98 | 
99 | 2> Execute `python rfr_stn_models_training_file.py 1` to train 1-prev-stn Random
100 | Forest Regressor (RFR) models. Similarly, change `1` to <`2`,`3`,`4`,`5`>
101 | to train the other models. However, you would be required to prepare the training
102 | data for them first.
103 | 
104 | On executing the above command, you will see a continuous output on the command
105 | prompt:
106 | 
107 |     .
108 |     .
109 |     .
110 |     .
111 |     CAR 6.60625167783
112 |     CBH 1.71117789831
113 |     CBJ 17.4222160169
114 |     CCK 3.79114575446
115 |     CD 3.31220839301
116 |     CDMR 5.39912244203
117 |     CGR 8.08489734899
118 |     CH 10.4774022913
119 |     CHL 5.99947173966
120 |     CHTI 67.8594204912
121 |     CKDL 12.6303575828
122 |     CKTD 5.57649677578
123 |     CLG 4.48826310353
124 |     CNB 62.4855739456
125 |     .
126 |     .
127 |     .
128 |     .
129 | 
130 | where "CAR" and "CBH" are station codes and the floating-point numbers beside them
131 | are RMSEs, which evaluate the fit of the models on the training data itself.
132 | 
133 | On a system with 4 logical cores it takes nearly an hour to train the 1-prev-stn
134 | RFR models; the other n-prev-stn models take nearly the same time.
135 | 
136 | ### Predicting delays on trains' test data
137 | 1> Move to the **code** directory.
138 | 
139 | 2> Execute `python known_trains_lms_pred.py rfr 1`.
140 | The output on the shell is similar to the following:
141 | 
142 | 
143 |     .
144 |     .
145 |     .
146 |     .
147 |     Train Number: 12307 RMSE: 39.7547311759
148 |     Train Number: 12307 RMSE: 27.7902472271
149 |     Train Number: 12307 RMSE: 69.2035611394
150 |     Train Number: 12307 RMSE: 90.8565136872
151 |     Train Number: 12307 RMSE: 56.4806884838
152 |     Train Number: 12307 RMSE: 50.1364333031
153 |     Train Number: 12307 RMSE: 34.8328977349
154 |     Train Number: 12307 RMSE: 16.3028024387
155 |     Train Number: 12307 RMSE: 24.3166122244
156 |     Train Number: 12307 RMSE: 26.6479429784
157 |     Train Number: 12307 RMSE: 67.5090362829
158 |     Train Number: 12307 RMSE: 29.016842432
159 |     Train Number: 12307 RMSE: 23.8403707468
160 |     .
161 |     .
162 |     .
163 |     .
164 | 
165 | where each row corresponds to one journey of a train and the corresponding RMSE
166 | obtained on the test data for that journey.
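As a quick sanity check, each printed RMSE is simply the root mean squared error
between the actual and the predicted late minutes over a single journey, exactly
as computed in `known_trains_lms_pred.py`. A minimal sketch of that computation
(the late minutes below are made-up values, not real data):

    from sklearn.metrics import mean_squared_error

    actual_late_mins = [0, 12, 18, 25]  # observed delays along one journey
    pred_late_mins = [0, 10, 20, 30]    # delays predicted by the N-OMLMPF
    rmse = mean_squared_error(actual_late_mins, pred_late_mins) ** 0.5
    print rmse  # ~2.87 minutes for this toy journey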
167 | 
168 | For Unknown Trains' late minutes prediction, execute:
169 | `python unknown_trains_lms_pred.py rfr 10 1`
170 | 
171 | This command will predict late minutes for unknown trains by using RFR
172 | models and will consider 10 Nearest Neighbors for a station. It will
173 | consider 1 previous station, i.e. n = 1 in n-OMLMPF.
174 | 
175 | ### Deploying the Train Delay Estimation Service on your local machine
176 | Make sure that you have all the trained Random Forest Regressor models up to N =
177 | 5.
178 | 
179 | 1> Move to the **tde_service** directory.
180 | 
181 | 2> In the `env.py` file, set the `project_dir_path` to the location where you have
182 | downloaded the `train-delay-estimation` directory.
183 | 
184 | 3> Execute: `python app.py`. The Flask server runs by default on the
185 | loopback address 127.0.0.1 at port 5000.
186 | 
187 | From another terminal:
188 | 
189 | 4> Execute: `curl http://127.0.0.1:5000/12307` to get the predicted late minutes
190 | at all the in-line stations of train 12307 on the current date.
191 | 
192 | 5> Execute: `curl http://127.0.0.1:5000/12307/2018-07-23` to get the predicted late
193 | minutes at all the in-line stations of train 12307 on 23rd July 2018.
194 | 
195 | 6> Execute `curl http://127.0.0.1:5000/12307/ALD/today` to get the predicted late
196 | minutes of train 12307 at station ALD (Allahabad) on the current date.
197 | 
198 | 7> Execute `curl http://127.0.0.1:5000/12307/ALD/2018-12-09` to get the predicted
199 | late minutes for train 12307 at station ALD on 9th Dec 2018.
200 | 
201 | The logs can be found in the `train-delay-estimation/tde_service/logs/tde_logs.log`
202 | file.
203 | 
204 | ----------
205 | 
206 | 
--------------------------------------------------------------------------------
/tde_service/tde_prediction.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # This file implements the algorithm used to predict delays for a train at a
7 | # particular station on a particular date.
8 | #
9 | import env
10 | 
11 | from datetime import datetime
12 | 
13 | from code.utilities.tt_utils import TrainingTestUtils as TTU
14 | 
15 | from util import log
16 | 
17 | class TDEPrediction(object):
18 |   def __init__(self):
19 |     self._ttu = TTU()
20 |     self._cdr = self._ttu._cdr
21 |     self._tdfu = self._ttu._tdfu
22 |     self._month_dict = {"01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
23 |                         "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
24 |                         "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec"}
25 |     self._week_dict = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday",
26 |                        4: "Friday", 5: "Saturday", 6: "Sunday"}
27 | 
28 |   def _get_modified_date_month_week_tuple(self, date):
29 |     """
30 |     Returns the modified date, month and weekday derived from the date.
31 | 
32 |     Args:
33 |       date : A valid date string in "YYYY-MM-DD" format,
34 |              e.g. "2018-08-09".
35 | 
36 |     Returns:
37 |       (str, str, str) i.e. (modified_date, month, weekday)
38 |     """
39 |     date = date.split("-") #"2018-08-09" -> ['2018', '08', '09']
40 |     weekday = self._week_dict[
41 |         datetime(int(date[0]), int(date[1]), int(date[2])).date().weekday()]
42 |     month = self._month_dict[date[1]]
43 |     mod_date = date[2]+" "+month+" "+date[0]
44 |     log.INFO("Modified Date: %s, Month: %s, Weekday: %s"
45 |              % (mod_date, month, weekday))
46 |     return (mod_date, month, weekday)
47 | 
48 |   def _get_trains_modified_journey_dataframe(self, train_num, date):
49 |     """
50 |     Returns a dataframe of the train `train_num` such that it has all the
51 |     required journey information and modified date, month and weekday columns.
52 | 
53 |     Args:
54 |       train_num : A five digit train number e.g. "12307".
55 |     """
56 |     # TODO: Save some time here by having an updated latest journey dataframe.
57 |     # Get the train's all-journeys data frame.
58 |     train_df = self._cdr.get_train_complete_journey_df(train_num)
59 |     # Extract the latest single journey data frame.
60 |     source_rows = train_df[train_df.scharr == "Source"].index.tolist()
61 |     train_latest_sj_df = self._tdfu._generate_single_journey_df(
62 |         train_df, len(source_rows)-1, source_rows)
63 |     num_rows_sj_df = train_latest_sj_df.shape[0]
64 | 
65 |     # TODO: Get a more accurate dataframe by incorporating the actual previous
66 |     # dates to the current queried date for a train which takes multiple days to
67 |     # complete its journey. The "day" column of the dataframe might help.
68 |     mod_date, month, weekday = self._get_modified_date_month_week_tuple(date)
69 |     mod_date = [mod_date for _ in xrange(num_rows_sj_df)]
70 |     month = [month for _ in xrange(num_rows_sj_df)]
71 |     weekday = [weekday for _ in xrange(num_rows_sj_df)]
72 | 
73 |     # Modify the date columns.
74 |     train_latest_sj_df["actarr_date"] = mod_date
75 |     train_latest_sj_df["scharr_date"] = mod_date
76 | 
77 |     # Modify the month column.
78 |     train_latest_sj_df["month"] = month
79 | 
80 |     # Modify the weekday column.
81 |     train_latest_sj_df["weekday"] = weekday
82 | 
83 |     train_latest_sj_df = train_latest_sj_df.reset_index(drop=True)
84 |     log.INFO("Train: %s single journey dataframe modified" % train_num)
85 |     return train_latest_sj_df
86 | 
87 |   def get_delay(self, STNS_WITH_N_MDLS, train_num, date, station=None, nn=10,
88 |                 mdl="rfr", n=2):
89 |     """
90 |     Gets the delay for train `train_num` at station `station` on date `date`.
91 | 
92 |     Args:
93 |       STNS_WITH_N_MDLS : A dict having values as lists of stations with
94 |                          n-prev-stns models.
95 |       train_num : A five digit train number e.g. "12307".
96 |       date : A date on which the delays at stations are required,
97 |              e.g. "2018-07-08" in "YYYY-MM-DD" format.
98 |       station : A station code, e.g. "CNB".
99 |       nn : Number of nearest neighbours to be considered if the current
100 |            station does not have n-prev-station models.
101 |       mdl : "rfr" for Random Forest Regressor models.
102 |       n : N in N-OMLMPF, i.e. the number of previous stations to consider.
103 | 
104 |     Returns:
105 |       dict:
106 | 
107 |       {
108 |        "Error": <None> or <error message>,
109 |        "Result": : An object of TrainingTestUtils
47 |     train_num : A five digit train number string eg. "12307"
48 |     setting : <"test">
49 |     nn : Number of nearest neighbors of unknown stations
50 |     mdl : <"rfr"> # For Random Forest Regressor Models
51 |           <"lmr"> # For Linear Model Regressor Models
52 |     n : value of n in n-OMLMPF (n-prev-stns to consider)
53 |     exp_lms_output_dir : <
54 |         "jrny_wise_unknown_trains_lms_1ps_labenc_wonps_wdts"
55 |         | ..> Desired output directory where the predicted late
56 |         minutes are to be saved.
57 |     exp_rmse_output_dir : <
58 |         "rmse_of_jrny_wise_lms_pred_unknown_trains_1ps_wonps_wdts" | ..> Desired
59 |         output directory where the RMSEs of the predicted late minutes are saved
60 |         in pickle format. Make sure it stays aligned with `exp_lms_output_dir`.
61 |   """
62 |   pred_lms_df = [] # To capture the predicted late mins for each journey
63 |   pred_lms_rmse = [] # Late Minutes RMSE for each journey
64 |   columns = ["Stations", "ActualLateMins", "PredictedLateMins"]
65 |   train_df = ttu._cdr.get_train_journey_df(train_num, setting)
66 | 
67 |   # Get all the source station rows of each journey in train_df
68 |   source_rows = train_df[train_df.scharr=="Source"].index.tolist()
69 | 
70 |   for i in range(len(source_rows)):
71 |     # Obtain the single journey data frame
72 |     sj_df = ttu._tdfu._generate_single_journey_df(train_df, i, source_rows)
73 |     sj_df = sj_df[["station_code", "distance", "month", "weekday", "latemin"]]
74 | 
75 |     # Obtain the current single journey station list
76 |     stn_list_sj = sj_df["station_code"].tolist()
77 |     actual_late_mins_sj = sj_df["latemin"]
78 |     pred_late_mins_sj = [0] # Assuming 0 late mins for the source station
79 |     num_of_unknown_stns = 0
80 |     # Uncomment the following lines in the `if else` cases accordingly, as per
81 |     # the value of N in N-OMLMPF. If N is chosen to be 3, it implies we will
82 |     # consider only 3-previous-station models of suitable stations to predict
83 |     # the late minutes. Here, the value of N is chosen as 1, so the other `else`
84 |     # parts of the code are commented out. Uncomment to generate the desired results for different N.
85 |     for j in range(1, len(stn_list_sj)):
86 |       try:
87 |         stn = stn_list_sj[j]
88 | 
89 |         if (j == 1 or n == 1): # valid for only 1 previous station
90 |           stns_hvng_1ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=1)
91 |           if stn not in stns_hvng_1ps_model:
92 |             num_of_unknown_stns += 1
93 |             # Get the nn nearest neighbors of station "stn"
94 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 1, nn)
95 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
96 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
97 |               j+source_rows[i], 1, stn, pred_late_mins_sj, j, mdl)
98 |           pred_late_mins_sj.append(plm)
99 |           continue
100 |         if (j == 2 or n == 2): # valid for only 2 previous stations
101 |           stns_hvng_2ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=2)
102 |           if stn not in stns_hvng_2ps_model:
103 |             num_of_unknown_stns += 1
104 |             # Get the nn nearest neighbors of station "stn"
105 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 2, nn)
106 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
107 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
108 |               j+source_rows[i], 2, stn, pred_late_mins_sj, j, mdl)
109 |           pred_late_mins_sj.append(plm)
110 |           continue
111 |         if (j == 3 or n == 3): # valid for only 3 previous stations
112 |           stns_hvng_3ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=3)
113 |           if stn not in stns_hvng_3ps_model:
114 |             num_of_unknown_stns += 1
115 |             # Get the nn nearest neighbors of station "stn"
116 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 3, nn)
117 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
118 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
119 |               j+source_rows[i], 3, stn, pred_late_mins_sj, j, mdl)
120 |           pred_late_mins_sj.append(plm)
121 |           continue
122 |         if (j == 4 or n == 4): # valid for only 4 previous stations
123 |           stns_hvng_4ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=4)
124 |           if stn not in stns_hvng_4ps_model:
125 |             num_of_unknown_stns += 1
126 |             # Get the nn nearest neighbors of station "stn"
127 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 4, nn)
128 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
129 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
130 |               j+source_rows[i], 4, stn, pred_late_mins_sj, j, mdl)
131 |           pred_late_mins_sj.append(plm)
132 |           continue
133 |         if (j == 5 or n == 5): # the rest of the stations are valid for 5 previous stations
134 |           stns_hvng_5ps_model = ttu._pdr.get_stations_having_nps_model_list(nps=5)
135 |           if stn not in stns_hvng_5ps_model:
136 |             num_of_unknown_stns += 1
137 |             # Get the nn nearest neighbors of station "stn"
138 |             nn_stns = ttu.get_station_nearest_neighbors_list(stn, 5, nn)
139 |             stn = nn_stns[0] # Choose the 1st nearest neighbor station
140 |           plm = ttu.get_predicted_late_mins_at_station_float(train_num, sj_df,
141 |               j+source_rows[i], 5, stn, pred_late_mins_sj, j, mdl)
142 |           pred_late_mins_sj.append(plm)
143 |           continue
144 | 
145 |       except Exception as e:
146 |         print e
147 |         pred_late_mins_sj.append(pred_late_mins_sj[j-1])
148 | 
149 |     # Construct the data frame of Station Code, Actual Late Mins and
150 |     # Predicted Late Mins for each journey
151 |     for ele in zip(zip(stn_list_sj, actual_late_mins_sj), pred_late_mins_sj):
152 |       pred_lms_df.append([ele[0][0], ele[0][1], ele[1]])
153 | 
154 |     # Mark the end of the current journey
155 |     pred_lms_df.append(["JRNY END", "-------", "-------"])
156 |     # Calculate the RMSE of each journey for a train
157 |     rmse = mean_squared_error(actual_late_mins_sj, pred_late_mins_sj)**0.5
158 |     # Store the Number of Unknown Stations and the RMSE of each journey of a train
159 |     pred_lms_rmse.append((num_of_unknown_stns, rmse))
160 |     # Print the RMSE of each journey for the given train "train_num"
161 |     print ("Train Number:", train_num,
162 |            "Number of Unknown Stations: ", num_of_unknown_stns, "RMSE: ", rmse)
163 | 
164 |   pred_lms_df = pd.DataFrame(pred_lms_df, columns=columns)
165 |   pred_lms_df.to_csv(ttu._cdr._cdpath+mdl+"_model_data/" + exp_lms_output_dir +
166 |                      "/Train_" + train_num + "_jw_lms.csv", index=False)
167 |   pickle.dump(pred_lms_rmse, open(ttu._pdr._pdpath+mdl + "_model_pickle_data/" +
168 |       exp_rmse_output_dir + "/Train_" + train_num + "_jw_rmse.p", "wb"))
169 | 
170 | if __name__ == "__main__":
171 |   mdl = sys.argv[1] # Get the model <"rfr"|"lmr">
172 |   nn = int(sys.argv[2]) # Get the number of nearest neighbors
173 |   n = sys.argv[3] # Get the n in n-OMLMPF (n-prev-stns to consider).
174 |   # Create this directory to store the predicted late minutes in each experiment
175 |   # for different values of n in nps.
176 |   exp_lms_output_dir = "jrny_wise_unknown_trains_lms_%sps_labenc" % n
177 | 
178 |   # Create this directory to store the RMSEs for the predicted late minutes in
179 |   # each experiment for different values of n. Make sure it stays aligned with
180 |   # exp_lms_output_dir.
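  # (E.g. n = "3" resolves to "rmse_of_jrny_wise_lms_pred_unknown_trains_3ps".)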
181 |   exp_rmse_output_dir = "rmse_of_jrny_wise_lms_pred_unknown_trains_%sps" % n
182 | 
183 |   ttu = TTU()
184 |   trains83 = ttu._pdr.get_all_trains()[52:] # Choose the remaining 83 Unknown Trains
185 |   for train in trains83:
186 |     get_journey_wise_late_mins_of_unknown_trains(
187 |         ttu, train, "unknown_test", nn, mdl, int(n), exp_lms_output_dir,
188 |         exp_rmse_output_dir)
189 | 
--------------------------------------------------------------------------------
/code/utilities/tt_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: This file provides the basic utility functions for training and testing
7 | #       the models.
8 | #
9 | 
10 | from env import * # Import it first as it imports data_path and models_path.
11 | import joblib
12 | import numpy as np
13 | import pandas as pd
14 | import pickle
15 | 
16 | from sklearn.metrics import mean_squared_error
17 | from sklearn.neighbors import NearestNeighbors as NN
18 | 
19 | from df_utils import TrainDataFrameUtils as TDFU
20 | from readers.pickle_data_reader import PickleDataReader as PDR
21 | from readers.csv_data_reader import CSVDataReader as CDR
22 | 
23 | 
24 | class TrainingTestUtils(object):
25 | 
26 |   def __init__(self):
27 |     self._tdfu = TDFU()
28 |     self._pdr = PDR(data_path)
29 |     self._cdr = CDR(data_path)
30 |     self._model_path = models_path
31 |     self._stn_geo_crdnates = self._pdr.get_station_coordinates_dict()
32 |     self._stn_deg_strength = self._pdr.get_station_degree_strength_dict()
33 |     self._stn_tfc_strength = self._pdr.get_station_traffic_strength_dict()
34 | 
35 |   def _get_labenc_of_cat_var_df(self, df, cat_var, cat_var_dict):
36 |     """
37 |     Returns the station data frame where the "cat_var" column is the label
38 |     encoding of the categorical variables in the passed station data frame "df".
39 | 
40 |     Args:
41 |       df : The data frame whose categorical variables are to
42 |            be encoded.
43 |       cat_var : The column name of the categorical variables to be
44 |                 encoded eg. "train_type".
45 |       cat_var_dict : A python dictionary to provide label encodings for
46 |                      categorical variables.
47 |     """
48 |     l = []
49 |     cat_var_clmn = df[cat_var]
50 |     for ele in cat_var_clmn:
51 |       l.append(cat_var_dict[ele])
52 |     l = pd.DataFrame(l, columns=[cat_var])
53 |     temp = df.pop(cat_var)
54 |     df = pd.concat([l, df], axis=1)
55 |     return df
56 | 
57 |   def _get_labenc_station_df(self, df, n):
58 |     """
59 |     Returns the complete training data frame of a station where all its
60 |     categorical variables are encoded.
61 | 
62 |     Args:
63 |       df : The data frame whose categorical variables are to
64 |            be label encoded.
65 |       n : The n in "n previous stations" data frame.
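      E.g. with n = 2, the train_type, zone, month, weekday, 1_prev_station and
      2_prev_station columns are all replaced by their integer labels.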
66 | """ 67 | # Encode Train Type 68 | train_type_dict = self._pdr.get_labenc_train_type_dict() 69 | df = self._get_labenc_of_cat_var_df(df, "train_type", train_type_dict) 70 | 71 | # Encode zone 72 | zone_dict = self._pdr.get_labenc_zone_dict() 73 | df = self._get_labenc_of_cat_var_df(df, "zone", zone_dict) 74 | 75 | # Encode month 76 | month_dict = self._pdr.get_labenc_month_dict() 77 | df = self._get_labenc_of_cat_var_df(df, "month", month_dict) 78 | 79 | # Encode weekday 80 | weekday_dict = self._pdr.get_labenc_weekday_dict() 81 | df = self._get_labenc_of_cat_var_df(df, "weekday", weekday_dict) 82 | 83 | # Encode n previous stations 84 | station_dict = self._pdr.get_labenc_station_dict() 85 | for i in range(n): 86 | df = self._get_labenc_of_cat_var_df( 87 | df, str(i+1)+"_prev_station", station_dict) 88 | 89 | return df 90 | 91 | def generate_row_df(self, train_num, sj_df, j, n): 92 | """ 93 | Returns a single row data frame info to test the late minutes prediction 94 | algorithm. 95 | 96 | Args: 97 | train_num : A five digit train number eg. "12307". 98 | sj_df : A single journey data frame from which row data 99 | frame is to be obtained. 100 | j : The row index of the current station in sj_df whose n previous 101 | stations' info is required. 102 | n : Number of previous stations. 103 | """ 104 | column_names_list = self._tdfu._get_column_names_list(n) 105 | 106 | # train_type. zone. is_superfast, month, weekday 107 | feature_list = [self._tdfu._generate_train_type_str(train_num), 108 | self._tdfu._generate_zone_str(train_num), 109 | self._tdfu._is_superfast_str(train_num), 110 | self._tdfu._generate_month_str(sj_df, j), 111 | self._tdfu._generate_weekday_str(sj_df, j)] 112 | # n_prev_station 113 | feature_list.extend( 114 | self._tdfu._generate_n_prev_station_codes_list(sj_df, j, n)) 115 | # n_ps_late_mins 116 | feature_list.extend( 117 | self._tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, n)) 118 | # dist_bwn_stn_n-1_n 119 | feature_list.extend( 120 | self._tdfu._generate_n_prev_dist_bwn_stn_list(sj_df, j, n)) 121 | # stn_n_dist_frm_src 122 | feature_list.extend( 123 | self._tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, n)) 124 | # tfc_of_stn_n 125 | feature_list.extend( 126 | self._tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, n)) 127 | # deg_of_stn_n 128 | feature_list.extend( 129 | self._tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, n)) 130 | # crnt_stn_tfc 131 | feature_list.extend( 132 | self._tdfu._generate_n_prev_stn_tfc_strength_list(sj_df, j, 0)) 133 | # crnt_stn_deg 134 | feature_list.extend( 135 | self._tdfu._generate_n_prev_stn_deg_strength_list(sj_df, j, 0)) 136 | # crnt_stn_dist_frm_src 137 | feature_list.extend( 138 | self._tdfu._generate_n_prev_stn_dist_from_source_list(sj_df, j, 0)) 139 | # crnt_stn_late_mins 140 | feature_list.extend( 141 | self._tdfu._generate_n_prev_stn_late_mins_list(sj_df, j, 0)) 142 | 143 | feature_list_df = pd.DataFrame([feature_list], columns=column_names_list) 144 | # Obtain the label encoded feature_list_df 145 | feature_list_df = self._get_labenc_station_df(feature_list_df, n) 146 | return feature_list_df 147 | 148 | def remove_unwanted_columns_df(self, df, n): 149 | """ 150 | Returns the passed data frame after removal of unwanted columns from it. 151 | 152 | Args: 153 | df : The data frame from which columns are to be removed. 154 | n : Number of previous stations to the current station. 
155 | """ 156 | # Remove "stn_n_dist_frm_src" 157 | # Remove "tfc_of_stn_n" 158 | # Remove "deg_of_stn_n" 159 | # Remove "n_prev_station" 160 | for k in range(n): 161 | #temp = df.pop("stn_"+str(k+1)+"_dist_frm_src") 162 | #temp = df.pop("tfc_of_stn_"+str(k+1)) 163 | #temp = df.pop("deg_of_stn_"+str(k+1)) 164 | temp = df.pop(str(k+1)+"_prev_station") # Remove station code names. 165 | return df 166 | 167 | def get_predicted_late_mins_list(self, current_station, n, df, mdl): 168 | """ 169 | Returns the predicted late mins at the current_station. 170 | 171 | Args: 172 | current_station : Station Code for the station in question 173 | eg. "CNB", used to choose the RFR model. 174 | n : Number of previous station to the current_station to choose the 175 | RFR model. 176 | df : The data frame of current_station to predict late 177 | minutes at it. 178 | mdl : <"rfr"|"lmr"|"nnr"> 179 | "rfr": Random Forest Regressor Models. 180 | "lmr": Linear Model Regressor Models (not reliable). 181 | "nnr": Neural Network Regressor Models (not converged). 182 | """ 183 | model = joblib.load(self._model_path + mdl + "_models/" + str(n) + 184 | "ps_" + mdl + "_labenc_models/" + current_station + 185 | "_label_encoding_model.sav") 186 | pred_late_mins = model.predict(df) 187 | return pred_late_mins 188 | 189 | def _get_selected_stations_df(self, stn_index_list, df): 190 | """ 191 | Returns a station features data frame of selected stations. 192 | 193 | Args: 194 | stn_index_list : A list of stations indices for which station 195 | features data frame is to be constructed. 196 | df : A Complete DataFrame of 596 known station features 197 | """ 198 | selected_station_df = df.iloc[stn_index_list] 199 | return selected_station_df 200 | 201 | def get_station_nearest_neighbors_list(self, station, nps, n): 202 | """ 203 | Returns the n nearest neighbors stations to given station among the stations 204 | in passed data frame "df". 205 | 206 | Args: 207 | station : The station code for which nearest neighbors are needed. 208 | nps : Number of previous stations to choose stations having nps model. 209 | n : Number of nearest neighbors needed. 210 | """ 211 | # Choose the stations who have the respective nps models. 212 | # If the unknown station occurs as 3rd station in the complete journey, then 213 | # the nearest known station should have a 3 previous station model and so on. 214 | stns_hvng_nps_mdls = self._pdr.get_stations_having_nps_model_list(nps) 215 | 216 | # Get the station features data frame for known stations having nps models 217 | df = self._pdr.get_known_596_stations_features_df() 218 | df = df[df.Station.isin(stns_hvng_nps_mdls)] 219 | 220 | query_stn_feature = [[self._stn_geo_crdnates[station][0], 221 | self._stn_geo_crdnates[station][1], 222 | self._stn_deg_strength[station], 223 | self._stn_tfc_strength[station]]] 224 | # First choose neighbors which are geographically closer 225 | lat_lon_df = df[["Latitude", "Longitude"]] 226 | 227 | lat_lon_query_stn_ftr = [[self._stn_geo_crdnates[station][0], 228 | self._stn_geo_crdnates[station][1]]] 229 | ll_nbrs = NN(n_neighbors=n, algorithm="auto").fit(lat_lon_df) 230 | # ll_indices are directly indexed corresponding to stns_hvng_nps_mdls 231 | ll_distances, ll_indices = ll_nbrs.kneighbors(lat_lon_query_stn_ftr) 232 | 233 | # Subselect the chosen stations features from the complete station 234 | # features df. 
235 |     selected_station_fts_df = self._get_selected_stations_df(ll_indices[0], df)
236 | 
237 |     # Then choose neighbors based on degree and traffic strength among the
238 |     # above chosen geographically closer stations.
239 |     deg_tfc_df = selected_station_fts_df[["Degree_Strength", "Traffic_Strength"]]
240 |     deg_tfc_query_stn_ftr = [[self._stn_deg_strength[station],
241 |                               self._stn_tfc_strength[station]]]
242 |     dt_nbrs = NN(n_neighbors=n, algorithm="auto").fit(deg_tfc_df)
243 |     # dt_indices are indexed from 0, so not directly related to
244 |     # stns_hvng_nps_mdls
245 |     dt_distances, dt_indices = dt_nbrs.kneighbors(deg_tfc_query_stn_ftr)
246 | 
247 |     # Once the dt_indices are obtained (with the stations arranged by increasing
248 |     # distance in the degree and traffic strength features), get the
249 |     # station codes from the df at those indices. The dt_indices are indexed
250 |     # from 0 onwards with respect to the ll_indices, hence the following
251 |     # code; the ll_indices themselves are with respect to the df.
252 |     final_nearest_neighbors_stns_list = [df.iloc[ll_indices[0][idx]].Station
253 |                                          for idx in dt_indices[0]]
254 |     return final_nearest_neighbors_stns_list
255 | 
256 |   def get_predicted_late_mins_at_station_float(self, train_num, sj_df, idxof_stn,
257 |                                                n, station, pred_lms_sj, j, mdl):
258 |     """
259 |     Returns the predicted late minutes at the given "station".
260 | 
261 |     Args:
262 |       train_num : A five digit train number eg. "12307".
263 |       sj_df : A single journey data frame.
264 |       idxof_stn : Index of the current station in the single journey data frame.
265 |       n : N in the number of previous stations.
266 |       station : Station Code at which the late mins are to be predicted.
267 |       pred_lms_sj : Predicted Late Minutes list.
268 |       j : The current station index in the station list of sj_df.
269 |       mdl : <"rfr"> # For the random forest regressor model.
270 |     """
271 |     row_df_nps = self.generate_row_df(train_num, sj_df, idxof_stn, n)
272 |     temp = row_df_nps.pop("crnt_stn_late_mins")
273 |     # Remove the unwanted columns from the row data frame
274 |     row_df_nps = self.remove_unwanted_columns_df(row_df_nps, n)
275 | 
276 |     # Set the late minutes at the n previous stations as the predicted ones
277 |     for i in range(n):
278 |       row_df_nps[str(i+1)+"_ps_late_mins"] = pred_lms_sj[j-(i+1)]
279 | 
280 |     plm = self.get_predicted_late_mins_list(station, n, row_df_nps, mdl)
281 |     return plm[0]
282 | 
--------------------------------------------------------------------------------
/misc/result_analysis.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # author: gaurav.ramashish@gmail.com
5 | #
6 | # Desc: This file analyses the results.
7 | #
8 | 
9 | import joblib
10 | import numpy as np
11 | import os
12 | import pandas as pd
13 | from scipy import stats
14 | 
15 | from utilities.tt_utils import TrainingTestUtils as TTU
16 | 
17 | class ResultAnalysis(object):
18 |   def __init__(self):
19 |     self._ttu = TTU()
20 |     self._cdr = self._ttu._cdr
21 |     self._pdr = self._ttu._pdr
22 | 
23 |   def get_confidence_intervals_train_stations_df_dict(self, train_num,
24 |                                                       ci_prob=0.95):
25 |     """
26 |     Returns a dict of station vs a stats data frame of monthly CIs for the
27 |     given train; the train's complete journey data is considered.
28 | 
29 |     Args:
30 |       train_num : Train number eg. "12307"
31 |       ci_prob : The Confidence Interval probability. <0..1>
<0..1> 32 | """ 33 | stns_stats_dict = {} 34 | train_df = self._cdr.get_train_complete_journey_df(train_num) 35 | # Get all the unique stations whose CI has to be calculated monthly. 36 | unq_stns = train_df.station_code.unique() 37 | 38 | # Get all the unique months for whom CI for a station would be calculated. 39 | unq_mnts = train_df.month.unique() 40 | 41 | stn_stats_cols = ["month", "mean_lms", "std", "ci"] 42 | # Calculate CI for each stations. 43 | for stn in unq_stns: 44 | # Select the data frame for the current station 45 | stn_df = train_df[train_df.station_code == stn] 46 | stn_stats_df = [] 47 | # Calculate for every month. 48 | for mnt in unq_mnts: 49 | # Select the data frame out of stn_df for the current month. 50 | stn_mnt_df = stn_df[stn_df.month == mnt] 51 | if not stn_mnt_df.empty: 52 | # If the stn_mnt_df is not empty, remove outliers by Tukeys Rule. 53 | first_q = np.percentile(stn_mnt_df.latemin, 25) 54 | third_q = np.percentile(stn_mnt_df.latemin, 75) 55 | iqr = third_q - first_q 56 | upr_threshold = third_q + 1.5*iqr # Factor of 1.5 can be changed to 3. 57 | # Not calculating lower threshold since trains can arrive on time at 58 | # the stations in best cases. 59 | # Select cleaned stn_mnt_df by removing outliers (outlier late mins 60 | # due to trains being late at the source). 61 | cln_stn_mnt_df = stn_mnt_df[stn_mnt_df.latemin <= upr_threshold] 62 | mean_lms = cln_stn_mnt_df.latemin.mean() 63 | std = cln_stn_mnt_df.latemin.std() 64 | # Calculate length of late minutes list for which mean is calculated. 65 | len_lml = len((cln_stn_mnt_df.latemin.tolist())) 66 | ci = stats.t.interval(ci_prob, len_lml-1, loc=mean_lms, 67 | scale=std/np.sqrt(len_lml)) 68 | else: # If the stn_mnt_df is empty. 69 | mean_lms = -1 70 | std = -1 71 | ci = (-1, -1) 72 | stn_stats_df.append([mnt, mean_lms, std, ci]) 73 | stn_stats_df = pd.DataFrame(stn_stats_df, columns = stn_stats_cols) 74 | stns_stats_dict[stn] = stn_stats_df 75 | return stns_stats_dict 76 | 77 | def find_ci_probability_of_pred_lms_df(self, train_num, ci_prob=0.95, nps=4, 78 | rfr_mdl="", group="known"): 79 | """ 80 | Returns total number of predictions and number of predictions of 81 | late minutes at stations within CI ci_prob. 82 | 83 | Args: 84 | train_num : Train number eg. "12307". 85 | ci_prob : The Confidence Interval probability. <0..1> 86 | nps : number of previous stations 87 | rfr_mdl : <""|"without_nps_codes"|"wonps_wdts"> 88 | group : <"known"|"unknown"> 89 | """ 90 | # If the predicted late minutes for a station falls in its CI of fixed 91 | # ci_prob say 0.95, then there is 95% chance that the train will get delayed 92 | # by that many predicted late minutes at the chosen station. 93 | 94 | # Get the train's data frame for cross validation. 95 | train_df = self._cdr.get_train_journey_df(train_num, "unknown_test") 96 | # Get the predicted late minutes for the train's cross validation data. 97 | pred_lms_df = self._cdr.get_jw_pred_late_mins_of_train_df(train_num, nps, 98 | rfr_mdl, group) 99 | # Remove "JRNY END" rows from pred_lms_df. 100 | pred_lms_df = pred_lms_df.loc[~pred_lms_df.Stations.isin(["JRNY END"])] 101 | # Get the CI for the train's stations. 102 | stns_stats_dict = self.get_confidence_intervals_train_stations_df_dict( 103 | train_num, ci_prob) 104 | 105 | total_predictions = len(train_df) 106 | num_of_ci_prob_preds = 0 107 | 108 | for i in range(total_predictions): 109 | # Select the month in which late mins is predicted for the station. 
110 |       mnt = train_df.iloc[i].month
111 |       stn = train_df.iloc[i].station_code
112 |       pred_lms = float(pred_lms_df.iloc[i].PredictedLateMins)
113 | 
114 |       # Select the station's CI from stns_stats_dict.
115 |       try:
116 |         stn_stats = stns_stats_dict[stn]
117 |       except:
118 |         continue
119 |       stn_month_stats = stn_stats[stn_stats.month==mnt]
120 |       stn_month_stats_ci = tuple(stn_month_stats["ci"])
121 |       try:
122 |         if (pred_lms >= stn_month_stats_ci[0][0] and
123 |             pred_lms <= stn_month_stats_ci[0][1]):
124 |           num_of_ci_prob_preds += 1
125 |       except:
126 |         print stn_month_stats, pred_lms
127 | 
128 |     return total_predictions, num_of_ci_prob_preds
129 | 
130 |   def calculate_diff_of_af_df_and_nf_df(self, ci_prob):
131 |     """
132 |     Calculates the difference of "%_preds_within_ci" between the additional
133 |     features (af) data frame and the normal features (nf) data frame, and saves it in CSV files.
134 | 
135 |     Args:
136 |       ci_prob : The confidence interval probability. [0..1]
137 |     """
138 |     file_path = self._cdr._cdpath + "analysed_data/known_trains/"
139 | 
140 |     files = os.listdir(file_path)
141 |     CI = str(int(ci_prob * 100))
142 |     diff_df = []
143 |     diff_df_cols = ["train_number", "1ps", "2ps", "3ps", "4ps", "5ps"]
144 |     nf_df = pd.DataFrame()
145 |     af_df = pd.DataFrame()
146 | 
147 |     for f in files:
148 |       if f.startswith("CI"+CI):
149 |         df = pd.read_csv(file_path+f)
150 |         temp_df = pd.DataFrame()
151 |         temp_df[f[13:16]] = df["%_preds_within_ci"]
152 |         if f.endswith("model.csv"): # Info corresponding to normal features df.
153 |           nf_df = pd.concat([nf_df, temp_df], axis=1)
154 |         else: # Info corresponding to additional features df.
155 |           af_df = pd.concat([af_df, temp_df], axis=1)
156 | 
157 |     diff_df = af_df - nf_df
158 |     diff_df["train_number"] = pd.read_csv(file_path+files[0])["train_number"]
159 |     diff_df = diff_df[["train_number", "1ps", "2ps", "3ps", "4ps", "5ps"]]
160 |     desc = diff_df.describe()
161 |     diff_df.to_csv(file_path+"CI"+CI+"diff_bwn_af_nf_results.csv")
162 |     desc.to_csv(file_path+"CI"+CI+"diff_stats.csv")
163 | 
164 |   def calculate_AIC_or_BIC_float(self, train_num, nps, rfr_mdl="", group=""):
165 |     """
166 |     Calculates the AIC or BIC value of the model determined by nps for different test settings.
167 |     http://www.stat.wisc.edu/courses/st572-larget/Spring2007/handouts09-4.pdf
168 |     Uncomment the formula for calculating either AIC or BIC accordingly.
169 | 
170 |     Args:
171 |       train_num : Train Number for which the AIC/BIC is needed. eg. "12307"
172 |       nps : Number of previous stations
173 |       rfr_mdl : <""|"_wonps_wdts">
174 |       group : <"known"|"unknown">
175 |     """
176 |     jw_lms_df = self._cdr.get_jw_pred_late_mins_of_train_df(train_num, nps,
177 |                                                             rfr_mdl, group)
178 |     # Load any nps model to get the number of parameters or features
179 |     model = joblib.load(self._ttu._model_path+"rfr_models/"+str(nps)+"ps_rfr"+
180 |         "_labenc_models_complete"+rfr_mdl+"/CNB_label_encoding_model.sav")
181 | 
182 |     # Remove "JRNY END" rows from jw_lms_df.
183 |     jw_lms_df = jw_lms_df.loc[~jw_lms_df.Stations.isin(["JRNY END"])]
184 |     # Calculate the Residual Sum of Squares (also known as the Sum of Squared Errors)
185 |     actual_lms = jw_lms_df.ActualLateMins.astype(dtype=float)
186 |     pred_lms = jw_lms_df.PredictedLateMins.astype(dtype=float)
187 |     error = actual_lms - pred_lms
188 |     sqrd_error = error ** 2
189 |     rss = np.sum(sqrd_error) # Calculate RSS
190 |     num_of_obsrs = jw_lms_df.shape[0] # Calculate the Number of Observations
191 |     num_of_parms = model.n_features_
192 | 
193 |     #BIC = (num_of_obsrs * np.log((rss * 1.0)/num_of_obsrs) +
194 |     #       num_of_parms * np.log(num_of_obsrs))
195 |     #return BIC
196 | 
197 |     AIC = (num_of_obsrs * np.log((rss * 1.0)/num_of_obsrs) +
198 |            num_of_parms * 2)
199 |     return AIC
200 | 
201 |   def save_bic_df_and_calc_nps_with_minimum_bic_int(
202 |       self, rfr_mdl="", group=""):
203 |     """
204 |     Saves the BIC lists into a df and calculates the value of nps which has the
205 |     minimum BIC. This function is generic with respect to the value (either AIC
206 |     or BIC) returned by the function calculate_AIC_or_BIC_float; despite its
207 |     name, it does not save only BIC data frames.
208 |     Change the name of the saved data frame file in the last line accordingly.
209 | 
210 |     Args:
211 |       rfr_mdl : <""|"_wonps_wdts">
212 |       group : <"known"|"unknown">
213 |     """
214 |     df = []
215 |     columns = ["TrainNum", "1OR", "2OR", "3OR", "4OR", "5OR", "Min_n"]
216 |     all_trains = self._pdr.get_all_trains()[52:] # All Unknown Trains
217 |     for train in all_trains:
218 |       bic_list = [train]
219 |       min_i = 1
220 |       min_b = None
221 |       for i in range(1,6):
222 |         BIC = self.calculate_AIC_or_BIC_float(train, i, rfr_mdl, group)
223 |         bic_list.append(BIC)
224 |         if i == 1:
225 |           min_i = 1
226 |           min_b = BIC
227 |         else:
228 |           if BIC < min_b:
229 |             min_i = i
230 |             min_b = BIC
231 |       bic_list.append(min_i)
232 |       df.append(bic_list)
233 |     df = pd.DataFrame(df, columns=columns)
234 |     df.to_csv(self._cdr._cdpath+"analysed_data/"+group+"_trains/aic_analysis/"+
235 |               group+rfr_mdl+".csv", index=False)
236 | 
237 |   def calculate_sum_of_rmses_for_n_omlmpf_df(self, group, rfr_mdl=""):
238 |     """
239 |     Calculates and saves the total RMSE for all trains in different rfr_mdl
240 |     settings for different values of n in different groups. This gives an
241 |     overall measure of the performance of the different N-OMLMPFs, pointing out
242 |     the one with the minimum overall RMSE, which is thus used in production mode.
243 | 
244 |     Args:
245 |       group : <"known">
246 |       rfr_mdl : <""|"_wonps_wdts">
247 |     """
248 |     df = []
249 |     columns = [
250 |         "Train", "1-OMLMPF", "2-OMLMPF", "3-OMLMPF", "4-OMLMPF", "5-OMLMPF"]
251 |     trains = self._pdr.get_all_trains()[:52]
252 |     for train in trains:
253 |       train_wise_sum_rmse = [train]
254 |       for i in range(5):
255 |         rmse_list = self._pdr.get_rmse_of_journey_wise_lms_pred_list(
256 |             i+1, group, train, rfr_mdl=rfr_mdl)
257 |         train_wise_sum_rmse.append(sum(rmse_list))
258 |       df.append(train_wise_sum_rmse)
259 |     df = pd.DataFrame(df, columns=columns)
260 |     df.to_csv(self._cdr._cdpath+"analysed_data/"+group+"_trains/sum_rmse_of_"+
261 |               group+"_trains_"+rfr_mdl+".csv", index=False)
262 | 
263 | if __name__ == "__main__":
264 | 
265 |   # Uncomment the code in the different blocks below to run any specific data
266 |   # analysis code.
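  # (E.g. the first block below writes per-train CI-coverage CSVs for ci_prob
  # in {0.68, 0.95, 0.99}; the currently active call saves the AIC/BIC
  # analysis; the last block sums the journey-wise RMSEs per train.)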
267 |   ra = ResultAnalysis()
268 | 
269 | ##############################################################################
270 |   """
271 |   group = "test_unknown"
272 |   nps = 5
273 |   rfr_mdl = "_wonps_wdts"
274 |   all_trains = ra._pdr.get_all_trains()[52:] # Unknown Trains
275 |   train_ci_df_cols = ["train_number", "#_preds", "#_preds_within_ci",
276 |                       "%_preds_within_ci"]
277 | 
278 |   for ci_prob in [0.68, 0.95, 0.99]:
279 |     train_ci_df = []
280 |     for train in all_trains:
281 |       (total_predictions, num_of_ci_prob_preds) = (
282 |           ra.find_ci_probability_of_pred_lms_df(train, ci_prob, nps, rfr_mdl,
283 |                                                 group))
284 |       try:
285 |         train_ci_df.append([train, total_predictions, num_of_ci_prob_preds,
286 |                             num_of_ci_prob_preds*100.0/total_predictions])
287 |       except:
288 |         print train, total_predictions, num_of_ci_prob_preds
289 |     train_ci_df = pd.DataFrame(train_ci_df, columns = train_ci_df_cols)
290 |     train_ci_df.to_csv(ra._cdr._cdpath+"rmr_analysed_data/"+group+"_trains/CI"+
291 |         str(int(ci_prob * 100))+"_results_"+str(nps)+"ps_rmr_5e_1_model"+
292 |         rfr_mdl+".csv", index=False)
293 |   """
294 | ##############################################################################
295 |   """
296 |   ra.calculate_diff_of_af_df_and_nf_df(0.99)
297 |   """
298 | ##############################################################################
299 | 
300 |   ra.save_bic_df_and_calc_nps_with_minimum_bic_int("_wonps_wdts", "test_unknown")
301 | 
302 | ##############################################################################
303 |   """
304 |   ra.calculate_sum_of_rmses_for_n_omlmpf_df("known", "_wonps_wdts")
305 |   """
306 | 
--------------------------------------------------------------------------------
/code/utilities/df_utils.py:
--------------------------------------------------------------------------------
1 | #
2 | # Train Delay Estimation Project
3 | #
4 | # Author: Ramashish Gaurav
5 | #
6 | # Desc: Provides necessary utilities helpful for DataFrame creation
7 | #
8 | 
9 | # Import it first, as it imports data_path and models_path and paths to other
10 | # modules, namely pickle_data_reader and csv_data_reader.
11 | from env import *
12 | 
13 | import pickle
14 | import pandas as pd
15 | 
16 | from pickle_data_reader import PickleDataReader as PDR
17 | from csv_data_reader import CSVDataReader as CDR
18 | 
19 | class TrainDataFrameUtils(object):
20 | 
21 |   def __init__(self):
22 |     """
23 |     Initializes the pickle and CSV data readers used by the utility methods
24 |     below, which derive the train type <"EXPRESS"|..>, is_superfast and zone
25 |     <"CR"|"ECR"|..> information from a train number.
26 | 
27 | 
28 |     """
29 |     self._pdr = PDR(data_path)
30 |     self._cdr = CDR(data_path)
31 | 
32 |   def _generate_train_type_str(self, train_num):
33 |     """
34 |     Generates the train type <"SPECIAL"|"EXPRESS"|"OTHER"> based on the first
35 |     digit of the train number.
36 | 
37 |     Args:
38 |       train_num : A five digit train number e.g. "12307"
39 |     """
40 |     if (train_num[0]=='0'):
41 |       return "SPECIAL"
42 |     if (train_num[0]=='1' or train_num[0]=='2'):
43 |       return "EXPRESS"
44 |     return "OTHER"
45 | 
46 |   def _get_train_type_col_name_list(self):
47 |     return ["train_type"]
48 | 
49 |   def _is_superfast_str(self, train_num):
50 |     """
51 |     Returns a boolean depending on the second digit of the train
52 |     number.
53 | 
54 |     Args:
55 |       train_num : A five digit train number e.g. "12307"
"12307" 56 | """ 57 | if train_num[1]=='2': 58 | return True 59 | return False 60 | 61 | def _get_is_superfast_col_name_list(self): 62 | return ["is_superfast"] 63 | 64 | def _generate_zone_str(self, train_num): 65 | """ 66 | Returns zone string <"JS"|"CR"|...> based on the second and third digit of 67 | the train number. 68 | 69 | Args: 70 | train_num : A five digit train number e.g. "12307" 71 | """ 72 | if train_num[1]=='2': 73 | if train_num[2]=='0': 74 | return "JS" # 20 is for Shatabdis and Jan Shatabdis on all zonal railways 75 | if train_num[2]=='1': 76 | return "CR" # 21 is for superfasts on CR and WCR (formerly only CR) 77 | if train_num[2]=='2': 78 | return "NR" # 22 is for superfasts from various zones - 79 | # NR, NCR, NWR (formerly only NR). 80 | if train_num[2]=='3': 81 | return "ER" # 23 is for superfast on ER and ECR 82 | if train_num[2]=='4': 83 | return "NR" # 24 is for superfast on NR, NCR and NWR (formerly only NR) 84 | if train_num[2]=='5': 85 | return "NER" # 25 is for superfast on NER and NFR 86 | if train_num[2]=='6': 87 | return "SR" # 26 is for superfast on SR and SWR (formerly only SR) 88 | if train_num[2]=='7': 89 | return "SCR" # 27 is for superfast on SCR and SWR (formerly only SCR) 90 | if train_num[2]=='8': 91 | return "SER" # 28 is for superfast on SER, SECR and ECoR 92 | # (formerly only SER) 93 | if train_num[2]=='9': 94 | return "WR" # 29 is for superfast on WR, WCR and NWR (formerly only WR) 95 | return "OTHER" 96 | 97 | if train_num[1]=='0': 98 | return "KR" # 0 is for Konkan Railway 99 | if train_num[1]=='1': 100 | return "CR" # 1 is for CR, WCR and NCR(?) 101 | if train_num[1]=='3': 102 | return "ER" # 3 is shared by ER and ECR 103 | if train_num[1]=='4': 104 | return "NR" # 4 is for NR, NCR and NWR 105 | if train_num[1]=='5': 106 | return "NER" # 5 is shared by NER and NFR 107 | if train_num[1]=='6': 108 | return "SR" # 6 is for SR and SWR 109 | if train_num[1]=='7': 110 | return "SCR" # 7 is shared by SCR and SWR 111 | if train_num[1]=='8': 112 | return "SER" # 8 is for SER and ECoR 113 | if train_num[1]=='9': 114 | return "WR" # 9 is for WR, NWR and WCR 115 | return "OTHER" 116 | 117 | def _get_zone_col_name_list(self): 118 | return ["zone"] 119 | 120 | def _generate_month_str(self, sj_df, j): 121 | """ 122 | Returns the month value in the single journey data frame a particula row j. 123 | 124 | Args: 125 | sj_df : A single journey data frame corresponding to one 126 | single journey. 127 | j : The row index in sj_df at which month info is required. 128 | """ 129 | return sj_df["month"][j] 130 | 131 | def _get_month_col_name_list(self): 132 | return ["month"] 133 | 134 | def _generate_weekday_str(self, sj_df, j): 135 | """ 136 | Returns the weekday value in the single journey data frame a particula row j. 137 | 138 | Args: 139 | sj_df : A single journey data frame corresponding to one 140 | single journey. 141 | j : The row index in sj_df at which weekday info is required. 142 | """ 143 | return sj_df["weekday"][j] 144 | 145 | def _get_weekday_col_name_list(self): 146 | return ["weekday"] 147 | 148 | def _generate_n_prev_station_codes_list(self, sj_df, j, n): 149 | """ 150 | Returns a list containing n previous station codes to the current station. 151 | 152 | Args: 153 | sj_df : A single journey data frame. 154 | j : The row index of the current station in sj_df whose n previous 155 | stations codes list is required. 156 | n : Number of previous stations. 
157 |     """
158 |     l = []
159 |     for i in range(n):
160 |       l.append(sj_df["station_code"][j-(i+1)])
161 |     return l
162 | 
163 |   def _get_n_prev_stations_col_names_list(self, n):
164 |     """
165 |     Returns a list ["1_prev_station", "2_prev_station" ...] up to the value of n.
166 | 
167 |     Args:
168 |       n : Number of previous stations.
169 |     """
170 |     return [(str(i+1)+"_prev_station") for i in range(n)]
171 | 
172 | 
173 |   def _generate_n_prev_stn_late_mins_list(self, sj_df, j, n):
174 |     """
175 |     Returns a list containing the n previous stations' late minutes.
176 | 
177 |     Args:
178 |       sj_df : A single journey data frame.
179 |       j : The row index of the current station in sj_df whose n previous
180 |         stations' late minutes list is required.
181 |       n : Number of previous stations.
182 |     """
183 |     l = []
184 |     # If n == 0, return the current station's late minutes.
185 |     if n == 0:
186 |       l.append(sj_df["latemin"][j])
187 |       return l
188 | 
189 |     for i in range(n):
190 |       l.append(sj_df["latemin"][j-(i+1)])
191 |     return l
192 | 
193 |   def _get_n_prev_stn_late_mins_col_names_list(self, n):
194 |     """
195 |     Returns a list ["1_ps_late_mins", "2_ps_late_mins" ...] up to the value of n.
196 | 
197 |     Args:
198 |       n : Number of previous stations.
199 |     """
200 |     return [(str(i+1)+"_ps_late_mins") for i in range(n)]
201 | 
202 |   def _get_crnt_stn_late_mins_col_names_list(self):
203 |     return ["crnt_stn_late_mins"]
204 | 
205 |   def _generate_n_prev_dist_bwn_stn_list(self, sj_df, j, n):
206 |     """
207 |     Returns a list of the distances between each consecutive pair among the current and n previous stations.
208 | 
209 |     Args:
210 |       sj_df : A single journey data frame.
211 |       j : The row index of the current station in sj_df whose n previous
212 |         inter-station distances are required.
213 |       n : Number of previous stations.
214 |     """
215 |     l = []
216 |     for i in range(n):
217 |       l.append(sj_df["distance"][j-i] - sj_df["distance"][j-(i+1)])
218 |     return l
219 | 
220 |   def _get_n_prev_dist_bwn_stn_col_names_list(self, n):
221 |     """
222 |     Returns a list ["dist_bwn_stn_0_1", "dist_bwn_stn_1_2" ...] up to the value of n.
223 | 
224 |     Args:
225 |       n : Number of previous stations.
226 |     """
227 |     return [("dist_bwn_stn_"+str(i)+"_"+str(i+1)) for i in range(n)]
228 | 
229 |   def _generate_n_prev_stn_deg_strength_list(self, sj_df, j, n):
230 |     """
231 |     Returns a list containing the degree strengths of the n stations previous
232 |     to the current station.
233 | 
234 |     Args:
235 |       sj_df : A single journey data frame.
236 |       j : The row index of the current station in sj_df whose n previous
237 |         stations' degree strength list is required.
238 |       n : Number of previous stations.
239 |     """
240 |     l = []
241 |     # If n == 0, return the current station's degree strength.
242 |     if n == 0:
243 |       l.append(
244 |           self._pdr.get_station_degree_strength_dict()[
245 |               sj_df["station_code"][j]])
246 |       return l
247 | 
248 |     for i in range(n):
249 |       l.append(
250 |           self._pdr.get_station_degree_strength_dict()[
251 |               sj_df["station_code"][j-(i+1)]])
252 |     return l
253 | 
254 |   def _get_n_prev_stn_deg_col_names_list(self, n):
255 |     """
256 |     Returns a list ["deg_of_stn_1", "deg_of_stn_2" ...] up to the value of n.
257 | 
258 |     Args:
259 |       n : Number of previous stations.
260 |     """
261 |     return [("deg_of_stn_"+str(i+1)) for i in range(n)]
262 | 
263 |   def _get_crnt_stn_deg_col_names_list(self):
264 |     return ["crnt_stn_deg"]
265 | 
266 |   def _generate_n_prev_stn_tfc_strength_list(self, sj_df, j, n):
267 |     """
268 |     Returns a list containing the traffic strengths of the n stations previous
269 |     to the current station.
270 | 
271 |     Args:
272 |       sj_df : A single journey data frame.
273 |       j : The row index of the current station in sj_df whose n previous
274 |         stations' traffic strength list is required.
275 |       n : Number of previous stations.
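
      Example (illustrative; the station codes and traffic values are
      hypothetical): if sj_df["station_code"] holds ["ALD", "MGS", "BSB"] at
      indices 0..2 and the traffic strength dict maps {"ALD": 310, "MGS": 250},
      then j = 2 with n = 2 returns [250, 310], nearest previous station first.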
276 |     """
277 |     l = []
278 |     # If n == 0, return the current station's traffic strength.
279 |     if n == 0:
280 |       l.append(
281 |           self._pdr.get_station_traffic_strength_dict()[
282 |               sj_df["station_code"][j]])
283 |       return l
284 | 
285 |     for i in range(n):
286 |       l.append(
287 |           self._pdr.get_station_traffic_strength_dict()[
288 |               sj_df["station_code"][j-(i+1)]])
289 |     return l
290 | 
291 |   def _get_n_prev_stn_tfc_col_names_list(self, n):
292 |     """
293 |     Returns a list ["tfc_of_stn_1", "tfc_of_stn_2" ...] up to the value of n.
294 | 
295 |     Args:
296 |       n : Number of previous stations.
297 |     """
298 |     return [("tfc_of_stn_"+str(i+1)) for i in range(n)]
299 | 
300 |   def _get_crnt_stn_tfc_col_names_list(self):
301 |     return ["crnt_stn_tfc"]
302 | 
303 |   def _generate_n_prev_stn_dist_from_source_list(self, sj_df, j, n):
304 |     """
305 |     Returns a list containing the n previous stations' distances from the
306 |     source, for a given current station.
307 | 
308 |     Args:
309 |       sj_df : A single journey data frame.
310 |       j : The row index of the current station in sj_df whose n previous
311 |         stations' distances from the source are required.
312 |       n : Number of previous station codes.
313 |     """
314 |     l = []
315 |     # If n == 0, return the current station's distance from the source.
316 |     if n == 0:
317 |       l.append(sj_df["distance"][j])
318 |       return l
319 | 
320 |     for i in range(n):
321 |       l.append(sj_df["distance"][j-(i+1)])
322 |     return l
323 | 
324 |   def _get_n_prev_stn_dist_frm_src_col_names_list(self, n):
325 |     """
326 |     Returns a list ["stn_1_dist_frm_src", "stn_2_dist_frm_src" ...] up to the
327 |     value of n.
328 | 
329 |     Args:
330 |       n : Number of previous stations.
331 |     """
332 |     return [("stn_"+str(i+1)+"_dist_frm_src") for i in range(n)]
333 | 
334 |   def _get_crnt_stn_dist_frm_src_col_names_list(self):
335 |     return ["crnt_stn_dist_frm_src"]
336 | 
337 |   def _generate_single_journey_df(self, df, i, source_rows):
338 |     """
339 |     Returns the single journey data frame starting at the ith index in the
340 |     source rows, out of the given data frame.
341 | 
342 |     Args:
343 |       df : The complete data frame of a train.
344 |       i : The ith index of source_rows (rows at which journey info starts).
345 |       source_rows <[...]>: The complete list of source rows in a train df, i.e.
346 |         the indices in the df where the source station occurs for
347 |         each journey.
348 |     """
349 |     sj_df = None
350 |     if i == len(source_rows)-1:
351 |       sj_df = df[source_rows[i]:df.shape[0]] # Single Journey DataFrame.
352 |     else:
353 |       sj_df = df[source_rows[i]:source_rows[i+1]] # Single Journey DataFrame.
354 |     return sj_df
355 | 
356 |   def _get_column_names_list(self, n):
357 |     """
358 |     Returns a list of the column headers in a data frame.
359 | 
360 |     Args:
361 |       n : Number of previous stations.
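
      Example: for n = 1 the returned list is ["train_type", "zone",
      "is_superfast", "month", "weekday", "1_prev_station", "1_ps_late_mins",
      "dist_bwn_stn_0_1", "stn_1_dist_frm_src", "tfc_of_stn_1", "deg_of_stn_1",
      "crnt_stn_tfc", "crnt_stn_deg", "crnt_stn_dist_frm_src",
      "crnt_stn_late_mins"], i.e. the journey features followed by the current
      station's late minutes.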
362 |     """
363 |     column_names_list = self._get_train_type_col_name_list()
364 |     column_names_list.extend(self._get_zone_col_name_list())
365 |     column_names_list.extend(self._get_is_superfast_col_name_list())
366 |     column_names_list.extend(self._get_month_col_name_list())
367 |     column_names_list.extend(self._get_weekday_col_name_list())
368 |     column_names_list.extend(self._get_n_prev_stations_col_names_list(n))
369 |     column_names_list.extend(self._get_n_prev_stn_late_mins_col_names_list(n))
370 |     column_names_list.extend(self._get_n_prev_dist_bwn_stn_col_names_list(n))
371 |     column_names_list.extend(self._get_n_prev_stn_dist_frm_src_col_names_list(n))
372 |     column_names_list.extend(self._get_n_prev_stn_tfc_col_names_list(n))
373 |     column_names_list.extend(self._get_n_prev_stn_deg_col_names_list(n))
374 |     column_names_list.extend(self._get_crnt_stn_tfc_col_names_list())
375 |     column_names_list.extend(self._get_crnt_stn_deg_col_names_list())
376 |     column_names_list.extend(self._get_crnt_stn_dist_frm_src_col_names_list())
377 |     column_names_list.extend(self._get_crnt_stn_late_mins_col_names_list())
378 | 
379 |     return column_names_list
380 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 | 
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 | 
8 | Preamble
9 | 
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 | 
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 | 
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 | 
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 | 
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 | 
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. 
If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. 
Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 
234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 
296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 
414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. 
The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. 
You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 
583 | 
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 | 
589 | 15. Disclaimer of Warranty.
590 | 
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 | 
600 | 16. Limitation of Liability.
601 | 
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 | 
612 | 17. Interpretation of Sections 15 and 16.
613 | 
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 | END OF TERMS AND CONDITIONS
622 | 
623 | How to Apply These Terms to Your New Programs
624 | 
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year>  <name of author>
636 | 
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 | 
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 | 
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail.
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/philosophy/why-not-lgpl.html>.
675 | 
--------------------------------------------------------------------------------