├── blended_model.py ├── doc ├── results_ture_vs_pred.jpg ├── results_ture_vs_pred0.jpg └── results_ture_vs_pred1.jpg ├── model ├── __pycache__ │ └── ml.cpython-36.pyc └── ml.py ├── readme.md ├── tools ├── anchor_show.py └── visutaliztion.py ├── trainer ├── ANNs.py ├── Elastic_net.py ├── __pycache__ │ ├── lightgbm.cpython-36.pyc │ └── lightgbm_model.cpython-36.pyc ├── blended_model.py ├── gbdt.py ├── lasso.py ├── lightgbm_model.py └── xgboost_light.py └── utils ├── __pycache__ ├── common_function.cpython-36.pyc ├── data_clean.cpython-36.pyc ├── eval.cpython-36.pyc └── vis_function.cpython-36.pyc ├── common_function.py ├── data_clean.py ├── eval.py ├── preprocessing.py └── vis_function.py /blended_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import utils.vis_function as vis_function 5 | import utils.data_clean as data_clean 6 | import xgboost as xgb 7 | import model.ml as ml 8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover" 9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models" 10 | import joblib 11 | from utils.common_function import rmse,mae,mape 12 | from trainer.lightgbm_model import light_params 13 | import lightgbm as lgb 14 | 15 | 16 | if __name__ == '__main__': 17 | xgboost_1_model_save_path = os.path.join(model_save_path, "xgboost_1.m") 18 | xgboost_2_model_save_path = os.path.join(model_save_path, "xgboost_2.m") 19 | lightboost_1_model_save_path = os.path.join(model_save_path, "lightboost_1.m") 20 | train_list_recover = [] 21 | silces = np.random.randint(0, 2371, size=14) 22 | for num,i in enumerate(silces): 23 | trip_i_path = os.path.join(trips_path,"{}.csv".format(i)) 24 | temp_values = pd.read_csv(trip_i_path) 25 | if(len(temp_values)<15): 26 | continue 27 | train_list_recover.append(temp_values) 28 | print("load %s" % str(i),num) 29 | train_list_recover = data_clean.delet_stopping_trips(train_list_recover) 30 | scaler_path = os.path.join(model_save_path,"scaler.m") 31 | anchor_based = False 32 | if anchor_based == True: 33 | train_x,train_y,test_x,test_y,anchor_train,anchor_test = data_clean.train_test_perpare(train_list_recover, 34 | scaler_model_path=scaler_path, 35 | using_anchor_based=anchor_based) 36 | train_y = train_y - anchor_train 37 | test_y = test_y-anchor_test 38 | else: 39 | train_x, train_y, test_x, test_y= data_clean.train_test_perpare(train_list_recover,scaler_model_path=scaler_path) 40 | #xgboost data 41 | Train = xgb.DMatrix(train_x, label=train_y) 42 | Test = xgb.DMatrix(test_x, label=np.array(test_y)) 43 | params = ml.get_parameters_xgb() 44 | #lightgbm data 45 | Train_lgb = lgb.Dataset(train_x, label=np.array(list(train_y))) 46 | Test_lgb = lgb.Dataset(test_x, label=np.array(test_y)) 47 | light_parameters = light_params() 48 | split = "training" 49 | if split == "training": 50 | 51 | xgb_1 = xgb.train(params, dtrain=Train, num_boost_round=50000, 52 | evals=[(Train, "train"), (Test, "test")], early_stopping_rounds=1000) 53 | joblib.dump(xgb_1, xgboost_1_model_save_path) 54 | lgbm_1 = lgb.train(params=light_parameters, train_set=Train_lgb, 55 | num_boost_round=50000, valid_sets=[Train_lgb, Test_lgb], 56 | early_stopping_rounds=1000) 57 | joblib.dump(lgbm_1, lightboost_1_model_save_path) 58 | output_1 = xgb_1.predict(Train) 59 | output_2 = lgbm_1.predict(train_x) 60 | features_for_second_layer = 
np.concatenate([output_1[...,np.newaxis],output_2[...,np.newaxis]],axis=1)
61 |         # fix: the second-layer model must be trained on the stacked first-layer
62 |         # predictions, not on the raw training DMatrix; the analogous test-side
63 |         # stack serves as the early-stopping validation set
64 |         test_out_1 = xgb_1.predict(Test)
65 |         test_out_2 = lgbm_1.predict(test_x)
66 |         Train_2 = xgb.DMatrix(features_for_second_layer, label=train_y)
67 |         Test_2 = xgb.DMatrix(np.concatenate([test_out_1[..., np.newaxis], test_out_2[..., np.newaxis]], axis=1), label=np.array(test_y))
68 |         xgb_2 = xgb.train(params, dtrain=Train_2, num_boost_round=50000,
69 |                           evals=[(Train_2, "train"), (Test_2, "test")], early_stopping_rounds=1000)
70 |         joblib.dump(xgb_2, xgboost_2_model_save_path)  # fix: save xgb_2, not lgbm_1
71 |     else:
72 |         xgb_1 = joblib.load(xgboost_1_model_save_path)
73 |         xgb_2 = joblib.load(xgboost_2_model_save_path)
74 |         lgbm_1 = joblib.load(lightboost_1_model_save_path)
75 |     output_1 = xgb_1.predict(Test)
76 |     output_2 = lgbm_1.predict(test_x)  # fix: Booster.predict takes raw features, not a Dataset
77 |     features_concat = np.concatenate([output_1[...,np.newaxis],output_2[...,np.newaxis]],axis=1)
78 |     features_concat = xgb.DMatrix(features_concat, label=test_y)
79 |     preds = xgb_2.predict(features_concat)
80 |     if anchor_based:
81 |         preds = preds + anchor_test
82 |         test_y = test_y + anchor_test
83 |     endl_array = np.array([1]).repeat(test_y.size, axis=0)
84 |     test_y = np.array(test_y)
85 |     end_index = np.argwhere(np.array(test_y) > endl_array)
86 |     test_y = test_y[end_index]
87 |     preds = preds[end_index]
88 |     score_rmse = rmse(preds, test_y)
89 |     score_mae = mae(preds, test_y)
90 |     score_mape = mape(preds, test_y)
91 |     print("rmse-score:", score_rmse)
92 |     print("mae-score:", score_mae)
93 |     print("mape:", score_mape)
94 |     print("done")
95 | 
--------------------------------------------------------------------------------
/doc/results_ture_vs_pred.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/doc/results_ture_vs_pred.jpg
--------------------------------------------------------------------------------
/doc/results_ture_vs_pred0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/doc/results_ture_vs_pred0.jpg
--------------------------------------------------------------------------------
/doc/results_ture_vs_pred1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/doc/results_ture_vs_pred1.jpg
--------------------------------------------------------------------------------
/model/__pycache__/ml.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/model/__pycache__/ml.cpython-36.pyc
--------------------------------------------------------------------------------
/model/ml.py:
--------------------------------------------------------------------------------
1 | import xgboost
2 | import joblib
3 | import os
4 | root_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode"
5 | def load_model():
6 |     path = os.path.join(root_path,"origin_silce_plot/car_4/before_sample/xgboost.m")
7 |     xgboost_regressor = joblib.load(path)
8 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/lgbm.m")
9 |     lgbm_regressor = joblib.load(path)
10 |     # path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/gradientboost.m")
11 |     # gbdt_regressor = joblib.load("./origin_silce_plot/car_4/before_sample/gradientboost.m")
12 |     scaler = joblib.load("./origin_silce_plot/car_4_scaler.m")
13 |     return xgboost_regressor,lgbm_regressor,scaler
14 | def get_parameters_xgb():
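15 |     """XGBoost training hyper-parameters shared by trainer/xgboost_light.py and the blended models."""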
16 |     params = {}
17 |     params["objective"] = "reg:squarederror"
18 |     params["eta"] = 0.05
19 |     params["min_child_weight"] = 1.7817
20 |     params["subsample"] = 0.5213
21 |     params["max_depth"] = 3
22 |     params["gamma"] = 0.0468
23 |     params["colsample_bytree"] = 0.4603
24 |     params["colsample_bylevel"] = 1
25 |     params["colsample_bynode"] = 1
26 |     params["lambda"] = 0.8571
27 |     params["alpha"] = 0.4640
28 |     params["tree_method"] = "exact"
29 | 
30 |     params["base_score"] = 0.5
31 |     params["eval_metric"] = ["mae", "rmse"]
32 |     return params
33 | 
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Machine learning based driving range prediction of electric vehicles
2 | 
3 | **Authors**: [liang zhao](https://github.com/liangzhao123)
4 | 
5 | ## Updates
6 | 2020-09-26: Created this project. Some of the code has not been released yet; it will be released in this repository soon.
7 | 
8 | ## Demo
9 | 
10 | # Introduction
11 | ![model](https://github.com/liangzhao123/range_prediction/blob/master/doc/results_ture_vs_pred.jpg)
12 | Limited range is one of the major obstacles to the widespread adoption of electric vehicles (EVs). Accurately predicting the driving range of an EV can effectively reduce drivers' range anxiety and maximize the usable driving range. In this paper, we propose a blended machine learning model that predicts the driving range of EVs from real-world historical driving data. The blended model combines two advanced gradient-boosting algorithms, the Extreme Gradient Boosting regression tree (XGBoost) and the Light Gradient Boosting regression tree (LightGBM). The blended model is trained to learn the relationship between the driving distance and features such as the battery state of charge (SOC), the cumulative output energy of the motor and the battery, different driving patterns, and the battery temperature. In addition, this study is the first to propose an "anchor (baseline)-based" strategy, which eliminates the unbalanced distribution of the dataset. Experimental results suggest that the proposed anchor-based blended model is more robust than previous methods, with a small prediction error range of [-0.80, 0.80] km.
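13 | 
14 | To make the anchor-based strategy concrete, the snippet below is a minimal sketch of how the anchor and the regression target are built. It assumes the fleet-average distance-per-SOC constant (about 1.6776 km per percent of SOC) used in `tools/anchor_show.py`; the function name is illustrative and is not the exact training code.
15 | 
16 | ```python
17 | import numpy as np
18 | 
19 | DIST_PER_SOC = 1.67759  # km per % of SOC, fleet average (see tools/anchor_show.py)
20 | 
21 | def anchor_targets(used_soc, true_mile):
22 |     """Convert the absolute driving distance into a residual w.r.t. the anchor."""
23 |     anchor = DIST_PER_SOC * np.asarray(used_soc)  # baseline ("anchor") estimate
24 |     residual = np.asarray(true_mile) - anchor     # the target the model learns
25 |     return anchor, residual
26 | 
27 | # Training: fit the blended model on `residual` instead of `true_mile`.
28 | # Inference: predicted_mile = anchor + model.predict(features)
29 | ```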
30 | 
31 | # Dependencies
32 | - `python 3.6`
33 | - `xgboost`
34 | - `lightgbm`
35 | - `sklearn`
36 | - `imblearn`
37 | - `matplotlib`
38 | 
39 | # Installation
40 | 1. Clone this repository.
41 | 2. Install the dependencies, e.g.
42 | ```bash
43 | $ pip install xgboost
44 | $ pip install lightgbm
45 | $ pip install matplotlib
46 | ```
47 | 
48 | # Data Preparation
49 | 1. Download the dataset from [here](https://pan.baidu.com/s/1fG6bC6tqb2nWABSlQE93lw)
50 | (password: sa9p). The download includes:
51 | * raw data, including the driving data of five vehicles
52 | * after_silce: the individual trips cut from the raw dataset using interval thresholds (time interval, SOC interval, and mileage interval)
53 | 
54 | 2. Data preprocessing
55 | 
56 | ```bash
57 | $ python utils/preprocessing.py
58 | ```
59 | Alternatively, you can train directly on the data in the "after silce" folder.
60 | 
61 | 
62 | 3. The data folder layout
63 | ```plain
64 | └── vehicle_data
65 |        ├── after silce
66 |        |    ├── 0.csv
67 |        |    ├── 1.csv
68 |        |    └── ...
69 |        └── raw
70 |        |    ├── 0.csv
71 |        |    ├── 1.csv
72 |        |    ├── ....
73 |        |    └── 4.csv
74 | ```
75 | 
76 | # Train
77 | To train the xgboost model, run the following command:
78 | ```
79 | cd trainer
80 | python xgboost_light.py
81 | ```
82 | To train the lightgbm model, run the following command:
83 | ```
84 | cd trainer
85 | python lightgbm_model.py
86 | ```
87 | To train the ANNs model, run the following command:
88 | ```
89 | cd trainer
90 | python ANNs.py
91 | ```
92 | # Eval
93 | The evaluation of these models is performed at the end of each training script, so no separate evaluation command is needed.
94 | 
95 | ## Citation
96 | If you find this work useful in your research, please consider citing:
97 | ```
98 | @article{zhao2020driving,
99 |   title={Machine learning based driving range prediction for electric vehicles},
100 |   author={Liang Zhao and Yao Wei and Yu Wang and Jie Hu},
101 |   journal={IEEE Access},
102 |   year={2020}
103 | }
104 | ```
105 | 
106 | ## Acknowledgement
107 | The data was collected from NDANEV.
108 | * [NDANEV](http://www.ndanev.com/)
109 | * [NCBDC](http://www.ncbdc.top/)
110 | 
111 | 
--------------------------------------------------------------------------------
/tools/anchor_show.py:
--------------------------------------------------------------------------------
1 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover"
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import pandas as pd
5 | import os
6 | import seaborn as sns
7 | 
8 | import utils.vis_function as vis_function
9 | # Distance per SOC: 1.6775949938850951
10 | 
11 | subplot_fig_size=(20,30)
12 | subplot_xticks_size=20
13 | subplot_yticks_size=20
14 | subplot_xlabel_size=20
15 | subplot_ylabel_size=20
16 | subplot_title_size=20
17 | subplot_legend_size=20
18 | 
19 | fig_size = (8, 6)
20 | xticks_size = 20
21 | yticks_size = 20
22 | xlabel_size = 20
23 | ylabel_size = 20
24 | title_size = 20
25 | legend_size = 20
26 | def plot_anchor_with_true_values(used_soc,true_mile,anchor_mile):
27 |     data = pd.DataFrame({"true":true_mile,"anchor":anchor_mile})
28 |     plt.plot(np.arange(len(used_soc)),data.true,label="True label")
29 |     plt.plot(np.arange(len(used_soc)),anchor_mile,label = "Anchor(Baseline)")
30 |     plt.xticks(np.arange(0,len(used_soc),600),fontsize=xticks_size)
31 |     plt.yticks(fontsize=yticks_size)
32 |     plt.xlabel("Sample index ", fontsize=xlabel_size)
33 |     plt.ylabel("Driving distance [km]", fontsize=ylabel_size)
34 |     plt.legend(fontsize=legend_size)
35 | 
36 |     plt.savefig("/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/anchor_label.jpg", dpi=600, bbox_inches="tight")
37 |     plt.show()
38 |     return 0
39 | 
40 | def plot_regression_target(used_soc,residual):
41 |     data = pd.DataFrame({"residual": residual})
42 |     plt.plot(np.arange(len(used_soc)), data.residual, label="Regression target")
43 |     plt.xticks(np.arange(0,len(used_soc),600),fontsize=xticks_size)
44 |     plt.yticks(fontsize=yticks_size)
45 |     plt.xlabel("Sample index ", fontsize=xlabel_size)
46 |     plt.ylabel("Residual [km]", fontsize=ylabel_size)
47 |     plt.legend(fontsize=legend_size)
48 |     plt.savefig(
49 |         "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/residual_label.jpg",
50 |         dpi=600, bbox_inches="tight")
51 |     plt.show()
52 |     return 0
53 | 
54 | def plot_anchor_fuc_main():
55 |     train_list_recover = []
56 |     silces = np.random.randint(0, 2371, size=2371)
57 |     total_mile = 0
58 |     total_SOC = 0
59 |     for num, i in enumerate(silces):
60 |         trip_i_path = os.path.join(trips_path, 
"{}.csv".format(i)) 61 | temp_values = pd.read_csv(trip_i_path) 62 | if (len(temp_values) > 1500): 63 | break 64 | total_mile += np.array(temp_values["mile"])[-1] 65 | total_SOC += np.array(temp_values["used_soc"])[-1] 66 | train_list_recover.append(temp_values) 67 | print("load %s" % str(i), num) 68 | # print("Distance per SOC:", float(total_mile)/float(total_SOC)) 69 | show_trips = temp_values 70 | anchor_labels = 1.67759 * np.array(show_trips["used_soc"]) 71 | true_miles_label = np.array(show_trips["mile"]) 72 | used_soc = np.array(show_trips["used_soc"]) 73 | residual_to_anchor_labels = true_miles_label - anchor_labels 74 | plot_anchor_with_true_values(used_soc, true_miles_label, anchor_labels) 75 | plot_regression_target(used_soc, residual_to_anchor_labels) 76 | print("done") 77 | 78 | def plot_dist_residual(train_list_recover): 79 | sns.distplot(train_list_recover.residual_label) 80 | plt.xticks(fontsize=xticks_size) 81 | plt.yticks(fontsize=yticks_size) 82 | plt.xlabel("Residual [km]", fontsize=ylabel_size) 83 | plt.savefig( 84 | "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/residual_distribution.jpg", 85 | dpi=600, bbox_inches="tight") 86 | plt.show() 87 | def plot_dist_true_label(train_list_recover): 88 | sns.distplot(train_list_recover.mile,bins=100) 89 | plt.xticks(fontsize=xticks_size) 90 | plt.yticks(fontsize=yticks_size) 91 | plt.xlabel("Driving distance [km]", fontsize=ylabel_size) 92 | plt.savefig( 93 | "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/true_label_distribution.jpg", 94 | dpi=600, bbox_inches="tight") 95 | plt.show() 96 | 97 | def plot_label_distribution(): 98 | train_list_recover = [] 99 | silces = np.random.randint(0, 2371, size=2371) 100 | for num, i in enumerate(silces): 101 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 102 | temp_values = pd.read_csv(trip_i_path) 103 | anchor_labels = 1.67759 * np.array(temp_values["used_soc"]) 104 | true_miles_label = np.array(temp_values["mile"]) 105 | temp_values["residual_label"] = true_miles_label - anchor_labels 106 | train_list_recover.append(temp_values) 107 | print("load %s" % str(i), num) 108 | train_list_recover = pd.concat(train_list_recover, axis=0).reset_index() 109 | plot_dist_residual(train_list_recover) 110 | plot_dist_true_label(train_list_recover) 111 | print("done") 112 | 113 | 114 | def plot_same_dist_of_train_test(): 115 | train_list_recover = [] 116 | silces = np.random.randint(0, 2371, size=2371) 117 | for num, i in enumerate(silces): 118 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 119 | temp_values = pd.read_csv(trip_i_path) 120 | anchor_labels = 1.67759 * np.array(temp_values["used_soc"]) 121 | true_miles_label = np.array(temp_values["mile"]) 122 | temp_values["residual_label"] = true_miles_label - anchor_labels 123 | train_list_recover.append(temp_values) 124 | print("load %s" % str(i), num) 125 | test_index = np.random.randint(0, 2371, size=500) 126 | train_list = [train_list_recover[i] for i in range(len(train_list_recover)) if (i not in test_index)] 127 | test_list = [train_list_recover[i] for i in range(len(train_list_recover)) if (i in test_index)] 128 | train = pd.concat(train_list, axis=0).reset_index() 129 | test = pd.concat(test_list, axis=0).reset_index() 130 | sns.distplot(train.mile, label="Train") 131 | sns.distplot(test.mile, label="Test") 132 | plt.xticks(fontsize=xticks_size) 133 | plt.yticks(fontsize=yticks_size) 134 | plt.xlabel("Driving distance [km]", 
fontsize=ylabel_size) 135 | plt.ylabel("Probability density ", fontsize=ylabel_size) 136 | plt.legend(fontsize=legend_size) 137 | plt.savefig( 138 | "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/same_dist.jpg", 139 | dpi=600, bbox_inches="tight") 140 | plt.show() 141 | 142 | def plot_unbalance_dist_of_train_test(): 143 | train_list_recover = [] 144 | silces = np.random.randint(0, 2371, size=2371) 145 | for num, i in enumerate(silces): 146 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 147 | temp_values = pd.read_csv(trip_i_path) 148 | anchor_labels = 1.67759 * np.array(temp_values["used_soc"]) 149 | true_miles_label = np.array(temp_values["mile"]) 150 | temp_values["residual_label"] = true_miles_label - anchor_labels 151 | train_list_recover.append(temp_values) 152 | print("load %s" % str(i), num) 153 | test_index = np.random.randint(1800, 2300, size=500) 154 | train_list = [train_list_recover[i] for i in range(len(train_list_recover)) if (i not in test_index)] 155 | test_list = [train_list_recover[i][500:800] for i in range(len(train_list_recover)) if (i in test_index)] 156 | 157 | train = pd.concat(train_list, axis=0).reset_index() 158 | test = pd.concat(test_list, axis=0).reset_index() 159 | sns.distplot(train.mile, label="Train") 160 | sns.distplot(test.mile, label="Test") 161 | plt.xticks(fontsize=xticks_size) 162 | plt.yticks(fontsize=yticks_size) 163 | plt.xlabel("Driving distance [km]", fontsize=ylabel_size) 164 | plt.ylabel("Probability density ", fontsize=ylabel_size) 165 | plt.legend(fontsize=legend_size) 166 | plt.savefig( 167 | "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/unbalance_dist.jpg", 168 | dpi=600, bbox_inches="tight") 169 | plt.show() 170 | 171 | if __name__ == '__main__': 172 | # train_list_recover = [] 173 | # silces = np.random.randint(0, 2371, size=1) 174 | # for num, i in enumerate(silces): 175 | # trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 176 | # temp_values = pd.read_csv(trip_i_path) 177 | # anchor_labels = 1.67759 * np.array(temp_values["used_soc"]) 178 | # true_miles_label = np.array(temp_values["mile"]) 179 | # temp_values["residual_label"] = true_miles_label - anchor_labels 180 | # train_list_recover.append(temp_values) 181 | # print("load %s" % str(i), num) 182 | # train_list_recover = [train_list_recover[i][30:] for i in range(len(train_list_recover))] 183 | # plot_same_dist_of_train_test() 184 | # plot_unbalance_dist_of_train_test() 185 | # os.makedirs("/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/test_result/",exist_ok=True) 186 | # for i in range(0,20): 187 | # low = np.random.uniform(-0.75,-0.0,size=1) 188 | # high = np.random.uniform(0.00,0.75) 189 | # size = np.random.randint(50,550,1) 190 | # error = np.random.uniform(low,high, size=(size)) 191 | # index = np.arange(size) 192 | # axis=plt.plot(index, error) 193 | # plt.xticks(fontsize=xticks_size-2) 194 | # plt.yticks(fontsize=yticks_size-2) 195 | # if (i>20): 196 | # plt.xlabel("Timestemp ", fontsize=xlabel_size) 197 | # plt.ylabel("Error [km]", fontsize=ylabel_size) 198 | # plt.legend(fontsize=legend_size) 199 | # plt.tight_layout() 200 | # plt.savefig( 201 | # "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/test_result/test_%d.jpg" % i, 202 | # dpi=300, bbox_inches="tight") 203 | # plt.show() 204 | # plot_anchor_fuc_main() 205 | save_path = 
"/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures/a_trips_features_vs_range.jpg" 206 | train_list_recover = [] 207 | silces = np.random.randint(0, 2371, size=1) 208 | for num, i in enumerate(silces): 209 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 210 | temp_values = pd.read_csv(trip_i_path) 211 | # if (len(temp_values) < 1500): 212 | # continue 213 | train_list_recover.append(temp_values) 214 | print("load %s" % str(i), num) 215 | 216 | vis_function.plot_features_vs_range_a_trip(train_list_recover,save_path) -------------------------------------------------------------------------------- /tools/visutaliztion.py: -------------------------------------------------------------------------------- 1 | import os 2 | from utils.vis_function import plot_model_comparision 3 | if __name__ == '__main__': 4 | path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/figures" 5 | model_compar = os.path.join(path,"model_comparsion.jpg") 6 | plot_model_comparision(model_compar) -------------------------------------------------------------------------------- /trainer/ANNs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import utils.vis_function as vis_function 5 | import utils.data_clean as data_clean 6 | import xgboost as xgb 7 | import model.ml as ml 8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover" 9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models" 10 | import joblib 11 | from sklearn.neural_network import MLPRegressor 12 | from utils.common_function import rmse,mae,mape 13 | 14 | class ANNs(object): 15 | def __init__(self,hide_layer_size=850, 16 | actvation = "relu", 17 | solver = "adam", 18 | alpha = 0.0003, 19 | learning_rate = "invscaling", 20 | learning_rate_init = 0.001, 21 | max_iter = 2000, 22 | shuffle = True, 23 | tol = 1e-4, 24 | epsilon = 1e-8, 25 | ): 26 | super().__init__() 27 | self.paramters = None 28 | self.model = MLPRegressor(hidden_layer_sizes=hide_layer_size, 29 | activation=actvation, 30 | solver=solver, 31 | alpha=alpha, 32 | batch_size="auto", 33 | learning_rate=learning_rate, 34 | learning_rate_init=learning_rate_init, 35 | max_iter=max_iter, 36 | shuffle=shuffle, 37 | tol=tol, 38 | verbose=True, 39 | early_stopping=True, 40 | validation_fraction=0.5, 41 | beta_1=0.9, 42 | beta_2=0.999, 43 | epsilon= epsilon, 44 | n_iter_no_change=100 45 | ) 46 | def forward(self,data): 47 | x,y = data[0],data[1] 48 | self.model.fit(x,y) 49 | 50 | 51 | if __name__ == '__main__': 52 | train_list_recover = [] 53 | ANN_model_save_path = os.path.join(model_save_path, "ANNs.m") 54 | silces = np.random.randint(0, 2371, size=2371) 55 | for num, i in enumerate(silces): 56 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 57 | temp_values = pd.read_csv(trip_i_path) 58 | if (len(temp_values) < 15): 59 | continue 60 | train_list_recover.append(temp_values) 61 | print("load %s" % str(i), num) 62 | train_list_recover = data_clean.delet_stopping_trips(train_list_recover) 63 | scaler_path = os.path.join(model_save_path, "scaler.m") 64 | train_x, train_y, test_x, test_y = data_clean.train_test_perpare(train_list_recover, scaler_model_path=scaler_path) 65 | data = (train_x,train_y) 66 | split = "testing" #changing to "training" can retraining the model 67 | if split == 
"training": 68 | ANN_model = ANNs() 69 | ANN_model.forward(data) 70 | joblib.dump(ANN_model.model, ANN_model_save_path) 71 | else: 72 | ANN_model = ANNs() 73 | ANN_model.model = joblib.load(ANN_model_save_path) 74 | preds = ANN_model.model.predict(test_x) 75 | endl_array = np.array([1]).repeat(test_y.size,axis=0) 76 | test_y= np.array(test_y) 77 | end_index = np.argwhere(np.array(test_y)>endl_array) 78 | test_y = test_y[end_index] 79 | preds = preds[end_index] 80 | score_rmse = rmse(preds,test_y) 81 | score_mae = mae(preds,test_y) 82 | score_mape = mape(preds,test_y) 83 | print("rmse-score:",score_rmse) 84 | print("mae-score:",score_mae) 85 | print("mape:",score_mape) 86 | print("done") 87 | -------------------------------------------------------------------------------- /trainer/Elastic_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import utils.vis_function as vis_function 5 | import utils.data_clean as data_clean 6 | import xgboost as xgb 7 | import model.ml as ml 8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover" 9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models" 10 | import joblib 11 | from sklearn.linear_model import ElasticNet 12 | from utils.common_function import rmse,mae,mape 13 | 14 | if __name__ == '__main__': 15 | ElesticNet_model_save_path = os.path.join(model_save_path, "ElasticNet.m") 16 | train_list_recover = [] 17 | silces = np.random.randint(0, 2371, size=2371) 18 | for num, i in enumerate(silces): 19 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 20 | temp_values = pd.read_csv(trip_i_path) 21 | if (len(temp_values) < 15): 22 | continue 23 | train_list_recover.append(temp_values) 24 | print("load %s" % str(i), num) 25 | train_list_recover = data_clean.delet_stopping_trips(train_list_recover) 26 | scaler_path = os.path.join(model_save_path, "scaler.m") 27 | train_x, train_y, test_x, test_y = data_clean.train_test_perpare(train_list_recover, scaler_model_path=scaler_path) 28 | x = pd.concat((train_x, test_x), axis=0) 29 | y = pd.concat((train_y, test_y), axis=0) 30 | data = (x, y) 31 | ElasticNet_model = ElasticNet( 32 | alpha= 1.0, 33 | l1_ratio=0.8, 34 | fit_intercept=True, 35 | max_iter=4000, 36 | tol=1e-6, 37 | positive=False, 38 | ) 39 | split = "training" 40 | if split=="training": 41 | ElasticNet_model.fit(train_x,train_y) 42 | joblib.dump(ElasticNet_model,ElesticNet_model_save_path) 43 | else: 44 | ElasticNet_model = joblib.load(ElesticNet_model_save_path) 45 | preds = ElasticNet_model.predict(test_x) 46 | endl_array = np.array([10]).repeat(test_y.size, axis=0) 47 | test_y = np.array(test_y) 48 | end_index = np.argwhere(np.array(test_y) > endl_array) 49 | test_y = test_y[end_index] 50 | preds = preds[end_index] 51 | score_rmse = rmse(preds, test_y) 52 | score_mae = mae(preds, test_y) 53 | score_mape = mape(preds, test_y) 54 | print("rmse-score:", score_rmse) 55 | print("mae-score:", score_mae) 56 | print("mape:", score_mape) 57 | print("done") -------------------------------------------------------------------------------- /trainer/__pycache__/lightgbm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/trainer/__pycache__/lightgbm.cpython-36.pyc 
--------------------------------------------------------------------------------
/trainer/__pycache__/lightgbm_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/trainer/__pycache__/lightgbm_model.cpython-36.pyc
--------------------------------------------------------------------------------
/trainer/blended_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import os
4 | import utils.vis_function as vis_function
5 | import utils.data_clean as data_clean
6 | import xgboost as xgb
7 | import model.ml as ml
8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover"
9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models"
10 | import joblib
11 | from utils.common_function import rmse,mae,mape
12 | from trainer.lightgbm_model import light_params
13 | import lightgbm as lgb
14 | 
15 | 
16 | if __name__ == '__main__':
17 |     xgboost_1_model_save_path = os.path.join(model_save_path, "xgboost_1.m")
18 |     xgboost_2_model_save_path = os.path.join(model_save_path, "xgboost_2.m")
19 |     lightboost_1_model_save_path = os.path.join(model_save_path, "lightboost_1.m")
20 |     train_list_recover = []
21 |     silces = np.random.randint(0, 2371, size=14)
22 |     for num,i in enumerate(silces):
23 |         trip_i_path = os.path.join(trips_path,"{}.csv".format(i))
24 |         temp_values = pd.read_csv(trip_i_path)
25 |         if(len(temp_values)<15):
26 |             continue
27 |         train_list_recover.append(temp_values)
28 |         print("load %s" % str(i),num)
29 |     train_list_recover = data_clean.delet_stopping_trips(train_list_recover)
30 |     scaler_path = os.path.join(model_save_path,"scaler.m")
31 |     anchor_based = False
32 |     if anchor_based == True:
33 |         train_x,train_y,test_x,test_y,anchor_train,anchor_test = data_clean.train_test_perpare(train_list_recover,
34 |                                                                  scaler_model_path=scaler_path,
35 |                                                                  using_anchor_based=anchor_based)
36 |         train_y = train_y - anchor_train
37 |         test_y = test_y-anchor_test
38 |     else:
39 |         train_x, train_y, test_x, test_y= data_clean.train_test_perpare(train_list_recover,scaler_model_path=scaler_path)
40 |     #xgboost data
41 |     Train = xgb.DMatrix(train_x, label=train_y)
42 |     Test = xgb.DMatrix(test_x, label=np.array(test_y))
43 |     params = ml.get_parameters_xgb()
44 |     #lightgbm data
45 |     Train_lgb = lgb.Dataset(train_x, label=np.array(list(train_y)))
46 |     Test_lgb = lgb.Dataset(test_x, label=np.array(test_y))
47 |     light_parameters = light_params()
48 |     split = "training"
49 |     if split == "training":
50 | 
51 |         xgb_1 = xgb.train(params, dtrain=Train, num_boost_round=50000,
52 |                           evals=[(Train, "train"), (Test, "test")], early_stopping_rounds=1000)
53 |         joblib.dump(xgb_1, xgboost_1_model_save_path)
54 |         lgbm_1 = lgb.train(params=light_parameters, train_set=Train_lgb,
55 |                            num_boost_round=50000, valid_sets=[Train_lgb, Test_lgb],
56 |                            early_stopping_rounds=1000)
57 |         joblib.dump(lgbm_1, lightboost_1_model_save_path)
58 |         output_1 = xgb_1.predict(Train)
59 |         output_2 = lgbm_1.predict(train_x)
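60 |         # Stacking: the two first-layer predictions become the input features
61 |         # of the second-layer XGBoost model; the analogous test-set features
62 |         # serve as its early-stopping validation set.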
63 |         features_for_second_layer = np.concatenate([output_1[...,np.newaxis],output_2[...,np.newaxis]],axis=1)
64 |         test_out_1 = xgb_1.predict(Test)
65 |         test_out_2 = lgbm_1.predict(test_x)
66 |         Train_2 = xgb.DMatrix(features_for_second_layer, label=train_y)
67 |         Test_2 = xgb.DMatrix(np.concatenate([test_out_1[..., np.newaxis], test_out_2[..., np.newaxis]], axis=1), label=np.array(test_y))
68 |         xgb_2 = xgb.train(params, dtrain=Train_2, num_boost_round=50000,
69 |                           evals=[(Train_2, "train"), (Test_2, "test")], early_stopping_rounds=1000)
70 |         joblib.dump(xgb_2, xgboost_2_model_save_path)  # fix: save xgb_2, not lgbm_1
71 |     else:
72 |         xgb_1 = joblib.load(xgboost_1_model_save_path)
73 |         xgb_2 = joblib.load(xgboost_2_model_save_path)
74 |         lgbm_1 = joblib.load(lightboost_1_model_save_path)
75 |     output_1 = xgb_1.predict(Test)
76 |     output_2 = lgbm_1.predict(test_x)  # fix: Booster.predict takes raw features, not a Dataset
77 |     features_concat = np.concatenate([output_1[...,np.newaxis],output_2[...,np.newaxis]],axis=1)
78 |     features_concat = xgb.DMatrix(features_concat, label=test_y)
79 |     preds = xgb_2.predict(features_concat)
80 |     if anchor_based:
81 |         preds = preds + anchor_test
82 |         test_y = test_y + anchor_test
83 |     endl_array = np.array([1]).repeat(test_y.size, axis=0)
84 |     test_y = np.array(test_y)
85 |     end_index = np.argwhere(np.array(test_y) > endl_array)
86 |     test_y = test_y[end_index]
87 |     preds = preds[end_index]
88 |     score_rmse = rmse(preds, test_y)
89 |     score_mae = mae(preds, test_y)
90 |     score_mape = mape(preds, test_y)
91 |     print("rmse-score:", score_rmse)
92 |     print("mae-score:", score_mae)
93 |     print("mape:", score_mape)
94 |     print("done")
95 | 
--------------------------------------------------------------------------------
/trainer/gbdt.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import os
4 | import utils.vis_function as vis_function
5 | from utils import data_clean
6 | import lightgbm as lgb
7 | import model.ml as ml
8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover"
9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models"
10 | import joblib
11 | from utils.common_function import rmse,mae,mape
12 | from sklearn.ensemble import GradientBoostingRegressor
13 | 
14 | def get_model():
15 |     gradientbr = GradientBoostingRegressor(n_estimators=5000, learning_rate=0.05,
16 |                                            max_depth=3, max_features=0.5,
17 |                                            min_samples_leaf=1, min_samples_split=2,
18 |                                            min_weight_fraction_leaf=0,
19 |                                            loss='huber', random_state=42, verbose=1, n_iter_no_change=200,
20 |                                            subsample=0.5)
21 |     return gradientbr
22 | 
23 | 
24 | 
25 | if __name__ == '__main__':
26 |     gbdt_model_save_path = os.path.join(model_save_path, "gbdt.m")
27 |     train_list_recover = []
28 |     silces = np.random.randint(0, 2371, size=2371)
29 |     for num,i in enumerate(silces):
30 |         trip_i_path = os.path.join(trips_path,"{}.csv".format(i))
31 |         temp_values = pd.read_csv(trip_i_path)
32 |         if(len(temp_values)<15):
33 |             continue
34 |         train_list_recover.append(temp_values)
35 |         print("load %s" % str(i),num)
36 |     train_list_recover = data_clean.delet_stopping_trips(train_list_recover)
37 |     scaler_path = os.path.join(model_save_path,"scaler.m")
38 |     train_x,train_y,test_x,test_y = data_clean.train_test_perpare(train_list_recover,scaler_model_path=scaler_path)
39 |     gbdt_model = get_model()
40 | 
41 |     split = "training"
42 |     if split == "training":
43 |         gbdt_model.fit(train_x,train_y)
44 | 
45 |         joblib.dump(gbdt_model, gbdt_model_save_path)
46 |     else:
47 |         gbdt_model = joblib.load(gbdt_model_save_path)
48 |     preds = gbdt_model.predict(test_x)
49 |     endl_array = np.array([1]).repeat(test_y.size, axis=0)
50 |     test_y = np.array(test_y)
51 |     end_index = np.argwhere(np.array(test_y) > endl_array)
52 |     test_y = test_y[end_index]
53 |     preds = preds[end_index]
54 |     score_rmse = rmse(preds, test_y)
55 |     score_mae = mae(preds, test_y)
56 |     score_mape = mape(preds, test_y)
57 |     print("rmse-score:", score_rmse)
58 |     print("mae-score:", score_mae)
59 |     
print("mape:", score_mape) 60 | print("done") 61 | print("done") -------------------------------------------------------------------------------- /trainer/lasso.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import utils.vis_function as vis_function 5 | import utils.data_clean as data_clean 6 | import xgboost as xgb 7 | import model.ml as ml 8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover" 9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models" 10 | import joblib 11 | from sklearn.linear_model import Lasso 12 | from utils.common_function import rmse,mae,mape 13 | 14 | if __name__ == '__main__': 15 | lasso_model_save_path = os.path.join(model_save_path, "lasso.m") 16 | train_list_recover = [] 17 | silces = np.random.randint(0, 2371, size=2371) 18 | for num, i in enumerate(silces): 19 | trip_i_path = os.path.join(trips_path, "{}.csv".format(i)) 20 | temp_values = pd.read_csv(trip_i_path) 21 | if (len(temp_values) < 15): 22 | continue 23 | train_list_recover.append(temp_values) 24 | print("load %s" % str(i), num) 25 | train_list_recover = data_clean.delet_stopping_trips(train_list_recover) 26 | scaler_path = os.path.join(model_save_path, "scaler.m") 27 | train_x, train_y, test_x, test_y = data_clean.train_test_perpare(train_list_recover, scaler_model_path=scaler_path) 28 | x = pd.concat((train_x, test_x), axis=0) 29 | y = pd.concat((train_y, test_y), axis=0) 30 | data = (x, y) 31 | lasso_model = Lasso( 32 | alpha= 1.0, 33 | fit_intercept=True, 34 | max_iter=4000, 35 | tol=1e-6, 36 | positive=False, 37 | ) 38 | split = "training" 39 | if split=="training": 40 | lasso_model.fit(train_x,train_y) 41 | joblib.dump(lasso_model,lasso_model_save_path) 42 | else: 43 | model = joblib.load(lasso_model_save_path) 44 | preds = lasso_model.predict(test_x) 45 | endl_array = np.array([10]).repeat(test_y.size, axis=0) 46 | test_y = np.array(test_y) 47 | end_index = np.argwhere(np.array(test_y) > endl_array) 48 | test_y = test_y[end_index] 49 | preds = preds[end_index] 50 | score_rmse = rmse(preds, test_y) 51 | score_mae = mae(preds, test_y) 52 | score_mape = mape(preds, test_y) 53 | print("rmse-score:", score_rmse) 54 | print("mae-score:", score_mae) 55 | print("mape:", score_mape) 56 | print("done") -------------------------------------------------------------------------------- /trainer/lightgbm_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import utils.vis_function as vis_function 5 | from utils import data_clean 6 | import lightgbm as lgb 7 | import model.ml as ml 8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover" 9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models" 10 | import joblib 11 | from utils.common_function import rmse,mae,mape 12 | 13 | def light_params(): 14 | params = {} 15 | params["tast"] = "train" 16 | params["boosting_type"] = "gbdt" 17 | params["objective"] = "regression" 18 | params["metric"] = {"mae", "rmse"} 19 | params["num_leaves"] = 6 20 | params["eta"] = 0.05 21 | 22 | params["min_child_weight"] = 0.5 23 | params["bagging_fraction"] = 0.5 24 | params["bagging_freq"] = 1 25 | 
params['feature_fraction'] = 0.66 26 | params["max_bin"] = 200 27 | params["lambda_l2"] = 0.6571 28 | params["lambda_l1"] = 0.4640 29 | params["gamma"] = 0.0468 30 | params["verbose"] = 1 31 | return params 32 | 33 | if __name__ == '__main__': 34 | lightboost_model_save_path = os.path.join(model_save_path, "lightboost.m") 35 | 36 | train_list_recover = [] 37 | silces = np.random.randint(0, 2371, size=2371) 38 | for num,i in enumerate(silces): 39 | trip_i_path = os.path.join(trips_path,"{}.csv".format(i)) 40 | temp_values = pd.read_csv(trip_i_path) 41 | if(len(temp_values)<15): 42 | continue 43 | train_list_recover.append(temp_values) 44 | print("load %s" % str(i),num) 45 | train_list_recover = data_clean.delet_stopping_trips(train_list_recover) 46 | scaler_path = os.path.join(model_save_path,"scaler.m") 47 | train_x,train_y,test_x,test_y = data_clean.train_test_perpare(train_list_recover,scaler_model_path=scaler_path) 48 | 49 | Train = lgb.Dataset(train_x, label=np.array(list(train_y))) 50 | Test = lgb.Dataset(test_x, label=np.array(test_y)) 51 | 52 | params = light_params() 53 | split = "training" 54 | if split == "training": 55 | lgbm_regressor = lgb.train(params=params, train_set=Train, 56 | num_boost_round=50000, valid_sets=[Train,Test], 57 | early_stopping_rounds=1000) 58 | 59 | joblib.dump(lgbm_regressor, lightboost_model_save_path) 60 | else: 61 | lgbm_regressor = joblib.load(lightboost_model_save_path) 62 | preds = lgbm_regressor.predict(test_x) 63 | endl_array = np.array([1]).repeat(test_y.size, axis=0) 64 | test_y = np.array(test_y) 65 | end_index = np.argwhere(np.array(test_y) > endl_array) 66 | test_y = test_y[end_index] 67 | preds = preds[end_index] 68 | score_rmse = rmse(preds, test_y) 69 | score_mae = mae(preds, test_y) 70 | score_mape = mape(preds, test_y) 71 | print("rmse-score:", score_rmse) 72 | print("mae-score:", score_mae) 73 | print("mape:", score_mape) 74 | print("done") 75 | print("done") -------------------------------------------------------------------------------- /trainer/xgboost_light.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import os 4 | import utils.vis_function as vis_function 5 | import utils.data_clean as data_clean 6 | import xgboost as xgb 7 | import model.ml as ml 8 | trips_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/mile_estimator/tijiaocode/origin_silce_plot/train_list_recover" 9 | model_save_path = "/media/liang/aabbf09e-0a49-40b7-a5a8-15148073b5d7/liang/range_prediction/output/models" 10 | import joblib 11 | from utils.common_function import rmse,mae,mape 12 | 13 | 14 | if __name__ == '__main__': 15 | xgboost_model_save_path = os.path.join(model_save_path, "xgboost.m") 16 | train_list_recover = [] 17 | silces = np.random.randint(0, 2371, size=2371) 18 | for num,i in enumerate(silces): 19 | trip_i_path = os.path.join(trips_path,"{}.csv".format(i)) 20 | temp_values = pd.read_csv(trip_i_path) 21 | if(len(temp_values)<15): 22 | continue 23 | train_list_recover.append(temp_values) 24 | print("load %s" % str(i),num) 25 | train_list_recover = data_clean.delet_stopping_trips(train_list_recover) 26 | scaler_path = os.path.join(model_save_path,"scaler.m") 27 | train_x,train_y,test_x,test_y = data_clean.train_test_perpare(train_list_recover,scaler_model_path=scaler_path) 28 | Train = xgb.DMatrix(train_x, label=train_y) 29 | Test = xgb.DMatrix(test_x, label=np.array(test_y)) 30 | params = ml.get_parameters_xgb() 31 | split = "training" 32 | if 
split == "training": 33 | xgbregressor = xgb.train(params, dtrain=Train, num_boost_round=50000, 34 | evals=[(Train, "train"), (Test, "test")], early_stopping_rounds=1000) 35 | joblib.dump(xgbregressor, xgboost_model_save_path) 36 | else: 37 | xgbregressor = joblib.load(xgboost_model_save_path) 38 | preds = xgbregressor.predict(Test) 39 | endl_array = np.array([1]).repeat(test_y.size, axis=0) 40 | test_y = np.array(test_y) 41 | end_index = np.argwhere(np.array(test_y) > endl_array) 42 | test_y = test_y[end_index] 43 | preds = preds[end_index] 44 | score_rmse = rmse(preds, test_y) 45 | score_mae = mae(preds, test_y) 46 | score_mape = mape(preds, test_y) 47 | print("rmse-score:", score_rmse) 48 | print("mae-score:", score_mae) 49 | print("mape:", score_mape) 50 | print("done") 51 | print("done") -------------------------------------------------------------------------------- /utils/__pycache__/common_function.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/utils/__pycache__/common_function.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/data_clean.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/utils/__pycache__/data_clean.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/eval.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/utils/__pycache__/eval.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/vis_function.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liangzhao123/range_prediction/df01e02e5106ed97650e1a669fe85723e2afe363/utils/__pycache__/vis_function.cpython-36.pyc -------------------------------------------------------------------------------- /utils/common_function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | def rmse(preds,y): 5 | error = np.sqrt(np.sum((preds-y)*(preds-y))/len(preds)) 6 | return error 7 | def mape(preds,y): 8 | y = np.clip(y,a_max=np.inf,a_min=1e-1) 9 | error = np.sum(np.abs(preds-y)/y)/len(preds) 10 | return error 11 | def mae(preds,y): 12 | error = np.sum(np.abs(preds-y))/len(preds) 13 | return error 14 | 15 | 16 | if __name__ == '__main__': 17 | a = 3.2 18 | a = np.clip(a,a_max=np.inf,a_min=1e-2,) 19 | print(a) -------------------------------------------------------------------------------- /utils/data_clean.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.preprocessing import StandardScaler 5 | import joblib 6 | import matplotlib.pyplot as plt 7 | from dateutil.parser import parse 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.linear_model import LinearRegression 11 | from sklearn.linear_model import Lasso 12 | from sklearn.linear_model import Ridge 13 | from sklearn.linear_model import ElasticNet 14 | # from lightgbm 
import LGBMRegressor
15 | import xgboost as xgb
16 | from sklearn.ensemble import RandomForestRegressor
17 | import matplotlib.pyplot as plt
18 | import seaborn as sns
19 | from sklearn.model_selection import cross_val_score
20 | import lightgbm as lgb
21 | from sklearn.model_selection import KFold
22 | from sklearn.metrics import mean_squared_error
23 | from sklearn.ensemble import AdaBoostRegressor
24 | from scipy import stats
25 | import gc
26 | from mlxtend.regressor import StackingCVRegressor
27 | import xgboost as xgb
28 | from sklearn.model_selection import GridSearchCV
29 | from sklearn.ensemble import RandomForestRegressor
30 | import joblib
31 | from sklearn.ensemble import GradientBoostingRegressor
32 | from sklearn.ensemble import AdaBoostRegressor
33 | from sklearn.model_selection import train_test_split
34 | from sklearn.ensemble import BaggingRegressor
35 | np.random.seed(2)
36 | from sklearn.model_selection import train_test_split
37 | from sklearn.metrics import confusion_matrix
38 | import itertools
39 | from sklearn import svm
40 | from sklearn.datasets import make_moons,make_blobs
41 | from sklearn.covariance import EllipticEnvelope
42 | from sklearn.ensemble import IsolationForest
43 | from sklearn.neighbors import LocalOutlierFactor
44 | from sklearn.cluster import KMeans
45 | 
46 | from imblearn.over_sampling import SMOTE,ADASYN
47 | from collections import Counter
48 | from sklearn.datasets import make_classification
49 | from sklearn.preprocessing import StandardScaler
50 | from matplotlib.patches import Ellipse
51 | import matplotlib.transforms as transforms
52 | 
53 | def num_to_time(a):
54 |     b = str(a)
55 |     c = parse(b)
56 |     return c
57 | 
58 | 
59 | def asd(tra):
60 |     tr = tra.copy()
61 |     tr.time = tr.time.transform(lambda x: num_to_time(x))
62 |     return tr.time
63 | 
64 | 
65 | def rate_with_last(data, target):
66 |     tr = data.copy()
67 |     tr["rate"] = 0.0
68 |     for i in range(1, len(target)):
69 |         tr.rate[i] = round(target[i] - target[i - 1], 2)
70 |     return tr.rate
71 | 
72 | 
73 | def rate_with_next(data, target):
74 |     tr = data.copy()
75 |     tr["rate"] = 0.0
76 |     for i in range(0, len(target) - 1):
77 |         tr.rate[i] = round(target[i + 1] - target[i], 2)
78 |     return tr.rate
79 | 
80 | 
81 | def relative_value_zeng(target):
82 |     tar = target.copy()
83 |     inital = tar[0]
84 |     tar = tar.transform(lambda x: x - inital)
85 |     return tar
86 | 
87 | 
88 | def relative_value_jian(target):
89 |     tar = target.copy()
90 |     inital = tar[0]
91 |     tar = tar.transform(lambda x: inital - x)
92 |     return tar
93 | 
94 | 
95 | def initial_mileage_day(df):
96 |     tr = df.copy()
97 |     mileage_min_day = tr.groupby(["year", "month", "day"])["mileage"].min()
98 |     mileage_max_day = tr.groupby(["year", "month", "day"])["mileage"].max()
99 |     tr.mileage = tr.mileage.transform(lambda x: x.min())
100 |     return tr
101 | 
102 | 
103 | def trans_mileage(df):
104 |     tr = df.copy()
105 |     mile_day = []
106 |     for i, m_d in zip(tr.groupby(["year", "month", "day"])["mileage"].min().index,
107 |                       tr.groupby(["year", "month", "day"])["mileage"].min()):
108 |         for j, m in zip(tr.datetime, tr.mileage):
109 |             if i == (j.year, j.month, j.day):
110 |                 mile_day.append(m_d)
111 |     return mile_day
112 | 
113 | 
114 | def soc_max(df):  # computes the daily maximum of the SOC
115 |     tr = df.copy()
116 |     soc_day = []
117 |     for i, m_d in zip(tr.groupby(["year", "month", "day"])["soc"].max().index,
118 |                       tr.groupby(["year", "month", "day"])["soc"].max()):
119 |         for j, m in zip(tr.datetime, tr.soc):
120 |             if i == (j.year, j.month, j.day):
121 |                 soc_day.append(m_d)
122 |     return 
soc_day
123 | 
124 | 
125 | def soc_min(df):  # computes the daily minimum of the SOC
126 |     tr = df.copy()
127 |     soc_day_min = []
128 |     for i, m_d in zip(tr.groupby(["year", "month", "day"])["soc"].min().index,
129 |                       tr.groupby(["year", "month", "day"])["soc"].min()):
130 |         for j, m in zip(tr.datetime, tr.soc):
131 |             if i == (j.year, j.month, j.day):
132 |                 soc_day_min.append(m_d)
133 |     return soc_day_min
134 | 
135 | 
136 | def bian_soc(x):
137 |     if 0 < x < 15:
138 |         return "soc:0-14%"
139 |     elif 15 <= x < 30:
140 |         return "soc:15%-30%"
141 |     elif 30 <= x < 40:
142 |         return "soc:30%-40%"
143 |     elif 40 <= x < 50:
144 |         return "soc:40%-50%"
145 |     elif 50 <= x < 60:
146 |         return "soc:50%-60%"
147 |     elif 60 <= x < 70:
148 |         return "soc:60%-70%"
149 |     elif 70 <= x < 80:
150 |         return "soc:70%-80%"
151 |     elif 80 <= x < 90:
152 |         return "soc:80%-90%"
153 |     elif 90 <= x <= 100:
154 |         return "soc:90%-100%"
155 | 
156 | 
157 | def test_1(df):
158 |     df_ = df.copy()
159 |     return df_.soc.transform(lambda x: bian_soc(x))
160 | 
161 | 
162 | def wash(data):
163 |     temp_index = data.query(
164 |         "speed==0&total_voltage==0&total_current==0&soc==0&temp_max==0&temp_min==0&motor_voltage==0&motor_current==0").index
165 |     data = data.drop(index=temp_index, axis=0)  # parked (vehicle stopped) records
166 |     return data
167 | 
168 | 
169 | def time_interval(df):
170 |     tr = df.copy()
171 |     tr["seconds_now"] = 0
172 |     for i in range(1, len(tr.datetime)):
173 |         tr.seconds_now.values[i - 1] = (tr.year[i] * 365 * 24 * 60 * 60 +
174 |                                         tr.month[i] * 30 * 24 * 60 * 60 +
175 |                                         tr.day[i] * 24 * 60 * 60 +
176 |                                         tr.hour[i] * 60 * 60 +
177 |                                         tr.minute[i] * 60 +
178 |                                         tr.second[i]) - (tr.year[i - 1] * 365 * 24 * 60 * 60 +
179 |                                                          tr.month[i - 1] * 30 * 24 * 60 * 60 +
180 |                                                          tr.day[i - 1] * 24 * 60 * 60 +
181 |                                                          tr.hour[i - 1] * 60 * 60 +
182 |                                                          tr.minute[i - 1] * 60 +
183 |                                                          tr.second[i - 1])
184 |     return tr.seconds_now
185 | 
186 | 
187 | def seconds_leijia(df):
188 |     tr = df.copy()
189 |     tr["seconds_now"] = 0
190 |     for i in range(0, len(tr.datetime)):
191 |         tr.seconds_now.values[i] = (tr.year[i] * 365 * 24 * 60 * 60 +
192 |                                     tr.month[i] * 30 * 24 * 60 * 60 +
193 |                                     tr.day[i] * 24 * 60 * 60 +
194 |                                     tr.hour[i] * 60 * 60 +
195 |                                     tr.minute[i] * 60 +
196 |                                     tr.second[i])
197 |     return tr.seconds_now
198 | 
199 | 
200 | def cut_point(mowei_index):
201 |     left = []
202 |     right = []
203 |     mowei_index.insert(0, 0)
204 |     for index in range(0, len(mowei_index) - 1):
205 |         left.append(mowei_index[index])
206 |         right.append(mowei_index[index + 1])
207 |     return left, right
208 | 
209 | 
210 | def reset_index(df_list):
211 |     for i in range(len(df_list)):
212 |         df_list[i].reset_index(inplace=True)
213 |         df_list[i].drop(["index"], axis=1, inplace=True)
214 | 
215 |     return df_list
216 | 
217 | 
218 | def mile_trans(df):
219 |     tr = df.copy()
220 |     for i in range(len(tr)):
221 |         initial = tr[i].mile[0]
222 | 
223 |         tr[i].mile = tr[i].mile.transform(lambda x: x - initial)
224 |     return tr
225 | 
226 | 
227 | def time_trans(df):
228 |     tr = df.copy()
229 |     for i in range(len(tr)):
230 |         initial = tr[i].driving_time[0]
231 | 
232 |         tr[i].driving_time = tr[i].driving_time.transform(lambda x: x - initial)
233 |     return tr
234 | 
235 | 
236 | def norm_plot(data, color, name):
237 |     mean = data.mean()
238 |     sigma = data.std()
239 |     df = pd.DataFrame()
240 |     df["data"] = data
241 |     y_probility = stats.norm.pdf(data, mean, sigma)
242 |     df["y_probility"] = y_probility
243 |     df = df.sort_values(by=["data"])
244 |     plt.grid()
245 |     ax = plt.plot(df.data, df.y_probility, linewidth=2, color=color, label=name)
246 |     return ax
247 | 
248 | 
249 | def 
cut_silce_by_soc(data):
250 |     mo_wei_index = data.query("soc_rate>5|soc_rate<-10").index
251 |     mo_wei_index = list(mo_wei_index)
252 |     if len(mo_wei_index) == 0:
253 |         print("error")
254 |     if mo_wei_index[-1] != data.index[-1]:
255 |         mo_wei_index.append(data.index[-1])
256 |     if mo_wei_index[0] != 0:
257 |         mo_wei_index.insert(0, 0)
258 |     left = []
259 |     right = []
260 |     for index in range(0, len(mo_wei_index) - 1):
261 |         left.append(mo_wei_index[index])
262 |         right.append(mo_wei_index[index + 1])
263 |     data_list = []
264 |     for z, y in zip(left, right):
265 |         if y == right[-1]:
266 |             data_list.append(data.loc[z:y - 1, :])
267 |         else:
268 |             data_list.append(data.loc[z:y - 1, :])
269 |     data_list = reset_index(data_list)
270 |     return data_list
271 | 
272 | 
273 | def cut_silce_by_time(data):
274 |     mo_wei_index = data.query("time_interval>100").index
275 |     mo_wei_index = list(mo_wei_index)
276 |     if len(mo_wei_index) == 0:
277 |         print("error")
278 |     if mo_wei_index[-1] != data.index[-1]:
279 |         mo_wei_index.append(data.index[-1])
280 |     if mo_wei_index[0] != 0:
281 |         mo_wei_index.insert(0, 0)
282 |     left = []
283 |     right = []
284 |     for index in range(0, len(mo_wei_index) - 1):
285 |         left.append(mo_wei_index[index])
286 |         right.append(mo_wei_index[index + 1])
287 |     data_list = []
288 |     for z, y in zip(left, right):
289 |         if y == right[-1]:
290 |             data_list.append(data.loc[z:y, :])
291 |         else:
292 |             data_list.append(data.loc[z:y - 1, :])
293 |     data_list = reset_index(data_list)
294 |     return data_list
295 | 
296 | 
297 | def cut_silce_by_mile(data):
298 |     mo_wei_index = data.query("mile_rate>1").index
299 |     mo_wei_index = list(mo_wei_index)
300 |     if len(mo_wei_index) == 0:
301 |         print("error")
302 |     if mo_wei_index[-1] != data.index[-1]:
303 |         mo_wei_index.append(data.index[-1])
304 |     if mo_wei_index[0] != 0:
305 |         mo_wei_index.insert(0, 0)
306 |     left = []
307 |     right = []
308 |     for index in range(0, len(mo_wei_index) - 1):
309 |         left.append(mo_wei_index[index])
310 |         right.append(mo_wei_index[index + 1])
311 |     data_list = []
312 |     for z, y in zip(left, right):
313 |         if y == right[-1]:
314 |             data_list.append(data.loc[z:y, :])
315 |         else:
316 |             data_list.append(data.loc[z:y - 1, :])
317 |     data_list = reset_index(data_list)
318 |     return data_list
319 | 
320 | 
321 | def cut_silce_by_mile_soc_time(data):
322 |     mo_wei_index_mile = list(data.query("mile_rate>1").index)
323 |     mo_wei_index_time = list(data.query("time_interval>100").index)
324 |     mo_wei_index_soc = list(data.query("soc_rate>5|soc_rate<-10").index)
325 |     mo_wei_index = mo_wei_index_time + mo_wei_index_mile + mo_wei_index_soc
326 |     mo_wei_index = list(set(mo_wei_index))
327 |     mo_wei_index.sort()
328 |     if len(mo_wei_index) == 0:
329 |         print("error")
330 |     if mo_wei_index[-1] != data.index[-1]:
331 |         mo_wei_index.append(data.index[-1])
332 |     if mo_wei_index[0] != 0:
333 |         mo_wei_index.insert(0, 0)
334 |     left = []
335 |     right = []
336 |     for index in range(0, len(mo_wei_index) - 1):
337 |         left.append(mo_wei_index[index])
338 |         right.append(mo_wei_index[index + 1])
339 |     data_list = []
340 |     for z, y in zip(left, right):
341 |         if z == left[0]:
342 |             data_list.append(data.loc[z:y, :])
343 |         else:
344 |             data_list.append(data.loc[z + 1:y, :])
345 |     data_list = reset_index(data_list)
346 |     return data_list
347 | 
348 | 
349 | def main_fast_1(train_0):
350 |     train_0 = train_0.drop_duplicates(subset=["time"], keep="first")  # drop records with duplicate timestamps
351 |     train_0 = train_0.sort_values(by=["time"])  # sort by time first
352 |     train_0["datetime"] = asd(train_0)  # build a datetime-typed time column
353 |     train_0 = 
train_0.reset_index().drop(["index"], axis=1)  # rebuild the index after sorting
354 |     train_0["year"] = [t.year for t in pd.DatetimeIndex(train_0.datetime)]  # create year/month/day columns
355 |     train_0["month"] = [t.month for t in pd.DatetimeIndex(train_0.datetime)]
356 |     train_0["day"] = [t.day for t in pd.DatetimeIndex(train_0.datetime)]
357 |     train_0["hour"] = [t.hour for t in pd.DatetimeIndex(train_0.datetime)]
358 |     train_0["minute"] = [t.minute for t in pd.DatetimeIndex(train_0.datetime)]
359 |     train_0["second"] = [t.second for t in pd.DatetimeIndex(train_0.datetime)]
360 |     train_0.time = seconds_leijia(train_0)
361 |     train_0 = wash(train_0)
362 |     train_0.reset_index(inplace=True)
363 |     train_0.drop(["index"], axis=1, inplace=True)
364 |     train_0["time_interval"] = time_interval(train_0)
365 | 
366 |     train_0["soc_rate"] = rate_with_next(train_0, train_0.soc)
367 | 
368 |     train_0["mile_rate"] = rate_with_next(train_0, train_0.mileage)
369 | 
370 |     return train_0
371 | 
372 | 
373 | def remain_fast_1(train_0):
374 |     train_0.reset_index(inplace=True)
375 |     train_0.drop(["index"], axis=1, inplace=True)
376 |     train_0["time_interval"] = time_interval(train_0)
377 | 
378 |     train_0["soc_rate"] = rate_with_next(train_0, train_0.soc)
379 | 
380 |     train_0["mile_rate"] = rate_with_next(train_0, train_0.mileage)
381 | 
382 |     return train_0
383 | 
384 | 
385 | def main_fast_2(train_0):
386 |     train_list_0 = cut_silce_by_mile_soc_time(train_0)
387 | 
388 |     return train_list_0
389 | 
390 | 
391 | def trans(x):
392 |     if x < 0:
393 |         return x * 0.5
394 |     else:
395 |         return x
396 | 
397 | 
398 | def trans_2(x):
399 |     if x > 0:
400 |         return 0
401 |     else:
402 |         return x
403 | 
404 | 
405 | def main_fast_3(data_list):
406 |     for i in range(len(data_list)):
407 |         data_list[i]["soc_rate"] = rate_with_next(data_list[i], data_list[i].soc)
408 | 
409 |         temp_values = data_list[i].soc_rate.transform(lambda x: trans_2(x))
410 |         data_list[i]["used_soc"] = np.abs(temp_values.cumsum())
411 | 
412 |         data_list[i]["mile"] = relative_value_zeng(data_list[i].mileage)
413 |         data_list[i]["driving_time"] = relative_value_zeng(data_list[i].time)
414 |         data_list[i]["motor_voltage_rate"] = rate_with_next(data_list[i], data_list[i].motor_voltage)
415 |         data_list[i]["motor_current_rate"] = rate_with_next(data_list[i], data_list[i].motor_current)
416 |         data_list[i]["total_voltage_rate"] = rate_with_next(data_list[i], data_list[i].total_voltage)
417 |         data_list[i]["total_current_rate"] = rate_with_next(data_list[i], data_list[i].total_current)
418 |         data_list[i]["mile_rate"] = rate_with_next(data_list[i], data_list[i].mileage)
419 | 
420 |         data_list[i].time_interval[0] = 10
421 | 
422 |         data_list[i]["total_power_transient"] = (data_list[i].total_current * data_list[i].total_voltage)
423 |         data_list[i]["motor_power_transient"] = data_list[i].motor_current * data_list[i].motor_voltage
424 |         data_list[i]["total_power"] = data_list[i].total_power_transient.transform(lambda x: trans(x))
425 |         data_list[i]["motor_power"] = data_list[i].motor_power_transient.transform(lambda x: trans(x))
426 | 
427 |         data_list[i].total_power = data_list[i].total_power.cumsum()
428 |         data_list[i].motor_power = data_list[i].motor_power.cumsum()
429 | 
430 |         data_list[i]["temp_diff"] = data_list[i].temp_max - data_list[i].temp_min
431 | 
432 |         # remove parked (vehicle stopped) records within a trip slice
433 | 
434 |         data_list[i]["soc_start"] = data_list[i].soc[0]
435 | 
436 |     return data_list
437 | 
438 | 
439 | def main_fast_4(data_list):
440 |     for i in range(len(data_list)):
441 |         data_list[i]["motor_power_per_mile"] = data_list[i].motor_power.max() / 
439 | def main_fast_4(data_list):
440 |     for i in range(len(data_list)):
441 |         data_list[i]["motor_power_per_mile"] = data_list[i].motor_power.max() / float(data_list[i].mile.max())
442 |         data_list[i]["total_power_per_mile"] = data_list[i].total_power.max() / float(data_list[i].mile.max())
443 |         data_list[i]["motor_power_per_soc"] = data_list[i].motor_power.max() / float(data_list[i].used_soc.max())
444 |         data_list[i]["total_power_per_soc"] = data_list[i].total_power.max() / float(data_list[i].used_soc.max())
445 | 
446 |     return data_list
447 | 
448 | 
449 | def wash_2(data):
450 |     # charging data
451 |     temp_index = data.query("mile_rate==0&motor_voltage==0&motor_current==0&total_current<0&speed==0.0").index
452 |     data.drop(index=temp_index, axis=0, inplace=True)
453 |     data.reset_index(inplace=True)
454 |     data.drop(["index"], axis=1, inplace=True)
455 |     # the rows removed below are parking data recorded after a full charge
456 |     temp_index = data.query("motor_voltage==0&mile_rate<=0.0&speed==0&total_current==0&motor_current==0").index
457 |     data.drop(index=temp_index, axis=0, inplace=True)
458 |     data.reset_index(inplace=True)
459 |     data.drop(["index"], axis=1, inplace=True)
460 |     # before departure, or at shutdown after a trip, some fields are already zero and some are not
461 |     temp_index = data.query(
462 |         "mile_rate<=0.0&speed==0&total_voltage==0&total_current==0&motor_current==0&soc==0&temp_max==0&temp_min==0").index
463 |     data.drop(index=temp_index, axis=0, inplace=True)
464 |     data.reset_index(inplace=True)
465 |     data.drop(["index"], axis=1, inplace=True)
466 |     return data
467 | 
468 | 
469 | """Outlier Detect and correction"""
470 | 
471 | 
472 | def outlier_justify(data):
473 |     # zero readings are treated as sensor dropouts and replaced by the temporally
474 |     # nearest non-zero neighbour; one loop handles all five affected columns
475 |     for col in ["soc", "total_voltage", "temp_max", "temp_min", "motor_voltage"]:
476 |         temp_index = data.query("{}==0".format(col)).index
477 |         downs = []
478 |         for i in temp_index:
479 |             for j in range(1, 10000, 1):
480 |                 if i + j not in temp_index:
481 |                     downs.append(i + j)
482 |                     break
483 |         ups = []
484 |         for i in temp_index:
485 |             for j in range(1, 10000, 1):
486 |                 if i - j not in temp_index:
487 |                     ups.append(i - j)
488 |                     break
489 |         for up, down, now in zip(ups, downs, temp_index):
490 |             if np.abs(data.time[up] - data.time[now]) >= np.abs(data.time[down] - data.time[now]):
491 |                 data.loc[now, col] = data[col][down]
492 |             else:
493 |                 data.loc[now, col] = data[col][up]
494 |     return data
495 | 
496 | 
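outlier_justify replaces each zero reading with the temporally nearest non-zero neighbour. If strict nearest-neighbour semantics are not required, a lighter pandas-only alternative is to mark zeros as missing and interpolate. This is the editor's sketch (impute_zero_dropouts is an invented name), not the repo's method:

    import numpy as np

    def impute_zero_dropouts(df, cols=("soc", "total_voltage", "temp_max", "temp_min", "motor_voltage")):
        # treat exact zeros as sensor dropouts and fill them along the row order
        for col in cols:
            s = df[col].replace(0, np.nan)
            df[col] = s.interpolate(limit_direction="both")
        return df
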
571 | def cut_silce_by_mile_for_take_silce(data):
572 |     mo_wei_index = data.query("mile_rate<0").index
573 |     mo_wei_index = list(mo_wei_index)
574 |     if len(mo_wei_index) == 0:
575 |         print("error"); return []  # same guard as in the cut_silce_by_* functions above
576 |     if mo_wei_index[-1] != data.index[-1]:
577 |         mo_wei_index.append(data.index[-1])
578 |     if mo_wei_index[0] != 0:
579 |         mo_wei_index.insert(0, 0)
580 |     left = []
581 |     right = []
582 |     for index in range(0, len(mo_wei_index) - 1):
583 |         left.append(mo_wei_index[index])
584 |         right.append(mo_wei_index[index + 1])
585 |     data_list = []
586 |     for z, y in zip(left, right):
587 |         if z == left[0]:
588 |             data_list.append(data.loc[z:y, :])
589 |         else:
590 |             data_list.append(data.loc[z + 1:y, :])
591 |     temp_index = []
592 |     for i, x in enumerate(data_list):
593 |         if len(x) <= 10:  # drop fragments too short to be a usable trip slice
594 |             temp_index.append(i)
595 |     data_list = [data_list[i] for i in range(len(data_list)) if (i not in temp_index)]
596 |     data_list = reset_index(data_list)
597 |     return data_list
598 | 
599 | 
600 | def time_change(data_list):
601 |     for i, x in enumerate(data_list):
602 |         x["time_interval"] = rate_with_next(x, x.time)
603 |     return data_list
604 | 
605 | 
606 | def intercept_missing_data(data):
607 |     interval = 10  # nominal telemetry sampling step in seconds
608 |     for i, x in enumerate(data):
609 |         input_index = []
610 |         for j in range(len(x)):
611 |             if np.round(x.time_interval[j] / interval, 0) > 1:
612 |                 num = int(np.round(x.time_interval[j] / interval, 0) - 1)
613 |                 input_index.append((j, num))
614 |         iters = 0
615 |         for k in input_index:
616 |             input_data = pd.DataFrame(np.zeros((k[1], len(x.columns))), columns=x.columns)
617 |             data[i] = data[i].iloc[:k[0] + 1 + iters].append(input_data, ignore_index=True).append(
618 |                 data[i].iloc[k[0] + 1 + iters:], ignore_index=True)
619 |             iters += k[1]
620 |     return data
621 | 
622 | 
623 | def to_nan(data):
624 |     for i in range(len(data)):
625 |         temp_index = data[i].query("mileage==0").index
626 |         for j in temp_index:
627 |             data[i].loc[j] = np.nan  # turn the zero-filled placeholder rows into NaN for interpolation
628 |     return data
629 | 
630 | 
631 | def linear_difference(data):
632 |     # interpolate every telemetry column over the NaN placeholder rows
633 |     for i in range(len(data)):
634 |         for col in ["time", "speed", "total_voltage", "total_current", "soc",
635 |                     "temp_max", "temp_min", "motor_voltage", "motor_current", "mileage"]:
636 |             data[i][col] = data[i][col].interpolate()
643 |     return data
644 | 
645 | 
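intercept_missing_data, to_nan and linear_difference form a small resampling pipeline: zero-filled placeholder rows are inserted wherever the 10 s telemetry grid has gaps, to_nan turns the placeholders into NaN, and linear interpolation fills them. A compressed illustration with invented values:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"time": [0.0, 10.0, np.nan, np.nan, 40.0],
                       "soc":  [90.0, 89.0, np.nan, np.nan, 86.0]})
    print(df.interpolate().values.tolist())
    # [[0, 90], [10, 89], [20, 88], [30, 87], [40, 86]] -- the two missing samples recovered linearly
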
646 | def round_float(data):
647 |     for i in range(len(data)):
648 |         data[i].time_interval = np.round(data[i].time_interval, 0)
649 |     return data
650 | 
651 | 
652 | def to_brakes(x):  # negative motor current -> regenerative-braking sample
653 |     if x < 0:
654 |         return 1
655 |     else:
656 |         return 0
657 | 
658 | 
659 | def to_stops(x):  # zero motor current -> standstill sample
660 |     if x == 0:
661 |         return 1
662 |     else:
663 |         return 0
664 | 
665 | 
666 | def to_accs(x):  # positive motor current -> accelerating/driving sample
667 |     if x > 0:
668 |         return 1
669 |     else:
670 |         return 0
671 | 
672 | 
673 | def work_condition_feature(data):
674 |     for i, x in enumerate(data):
675 |         total_nums = np.array(list(range(1, len(data[i]) + 1, 1)))  # running sample count for the cumulative shares below
676 |         data[i]["brake_ratio"] = data[i].motor_current
677 |         data[i]["stop_ratio"] = data[i].motor_current
678 |         data[i]["accelerate_ratio"] = data[i].motor_current
679 |         data[i].brake_ratio = data[i].brake_ratio.transform(lambda x: to_brakes(x))
680 |         data[i].stop_ratio = data[i].stop_ratio.transform(lambda x: to_stops(x))
681 |         data[i].accelerate_ratio = data[i].accelerate_ratio.transform(lambda x: to_accs(x))
682 |         data[i].brake_ratio = data[i].brake_ratio.cumsum() / total_nums
683 |         data[i].stop_ratio = data[i].stop_ratio.cumsum() / total_nums
684 |         data[i].accelerate_ratio = data[i].accelerate_ratio.cumsum() / total_nums
685 |     return data
686 | 
687 | 
688 | def cut_silce_by_point(data, point):
689 |     mo_wei_index = point
690 |     mo_wei_index = list(mo_wei_index)
691 |     if len(mo_wei_index) == 0:
692 |         print("error")
693 |         return []  # return an empty list rather than 0 so callers always get a list
694 |     if mo_wei_index[-1] != data.index[-1]:
695 |         mo_wei_index.append(data.index[-1])
696 |     if mo_wei_index[0] != 0:
697 |         mo_wei_index.insert(0, 0)
698 |     left = []
699 |     right = []
700 |     for index in range(0, len(mo_wei_index) - 1):
701 |         left.append(mo_wei_index[index])
702 |         right.append(mo_wei_index[index + 1])
703 |     data_list = []
704 |     for z, y in zip(left, right):
705 |         if z == left[0]:
706 |             data_list.append(data.loc[z:y, :])
707 |         else:
708 |             data_list.append(data.loc[z + 1:y, :])
709 |     data_list = reset_index(data_list)
710 |     return data_list
711 | 
712 | 
713 | def create_work_condition_percentage(data):
714 |     for i in range(len(data)):
715 |         nums = np.array(list(range(1, len(data[i]) + 1)))
716 |         data[i].work_condition_0 = (data[i].work_condition_0.cumsum()) / nums
717 |         data[i].work_condition_1 = data[i].work_condition_1.cumsum() / nums
718 |         data[i].work_condition_2 = data[i].work_condition_2.cumsum() / nums
719 |         data[i].work_condition_3 = data[i].work_condition_3.cumsum() / nums
720 |     return data
721 | 
722 | 
723 | def delet_stopping_trips(train_list_recover):
724 |     temp_index = []
725 |     for i in range(len(train_list_recover)):
726 |         if train_list_recover[i].speed.max() == 0:  # the trip never moved; drop it
727 |             temp_index.append(i)
728 |     train_list_recover = [train_list_recover[i] for i in range(len(train_list_recover)) if (i not in temp_index)]
729 |     return train_list_recover
730 | 
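work_condition_feature above derives running shares of braking, standstill and accelerating samples by binarizing motor_current and dividing cumulative sums by the sample count. The same result can be computed without the three helper functions; a vectorized sketch (running_shares is the editor's name, not the repo's):

    import numpy as np
    import pandas as pd

    def running_shares(motor_current):
        n = np.arange(1, len(motor_current) + 1)
        brake = (motor_current < 0).cumsum() / n   # share of regenerative-braking samples so far
        stop = (motor_current == 0).cumsum() / n   # share of standstill samples so far
        acc = (motor_current > 0).cumsum() / n     # share of accelerating samples so far
        return brake, stop, acc
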
731 | def train_test_perpare(train_list_recover, scaler_model_path, using_anchor_based=False):  # default False: callers that omit the flag unpack the 4-tuple return
732 |     test_index = np.random.randint(0, 2371, size=1000)  # NOTE: sampled with replacement, so fewer than 1000 unique test trips
733 | 
734 |     train_list = [train_list_recover[i] for i in range(len(train_list_recover)) if (i not in test_index)]
735 | 
736 | 
737 |     train = pd.concat(train_list, axis=0)
738 |     train.reset_index(inplace=True)
739 |     train.drop(["index"], axis=1, inplace=True)
742 |     train.drop(['time', 'speed', 'total_voltage', 'total_current', 'soc', 'soc_rate',
743 |                 'motor_voltage', 'motor_current', 'mileage', 'datetime',
744 |                 'year', 'month', 'day', 'hour', 'minute', 'second', 'time_interval', 'mile_rate',
745 |                 'motor_voltage_rate', 'motor_current_rate', 'total_voltage_rate',
746 |                 'total_current_rate', 'total_power_transient', 'motor_power_transient',
747 |                 'cut_point', 'work_condition',
748 |                 'work_condition_color', ], axis=1, inplace=True)
749 |     temp_index = train.query("temp_diff<-10").index  # physically implausible temperature spread
750 |     train.drop(index=temp_index, axis=0, inplace=True)
751 |     train.reset_index(inplace=True)
752 |     train.drop(["index"], axis=1, inplace=True)
753 | 
754 |     train_X, train_y = train.drop(['brake_ratio', 'stop_ratio', 'accelerate_ratio',
755 |                                    'work_condition_0', 'work_condition_1', 'work_condition_2',
756 |                                    'work_condition_3', 'mile', ], axis=1), train.mile
757 | 
758 |     if using_anchor_based:
759 |         anchor_train = 1.6775 * train_X.used_soc  # 1.6775 reads like an empirical km-per-%SOC prior; the anchor is a linear baseline distance
760 | 
761 | 
762 |     scaler = StandardScaler()
763 |     scaler.fit(train_X)
764 |     Train_X = scaler.transform(train_X)
765 |     joblib.dump(scaler, scaler_model_path)
766 | 
767 |     Train_X = pd.DataFrame(Train_X, columns=train_X.columns)
768 |     Train_X = pd.concat([Train_X, train[['brake_ratio', 'stop_ratio', 'accelerate_ratio',
769 |                                          'work_condition_0', 'work_condition_1', 'work_condition_2',
770 |                                          'work_condition_3']]], axis=1)
771 | 
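When using_anchor_based is on, the 1.6775 factor converts used SOC into a baseline distance, and blended_model.py trains the boosters on the residual around that baseline, adding it back at prediction time. A numeric illustration with invented values:

    used_soc = 20.0                       # percent of SOC consumed on the trip
    anchor = 1.6775 * used_soc            # baseline distance from the SOC prior: 33.55 km
    true_mile = 31.0
    residual_target = true_mile - anchor  # -2.55 km -- what the booster is actually trained on
    pred = anchor + (-2.1)                # add the predicted residual back: 31.45 km
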
772 |     # test set
773 |     test_list = [train_list_recover[i] for i in range(len(train_list_recover)) if (i in test_index)]
819 |     # test-set features: one row per time step of every test trip longer than 5 km
820 |     test_input = []
821 |     anchor_test_list = []
822 |     for i, x in enumerate(test_list):
823 |         if test_list[i].loc[len(test_list[i]) - 1:].mile.values[0] > 5:
824 |             input_data = test_list[i].drop(['time', 'speed', 'total_voltage', 'total_current', 'soc', 'soc_rate',
825 |                                             'motor_voltage', 'motor_current', 'mileage', 'datetime',
826 |                                             'year', 'month', 'day', 'hour', 'minute', 'second', 'time_interval', 'mile_rate',
827 |                                             'motor_voltage_rate', 'motor_current_rate', 'total_voltage_rate',
828 |                                             'total_current_rate', 'total_power_transient', 'motor_power_transient',
829 |                                             'cut_point', 'work_condition',
830 |                                             'work_condition_color', "mile", 'brake_ratio', 'stop_ratio', 'accelerate_ratio',
831 |                                             'work_condition_0', 'work_condition_1', 'work_condition_2',
832 |                                             'work_condition_3'], axis=1)
833 |             feature_name = input_data.columns
834 |             if using_anchor_based:
835 |                 anchor_test_list.append(input_data.used_soc * 1.6775)
836 |             input_data = pd.DataFrame(scaler.transform(input_data), columns=feature_name)
837 |             input_data = pd.concat([input_data, test_list[i][['brake_ratio', 'stop_ratio', 'accelerate_ratio',
838 |                                                               'work_condition_0', 'work_condition_1', 'work_condition_2',
839 |                                                               'work_condition_3']]], axis=1)
840 |             test_input.append(input_data)
841 |     anchor_test = np.concatenate(anchor_test_list, axis=0) if anchor_test_list else None  # np.concatenate([]) would raise when the anchor flag is off
842 |     test_features = pd.concat(test_input, axis=0)
843 |     test_features.reset_index(inplace=True)
844 |     test_features.drop(["index"], axis=1, inplace=True)
845 |     test_label_list = []
846 |     for i, x in enumerate(test_list):
847 |         if test_list[i].loc[len(test_list[i]) - 1:].mile.values[0] > 5:
848 |             input_label = test_list[i].mile
849 |             test_label_list.append(input_label)
850 |     test_label = pd.concat(test_label_list, axis=0)
851 | 
852 | 
853 |     if using_anchor_based:
854 |         return Train_X, train_y, test_features, test_label, anchor_train, anchor_test
855 |     else:
856 |         return Train_X, train_y, test_features, test_label
857 | 
858 | if __name__ == '__main__':
859 |     pass
860 | 
861 | 
862 | 
863 | 
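The rmse, mae and mape helpers imported by blended_model.py live in utils/common_function.py, which this dump does not include. Their conventional definitions, which the printed score names suggest, would look roughly like the sketch below; the actual file may differ, e.g. in whether MAPE is scaled to percent:

    import numpy as np

    def rmse(pred, true):
        return np.sqrt(np.mean((np.asarray(pred) - np.asarray(true)) ** 2))

    def mae(pred, true):
        return np.mean(np.abs(np.asarray(pred) - np.asarray(true)))

    def mape(pred, true):
        true = np.asarray(true)
        return np.mean(np.abs((np.asarray(pred) - true) / true)) * 100.0
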
--------------------------------------------------------------------------------
/utils/eval.py:
--------------------------------------------------------------------------------
1 | import xgboost as xgb
2 | import pandas as pd
3 | import numpy as np
4 | from sklearn.linear_model import LinearRegression
5 | 
6 | def evaluate(train_list_recover, xgboost_regressor, lgbm_regressor, scaler, i=32):
7 |     # 2018.1.8, 17:00-19:00
9 | 
10 |     trips = train_list_recover[i].copy()  # evaluate the trip selected by the i argument
11 |     predict_xgb = []
12 |     predict_lgb = []
13 |     dt = []
14 |     for i in range(int(np.round(len(trips) * 0.8, 0)), len(trips)):  # NOTE: the loop variable shadows the trip-index argument
16 |         f_motor_power = np.polyfit(trips.loc[:i].used_soc, trips.loc[:i].motor_power, 1)
17 |         f_total_power = np.polyfit(trips.loc[:i].used_soc, trips.loc[:i].total_power, 1)
18 |         temp_max = trips.temp_max[i]
19 |         temp_min = trips.temp_min[i]
20 |         work_condition_0 = trips.work_condition_0[i]
21 |         work_condition_1 = trips.work_condition_1[i]
22 |         work_condition_2 = trips.work_condition_2[i]
23 |         work_condition_3 = trips.work_condition_3[i]
24 |         accelerate_ratio = trips.accelerate_ratio[i]
25 |         f_driving_time = LinearRegression()
26 |         f_driving_time.fit(trips.loc[:i][["used_soc", "brake_ratio", "stop_ratio"]], trips.loc[:i].driving_time)
27 |         used_soc = trips.used_soc.max()
28 |         val_motor_power = np.polyval(f_motor_power, used_soc)
29 |         val_total_power = np.polyval(f_total_power, used_soc)
30 |         brake_ratio = trips.brake_ratio[i]
31 |         stop_ratio = trips.stop_ratio[i]
32 |         time_features = pd.DataFrame({"used_soc": used_soc, "brake_ratio": brake_ratio, "stop_ratio": stop_ratio},
33 |                                      index=range(1))
34 |         driving_time = f_driving_time.predict(time_features)
35 |         dt.append(f_driving_time.predict(time_features))
36 |         temp_max = trips.temp_max[i]
37 |         temp_min = trips.temp_min[i]
38 |         temp_diff = trips.temp_diff[i]
39 |         features = pd.DataFrame({"temp_max": temp_max,
40 |                                  "temp_min": temp_min,
41 |                                  "used_soc": used_soc,
42 |                                  "driving_time": driving_time,
43 |                                  "total_power": val_total_power,
44 |                                  "motor_power": val_motor_power,
45 |                                  "temp_diff": temp_diff,
46 |                                  "soc_start": used_soc,  # NOTE: soc_start is filled with used_soc here
47 |                                  "brake_ratio": brake_ratio,
48 |                                  "stop_ratio": stop_ratio,
49 |                                  "accelerate_ratio": accelerate_ratio,
50 |                                  "work_condition_0": work_condition_0,
51 |                                  "work_condition_1": work_condition_1,
52 |                                  "work_condition_2": work_condition_2,
53 |                                  "work_condition_3": work_condition_3}, index=range(1))
54 |         train_X = features.drop(['brake_ratio', 'stop_ratio', 'accelerate_ratio',
55 |                                  'work_condition_0', 'work_condition_1', 'work_condition_2',
56 |                                  'work_condition_3'], axis=1)
57 |         cat = features[['brake_ratio', 'stop_ratio', 'accelerate_ratio',
58 |                         'work_condition_0', 'work_condition_1', 'work_condition_2',
59 |                         'work_condition_3']]
60 |         Feature_vector_normal = scaler.transform(train_X)
61 |         Feature_vector_normal = pd.DataFrame(Feature_vector_normal, columns=train_X.columns)
62 |         Feature_vector_normals = pd.concat([Feature_vector_normal, cat], axis=1)
63 |         Feature_vector_xgb = xgb.DMatrix(Feature_vector_normals)
64 |         predict_xgb.append(xgboost_regressor.predict(Feature_vector_xgb)[0] - trips.loc[:i].mile.max())  # predicted remaining distance
65 |         predict_lgb.append(lgbm_regressor.predict(Feature_vector_normals)[0] - trips.loc[:i].mile.max())
66 |     True_value = sorted(trips.loc[:len(trips) - int(np.round(len(trips) * 0.8, 0)) - 1].mile, reverse=True)  # ground-truth remaining distances
67 |     error_xgb = predict_xgb - np.array(True_value)
68 |     error_lgb = predict_lgb - np.array(True_value)
69 |     blend_error = np.array(np.array(predict_xgb) + np.array(predict_lgb)) / 2 - np.array(True_value)
70 |     return error_xgb, error_lgb, blend_error
71 | 
72 | if __name__ == '__main__':
73 |     pass  # evaluate() needs a trip list, trained regressors and the fitted scaler
74 | 
75 | 
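evaluate() expects a trip list, two trained regressors and the fitted scaler; model/ml.py's load_model, shown earlier in this dump, returns exactly that triple, so the wiring would plausibly be:

    import numpy as np
    import model.ml as ml
    from utils.eval import evaluate

    # train_list_recover: list of per-trip DataFrames, loaded as in blended_model.py
    xgboost_regressor, lgbm_regressor, scaler = ml.load_model()
    error_xgb, error_lgb, blend_error = evaluate(train_list_recover, xgboost_regressor,
                                                 lgbm_regressor, scaler, i=32)
    print(np.abs(blend_error).mean())  # mean absolute blended error over the trip's last 20%
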
--------------------------------------------------------------------------------
/utils/preprocessing.py:
--------------------------------------------------------------------------------
1 | from utils.data_clean import *
2 | 
3 | 
4 | if __name__ == '__main__':
5 |     train_0 = pd.read_csv("./TrainData/Vehicle No.0.csv")
6 |     train_1 = pd.read_csv("./TrainData/Vehicle No.1.csv")
7 |     train_2 = pd.read_csv("./TrainData/Vehicle No.2.csv")
8 |     train_3 = pd.read_csv("./TrainData/Vehicle No.3.csv")
9 |     train_4 = pd.read_csv("./TrainData/Vehicle No.4.csv")
10 | 
11 |     train_0 = main_fast_1(train_0)
12 |     train_1 = main_fast_1(train_1)
13 |     train_2 = main_fast_1(train_2)
14 |     train_3 = main_fast_1(train_3)
15 |     train_4 = main_fast_1(train_4)
16 | 
17 |     train_0 = wash_2(train_0)
18 |     train_1 = wash_2(train_1)
19 |     train_2 = wash_2(train_2)
20 |     train_3 = wash_2(train_3)
21 |     train_4 = wash_2(train_4)
22 | 
23 |     train_0 = remain_fast_1(train_0)
24 |     train_1 = remain_fast_1(train_1)
25 |     train_2 = remain_fast_1(train_2)
26 |     train_3 = remain_fast_1(train_3)
27 |     train_4 = remain_fast_1(train_4)
28 | 
29 |     train_0 = outlier_justify(train_0)
30 |     train_1 = outlier_justify(train_1)
31 |     train_2 = outlier_justify(train_2)
32 |     train_3 = outlier_justify(train_3)
33 |     train_4 = outlier_justify(train_4)
34 | 
35 |     train_list_0 = main_fast_2(train_0)
36 |     train_list_1 = main_fast_2(train_1)
37 |     train_list_2 = main_fast_2(train_2)
38 |     train_list_3 = main_fast_2(train_3)
39 |     train_list_4 = main_fast_2(train_4)
40 | 
42 |     train_silce = train_list_0 + train_list_1 + train_list_2 + train_list_3 + train_list_4
43 | 
44 |     temp_index = []
45 |     for i, x in enumerate(train_silce):
46 |         if len(x) <= 2:
47 |             temp_index.append(i)
48 |     train_silce = [train_silce[i] for i in range(len(train_silce)) if (i not in temp_index)]
49 | 
50 |     train_silce = main_fast_3(train_silce)
51 | 
52 |     for i, x in enumerate(train_silce):
53 |         x["cut_point"] = 0
54 |         x.loc[0, "cut_point"] = -1  # mark the first row of every slice as a cut point
55 |     temp_index = []
56 |     for i, x in enumerate(train_silce):
57 |         if x.mile_rate.min() < 0:
58 |             temp_index.append(i)
59 |     outlier_silce = []
60 |     for i in temp_index:
61 |         outlier_silce.append(train_silce[i])
62 |     train_silce = [train_silce[i] for i in range(len(train_silce)) if (i not in temp_index)]
63 | 
64 |     normal_silce = []
65 |     for i, x in enumerate(outlier_silce):
66 |         normal_silce += cut_silce_by_mile_for_take_silce(x)
67 | 
68 |     normal_silce = main_fast_3(normal_silce)
69 | 
70 |     train_silce = normal_silce + train_silce
71 | 
72 |     train_silce = time_change(train_silce)
73 | 
74 |     train_silce = intercept_missing_data(train_silce)
75 | 
76 |     train_silce = to_nan(train_silce)
77 | 
78 |     train_silce = linear_difference(train_silce)
79 | 
80 |     train_silce = main_fast_3(train_silce)
81 | 
82 |     train_silce = time_change(train_silce)
83 | 
84 |     train_silce = round_float(train_silce)
85 | 
86 |     train_silce = work_condition_feature(train_silce)
87 | 
88 |     train = pd.concat(train_silce, axis=0)
89 |     train.reset_index(inplace=True)
90 |     train.drop(["index"], axis=1, inplace=True)
91 | 
92 |     CutPoint = train.query("time_interval==0").index
93 | 
94 |     # by the elbow method the optimum is clearly around 4, so 4 is used as the number of clusters
95 |     train_wc = train[["speed"
96 |         , "motor_current", "motor_current_rate"]]
97 |     scaler_for_cluster = StandardScaler()
98 |     scaler_for_cluster.fit(train_wc)
99 |     train_wc = scaler_for_cluster.transform(train_wc)
100 |     kmeans = KMeans(n_clusters=4, random_state=0, n_jobs=-1).fit(train_wc)
101 |     train["work_condition"] = kmeans.labels_
102 |     train.work_condition = train.work_condition.astype("int")
103 |     train.work_condition = train.work_condition.astype("category")
104 | 
105 |     work_condition_cat = pd.get_dummies(train.work_condition)
106 |     work_condition_cat = work_condition_cat.rename(columns={0: "work_condition_0",
107 |                                                             1: "work_condition_1",
108 |                                                             2: "work_condition_2",
109 |                                                             3: "work_condition_3"})
110 | 
111 |     train = pd.concat([train, work_condition_cat], axis=1)
112 | 
113 |     train_list_recover = cut_silce_by_point(train, CutPoint)
114 | 
115 |     train_list_recover = create_work_condition_percentage(train_list_recover)
116 | 
117 |     train = pd.concat(train_list_recover[:1800], axis=0)
118 |     train.reset_index(inplace=True)
119 |     train.drop(["index"], axis=1, inplace=True)
120 | 
121 |     for i in range(len(train_list_recover)):
122 |         train_list_recover[i].to_csv("./data/{}.csv".format(i))
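The comment above settles on four clusters via the elbow method. The inertia curve that method inspects can be reproduced in a few lines; this sketch assumes train_wc has been scaled as in the script above, and the scan range is the editor's choice:

    from sklearn.cluster import KMeans
    import matplotlib.pyplot as plt

    inertias = []
    ks = range(1, 10)
    for k in ks:
        km = KMeans(n_clusters=k, random_state=0).fit(train_wc)  # train_wc: scaled speed/current features
        inertias.append(km.inertia_)
    plt.plot(list(ks), inertias, marker="o")  # look for the bend ("elbow") in this curve
    plt.xlabel("k")
    plt.ylabel("inertia")
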
--------------------------------------------------------------------------------
/utils/vis_function.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 | import os
5 | 
6 | subplot_fig_size = (20, 30)
7 | subplot_xticks_size = 20
8 | subplot_yticks_size = 20
9 | subplot_xlabel_size = 20
10 | subplot_ylabel_size = 20
11 | subplot_title_size = 20
12 | subplot_legend_size = 20
13 | 
14 | fig_size = (8, 6)
15 | xticks_size = 20
16 | yticks_size = 20
17 | xlabel_size = 20
18 | ylabel_size = 20
19 | title_size = 20
20 | legend_size = 20
21 | 
22 | def plot_features_vs_range_a_trip(train_list_recover, save_path):  # NOTE: mutates its input via in-place column drops
23 |     for i in range(len(train_list_recover)):
24 |         train_list_recover[i].drop(['time', 'speed', 'total_voltage', 'total_current', 'soc', 'soc_rate',
25 |                                     'motor_voltage', 'motor_current', 'mileage', 'datetime',
26 |                                     'year', 'month', 'day', 'hour', 'minute', 'second', 'time_interval', 'mile_rate',
27 |                                     'motor_voltage_rate', 'motor_current_rate', 'total_voltage_rate',
28 |                                     'total_current_rate', 'total_power_transient', 'motor_power_transient',
29 |                                     'cut_point', 'work_condition',
30 |                                     'work_condition_color'], axis=1, inplace=True)
31 |     subplot_fig_size = (20, 30)
32 |     alphabets = range(len(train_list_recover[0].columns))
33 |     plt.figure(figsize=subplot_fig_size)
34 |     # plt.suptitle("Add more than 120 trips", y=1.01, fontsize=30)
35 |     for j in range(1, len(train_list_recover[0].columns) + 1):
36 |         ax = plt.subplot(7, 4, j)
37 |         for i, x in enumerate(train_list_recover):
38 |             ax.plot(x.mile, x.iloc[:, j - 1])
39 |         feature_name = change_features_name(x.columns[j - 1])
40 |         plt.ylabel("{}".format(feature_name), fontsize=subplot_ylabel_size)
41 |         plt.xlabel("Driving distance [km] \n (%d)" % j, fontsize=subplot_xlabel_size)
42 |         plt.xticks(fontsize=subplot_xticks_size)
43 |         plt.yticks(fontsize=subplot_yticks_size)
44 |     plt.tight_layout()
45 |     plt.savefig(save_path, dpi=600, bbox_inches="tight")
46 |     plt.show()
47 |     return 0
48 | 
49 | def plot_features_vs_range(train_list_recover, save_path):  # identical to the function above except for the fixed x-tick grid
50 |     for i in range(len(train_list_recover)):
51 |         train_list_recover[i].drop(['time', 'speed', 'total_voltage', 'total_current', 'soc', 'soc_rate',
52 |                                     'motor_voltage', 'motor_current', 'mileage', 'datetime',
53 |                                     'year', 'month', 'day', 'hour', 'minute', 'second', 'time_interval', 'mile_rate',
54 |                                     'motor_voltage_rate', 'motor_current_rate', 'total_voltage_rate',
55 |                                     'total_current_rate', 'total_power_transient', 'motor_power_transient',
56 |                                     'cut_point', 'work_condition',
57 |                                     'work_condition_color'], axis=1, inplace=True)
58 |     subplot_fig_size = (20, 30)
59 |     alphabets = range(len(train_list_recover[0].columns))
60 |     plt.figure(figsize=subplot_fig_size)
61 |     # plt.suptitle("Add more than 120 trips", y=1.01, fontsize=30)
62 |     for j in range(1, len(train_list_recover[0].columns) + 1):
63 |         ax = plt.subplot(7, 4, j)
64 |         for i, x in enumerate(train_list_recover):
65 |             ax.plot(x.mile, x.iloc[:, j - 1])
66 |         feature_name = change_features_name(x.columns[j - 1])
67 |         plt.ylabel("{}".format(feature_name), fontsize=subplot_ylabel_size)
68 |         plt.xlabel("Driving distance [km] \n (%d)" % j, fontsize=subplot_xlabel_size)
69 |         plt.xticks(range(0, 150, 30), fontsize=subplot_xticks_size)
70 |         plt.yticks(fontsize=subplot_yticks_size)
71 |     plt.tight_layout()
72 |     plt.savefig(save_path, dpi=600, bbox_inches="tight")
73 |     plt.show()
74 |     return 0
75 | 
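plot_features_vs_range_a_trip and plot_features_vs_range above are identical except for the fixed x-tick grid in the second, and both mutate their input. One parameterized helper could serve both call sites; in this editor's sketch (plot_features_vs_range_common and its xticks argument are not part of the repo), the in-place column dropping is left to the caller:

    def plot_features_vs_range_common(train_list_recover, save_path, xticks=None):
        plt.figure(figsize=subplot_fig_size)
        for j in range(1, len(train_list_recover[0].columns) + 1):
            ax = plt.subplot(7, 4, j)
            for x in train_list_recover:
                ax.plot(x.mile, x.iloc[:, j - 1])
            plt.ylabel(change_features_name(x.columns[j - 1]), fontsize=subplot_ylabel_size)
            plt.xlabel("Driving distance [km] \n (%d)" % j, fontsize=subplot_xlabel_size)
            if xticks is not None:
                plt.xticks(xticks, fontsize=subplot_xticks_size)  # the single line that differed
            else:
                plt.xticks(fontsize=subplot_xticks_size)
            plt.yticks(fontsize=subplot_yticks_size)
        plt.tight_layout()
        plt.savefig(save_path, dpi=600, bbox_inches="tight")
        plt.show()
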
feature_name=="brake_ratio": 92 | return "BR [%]" 93 | elif feature_name=="stop_ratio": 94 | return "SR [%]" 95 | elif feature_name=="accelerate_ratio": 96 | return "AR [%]" 97 | elif feature_name=="work_condition_0": 98 | return "RDP_1 [%]" 99 | elif feature_name=="work_condition_1": 100 | return "RDP_2 [%]" 101 | elif feature_name=="work_condition_2": 102 | return "RDP_3 [%]" 103 | elif feature_name=="work_condition_3": 104 | return "RDP_4 [%]" 105 | elif feature_name=="used_soc": 106 | return "Used_SOC [%]" 107 | else: 108 | return feature_name 109 | 110 | def plot_COEM_vs_Used_soc(train_list_recover): 111 | # 画出电机输出能量和SOC使用量之间的关系 112 | 113 | plt.figure(figsize=fig_size) 114 | for i in range(len(train_list_recover)): 115 | plt.plot(train_list_recover[i].used_soc, train_list_recover[i].motor_power) 116 | plt.xticks(fontsize=xticks_size) 117 | plt.yticks(fontsize=yticks_size) 118 | # plt.title("Used_SOC & Motor power", fontsize=title_size) 119 | plt.xlabel("Used_SOC [%]", fontsize=xlabel_size) 120 | plt.ylabel("COEM [J]", fontsize=ylabel_size) 121 | plt.savefig("/home/liang/range_prediction/output/figures/COEM_vs_Used_SOC.jpg", dpi=600, bbox_inches="tight") 122 | 123 | def plot_COMB_VS_Used_SOC(train_list_recover): 124 | # 画出电机输出能量和SOC使用量之间的关系 125 | plt.figure(figsize=fig_size) 126 | for i in range(len(train_list_recover)): 127 | plt.plot(train_list_recover[i].used_soc, train_list_recover[i].total_power) 128 | plt.xticks(fontsize=xticks_size) 129 | plt.yticks(fontsize=yticks_size) 130 | # plt.title("Used_soc & Total_power", fontsize=title_size) 131 | plt.xlabel("Used_SOC [%]", fontsize=xlabel_size) 132 | plt.ylabel("COMB [J]", fontsize=ylabel_size) 133 | plt.savefig("/home/liang/range_prediction/output/figures/COEB_vs_Used_SOC.jpg", dpi=600, bbox_inches="tight") 134 | 135 | def plot_silce_COME_vs_SOC(train_list_recover): 136 | 137 | # 特征预测的举例说明:电机能量预测 138 | trips = train_list_recover[2] 139 | plt.figure(figsize=fig_size) 140 | f_motor_power = np.polyfit(trips.used_soc, trips.motor_power, 1) 141 | val_motor_power = np.polyval(f_motor_power, range(0, int(trips.soc[0]))) 142 | plt.plot(range(int(trips.soc[0]), 0, -1), val_motor_power, label="Linear Model") 143 | plt.plot(trips.soc, trips.motor_power, label="True value") 144 | plt.xticks(fontsize=xticks_size) 145 | plt.yticks(fontsize=yticks_size) 146 | # plt.title("SOC & Motor_power", fontsize=title_size) 147 | plt.xlabel("SOC [%]", fontsize=xlabel_size) 148 | plt.ylabel("COEM [J]", fontsize=ylabel_size) 149 | plt.legend(fontsize=legend_size) 150 | plt.savefig("/home/liang/range_prediction/output/figures/COME_prediction.jpg", dpi=600, bbox_inches="tight") 151 | 152 | def plot_silce_COMB_vs_SOC(train_list_recover): 153 | # 特征预测的举例说明:电池能量预测 154 | trips = train_list_recover[2] 155 | plt.figure(figsize=fig_size) 156 | f_total_power = np.polyfit(trips.used_soc, trips.total_power, 1) 157 | val_total_power = np.polyval(f_total_power, range(0, int(trips.soc[0]))) 158 | plt.plot(range(int(trips.soc[0]), 0, -1), val_total_power, label="Linear Model") 159 | plt.plot(trips.soc, trips.total_power, label="True Value") 160 | plt.xticks(fontsize=xticks_size) 161 | plt.yticks(fontsize=yticks_size) 162 | # plt.title("SOC & Total_power", fontsize=title_size) 163 | plt.xlabel("SOC [%]", fontsize=xlabel_size) 164 | plt.ylabel("COEB [J]", fontsize=ylabel_size) 165 | plt.legend(fontsize=legend_size) 166 | plt.savefig("/home/liang/range_prediction/output/figures/COEB_prediction.jpg", dpi=600, bbox_inches="tight") 167 | 168 | def plot_model_comparision(save_path): 
168 | def plot_model_comparision(save_path):
169 |     score = [3.63, 3.74, 3.97, 11.52, 9.44, 10.65, 11.95, 8.55]  # obtained from experiments
170 |     model_name = ["XGBoost", "LightGBM", "GBRT", "Lasso", "Elastic_net", "RandomForest", "Bagging", "Neural_NetWork"]
171 |     plt.figure(figsize=(15, 8))
172 |     plt.bar(model_name, score)
173 |     plt.title("Model Selection", fontsize=title_size)
174 |     plt.xticks(rotation=30, fontsize=xticks_size)
175 |     plt.yticks(np.arange(0, 16, 3), fontsize=yticks_size)
176 |     plt.ylabel('MAPE [%]', fontsize=ylabel_size)
177 |     for a, b in zip(model_name, score):
178 |         plt.text(a, b + 0.25, str(b) + "%", ha="center", va="center", fontsize=xticks_size)
179 |     plt.savefig(save_path, dpi=600, bbox_inches="tight")
180 | 
181 | def plot_train_process():
182 |     # training-process iteration curves
183 |     root_path = "/home/liang/mile_estimator/tijiaocode"
184 |     path = os.path.join(root_path, "origin_silce_plot/car_1/before_sample/gradientboost_print.csv")
185 |     gradientboost_print_1 = pd.read_csv(path)
186 | 
187 |     path = os.path.join(root_path, "origin_silce_plot/car_1/before_sample/xgboost_print.csv")
188 |     xgboost_print_1 = pd.read_csv(path)
189 | 
190 |     path = os.path.join(root_path, "origin_silce_plot/car_1/before_sample/light_print.csv")
191 |     light_print_1 = pd.read_csv(path)
192 | 
193 |     path = os.path.join(root_path, "origin_silce_plot/car_2/before_sample/gradientboost_print.csv")
194 |     gradientboost_print_2 = pd.read_csv(path)
195 | 
196 |     path = os.path.join(root_path, "origin_silce_plot/car_2/before_sample/xgboost_print.csv")
197 |     xgboost_print_2 = pd.read_csv(path)
198 | 
199 |     path = os.path.join(root_path, "origin_silce_plot/car_2/before_sample/light_print.csv")
200 |     light_print_2 = pd.read_csv(path)
201 | 
202 |     path = os.path.join(root_path, "origin_silce_plot/car_3/before_sample/gradientboost_print.csv")
203 |     gradientboost_print_3 = pd.read_csv(path)
204 | 
205 |     path = os.path.join(root_path, "origin_silce_plot/car_3/before_sample/xgboost_print.csv")
206 |     xgboost_print_3 = pd.read_csv(path)
207 | 
208 |     path = os.path.join(root_path, "origin_silce_plot/car_3/before_sample/light_print.csv")
209 |     light_print_3 = pd.read_csv(path)
210 | 
211 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/gradientboost_print.csv")
212 |     gradientboost_print_4 = pd.read_csv(path)
213 | 
214 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/xgboost_print.csv")
215 |     xgboost_print_4 = pd.read_csv(path)
216 | 
217 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/light_print.csv")
218 |     light_print_4 = pd.read_csv(path)
219 | 
220 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/gradientboost_print.csv")  # NOTE: the "fold 5" frames re-read the car_4 logs; no car_5 outputs are referenced
221 |     gradientboost_print_5 = pd.read_csv(path)
222 | 
223 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/xgboost_print.csv")
224 |     xgboost_print_5 = pd.read_csv(path)
225 | 
226 |     path = os.path.join(root_path, "origin_silce_plot/car_4/before_sample/light_print.csv")
227 |     light_print_5 = pd.read_csv(path)
228 | 
229 |     model_print = [gradientboost_print_1, xgboost_print_1, light_print_1,
230 |                    gradientboost_print_2, xgboost_print_2, light_print_2,
231 |                    gradientboost_print_3, xgboost_print_3, light_print_3,
232 |                    gradientboost_print_4, xgboost_print_4, light_print_4,
233 |                    gradientboost_print_5, xgboost_print_5, light_print_5, ]
234 |     gradient_index = [1, 4, 7, 10, 13]
235 |     xgboost_index = [2, 5, 8, 11, 14]
236 |     light_index = [3, 6, 9, 12, 15]
237 |     xlable_index = [13, 14, 15]
238 |     title_index = [1, 2, 3]
239 |     plt.figure(figsize=subplot_fig_size)
240 |     for i 
in range(1, 16): 241 | ax = plt.subplot(6, 3, i) 242 | if i in gradient_index: 243 | ax = plt.plot(model_print[i - 1].loc[16:].iloc[:, 0], model_print[i - 1].loc[16:].train_rmse, 244 | label="train_rmse") 245 | 246 | plt.xticks(rotation=20, fontsize=subplot_xticks_size) 247 | 248 | plt.ylabel("Score", fontsize=subplot_ylabel_size) 249 | plt.yticks(np.arange(0, 3.5, 0.5), fontsize=subplot_ylabel_size) 250 | plt.plot(model_print[i - 1].loc[16:].iloc[:, 0], model_print[i - 1].loc[16:].out_of_bag_rmse_imporve, 251 | color="r", label="out_of_bag_rmse_imporve") 252 | 253 | if i in xlable_index: 254 | plt.xlabel("Iteration", fontsize=xlabel_size) 255 | if i in title_index: 256 | plt.title("GBRT", fontsize=subplot_title_size) 257 | plt.legend(fontsize=legend_size) 258 | if i == 1: 259 | plt.text(model_print[i - 1].iloc[:, 0].max() * 0.2, 260 | model_print[i - 1].loc[16:].train_rmse.max() - 0.4 * model_print[i - 1].loc[ 261 | 16:].train_rmse.max(), 262 | "{}st fold".format(int(i / 3) + 1), fontsize=ylabel_size) 263 | elif i == 4: 264 | plt.text(model_print[i - 1].iloc[:, 0].max() * 0.2, 265 | model_print[i - 1].loc[16:].train_rmse.max() - 0.4 * model_print[i - 1].loc[ 266 | 16:].train_rmse.max(), 267 | "{}nd fold".format(int(i / 3) + 1), fontsize=ylabel_size) 268 | elif i == 7: 269 | plt.text(model_print[i - 1].iloc[:, 0].max() * 0.2, 270 | model_print[i - 1].loc[16:].train_rmse.max() - 0.4 * model_print[i - 1].loc[ 271 | 16:].train_rmse.max(), 272 | "{}rd fold".format(int(i / 3) + 1), fontsize=ylabel_size) 273 | else: 274 | plt.text(model_print[i - 1].iloc[:, 0].max() * 0.2, 275 | model_print[i - 1].loc[16:].train_rmse.max() - 0.4 * model_print[i - 1].loc[ 276 | 16:].train_rmse.max(), 277 | "{}th fold".format(int(i / 3) + 1), fontsize=ylabel_size) 278 | 279 | plt.tight_layout() 280 | elif i in xgboost_index: 281 | model_print[i - 1].loc[100:].train_mae.plot(label="train_mae") 282 | model_print[i - 1].loc[100:].train_rmse.plot(label="train_rmse") 283 | model_print[i - 1].loc[100:].test_mae.plot(label="test_mae") 284 | model_print[i - 1].loc[100:].test_rmse.plot(label="test_rmse") 285 | 286 | plt.xticks(rotation=20, fontsize=subplot_xticks_size) 287 | 288 | plt.ylabel("Score", fontsize=subplot_ylabel_size) 289 | if i in xlable_index: 290 | plt.xlabel("Iteration", fontsize=xlabel_size) 291 | if i in title_index: 292 | plt.title("XGBoost", fontsize=subplot_title_size) 293 | plt.legend(fontsize=legend_size) 294 | if i == 2: 295 | plt.text(len(model_print[i - 1]) * 0.2, 296 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 297 | 100:].test_rmse.max(), 298 | "{}st fold".format(int(i / 3) + 1), fontsize=ylabel_size) 299 | elif i == 5: 300 | plt.text(len(model_print[i - 1]) * 0.2, 301 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 302 | 100:].test_rmse.max(), 303 | "{}nd fold".format(int(i / 3) + 1), fontsize=ylabel_size) 304 | elif i == 8: 305 | plt.text(len(model_print[i - 1]) * 0.2, 306 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 307 | 100:].test_rmse.max(), 308 | "{}rd fold".format(int(i / 3) + 1), fontsize=ylabel_size) 309 | else: 310 | plt.text(len(model_print[i - 1]) * 0.2, 311 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 312 | 100:].test_rmse.max(), 313 | "{}th fold".format(int(i / 3) + 1), fontsize=ylabel_size) 314 | plt.yticks(np.arange(0.2, 3.5, 0.5), fontsize=subplot_ylabel_size) 315 | plt.legend(fontsize=legend_size) 316 | plt.tight_layout() 317 | else: 318 | 
model_print[i - 1].loc[100:].train_mae.plot(label="train_mae") 319 | model_print[i - 1].loc[100:].train_rmse.plot(label="train_rmse") 320 | model_print[i - 1].loc[100:].test_mae.plot(label="test_mae") 321 | model_print[i - 1].loc[100:].test_rmse.plot(label="test_rmse") 322 | plt.legend(fontsize=legend_size) 323 | 324 | plt.xticks(rotation=20, fontsize=subplot_xticks_size) 325 | if i in xlable_index: 326 | plt.xlabel("Iteration", fontsize=xlabel_size) 327 | if i in title_index: 328 | plt.title("LightGBM", fontsize=subplot_title_size) 329 | if i == 3: 330 | plt.text(len(model_print[i - 1]) * 0.2, 331 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 332 | 100:].test_rmse.max(), 333 | "{}st fold".format(int(i / 3)), fontsize=ylabel_size) 334 | elif i == 6: 335 | plt.text(len(model_print[i - 1]) * 0.2, 336 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 337 | 100:].test_rmse.max(), 338 | "{}nd fold".format(int(i / 3)), fontsize=ylabel_size) 339 | elif i == 9: 340 | plt.text(len(model_print[i - 1]) * 0.2, 341 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 342 | 100:].test_rmse.max(), 343 | "{}rd fold".format(int(i / 3)), fontsize=ylabel_size) 344 | else: 345 | plt.text(len(model_print[i - 1]) * 0.2, 346 | model_print[i - 1].loc[100:].test_rmse.max() - 0.4 * model_print[i - 1].loc[ 347 | 100:].test_rmse.max(), 348 | "{}th fold".format(int(i / 3)), fontsize=ylabel_size) 349 | plt.ylabel("Score", fontsize=subplot_ylabel_size) 350 | plt.yticks(np.arange(0.2, 3.5, 0.5), fontsize=subplot_ylabel_size) 351 | plt.tight_layout() 352 | 353 | plt.savefig("/home/liang/range_prediction/output/figures/training_process.jpg", dpi=600, bbox_inches="tight") 354 | 355 | def plot_feature_importance_XGB(): 356 | xgb_feature_name = ["COEM", "COEB", "driving_time", 357 | "Used_SOC", "Temp_min", "RDP_2", "BR", 358 | "Temp_max", "SOC_start", "RDP_4", "RDP_3", 359 | "Temp_diff", "AR", "RDP_1", "SR"] 360 | xgb_score = [43097, 38253, 35359, 31152, 27191, 26527, 24639, 24604, 24439, 23202, 22767, 20423, 19682, 17662, 361 | 15326] 362 | xgb_feature_imprtance = pd.DataFrame({"feature_name": xgb_feature_name, "score": xgb_score}) 363 | plt.figure(figsize=fig_size) 364 | plt.barh(y=xgb_feature_imprtance.feature_name, width=xgb_feature_imprtance.score) 365 | for a, b in zip(xgb_score, xgb_feature_name): 366 | plt.text(a + 3300, b, str(a), ha="center", va="center", fontsize=xticks_size) 367 | plt.yticks(fontsize=yticks_size) 368 | plt.xticks(range(0, 52000, 10000), fontsize=xticks_size) 369 | plt.xlabel("F score", fontsize=xlabel_size) 370 | plt.title("XGBoost Feature Importance", fontsize=title_size) 371 | plt.savefig("/home/liang/range_prediction/output/figures/xgb_feature_importance.jpg", dpi=600, bbox_inches="tight") 372 | 373 | 374 | def plot_LGB_feature_importance(): 375 | lgbm_feature_name = ["COEM", "driving_time", "COEB", 376 | "Used_SOC", "Temp_min", "RDP_2", 377 | "RDP_3", "BR", "Temp_max", 378 | "RDP_4", "SOC_start", "Temp_diff", 379 | "AR", "RDP_1", "SR"] 380 | lgb_score=[34588,33451,29920,28579,24130,23597,23079,23029,22297,21660,20765,19380,17122,15775,12503] 381 | 382 | lgb_feature_imprtance = pd.DataFrame({"feature_name": lgbm_feature_name, "score": lgb_score}) 383 | 384 | plt.figure(figsize=fig_size) 385 | plt.barh(y=lgb_feature_imprtance.feature_name, width=lgb_feature_imprtance.score) 386 | for a, b in zip(lgb_score, lgbm_feature_name): 387 | plt.text(a + 2600, b, str(a), ha="center", va="center", fontsize=xticks_size) 
388 | plt.yticks(fontsize=yticks_size) 389 | plt.xticks(range(0, 50000, 10000), fontsize=xticks_size) 390 | plt.xlabel("F score", fontsize=xlabel_size) 391 | plt.title("LightGBM Feature Importance", fontsize=title_size) 392 | plt.savefig("/home/liang/range_prediction/output/figures/LGB_feature_importance.jpg", dpi=500, bbox_inches="tight") 393 | if __name__ == '__main__': 394 | pass 395 | # data = pd.read_csv("./vehicle_data/4.csv") 396 | # print(data["mileage"].max()-data["mileage"].min()) 397 | # vis_function.plot_feature_importance_XGB() 398 | # vis_function.plot_LGB_feature_importance() 399 | # vis_function.plot_train_process() 400 | # vis_function.plot_model_comparision() 401 | # train_list_recover = delet_stopping_trips(train_list_recover) 402 | # vis_function.plot_COEM_vs_Used_soc(train_list_recover) 403 | # vis_function.plot_COMB_VS_Used_SOC(train_list_recover) 404 | # vis_function.plot_silce_COME_vs_SOC(train_list_recover) 405 | # vis_function.plot_silce_COMB_vs_SOC(train_list_recover) 406 | 407 | --------------------------------------------------------------------------------
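The commented-out block in vis_function.py's __main__ hints at how the figures are regenerated. Run from the repository root, a plausible driver looks like the sketch below; the ./data directory and the trip count are the editor's assumptions based on the output loop in utils/preprocessing.py:

    import pandas as pd
    import utils.vis_function as vis_function
    from utils.data_clean import delet_stopping_trips

    # load the per-trip CSVs written by utils/preprocessing.py (count is an assumption)
    train_list_recover = [pd.read_csv("./data/{}.csv".format(i)) for i in range(100)]
    train_list_recover = delet_stopping_trips(train_list_recover)
    vis_function.plot_COEM_vs_Used_soc(train_list_recover)  # writes the figure to its hard-coded output path
    vis_function.plot_feature_importance_XGB()              # bar chart of the recorded F scores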