├── .gitignore
├── get_fea_importance.py
├── plot_lat_lon.py
├── observe_va.py
├── plot_data.py
├── legacy
    ├── test.py
    ├── model.py
    └── train.py
├── observe_data.py
├── test_xgb.py
├── calc_lat_lon.py
├── README.md
├── train_xgb.py
├── LICENSE
└── util.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | data/
3 | figures/
4 | models/
5 | 
6 | *.csv
7 | *.txt
8 | 


--------------------------------------------------------------------------------
/get_fea_importance.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | import numpy as np
 5 | 
 6 | import util
 7 | 
 8 | np.set_printoptions(linewidth=150)
 9 | 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('model_dir', help='Model directory name')
12 | parser.add_argument('--n_fold_train', type=int, default=5)
13 | args = parser.parse_args()
14 | 
15 | imp_all = []
16 | for fold in range(args.n_fold_train):
17 |     print('Getting fold', fold)
18 |     model = util.load_pkl(os.path.join(args.model_dir, 'model_{}_time_range_fold.pkl'.format(fold)))
19 |     imp_all.append(model.feature_importances_)
20 | 
21 | imp_all = np.array(imp_all)
22 | print('Shape:', imp_all.shape, len(util.FEA_NAMES))
23 | 
24 | for idx, name in enumerate(util.FEA_NAMES):
25 |     print('{}\t{}\t{}\t{}\t{}\t{}'.format(
26 |         name, imp_all[0, idx], imp_all[1, idx], imp_all[2, idx], imp_all[3, idx], imp_all[4, idx]
27 |     ))
28 | 


--------------------------------------------------------------------------------
/plot_lat_lon.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import time
 4 | import matplotlib.pyplot as plt
 5 | import numpy as np
 6 | 
 7 | import util
 8 | 
 9 | np.set_printoptions(linewidth=150)
10 | 
11 | with open('data/test.csv', 'r') as fin:
12 |     cnt = fin.read().splitlines()[1:]
13 |     print('Data count:', len(cnt))
14 | 
15 | idx_lat = util.header_to_row_idx['Lat']
16 | idx_lon = util.header_to_row_idx['Lon']
17 | lat_lon = np.array([[float(line.split(',')[idx_lat]), float(line.split(',')[idx_lon])] for line in cnt])
18 | print('Shape:', lat_lon.shape)
19 | print('Lats:', sorted(list(set([float(line.split(',')[idx_lat]) for line in cnt]))))
20 | print('Lons:', sorted(list(set([float(line.split(',')[idx_lon]) for line in cnt]))))
21 | print('All:', sorted(list(set(['{}-{}'.format(line.split(',')[idx_lat], line.split(',')[idx_lon]) for line in cnt]))))
22 | 
23 | plt.plot(lat_lon[:, 0], lat_lon[:, 1], '.')
24 | plt.show()
25 | 


--------------------------------------------------------------------------------
/observe_va.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | import matplotlib.pyplot as plt
 5 | import numpy as np
 6 | 
 7 | import util
 8 | 
 9 | np.set_printoptions(linewidth=150)
10 | 
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('model_dir', help='Model directory name')
13 | parser.add_argument('--n_fold_train', type=int, default=7)
14 | args = parser.parse_args()
15 | 
16 | pred_all = []
17 | ans_all = []
18 | for fold in range(args.n_fold_train):
19 |     pred = np.load(os.path.join(args.model_dir, 'va_pred_{}.npy'.format(fold)))
20 |     ans = np.load(os.path.join(args.model_dir, 'va_ans_{}.npy'.format(fold)))[:, 0]
21 |     pred_all.append(pred)
22 |     ans_all.append(ans)
23 |     print(fold, pred.shape, ans.shape, np.mean((pred-ans)**2)**0.5)
24 |     plt.subplot(2, 4, fold+1)
25 |     plt.plot(ans, pred, '.')
26 |     plt.plot([0, 3000], [0, 3000], 'r-')
27 | 
28 | pred_all = np.hstack(pred_all)
29 | ans_all = np.hstack(ans_all)
30 | plt.subplot(2, 4, 8)
31 | plt.plot(ans_all, pred_all, '.')
32 | plt.plot([0, 3000], [0, 3000], 'r-')
33 | 
34 | print('RMSE:', np.mean((pred_all-ans_all)**2)**0.5)
35 | 
36 | plt.show()
37 | 


--------------------------------------------------------------------------------
/plot_data.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import time
 4 | import matplotlib.pyplot as plt
 5 | import numpy as np
 6 | 
 7 | import util
 8 | 
 9 | np.set_printoptions(linewidth=150)
10 | 
11 | with open('data/train.csv', 'r') as fin:
12 |     cnt = fin.read().splitlines()[1:]
13 |     print('Data count:', len(cnt))
14 | 
15 | # idx_mmm = util.header_to_row_idx['Irradiance_m']
16 | # idx_irr = util.header_to_row_idx['Irradiance']
17 | # data = np.array([
18 | #     [float(line.split(',')[idx_mmm]), float(line.split(',')[idx_irr])] \
19 | #         for line in cnt if line.split(',')[idx_irr] != ''
20 | # ])
21 | 
22 | # idx_tmp = util.header_to_row_idx['Temp']
23 | # idx_mmm = util.header_to_row_idx['Temp_m']
24 | # data = np.array([
25 | #     [float(line.split(',')[idx_tmp]), float(line.split(',')[idx_mmm])] \
26 | #         for line in cnt if line.split(',')[idx_mmm] != '' and line.split(',')[idx_mmm] != '']
27 | # )
28 | 
29 | idx_mmm = util.header_to_row_idx['Capacity']
30 | idx_tmp = util.header_to_row_idx['Temp_m']
31 | data = np.array([
32 |     [float(line.split(',')[idx_mmm]), float(line.split(',')[idx_tmp])] \
33 |         for line in cnt if line.split(',')[idx_tmp] != ''
34 | ])
35 | 
36 | print('Shape:', data.shape)
37 | 
38 | plt.plot(data[:, 0], data[:, 1], '.')
39 | plt.show()
40 | 


--------------------------------------------------------------------------------
/legacy/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | import torch
 5 | import numpy as np
 6 | from torch.autograd import Variable
 7 | 
 8 | import util, model
 9 | 
np.set_printoptions(linewidth=150)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

parser = argparse.ArgumentParser()
parser.add_argument('modeldir', help='Model directory name')
parser.add_argument('--output_file_name', default='submission.csv', help='Results file name')
parser.add_argument('--model_postfix', default='_bestVa')
parser.add_argument('--n_fold_train', type=int, default=5)
parser.add_argument('--test_batch_size', type=int, default=256)
args = parser.parse_args()

nn_param = util.load_cfg(os.path.join(args.modeldir, 'config.yml'))

# Extract one feature vector per test row (the header row is dropped).
fea_all = []
with open('data/test.csv', 'r') as fin:
    cnt = fin.read().splitlines()[1:]
    print('Data count:', len(cnt))
for line in cnt:
    fea, _ = util.fea_ext(line.split(','))
    fea_all.append(fea)
fea_all = np.array(fea_all)
print('Overall shapes:', fea_all.shape)

print('Loading network...')
networks = {}
for fold in range(args.n_fold_train):
    # map_location lets checkpoints saved on a GPU load on a CPU-only host.
    save_dic = torch.load(
        os.path.join(args.modeldir, 'model_{}{}'.format(fold, args.model_postfix)),
        map_location=device,
    )
    networks[fold] = model.MLP(nn_param['dim_fea'])
    networks[fold].load_state_dict(save_dic)
    networks[fold].eval()
    networks[fold].to(device)

# Honour --test_batch_size (previously parsed but ignored in favour of the
# training batch size from the config). The networks are in eval mode.
# NOTE(review): predictions should not depend on the batch size in eval
# mode — confirm against the model definition.
data_loader = torch.utils.data.DataLoader(
    util.Data2Torch({
        'fea': fea_all,
    }),
    batch_size=args.test_batch_size,
)

# For each batch, stack the predictions of all fold models along a new
# leading axis; batches are then joined along axis 1.
pred_all = []
with torch.no_grad():
    for data in data_loader:
        fea_batch = data['fea'].to(device)
        pred = [networks[fold](fea_batch).cpu().numpy() for fold in range(args.n_fold_train)]
        pred_all.append(np.array(pred))
pred_all = np.concatenate(pred_all, axis=1)
print('Finish prediction, shape:', pred_all.shape)

# The submitted value for each sample is the median over the fold models;
# IDs are written as 1-based row positions.
with open(args.output_file_name, 'w') as fout:
    fout.write('ID,Generation\n')
    for i in range(pred_all.shape[1]):
        fout.write('{},{:.6f}\n'.format(
            i+1, np.median(pred_all[:, i])
        ))
63 | 


--------------------------------------------------------------------------------
/legacy/model.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | 
 7 | class MLP(nn.Module):
 8 |     def __init__(self, dim_fea):
 9 |         super(MLP, self).__init__()
10 |         DIM_HIDDEN = 512
11 |         self.SCALE_FACTOR = 100
12 |         self.net_1 = nn.Sequential(
13 |             nn.Linear(in_features=dim_fea, out_features=DIM_HIDDEN),
14 |             nn.BatchNorm1d(DIM_HIDDEN),
15 |             nn.SiLU(),
16 |         )
17 |         self.net_2 = nn.Sequential(
18 |             nn.Linear(in_features=dim_fea+DIM_HIDDEN, out_features=DIM_HIDDEN),
19 |             nn.BatchNorm1d(DIM_HIDDEN),
20 |             nn.SiLU(),
21 |         )
22 |         self.net_3 = nn.Sequential(
23 |             nn.Linear(in_features=dim_fea+DIM_HIDDEN, out_features=DIM_HIDDEN),
24 |             nn.BatchNorm1d(DIM_HIDDEN),
25 |             nn.SiLU(),
26 |         )
27 |         self.net_4 = nn.Sequential(
28 |             nn.Linear(in_features=dim_fea+DIM_HIDDEN, out_features=DIM_HIDDEN),
29 |             nn.BatchNorm1d(DIM_HIDDEN),
30 |             nn.SiLU(),
31 |         )
32 |         self.net_5 = nn.Sequential(
33 |             nn.Linear(in_features=DIM_HIDDEN, out_features=1),
34 |             nn.SiLU(),
35 |         )
36 |         # self.net = nn.Sequential(
37 |         #     nn.Linear(in_features=dim_fea, out_features=DIM_HIDDEN),
38 |         #     nn.BatchNorm1d(DIM_HIDDEN),
39 |         #     nn.SiLU(),
40 |         #     nn.Linear(in_features=DIM_HIDDEN, out_features=DIM_HIDDEN),
41 |         #     nn.BatchNorm1d(DIM_HIDDEN),
42 |         #     nn.SiLU(),
43 |         #     nn.Linear(in_features=DIM_HIDDEN, out_features=DIM_HIDDEN),
44 |         #     nn.BatchNorm1d(DIM_HIDDEN),
45 |         #     nn.SiLU(),
46 |         #     nn.Linear(in_features=DIM_HIDDEN, out_features=DIM_HIDDEN),
47 |         #     nn.BatchNorm1d(DIM_HIDDEN),
48 |         #     nn.SiLU(),
49 |         #     nn.Linear(in_features=DIM_HIDDEN, out_features=1),
50 |         #     nn.SiLU(),
51 |         # )
52 | 
53 |     def forward(self, x):
54 |         h = self.net_1(x)
55 |         h = self.net_2(torch.cat((h, x), dim=1))
56 |         h = self.net_3(torch.cat((h, x), dim=1))
57 |         h = self.net_4(torch.cat((h, x), dim=1))
58 |         h = self.net_5(h)
59 |         return h * self.SCALE_FACTOR
60 |         # return self.net(x) * self.SCALE_FACTOR
61 | 


--------------------------------------------------------------------------------
/observe_data.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import numpy as np
 3 | 
 4 | import util
 5 | 
 6 | np.set_printoptions(linewidth=150)
 7 | 
 8 | with open('data/train.csv', 'r') as fin:
 9 |     cnt = fin.read().splitlines()[1:]
10 |     print('Data count:', len(cnt))
11 | 
12 | idx_dat = util.header_to_row_idx['Date']
13 | idx_tpm = util.header_to_row_idx['Temp_m']
14 | idx_gen = util.header_to_row_idx['Generation']
15 | idx_irr = util.header_to_row_idx['Irradiance']
16 | idx_cap = util.header_to_row_idx['Capacity']
17 | idx_lat = util.header_to_row_idx['Lat']
18 | idx_lon = util.header_to_row_idx['Lon']
19 | idx_irm = util.header_to_row_idx['Irradiance_m']
20 | idx_tmp = util.header_to_row_idx['Temp']
21 | idx_mod = util.header_to_row_idx['Module']
22 | 
23 | # --- 一個經緯度下的模組數量
24 | # lat_lon_to_mod_dic = {}
25 | # for line in cnt:
26 | #     arr = line.split(',')
27 | #     key = '{}-{}'.format(arr[idx_lat], arr[idx_lon])
28 | #     if key not in lat_lon_to_mod_dic:
29 | #         lat_lon_to_mod_dic[key] = set()
30 | #     lat_lon_to_mod_dic[key].add(arr[idx_mod])
31 | 
32 | # for key, mod in lat_lon_to_mod_dic.items():
33 | #     print(key, mod)
34 | 
35 | lat_lon_mod_to_dat_dic = {}
36 | for line in cnt:
37 |     arr = line.split(',')
38 |     key = '{}-{}-{}-{}'.format(arr[idx_lat], arr[idx_lon], arr[idx_mod], arr[idx_cap])
39 |     if key not in lat_lon_mod_to_dat_dic:
40 |         lat_lon_mod_to_dat_dic[key] = {
41 |             'dates': [],
42 |             'temp_ms': [],
43 |             'temps': [],
44 |             'irr_ms': [],
45 |             'irrs': [],
46 |             'capacity': [],
47 |             'generation': [],
48 |         }
49 |     lat_lon_mod_to_dat_dic[key]['dates'].append(arr[idx_dat])
50 |     lat_lon_mod_to_dat_dic[key]['temp_ms'].append(arr[idx_tpm])
51 |     lat_lon_mod_to_dat_dic[key]['temps'].append(arr[idx_tmp])
52 |     lat_lon_mod_to_dat_dic[key]['irr_ms'].append(arr[idx_irm])
53 |     lat_lon_mod_to_dat_dic[key]['irrs'].append(arr[idx_irr])
54 |     lat_lon_mod_to_dat_dic[key]['capacity'].append(arr[idx_cap])
55 |     lat_lon_mod_to_dat_dic[key]['generation'].append(arr[idx_gen])
56 | 
57 | keys = sorted(lat_lon_mod_to_dat_dic.keys())
58 | 
59 | for idx, key in enumerate(keys):
60 |     item = lat_lon_mod_to_dat_dic[key]
61 |     print(
62 |         key,
63 |         # len(item['dates']),
64 |         # ''.join([
65 |         #     '1' if val != '' else '0' for val in item['temp_ms']
66 |         # ]),
67 |         # item['dates']
68 |     )
69 | 
70 | # plt.figure()
71 | for idx, key in enumerate(keys):
72 |     item = lat_lon_mod_to_dat_dic[key]
73 |     # data = np.array([
74 |     #     [float(t), float(tm)] \
75 |     #         for t, tm in zip(item['irr_ms'], item['temp_ms']) \
76 |     #             if t != '' and tm != ''
77 |     # ])
78 |     data = np.array([float(val) if val != '' else np.nan for val in item['irr_ms']])
79 |     if data.size == 0:
80 |         continue
81 |     plt.figure()
82 |     # plt.subplot(2, 7, idx+1)
83 |     # plt.plot(data[:, 0], data[:, 1], '.')
84 |     plt.plot(data, '.-')
85 |     plt.title(key)
86 |     plt.savefig('figures/{}_irr_ms.png'.format(idx+1))
87 | 
88 | # plt.show()
89 | 


--------------------------------------------------------------------------------
/test_xgb.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | 
 4 | import numpy as np
 5 | import weightedstats as ws
 6 | 
 7 | import util
 8 | 
 9 | np.set_printoptions(linewidth=150)
10 | 
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('model_dir', help='Model directory name')
13 | parser.add_argument('--output_file_name', default='submission.csv', help='Results file name')
14 | parser.add_argument('--n_fold_train', type=int, default=7)
15 | args = parser.parse_args()
16 | 
17 | with open('data/test.csv', 'r') as fin:
18 |     cnt = fin.read().splitlines()[1:]
19 |     print('Data count:', len(cnt))
20 | 
21 | print('Loading extra data...')
22 | ext_data_dict_month, ext_header_to_row_idx_month = util.load_external_monthly_report()
23 | ext_data_dict_day, ext_header_to_row_idx_day = util.load_external_daily_report()
24 | 
25 | lat_lon_mod_to_fea = {}
26 | idx_cap = util.header_to_row_idx['Capacity']
27 | idx_lat = util.header_to_row_idx['Lat']
28 | idx_lon = util.header_to_row_idx['Lon']
29 | idx_mod = util.header_to_row_idx['Module']
30 | for idx, line in enumerate(cnt):
31 |     arr = line.split(',')
32 |     key = '{}-{}-{}-{}'.format(arr[idx_lat], arr[idx_lon], arr[idx_mod], arr[idx_cap])
33 |     fea, _ = util.fea_ext(
34 |         arr, ext_data_dict_month, ext_header_to_row_idx_month, ext_data_dict_day, ext_header_to_row_idx_day
35 |     )
36 |     if key not in lat_lon_mod_to_fea:
37 |         lat_lon_mod_to_fea[key] = {
38 |             'fea': [],
39 |             'id': [],
40 |         }
41 |     lat_lon_mod_to_fea[key]['fea'].append(fea)
42 |     lat_lon_mod_to_fea[key]['id'].append(int(arr[0]))
43 | 
44 | fea_all = []
45 | id_all = []
46 | for key in lat_lon_mod_to_fea:
47 |     lat_lon_mod_to_fea[key]['fea'] = np.array(lat_lon_mod_to_fea[key]['fea'])
48 |     fea_all.append(lat_lon_mod_to_fea[key]['fea'])
49 |     id_all.append(lat_lon_mod_to_fea[key]['id'])
50 |     print('{}, raw shape: {}'.format(
51 |         key, fea_all[-1].shape
52 |     ))
53 | 
54 | fea_all = np.vstack(fea_all)
55 | id_all = np.hstack(id_all)
56 | print('Overall shape:', fea_all.shape, id_all.shape)
57 | 
58 | pred_all = []
59 | # last_loss_all = []
60 | for fold in range(args.n_fold_train):
61 |     print('Predicting fold', fold)
62 |     model_t = util.load_pkl(os.path.join(args.model_dir, 'model_{}_time_range_fold.pkl'.format(fold)))
63 |     model_s = util.load_pkl(os.path.join(args.model_dir, 'model_{}_sort_fold.pkl'.format(fold)))
64 |     # last_loss_all.append(model.evals_result()['validation_0']['rmse'][-1])
65 |     pred_t = model_t.predict(fea_all)
66 |     pred_s = model_s.predict(fea_all)
67 |     print('Shape:', pred_t.shape, pred_s.shape)
68 |     pred_all.append(pred_t)
69 |     pred_all.append(pred_s)
70 | # last_loss_all = np.array(last_loss_all)
71 | # weight_all = 1 / last_loss_all
72 | # weight_all = weight_all / sum(weight_all)
73 | 
74 | pred_all = np.vstack(pred_all)
75 | print('Finish prediction, shape:', pred_all.shape)
76 | 
77 | sort_idx = np.argsort(id_all)
78 | pred_all = pred_all[:, sort_idx]
79 | 
80 | with open(args.output_file_name, 'w') as fout:
81 |     fout.write('ID,Generation\n')
82 |     for i in range(pred_all.shape[1]):
83 |         fout.write('{},{:.6f}\n'.format(
84 |             i+1, np.median(pred_all[:, i])
85 |         ))
86 | 


--------------------------------------------------------------------------------
/calc_lat_lon.py:
--------------------------------------------------------------------------------
 1 | from itertools import permutations
 2 | 
 3 | import matplotlib.pyplot as plt
 4 | 
 5 | pred = list(map(float, '25.11-121.26'.split('-')))
 6 | 
 7 | sites = {
 8 |     # 彰化
 9 |     'shenggang': [24.1489, 120.4844],
10 |     'xianxi': [24.1433, 120.4435],
11 |     'lukang': [24.0753, 120.4304],
12 |     'fuxing': [24.0412, 120.4376],
13 |     'puyang': [24.0003, 120.4316],
14 |     'xiushui': [24.0340, 120.5038],
15 |     'huatan': [24.0320, 120.5494],
16 |     'fenyuan': [24.0156, 120.6213],
17 |     'xihu': [23.9483, 120.4791],
18 |     'puxin': [23.9476, 120.5254],
19 |     'yuanlin': [23.9465, 120.5855],
20 |     # 一些台中
21 |     'wuri': [24.1070, 120.6241],
22 |     'dadu': [24.1529, 120.5721],
23 |     'longjing': [24.1845, 120.5289],
24 |     # 桃園
25 |     # 'jhonda': [24.9661, 121.0085], # 無資料
26 |     # 'shueiwei': [24.9400, 121.0871], # 無資料
27 |     'yangmei': [24.9123, 121.1430],
28 |     'xinwu': [25.0067, 121.0474],
29 |     # 'guanyin_ins': [25.0647, 121.1148], # 無資料
30 |     'guanyin': [25.0270, 121.1533],
31 |     # 'jhuwei': [25.1126, 121.2398], # 無資料
32 |     'luzhu': [25.0842, 121.2657],
33 |     'zhongli': [24.9776, 121.2563],
34 |     'taoyuan': [24.9924, 121.3231],
35 |     'guishan': [25.0284, 121.3865],
36 |     'pingjhen': [24.8975, 121.2146],
37 |     'bade': [24.9287, 121.2832],
38 |     # 一些台北
39 |     # 'n039k': [25.0643, 121.3838], # 無資料
40 |     # 'linkou': [25.0721, 121.3808], # 無資料
41 |     # 一些新竹
42 |     'waihu': [24.9177, 120.9687],
43 |     'hukou': [24.9047, 121.0436],
44 |     'bade': [24.9287, 121.2832],
45 | }
46 | 
47 | 
def get_dist_pa(p, a):
    """Return the straight-line (Euclidean) distance between 2-D points ``p`` and ``a``.

    Both arguments are indexable pairs; only elements 0 and 1 are read.
    """
    d0 = p[0] - a[0]
    d1 = p[1] - a[1]
    return (d0 ** 2 + d1 ** 2) ** 0.5
50 | 
51 | 
def get_dist_pab(p, a, b, op):
    """Combine the Euclidean distances from point ``p`` to points ``a`` and ``b``.

    Args:
        p: query point, an indexable pair (elements 0 and 1 are read).
        a: first reference point, same layout as ``p``.
        b: second reference point, same layout as ``p``.
        op: ``'add'`` to return the sum of the two distances, ``'mul'`` to
            return their product.

    Returns:
        The combined distance as a float.

    Raises:
        ValueError: if ``op`` is neither ``'add'`` nor ``'mul'``.
    """
    # The distance to the a-b midpoint used to be computed here as well, but
    # it was never part of the result, so it is no longer evaluated.
    pa = ((p[0] - a[0]) ** 2 + (p[1] - a[1]) ** 2) ** 0.5
    pb = ((p[0] - b[0]) ** 2 + (p[1] - b[1]) ** 2) ** 0.5
    if op == 'add':
        return pa + pb
    if op == 'mul':
        return pa * pb
    raise ValueError('op not supported')
63 | 
64 | 
def get_dist_pabc(p, a, b, c, op):
    """Combine the distances from ``p`` to three points and to their centroid.

    Args:
        p: query point, an indexable pair (elements 0 and 1 are read).
        a: first reference point, same layout as ``p``.
        b: second reference point, same layout as ``p``.
        c: third reference point, same layout as ``p``.
        op: ``'add'`` to return the sum of the four distances (to ``a``,
            ``b``, ``c`` and their centroid), ``'mul'`` to return their
            product.

    Returns:
        The combined distance as a float.

    Raises:
        ValueError: if ``op`` is neither ``'add'`` nor ``'mul'``.
    """
    # Centroid of the three reference points: the coordinate sums must be
    # divided by 3 (dividing by 2 put the point outside the triangle).
    g = ((a[0] + b[0] + c[0]) / 3, (a[1] + b[1] + c[1]) / 3)
    pa = ((p[0] - a[0]) ** 2 + (p[1] - a[1]) ** 2) ** 0.5
    pb = ((p[0] - b[0]) ** 2 + (p[1] - b[1]) ** 2) ** 0.5
    pc = ((p[0] - c[0]) ** 2 + (p[1] - c[1]) ** 2) ** 0.5
    pg = ((p[0] - g[0]) ** 2 + (p[1] - g[1]) ** 2) ** 0.5
    if op == 'add':
        return pa + pb + pc + pg
    if op == 'mul':
        return pa * pb * pc * pg
    raise ValueError('op not supported')
77 | 
78 | 
# Rank every candidate site by its distance to the query coordinate, nearest first.
site_and_dist = [(name, get_dist_pa(pred, loc)) for name, loc in sites.items()]
site_and_dist.sort(key=lambda pair: pair[1])
# Alternative rankings over ordered pairs / triples of sites:
# site_and_dist = sorted([((name_a, name_b), get_dist_pab(pred, loc_a, loc_b, 'add')) for (name_a, loc_a), (name_b, loc_b) in permutations(sites.items(), r=2)], key=lambda x: x[1])
# site_and_dist = sorted([((name_a, name_b, name_c), get_dist_pabc(pred, loc_a, loc_b, loc_c, 'add')) for (name_a, loc_a), (name_b, loc_b), (name_c, loc_c) in permutations(sites.items(), r=3)], key=lambda x: x[1])

# Report the ten best candidates.
for name, dist in site_and_dist[:10]:
    print(name, dist)

# Optional scatter plot of the query point and the sites (top-1 highlighted):
# plt.plot(pred[0], pred[1], 'd', label='Pred')
# for name, pos in sites.items():
#     if name in site_and_dist[0][0]:
#         plt.plot(pos[0], pos[1], 'rx', label='top-1')
#     else:
#         plt.plot(pos[0], pos[1], 'b.')
# plt.legend()
# plt.show()
93 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 2022-solar-power-generation-prediction
  2 | 
  3 | My implementation and sharing of this contest: https://aidea-web.tw/topic/09679060-518a-4e6f-94db-53c7d8de8138. I got rank 5 (out of 179 teams) in the Public Leaderboard and rank 10 in the Private Leaderboard.
  4 | 
  5 | ## Run My Implementation
  6 | 
  7 | ### Required libs
  8 | 
  9 | `numpy`, `sklearn`, and `xgboost`. Versions of them are not restricted as long as they're new enough. `matplotlib` is also used, but it's for data observation only, and is not required for model training and inference. Besides, if you want to try my deep learning methods (which may not be effective in this contest) in the `legacy` directory, then `torch` and `yaml` are needed.
 10 | 
 11 | ### Training
 12 | ```bash
 13 | python3 train_xgb.py save_model_dir --n_fold 7
 14 | ```
 15 | * `save_model_dir`: where you want to save the trained model.
 16 | * `--n_fold`: number of folds for cross validation. Default to 7.
 17 | * Input csv files are assumed to be in the `data` dir.
 18 | 
 19 | ### Inference
 20 | ```bash
 21 | python3 test_xgb.py model_dir --output_file_name submission.csv --n_fold_train 7
 22 | ```
 23 | * `model_dir`: directory of the trained model.
 24 | * `--output_file_name`: output file name for submission.
 25 | * `--n_fold_train`: number of folds used while training.
 26 | 
 27 | ## 作法分享
 28 | 
 29 | 以下將介紹本競賽的問題概述，以及所使用的執行環境、特徵截取、模型設計與訓練，以及預測方式。另備有競賽媒合會的[簡報](https://docs.google.com/presentation/d/17suzUIDLBmEjl1oNEXQR6xojnX-PdpDn0nuAUdyi5zk/edit?usp=sharing)版本，內容相差無幾，但若有興趣亦歡迎參考。
 30 | 
 31 | ### 問題概述
 32 | 
 33 | * 給定資料：約一年期間內，數個太陽能發電場域的經緯度、天氣觀測資料、發電模組規格，以及發電量等。
 34 |   * 每個發電場域的資料起始時間不盡相同，結束時間皆為 2021/10/28。
 35 |   * 發電場域位於彰化及桃園的數個不同鄉鎮市區。
 36 | * 預測目標：2021/10/29 起至 2022 年二月下旬止，每場域的每日發電量。
 37 |   * 不同場域的截止日期不同，位於彰化者皆為 2022/2/16，位於桃園者皆為 2022/2/17
 38 | * 評估標準：RMSE。
 39 | 
 40 | ### 執行環境
 41 | 
 42 | 硬體方面為 ASUS P2440 UF 筆電，含 i7-8550U CPU 及 MX130 顯示卡，主記憶體擴充至 20 GB。程式語言為 Python 3，函式庫則如本說明前半部所示，皆未特別指定版本。
 43 | 
 44 | ### 特徵擷取
 45 | 
 46 | 於本比賽中，除了大會給定的資料外，亦有使用外部資料。對於兩種資料的特徵擷取，分別介紹如下。
 47 | 
 48 | #### 給定資料
 49 | 
 50 | 從大會給定的資料中，抽出特徵共 59 維，細節如下：
 51 | * 可能缺值類共 6 維: Temp_m, Irradiance, Temp 各 2 維，一維代表是否缺值，另一維為實際值
 52 | * 模組相關共 10 維: 模組 one-hot 共 4 維，以及模組規格（峰值輸出等）共 6 維
 53 | * 經緯度相關 11 維: 發電場域經緯度的 one-hot
 54 | * 時間相關 9 維: 月份、月份以三月為第一月、月份以六月為第一月、月份以九月為第一月、是否為春季（三到五月）、是否為夏季（六到八月）、是否為秋季（九到十一月）、是否為冬季（十二到二月）、日期
 55 | * 其餘原始特徵及角度特徵工程 10 維: 裝置容量、發電場域經度、發電場域緯度、角度、角度正弦值、角度餘弦值、角度雙曲正切值、角度雙曲正切值取負號再加一、角度正負號（零度設為正號）、日照計之日射量
 56 | * 其餘特徵工程 13 維:
 57 |   * 1 維: 裝置容量乘以日照計之日射量除以一千
 58 |   * 5 維: 日照計之日射量分別乘以一減角度、角度正弦值、角度餘弦值、角度雙曲正切值、角度雙曲正切值取負號再加一
 59 |   * 5 維: 裝置容量乘以前述 5 維
 60 |   * 2 維: 模溫計之模板溫度除以當日平均氣溫，以及日射量除以日照計之日射量，遇有缺值時皆填零
 61 | 
 62 | #### 外部資料
 63 | 
 64 | 外部資料的來源為氣象局的[觀測資料查詢](https://e-service.cwb.gov.tw/HistoryDataQuery/index.jsp)，測站的選擇方式是取離發電場域最近、且有觀測資料的測站（手動準備好測站座標，以 `calc_lat_lon.py` 計算離某發電場域最近的測站）；資料格式為月報表，即逐日的觀測資料。特徵的列表如下，共 39 維；代號所指的觀測項目和單位等，請自行參考報表：
 65 | * 測站類 5 維: 測站 one-hot
 66 | * 氣壓類 4 維: StnPres, SeaPres, StnPresMax, StnPresMin
 67 | * 溫度類 4 維: Temperature, T Max, T Min, Td dew point
 68 | * 濕度類 2 維: RH, RHMin
 69 | * 風力類 4 維: WS, WD, WSGust, WDGust
 70 | * 降水類 4 維: Precp, PrecpHour, PrecpMax10, PrecpMax60
 71 | * 日照及其他類 7 維: SunShine, SunShineRate, GloblRad, VisbMean, EvapA, UVI Max, Cloud Amount
 72 | * 特徵工程 9 維: SunShine, SunShineRate, 以及 GloblRad 分別乘以 angle 的正弦值、angle 的雙曲正切值，以及 1 - angle
 73 | 
 74 | #### 其他說明
 75 | 
 76 | * 特徵的部份設計，如多一維表示缺值，或者除以常數等，是因早先使用深度學習模型時而設計，而模型轉換為 XGBRegressor 仍將其保留，並未拆除。
 77 | * 嘗試過其他特徵工程，但在嘗試當下的模型參數設定下，沒有取得比當時最佳成果好的結果。
 78 | * 試過使用兩個測站的資料，根據測站選擇與特徵使用（單獨使用或以不同方式合併）方式的不同，效果與一個測站的相比，會稍差或相同。未嘗試過使用三個以上的測站資料。
 79 | * 使用過討論區參賽者提供的[這個工具](https://github.com/JackyWeng526/Taiwan_Weather_Data)來下載日報表，以加入逐小時的觀測資料為特徵，但沒有取得比較好的效果。
 80 | 
 81 | ### 模型設計、訓練與觀察
 82 | 
 83 | #### 模型設計與訓練
 84 | 
 85 | 本次比賽使用的模型為 XGBRegressor，訓練方式為 n folds cross validation。模型的細節參數請參考 `train_xgb.py` 的 `param` 變數，未設定之參數係依照預設值，未進行修改。所有模型的預測目標皆是直接輸出每場域每日的發電量，沒有另外作正規化等調整。
 86 | 
 87 | 切 fold 的方式分為以下兩種，實驗結果為以下兩種方式都使用，來訓練出 2n 個模型，效果會稍微好一些：
 88 | 1. 每個場域各自依照時間區段來切。例如某場域有十個月的資料要切成 5 個 folds，則前兩個月為 fold 0，接下來的兩個月為 fold 1，依此類推。
 89 | 2. 先將全部訓練資料依照發電量排序，再根據索引值除以 fold 數的餘數來切。例如要切成 5 個 folds，則排序後的位置為 0, 5, 10, ... (餘數 0)者為 fold 0，位置為 1, 6, 11, ... (餘數 1)者為 fold 1，依此類推。
 90 | 
 91 | 另外，由於 XGBRegressor 並不會自動回傳達到最佳 validation loss 的模型，因此我先用 `param` 變數當中所示的參數訓練一次，待取得 validation loss 的曲線後，再根據最佳 loss 的位置來重新設定 `n_estimators` 的值，並重新訓練一次，當作最佳 validation loss 的模型來使用。
 92 | 
 93 | 由上述方式產生出的 2n 個模型，都會做為預測使用。另外，在使用 XGBRegressor 之前，我先使用了深度學習，但效果不是很理想，RMSE 最佳僅約 290.0 上下。
 94 | 
 95 | #### Feature Importance 觀察
 96 | 
 97 | 不同參數訓練出的模型，以及不同 fold 當中，比較重要的特徵可能不盡相同，但大致可觀察到這幾個現象：
 98 | * 裝置容量和日射量穩定佔據前兩名，且裝置容量占據五成以上的重要性，而在早期未引入外部資料時，甚至有觀察到裝置容量占了八成左右的比重。
 99 | * 內部及外部有關日射量的特徵工程，有少部分明顯的佔據接下來的幾名，個別的重要性約 1% 至 10%。事實上，在僅使用內部資料時已觀察到相關現象，故外部資料的特徵工程也參考了此現象來設計。
100 | * 發電模組規格、季節，以及部分的角度特徵工程等特徵，其重要性為 0，可能是模型已從其他特徵上學習到相關資訊，例如模組的 one-hot 已經隱含地包含了模組規格，故模型可能就不再需要更細節的規格資訊。
101 | 
102 | ### 預測
103 | 
104 | 預測時會將 2n 個模型的結果，取中位數做為最終輸出。
105 | 
106 | 我亦嘗試過使用平均值，或者只使用部分模型等其他方式來產生最終輸出，例如根據 validation loss 設定權重或者去除表現較差的模型，但是都沒有達到比較好的效果。亦可以經由 `test_xgb.py` 的 `--n_fold_train` 參數，帶入比訓練時的 folds 數目少的數字，來達到只使用前 k 個模型來預測的效果，但並未於實驗中測試過。
107 | 
108 | ### 心得
109 | 
110 | * 有些調整參數的方向比較晚開始嘗試，但是最後期限已近，所剩的上傳次數不夠我再嘗試，因此模型應該還有微幅調整的空間；雖然光是靠調參數，應該也只會有微幅變化，比較難產生決定性的影響。
111 | * 本次競賽比較特別的是，設立了一個「持續領先獎」，也就是佔據排行榜第一名最長時間的獎項，但比較可惜的是，這個獎項搭配上了 AIdea 平台以最後上傳而非最佳結果為準的設計，可能會讓佔據第一名的參賽者怯於持續調整精進，我認為平台在這方面有改進空間。
112 | 


--------------------------------------------------------------------------------
/train_xgb.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import warnings
  4 | from copy import deepcopy
  5 | 
  6 | import numpy as np
  7 | from sklearn.metrics import mean_squared_error
  8 | 
  9 | import util
 10 | from xgboost import XGBRegressor
 11 | 
 12 | np.set_printoptions(linewidth=150)
 13 | 
 14 | parser = argparse.ArgumentParser()
 15 | parser.add_argument('save_model_dir_name')
 16 | parser.add_argument('--n_fold', type=int, default=7)
 17 | args = parser.parse_args()
 18 | 
 19 | print('Loading data...')
 20 | with open('data/train.csv', 'r') as fin:
 21 |     cnt = fin.read().splitlines()[1:]
 22 |     print('\tData count:', len(cnt))
 23 | 
 24 | print('Loading extra data...')
 25 | ext_data_dict_month, ext_header_to_row_idx_month = util.load_external_monthly_report()
 26 | ext_data_dict_day, ext_header_to_row_idx_day = util.load_external_daily_report()
 27 | 
 28 | lat_lon_mod_to_fea = {}
 29 | idx_cap = util.header_to_row_idx['Capacity']
 30 | idx_lat = util.header_to_row_idx['Lat']
 31 | idx_lon = util.header_to_row_idx['Lon']
 32 | idx_mod = util.header_to_row_idx['Module']
 33 | for idx, line in enumerate(cnt):
 34 |     arr = line.split(',')
 35 |     key = '{}-{}-{}-{}'.format(arr[idx_lat], arr[idx_lon], arr[idx_mod], arr[idx_cap])
 36 |     fea, ans = util.fea_ext(
 37 |         arr, ext_data_dict_month, ext_header_to_row_idx_month, ext_data_dict_day, ext_header_to_row_idx_day
 38 |     )
 39 |     if key not in lat_lon_mod_to_fea:
 40 |         lat_lon_mod_to_fea[key] = {
 41 |             'fea': [],
 42 |             'ans': [],
 43 |         }
 44 |     lat_lon_mod_to_fea[key]['fea'].append(fea)
 45 |     lat_lon_mod_to_fea[key]['ans'].append(ans)
 46 | 
 47 | fea_all = []
 48 | ans_all = []
 49 | fold_all = []
 50 | for key in lat_lon_mod_to_fea:
 51 |     lat_lon_mod_to_fea[key]['fea'] = np.array(lat_lon_mod_to_fea[key]['fea'])
 52 |     lat_lon_mod_to_fea[key]['ans'] = np.array(lat_lon_mod_to_fea[key]['ans'])[:, np.newaxis]
 53 | 
 54 |     data_num = lat_lon_mod_to_fea[key]['ans'].shape[0]
 55 |     fold = np.zeros(data_num)
 56 |     for f in range(1, args.n_fold):
 57 |         fold[int(data_num/args.n_fold)*f:] += 1
 58 | 
 59 |     fea_all.append(lat_lon_mod_to_fea[key]['fea'])
 60 |     ans_all.append(lat_lon_mod_to_fea[key]['ans'])
 61 |     fold_all.append(fold)
 62 | 
 63 |     print('{}, shapes: {}, {}, {}'.format(
 64 |         key, fea_all[-1].shape, ans_all[-1].shape, fold_all[-1].shape,
 65 |     ))
 66 | 
 67 | fea_all = np.vstack(fea_all)
 68 | ans_all = np.vstack(ans_all)
 69 | fold_all = np.hstack(fold_all)
 70 | print('Overall shapes:', fea_all.shape, ans_all.shape, fold_all.shape)
 71 | 
 72 | param = {
 73 |     'n_estimators': 1400,
 74 |     'max_depth': 9,
 75 |     'learning_rate': 0.01,
 76 |     'eval_metric': mean_squared_error,
 77 |     'min_child_weight': 4,
 78 |     'gamma': 0.5,
 79 |     'reg_lambda': 2,
 80 |     'reg_alpha': 0.001,
 81 |     'max_delta_step': 2000,
 82 |     'n_jobs': 7,
 83 |     'verbosity': 0,
 84 | }
 85 | 
 86 | if not os.path.exists(args.save_model_dir_name):
 87 |     os.makedirs(args.save_model_dir_name, 0o755)
 88 |     print('Model will be saved in {}'.format(args.save_model_dir_name))
 89 | else:
 90 |     warnings.warn('Dir {} already exist, result files will be overwritten.'.format(args.save_model_dir_name))
 91 | 
 92 | models = {}
 93 | data_num = fea_all.shape[0]
 94 | for fold in range(args.n_fold):
 95 |     valid_idx = np.where(fold_all == fold)[0]
 96 |     train_idx = np.where(fold_all != fold)[0]
 97 | 
 98 |     print('Fold {}, train num {}, test num {}'.format(
 99 |         fold, len(train_idx), len(valid_idx),
100 |     ))
101 | 
102 |     # 1st pass
103 |     models[fold] = XGBRegressor(**param)
104 |     models[fold].fit(
105 |         fea_all[train_idx],
106 |         ans_all[train_idx],
107 |         eval_set=[(fea_all[valid_idx], ans_all[valid_idx])],
108 |     )
109 | 
110 |     # 2nd pass
111 |     loss = models[fold].evals_result()['validation_0']['rmse']
112 |     local_param = deepcopy(param)
113 |     local_param['n_estimators'] = np.argsort(loss)[0] + 1
114 |     models[fold] = XGBRegressor(**local_param)
115 |     models[fold].fit(fea_all[train_idx], ans_all[train_idx])
116 | 
117 |     util.save_pkl(os.path.join(args.save_model_dir_name, 'model_{}_time_range_fold.pkl'.format(fold)), models[fold])
118 |     np.save(os.path.join(args.save_model_dir_name, 'va_pred_{}_time_range_fold.npy'.format(fold)), models[fold].predict(fea_all[valid_idx]))
119 |     np.save(os.path.join(args.save_model_dir_name, 'va_ans_{}_time_range_fold.npy'.format(fold)), ans_all[valid_idx])
120 | 
121 | sort_idx = np.argsort(ans_all[:, 0])
122 | fea_all = fea_all[sort_idx]
123 | ans_all = ans_all[sort_idx]
124 | print('Sorted shapes:', sort_idx.shape, fea_all.shape, ans_all.shape)
125 | 
126 | models = {}
127 | data_num = fea_all.shape[0]
128 | for fold in range(args.n_fold):
129 |     valid_idx = np.where(np.arange(data_num)%args.n_fold == fold)[0]
130 |     train_idx = np.where(np.arange(data_num)%args.n_fold != fold)[0]
131 | 
132 |     print('Fold {}, train num {}, test num {}'.format(
133 |         fold, len(train_idx), len(valid_idx),
134 |     ))
135 | 
136 |     # 1st pass
137 |     models[fold] = XGBRegressor(**param)
138 |     models[fold].fit(
139 |         fea_all[train_idx],
140 |         ans_all[train_idx],
141 |         eval_set=[(fea_all[valid_idx], ans_all[valid_idx])],
142 |     )
143 | 
144 |     # 2nd pass
145 |     loss = models[fold].evals_result()['validation_0']['rmse']
146 |     local_param = deepcopy(param)
147 |     local_param['n_estimators'] = np.argsort(loss)[0] + 1
148 |     models[fold] = XGBRegressor(**local_param)
149 |     models[fold].fit(fea_all[train_idx], ans_all[train_idx])
150 | 
151 |     util.save_pkl(os.path.join(args.save_model_dir_name, 'model_{}_sort_fold.pkl'.format(fold)), models[fold])
152 |     np.save(os.path.join(args.save_model_dir_name, 'va_pred_{}_sort_fold.npy'.format(fold)), models[fold].predict(fea_all[valid_idx]))
153 |     np.save(os.path.join(args.save_model_dir_name, 'va_ans_{}_sort_fold.npy'.format(fold)), ans_all[valid_idx])
154 | 


--------------------------------------------------------------------------------
/legacy/train.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | import time
  5 | import warnings
  6 | 
  7 | import numpy as np
  8 | import torch
  9 | from torch import nn, optim
 10 | from torch.autograd import Variable
 11 | 
 12 | import model
 13 | import util
 14 | 
# Widen numpy's print line width so printed arrays wrap less often.
np.set_printoptions(linewidth=150)
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# --- Command-line arguments
parser = argparse.ArgumentParser()
# Directory receiving config.yml, the per-fold training reports and the saved models.
parser.add_argument('save_model_dir_name')
# Maximum number of training epochs per fold.
parser.add_argument('--epoch', type=int, default=500)
# Early stopping: number of consecutive epochs without a better validation loss before a fold stops.
parser.add_argument('--va_not_imp_limit', '-va', type=int, default=500)
# Number of cross-validation folds (one network is trained per fold).
parser.add_argument('--n_fold', type=int, default=5)
args = parser.parse_args()
 24 | 
 25 | fea_all = []
 26 | ans_all = []
 27 | with open('data/train.csv', 'r') as fin:
 28 |     cnt = fin.read().splitlines()[1:]
 29 |     print('Data count:', len(cnt))
 30 | for idx, line in enumerate(cnt):
 31 |     fea, ans = util.fea_ext(line.split(','))
 32 |     fea_all.append(fea)
 33 |     ans_all.append(ans)
 34 | fea_all = np.array(fea_all)
 35 | ans_all = np.array(ans_all)[:, np.newaxis]
 36 | print('Overall shapes:', fea_all.shape, ans_all.shape)
 37 | 
 38 | sort_idx = np.argsort(ans_all[:, 0])
 39 | fea_all = fea_all[sort_idx]
 40 | ans_all = ans_all[sort_idx]
 41 | print('Sorted shapes:', sort_idx.shape, fea_all.shape, ans_all.shape)
 42 | 
 43 | # --- Setup network
 44 | nn_param = {
 45 |     'dim_fea': fea_all.shape[1],
 46 |     'batch': 256,
 47 |     'optm_params': {
 48 |         'lr': 0.001
 49 |     },
 50 | }
 51 | 
 52 | print('Setting network')
 53 | networks = {}
 54 | optimizers = {}
 55 | schedulers = {}
 56 | loss_func = nn.MSELoss()
 57 | for f in range(args.n_fold):
 58 |     networks[f] = model.MLP(nn_param['dim_fea'])
 59 |     networks[f].to(device)
 60 |     optimizers[f] = optim.Adam(list(networks[f].parameters()), lr=nn_param['optm_params']['lr'])
 61 |     schedulers[f] = torch.optim.lr_scheduler.StepLR(optimizers[f], step_size=1, gamma=0.9999)
 62 | 
 63 | # --- Write config
 64 | if not os.path.exists(args.save_model_dir_name):
 65 |     os.makedirs(args.save_model_dir_name, 0o755)
 66 |     print('Model will be saved in {}'.format(args.save_model_dir_name))
 67 | else:
 68 |     warnings.warn('Dir {} already exist, result files will be overwritten.'.format(args.save_model_dir_name))
 69 | util.write_cfg(os.path.join(args.save_model_dir_name, 'config.yml'), nn_param)
 70 | 
 71 | data_num = fea_all.shape[0]
 72 | for fold in range(args.n_fold):
 73 | 
 74 |     best_va_loss = 9999999
 75 | 
 76 |     valid_idx = np.where(np.arange(data_num)%args.n_fold==fold)[0]
 77 |     train_idx = np.where(np.arange(data_num)%args.n_fold!=fold)[0]
 78 | 
 79 |     print('Fold {}, train num {}, test num {}'.format(
 80 |         fold, len(train_idx), len(valid_idx),
 81 |     ))
 82 | 
 83 |     data_loader_train = torch.utils.data.DataLoader(
 84 |         util.Data2Torch({
 85 |             'fea': fea_all[train_idx],
 86 |             'ans': ans_all[train_idx],
 87 |         }),
 88 |         shuffle=True,
 89 |         batch_size=nn_param['batch'],
 90 |     )
 91 | 
 92 |     data_loader_valid = torch.utils.data.DataLoader(
 93 |         util.Data2Torch({
 94 |             'fea': fea_all[valid_idx],
 95 |             'ans': ans_all[valid_idx],
 96 |         }),
 97 |         batch_size=nn_param['batch'],
 98 |     )
 99 | 
100 |     va_not_imporved_continue_count = 0
101 |     totalTime = 0
102 |     fout = open(os.path.join(args.save_model_dir_name, 'train_report_{}.txt'.format(fold)), 'w')
103 |     for epoch in range(args.epoch):
104 |         util.print_and_write_file(fout, 'epoch {}/{}...'.format(epoch + 1, args.epoch))
105 |         tic = time.time()
106 |         # --- Batch training
107 |         networks[fold].train()
108 |         training_loss = 0
109 |         n_batch = 0
110 |         optimizers[fold].zero_grad()
111 |         for idx, data in enumerate(data_loader_train):
112 |             pred = networks[fold](Variable(data['fea'].to(device)))
113 |             ans = Variable(data['ans'].to(device))
114 |             loss = torch.sqrt(loss_func(pred, ans))
115 |             optimizers[fold].zero_grad()
116 |             loss.backward()
117 |             optimizers[fold].step()
118 |             training_loss += loss.data
119 |             n_batch += 1
120 |         # --- Training loss
121 |         training_loss_avg = training_loss / n_batch
122 |         util.print_and_write_file(
123 |             fout, '\tTraining loss (avg over batch): {}, {}, {}'.format(
124 |                 training_loss_avg, training_loss, n_batch
125 |             )
126 |         )
127 |         # --- Batch validation
128 |         networks[fold].eval()
129 |         va_loss = 0
130 |         n_batch = 0
131 |         for idx, data in enumerate(data_loader_valid):
132 |             ans = Variable(data['ans'].to(device)).float()
133 |             with torch.no_grad():
134 |                 pred = networks[fold](Variable(data['fea'].to(device)))
135 |                 loss = torch.sqrt(loss_func(pred, ans))
136 |             va_loss += loss.data
137 |             n_batch += 1
138 |         # --- Validation loss
139 |         va_loss_avg = va_loss / n_batch
140 |         util.print_and_write_file(
141 |             fout, '\tValidation loss (avg over batch): {}, {}, {}'.format(
142 |                 va_loss_avg, va_loss, n_batch
143 |             )
144 |         )
145 |         # --- Save if needed
146 |         if va_loss_avg < best_va_loss:
147 |             best_va_loss = va_loss_avg
148 |             va_not_imporved_continue_count = 0
149 |             util.print_and_write_file(fout, '\tWill save bestVa model')
150 |             torch.save(
151 |                 networks[fold].state_dict(),
152 |                 os.path.join(args.save_model_dir_name, 'model_{}_bestVa'.format(fold))
153 |             )
154 |         else:
155 |             va_not_imporved_continue_count += 1
156 |             util.print_and_write_file(fout, '\tva_not_imporved_continue_count: {}'.format(va_not_imporved_continue_count))
157 |             if va_not_imporved_continue_count >= args.va_not_imp_limit:
158 |                 break
159 |         util.print_and_write_file(fout, '\tLearning rate used for this epoch: {}'.format(schedulers[fold].get_last_lr()[0]))
160 |         if schedulers[fold].get_last_lr()[0] >= 1e-4:
161 |             schedulers[fold].step()
162 |         # --- Time
163 |         toc = time.time()
164 |         totalTime += toc - tic
165 |         util.print_and_write_file(fout, '\tTime: {:.3f} sec, estimated remaining: {:.3} hr'.format(
166 |             toc - tic,
167 |             1.0 * totalTime / (epoch + 1) * (args.epoch - (epoch + 1)) / 3600
168 |         ))
169 |         fout.flush()
170 |     fout.close()
171 |     # Save model
172 |     torch.save(
173 |         networks[fold].state_dict(),
174 |         os.path.join(args.save_model_dir_name, 'model_{}_final'.format(fold))
175 |     )
176 |     print('Model saved in {}'.format(args.save_model_dir_name))
177 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | import pickle
  4 | from copy import deepcopy
  5 | 
  6 | import numpy as np
  7 | 
# Observation data lookup: https://e-service.cwb.gov.tw/HistoryDataQuery/index.jsp
# Column name -> position within a comma-split data row (presumably the header of the train/test CSV; TODO confirm).
header_to_row_idx = {h: i for i, h in enumerate('ID,Date,Temp_m,Generation,Irradiance,Capacity,Lat,Lon,Angle,Irradiance_m,Temp,Module'.split(','))}
# '<lat>-<lon>' key -> list with the ID of the nearest weather station.
# Trailing comments: county, township, two dates (presumably the covered date range; TODO confirm), station name and station ID.
lat_lon_to_site = { # nearest
    '24.04-120.52': ['C0G780'], #  Changhua, Xiushui        2021/5/19  2022/2/16 Xiushui station C0G780
    '24.06-120.47': ['C0G770'], #  Changhua, Lukang         2021/6/12  2022/2/16 Fuxing station C0G770
    '24.07-120.47': ['C0G640'], #  Changhua, Lukang         2021/5/27  2022/2/16 Lukang station C0G640
    '24.07-120.48': ['C0G780'], #  Changhua, Lukang         2021/5/21  2022/2/16 Xiushui station C0G780
    '24.08-120.5': ['C0G780'], #   Changhua, Hemei          2020/9/27  2022/2/16 Xiushui station C0G780
    '24.08-120.52': ['C0G780'], #  Changhua, Changhua City  2021/5/22  2022/2/16 Xiushui station C0G780
    '24.09-120.52': ['C0G780'], #  Changhua, Hemei          2021/5/21  2022/2/16 Xiushui station C0G780
    '24.107-120.44': ['C0G640'], # Changhua, Lukang         2020/9/23  2022/2/16 Lukang station C0G640
    '24.98-121.03': ['467050'], #  Taoyuan, Xinwu           2020/12/17 2022/2/17 Xinwu station 467050
    '25.03-121.08': ['467050'], #  Taoyuan, Guanyin         2020/12/17 2022/2/17 Xinwu station 467050
    '25.11-121.26': ['C0C620'], #  Taoyuan, Luzhu           2020/6/9   2022/2/17 Luzhu station C0C620
}
 23 | # lat_lon_to_site = { # nearest 2
 24 | #     '24.04-120.52': ['C0G780', 'C0G910'], #  彰化 秀水   2021/5/19  2022/2/16 秀水測站 C0G780 花壇測站 C0G910
 25 | #     '24.06-120.47': ['C0G770', 'C0G640'], #  彰化 鹿港   2021/6/12  2022/2/16 福興測站 C0G770 鹿港測站 C0G640
 26 | #     '24.07-120.47': ['C0G640', 'C0G770'], #  彰化 鹿港   2021/5/27  2022/2/16 鹿港測站 C0G640 福興測站 C0G770
 27 | #     '24.07-120.48': ['C0G780', 'C0G640'], #  彰化 鹿港   2021/5/21  2022/2/16 秀水測站 C0G780 鹿港測站 C0G640
 28 | #     '24.08-120.5': ['C0G780', 'C0G910'], #   彰化 和美   2020/9/27  2022/2/16 秀水測站 C0G780 花壇測站 C0G910
 29 | #     '24.08-120.52': ['C0G780', 'C0G910'], #  彰化 彰化市 2021/5/22  2022/2/16 秀水測站 C0G780 花壇測站 C0G910
 30 | #     '24.09-120.52': ['C0G780', 'C0G910'], #  彰化 和美   2021/5/21  2022/2/16 秀水測站 C0G780 花壇測站 C0G910
 31 | #     '24.107-120.44': ['C0G640', 'C0G900'], # 彰化 鹿港   2020/9/23  2022/2/16 鹿港測站 C0G640 線西測站 C0G900
 32 | #     '24.98-121.03': ['467050', 'C0D650'], #  桃園 新屋   2020/12/17 2022/2/17 新屋測站 467050 湖口測站 C0D650
 33 | #     '25.03-121.08': ['467050', 'C0C590'], #  桃園 觀音   2020/12/17 2022/2/17 新屋測站 467050 觀音測站 C0C590
 34 | #     '25.11-121.26': ['C0C620', 'C0C700'], #  桃園 蘆竹   2020/6/9   2022/2/17 蘆竹測站 C0C620 中壢測站 C0C700
 35 | # }
 36 | # lat_lon_to_site = { # m = (a+b)/2, minimize pa + pb + pm
 37 | #     '24.04-120.52': ['C0G780', 'C0G910'], #  彰化 秀水   2021/5/19  2022/2/16 秀水測站 C0G780 花壇測站 C0G910
 38 | #     '24.06-120.47': ['C0G640', 'C0G780'], #  彰化 鹿港   2021/6/12  2022/2/16 鹿港測站 C0G640 秀水測站 C0G780
 39 | #     '24.07-120.47': ['C0G640', 'C0G780'], #  彰化 鹿港   2021/5/27  2022/2/16 鹿港測站 C0G640 秀水測站 C0G780
 40 | #     '24.07-120.48': ['C0G640', 'C0G780'], #  彰化 鹿港   2021/5/21  2022/2/16 鹿港測站 C0G640 秀水測站 C0G780
 41 | #     '24.08-120.5': ['C0G890', 'C0G780'], #   彰化 和美   2020/9/27  2022/2/16 伸港測站 C0G890 秀水測站 C0G780
 42 | #     '24.08-120.52': ['C0G890', 'C0G910'], #  彰化 彰化市 2021/5/22  2022/2/16 伸港測站 C0G890 花壇測站 C0G910
 43 | #     '24.09-120.52': ['C0G890', 'C0G910'], #  彰化 和美   2021/5/21  2022/2/16 伸港測站 C0G890 花壇測站 C0G910
 44 | #     '24.107-120.44': ['C0G640', 'C0G640'], # 彰化 鹿港   2020/9/23  2022/2/16 線西測站 C0G900 鹿港測站 C0G640
 45 | #     '24.98-121.03': ['467050', 'C0D650'], #  桃園 新屋   2020/12/17 2022/2/17 新屋測站 467050 湖口測站 C0D650
 46 | #     '25.03-121.08': ['467050', 'C0C590'], #  桃園 觀音   2020/12/17 2022/2/17 新屋測站 467050 觀音測站 C0C590
 47 | #     '25.11-121.26': ['C0C620', 'C0C590'], #  桃園 蘆竹   2020/6/9   2022/2/17 蘆竹測站 C0C620 觀音測站 C0C590
 48 | # }
 49 | lat_lon_dic = {lat_lon: i for i, lat_lon in enumerate(sorted(list(lat_lon_to_site.keys())))}
 50 | site_to_idx = {
 51 |     'C0G780': 0,
 52 |     'C0G770': 1,
 53 |     'C0G640': 2,
 54 |     '467050': 3,
 55 |     'C0C620': 4,
 56 |     # 'C0G910': 5,
 57 |     # 'C0G890': 6,
 58 |     # 'C0D650': 7,
 59 |     # 'C0C590': 8,
 60 |     # 'C0G900': 9,
 61 |     # 'C0C700': 10,
 62 | }
 63 | 
 64 | 
 65 | def save_pkl(path, pkl):
 66 |     with open(path, 'wb') as handle:   
 67 |         pickle.dump(pkl, handle, protocol=pickle.HIGHEST_PROTOCOL)
 68 | 
 69 | 
 70 | def load_pkl(path):
 71 |     with open(path, 'rb') as handle:
 72 |         try:
 73 |             return pickle.load(handle)
 74 |         except:
 75 |             return pickle5.load(handle)
 76 | 
 77 | 
 78 | def print_and_write_file(fout, cnt, fout_end='\n'):
 79 |     print(cnt)
 80 |     if fout is not None:
 81 |         fout.write(cnt  + fout_end)
 82 |         fout.flush()
 83 | 
 84 | 
 85 | def load_external_monthly_report(path='data/external-monthly'):
 86 |     ret_dict = {}
 87 |     ext_header_to_row_idx = None
 88 |     for file in os.listdir(path):
 89 |         if file.endswith('.csv'):
 90 |             site, year, month = file.split('.')[0].split('-')
 91 |             month = str(int(month))
 92 |             if site not in ret_dict:
 93 |                 ret_dict[site] = {}
 94 |             if year not in ret_dict[site]:
 95 |                 ret_dict[site][year] = {}
 96 |             with open(os.path.join(path, file), 'r', encoding='utf-8') as fin:
 97 |                 cnt = fin.read().splitlines()
 98 |             if ext_header_to_row_idx is None:
 99 |                 ext_header_to_row_idx = {h.replace('"', ''): i for i, h in enumerate(cnt[1].split(','))}
100 |             cnt = cnt[1:]
101 |             ret_dict[site][year][month] = [list(map(lambda x: x.replace('"', ''), line.split(','))) for line in cnt]
102 |     return ret_dict, ext_header_to_row_idx
103 | 
104 | 
def load_external_daily_report(path='data/external-daily'):
    """Load the hourly records of every ``<site>_*.csv`` file in ``path``.

    The first column of every line is discarded. Of the remaining columns,
    the first holds a ``YYYY-MM-DD`` date and the second the hour.

    Returns:
        A tuple ``(reports, header_index)``.
        ``reports[site][year][month][date][hour]`` is the record (list of
        strings, first column removed); month, date and hour carry no leading
        zeros, and the first record seen for an hour is kept.
        ``header_index`` maps column name (quotes stripped) to position in
        such a record, taken from the first CSV encountered, or ``None`` when
        the directory holds no CSV file.
    """
    reports = {}
    header_index = None
    for file_name in os.listdir(path):
        if file_name.endswith('.csv'):
            site_reports = reports.setdefault(file_name.split('_')[0], {})
            with open(os.path.join(path, file_name), 'r', encoding='utf-8') as fin:
                lines = fin.read().splitlines()
            if header_index is None:
                titles = lines[0].split(',')[1:]
                header_index = {title.replace('"', ''): col for col, title in enumerate(titles)}
            for line in lines[1:]:
                arr = line.split(',')[1:]
                year, month, date = arr[0].split('-')
                # Normalise '05' -> '5' so keys never carry leading zeros.
                month = str(int(month))
                date = str(int(date))
                hour = str(int(arr[1]))
                per_day = (
                    site_reports
                    .setdefault(year, {})
                    .setdefault(month, {})
                    .setdefault(date, {})
                )
                if hour not in per_day:
                    per_day[hour] = list(arr)
    return reports, header_index
133 | 
134 | 
def get_possible_empty_fea(raw_val):
    """Encode a possibly empty raw value as ``[is_empty, value]``.

    Returns:
        A two-element list of floats: 1.0/0.0 flagging whether ``raw_val`` is
        the empty string, followed by the numeric value (0.0 when empty).
    """
    is_empty = raw_val == ''
    value = 0 if is_empty else raw_val
    return [float(is_empty), float(value)]
140 | 
141 | 
def get_module_fea(raw_val):
    """Return the feature list of a solar module model name.

    The first four entries one-hot encode the model; the remaining six are
    fixed per-model numbers (presumably datasheet ratings; TODO confirm
    their meaning and units).

    Raises:
        ValueError: ``raw_val`` is not one of the four known model names.
    """
    module_table = {
        'MM60-6RT-300': (1, 0, 0, 0, 300, 32.61, 9.20, 38.97, 9.68, 18.44),
        'SEC-6M-60A-295': (0, 1, 0, 0, 295, 31.60, 9.34, 39.40, 9.85, 17.74),
        'AUO PM060MW3 320W': (0, 0, 1, 0, 320, 33.48, 9.56, 40.90, 10.24, 19.20),
        'AUO PM060MW3 325W': (0, 0, 0, 1, 325, 33.66, 9.66, 41.10, 10.35, 19.50),
    }
    if raw_val not in module_table:
        raise ValueError('Unknown raw_val: ' + raw_val)
    # Hand out a fresh list so callers may modify the result freely.
    return list(module_table[raw_val])
153 | 
154 | 
def get_lat_lon_fea(lat_str, lon_str):
    """One-hot encode a site location over the keys of ``lat_lon_dic``.

    The lookup key is ``'<lat_str>-<lon_str>'``.

    Raises:
        KeyError: The location is not present in ``lat_lon_dic``.
    """
    one_hot = [0] * len(lat_lon_dic)
    one_hot[lat_lon_dic['{}-{}'.format(lat_str, lon_str)]] = 1
    return one_hot
160 | 
161 | 
def get_div_fea(a, b):
    """Return ``a / b``, or 0 unless the product of both values is positive.

    Empty strings count as 0, so a missing operand, a zero operand or
    operands of opposite sign all yield 0.
    """
    numerator = 0 if a == '' else float(a)
    denominator = 0 if b == '' else float(b)
    if numerator * denominator > 0:
        return numerator / denominator
    return 0
166 | 
167 | 
def get_ext_float(raw_str, val_of_empty=-1):
    """Convert a raw external-report cell to a number.

    Args:
        raw_str: Cell text.
        val_of_empty: Value returned for the placeholder markers
            ``'...'``, ``'X'``, ``'/'``, ``'&'``, ``' '`` and ``''``.

    Returns:
        ``val_of_empty`` for a placeholder, 0.01 for ``'T'``, otherwise
        ``float(raw_str)``.
    """
    placeholders = ('...', 'X', '/', '&', ' ', '')
    if raw_str in placeholders:
        return val_of_empty
    if raw_str == 'T':
        return 0.01
    return float(raw_str)
175 | 
176 | 
def get_ext_time_fea(raw_str, zero_base):
    """Convert the clock time in ``raw_str`` to hours elapsed since ``zero_base``.

    ``raw_str`` is expected to look like ``'<date> HH:MM'``: the part after
    the first space is split on ``':'`` into hours and minutes.

    Args:
        raw_str: Timestamp text.
        zero_base: Hour of day that maps to 0; earlier hours wrap around by
            24.

    Returns:
        Fractional hours since ``zero_base``, or -1 when ``raw_str`` cannot
        be parsed.
    """
    try:
        hh, mm = map(float, raw_str.split(' ')[1].split(':'))
    except (AttributeError, IndexError, TypeError, ValueError):
        # Missing or malformed timestamp (placeholder text, no time part,
        # non-string input): report the "unknown" marker.
        return -1
    if hh < zero_base:
        hh += 24  # before the base hour: count it as part of the next cycle
    return hh - zero_base + mm / 60
187 | 
188 | 
def get_ext_one_site_monthly_report_fea(site, this_arr, ext_header_to_row_idx, angle, capacity):
    """Build the feature list for one weather-station report row.

    Args:
        site: Station ID; must be a key of ``site_to_idx``.
        this_arr: One report row (sequence of raw cell strings).
        ext_header_to_row_idx: Column name -> position in ``this_arr``.
        angle: Angle in degrees (it is passed through ``math.radians``).
        capacity: Unused by this function.

    Returns:
        A list made of the station one-hot, the 25 report columns converted
        by ``get_ext_float`` and 9 engineered sunshine/radiation features.
    """
    def column(name, **kwargs):
        # Numeric value of a named report column.
        return get_ext_float(this_arr[ext_header_to_row_idx[name]], **kwargs)

    features = [0] * len(site_to_idx)
    features[site_to_idx[site]] = 1

    for name in (
        'StnPres', 'SeaPres', 'StnPresMax', 'StnPresMin',
        'Temperature', 'T Max', 'T Min', 'Td dew point',
        'RH', 'RHMin',
        'WS', 'WD', 'WSGust', 'WDGust',
        'Precp', 'PrecpHour', 'PrecpMax10', 'PrecpMax60',
        'SunShine', 'SunShineRate', 'GloblRad', 'VisbMean',
    ):
        features.append(column(name))
    # EvapA is the only column with its own placeholder value.
    features.append(column('EvapA', val_of_empty=-999))
    for name in ('UVI Max', 'Cloud Amount'):
        features.append(column(name))

    # Fea Eng: scale the sunshine/radiation columns by functions of the angle
    for name in ('SunShine', 'SunShineRate', 'GloblRad'):
        features.append(column(name) * math.sin(math.radians(angle)))
        features.append(column(name) * math.tanh(math.radians(angle)))
        features.append(column(name) * (1 - angle))
    return features
229 | 
230 | 
def merge_ext_list(fea_a, fea_b):
    """Merge two feature lists position by position.

    For each pair, if either value is negative the larger of the two is
    kept; otherwise the pair is averaged. The result is as long as the
    shorter input.
    """
    merged = []
    for left, right in zip(fea_a, fea_b):
        if left < 0 or right < 0:
            merged.append(max(left, right))
        else:
            merged.append((left + right) / 2)
    return merged
234 | 
235 | 
def get_ext_monthly_report_fea(lat_str, lon_str, date_str, ext_data_dict, ext_header_to_row_idx, angle, capacity):
    """Look up the monthly-report row for a location and date and featurize it.

    ``date_str`` is split on ``/`` into year, month and day. The site is the
    first entry that ``lat_lon_to_site`` lists for the ``"<lat>-<lon>"`` key.
    The row is read from ``ext_data_dict[site][year][month][day]`` with year
    and month as strings and the day as an int.
    """
    year, month, day_text = date_str.split('/')
    day = int(day_text)
    location_key = '{}-{}'.format(lat_str, lon_str)
    site = lat_lon_to_site[location_key][0]
    day_row = ext_data_dict[site][year][month][day]
    # NOTE: a two-site variant that combined two sites' features through
    # merge_ext_list was sketched here but is disabled; only the first
    # listed site is used.
    return get_ext_one_site_monthly_report_fea(site, day_row, ext_header_to_row_idx, angle, capacity)
250 | 
251 | 
def get_ext_one_hour_fea(this_dict, hour, ext_header_to_row_idx, angle, capacity):
    """Build the feature list for a single hour of a daily report.

    Returns 16 report columns of ``this_dict[hour]`` parsed with
    ``get_ext_float``, followed by SunShine and GloblRad each multiplied by
    three angle terms (sin of the angle in radians, tanh of the angle in
    radians, ``1 - angle``).

    ``capacity`` is accepted but not used by this function.
    """
    def parsed(name):
        return get_ext_float(this_dict[hour][ext_header_to_row_idx[name]])

    report_columns = (
        'StnPres', 'SeaPres',
        'Temperature', 'Td dew point', 'RH',
        'WS', 'WD', 'WSGust', 'WDGust',
        'Precp', 'PrecpHour',
        'SunShine', 'GloblRad',
        'Visb', 'UVI', 'Cloud Amount',
    )
    fea = [parsed(name) for name in report_columns]

    # Fea Eng: scale the sunshine/radiation columns by the angle terms.
    angle_rad = math.radians(angle)
    angle_terms = (math.sin(angle_rad), math.tanh(angle_rad), 1 - angle)
    for name in ('SunShine', 'GloblRad'):
        value = parsed(name)
        for term in angle_terms:
            fea.append(value * term)
    return fea
278 | 
279 | 
def get_ext_one_site_daily_report_fea(site, this_dict, ext_header_to_row_idx, angle, capacity):
    """Concatenate the hourly features for hours '11', '12' and '13'.

    Args:
        site: Accepted for signature compatibility; not used. An earlier
            version built a site one-hot vector here but never returned it,
            so that dead computation has been removed.
        this_dict: Mapping from hour string to that hour's report row.
        ext_header_to_row_idx: Mapping from column name to row index.
        angle: Passed through to ``get_ext_one_hour_fea``.
        capacity: Passed through to ``get_ext_one_hour_fea``.

    Returns:
        The three per-hour feature lists joined in hour order.
    """
    fea = []
    for hour in ('11', '12', '13'):
        fea.extend(get_ext_one_hour_fea(this_dict, hour, ext_header_to_row_idx, angle, capacity))
    return fea
287 | 
288 | 
def get_ext_daily_report_fea(lat_str, lon_str, date_str, ext_data_dict, ext_header_to_row_idx, angle, capacity):
    """Look up the daily report for a location and date and featurize it.

    ``date_str`` is split on ``/`` into year, month and day. The day is
    round-tripped through ``int`` so the lookup key carries no zero padding
    (``'05'`` becomes ``'5'``). The site is the first entry that
    ``lat_lon_to_site`` lists for the ``"<lat>-<lon>"`` key.
    """
    year, month, day_text = date_str.split('/')
    day_key = str(int(day_text))
    location_key = '{}-{}'.format(lat_str, lon_str)
    site = lat_lon_to_site[location_key][0]
    hourly_rows = ext_data_dict[site][year][month][day_key]
    return get_ext_one_site_daily_report_fea(site, hourly_rows, ext_header_to_row_idx, angle, capacity)
295 | 
296 | 
def fea_ext(row, ext_data_dict_month, ext_header_to_row_idx_month, ext_data_dict_day, ext_header_to_row_idx_day):
    """Turn one raw data row into a ``(features, label)`` pair.

    Args:
        row: Sequence of raw string cells, indexed through the module-level
            ``header_to_row_idx`` mapping.
        ext_data_dict_month: Monthly-report data passed to
            ``get_ext_monthly_report_fea``.
        ext_header_to_row_idx_month: Column index mapping for the monthly
            report.
        ext_data_dict_day: Accepted for signature compatibility; not used.
        ext_header_to_row_idx_day: Accepted for signature compatibility;
            not used.

    Returns:
        A tuple ``(features, label)``. ``features`` is a flat list built from
        the helper outputs followed by the single-value and combined
        features below. ``label`` is ``int`` of the Generation cell, or
        ``None`` when that cell is the empty string.
    """
    def cell(name):
        return row[header_to_row_idx[name]]

    date_str = cell('Date')
    date_parts = date_str.split('/')

    angle = float(cell('Angle'))
    month = int(date_parts[1])
    date = int(date_parts[2])
    irradiance_m = float(cell('Irradiance_m')) / 1000
    capacity = float(cell('Capacity'))
    lat_str = cell('Lat')
    lon_str = cell('Lon')

    # Angle terms are reused by many features; compute each one once.
    angle_rad = math.radians(angle)
    sin_angle = math.sin(angle_rad)
    cos_angle = math.cos(angle_rad)
    tanh_angle = math.tanh(angle_rad)
    one_minus_abs_tanh = -abs(tanh_angle) + 1
    # Same left-to-right product the combined features start from.
    cap_irr = capacity * irradiance_m

    features = (
        get_possible_empty_fea(cell('Temp_m'))
        + get_possible_empty_fea(cell('Irradiance'))
        + get_possible_empty_fea(cell('Temp'))
        + get_module_fea(cell('Module'))
        + get_lat_lon_fea(lat_str, lon_str)
        + get_ext_monthly_report_fea(
            lat_str, lon_str, date_str,
            ext_data_dict_month, ext_header_to_row_idx_month,
            angle, capacity,
        )
        + [
            # Single
            month,
            # Month re-based so that March / June / September map to 1.
            month - 2 if month >= 3 else month + 10,
            month - 5 if month >= 6 else month + 7,
            month - 8 if month >= 9 else month + 4,
            1 if month in (3, 4, 5) else 0,
            1 if month in (6, 7, 8) else 0,
            1 if month in (9, 10, 11) else 0,
            1 if month in (12, 1, 2) else 0,
            date,
            capacity,
            float(lat_str),
            float(lon_str),
            angle,
            sin_angle,
            cos_angle,
            tanh_angle,
            one_minus_abs_tanh,
            1 if angle >= 0 else -1,
            irradiance_m,
            # Comb
            cap_irr / 1000,
            irradiance_m * (1 - angle),
            irradiance_m * sin_angle,
            irradiance_m * cos_angle,
            irradiance_m * tanh_angle,
            irradiance_m * one_minus_abs_tanh,
            cap_irr * (1 - angle),
            cap_irr * sin_angle,
            cap_irr * cos_angle,
            cap_irr * tanh_angle,
            cap_irr * one_minus_abs_tanh,
            get_div_fea(cell('Temp_m'), cell('Temp')),
            get_div_fea(cell('Irradiance'), cell('Irradiance_m')),
        ]
    )

    generation = cell('Generation')
    label = int(generation) if generation != '' else None
    return features, label
349 | 
350 | 
# Display names for the feature columns, listed in the same order in which
# fea_ext concatenates its features. get_fea_importance.py pairs entry i of
# this list with column i of each model's feature importances, so the order
# here must stay in step with fea_ext.
# NOTE(review): 'summar' and 'angel' below are misspelled labels; they are
# left untouched because these strings are emitted at runtime.
FEA_NAMES = [
    # get_possible_empty_fea outputs for Temp_m, Irradiance and Temp
    # (presumably an emptiness flag followed by the value — confirm against
    # the helper).
    'is Temp_m empty',
    'Temp_m',
    'is Irradiance empty',
    'Irradiance',
    'is Temp empty',
    'Temp',
    # get_module_fea outputs (presumably a module one-hot followed by module
    # specifications — confirm against the helper).
    'is MM60-6RT-300',
    'is SEC-6M-60A-295',
    'is AUO PM060MW3 320W',
    'is AUO PM060MW3 325W',
    'Pmax',
    'Vmp',
    'Imp',
    'Voc',
    'Isc',
    '%',
    # get_lat_lon_fea outputs (presumably a one-hot over known lat-lon
    # pairs — confirm against the helper).
    'is 24.04-120.52',
    'is 24.06-120.47',
    'is 24.07-120.47',
    'is 24.07-120.48',
    'is 24.08-120.5',
    'is 24.08-120.52',
    'is 24.09-120.52',
    'is 24.107-120.44',
    'is 24.98-121.03',
    'is 25.03-121.08',
    'is 25.11-121.26',
    # Monthly report: site one-hot (order presumably follows site_to_idx).
    'ext-C0G780',
    'ext-C0G770',
    'ext-C0G640',
    'ext-467050',
    'ext-C0C620',
    # Monthly report: parsed columns, in the order
    # get_ext_one_site_monthly_report_fea reads them.
    'ext-StnPres',
    'ext-SeaPres',
    'ext-StnPresMax',
    'ext-StnPresMin',
    'ext-Temperature',
    'ext-T Max',
    'ext-T Min',
    'ext-Td dew point',
    'ext-RH',
    'ext-RHMin',
    'ext-WS',
    'ext-WD',
    'ext-WSGust',
    'ext-WDGust',
    'ext-Precp',
    'ext-PrecpHour',
    'ext-PrecpMax10',
    'ext-PrecpMax60',
    'ext-SunShine',
    'ext-SunShineRate',
    'ext-GloblRad',
    'ext-VisbMean',
    'ext-EvapA',
    'ext-UVI Max',
    'ext-Cloud Amount',
    # Monthly report: sunshine/radiation columns scaled by angle terms.
    'ext-SunShine * sin angle',
    'ext-SunShine * tanh angle',
    'ext-SunShine * (1 - angle)',
    'ext-SunShineRate * sin angle',
    'ext-SunShineRate * tanh angle',
    'ext-SunShineRate * (1 - angle)',
    'ext-GloblRad * sin angle',
    'ext-GloblRad * tanh angle',
    'ext-GloblRad * (1 - angle)',
    # Single-value features computed directly in fea_ext.
    'month',
    'month base 3',
    'month base 6',
    'month base 9',
    'is spring',
    'is summar',
    'is fall',
    'is winter',
    'date',
    'capacity',
    'lat',
    'lon',
    'angle',
    'sin angle',
    'cos angle',
    'tanh angel',
    '-abs(tanh angel)+1',
    'sign angle',
    'irradiance_m',
    # Combined features computed directly in fea_ext.
    'capacity * irradiance_m / 1000',
    'irradiance_m * (1- angle)',
    'irradiance_m * sin angle',
    'irradiance_m * cos angle',
    'irradiance_m * tanh angle',
    'irradiance_m * (-abs(tanh angel)+1)',
    'capacity * irradiance_m * (1- angle)',
    'capacity * irradiance_m * sin angle',
    'capacity * irradiance_m * cos angle',
    'capacity * irradiance_m * tanh angle',
    'capacity * irradiance_m * (-abs(tanh angel)+1)',
    # get_div_fea outputs.
    'Temp_m / temp',
    'Irradiance / irradiance_m',
]
451 | 


--------------------------------------------------------------------------------