├── controllers
│   ├── __init__.py
│   ├── buffer.py
│   ├── models.py
│   ├── baseline_controller.py
│   ├── ppo_controller.py
│   └── td3_controller.py
├── README.md
├── data.py
├── setting.py
├── cigre_mv_microgrid.py
├── main.py
└── utils.py
/controllers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Microgrid Energy Management using Deep Reinforcement Learning
2 | This repository contains the experiment code for my master thesis "Time Series Observation and Action Handling for Battery Management in Applying Deep Reinforcement Learning for
3 | Microgrid Energy Management".
4 |
5 | # Abstract
6 | Time Series Observation and Action Handling for Battery Management in Applying Deep Reinforcement Learning for Microgrid Energy Management /
7 | The transformation from traditional grids to microgrids introduces challenges due to multiple distributed energy resources and the intermittency of renewable energy sources and loads. Much effort has been committed to the design of microgrid energy management systems to attain optimal operation, and reinforcement learning is considered one of the most promising methods because of its competitive properties. Reinforcement learning algorithms generally do not assume precise models and can learn the underlying dynamics of the system under uncertainty by interacting with the environment. However, directly applying reinforcement learning to microgrid energy management is not an easy task. In this paper, we study two design aspects in reinforcement learning algorithms for microgrid energy management, which are related to time series observation and battery management in microgrids. In order to process time series data and handle varying battery charging/discharging bounds in our deep reinforcement learning algorithm, recurrent neural networks and valid action space mapping are used in our implementation. Experimental results confirm that the two design aspects are crucial for applying reinforcement learning in microgrid energy management.
8 |
9 | # Code Explanation
10 | | File | Description |
11 | |-----------------------|---------------------|
12 | | cigre_mv_microgrid.py | Creates the CIGRE MV test grid |
13 | | data.py | Converts PJM data into profiles for our environment |
14 | | main.py | Entry point of our experiments |
15 | | setting.py | Environment settings |
16 | | utils.py | Shared helper functions |
17 |
18 | | Directory | Description |
19 | |---|---|
20 | | controllers | Controllers for microgrid energy management using various algorithms |
21 | | data | Processed data for our environment |
22 | | history | Training history |
23 | | model_weights | Trained model weights |
24 | | pf_res | Results of power flow analysis |
25 | | plot | Plots of experimental results |
26 | | rms | Stored running mean/std values for input normalization |
27 |
28 | ## main.py
29 | - train_ppo(): train a PPO agent.
30 | - train_td3(): train a TD3 agent.
31 | - test(): test with a trained agent.
32 | - baseline(): test the baseline controller.
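
## Valid action space mapping (sketch)
The "valid action space mapping" named in the abstract combines two pieces that appear later in this repository: SOC-dependent charging/discharging bounds (computed in the controllers from `SOC_MAX`, `SOC_MIN` and the battery energy capacity) and the affine rescaling of the network output in `utils.scale_to_mg`. Below is a minimal, illustrative sketch for battery 5 only, with constants copied from `setting.py`; `map_to_valid_action` is a hypothetical helper written for this explanation, not a function defined in the code.

```python
import numpy as np

# Constants from setting.py (battery 5)
E_B5_MAX, P_B5_MAX, P_B5_MIN = 3.0, 0.6, -0.6
SOC_MAX, SOC_MIN = 0.9, 0.1
HOUR_PER_TIME_STEP = 1

def map_to_valid_action(nn_action, soc):
    """Map a raw actor output in [-1, 1] to a feasible charge/discharge power (MW)."""
    # Bounds shrink as the battery approaches SOC_MAX (cannot keep charging)
    # or SOC_MIN (cannot keep discharging).
    p_max = min((SOC_MAX - soc) * E_B5_MAX / HOUR_PER_TIME_STEP, P_B5_MAX)
    p_min = max((SOC_MIN - soc) * E_B5_MAX / HOUR_PER_TIME_STEP, P_B5_MIN)
    # Affine rescaling of the clipped output onto [p_min, p_max],
    # in the same spirit as utils.scale_to_mg.
    nn_action = np.clip(nn_action, -1.0, 1.0)
    return (nn_action + 1.0) * (p_max - p_min) / 2.0 + p_min

# A nearly full battery can barely charge but can still discharge at rated power:
print(map_to_valid_action(1.0, soc=0.85))   # 0.15 (limited by SOC_MAX)
print(map_to_valid_action(-1.0, soc=0.85))  # -0.6 (rated discharge)
```

In this sketch, the bounds themselves depend on the current state of charge, so the actor's output always lands on a feasible setpoint instead of being clipped after the power flow step.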
33 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import utils 3 | from setting import * 4 | 5 | def create_unit_profile(pv_df, wt_df, load_df, price_df): 6 | # reshape tables 7 | pv_df = pd.pivot_table(pv_df, values='solar_generation_mw', index=['datetime_beginning_ept'], columns=['area'], sort=False) 8 | wt_df = pd.pivot_table(wt_df, values='wind_generation_mw', index=['datetime_beginning_ept'], columns=['area'], sort=False) 9 | load_df = pd.pivot_table(load_df, values='mw', index=['datetime_beginning_ept'], columns=['load_area'], sort=False) 10 | price_df = pd.pivot_table(price_df, values='hrly_da_demand_bid', index=['datetime_beginning_ept'], columns=['area'], sort=False) 11 | 12 | # scale values 13 | for df in [pv_df, wt_df, load_df, price_df]: 14 | df /= df.max() 15 | print(f'Unit profile: {pv_df.max()}, {wt_df.max()}, {load_df.max()}, {price_df.max()}') 16 | 17 | return pv_df, wt_df, load_df, price_df 18 | 19 | def create_save_profile(pv_df, wt_df, load_df, price_df): 20 | pv_df, wt_df, load_df, price_df = create_unit_profile(pv_df, wt_df, load_df, price_df) 21 | 22 | pv_profile = pd.DataFrame({ 23 | 'pv3': pv_df['MIDATL'] * P_PV3_MAX, 24 | 'pv4': pv_df['MIDATL'] * P_PV4_MAX, 25 | 'pv5': pv_df['MIDATL'] * P_PV5_MAX, 26 | 'pv6': pv_df['RFC'] * P_PV6_MAX, 27 | 'pv8': pv_df['RFC'] * P_PV8_MAX, 28 | 'pv9': pv_df['RFC'] * P_PV9_MAX, 29 | 'pv10': pv_df['RTO'] * P_PV10_MAX, 30 | 'pv11': pv_df['RTO'] * P_PV11_MAX 31 | }) 32 | wt_profile = pd.DataFrame({ 33 | 'wt7': wt_df['MIDATL'] * P_WT7_MAX 34 | }) 35 | load_profile = pd.DataFrame({ 36 | 'load_r1': load_df['AECO'] * P_LOADR1_MAX, 37 | 'load_r3': load_df['BC'] * P_LOADR3_MAX, 38 | 'load_r4': load_df['DPLCO'] * P_LOADR4_MAX, 39 | 'load_r5': load_df['EASTON'] * P_LOADR5_MAX, 40 | 'load_r6': load_df['JC'] * P_LOADR6_MAX, 41 | 'load_r8': load_df['ME'] * P_LOADR8_MAX, 42 | 'load_r10': load_df['PE'] * P_LOADR10_MAX, 43 | 'load_r11': load_df['PEPCO'] * P_LOADR11_MAX, 44 | }) 45 | price_profile = pd.DataFrame({ 46 | 'price': price_df['PJM_RTO'] * C_PRICE_MAX 47 | }) 48 | 49 | # create csv files 50 | pv_profile.to_csv('./data/profile/pv_profile.csv') 51 | wt_profile.to_csv('./data/profile/wt_profile.csv') 52 | load_profile.to_csv('./data/profile/load_profile.csv') 53 | price_profile.to_csv('./data/profile/price_profile.csv') 54 | 55 | if __name__ == '__main__': 56 | pv_df = pd.read_csv('./data/solar_gen.csv') 57 | wt_df = pd.read_csv('./data/wind_gen.csv') 58 | load_df = pd.read_csv('./data/hrl_load_metered.csv') 59 | price_df = pd.read_csv('./data/hrl_dmd_bids.csv') 60 | create_save_profile(pv_df, wt_df, load_df, price_df) 61 | 62 | pv_profile = pd.read_csv('./data/profile/pv_profile.csv') 63 | wt_profile = pd.read_csv('./data/profile/wt_profile.csv') 64 | load_profile = pd.read_csv('./data/profile/load_profile.csv') 65 | price_profile = pd.read_csv('./data/profile/price_profile.csv') 66 | # excess = pv_profile.sum(axis=1) + wt_profile.sum(axis=1) - load_profile.sum(axis=1) 67 | # surplus = excess[excess > 0] 68 | # print(surplus) 69 | # print(surplus.shape[0] / pv_profile.shape[0]) 70 | utils.view_profile(pv_profile, wt_profile, load_profile, price_profile) -------------------------------------------------------------------------------- /setting.py: -------------------------------------------------------------------------------- 1 | ''' 2 | - hyperparameters 3 | - environment 4 | - power 
ratings 5 | - cost parameters 6 | ''' 7 | 8 | import numpy as np 9 | 10 | # --- Hyperparameters --- 11 | BATCH_SIZE = 100 12 | GAMMA = 0.99 13 | LR_ACTOR = 5e-4 14 | LR_CRITIC = 5e-4 15 | NN_BOUND = 1. 16 | SEQ_LENGTH= 1 17 | 18 | # TD3 only 19 | ACTION_NOISE_SCALE = 0.3 20 | BUFFER_SIZE = 500000 21 | NOISE_TYPE = 'param' # ['action', 'param'] 22 | PARAM_NOISE_ADAPT_RATE = 1.01 23 | PARAM_NOISE_BOUND = 0.1 24 | PARAM_NOISE_SCALE = 0.1 25 | UPDATE_FREQ = 50 26 | UPDATE_TIMES = 4 27 | WARMUP = 1000 28 | 29 | # PPO only 30 | POLICY_CLIP = 0.2 31 | TARGET_KL = 0.01 32 | PPO_BATCH_SIZE = 60 33 | PPO_TRAIN_FREQ = 720 34 | PPO_TRAIN_ITERS = 80 35 | 36 | # others 37 | PREDICT_LENGTH = 24 38 | DENSE_DIM_A = 16 39 | DENSE_DIM_FNN = 16 40 | DENSE_DIM_SEQ = 32 41 | 42 | # Environment 43 | HOUR_PER_TIME_STEP = 1 44 | 45 | # --- Power Ratings --- 46 | # PV 47 | P_PV3_MAX = 0.3 48 | P_PV4_MAX = 0.3 49 | P_PV5_MAX = 0.4 50 | P_PV6_MAX = 0.4 51 | P_PV8_MAX = 0.4 52 | P_PV9_MAX = 0.5 53 | P_PV10_MAX = 0.5 54 | P_PV11_MAX = 0.3 55 | P_PV_MAX_LIST = [P_PV3_MAX, P_PV4_MAX, P_PV5_MAX, P_PV6_MAX, P_PV8_MAX, P_PV9_MAX, P_PV10_MAX, P_PV11_MAX] 56 | 57 | # WT 58 | P_WT7_MAX = 2.5 59 | P_WT_MAX_LIST = [P_WT7_MAX] 60 | 61 | # MGT 62 | # P_MGT5_MAX = 0.033 63 | # P_MGT9_MAX = 0.212 64 | # P_MGT10_MAX = 0.033 65 | # P_MGT5_MIN = 0. 66 | # P_MGT9_MIN = 0. 67 | # P_MGT10_MIN = 0. 68 | # P_MGT_MAX_LIST = [P_MGT5_MAX, P_MGT9_MAX, P_MGT10_MAX] 69 | 70 | # Battery 71 | E_B5_MAX = 3. 72 | P_B5_MAX = 0.6 73 | P_B5_MIN = -0.6 74 | 75 | E_B10_MAX = 1. 76 | P_B10_MAX = 0.2 77 | P_B10_MIN = -0.2 78 | 79 | SOC_MAX = 0.9 80 | SOC_MIN = 0.1 81 | SOC_TOLERANCE = 0.01 82 | 83 | # Load 84 | P_LOADR1_MAX = 0.85 85 | P_LOADR3_MAX = 0.285 86 | P_LOADR4_MAX = 0.245 87 | P_LOADR5_MAX = 0.65 88 | P_LOADR6_MAX = 0.565 89 | P_LOADR8_MAX = 0.605 90 | P_LOADR10_MAX = 0.49 91 | P_LOADR11_MAX = 0.34 92 | P_LOAD_MAX_LIST = [P_LOADR1_MAX, P_LOADR3_MAX, P_LOADR4_MAX, P_LOADR5_MAX, P_LOADR6_MAX, P_LOADR8_MAX, P_LOADR10_MAX, P_LOADR11_MAX] 93 | P_LOAD_MAX = P_LOADR1_MAX + P_LOADR3_MAX + P_LOADR4_MAX + P_LOADR5_MAX + P_LOADR6_MAX + P_LOADR8_MAX + P_LOADR10_MAX + P_LOADR11_MAX 94 | 95 | # PCC 96 | P_EXCESS_MAX = sum([*P_PV_MAX_LIST, *P_WT_MAX_LIST]) 97 | 98 | # State 99 | # N_INTERMITTENT_STATES = len([P_EXCESS_MAX,'price']) 100 | N_INTERMITTENT_STATES = len([*P_PV_MAX_LIST, *P_WT_MAX_LIST, *P_LOAD_MAX_LIST,'price']) 101 | # N_INTERMITTENT_STATES = len([*P_PV_MAX_LIST, *P_WT_MAX_LIST, *P_LOAD_MAX_LIST, P_EXCESS_MAX,'price']) 102 | N_CONTROLLABLE_STATES = len([P_B5_MAX, P_B10_MAX]) 103 | STATE_SEQ_SHAPE = (SEQ_LENGTH, N_INTERMITTENT_STATES) 104 | STATE_FNN_SHAPE = (N_CONTROLLABLE_STATES,) 105 | 106 | # Action 107 | ACTION_IDX = {'p_b5': 0, 'p_b10': 1} 108 | MAX_ACTION = np.array([P_B5_MAX, P_B10_MAX]) 109 | MIN_ACTION = np.array([P_B5_MIN, P_B10_MIN]) 110 | N_ACTION = len(MAX_ACTION) 111 | 112 | # --- Cost Parameters --- 113 | C_PRICE_MAX = 3. 114 | # C_MGT5 = [100, 1.5] 115 | # C_MGT9 = [15.8, 2.] 
116 | # C_MGT10 = [100, 1.5] 117 | C_BAT5_DoD = 0.43 118 | C_BAT10_DoD = 0.16 119 | C_SOC_LIMIT = 100 120 | MAX_COST = C_PRICE_MAX * (P_B5_MAX + P_B10_MAX + P_LOAD_MAX) + \ 121 | (C_BAT5_DoD + C_BAT10_DoD) * pow(SOC_MAX-SOC_MIN, 2) + \ 122 | C_SOC_LIMIT 123 | 124 | REWARD_INVALID_ACTION = -5e-3 125 | 126 | if __name__ == '__main__': 127 | print(f'Number of actions: {N_ACTION}') 128 | print(f'Number of intermittent states: {N_INTERMITTENT_STATES}') 129 | print(f'Number of controllable states: {N_CONTROLLABLE_STATES}') 130 | print(f'Load max: {P_LOAD_MAX}') -------------------------------------------------------------------------------- /cigre_mv_microgrid.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Modified CIGRE Task Force C6.04.02 network 3 | 4 | elements: 5 | - 8 PVs 6 | - 1 WT 7 | - 2 Batteries 8 | - 8 Loads 9 | ''' 10 | 11 | import pandapower as pp 12 | from pandapower.control import ConstControl 13 | from setting import * 14 | 15 | def create_cigre_mv_microgrid(pv_ds, wt_ds, load_ds): 16 | net = pp.create_empty_network(name='CIGRE MV Microgrid') 17 | 18 | # --- Buses --- 19 | bus0 = pp.create_bus(net, vn_kv=110, name='Buse 0', type='b', zone='CIGRE_MV') 20 | buses = pp.create_buses(net, 11, vn_kv=20, name=[f'Bus {i}' for i in range(1, 12)], type='b', zone='CIGRE_MV') 21 | 22 | # --- Lines --- 23 | line_data = {'c_nf_per_km': 151.1749, 'r_ohm_per_km': 0.501, 24 | 'x_ohm_per_km': 0.716, 'max_i_ka': 0.145, 25 | 'type': 'cs'} 26 | pp.create_std_type(net, line_data, name='CABLE_CIGRE_MV', element='line') 27 | 28 | pp.create_line(net, buses[0], buses[1], length_km=2.82, 29 | std_type='CABLE_CIGRE_MV', name='Line 1-2') 30 | pp.create_line(net, buses[1], buses[2], length_km=4.42, 31 | std_type='CABLE_CIGRE_MV', name='Line 2-3') 32 | pp.create_line(net, buses[2], buses[3], length_km=0.61, 33 | std_type='CABLE_CIGRE_MV', name='Line 3-4') 34 | pp.create_line(net, buses[3], buses[4], length_km=0.56, 35 | std_type='CABLE_CIGRE_MV', name='Line 4-5') 36 | pp.create_line(net, buses[4], buses[5], length_km=1.54, 37 | std_type='CABLE_CIGRE_MV', name='Line 5-6') 38 | pp.create_line(net, buses[6], buses[7], length_km=1.67, 39 | std_type='CABLE_CIGRE_MV', name='Line 7-8') 40 | pp.create_line(net, buses[7], buses[8], length_km=0.32, 41 | std_type='CABLE_CIGRE_MV', name='Line 8-9') 42 | pp.create_line(net, buses[8], buses[9], length_km=0.77, 43 | std_type='CABLE_CIGRE_MV', name='Line 9-10') 44 | pp.create_line(net, buses[9], buses[10], length_km=0.33, 45 | std_type='CABLE_CIGRE_MV', name='Line 10-11') 46 | pp.create_line(net, buses[2], buses[7], length_km=1.3, 47 | std_type='CABLE_CIGRE_MV', name='Line 3-8') 48 | 49 | # --- External Grid --- 50 | pp.create_ext_grid(net, bus0, vm_pu=1.03, va_degree=0., s_sc_max_mva=5000, s_sc_min_mva=5000, rx_max=0.1, rx_min=0.1) 51 | 52 | # --- Trafos --- 53 | trafo0 = pp.create_transformer_from_parameters(net, bus0, buses[0], sn_mva=25, 54 | vn_hv_kv=110, vn_lv_kv=20, vkr_percent=0.16, 55 | vk_percent=12.00107, pfe_kw=0, i0_percent=0, 56 | shift_degree=30.0, name='Trafo 0-1') 57 | pp.create_switch(net, bus0, trafo0, et='t', closed=True, type='CB') 58 | 59 | # --- RESs --- 60 | # PV 61 | pv3 = pp.create_sgen(net, buses[2], 0.0, q_mvar=0, name='PV 3', type='PV') 62 | pv4 = pp.create_sgen(net, buses[3], 0.0, q_mvar=0, name='PV 4', type='PV') 63 | pv5 = pp.create_sgen(net, buses[4], 0.0, q_mvar=0, name='PV 5', type='PV') 64 | pv6 = pp.create_sgen(net, buses[5], 0.0, q_mvar=0, name='PV 6', type='PV') 65 | pv8 = 
pp.create_sgen(net, buses[7], 0.0, q_mvar=0, name='PV 8', type='PV') 66 | pv9 = pp.create_sgen(net, buses[8], 0.0, q_mvar=0, name='PV 9', type='PV') 67 | pv10 = pp.create_sgen(net, buses[9], 0.0, q_mvar=0, name='PV 10', type='PV') 68 | pv11 = pp.create_sgen(net, buses[10], 0.0, q_mvar=0, name='PV 11', type='PV') 69 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv3, profile_name='pv3', data_source=pv_ds) 70 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv4, profile_name='pv4', data_source=pv_ds) 71 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv5, profile_name='pv5', data_source=pv_ds) 72 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv6, profile_name='pv6', data_source=pv_ds) 73 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv8, profile_name='pv8', data_source=pv_ds) 74 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv9, profile_name='pv9', data_source=pv_ds) 75 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv10, profile_name='pv10', data_source=pv_ds) 76 | ConstControl(net, element='sgen', variable='p_mw', element_index=pv11, profile_name='pv11', data_source=pv_ds) 77 | 78 | # WT 79 | wt7 = pp.create_sgen(net, buses[6], 0.0, q_mvar=0, name='WKA 7',type='WP') 80 | ConstControl(net, element='sgen', variable='p_mw', element_index=wt7, profile_name='wt7', data_source=wt_ds) 81 | 82 | # --- Generators --- 83 | # mgt5 = pp.create_sgen(net, bus=buses[4], p_mw=0.0, name='MGT 5') 84 | # mgt9 = pp.create_sgen(net, bus=buses[8], p_mw=0.0, name='MGT 9') 85 | # mgt10 = pp.create_sgen(net, bus=buses[9], p_mw=0.0, name='MGT 10') 86 | 87 | # --- Batteries --- 88 | bat5 = pp.create_storage(net, bus=buses[4], p_mw=0.0, max_e_mwh=E_B5_MAX, name='Battery 5', type='Battery', max_p_mw=P_B5_MAX, min_p_mw=P_B5_MIN) 89 | bat10 = pp.create_storage(net, bus=buses[9], p_mw=0.0, max_e_mwh=E_B10_MAX, name='Battery 10', type='Battery', max_p_mw=P_B10_MAX, min_p_mw=P_B10_MIN) 90 | 91 | # --- Loads --- 92 | load_r1 = pp.create_load_from_cosphi(net, buses[0], 0.0, 0.98, "underexcited", name='Load R1') 93 | load_r3 = pp.create_load_from_cosphi(net, buses[2], 0.0, 0.97, "underexcited", name='Load R3') 94 | load_r4 = pp.create_load_from_cosphi(net, buses[3], 0.0, 0.97, "underexcited", name='Load R4') 95 | load_r5 = pp.create_load_from_cosphi(net, buses[4], 0.0, 0.97, "underexcited", name='Load R5') 96 | load_r6 = pp.create_load_from_cosphi(net, buses[5], 0.0, 0.97, "underexcited", name='Load R6') 97 | load_r8 = pp.create_load_from_cosphi(net, buses[7], 0.0, 0.97, "underexcited", name='Load R8') 98 | load_r10 = pp.create_load_from_cosphi(net, buses[9], 0.0, 0.97, "underexcited", name='Load R10') 99 | load_r11 = pp.create_load_from_cosphi(net, buses[10], 0.0, 0.97, "underexcited", name='Load R11') 100 | ConstControl(net, element='load', variable='p_mw', element_index=load_r1, profile_name='load_r1', data_source=load_ds) 101 | ConstControl(net, element='load', variable='p_mw', element_index=load_r3, profile_name='load_r3', data_source=load_ds) 102 | ConstControl(net, element='load', variable='p_mw', element_index=load_r4, profile_name='load_r4', data_source=load_ds) 103 | ConstControl(net, element='load', variable='p_mw', element_index=load_r5, profile_name='load_r5', data_source=load_ds) 104 | ConstControl(net, element='load', variable='p_mw', element_index=load_r6, profile_name='load_r6', data_source=load_ds) 105 | ConstControl(net, element='load', variable='p_mw', 
element_index=load_r8, profile_name='load_r8', data_source=load_ds) 106 | ConstControl(net, element='load', variable='p_mw', element_index=load_r10, profile_name='load_r10', data_source=load_ds) 107 | ConstControl(net, element='load', variable='p_mw', element_index=load_r11, profile_name='load_r11', data_source=load_ds) 108 | 109 | ids = { 110 | 'trafo0': trafo0, 111 | 'pv3': pv3, 'pv4': pv4, 'pv5': pv5, 'pv6': pv6, 'pv8': pv8, 'pv9': pv9, 'pv10': pv10, 'pv11': pv11, 112 | 'wt7': wt7, 113 | # 'mgt5': mgt5, 'mgt9': mgt9, 'mgt10': mgt10, 114 | 'bat5': bat5, 'bat10': bat10, 115 | 'load_r1': load_r1, 'load_r3': load_r3, 'load_r4': load_r4, 'load_r5': load_r5, 'load_r6': load_r6, 'load_r8': load_r8, 'load_r10': load_r10, 'load_r11': load_r11 116 | } 117 | 118 | return net, ids -------------------------------------------------------------------------------- /controllers/buffer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | class 3 | - Buffer 4 | - PrioritizedReplayBuffer 5 | - ReplayBuffer 6 | ''' 7 | import numpy as np 8 | import scipy.signal 9 | from typing import Dict 10 | 11 | class Buffer: 12 | def __init__(self, buffer_size, state_seq_shape, state_fnn_shape, n_actions, gamma=0.99, lam=0.97): 13 | self.trajectory_start_idx = 0 14 | self.buffer_counter = 0 15 | self.buffer_size = buffer_size 16 | self.gamma = gamma 17 | self.lam = lam 18 | 19 | # transition 20 | self.state_seq_buffer = np.zeros((self.buffer_size, *state_seq_shape)) 21 | self.state_fnn_buffer = np.zeros((self.buffer_size, *state_fnn_shape)) 22 | self.action_buffer = np.zeros((self.buffer_size, n_actions)) 23 | self.reward_buffer = np.zeros((self.buffer_size, 1)) 24 | 25 | self.state_value_buffer = np.zeros((buffer_size, 1)) 26 | self.action_logprob_buffer = np.zeros((buffer_size, 1)) 27 | self.return_buffer = np.zeros((buffer_size, 1)) 28 | self.advantage_buffer = np.zeros((buffer_size, 1)) 29 | 30 | def clear(self): 31 | self.trajectory_start_idx = 0 32 | self.buffer_counter = 0 33 | self.state_seq_buffer = np.zeros_like(self.state_seq_buffer) 34 | self.state_fnn_buffer = np.zeros_like(self.state_fnn_buffer) 35 | self.action_buffer = np.zeros_like(self.action_buffer) 36 | self.reward_buffer = np.zeros_like(self.reward_buffer) 37 | self.state_value_buffer = np.zeros_like(self.state_value_buffer) 38 | self.action_logprob_buffer = np.zeros_like(self.action_logprob_buffer) 39 | self.return_buffer = np.zeros_like(self.return_buffer) 40 | self.advantage_buffer = np.zeros_like(self.advantage_buffer) 41 | 42 | def discounted_cumulative_sums(self, x_arr, discount): 43 | gae = scipy.signal.lfilter([1], [1, float(-discount)], x_arr[::-1], axis=0)[::-1] 44 | return gae.reshape((len(gae), 1)) 45 | 46 | def finish_trajectory(self, last_value): 47 | trajectory_end_idx = self.buffer_counter 48 | path_slice = slice(self.trajectory_start_idx, trajectory_end_idx) 49 | rewards = np.append(self.reward_buffer[path_slice], last_value) 50 | state_values = np.append(self.state_value_buffer[path_slice], last_value) 51 | 52 | deltas = rewards[:-1] + self.gamma * state_values[1:] - state_values[:-1] 53 | self.advantage_buffer[path_slice] = self.discounted_cumulative_sums(deltas, self.gamma*self.lam) 54 | self.return_buffer[path_slice] = self.advantage_buffer[path_slice] + self.state_value_buffer[path_slice] 55 | 56 | def sample(self, batch_size=32): 57 | batch_starts = np.arange(0, self.buffer_size, batch_size) 58 | batch_indices = np.arange(self.buffer_size) 59 | np.random.shuffle(batch_indices) 60 | 
batches = [batch_indices[batch_start: batch_start+batch_size] for batch_start in batch_starts] 61 | 62 | return self.state_seq_buffer, \ 63 | self.state_fnn_buffer, \ 64 | self.action_buffer, \ 65 | self.action_logprob_buffer, \ 66 | self.return_buffer, \ 67 | self.advantage_buffer, \ 68 | batches 69 | 70 | def store_transition(self, state_seq, state_fnn, action, reward, state_value, action_logprob): 71 | idx = self.buffer_counter 72 | self.state_seq_buffer[idx] = state_seq 73 | self.state_fnn_buffer[idx] = state_fnn 74 | self.action_buffer[idx] = action 75 | self.reward_buffer[idx] = reward 76 | self.state_value_buffer[idx] = state_value 77 | self.action_logprob_buffer[idx] = action_logprob 78 | 79 | self.buffer_counter += 1 80 | 81 | class PrioritizedReplayBuffer: 82 | def __init__(self, buffer_size, state_seq_shape, state_fnn_shape, n_actions, alpha=0.6, beta=0.4): 83 | # params 84 | self.buffer_size = buffer_size 85 | self.buffer_counter = 0 86 | self.alpha = alpha 87 | self.beta = beta 88 | 89 | # transition 90 | self.state_seq_buffer = np.zeros((self.buffer_size, *state_seq_shape)) 91 | self.state_fnn_buffer = np.zeros((self.buffer_size, *state_fnn_shape)) 92 | self.action_buffer = np.zeros((self.buffer_size, n_actions)) 93 | self.reward_buffer = np.zeros((self.buffer_size, 1)) 94 | self.next_state_seq_buffer = np.zeros((self.buffer_size, *state_seq_shape)) 95 | self.next_state_fnn_buffer = np.zeros((self.buffer_size, *state_fnn_shape)) 96 | 97 | # sum tree 98 | n_node = buffer_size * 2 - 1 99 | self.sum_tree = np.zeros(n_node) 100 | 101 | def get_leaf(self, cdf): 102 | idx = self._retrieve(0, cdf) 103 | return idx 104 | 105 | def get_max_priority(self): 106 | max_p = np.max(self.sum_tree[-self.buffer_size:]) 107 | if max_p == 0: 108 | max_p = 1. 109 | return max_p 110 | 111 | def sample(self, batch_size=32): 112 | idxs = np.zeros(batch_size, dtype=np.int32) 113 | trans_idxs = np.zeros(batch_size, dtype=np.int32) 114 | weights = np.zeros((batch_size, 1)) 115 | 116 | trans_idx_start = self.buffer_size - 1 117 | trans_idx_end = trans_idx_start + min(self.buffer_counter, self.buffer_size) 118 | min_prob = np.min(self.sum_tree[trans_idx_start: trans_idx_end]) / self.sum_tree[0] 119 | max_weight = np.power(self.buffer_size * min_prob, -self.beta) 120 | 121 | total_p = self.sum_tree[0] 122 | segment_size = total_p / batch_size 123 | for i in range(batch_size): 124 | segment_low = i * segment_size 125 | segment_high = (i + 1) * segment_size 126 | cdf = np.random.uniform(low=segment_low, high=segment_high) 127 | 128 | idx = self.get_leaf(cdf) 129 | idxs[i] = idx 130 | trans_idxs[i] = idx - self.buffer_size + 1 131 | 132 | prob = self.sum_tree[idx] / self.sum_tree[0] 133 | weights[i] = np.power(self.buffer_size * prob, -self.beta) / max_weight 134 | 135 | return self.state_seq_buffer[trans_idxs], \ 136 | self.state_fnn_buffer[trans_idxs], \ 137 | self.action_buffer[trans_idxs], \ 138 | self.reward_buffer[trans_idxs], \ 139 | self.next_state_seq_buffer[trans_idxs], \ 140 | self.next_state_fnn_buffer[trans_idxs], \ 141 | idxs, \ 142 | weights 143 | 144 | def schedule_beta(self, beta_inc): 145 | self.beta = min(self.beta + beta_inc, 1.) 
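    # Added note (not in the original source): the sum tree keeps priorities in
    # its leaves. With `buffer_size` leaves, leaf i sits at tree index
    # buffer_size - 1 + i, which is why store_transition() below computes
    # tree_idx = transition_idx + self.buffer_size - 1. Every internal node
    # stores the sum of its two children, so sum_tree[0] is the total priority
    # mass that sample() splits into equal segments for proportional sampling.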
146 | 147 | def store_transition(self, state_seq, state_fnn, action, reward, next_state_seq, next_state_fnn): 148 | # transition 149 | transition_idx = self.buffer_counter % self.buffer_size 150 | self.state_seq_buffer[transition_idx] = state_seq 151 | self.state_fnn_buffer[transition_idx] = state_fnn 152 | self.action_buffer[transition_idx] = action 153 | self.reward_buffer[transition_idx] = reward 154 | self.next_state_seq_buffer[transition_idx] = next_state_seq 155 | self.next_state_fnn_buffer[transition_idx] = next_state_fnn 156 | 157 | # priority 158 | tree_idx = transition_idx + self.buffer_size - 1 159 | priority = self.get_max_priority() 160 | self.update_tree(tree_idx, priority) 161 | 162 | self.buffer_counter += 1 163 | 164 | def update_tree(self, idx, priority): 165 | new_p = np.power(priority, self.alpha) 166 | change = new_p - self.sum_tree[idx] 167 | self.sum_tree[idx] = new_p 168 | self._propogate(idx, change) 169 | 170 | def _propogate(self, idx, change): 171 | parent_idx = (idx - 1) // 2 172 | self.sum_tree[parent_idx] += change 173 | if parent_idx != 0: 174 | self._propogate(parent_idx, change) 175 | 176 | def _retrieve(self, idx, cdf): 177 | l_child_idx = 2 * idx + 1 178 | r_child_idx = l_child_idx + 1 179 | 180 | if l_child_idx >= len(self.sum_tree): 181 | return idx 182 | elif cdf <= self.sum_tree[l_child_idx]: 183 | return self._retrieve(l_child_idx, cdf) 184 | else: 185 | return self._retrieve(r_child_idx, cdf - self.sum_tree[l_child_idx]) 186 | 187 | class ReplayBuffer: 188 | def __init__(self, buffer_size, state_seq_shape, state_fnn_shape, n_actions): 189 | self.buffer_size = buffer_size 190 | self.buffer_counter = 0 191 | 192 | self.state_seq_buffer = np.zeros((self.buffer_size, *state_seq_shape)) 193 | self.state_fnn_buffer = np.zeros((self.buffer_size, *state_fnn_shape)) 194 | self.action_buffer = np.zeros((self.buffer_size, n_actions)) 195 | self.reward_buffer = np.zeros((self.buffer_size, 1)) 196 | self.next_state_seq_buffer = np.zeros((self.buffer_size, *state_seq_shape)) 197 | self.next_state_fnn_buffer = np.zeros((self.buffer_size, *state_fnn_shape)) 198 | 199 | def store_transition(self, state_seq, state_fnn, action, reward, next_state_seq, next_state_fnn): 200 | index = self.buffer_counter % self.buffer_size 201 | 202 | self.state_seq_buffer[index] = state_seq 203 | self.state_fnn_buffer[index] = state_fnn 204 | self.action_buffer[index] = action 205 | self.reward_buffer[index] = reward 206 | self.next_state_seq_buffer[index] = next_state_seq 207 | self.next_state_fnn_buffer[index] = next_state_fnn 208 | 209 | self.buffer_counter += 1 210 | 211 | def sample(self, batch_size) -> Dict: 212 | sample_range = min(self.buffer_counter, self.buffer_size) 213 | batch_indices = np.random.choice(sample_range, size=batch_size) 214 | 215 | state_seq_batch = self.state_seq_buffer[batch_indices] 216 | state_fnn_batch = self.state_fnn_buffer[batch_indices] 217 | action_batch = self.action_buffer[batch_indices] 218 | reward_batch = self.reward_buffer[batch_indices] 219 | next_state_seq_batch = self.next_state_seq_buffer[batch_indices] 220 | next_state_fnn_batch = self.next_state_fnn_buffer[batch_indices] 221 | 222 | return state_seq_batch, state_fnn_batch, action_batch, reward_batch, next_state_seq_batch, next_state_fnn_batch -------------------------------------------------------------------------------- /controllers/models.py: -------------------------------------------------------------------------------- 1 | ''' 2 | class: 3 | - TransformerEncoder 4 | - ActorModel 5 | 
- CriticModel 6 | ''' 7 | 8 | import tensorflow as tf 9 | import tensorflow_addons as tfa 10 | from tensorflow_addons.layers import SpectralNormalization 11 | import tensorflow.keras as keras 12 | import tensorflow.keras.layers as layers 13 | 14 | from setting import * 15 | 16 | class TransformerEncoder(layers.Layer): 17 | def __init__(self, key_dim=64, num_heads=2, dense_dim=32, sequence_length=SEQ_LENGTH, **kwargs): 18 | super().__init__(**kwargs) 19 | self.key_dim = key_dim 20 | self.num_heads = num_heads 21 | self.dense_dim = dense_dim 22 | self.sequence_length = sequence_length 23 | 24 | self.dense_inputs = layers.Dense(key_dim) 25 | self.position_embeddings = layers.Embedding(sequence_length, key_dim) 26 | self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim) 27 | self.dense_proj = keras.Sequential([ 28 | layers.Dense(dense_dim, activation='relu'), 29 | layers.Dense(key_dim) 30 | ]) 31 | self.layernorm1 = layers.LayerNormalization() 32 | self.layernorm2 = layers.LayerNormalization() 33 | 34 | def call(self, inputs): 35 | positions = tf.range(start=0, limit=self.sequence_length, delta=1) 36 | attention_input = self.dense_inputs(inputs) + self.position_embeddings(positions) 37 | attention_output = self.attention(attention_input, attention_input) 38 | proj_input = self.layernorm1(attention_input + attention_output) 39 | proj_output = self.dense_proj(proj_input) 40 | outputs = self.layernorm2(proj_input + proj_output) 41 | return outputs 42 | 43 | def get_config(self): 44 | config = super().get_config() 45 | config.update({ 46 | 'key_dim': self.key_dim, 47 | 'num_heads': self.num_heads, 48 | 'dense_dim': self.dense_dim, 49 | 'sequence_length': self.sequence_length 50 | }) 51 | return config 52 | 53 | class SequenceModel(keras.Model): 54 | def __init__(self, sequence_model_type='rnn', activation='relu', **kwargs): 55 | super().__init__() 56 | self.dense_proj = get_dense_proj_seq(activation=activation) 57 | if sequence_model_type == 'conv1d': 58 | self.seq = get_conv1d_model() 59 | elif sequence_model_type == 'rnn': 60 | self.seq = get_rnn_model() 61 | elif sequence_model_type == 'transformer': 62 | self.seq = get_transformer_model() 63 | else: 64 | self.seq = layers.Flatten() 65 | 66 | def call(self, inputs): 67 | state_seq = self.dense_proj(inputs) 68 | state_seq = self.seq(state_seq) 69 | # state_seq = self.seq(inputs) 70 | 71 | return state_seq 72 | 73 | class ActorMuModel(keras.Model): 74 | def __init__(self, n_action, **kwargs): 75 | super().__init__() 76 | # self.dense_proj = get_dense_proj_fnn() 77 | self.concat = layers.Concatenate() 78 | self.fc = keras.Sequential([ 79 | layers.LayerNormalization(), 80 | layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)), 81 | layers.LayerNormalization(), 82 | layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)), 83 | layers.LayerNormalization(), 84 | ]) 85 | 86 | self.action = layers.Dense(n_action, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)) 87 | 88 | def call(self, state_seq, state_fnn): 89 | # state_fnn = self.dense_proj(state_fnn) 90 | state = self.concat([state_seq, state_fnn]) 91 | state = self.fc(state) 92 | action = self.action(state) 93 | 94 | return action 95 | 96 | class ActorPiModel(keras.Model): 97 | def __init__(self, n_action, logstd_init=0., **kwargs): 98 | super().__init__() 99 | # self.dense_proj = 
get_dense_proj_fnn(activation='tanh') 100 | self.concat = layers.Concatenate() 101 | self.fc = keras.Sequential([ 102 | layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)), 103 | layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)), 104 | ]) 105 | self.action_mean = layers.Dense(n_action, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)) 106 | 107 | self.action_logstd = tf.Variable(logstd_init * tf.ones(n_action), trainable=True) 108 | 109 | def call(self, state_seq, state_fnn): 110 | # state_fnn = self.dense_proj(state_fnn) 111 | state = self.concat([state_seq, state_fnn]) 112 | state = self.fc(state) 113 | action_mean = self.action_mean(state) 114 | 115 | action_std = tf.math.exp(self.action_logstd) 116 | return action_mean, action_std 117 | 118 | class CriticQModel(keras.Model): 119 | def __init__(self, **kwargs): 120 | super().__init__() 121 | # self.dense_proj_fnn = get_dense_proj_fnn() 122 | # self.dense_proj_a = get_dense_proj_a() 123 | self.concat = layers.Concatenate() 124 | self.dense = keras.Sequential([ 125 | layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001)), 126 | SpectralNormalization(layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001))), 127 | ]) 128 | self.q = layers.Dense(1) 129 | 130 | def call(self, state_seq, state_fnn, action): 131 | # state_fnn = self.dense_proj_fnn(state_fnn) 132 | # action = self.dense_proj_a(action) 133 | state_action = self.concat([state_seq, state_fnn, action]) 134 | state_action = self.dense(state_action) 135 | q_value = self.q(state_action) 136 | 137 | return q_value 138 | 139 | class CriticVModel(keras.Model): 140 | def __init__(self, name='critic', **kwargs): 141 | super().__init__() 142 | # self.dense_proj_fnn = get_dense_proj_fnn(activation='tanh') 143 | self.concat = layers.Concatenate() 144 | self.dense = keras.Sequential([ 145 | SpectralNormalization(layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001))), 146 | SpectralNormalization(layers.Dense(64, activation='tanh', kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001))), 147 | ]) 148 | self.v = layers.Dense(1) 149 | 150 | def call(self, state_seq, state_fnn): 151 | # state_fnn = self.dense_proj_fnn(state_fnn) 152 | state = self.concat([state_seq, state_fnn]) 153 | state = self.dense(state) 154 | state_value = self.v(state) 155 | 156 | return state_value 157 | 158 | def get_dense_proj_a(proj_dim=DENSE_DIM_A): 159 | inputs = keras.Input(shape=(N_ACTION,)) 160 | outputs = keras.Sequential([ 161 | layers.Dense(proj_dim, activation='relu') 162 | ])(inputs) 163 | 164 | model = keras.Model(inputs, outputs, name='dense_proj_a') 165 | return model 166 | 167 | def get_dense_proj_fnn(proj_dim=DENSE_DIM_FNN, activation='relu'): 168 | inputs = keras.Input(shape=STATE_FNN_SHAPE) 169 | outputs = layers.Dense(proj_dim, activation=activation, kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001))(inputs) 170 | 171 | model = keras.Model(inputs, outputs, name='dense_proj_fnn') 172 | return model 173 | 174 | def get_dense_proj_seq(proj_dim=DENSE_DIM_SEQ, activation='tanh'): 175 | inputs = keras.Input(shape=STATE_SEQ_SHAPE) 176 | outputs = layers.Dense(proj_dim, activation=activation, 
kernel_initializer=tf.random_uniform_initializer(minval=-0.001, maxval=0.001))(inputs) 177 | 178 | model = keras.Model(inputs, outputs, name='dense_proj_seq') 179 | return model 180 | 181 | def get_conv1d_model(): 182 | inputs = keras.Input(shape=(SEQ_LENGTH, DENSE_DIM_SEQ)) 183 | outputs = keras.Sequential([ 184 | SpectralNormalization(layers.Conv1D(8, 5, activation='tanh')), 185 | layers.MaxPooling1D(2), 186 | layers.GRU(16) 187 | # layers.Conv1D(8, 3, activation='tanh'), 188 | # layers.GlobalMaxPooling1D(), 189 | ])(inputs) 190 | 191 | model = keras.Model(inputs, outputs, name='sequence_model') 192 | return model 193 | 194 | def get_rnn_model(): 195 | # inputs = keras.Input(shape=(SEQ_LENGTH, DENSE_DIM_SEQ)) 196 | # lstm_in = layers.LayerNormalization()(inputs) 197 | # lstm_out = layers.LSTM(DENSE_DIM_SEQ, return_sequences=True)(lstm_in) 198 | # outputs = layers.Add()([inputs, lstm_out]) # residual connection 199 | # outputs = layers.LayerNormalization()(outputs) 200 | # outputs = layers.LSTM(16)(outputs) 201 | inputs = keras.Input(shape=(SEQ_LENGTH, DENSE_DIM_SEQ)) 202 | outputs = layers.GRU(32)(inputs) 203 | 204 | model = keras.Model(inputs, outputs, name='sequence_model') 205 | return model 206 | 207 | # TODO 208 | def get_transformer_model(): 209 | pass 210 | 211 | # deterministic actor 212 | def get_mu_actor(sequence_model, actor_model): 213 | input_seq = keras.Input(shape=STATE_SEQ_SHAPE) 214 | state_seq = sequence_model(input_seq) 215 | input_fnn = keras.Input(shape=STATE_FNN_SHAPE) 216 | state_fnn = get_dense_proj_fnn()(input_fnn) 217 | action = actor_model(state_seq, state_fnn) 218 | 219 | actor = keras.Model([input_seq, input_fnn], action) 220 | return actor 221 | 222 | # stochastic actor 223 | def get_pi_actor(sequence_model, actor_model): 224 | input_seq = keras.Input(shape=STATE_SEQ_SHAPE) 225 | state_seq = sequence_model(input_seq) 226 | state_fnn = keras.Input(shape=STATE_FNN_SHAPE) 227 | action_mean, action_std = actor_model(state_seq, state_fnn) 228 | 229 | actor = keras.Model([input_seq, state_fnn], [action_mean, action_std]) 230 | return actor 231 | 232 | # q value critic 233 | def get_q_critic(sequence_model, critic_model): 234 | input_seq = keras.Input(shape=STATE_SEQ_SHAPE) 235 | state_seq = sequence_model(input_seq) 236 | state_fnn = keras.Input(shape=STATE_FNN_SHAPE) 237 | action = keras.Input(shape=(N_ACTION,)) 238 | q_value = critic_model(state_seq, state_fnn, action) 239 | 240 | critic = keras.Model([input_seq, state_fnn, action], q_value) 241 | return critic 242 | 243 | # state value critic 244 | def get_v_critic(sequence_model, critic_model): 245 | input_seq = keras.Input(shape=STATE_SEQ_SHAPE) 246 | state_seq = sequence_model(input_seq) 247 | state_fnn = keras.Input(shape=STATE_FNN_SHAPE) 248 | state_value = critic_model(state_seq, state_fnn) 249 | 250 | critic = keras.Model([input_seq, state_fnn], state_value) 251 | return critic -------------------------------------------------------------------------------- /controllers/baseline_controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from pandapower.control.basic_controller import Controller 3 | 4 | import utils 5 | from setting import * 6 | 7 | class SimpleControl(Controller): 8 | def __init__(self, net, ids, **kwargs): 9 | super().__init__(net, **kwargs) 10 | self.rewards = [] 11 | self.costs = [] 12 | self.bat5_soc_prev = 0. 13 | self.bat10_soc_prev = 0. 14 | self.bat5_soc = 0. 15 | self.bat10_soc = 0. 
16 | self.soc_history = {'bat5_soc': [self.bat5_soc], 'bat10_soc': [self.bat10_soc]} 17 | self.last_time_step = None 18 | self.applied = False 19 | 20 | self.price_profile = kwargs['price_profile'] 21 | 22 | self.ids = ids 23 | self.trafo0_id = ids.get('trafo0') 24 | # self.mgt5_id = ids.get('mgt5') 25 | # self.mgt5_p_mw = net.sgen.at[self.mgt5_id, 'p_mw'] 26 | # self.mgt9_id = ids.get('mgt9') 27 | # self.mgt9_p_mw = net.sgen.at[self.mgt9_id, 'p_mw'] 28 | # self.mgt10_id = ids.get('mgt10') 29 | # self.mgt10_p_mw = net.sgen.at[self.mgt10_id, 'p_mw'] 30 | 31 | self.bat5_id = ids.get('bat5') 32 | self.bat5_p_mw = net.storage.at[self.bat5_id, 'p_mw'] 33 | self.bat5_max_e_mwh = net.storage.at[self.bat5_id, 'max_e_mwh'] 34 | self.bat10_id = ids.get('bat10') 35 | self.bat10_p_mw = net.storage.at[self.bat10_id, 'p_mw'] 36 | self.bat10_max_e_mwh = net.storage.at[self.bat10_id, 'max_e_mwh'] 37 | 38 | def is_converged(self, net) -> bool: 39 | return self.applied 40 | 41 | def calculate_reward(self, net, t): 42 | price = self.price_profile['price'][t - 1] 43 | cost, normalized_cost = utils.cal_cost( 44 | price=price, 45 | pcc_p_mw=-net.res_trafo.at[self.trafo0_id, 'p_lv_mw'], 46 | # mgt5_p_mw=self.mgt5_p_mw, 47 | # mgt9_p_mw=self.mgt9_p_mw, 48 | # mgt10_p_mw=self.mgt10_p_mw, 49 | bat5_soc_now=self.bat5_soc, 50 | bat5_soc_prev=self.bat5_soc_prev, 51 | bat10_soc_now=self.bat10_soc, 52 | bat10_soc_prev=self.bat10_soc_prev, 53 | ) 54 | reward = -normalized_cost 55 | 56 | return cost, reward 57 | 58 | def control_step(self, net): 59 | # net.sgen.at[self.mgt5_id, 'p_mw'] = self.mgt5_p_mw 60 | # net.sgen.at[self.mgt9_id, 'p_mw'] = self.mgt9_p_mw 61 | # net.sgen.at[self.mgt10_id, 'p_mw'] = self.mgt10_p_mw 62 | net.storage.at[self.bat5_id, 'p_mw'] = self.bat5_p_mw 63 | net.storage.at[self.bat10_id, 'p_mw'] = self.bat10_p_mw 64 | self.applied = True 65 | 66 | def finalize_step(self, net, t): 67 | super().finalize_step(net, t) 68 | 69 | # update soc 70 | self.bat5_soc_prev = self.bat5_soc 71 | self.bat10_soc_prev = self.bat10_soc 72 | self.bat5_soc += (self.bat5_p_mw * HOUR_PER_TIME_STEP) / self.bat5_max_e_mwh 73 | self.bat10_soc += (self.bat10_p_mw * HOUR_PER_TIME_STEP) / self.bat10_max_e_mwh 74 | 75 | # calculate reward 76 | t += 1 77 | cost, reward = self.calculate_reward(net, t) 78 | self.costs.append(cost) 79 | self.rewards.append(reward) 80 | 81 | def time_step(self, net, t): 82 | # select action 83 | self.bat5_p_mw, self.bat10_p_mw = self.policy(net) 84 | 85 | self.soc_history['bat5_soc'].append(self.bat5_soc) 86 | self.soc_history['bat10_soc'].append(self.bat10_soc) 87 | 88 | self.applied = False 89 | self.last_time_step = t 90 | 91 | def policy(self, net): 92 | p_pv = net.sgen.at[self.ids.get('pv3'), 'p_mw'] +\ 93 | net.sgen.at[self.ids.get('pv4'), 'p_mw'] +\ 94 | net.sgen.at[self.ids.get('pv5'), 'p_mw'] +\ 95 | net.sgen.at[self.ids.get('pv6'), 'p_mw'] +\ 96 | net.sgen.at[self.ids.get('pv8'), 'p_mw'] +\ 97 | net.sgen.at[self.ids.get('pv9'), 'p_mw'] +\ 98 | net.sgen.at[self.ids.get('pv10'), 'p_mw'] +\ 99 | net.sgen.at[self.ids.get('pv11'), 'p_mw'] 100 | p_wt = net.sgen.at[self.ids.get('wt7'), 'p_mw'] 101 | p_load = net.load.at[self.ids.get('load_r1'), 'p_mw'] +\ 102 | net.load.at[self.ids.get('load_r3'), 'p_mw'] +\ 103 | net.load.at[self.ids.get('load_r4'), 'p_mw'] +\ 104 | net.load.at[self.ids.get('load_r5'), 'p_mw'] +\ 105 | net.load.at[self.ids.get('load_r6'), 'p_mw'] +\ 106 | net.load.at[self.ids.get('load_r8'), 'p_mw'] +\ 107 | net.load.at[self.ids.get('load_r10'), 'p_mw'] +\ 108 | 
net.load.at[self.ids.get('load_r11'), 'p_mw'] 109 | 110 | p_b5_max = min((SOC_MAX - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP, P_B5_MAX) 111 | p_b5_min = max((SOC_MIN - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP, P_B5_MIN) 112 | p_b10_max = min((SOC_MAX - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP, P_B10_MAX) 113 | p_b10_min = max((SOC_MIN - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP, P_B10_MIN) 114 | 115 | excess = p_pv + p_wt - p_load 116 | # print(f'Excess = {excess}, pv: {p_pv}, wt: {p_wt}, load: {p_load}') 117 | if excess > 0: 118 | # charge 119 | b5_ratio = p_b5_max / (p_b5_max + p_b10_max) if (p_b5_max + p_b10_max) != 0. else 0. 120 | b10_ratio = p_b10_max / (p_b5_max + p_b10_max) if (p_b5_max + p_b10_max) != 0. else 0. 121 | p_b5 = min(excess * b5_ratio, p_b5_max) 122 | p_b10 = min(excess * b10_ratio, p_b10_max) 123 | # p_mgt5 = 0. 124 | # p_mgt9 = 0. 125 | # p_mgt10 = 0. 126 | else: 127 | # discharge 128 | b5_ratio = p_b5_min / (p_b5_min + p_b10_min) if (p_b5_min + p_b10_min) != 0. else 0. 129 | b10_ratio = p_b10_min / (p_b5_min + p_b10_min) if (p_b5_min + p_b10_min) != 0. else 0. 130 | p_b5 = max(excess * b5_ratio, p_b5_min) 131 | p_b10 = max(excess * b10_ratio, p_b10_min) 132 | p_b = p_b5 + p_b10 133 | 134 | # mgt5_ratio = P_MGT5_MAX / (P_MGT5_MAX + P_MGT9_MAX + P_MGT10_MAX) 135 | # mgt9_ratio = P_MGT9_MAX / (P_MGT5_MAX + P_MGT9_MAX + P_MGT10_MAX) 136 | # mgt10_ratio = P_MGT10_MAX / (P_MGT5_MAX + P_MGT9_MAX + P_MGT10_MAX) 137 | # mgt5_op_point = (C_BUY - C_MGT5[1]) / C_MGT5[0] 138 | # mgt9_op_point = (C_BUY - C_MGT9[1]) / C_MGT9[0] 139 | # mgt10_op_point = (C_BUY - C_MGT10[1]) / C_MGT10[0] 140 | # p_mgt5 = 0. if excess > p_b else min((p_b - excess) * mgt5_ratio, mgt5_op_point) 141 | # p_mgt9 = 0. if excess > p_b else min((p_b - excess) * mgt9_ratio, mgt9_op_point) 142 | # p_mgt10 = 0. if excess > p_b else min((p_b - excess) * mgt10_ratio, mgt10_op_point) 143 | 144 | return p_b5, p_b10 145 | 146 | def reset(self): 147 | # self.mgt5_p_mw = 0. 148 | # self.mgt9_p_mw = 0. 149 | # self.mgt10_p_mw = 0. 150 | self.bat5_p_mw = 0. 151 | self.bat10_p_mw = 0. 152 | self.rewards = [] 153 | self.costs = [] 154 | self.bat5_soc = 0. 155 | self.bat10_soc = 0. 
156 | self.soc_history = {'bat5_soc': [self.bat5_soc], 'bat10_soc': [self.bat10_soc]} 157 | self.last_time_step = None 158 | self.applied = False 159 | 160 | class RandomControl(Controller): 161 | def __init__(self, net, ids, **kwargs): 162 | super().__init__(net, **kwargs) 163 | self.rewards = [] 164 | self.costs = [] 165 | self.bat5_soc = np.random.uniform(low=SOC_MIN, high=SOC_MAX) 166 | self.bat10_soc = np.random.uniform(low=SOC_MIN, high=SOC_MAX) 167 | self.last_time_step = None 168 | self.applied = False 169 | 170 | self.price_profile = kwargs['price_profile'] 171 | 172 | self.trafo0_id = ids.get('trafo0') 173 | # self.mgt5_id = ids.get('mgt5') 174 | # self.mgt5_p_mw = net.sgen.at[self.mgt5_id, 'p_mw'] 175 | # self.mgt9_id = ids.get('mgt9') 176 | # self.mgt9_p_mw = net.sgen.at[self.mgt9_id, 'p_mw'] 177 | # self.mgt10_id = ids.get('mgt10') 178 | # self.mgt10_p_mw = net.sgen.at[self.mgt10_id, 'p_mw'] 179 | 180 | self.bat5_id = ids.get('bat5') 181 | self.bat5_p_mw = net.storage.at[self.bat5_id, 'p_mw'] 182 | self.bat5_max_e_mwh = net.storage.at[self.bat5_id, 'max_e_mwh'] 183 | self.bat10_id = ids.get('bat10') 184 | self.bat10_p_mw = net.storage.at[self.bat10_id, 'p_mw'] 185 | self.bat10_max_e_mwh = net.storage.at[self.bat10_id, 'max_e_mwh'] 186 | 187 | def is_converged(self, net): 188 | return self.applied 189 | 190 | def control_step(self, net): 191 | # net.sgen.at[self.mgt5_id, 'p_mw'] = self.mgt5_p_mw 192 | # net.sgen.at[self.mgt9_id, 'p_mw'] = self.mgt9_p_mw 193 | # net.sgen.at[self.mgt10_id, 'p_mw'] = self.mgt10_p_mw 194 | net.storage.at[self.bat5_id, 'p_mw'] = self.bat5_p_mw 195 | net.storage.at[self.bat10_id, 'p_mw'] = self.bat10_p_mw 196 | self.applied = True 197 | 198 | def time_step(self, net, t): 199 | if self.last_time_step is not None: 200 | # update soc 201 | bat5_soc_prev = self.bat5_soc 202 | bat10_soc_prev = self.bat10_soc 203 | self.bat5_soc += (self.bat5_p_mw * HOUR_PER_TIME_STEP) / self.bat5_max_e_mwh 204 | self.bat10_soc += (self.bat10_p_mw * HOUR_PER_TIME_STEP) / self.bat10_max_e_mwh 205 | 206 | # calculate reward 207 | price = self.price_profile['price'][t] 208 | cost, normalized_cost = utils.cal_cost( 209 | price=price, 210 | pcc_p_mw=-net.res_trafo.at[self.trafo0_id, 'p_lv_mw'], 211 | # mgt5_p_mw=self.mgt5_p_mw, 212 | # mgt9_p_mw=self.mgt9_p_mw, 213 | # mgt10_p_mw=self.mgt10_p_mw, 214 | bat5_soc_now=self.bat5_soc, 215 | bat5_soc_prev=bat5_soc_prev, 216 | bat10_soc_now=self.bat10_soc, 217 | bat10_soc_prev=bat10_soc_prev 218 | ) 219 | reward = -normalized_cost 220 | self.rewards.append(reward) 221 | self.costs.append(cost) 222 | 223 | # select action 224 | self.bat5_p_mw, self.bat10_p_mw = np.random.uniform(low=MIN_ACTION, high=MAX_ACTION, size=(N_ACTIONS,)) 225 | p_b5_max = min((SOC_MAX - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP, P_B5_MAX) 226 | p_b5_min = max((SOC_MIN - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP, P_B5_MIN) 227 | p_b10_max = min((SOC_MAX - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP, P_B10_MAX) 228 | p_b10_min = max((SOC_MIN - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP, P_B10_MIN) 229 | self.bat5_p_mw = np.clip(self.bat5_p_mw, p_b5_min, p_b5_max) 230 | self.bat10_p_mw = np.clip(self.bat10_p_mw, p_b10_min, p_b10_max) 231 | self.applied = False 232 | self.last_time_step = t 233 | 234 | def reset(self): 235 | # self.mgt5_p_mw = 0. 236 | # self.mgt9_p_mw = 0. 237 | # self.mgt10_p_mw = 0. 238 | self.bat5_p_mw = 0. 239 | self.bat10_p_mw = 0. 
240 | self.rewards = [] 241 | self.costs = [] 242 | self.bat5_soc = np.random.uniform(low=SOC_MIN, high=SOC_MAX) 243 | self.bat10_soc = np.random.uniform(low=SOC_MIN, high=SOC_MAX) 244 | self.last_time_step = None 245 | self.applied = False -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Main program file. 3 | 4 | func: 5 | - train_ppo 6 | - train_td3 7 | - test 8 | - baseline 9 | ''' 10 | 11 | import os, shutil 12 | import logging 13 | import numpy as np 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | 17 | import pandapower as pp 18 | import pandapower.timeseries as ts 19 | from pandapower.timeseries.data_sources.frame_data import DFData 20 | from pandapower.timeseries.output_writer import OutputWriter 21 | 22 | import utils 23 | from cigre_mv_microgrid import create_cigre_mv_microgrid 24 | from controllers.baseline_controller import RandomControl, SimpleControl 25 | from controllers.td3_controller import TD3Agent 26 | from controllers.ppo_controller import PPOAgent 27 | from setting import * 28 | 29 | def train_ppo(n_runs, n_epochs, start, train_length, pv_profile, wt_profile, load_profile, price_profile, 30 | sequence_model_type='none', noise_type='action'): 31 | # env 32 | assert(start >= 0 and start < pv_profile.shape[0]) 33 | assert(train_length >= 0 and train_length <= pv_profile.shape[0] - start) 34 | time_steps = range(start, start + train_length) 35 | pv_ds = DFData(pv_profile.iloc[start: start+train_length]) 36 | wt_ds = DFData(wt_profile.iloc[start: start+train_length]) 37 | load_ds = DFData(load_profile.iloc[start: start+train_length]) 38 | 39 | # history 40 | history_dir = os.path.join('.', 'history', 'train', 'PPO') 41 | if os.path.isdir(history_dir): 42 | shutil.rmtree(history_dir) 43 | 44 | # run 45 | ep_return_list = np.zeros((n_runs, n_epochs)) 46 | ep_cost_list = np.zeros((n_runs, n_epochs)) 47 | for run in range(n_runs): 48 | net, ids = create_cigre_mv_microgrid(pv_ds, wt_ds, load_ds) 49 | agent = PPOAgent(net, ids, pv_profile, wt_profile, load_profile, price_profile, sequence_model_type) 50 | 51 | best_cost = train_length * MAX_COST 52 | for epoch in range(n_epochs): 53 | agent.training = True 54 | ts.run_timeseries(net, time_steps=time_steps, continue_on_divergence=False) 55 | ep_return_list[run, epoch] = np.sum(agent.rewards) 56 | np.save(os.path.join('.', 'plot', 'ep_return_list.npy'), ep_return_list) 57 | ep_cost_list[run, epoch] = np.sum(agent.costs) 58 | np.save(os.path.join('.', 'plot', 'ep_cost_list.npy'), ep_cost_list) 59 | print(f'Run: {run + 1}, epoch: {epoch + 1}, return = {ep_return_list[run, epoch]:.3f}, cost = {ep_cost_list[run, epoch]:.3f}') 60 | 61 | # history & best models 62 | if epoch >= 20: 63 | cost = np.sum(agent.costs) 64 | if cost < best_cost or (epoch % 20 == 0): 65 | # log history 66 | dir = os.path.join(history_dir,str(run+1), 'best_avg_cost') 67 | if not os.path.isdir(dir): 68 | os.makedirs(dir) 69 | pd.DataFrame(agent.history).to_csv(os.path.join(dir, f'[{epoch}]_cost{cost:.3f}.csv')) 70 | 71 | # save best cost model 72 | if cost < best_cost: 73 | agent.save(run) 74 | best_cost = cost 75 | agent.reset() 76 | 77 | # plot 78 | print(f'Epoch return: \n {np.mean(ep_return_list, axis=0)}') 79 | print(f'Epoch cost: \n {np.mean(ep_cost_list, axis=0)}') 80 | utils.plot_ep_values(ep_return_list, train_length, n_epochs, ylabel='Return') 81 | utils.plot_ep_values(ep_cost_list, train_length, 
n_epochs, ylabel='Cost') 82 | 83 | 84 | def train_td3(n_runs, n_epochs, start, train_length, pv_profile, wt_profile, load_profile, price_profile, 85 | verbose=True, sequence_model_type='rnn', use_pretrained_sequence_model=False, 86 | noise_type='action', retrain=False, run=1): 87 | # env 88 | assert(start >= 0 and start < pv_profile.shape[0]) 89 | assert(train_length >= 0 and train_length <= pv_profile.shape[0] - start) 90 | time_steps = range(start, start + train_length) 91 | pv_ds = DFData(pv_profile.iloc[start: start+train_length]) 92 | wt_ds = DFData(wt_profile.iloc[start: start+train_length]) 93 | load_ds = DFData(load_profile.iloc[start: start+train_length]) 94 | 95 | # history 96 | history_dir = os.path.join('.', 'history', 'train', 'TD3') 97 | if os.path.isdir(history_dir): 98 | shutil.rmtree(history_dir) 99 | 100 | # run 101 | ep_return_list = np.zeros((n_runs, n_epochs)) 102 | ep_cost_list = np.zeros((n_runs, n_epochs)) 103 | for run in range(n_runs): 104 | net, ids = create_cigre_mv_microgrid(pv_ds, wt_ds, load_ds) 105 | agent = TD3Agent(net, ids, pv_profile, wt_profile, load_profile, price_profile, 106 | training=True, n_epochs=n_epochs, 107 | sequence_model_type=sequence_model_type, use_pretrained_sequence_model=use_pretrained_sequence_model, 108 | buffer_size=BUFFER_SIZE, noise_type=noise_type, batch_size=BATCH_SIZE) 109 | if retrain: 110 | agent.load_models(run=run) 111 | 112 | # run 113 | best_cost = train_length * MAX_COST 114 | for epoch in range(n_epochs): 115 | # train 116 | agent.training = True 117 | ts.run_timeseries(net, time_steps=time_steps, verbose=verbose, continue_on_divergence=False) 118 | ep_return_list[run, epoch] = np.sum(agent.rewards) 119 | ep_cost_list[run, epoch] = np.sum(agent.costs) 120 | print(f'Run: {run + 1}, episode: {epoch + 1}, return = {ep_return_list[run, epoch]:.3f}, cost = {ep_cost_list[run, epoch]:.3f}') 121 | # agent.reset() 122 | 123 | # test 124 | # agent.training = False 125 | # ts.run_timeseries(net, time_steps=time_steps, verbose=verbose, continue_on_divergence=False) 126 | 127 | test_cost = np.sum(agent.costs) 128 | if (epoch >= 20) and ((epoch % 20 == 0) or test_cost < best_cost): 129 | # log history 130 | dir = os.path.join(history_dir,str(run+1), 'best_avg_cost') 131 | if not os.path.isdir(dir): 132 | os.makedirs(dir) 133 | pd.DataFrame(agent.history).to_csv(os.path.join(dir, f'[{epoch}]_cost{ep_cost_list[run, epoch]:.3f}.csv')) 134 | 135 | # save best cost model 136 | # if test_cost < best_cost: 137 | # agent.save_models(run=run+1) 138 | # best_cost = test_cost 139 | agent.reset() 140 | 141 | # plot 142 | print(f'Episode return: \n {np.mean(ep_return_list, axis=0)}') 143 | print(f'Episode cost: \n {np.mean(ep_cost_list, axis=0)}') 144 | utils.plot_ep_values(ep_return_list, train_length, n_epochs, ylabel='Return') 145 | utils.plot_ep_values(ep_cost_list, train_length, n_epochs, ylabel='Cost') 146 | 147 | def test(n_runs, start, test_length, pv_profile, wt_profile, load_profile, price_profile, run, sequence_model_type='rnn', log=False, log_path=None): 148 | assert(start >= 0 and start < pv_profile.shape[0]) 149 | assert(test_length >= 0 and test_length <= pv_profile.shape[0] - start) 150 | time_steps=range(start, start+test_length) 151 | 152 | # env 153 | pv_ds = DFData(pv_profile.iloc[start: start+test_length]) 154 | wt_ds = DFData(wt_profile.iloc[start: start+test_length]) 155 | load_ds = DFData(load_profile.iloc[start: start+test_length]) 156 | net, ids = create_cigre_mv_microgrid(pv_ds, wt_ds, load_ds) 157 | 158 | # log pf 
results 159 | if log: 160 | n_runs = 1 161 | ow = OutputWriter(net, time_steps, output_path=log_path, output_file_type='.csv', csv_separator=',') 162 | ow.log_variable('res_sgen', 'p_mw') 163 | ow.log_variable('res_load', 'p_mw') 164 | ow.log_variable('res_storage', 'p_mw') 165 | ow.log_variable('res_trafo', 'p_lv_mw', index=[ids['trafo0']]) 166 | 167 | # agent 168 | agent = PPOAgent(net, ids, pv_profile, wt_profile, load_profile, price_profile, sequence_model_type, training=False) 169 | agent.load(run) 170 | # agent = TD3Agent(net, ids, pv_profile, wt_profile, load_profile, price_profile, 171 | # training=False, sequence_model_type=sequence_model_type) 172 | # agent.load_models(run=run) 173 | 174 | # run 175 | ep_cost_list = [] 176 | for _ in range(n_runs): 177 | ts.run_timeseries(net, time_steps=time_steps, verbose=False, continue_on_divergence=False) 178 | ep_cost_list.append(np.sum(agent.costs)) 179 | agent.reset() 180 | print(f'Avg cost = {np.mean(ep_cost_list)}') 181 | 182 | def baseline(n_runs, start, test_length, pv_profile, wt_profile, load_profile, price_profile, Control, log=False, log_path=None): 183 | assert(start >= 0 and start < pv_profile.shape[0]) 184 | assert(test_length >= 0 and test_length <= pv_profile.shape[0] - start) 185 | time_steps=range(start, start+test_length) 186 | 187 | # env 188 | pv_ds = DFData(pv_profile.iloc[start: start+test_length]) 189 | wt_ds = DFData(wt_profile.iloc[start: start+test_length]) 190 | load_ds = DFData(load_profile.iloc[start: start+test_length]) 191 | net, ids = create_cigre_mv_microgrid(pv_ds, wt_ds, load_ds) 192 | 193 | # log pf results 194 | if log: 195 | n_runs = 1 196 | ow = OutputWriter(net, time_steps, output_path=log_path, output_file_type='.csv', csv_separator=',') 197 | ow.log_variable('res_sgen', 'p_mw') 198 | ow.log_variable('res_load', 'p_mw') 199 | ow.log_variable('res_storage', 'p_mw') 200 | ow.log_variable('res_trafo', 'p_lv_mw', index=[ids['trafo0']]) 201 | 202 | # controller 203 | controller = Control(net, ids, price_profile=price_profile) 204 | 205 | # run 206 | ep_cost_list = [] 207 | for _ in range(n_runs): 208 | ts.run_timeseries(net, time_steps=time_steps, continue_on_divergence=False, verbose=True) 209 | ep_cost_list.append(np.sum(controller.costs)) 210 | controller.reset() 211 | print(f'Avg cost = {np.mean(ep_cost_list)}') 212 | 213 | if __name__ == '__main__': 214 | # --- configurations --- 215 | logging.basicConfig(level=logging.INFO) 216 | algo = 'ppo' 217 | sequence_model_type = 'rnn' # ['none', 'conv1d', 'rnn', 'transformer'] 218 | sequence_length = 1 if (sequence_model_type == 'none') else SEQ_LENGTH 219 | 220 | # train configs 221 | n_train_runs = 10 222 | n_epochs = 500 223 | train_start = 0 224 | train_length = 30 * 24 225 | noise_type = 'action' # ['action', 'param'] 226 | use_pretrained_sequence_model = False 227 | 228 | # test configs 229 | n_test_runs = 1 230 | # test_start = train_start + train_length 231 | # test_length = 7 * 24 232 | test_start = train_start 233 | test_length = train_length 234 | log = True 235 | log_path = os.path.join('.', 'pf_res', algo, sequence_model_type) 236 | log_path_baseline = os.path.join('.', 'pf_res', 'baseline', 'simple') 237 | 238 | # --- profile --- 239 | pv_profile = pd.read_csv('./data/profile/pv_profile.csv') 240 | wt_profile = pd.read_csv('./data/profile/wt_profile.csv') 241 | load_profile = pd.read_csv('./data/profile/load_profile.csv') 242 | price_profile = pd.read_csv('./data/profile/price_profile.csv') 243 | 244 | # --- train, test --- 245 | 
train_ppo(n_train_runs, n_epochs, train_start, train_length, 246 | pv_profile, wt_profile, load_profile, price_profile, sequence_model_type) 247 | 248 | # train_td3(n_runs=n_train_runs, n_epochs=n_epochs, start=train_start, train_length=train_length, 249 | # pv_profile=pv_profile, wt_profile=wt_profile, load_profile=load_profile, price_profile=price_profile, 250 | # sequence_model_type=sequence_model_type, use_pretrained_sequence_model=use_pretrained_sequence_model, noise_type=noise_type) 251 | 252 | # test(n_runs=n_test_runs, start=test_start, test_length=test_length, 253 | # pv_profile=pv_profile, wt_profile=wt_profile, load_profile=load_profile, price_profile=price_profile, 254 | # run=0, sequence_model_type=sequence_model_type, log=log, log_path=log_path) 255 | 256 | # baseline(n_runs=n_test_runs, start=test_start, test_length=test_length, 257 | # pv_profile=pv_profile, wt_profile=wt_profile, load_profile=load_profile, price_profile=price_profile, 258 | # Control=SimpleControl, 259 | # log=log, log_path=log_path_baseline) 260 | 261 | # --- plot pf results --- 262 | # utils.plot_pf_results(log_path, test_start, test_length) 263 | # utils.plot_pf_results(dir=log_path_baseline) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | func: 3 | - scale_to_mg 4 | - normalize_state 5 | - cal_cost 6 | - extra_reward 7 | - plot_return 8 | - plot_pf_results 9 | - view_profile 10 | ''' 11 | 12 | import os 13 | import logging 14 | import pickle 15 | from pathlib import Path 16 | import numpy as np 17 | from typing import Dict 18 | import matplotlib.pyplot as plt 19 | import pandas as pd 20 | import tensorflow as tf 21 | 22 | from setting import * 23 | 24 | # --- Action Scaling --- 25 | def scale_to_mg(nn_action, min_action, max_action): 26 | nn_action = np.clip(nn_action, -1., 1.) 
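    # Affine map from the clipped network output in [-1, 1] to the element-wise
    # microgrid bounds, p = (a + 1) * (max - min) / 2 + min; e.g. a = 0.5 with
    # bounds [-1, 2] MW gives 1.25 MW. Because the controllers update min_action /
    # max_action with the SoC-dependent charging limits at every step, this realizes
    # the valid action space mapping described in the abstract.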
27 | return (nn_action + 1) * (max_action - min_action) / 2 + min_action 28 | 29 | # --- Normalization --- 30 | def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, batch_count): 31 | delta = batch_mean - mean 32 | tot_count = count + batch_count 33 | 34 | new_mean = mean + delta * batch_count / tot_count 35 | m_a = var * count 36 | m_b = batch_var * batch_count 37 | M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count 38 | new_var = M2 / tot_count 39 | new_count = tot_count 40 | 41 | return new_mean, new_var, new_count 42 | 43 | class NormalizeAction: 44 | def __init__(self, epsilon=1e-8): 45 | self.a_rms = RunningMeanStd(shape=(N_ACTION,)) 46 | self.epsilon = epsilon 47 | 48 | def normalize(self, a): 49 | self.a_rms.update(a) 50 | a = (a -self.a_rms.mean) / np.sqrt(self.a_rms.var + self.epsilon) 51 | a = np.clip(a, -5, 5) 52 | return a 53 | 54 | def tf_normalize(self, a): 55 | mean = tf.convert_to_tensor(self.a_rms.mean, dtype=tf.float32) 56 | var = tf.convert_to_tensor(self.a_rms.var, dtype=tf.float32) 57 | a = (a - mean) / tf.math.sqrt(var + self.epsilon) 58 | a = tf.clip_by_value(a, -5, 5) 59 | return a 60 | 61 | class NormalizeObservation: 62 | def __init__(self, epsilon=1e-8): 63 | self.obs_seq_rms = RunningMeanStd(shape=STATE_SEQ_SHAPE) 64 | self.obs_fnn_rms = RunningMeanStd(shape=STATE_FNN_SHAPE) 65 | self.epsilon = epsilon 66 | 67 | def normalize(self, obs, update=True): 68 | obs_seq, obs_fnn = obs 69 | if update: 70 | self.obs_seq_rms.update(obs_seq) 71 | self.obs_fnn_rms.update(obs_fnn) 72 | obs_seq = (obs_seq - self.obs_seq_rms.mean) / np.sqrt(self.obs_seq_rms.var + self.epsilon) 73 | obs_seq = np.clip(obs_seq, -5, 5) 74 | obs_fnn = (obs_fnn - self.obs_fnn_rms.mean) / np.sqrt(self.obs_fnn_rms.var + self.epsilon) 75 | obs_fnn = np.clip(obs_fnn, -5, 5) 76 | return obs_seq, obs_fnn 77 | 78 | def save(self, dir): 79 | fpath = Path(os.path.join(dir, 'obs.pkl')) 80 | fpath.parent.mkdir(parents=True, exist_ok=True) 81 | with open(fpath, 'wb') as f: 82 | pickle.dump({ 83 | 'obs_seq_mean': self.obs_seq_rms.mean, 84 | 'obs_seq_var': self.obs_seq_rms.var, 85 | 'obs_fnn_mean': self.obs_fnn_rms.mean, 86 | 'obs_fnn_var': self.obs_fnn_rms.var, 87 | }, f) 88 | 89 | def load(self, dir): 90 | with open(os.path.join(dir, 'obs.pkl'), 'rb') as f: 91 | data = pickle.load(f) 92 | self.obs_seq_rms.mean = data['obs_seq_mean'] 93 | self.obs_seq_rms.var = data['obs_seq_var'] 94 | self.obs_fnn_rms.mean = data['obs_fnn_mean'] 95 | self.obs_fnn_rms.var = data['obs_fnn_var'] 96 | 97 | class NormalizeReward: 98 | def __init__(self, gamma=GAMMA, epsilon=1e-8): 99 | self.return_rms = RunningMeanStd() 100 | self.return_ = np.zeros(1) 101 | self.gamma = gamma 102 | self.epsilon = epsilon 103 | 104 | def normalize(self, r): 105 | self.return_ = r + self.gamma * self.return_ 106 | self.return_rms.update(self.return_) 107 | r /= np.sqrt(self.return_rms.var + self.epsilon) 108 | r = np.clip(r, -5, 5) 109 | return r 110 | 111 | class RunningMeanStd(object): 112 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 113 | def __init__(self, epsilon=1e-4, shape=()): 114 | self.mean = np.zeros(shape, 'float64') 115 | self.var = np.ones(shape, 'float64') 116 | self.count = epsilon 117 | 118 | def update(self, x): 119 | batch_mean = np.mean(x, axis=0) 120 | batch_var = np.var(x, axis=0) 121 | batch_count = x.shape[0] 122 | self.update_from_moments(batch_mean, batch_var, batch_count) 123 | 124 | def update_from_moments(self, batch_mean, batch_var, 
batch_count): 125 | self.mean, self.var, self.count = update_mean_var_count_from_moments( 126 | self.mean, self.var, self.count, batch_mean, batch_var, batch_count) 127 | 128 | 129 | def normalize_state(state) -> Dict: 130 | normalized_state_rnn = state[0] / np.array([P_EXCESS_MAX, C_PRICE_MAX]) 131 | # normalized_state_rnn = state[0] / np.array([*P_PV_MAX_LIST, *P_WT_MAX_LIST, *P_LOAD_MAX_LIST, C_PRICE_MAX]) 132 | # normalized_state_rnn = state[0] / np.array([*P_PV_MAX_LIST, *P_WT_MAX_LIST, *P_LOAD_MAX_LIST, P_EXCESS_MAX, C_PRICE_MAX]) 133 | normalized_state_fnn = state[1] / SOC_MAX 134 | 135 | return normalized_state_rnn, normalized_state_fnn 136 | 137 | # --- Reward --- 138 | def cal_cost(price, pcc_p_mw, bat5_soc_now, bat5_soc_prev, bat10_soc_now, bat10_soc_prev, **kwargs): 139 | transaction_cost = price * pcc_p_mw 140 | # mgt_cost = C_MGT5[0] * pow(mgt5_p_mw, 2) + C_MGT5[1] * mgt5_p_mw + \ 141 | # C_MGT9[0] * pow(mgt9_p_mw, 2) + C_MGT9[1] * mgt9_p_mw + \ 142 | # C_MGT10[0] * pow(mgt10_p_mw, 2) + C_MGT10[1] * mgt10_p_mw 143 | battery_cost = C_BAT5_DoD * pow((bat5_soc_now - bat5_soc_prev), 2) + \ 144 | C_BAT10_DoD * pow((bat10_soc_now - bat10_soc_prev), 2) 145 | soc_penalty = C_SOC_LIMIT if ((bat5_soc_now > (1+SOC_TOLERANCE)*SOC_MAX or bat5_soc_now < (1-SOC_TOLERANCE)*SOC_MIN) or 146 | (bat10_soc_now > (1+SOC_TOLERANCE)*SOC_MAX or bat10_soc_now < (1-SOC_TOLERANCE)*SOC_MIN)) else 0 147 | 148 | if len(kwargs): 149 | ids = kwargs['ids'] 150 | t = kwargs['t'] 151 | net = kwargs['net'] 152 | log_cost_info(transaction_cost, battery_cost, soc_penalty, t, net=net, ids=ids, pcc_p_mw=pcc_p_mw) 153 | 154 | cost = (transaction_cost + battery_cost) * HOUR_PER_TIME_STEP + soc_penalty 155 | normalized_cost = cost / MAX_COST 156 | 157 | return cost, normalized_cost 158 | 159 | def extra_reward(nn_bat_p_mw, valid_bat_p_mw): 160 | # penalty for invalid action 161 | dif = np.sum(np.abs(nn_bat_p_mw - valid_bat_p_mw)) 162 | dif /= (P_B10_MAX + P_B5_MAX) 163 | reward = 0. 
if (dif < 1e-3) else (REWARD_INVALID_ACTION + dif * REWARD_INVALID_ACTION) 164 | return reward 165 | 166 | # --- Plot --- 167 | def plot_ep_values(ep_values, train_length, epochs, ylabel): 168 | runs = ep_values.shape[0] 169 | fig_path = os.path.join('plot', f'{int(train_length/24)}days_{runs}runs_{epochs}eps_{str.lower(ylabel)}.png') 170 | arr_path = os.path.join('plot', f'{int(train_length/24)}days_{runs}runs_{epochs}eps_{str.lower(ylabel)}.npy') 171 | np.save(arr_path, ep_values) 172 | 173 | ep_return = np.median(ep_values, axis=0) 174 | epochs = range(1, len(ep_return) + 1) 175 | plt.plot(epochs, ep_return) 176 | plt.title(f'Training') 177 | plt.xlabel('Epoch') 178 | plt.ylabel(ylabel) 179 | plt.savefig(fig_path) 180 | plt.show() 181 | 182 | def plot_pf_results(dir, start, length): 183 | # pv, wt, mgt, load, bat, util, excess 184 | res_sgen_file = os.path.join(dir, 'res_sgen', 'p_mw.csv') 185 | res_load_file = os.path.join(dir, 'res_load', 'p_mw.csv') 186 | res_storage_file = os.path.join(dir, 'res_storage', 'p_mw.csv') 187 | res_trafo_file = os.path.join(dir, 'res_trafo', 'p_lv_mw.csv') 188 | 189 | # pv, wt, mgt 190 | sgen_p_mw = pd.read_csv(res_sgen_file) 191 | pv_p_mw = sgen_p_mw.iloc[:, 1:9] 192 | pv_p_mw.columns = ['pv3', 'pv4', 'pv5', 'pv6', 'pv8', 'pv9', 'pv10', 'pv11'] 193 | wt_p_mw = sgen_p_mw.iloc[:, [9]] 194 | wt_p_mw.columns = ['wt7'] 195 | # mgt_p_mw = sgen_p_mw.iloc[:, 10:] 196 | # mgt_p_mw.columns = ['mgt5', 'mgt9', 'mgt10'] 197 | 198 | # load 199 | load_p_mw = pd.read_csv(res_load_file) 200 | load_p_mw = load_p_mw.iloc[:, 1:] 201 | load_p_mw.columns = ['load_r1', 'load_r3', 'load_r4', 'load_r5', 'load_r6', 'load_r8', 'load_r10', 'load_r11'] 202 | 203 | # bat 204 | bat_p_mw = pd.read_csv(res_storage_file) 205 | bat_p_mw = bat_p_mw.iloc[:, 1:] 206 | bat_p_mw.columns = ['bat5', 'bat10'] 207 | 208 | # utility 209 | trafo_p_mw = pd.read_csv(res_trafo_file) 210 | util_p_mw = -trafo_p_mw.iloc[:, [1]] 211 | util_p_mw.columns = ['utility'] 212 | 213 | # price 214 | price = pd.read_csv(os.path.join('.', 'data', 'profile', 'price_profile.csv')) 215 | 216 | excess_p_mw = pv_p_mw.sum(axis=1) + wt_p_mw.sum(axis=1) - load_p_mw.sum(axis=1) 217 | excess_p_mw = pd.DataFrame({'excess': excess_p_mw}) 218 | 219 | ax = excess_p_mw.iloc[start: start+length].plot(drawstyle='steps-post') 220 | bat_p_mw.iloc[start: start+length].plot(ax=ax, drawstyle='steps-post') 221 | price.iloc[start: start+length].plot(ax=ax, drawstyle='steps-post') 222 | plt.title('Power Flow') 223 | plt.xlabel('hour') 224 | plt.ylabel('MW') 225 | plt.show() 226 | 227 | def view_profile(pv_profile, wt_profile, load_profile, price_profile, start=None, length=None): 228 | start = 0 if start is None else start 229 | length = (len(pv_profile.index)-start) if length is None else length 230 | pv_p_mw = pv_profile.iloc[start: start+length, :] 231 | wt_p_mw = wt_profile.iloc[start: start+length, :] 232 | load_p_mw = load_profile.iloc[start: start+length, :] 233 | price_profile = price_profile.iloc[start: start+length, :] 234 | 235 | # MW and excess profile 236 | profile_p_mw = pd.concat([pv_p_mw, wt_p_mw, load_p_mw]).iloc[start: start+length, :] 237 | excess_profile = pv_p_mw.sum(axis=1) + wt_p_mw.sum(axis=1) - load_p_mw.sum(axis=1) 238 | excess_profile = pd.DataFrame({'Excess': excess_profile}) 239 | 240 | # info 241 | print('--- Profile ---') 242 | print(f'PV:\n max = {pv_profile.max(numeric_only=True)}, \nmin = {pv_profile.min(numeric_only=True)}') 243 | print(f'WT:\n max = {wt_profile.max(numeric_only=True)}, \nmin = 
{wt_profile.min(numeric_only=True)}') 244 | print(f'Load:\n max = {load_profile.max(numeric_only=True)}, \nmin = {load_profile.min(numeric_only=True)}') 245 | print(f'Excess:\n max = {excess_profile.max(numeric_only=True)}, \nmin = {excess_profile.min(numeric_only=True)}') 246 | print(f'Price:\n max = {price_profile.max(numeric_only=True)}, \nmin = {price_profile.min(numeric_only=True)}') 247 | 248 | # plot 249 | pv_p_mw.plot(xlabel='hour', ylabel='p_mw', title='PV') 250 | wt_p_mw.plot(xlabel='hour', ylabel='p_mw', title='WT') 251 | load_p_mw.plot(xlabel='hour', ylabel='p_mw', title='Load') 252 | price_profile.plot(xlabel='hour', ylabel='price', title='Price') 253 | profile_p_mw.plot(xlabel='hour', ylabel='p_mw', title='Microgrid') 254 | ax = excess_profile.plot(xlabel='hour', ylabel='p_mw', title='excess') 255 | ax.plot(range(start, start+length), np.zeros((length),)) 256 | plt.show() 257 | 258 | # --- Logging --- 259 | def log_actor_critic_info(actor_loss, critic_loss, t=None, freq=20, **kwargs): 260 | if t is None: 261 | logging.info('--- Learn ---') 262 | logging.info(f'actor loss = {actor_loss}') 263 | logging.info(f'critic loss = {critic_loss}') 264 | return 265 | 266 | if t % freq == 0: 267 | logging.info('--- Learn ---') 268 | logging.info(f'actor loss = {actor_loss}') 269 | logging.info(f'critic loss = {critic_loss}') 270 | 271 | def log_cost_info(transaction_cost, battery_cost, soc_penalty, t, freq=100, **kwargs): 272 | if t % freq == 0: 273 | net = kwargs['net'] 274 | ids = kwargs['ids'] 275 | pcc_p_mw = kwargs['pcc_p_mw'] 276 | p_wt = net.res_sgen['p_mw'].iloc[ids['wt7']].sum() 277 | p_pv = net.res_sgen['p_mw'].sum() - p_wt 278 | p_bat = net.res_storage['p_mw'].sum() 279 | p_load = net.res_load['p_mw'].sum() 280 | excess = p_pv + p_wt - p_bat - p_load 281 | 282 | logging.info('--- Cost ---') 283 | logging.info(f'trans: {transaction_cost:.3f}, bat: {battery_cost:.3f}, soc: {soc_penalty:.3f}') 284 | logging.info('--- Power flow ---') 285 | logging.info(f'pcc = {pcc_p_mw:.3f}, excess = {excess:.3f}, pv = {p_pv:.3f}, wt = {p_wt:.3f}, bat = {p_bat:.3f}, load = {p_load:.3f}') 286 | 287 | def log_trans_info(s, a, t, freq=100, **kwargs): 288 | if t % freq == 0: 289 | s_seq = s[0] 290 | s_fnn = s[1] 291 | 292 | logging.info('--- State ---') 293 | logging.info(f'shape: ({s_seq.shape}, {s_fnn.shape})') 294 | logging.info(f'content: {s_seq[0]}, {s_fnn}') 295 | logging.info('--- Action ---') 296 | logging.info(f'shape: {a.shape}') 297 | logging.info(f'content: {a}') 298 | 299 | # --- Others --- 300 | def get_excess(pv_profile, wt_profile, load_profile, t): 301 | excess = pv_profile['pv3'][t] +\ 302 | pv_profile['pv4'][t] +\ 303 | pv_profile['pv5'][t] +\ 304 | pv_profile['pv6'][t] +\ 305 | pv_profile['pv8'][t] +\ 306 | pv_profile['pv9'][t] +\ 307 | pv_profile['pv10'][t] +\ 308 | pv_profile['pv11'][t] +\ 309 | wt_profile['wt7'][t] -\ 310 | load_profile['load_r1'][t] -\ 311 | load_profile['load_r3'][t] -\ 312 | load_profile['load_r4'][t] -\ 313 | load_profile['load_r5'][t] -\ 314 | load_profile['load_r6'][t] -\ 315 | load_profile['load_r8'][t] -\ 316 | load_profile['load_r10'][t] -\ 317 | load_profile['load_r11'][t] 318 | 319 | return excess 320 | 321 | def policy_simple(net, ids, bat5_soc, bat10_soc, bat5_max_e_mwh, bat10_max_e_mwh): 322 | p_pv = net.sgen.at[ids.get('pv3'), 'p_mw'] +\ 323 | net.sgen.at[ids.get('pv4'), 'p_mw'] +\ 324 | net.sgen.at[ids.get('pv5'), 'p_mw'] +\ 325 | net.sgen.at[ids.get('pv6'), 'p_mw'] +\ 326 | net.sgen.at[ids.get('pv8'), 'p_mw'] +\ 327 | 
net.sgen.at[ids.get('pv9'), 'p_mw'] +\ 328 | net.sgen.at[ids.get('pv10'), 'p_mw'] +\ 329 | net.sgen.at[ids.get('pv11'), 'p_mw'] 330 | p_wt = net.sgen.at[ids.get('wt7'), 'p_mw'] 331 | p_load = net.load.at[ids.get('load_r1'), 'p_mw'] +\ 332 | net.load.at[ids.get('load_r3'), 'p_mw'] +\ 333 | net.load.at[ids.get('load_r4'), 'p_mw'] +\ 334 | net.load.at[ids.get('load_r5'), 'p_mw'] +\ 335 | net.load.at[ids.get('load_r6'), 'p_mw'] +\ 336 | net.load.at[ids.get('load_r8'), 'p_mw'] +\ 337 | net.load.at[ids.get('load_r10'), 'p_mw'] +\ 338 | net.load.at[ids.get('load_r11'), 'p_mw'] 339 | 340 | p_b5_max = min((SOC_MAX - bat5_soc) * bat5_max_e_mwh / HOUR_PER_TIME_STEP, P_B5_MAX) 341 | p_b5_min = max((SOC_MIN - bat5_soc) * bat5_max_e_mwh / HOUR_PER_TIME_STEP, P_B5_MIN) 342 | p_b10_max = min((SOC_MAX - bat10_soc) * bat10_max_e_mwh / HOUR_PER_TIME_STEP, P_B10_MAX) 343 | p_b10_min = max((SOC_MIN - bat10_soc) * bat10_max_e_mwh / HOUR_PER_TIME_STEP, P_B10_MIN) 344 | 345 | excess = p_pv + p_wt - p_load 346 | # print(f'Excess = {excess}, pv: {p_pv}, wt: {p_wt}, load: {p_load}') 347 | if excess > 0: 348 | # charge 349 | b5_ratio = p_b5_max / (p_b5_max + p_b10_max) if (p_b5_max + p_b10_max) != 0. else 0. 350 | b10_ratio = p_b10_max / (p_b5_max + p_b10_max) if (p_b5_max + p_b10_max) != 0. else 0. 351 | p_b5 = min(excess * b5_ratio, p_b5_max) 352 | p_b10 = min(excess * b10_ratio, p_b10_max) 353 | # p_mgt5 = 0. 354 | # p_mgt9 = 0. 355 | # p_mgt10 = 0. 356 | else: 357 | # discharge 358 | b5_ratio = p_b5_min / (p_b5_min + p_b10_min) if (p_b5_min + p_b10_min) != 0. else 0. 359 | b10_ratio = p_b10_min / (p_b5_min + p_b10_min) if (p_b5_min + p_b10_min) != 0. else 0. 360 | p_b5 = max(excess * b5_ratio, p_b5_min) 361 | p_b10 = max(excess * b10_ratio, p_b10_min) 362 | p_b = p_b5 + p_b10 363 | 364 | # mgt5_ratio = P_MGT5_MAX / (P_MGT5_MAX + P_MGT9_MAX + P_MGT10_MAX) 365 | # mgt9_ratio = P_MGT9_MAX / (P_MGT5_MAX + P_MGT9_MAX + P_MGT10_MAX) 366 | # mgt10_ratio = P_MGT10_MAX / (P_MGT5_MAX + P_MGT9_MAX + P_MGT10_MAX) 367 | # mgt5_op_point = (C_BUY - C_MGT5[1]) / C_MGT5[0] 368 | # mgt9_op_point = (C_BUY - C_MGT9[1]) / C_MGT9[0] 369 | # mgt10_op_point = (C_BUY - C_MGT10[1]) / C_MGT10[0] 370 | # p_mgt5 = 0. if excess > p_b else min((p_b - excess) * mgt5_ratio, mgt5_op_point) 371 | # p_mgt9 = 0. if excess > p_b else min((p_b - excess) * mgt9_ratio, mgt9_op_point) 372 | # p_mgt10 = 0. 
if excess > p_b else min((p_b - excess) * mgt10_ratio, mgt10_op_point) 373 | 374 | return np.array([p_b5, p_b10]) -------------------------------------------------------------------------------- /controllers/ppo_controller.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | import tensorflow.keras as keras 5 | import tensorflow_probability as tfp 6 | from tensorflow.keras.optimizers import Adam 7 | from pandapower.control.basic_controller import Controller 8 | 9 | from controllers.buffer import Buffer 10 | from controllers.models import ActorPiModel, CriticVModel, SequenceModel, get_pi_actor, get_v_critic 11 | from setting import * 12 | import utils 13 | 14 | class PPOAgent(Controller): 15 | def __init__(self, net, ids, pv_profile, wt_profile, load_profile, price_profile, 16 | sequence_model_type, training=False, 17 | lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, 18 | **kwargs): 19 | super().__init__(net, **kwargs) 20 | 21 | self.ids = ids 22 | self.pv_profile = pv_profile 23 | self.wt_profile = wt_profile 24 | self.load_profile = load_profile 25 | self.price_profile = price_profile 26 | self.state_seq_shape = STATE_SEQ_SHAPE 27 | self.state_fnn_shape = STATE_FNN_SHAPE 28 | self.n_action = N_ACTION 29 | self.training = training 30 | self.time_step_counter = 0 31 | 32 | # normalization 33 | self.obs_norm = utils.NormalizeObservation() 34 | self.r_norm = utils.NormalizeReward() 35 | 36 | # action bounds 37 | self.max_action = MAX_ACTION 38 | self.min_action = MIN_ACTION 39 | 40 | # hyper parameters 41 | self.batch_size = PPO_BATCH_SIZE 42 | self.policy_clip = POLICY_CLIP 43 | self.target_kl = TARGET_KL 44 | self.train_freq = PPO_TRAIN_FREQ 45 | self.train_iters = PPO_TRAIN_ITERS 46 | 47 | # battery 48 | self.bat5_id = ids.get('bat5') 49 | self.bat5_p_mw = net.storage.at[self.bat5_id, 'p_mw'] 50 | self.bat5_max_e_mwh = net.storage.at[self.bat5_id, 'max_e_mwh'] 51 | self.bat10_id = ids.get('bat10') 52 | self.bat10_p_mw = net.storage.at[self.bat10_id, 'p_mw'] 53 | self.bat10_max_e_mwh = net.storage.at[self.bat10_id, 'max_e_mwh'] 54 | 55 | # other elements 56 | self.pv3_id = ids.get('pv3') 57 | self.pv4_id = ids.get('pv4') 58 | self.pv5_id = ids.get('pv5') 59 | self.pv6_id = ids.get('pv6') 60 | self.pv8_id = ids.get('pv8') 61 | self.pv9_id = ids.get('pv9') 62 | self.loadr1_id = ids.get('load_r1') 63 | self.loadr3_id = ids.get('Load_r3') 64 | self.loadr4_id = ids.get('Load_r4') 65 | self.loadr5_id = ids.get('Load_r5') 66 | self.loadr6_id = ids.get('Load_r6') 67 | self.loadr8_id = ids.get('Load_r8') 68 | self.loadr10_id = ids.get('Load_r10') 69 | self.loadr11_id = ids.get('Load_r11') 70 | self.trafo0_id = ids.get('trafo0') # PCC trafo 71 | 72 | # internal states 73 | self.state = None 74 | self.state_value = None 75 | self.action_logprob = None 76 | self.bat5_soc = 0. 77 | self.bat10_soc = 0. 
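        # Episode bookkeeping: rewards and costs accumulate over one time-series run,
        # while state, state_value and action_logprob hold the pending transition
        # until finalize_step() writes it into the on-policy rollout buffer.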
78 | self.rewards = [] 79 | self.costs = [] 80 | self.last_time_step = None 81 | self.applied = False 82 | 83 | self.history = { 84 | 'price': [], 85 | 'excess': [], 86 | 'nn_bat5_p_mw': [], 87 | 'nn_bat10_p_mw': [], 88 | 'bat5_p_mw': [], 89 | 'bat10_p_mw': [], 90 | 'bat5_soc': [], 91 | 'bat10_soc': []} 92 | 93 | # buffer 94 | self.buffer = Buffer(self.train_freq, STATE_SEQ_SHAPE, STATE_FNN_SHAPE, N_ACTION) 95 | 96 | # networks 97 | self.sequence_model_type = sequence_model_type 98 | self.actor = get_pi_actor(SequenceModel(sequence_model_type, activation='tanh'), ActorPiModel(self.n_action)) 99 | self.actor_dist = tfp.distributions.Normal 100 | self.actor.compile(optimizer=Adam(lr_actor)) 101 | self.critic = get_v_critic(SequenceModel(sequence_model_type, activation='tanh'), CriticVModel()) 102 | self.critic.compile(optimizer=Adam(lr_critic)) 103 | 104 | @tf.function 105 | def cal_kl(self, state_seq_buffer, state_fnn_buffer, action_buffer, action_logprob_buffer): 106 | action_means, action_stds = self.actor([state_seq_buffer, state_fnn_buffer]) 107 | action_dists = self.actor_dist(action_means, action_stds) 108 | action_logprobs = self.cal_logprob(action_dists, action_buffer) 109 | kl = tf.math.reduce_mean(action_logprob_buffer - action_logprobs) 110 | return kl 111 | 112 | @tf.function 113 | def cal_logprob(self, dist, a): 114 | a_logprob = dist.log_prob(a) 115 | a_logprob = tf.math.reduce_sum(a_logprob, axis=-1, keepdims=True) 116 | return a_logprob 117 | 118 | def cal_reward(self, net, t, bat5_soc_prev, bat10_soc_prev) -> float: 119 | price = self.price_profile['price'][t] 120 | cost, normalized_cost = utils.cal_cost( 121 | price=price, 122 | pcc_p_mw=-net.res_trafo.at[self.trafo0_id, 'p_lv_mw'], 123 | bat5_soc_now=self.bat5_soc, 124 | bat5_soc_prev=bat5_soc_prev, 125 | bat10_soc_now=self.bat10_soc, 126 | bat10_soc_prev=bat10_soc_prev 127 | ) 128 | reward = -normalized_cost 129 | 130 | return cost, reward 131 | 132 | def control_step(self, net): 133 | net.storage.at[self.bat5_id, 'p_mw'] = self.bat5_p_mw 134 | net.storage.at[self.bat10_id, 'p_mw'] = self.bat10_p_mw 135 | self.applied = True 136 | 137 | def finalize_step(self, net, t): 138 | super().finalize_step(net, t) 139 | # update soc 140 | bat5_soc_prev = self.bat5_soc 141 | bat10_soc_prev = self.bat10_soc 142 | self.bat5_soc += self.bat5_p_mw * HOUR_PER_TIME_STEP / self.bat5_max_e_mwh 143 | self.bat10_soc += self.bat10_p_mw * HOUR_PER_TIME_STEP / self.bat10_max_e_mwh 144 | 145 | # reward 146 | cost, reward = self.cal_reward(net, t, bat5_soc_prev, bat10_soc_prev) 147 | self.rewards.append(reward) 148 | self.costs.append(cost) 149 | 150 | if not self.training: 151 | return 152 | 153 | # store transition 154 | state_seq, state_fnn = self.obs_norm.normalize(self.state) 155 | reward = self.r_norm.normalize(reward) 156 | self.buffer.store_transition(state_seq, state_fnn, self.action, reward, self.state_value, self.action_logprob) 157 | self.time_step_counter += 1 158 | 159 | if self.time_step_counter % self.train_freq != 0: 160 | return 161 | 162 | # finish trajectory 163 | state = self.get_state(net, t+1) 164 | state_seq, state_fnn = self.obs_norm.normalize(state) 165 | state_seq = tf.expand_dims(tf.convert_to_tensor(state_seq, dtype=tf.float32), axis=0) 166 | state_fnn = tf.expand_dims(tf.convert_to_tensor(state_fnn, dtype=tf.float32), axis=0) 167 | last_value = self.critic([state_seq, state_fnn]) 168 | last_value = tf.squeeze(last_value).numpy() 169 | self.buffer.finish_trajectory(last_value) 170 | 171 | self.learn() 172 | 173 | def 
get_state(self, net, t): 174 | state_seq = np.zeros(self.state_seq_shape) 175 | state_fnn = np.zeros(self.state_fnn_shape) 176 | 177 | for i in range(SEQ_LENGTH): 178 | state_seq[i, 0] = self.pv_profile['pv3'][t + i] 179 | state_seq[i, 1] = self.pv_profile['pv4'][t + i] 180 | state_seq[i, 2] = self.pv_profile['pv5'][t + i] 181 | state_seq[i, 3] = self.pv_profile['pv6'][t + i] 182 | state_seq[i, 4] = self.pv_profile['pv8'][t + i] 183 | state_seq[i, 5] = self.pv_profile['pv9'][t + i] 184 | state_seq[i, 6] = self.pv_profile['pv10'][t + i] 185 | state_seq[i, 7] = self.pv_profile['pv11'][t + i] 186 | state_seq[i, 8] = self.wt_profile['wt7'][t + i] 187 | state_seq[i, 9] = self.load_profile['load_r1'][t + i] 188 | state_seq[i, 10] = self.load_profile['load_r3'][t + i] 189 | state_seq[i, 11] = self.load_profile['load_r4'][t + i] 190 | state_seq[i, 12] = self.load_profile['load_r5'][t + i] 191 | state_seq[i, 13] = self.load_profile['load_r6'][t + i] 192 | state_seq[i, 14] = self.load_profile['load_r8'][t + i] 193 | state_seq[i, 15] = self.load_profile['load_r10'][t + i] 194 | state_seq[i, 16] = self.load_profile['load_r11'][t + i] 195 | state_seq[i, 17] = self.price_profile['price'][t + i] 196 | # state_seq[i, 0] = utils.get_excess(self.pv_profile, self.wt_profile, self.load_profile, t+i) 197 | # state_seq[i, 1] = self.price_profile['price'][t + i] 198 | 199 | state_fnn[0] = self.bat5_soc 200 | state_fnn[1] = self.bat10_soc 201 | 202 | return state_seq, state_fnn 203 | 204 | def is_converged(self, net) -> bool: 205 | return self.applied 206 | 207 | def learn(self): 208 | kl = 0. 209 | for _ in range(self.train_iters): 210 | state_seq_buffer, \ 211 | state_fnn_buffer, \ 212 | action_buffer, \ 213 | action_logprob_buffer, \ 214 | return_buffer, \ 215 | advantage_buffer, \ 216 | batches = self.buffer.sample(self.batch_size) 217 | 218 | # batch update 219 | for batch in batches: 220 | state_seq_batch = tf.convert_to_tensor(state_seq_buffer[batch], dtype=tf.float32) 221 | state_fnn_batch = tf.convert_to_tensor(state_fnn_buffer[batch], dtype=tf.float32) 222 | action_batch = tf.convert_to_tensor(action_buffer[batch], dtype=tf.float32) 223 | action_logprob_batch = tf.convert_to_tensor(action_logprob_buffer[batch], dtype=tf.float32) 224 | return_batch = tf.convert_to_tensor(return_buffer[batch], dtype=tf.float32) 225 | advantage_batch = tf.convert_to_tensor(advantage_buffer[batch], dtype=tf.float32) 226 | 227 | if kl < 1.5 * self.target_kl: 228 | actor_loss = self.update_actor(state_seq_batch, state_fnn_batch, action_batch, action_logprob_batch, advantage_batch) 229 | critic_loss = self.update_critic(state_seq_batch, state_fnn_batch, return_batch) 230 | 231 | # kl divergence 232 | if kl < 1.5 * self.target_kl: 233 | state_seq_buffer = tf.convert_to_tensor(state_seq_buffer, dtype=tf.float32) 234 | state_fnn_buffer = tf.convert_to_tensor(state_fnn_buffer, dtype=tf.float32) 235 | action_buffer = tf.convert_to_tensor(action_buffer, dtype=tf.float32) 236 | action_logprob_buffer = tf.convert_to_tensor(action_logprob_buffer, dtype=tf.float32) 237 | kl = self.cal_kl(state_seq_buffer, state_fnn_buffer, action_buffer, action_logprob_buffer) 238 | 239 | utils.log_actor_critic_info(actor_loss, critic_loss) 240 | self.buffer.clear() 241 | 242 | def load(self, run=1): 243 | self.obs_norm.load(dir=os.path.join('.', 'rms', 'PPO', self.sequence_model_type, str(run))) 244 | self.load_models(dir=os.path.join('.', 'model_weights', 'PPO', self.sequence_model_type, str(run))) 245 | 246 | def load_models(self, dir): 247 | 
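        # Weights are stored per run under ./model_weights/PPO/<sequence_model_type>/<run>/
        # (see load() above), with separate actor and critic weight files.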
print('... Loading Models ...') 248 | self.actor.load_weights(os.path.join(dir, 'actor_weights')) 249 | self.critic.load_weights(os.path.join(dir, 'critic_weights')) 250 | 251 | def model_info(self): 252 | self.actor.summary() 253 | self.critic.summary() 254 | 255 | def policy(self, net, state): 256 | state_seq, state_fnn = self.obs_norm.normalize(state, update=False) 257 | state_seq = tf.expand_dims(tf.convert_to_tensor(state_seq, dtype=tf.float32), axis=0) 258 | state_fnn = tf.expand_dims(tf.convert_to_tensor(state_fnn, dtype=tf.float32), axis=0) 259 | 260 | action_mean, action_std = self.actor([state_seq, state_fnn]) 261 | action_std = action_std if self.training else 0. 262 | action_dist = self.actor_dist(action_mean, action_std) 263 | nn_action = action_dist.sample() 264 | nn_action = np.clip(nn_action, -NN_BOUND, NN_BOUND) 265 | action_logprob = self.cal_logprob(action_dist, nn_action) 266 | state_value = self.critic([state_seq, state_fnn]) 267 | 268 | # remove batch dim 269 | nn_action = tf.squeeze(nn_action, axis=0).numpy() 270 | # reduce to single value 271 | action_logprob = tf.squeeze(action_logprob).numpy() 272 | state_value = tf.squeeze(state_value).numpy() 273 | 274 | # mg action 275 | p_b5_min = max(P_B5_MIN, (SOC_MIN - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP) 276 | p_b5_max = min(P_B5_MAX, (SOC_MAX - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP) 277 | p_b10_min = max(P_B10_MIN, (SOC_MIN - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP) 278 | p_b10_max = min(P_B10_MAX, (SOC_MAX - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP) 279 | 280 | # invalid action masking 281 | self.min_action[ACTION_IDX.get('p_b5')] = p_b5_min 282 | self.min_action[ACTION_IDX.get('p_b10')] = p_b10_min 283 | self.max_action[ACTION_IDX.get('p_b5')] = p_b5_max 284 | self.max_action[ACTION_IDX.get('p_b10')] = p_b10_max 285 | mg_action = utils.scale_to_mg(nn_action, self.min_action, self.max_action) 286 | 287 | return mg_action, nn_action, action_logprob, state_value 288 | 289 | def reset(self): 290 | # reset internal states 291 | self.state = None 292 | self.state_value = None 293 | self.action_logprob = None 294 | self.bat5_soc = 0. 295 | self.bat10_soc = 0. 296 | self.rewards = [] 297 | self.costs = [] 298 | self.last_time_step = None 299 | self.applied = False 300 | self.history = { 301 | 'price': [], 302 | 'excess': [], 303 | 'nn_bat5_p_mw': [], 304 | 'nn_bat10_p_mw': [], 305 | 'bat5_p_mw': [], 306 | 'bat10_p_mw': [], 307 | 'bat5_soc': [], 308 | 'bat10_soc': []} 309 | 310 | def save(self, run=1): 311 | self.obs_norm.save(dir=os.path.join('.', 'rms', 'PPO', self.sequence_model_type, str(run))) 312 | self.save_models(dir=os.path.join('.', 'model_weights', 'PPO', self.sequence_model_type, str(run))) 313 | 314 | def save_models(self, dir): 315 | print('... 
Saving Models ...') 316 | self.actor.save_weights(os.path.join(dir, 'actor_weights')) 317 | self.critic.save_weights(os.path.join(dir, 'critic_weights')) 318 | 319 | @tf.function 320 | def update_actor(self, state_seq_batch, state_fnn_batch, action_batch, action_logprob_batch, advantage_batch): 321 | with tf.GradientTape() as tape: 322 | action_means, action_stds = self.actor([state_seq_batch, state_fnn_batch]) 323 | action_dists = self.actor_dist(action_means, action_stds) 324 | action_logprobs = self.cal_logprob(action_dists, action_batch) 325 | prob_ratio = tf.math.exp(action_logprobs - action_logprob_batch) 326 | 327 | surrogate_obj = prob_ratio * advantage_batch 328 | clipped_surrogate_obj = tf.clip_by_value(prob_ratio, 1-self.policy_clip, 1+self.policy_clip) * advantage_batch 329 | actor_loss = -tf.math.reduce_mean( 330 | tf.math.minimum(surrogate_obj, clipped_surrogate_obj) 331 | ) 332 | actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables) 333 | self.actor.optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables)) 334 | 335 | return actor_loss 336 | 337 | @tf.function 338 | def update_critic(self, state_seq_batch, state_fnn_batch, return_batch): 339 | huber_loss = keras.losses.Huber() 340 | with tf.GradientTape() as tape: 341 | critic_values = self.critic([state_seq_batch, state_fnn_batch]) 342 | critic_loss = huber_loss(return_batch, critic_values) 343 | critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables) 344 | self.critic.optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables)) 345 | 346 | return critic_loss 347 | 348 | @tf.function 349 | def update_sequence_model(self, state_seq_batch, state_fnn_batch, return_batch): 350 | huber_loss = keras.losses.Huber() 351 | with tf.GradientTape() as tape: 352 | critic_values = self.critic([state_seq_batch, state_fnn_batch]) 353 | critic_loss = huber_loss(return_batch, critic_values) 354 | seq_grads = tape.gradient(critic_loss, self.sequence_model.trainable_variables) 355 | seq_grads = [tf.clip_by_norm(g, 1.0) for g in seq_grads] 356 | self.sequence_model.optimizer.apply_gradients(zip(seq_grads, self.sequence_model.trainable_variables)) 357 | 358 | def time_step(self, net, t): 359 | # action selection 360 | self.state = self.get_state(net, t) 361 | mg_action, nn_action, self.action_logprob, self.state_value = self.policy(net, self.state) 362 | self.bat5_p_mw, self.bat10_p_mw = mg_action 363 | self.action = nn_action 364 | # utils.log_trans_info(self.state, nn_action, t) 365 | 366 | # history 367 | self.history['price'].append(round(self.price_profile['price'][t], 3)) 368 | excess = net.res_sgen['p_mw'].sum() - net.res_load['p_mw'].sum() 369 | self.history['excess'].append(round(excess, 3)) 370 | self.history['nn_bat5_p_mw'].append(round(self.action[ACTION_IDX['p_b5']], 3)) 371 | self.history['nn_bat10_p_mw'].append(round(self.action[ACTION_IDX['p_b10']], 3)) 372 | self.history['bat5_p_mw'].append(round(self.bat5_p_mw, 3)) 373 | self.history['bat10_p_mw'].append(round(self.bat10_p_mw, 3)) 374 | self.history['bat5_soc'].append(round(self.bat5_soc, 3)) 375 | self.history['bat10_soc'].append(round(self.bat10_soc, 3)) 376 | 377 | self.applied = False 378 | self.last_time_step = t -------------------------------------------------------------------------------- /controllers/td3_controller.py: -------------------------------------------------------------------------------- 1 | ''' 2 | class: 3 | - TD3Agent 4 | - adapt_param_noise() 5 | - adjust_action_noise() 6 | - 
calculate_distance() 7 | - calculate_reward() 8 | - control_step() 9 | - get_state() 10 | - is_converged() 11 | - learn() 12 | - load_models() 13 | - model_info() 14 | - perturb_policy() 15 | - policy() 16 | - reset() 17 | - save_models() 18 | - update_actor() 19 | - update_critics() 20 | - update_sequence_model() 21 | - update_target_networks() 22 | - time_step() 23 | ''' 24 | 25 | import logging 26 | import os 27 | from typing import Dict 28 | import numpy as np 29 | 30 | import tensorflow as tf 31 | import tensorflow.keras as keras 32 | from tensorflow.keras.optimizers import Adam 33 | 34 | from pandapower.control.basic_controller import Controller 35 | from controllers.models import ActorMuModel, CriticQModel, SequenceModel, get_mu_actor, get_q_critic 36 | from controllers.buffer import ReplayBuffer, PrioritizedReplayBuffer 37 | from setting import * 38 | import utils 39 | 40 | class TD3Agent(Controller): 41 | def __init__(self, net, ids, pv_profile, wt_profile, load_profile, price_profile, 42 | noise_type = 'action', sequence_model_type='none', use_pretrained_sequence_model=False, 43 | n_epochs=None, training=False, 44 | delay=2, gamma=GAMMA, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, 45 | buffer_size=50000, batch_size=128, epsilon_p=0.001, **kwargs): 46 | super().__init__(net, **kwargs) 47 | self.ids = ids 48 | self.pv_profile = pv_profile 49 | self.wt_profile = wt_profile 50 | self.load_profile = load_profile 51 | self.price_profile = price_profile 52 | self.state_seq_shape = STATE_SEQ_SHAPE 53 | self.state_fnn_shape = STATE_FNN_SHAPE 54 | self.n_action = N_ACTION 55 | self.use_pretrained_sequence_model = use_pretrained_sequence_model 56 | self.training = training 57 | self.noise_type = noise_type 58 | self.action_noise_scale = ACTION_NOISE_SCALE 59 | self.action_noise_scale_ = ACTION_NOISE_SCALE 60 | self.param_noise_adapt_rate = PARAM_NOISE_ADAPT_RATE 61 | self.param_noise_bound = PARAM_NOISE_BOUND 62 | self.param_noise_scale = PARAM_NOISE_SCALE 63 | self.n_epochs = n_epochs 64 | self.update_freq = UPDATE_FREQ 65 | self.update_times = UPDATE_TIMES 66 | self.warmup = WARMUP 67 | self.delay = delay 68 | self.gamma = gamma 69 | self.batch_size = batch_size 70 | self.epsilon_p = epsilon_p 71 | 72 | # counter 73 | self.time_step_counter = 0 74 | self.learn_step_counter = 0 75 | 76 | # normalization 77 | self.obs_norm = utils.NormalizeObservation() 78 | self.a_norm = utils.NormalizeAction() 79 | self.r_norm = utils.NormalizeReward() 80 | 81 | # action bounds 82 | self.max_action = MAX_ACTION 83 | self.min_action = MIN_ACTION 84 | 85 | # generator 86 | # self.mgt5_id = ids.get('mgt5') 87 | # self.mgt5_p_mw = net.sgen.at[self.mgt5_id, 'p_mw'] 88 | # self.mgt9_id = ids.get('mgt9') 89 | # self.mgt9_p_mw = net.sgen.at[self.mgt9_id, 'p_mw'] 90 | # self.mgt10_id = ids.get('mgt10') 91 | # self.mgt10_p_mw = net.sgen.at[self.mgt10_id, 'p_mw'] 92 | 93 | # battery 94 | self.bat5_id = ids.get('bat5') 95 | self.bat5_p_mw = net.storage.at[self.bat5_id, 'p_mw'] 96 | self.bat5_max_e_mwh = net.storage.at[self.bat5_id, 'max_e_mwh'] 97 | self.bat10_id = ids.get('bat10') 98 | self.bat10_p_mw = net.storage.at[self.bat10_id, 'p_mw'] 99 | self.bat10_max_e_mwh = net.storage.at[self.bat10_id, 'max_e_mwh'] 100 | 101 | # internal states 102 | self.prev_state = None 103 | self.bat5_soc = 0. 104 | self.bat10_soc = 0. 
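        # Transition bookkeeping for off-policy learning: prev_state and action hold
        # the (s, a) pair chosen in time_step(); finalize_step() then observes the
        # next state and reward and stores the normalized transition in the
        # prioritized replay buffer.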
105 | self.action = None 106 | self.rewards = [] 107 | self.costs = [] 108 | self.history = { 109 | 'price': [], 110 | # 'mgt5_p_mw': [round(self.mgt5_p_mw, 3)], 111 | # 'mgt9_p_mw': [round(self.mgt9_p_mw, 3)], 112 | # 'mgt10_p_mw': [round(self.mgt10_p_mw, 3)], 113 | 'excess': [], 114 | 'nn_bat5_p_mw': [], 115 | 'nn_bat10_p_mw': [], 116 | 'bat5_p_mw': [], 117 | 'bat10_p_mw': [], 118 | 'bat5_soc': [], 119 | 'bat10_soc': []} 120 | self.last_time_step = None 121 | self.applied = False 122 | 123 | # other elements 124 | self.pv3_id = ids.get('pv3') 125 | self.pv4_id = ids.get('pv4') 126 | self.pv5_id = ids.get('pv5') 127 | self.pv6_id = ids.get('pv6') 128 | self.pv8_id = ids.get('pv8') 129 | self.pv9_id = ids.get('pv9') 130 | self.loadr1_id = ids.get('load_r1') 131 | self.loadr3_id = ids.get('Load_r3') 132 | self.loadr4_id = ids.get('Load_r4') 133 | self.loadr5_id = ids.get('Load_r5') 134 | self.loadr6_id = ids.get('Load_r6') 135 | self.loadr8_id = ids.get('Load_r8') 136 | self.loadr10_id = ids.get('Load_r10') 137 | self.loadr11_id = ids.get('Load_r11') 138 | self.trafo0_id = ids.get('trafo0') # PCC trafo 139 | 140 | # buffer 141 | # self.buffer = ReplayBuffer(buffer_size, self.state_seq_shape, self.state_fnn_shape, self.n_action) 142 | self.buffer = PrioritizedReplayBuffer(buffer_size, self.state_seq_shape, self.state_fnn_shape, self.n_action) 143 | 144 | # models 145 | self.sequence_model_type = sequence_model_type 146 | if self.sequence_model_type == 'none': 147 | # actor critic 148 | self.actor = get_mu_actor(SequenceModel(sequence_model_type, name='sequence_model'), ActorMuModel(self.n_action)) 149 | self.perturbed_actor = get_mu_actor(SequenceModel(sequence_model_type, name='sequence_model'), ActorMuModel(self.n_action)) 150 | self.target_actor = get_mu_actor(SequenceModel(sequence_model_type, name='sequence_model'), ActorMuModel(self.n_action)) 151 | self.critic1 = get_q_critic(SequenceModel(sequence_model_type, name='sequence_model'), CriticQModel()) 152 | self.critic2 = get_q_critic(SequenceModel(sequence_model_type, name='sequence_model'), CriticQModel()) 153 | self.target_critic1 = get_q_critic(SequenceModel(sequence_model_type, name='sequence_model'), CriticQModel()) 154 | self.target_critic2 = get_q_critic(SequenceModel(sequence_model_type, name='sequence_model'), CriticQModel()) 155 | else: 156 | # sequence model 157 | self.sequence_model = SequenceModel(sequence_model_type, name='sequence_model') 158 | self.sequence_model.compile(optimizer=Adam(learning_rate=lr_critic, epsilon=1e-5)) 159 | 160 | # actor critic 161 | self.actor = get_mu_actor(self.sequence_model, ActorMuModel(self.n_action)) 162 | self.perturbed_actor = get_mu_actor(self.sequence_model, ActorMuModel(self.n_action)) 163 | self.target_actor = get_mu_actor(self.sequence_model, ActorMuModel(self.n_action)) 164 | self.critic1 = get_q_critic(self.sequence_model, CriticQModel()) 165 | self.critic2 = get_q_critic(self.sequence_model, CriticQModel()) 166 | self.target_critic1 = get_q_critic(self.sequence_model, CriticQModel()) 167 | self.target_critic2 = get_q_critic(self.sequence_model, CriticQModel()) 168 | 169 | self.actor.compile(optimizer=Adam(learning_rate=lr_actor, epsilon=1e-5)) 170 | self.critic1.compile(optimizer=Adam(learning_rate=lr_critic, epsilon=1e-5)) 171 | self.critic2.compile(optimizer=Adam(learning_rate=lr_critic, epsilon=1e-5)) 172 | 173 | # initialization 174 | if self.training: 175 | if self.use_pretrained_sequence_model: 176 | file_path = os.path.join('.', 'pretrained_sequence_model', 
self.sequence_model_type, 'pretrained_sequence_model_weights.hdf5') 177 | self.sequence_model.load_weights(file_path, by_name=True) 178 | self.perturbed_actor.set_weights(self.actor.get_weights()) 179 | self.update_target_networks(tau=1) 180 | 181 | def adapt_param_noise(self, d): 182 | if d <= self.param_noise_bound: 183 | self.param_noise_scale *= self.param_noise_adapt_rate 184 | else: 185 | self.param_noise_scale /= self.param_noise_adapt_rate 186 | 187 | def adjust_action_noise(self, adjust=True): 188 | if not adjust: 189 | return 190 | else: 191 | self.action_noise_scale -= self.action_noise_scale_ / self.n_epochs 192 | self.action_noise_scale = max(self.action_noise_scale, 0.1) 193 | 194 | # distance for parameter noise 195 | @tf.function 196 | def calculate_distance(self, state_seq_batch, state_fnn_batch): 197 | actions = self.actor([state_seq_batch, state_fnn_batch]) 198 | perturbed_actions = self.perturbed_actor([state_seq_batch, state_fnn_batch]) 199 | d = tf.math.square(actions - perturbed_actions) 200 | d = tf.math.reduce_mean(d) 201 | d = tf.math.sqrt(d) 202 | return d 203 | 204 | def calculate_reward(self, net, t): 205 | price = self.price_profile['price'][t - 1] 206 | cost, normalized_cost = utils.cal_cost( 207 | price=price, 208 | pcc_p_mw=-net.res_trafo.at[self.trafo0_id, 'p_lv_mw'], 209 | # mgt5_p_mw=self.mgt5_p_mw, 210 | # mgt9_p_mw=self.mgt9_p_mw, 211 | # mgt10_p_mw=self.mgt10_p_mw, 212 | bat5_soc_now=self.bat5_soc, 213 | bat5_soc_prev=self.prev_state[1][0], 214 | bat10_soc_now=self.bat10_soc, 215 | bat10_soc_prev=self.prev_state[1][1], 216 | # ids=self.ids, 217 | # t=t, 218 | # net=net 219 | ) 220 | reward = -normalized_cost 221 | 222 | # invalid action penalty 223 | # nn_bat_p_mw = self.action * np.array([P_B5_MAX, P_B10_MAX]) 224 | # valid_bat_p_mw = np.array([self.bat5_p_mw, self.bat10_p_mw]) 225 | # extra_reward = utils.extra_reward(nn_bat_p_mw, valid_bat_p_mw) 226 | # reward += extra_reward 227 | 228 | return cost, reward 229 | 230 | def control_step(self, net): 231 | # net.sgen.at[self.mgt5_id, 'p_mw'] = self.mgt5_p_mw 232 | # net.sgen.at[self.mgt9_id, 'p_mw'] = self.mgt9_p_mw 233 | # net.sgen.at[self.mgt10_id, 'p_mw'] = self.mgt10_p_mw 234 | net.storage.at[self.bat5_id, 'p_mw'] = self.bat5_p_mw 235 | net.storage.at[self.bat10_id, 'p_mw'] = self.bat10_p_mw 236 | self.applied = True 237 | 238 | def finalize_step(self, net, t): 239 | super().finalize_step(net, t) 240 | 241 | # next time step 242 | t += 1 243 | 244 | # update soc 245 | self.bat5_soc += self.bat5_p_mw * HOUR_PER_TIME_STEP / self.bat5_max_e_mwh 246 | self.bat10_soc += self.bat10_p_mw * HOUR_PER_TIME_STEP / self.bat10_max_e_mwh 247 | 248 | # observe transition 249 | state = self.get_state(net, t) 250 | cost, reward = self.calculate_reward(net, t) 251 | self.rewards.append(reward) 252 | self.costs.append(cost) 253 | 254 | if self.training: 255 | # store transition 256 | normalized_prev_state = self.obs_norm.normalize(self.prev_state) 257 | normalized_state = self.obs_norm.normalize(state) 258 | normalized_action = self.a_norm.normalize(self.action) 259 | normalized_reward = self.r_norm.normalize(reward) 260 | self.buffer.store_transition(normalized_prev_state[0], normalized_prev_state[1], normalized_action, normalized_reward, normalized_state[0], normalized_state[1]) 261 | 262 | # update networks 263 | self.learn() 264 | 265 | def get_state(self, net, t): 266 | state_seq = np.zeros(self.state_seq_shape) 267 | state_fnn = np.zeros(self.state_fnn_shape) 268 | 269 | for i in range(SEQ_LENGTH): 270 | 
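            # Sequence part of the observation: a SEQ_LENGTH-step window of PV, wind,
            # load and price profile values for time steps t .. t+SEQ_LENGTH-1, which
            # feeds the sequence model (conv1d/rnn/transformer) shared by the actor
            # and critics.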
state_seq[i, 0] = self.pv_profile['pv3'][t + i] 271 | state_seq[i, 1] = self.pv_profile['pv4'][t + i] 272 | state_seq[i, 2] = self.pv_profile['pv5'][t + i] 273 | state_seq[i, 3] = self.pv_profile['pv6'][t + i] 274 | state_seq[i, 4] = self.pv_profile['pv8'][t + i] 275 | state_seq[i, 5] = self.pv_profile['pv9'][t + i] 276 | state_seq[i, 6] = self.pv_profile['pv10'][t + i] 277 | state_seq[i, 7] = self.pv_profile['pv11'][t + i] 278 | state_seq[i, 8] = self.wt_profile['wt7'][t + i] 279 | state_seq[i, 9] = self.load_profile['load_r1'][t + i] 280 | state_seq[i, 10] = self.load_profile['load_r3'][t + i] 281 | state_seq[i, 11] = self.load_profile['load_r4'][t + i] 282 | state_seq[i, 12] = self.load_profile['load_r5'][t + i] 283 | state_seq[i, 13] = self.load_profile['load_r6'][t + i] 284 | state_seq[i, 14] = self.load_profile['load_r8'][t + i] 285 | state_seq[i, 15] = self.load_profile['load_r10'][t + i] 286 | state_seq[i, 16] = self.load_profile['load_r11'][t + i] 287 | # state_seq[i, 17] = utils.get_excess(self.pv_profile, self.wt_profile, self.load_profile, t+i) 288 | state_seq[i, 17] = self.price_profile['price'][t + i] 289 | 290 | # state_seq[i, 0] = utils.get_excess(self.pv_profile, self.wt_profile, self.load_profile, t+i) 291 | # state_seq[i, 1] = self.price_profile['price'][t + i] 292 | 293 | state_fnn[0] = self.bat5_soc 294 | state_fnn[1] = self.bat10_soc 295 | 296 | return state_seq, state_fnn 297 | 298 | def is_converged(self, net) -> bool: 299 | return self.applied 300 | 301 | def learn(self): 302 | if self.buffer.buffer_counter < self.batch_size: 303 | return 304 | 305 | if self.buffer.buffer_counter < self.warmup: 306 | return 307 | 308 | if self.time_step_counter % self.update_freq != 0: 309 | return 310 | 311 | for _ in range(self.update_times): 312 | self.update() 313 | 314 | def load_models(self, dir='model_weights', run=1): 315 | print('... 
Loading Models ...') 316 | self.actor.load_weights(os.path.join(dir, self.sequence_model_type, str(run), 'actor_weights')) 317 | self.critic1.load_weights(os.path.join(dir, self.sequence_model_type, str(run), 'critic1_weights')) 318 | self.critic2.load_weights(os.path.join(dir, self.sequence_model_type, str(run), 'critic2_weights')) 319 | self.target_actor.load_weights(os.path.join(dir, self.sequence_model_type, str(run), 'target_actor_weights')) 320 | self.target_critic1.load_weights(os.path.join(dir, self.sequence_model_type, str(run), 'target_critic1_weights')) 321 | self.target_critic2.load_weights(os.path.join(dir, self.sequence_model_type, str(run), 'target_critic2_weights')) 322 | 323 | def model_info(self): 324 | self.actor.summary() 325 | self.critic1.summary() 326 | 327 | @tf.function 328 | def perturb_policy(self): 329 | # if self.sequence_model_type == 'none': 330 | # perturbed_actor_weights = self.perturbed_actor.trainable_weights 331 | # actor_weights = self.actor.trainable_weights 332 | # else: 333 | # perturbed_actor_weights = self.perturbed_actor.get_layer('actor_mu_model_1').trainable_weights 334 | # actor_weights = self.actor.get_layer('actor_mu_model').trainable_weights 335 | perturbed_actor_weights = self.perturbed_actor.get_layer('actor_mu_model_1').trainable_weights 336 | actor_weights = self.actor.get_layer('actor_mu_model').trainable_weights 337 | 338 | for (perturbed_weights, weights) in zip(perturbed_actor_weights, actor_weights): 339 | perturbed_weights.assign(weights + tf.random.normal(shape=tf.shape(weights), mean=0., stddev=self.param_noise_scale)) 340 | 341 | def policy(self, net, t, state): 342 | # network outputs 343 | if self.time_step_counter < self.warmup and self.training: 344 | # warmup 345 | nn_action = np.random.uniform(low=-NN_BOUND, high=NN_BOUND, size=(self.n_action,)) 346 | else: 347 | state_seq, state_fnn = self.obs_norm.normalize(state, update=False) 348 | # add batch index 349 | tf_state_seq = tf.expand_dims(tf.convert_to_tensor(state_seq, dtype=tf.float32), axis=0) 350 | tf_state_fnn = tf.expand_dims(tf.convert_to_tensor(state_fnn, dtype=tf.float32), axis=0) 351 | 352 | if self.training: 353 | # param noise 354 | if self.noise_type == 'param': 355 | tf_action = self.perturbed_actor([tf_state_seq, tf_state_fnn], training=self.training) 356 | tf_action = tf.squeeze(tf_action, axis=0) # remove batch index 357 | nn_action = tf_action.numpy() 358 | # action noise 359 | else: 360 | tf_action = self.actor([tf_state_seq, tf_state_fnn], training=self.training) 361 | tf_action = tf.squeeze(tf_action, axis=0) # remove batch index 362 | nn_action = tf_action.numpy() 363 | if t % 100 == 0: 364 | print(f'nn outputs = {nn_action}') 365 | nn_action += np.random.normal(loc=0., scale=self.action_noise_scale, size=(self.n_action,)) 366 | # testing 367 | else: 368 | tf_action = self.actor([tf_state_seq, tf_state_fnn], training=self.training) 369 | tf_action = tf.squeeze(tf_action, axis=0) # remove batch index 370 | nn_action = tf_action.numpy() 371 | nn_action = np.clip(nn_action, -NN_BOUND, NN_BOUND) 372 | 373 | # mg action 374 | p_b5_min = max(P_B5_MIN, (SOC_MIN - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP) 375 | p_b5_max = min(P_B5_MAX, (SOC_MAX - self.bat5_soc) * self.bat5_max_e_mwh / HOUR_PER_TIME_STEP) 376 | p_b10_min = max(P_B10_MIN, (SOC_MIN - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP) 377 | p_b10_max = min(P_B10_MAX, (SOC_MAX - self.bat10_soc) * self.bat10_max_e_mwh / HOUR_PER_TIME_STEP) 378 | 379 | # invalid action 
clipping 380 | # mgt5_p_mw, mgt9_p_mw, mgt10_p_mw, bat5_p_mw, bat10_p_mw = utils.scale_to_mg(nn_action, self.max_action, self.min_action) 381 | # bat5_p_mw = np.clip(bat5_p_mw, p_b5_min, p_b5_max) 382 | # bat10_p_mw = np.clip(bat10_p_mw, p_b10_min, p_b10_max) 383 | 384 | # invalid action masking 385 | self.min_action[ACTION_IDX.get('p_b5')] = p_b5_min 386 | self.min_action[ACTION_IDX.get('p_b10')] = p_b10_min 387 | self.max_action[ACTION_IDX.get('p_b5')] = p_b5_max 388 | self.max_action[ACTION_IDX.get('p_b10')] = p_b10_max 389 | bat5_p_mw, bat10_p_mw = utils.scale_to_mg(nn_action, self.min_action, self.max_action) 390 | 391 | mg_action = np.array([bat5_p_mw, bat10_p_mw]) 392 | self.time_step_counter += 1 393 | return mg_action, nn_action 394 | 395 | def reset(self): 396 | # init states 397 | # self.mgt5_p_mw = 0. 398 | # self.mgt9_p_mw = 0. 399 | # self.mgt10_p_mw = 0. 400 | self.bat5_p_mw = 0. 401 | self.bat10_p_mw = 0. 402 | self.prev_state = None 403 | self.bat5_soc = 0. 404 | self.bat10_soc = 0. 405 | self.action = None 406 | self.rewards = [] 407 | self.costs = [] 408 | self.history = { 409 | 'price': [], 410 | # 'mgt5_p_mw': [round(self.mgt5_p_mw, 3)], 411 | # 'mgt9_p_mw': [round(self.mgt9_p_mw, 3)], 412 | # 'mgt10_p_mw': [round(self.mgt10_p_mw, 3)], 413 | 'excess': [], 414 | 'nn_bat5_p_mw': [], 415 | 'nn_bat10_p_mw': [], 416 | 'bat5_p_mw': [], 417 | 'bat10_p_mw': [], 418 | 'bat5_soc': [], 419 | 'bat10_soc': []} 420 | self.last_time_step = None 421 | self.applied = False 422 | 423 | if not self.training: 424 | return 425 | 426 | # parameter scheduling 427 | if self.noise_type == 'action': 428 | self.adjust_action_noise() 429 | self.buffer.schedule_beta(beta_inc=0.01) 430 | 431 | def save_models(self, dir='model_weights', run=1): 432 | print('... 
Saving Models ...') 433 | self.actor.save_weights(os.path.join(dir, self.sequence_model_type, str(run), 'actor_weights')) 434 | self.critic1.save_weights(os.path.join(dir, self.sequence_model_type, str(run), 'critic1_weights')) 435 | self.critic2.save_weights(os.path.join(dir, self.sequence_model_type, str(run), 'critic2_weights')) 436 | self.target_actor.save_weights(os.path.join(dir, self.sequence_model_type, str(run), 'target_actor_weights')) 437 | self.target_critic1.save_weights(os.path.join(dir, self.sequence_model_type, str(run), 'target_critic1_weights')) 438 | self.target_critic2.save_weights(os.path.join(dir, self.sequence_model_type, str(run), 'target_critic2_weights')) 439 | 440 | def update(self): 441 | # sample 442 | state_seq_batch, state_fnn_batch, action_batch, reward_batch, next_state_seq_batch, next_state_fnn_batch, idxs, weights = self.buffer.sample(self.batch_size) 443 | state_seq_batch = tf.convert_to_tensor(state_seq_batch, dtype=tf.float32) 444 | state_fnn_batch = tf.convert_to_tensor(state_fnn_batch, dtype=tf.float32) 445 | action_batch = tf.convert_to_tensor(action_batch, dtype=tf.float32) 446 | reward_batch = tf.convert_to_tensor(reward_batch, dtype=tf.float32) 447 | next_state_seq_batch = tf.convert_to_tensor(next_state_seq_batch, dtype=tf.float32) 448 | next_state_fnn_batch = tf.convert_to_tensor(next_state_fnn_batch, dtype=tf.float32) 449 | weights = tf.convert_to_tensor(weights, dtype=tf.float32) 450 | 451 | # update critics 452 | critic_loss, target_values, td_errs = self.update_critics(state_seq_batch, state_fnn_batch, action_batch, reward_batch, next_state_seq_batch, next_state_fnn_batch, weights) 453 | self.update_priority(td_errs, idxs) 454 | self.learn_step_counter += 1 455 | 456 | # update sequence model 457 | if (self.sequence_model_type != 'none') and (not self.use_pretrained_sequence_model): 458 | self.update_sequence_model(state_seq_batch, state_fnn_batch, action_batch, target_values) 459 | 460 | if self.learn_step_counter % self.delay != 0: 461 | return 462 | 463 | # update actor 464 | actor_loss = self.update_actor(state_seq_batch, state_fnn_batch) 465 | 466 | # parameter noise 467 | if self.noise_type == 'param': 468 | self.perturb_policy() 469 | d = self.calculate_distance(state_seq_batch, state_fnn_batch) 470 | self.adapt_param_noise(d) 471 | 472 | # update targets 473 | self.update_target_networks() 474 | 475 | @tf.function 476 | def update_actor(self, state_seq_batch, state_fnn_batch): 477 | # trainable variables 478 | if self.sequence_model_type == 'none': 479 | actor_vars = self.actor.trainable_variables 480 | else: 481 | actor_vars = self.actor.get_layer('actor_mu_model').trainable_variables 482 | 483 | # gradient descent 484 | with tf.GradientTape() as tape: 485 | actions = self.actor([state_seq_batch, state_fnn_batch], training=True) 486 | actions = self.a_norm.tf_normalize(actions) 487 | q_values = self.critic1([state_seq_batch, state_fnn_batch, actions], training=True) 488 | actor_loss = -tf.math.reduce_mean(q_values) 489 | actor_grads = tape.gradient(actor_loss, actor_vars) 490 | self.actor.optimizer.apply_gradients(zip(actor_grads, actor_vars)) 491 | 492 | return actor_loss 493 | 494 | @tf.function 495 | def update_critics(self, state_seq_batch, state_fnn_batch, action_batch, reward_batch, next_state_seq_batch, next_state_fnn_batch, weights): 496 | # Issue: https://github.com/tensorflow/tensorflow/issues/35928 497 | # with tf.GradientTape(persistent=True) as tape: 498 | 499 | # target actions 500 | target_actions = 
self.target_actor([next_state_seq_batch, next_state_fnn_batch], training=True) 501 | target_actions += tf.clip_by_value(tf.random.normal(shape=(self.batch_size, self.n_action), stddev=0.2), -0.5, 0.5) 502 | target_actions = tf.clip_by_value(target_actions, -NN_BOUND, NN_BOUND) 503 | target_actions = self.a_norm.tf_normalize(target_actions) 504 | 505 | # target values 506 | target_q_value1 = self.target_critic1([next_state_seq_batch, next_state_fnn_batch, target_actions], training=True) 507 | target_q_value2 = self.target_critic2([next_state_seq_batch, next_state_fnn_batch, target_actions], training=True) 508 | target_values = reward_batch + self.gamma * tf.math.minimum(target_q_value1, target_q_value2) 509 | 510 | # td errors 511 | td_errs = target_values - self.critic1([state_seq_batch, state_fnn_batch, action_batch]) 512 | 513 | # trainable variables 514 | if self.sequence_model_type == 'none': 515 | critic1_vars = self.critic1.trainable_variables 516 | critic2_vars = self.critic2.trainable_variables 517 | else: 518 | critic1_vars = self.critic1.get_layer('critic_q_model').trainable_variables 519 | critic2_vars = self.critic2.get_layer('critic_q_model_1').trainable_variables 520 | 521 | huber_loss = keras.losses.Huber() 522 | # update critic model 1 523 | with tf.GradientTape() as tape1: 524 | critic_loss1 = huber_loss(weights*target_values, weights*self.critic1([state_seq_batch, state_fnn_batch, action_batch], training=True)) 525 | critic_grads1 = tape1.gradient(critic_loss1, critic1_vars) 526 | self.critic1.optimizer.apply_gradients(zip(critic_grads1, critic1_vars)) 527 | 528 | # update critic model 2 529 | with tf.GradientTape() as tape2: 530 | critic_loss2 = huber_loss(weights*target_values, weights*self.critic2([state_seq_batch, state_fnn_batch, action_batch], training=True)) 531 | critic_grads2 = tape2.gradient(critic_loss2, critic2_vars) 532 | self.critic2.optimizer.apply_gradients(zip(critic_grads2, critic2_vars)) 533 | 534 | return critic_loss1, target_values, td_errs 535 | 536 | def update_priority(self, td_errs, idxs): 537 | priorities = np.abs(td_errs.numpy().flatten()) + self.epsilon_p 538 | for idx, p in zip(idxs, priorities): 539 | self.buffer.update_tree(idx, p) 540 | 541 | @tf.function 542 | def update_sequence_model(self, state_seq_batch, state_fnn_batch, action_batch, target_values): 543 | huber_loss = keras.losses.Huber() 544 | with tf.GradientTape() as tape: 545 | critic_loss = huber_loss(target_values, self.critic1([state_seq_batch, state_fnn_batch, action_batch], training=True)) 546 | critic_loss += huber_loss(target_values, self.critic2([state_seq_batch, state_fnn_batch, action_batch], training=True)) 547 | critic_loss /= (2 * SEQ_LENGTH) 548 | seq_grads = tape.gradient(critic_loss, self.sequence_model.trainable_variables) 549 | seq_grads = [tf.clip_by_norm(g, 1.0) for g in seq_grads] 550 | self.sequence_model.optimizer.apply_gradients(zip(seq_grads, self.sequence_model.trainable_variables)) 551 | 552 | @tf.function 553 | def update_target_networks(self, tau=0.005): 554 | if self.sequence_model_type == 'none': 555 | target_actor_weights = self.target_actor.trainable_weights 556 | actor_weights = self.actor.trainable_weights 557 | target_critic1_weights = self.target_critic1.trainable_weights 558 | critic1_weights = self.critic1.trainable_weights 559 | target_critic2_weights = self.target_critic2.trainable_weights 560 | critic2_weights = self.critic2.trainable_weights 561 | else: 562 | target_actor_weights = 
self.target_actor.get_layer('actor_mu_model_2').trainable_weights 563 | actor_weights = self.actor.get_layer('actor_mu_model').trainable_weights 564 | target_critic1_weights = self.target_critic1.get_layer('critic_q_model_2').trainable_weights 565 | critic1_weights = self.critic1.get_layer('critic_q_model').trainable_weights 566 | target_critic2_weights = self.target_critic2.get_layer('critic_q_model_3').trainable_weights 567 | critic2_weights = self.critic2.get_layer('critic_q_model_1').trainable_weights 568 | 569 | # update target actor 570 | for target_weight, weight in zip(target_actor_weights, actor_weights): 571 | target_weight.assign(tau * weight + (1 - tau) * target_weight) 572 | 573 | # update target critic1 574 | for target_weight, weight in zip(target_critic1_weights, critic1_weights): 575 | target_weight.assign(tau * weight + (1 - tau) * target_weight) 576 | 577 | # update target critic2 578 | for target_weight, weight in zip(target_critic2_weights, critic2_weights): 579 | target_weight.assign(tau * weight + (1 - tau) * target_weight) 580 | 581 | def time_step(self, net, t): 582 | # action 583 | state = self.get_state(net, t) 584 | mg_action, nn_action = self.policy(net, t, state) 585 | self.bat5_p_mw, self.bat10_p_mw = mg_action 586 | self.action = nn_action 587 | self.prev_state = state 588 | if self.training: 589 | utils.log_trans_info(state, mg_action, t+5, freq=50) 590 | 591 | # history 592 | self.history['price'].append(round(self.price_profile['price'][t], 3)) 593 | excess = net.res_sgen['p_mw'].sum() - net.res_load['p_mw'].sum() 594 | self.history['excess'].append(round(excess, 3)) 595 | # self.history['mgt5_p_mw'].append(round(self.mgt5_p_mw, 5)) 596 | # self.history['mgt9_p_mw'].append(round(self.mgt9_p_mw, 5)) 597 | # self.history['mgt10_p_mw'].append(round(self.mgt10_p_mw, 5)) 598 | self.history['nn_bat5_p_mw'].append(round(self.action[ACTION_IDX['p_b5']], 3)) 599 | self.history['nn_bat10_p_mw'].append(round(self.action[ACTION_IDX['p_b10']], 3)) 600 | self.history['bat5_p_mw'].append(round(self.bat5_p_mw, 3)) 601 | self.history['bat10_p_mw'].append(round(self.bat10_p_mw, 3)) 602 | self.history['bat5_soc'].append(round(self.bat5_soc, 3)) 603 | self.history['bat10_soc'].append(round(self.bat10_soc, 3)) 604 | 605 | self.applied = False 606 | self.last_time_step = t --------------------------------------------------------------------------------