├── .gitignore
├── README.md
├── transformer_helper.py
├── rolling_and_plot_tf.py
└── transform_notebook.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | .DS_Store
3 | __pycache__/
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Transformers + TensorFlow and Pandas for SOC Estimation
 2 | 
 3 | **The Testing branch is the most up to date**
 4 | 
 5 | Repo with the Decoder implemented: [Attar's Github Repo](https://github.com/att-ar/transform_decode_soc)
 6 | 
 7 | Building a transformer neural network using TensorFlow and Transformers in Python with the goal of predicting Li-ion State of Charge based on real-time voltage, current and delta time data.
 8 | 
 9 | This transformer is composed of only the encoder layer, and it uses Batch Normalization instead of the Layer Normalization found in NLP.
10 | This was done because literature said these two changes proved significantly more effective than the NLP application of transformers.
11 | 
12 | The transformers' input will be voltage, current, delta time and previous SOC points in a batch of windowed data of shape:<br>
13 | ```(G.batch_size, G.window_size, G.num_features)```
14 | 
15 | The voltage, current and soc data will be from time: $$t - \text{windowsize} \rightarrow t$$<br>
16 | The output should be the SOC prediction at time $t + 1$ for each batch, the output shape should be `(G.batch_size, 1)`
17 | 


--------------------------------------------------------------------------------
/transformer_helper.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | import numpy as np
 3 | from os import environ
 4 | environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
 5 | 
 6 | 
 7 | def get_angles(pos, k, d: int):
 8 |     """
 9 |     Get angles to be used in the positional encoding vectors
10 | 
11 |     Arguments:
12 |         pos -- Column vector containing the positions [[0], [1], ...,[N-1]]
13 |         k --   Row vector containing the dimension span [[0, 1, 2, ..., d-1]]
14 |         d -- Encoding size
15 | 
16 |     Returns:
17 |         angles -- (pos, d) np.array
18 |     """
19 |     # Get i from dimension span k
20 |     i = k // 2
21 |     # Calculate the angles using pos, i and d
22 |     angles = pos / (10000 ** (2 * i / d))
23 | 
24 |     return angles
25 | 
26 | 
def positional_encoding(positions: int, d: int):
    """
    Precompute the full table of sinusoidal positional encodings.

    Arguments:
        positions - Maximum number of positions to be encoded
        d - Encoding size

    Returns:
        pos_encoding - (1, positions, d) float32 tensor; even encoding
                       dimensions hold sin(angle), odd ones hold cos(angle)
    """
    position_column = np.arange(positions)[:, np.newaxis]
    dimension_row = np.arange(d)[np.newaxis, :]
    table = get_angles(position_column, dimension_row, d)

    even_dims = slice(0, None, 2)   # indices 2i
    odd_dims = slice(1, None, 2)    # indices 2i+1
    table[:, even_dims] = np.sin(table[:, even_dims])
    table[:, odd_dims] = np.cos(table[:, odd_dims])

    # Add the leading axis of size one in front of (positions, d).
    batched = table[np.newaxis, :, :].reshape(1, positions, d)

    return tf.cast(batched, dtype=tf.float32)
52 | 
53 | 
def create_look_ahead_mask(sequence_length):
    """
    Returns a lower triangular matrix filled with ones.

    Row i holds ones in columns 0..i and zeros afterwards.
    NOTE(review): presumably a one marks a time step that position i is
    allowed to attend to (itself and earlier steps) -- confirm against the
    attention layer that consumes this mask.

    Arguments:
        sequence_length -- matrix size (sequence length is the number of time steps per input
                           input.shape = [batch_size, sequence_length, num_features])

    Returns:
        mask -- (sequence_length, sequence_length) float32 tensor

    >>>create_look_ahead_mask(5)
    <tf.Tensor: shape=(5, 5), dtype=float32, numpy=
    array([[1., 0., 0., 0., 0.],
           [1., 1., 0., 0., 0.],
           [1., 1., 1., 0., 0.],
           [1., 1., 1., 1., 0.],
           [1., 1., 1., 1., 1.]], dtype=float32)>
    """
    # Build the 2-D matrix directly: tf.Tensor has no `.squeeze()` method, so
    # creating a (1, n, n) tensor and squeezing it afterwards raised
    # AttributeError.
    ones = tf.ones((sequence_length, sequence_length))
    # num_lower=-1 keeps the whole lower triangle, num_upper=0 drops
    # everything above the main diagonal.
    return tf.linalg.band_part(ones, -1, 0)
77 | 


--------------------------------------------------------------------------------
/rolling_and_plot_tf.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | 
  4 | import plotly.express as px
  5 | import plotly.graph_objects as go
  6 | from plotly.subplots import make_subplots
  7 | 
  8 | from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
  9 | from sklearn.model_selection import train_test_split
 10 | 
 11 | import torch
 12 | 
 13 | 
 14 | def helper(value, j):
 15 |     '''
 16 |     helper function for data_plot()
 17 |     '''
 18 |     if value == "None":
 19 |         return None
 20 |     elif type(value) == list and j < len(value):
 21 |         return value[j]
 22 |     else:  # not a list so only one value
 23 |         if j == 0:
 24 |             return value
 25 |         else:
 26 |             return None
 27 | 
 28 | 
 29 | def data_plot(data=None, x=None, y=None,
 30 |               x_title=None, y_title=None, title=None,
 31 |               **kwargs):
 32 |     '''
 33 |     list of pandas.DataFrame, list of str, list of str, list of str, kwargs -> plotly plot object
 34 | 
 35 |     Precondition: If an argument has multiple objects, they must be in a list (can have nested lists).
 36 |                   The order of the arguments must be in the same order as the DataFrames.
 37 |                   There must be the same number of x columns as y columns passed.
 38 | 
 39 |                   ex) ocv_plot(
 40 |                       data = [df1, df2],
 41 |                       x = [ "SOC", "SOC-Dis" ],
 42 |                       y = [ "OCV", "OCV-Dis" ],
 43 |                       mode = ["lines+markers", "markers"],
 44 |                       color = ["mintcream", "darkorchid"]
 45 |                       )
 46 | 
 47 |     This function takes one or more DataFrames, columns from the respective DataFrames to be plot on x and y-axes.
 48 |     It also takes the mode of plotting desired for the DataFrames and optional keyword arguments.
 49 |     It outputs a plotly plot of the data from the columns that were passed.
 50 | 
 51 |     Parameters:
 52 |     `data` DataFrame or list of DataFrames
 53 | 
 54 |     `x` list of columns or nested lists of columns
 55 |         example of each option in order:
 56 |             x = ["SOC-Dis"]
 57 |             x = ["SOC-Dis","SOC-Chg","SOC"]
 58 |             x = [ ["Test Time (sec)","Step Time (sec)"], "Step"]
 59 |                 Test Time and Step Time are both from the same DataFrame; there must be two y columns as well.
 60 | 
 61 |     `y` list of columns or nested lists of columns
 62 |         View `x` for help
 63 | 
 64 |     `x_title` str
 65 |         the name of the x_axis to be displayed
 66 |         else None
 67 | 
 68 |     `y_title` str
 69 |         the name of the y_axis to be displayed
 70 |         else None
 71 | 
 72 |     `title` str
 73 |         The title of the Plot
 74 |         default None will not add a title
 75 | 
 76 |     **kwargs: (alphabetical order)
 77 | 
 78 |     `color` str, list of str, nested lists of str:
 79 |         same principle as above arguments,
 80 |         assigns the color of the individual data lines.
 81 |         if no value is passed for a plot, plotly will do it automatically.
 82 | 
 83 |         The 'color' property is a color and may be specified as:
 84 |           - A hex string (e.g. '#ff0000')
 85 |           - An rgb/rgba string (e.g. 'rgb(255,0,0)')
 86 |           - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
 87 |           - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
 88 |           - A named CSS color:
 89 |                 aliceblue, antiquewhite, aqua, aquamarine, azure,
 90 |                 beige, bisque, black, blanchedalmond, blue,
 91 |                 blueviolet, brown, burlywood, cadetblue,
 92 |                 chartreuse, chocolate, coral, cornflowerblue,
 93 |                 cornsilk, crimson, cyan, darkblue, darkcyan,
 94 |                 darkgoldenrod, darkgray, darkgrey, darkgreen,
 95 |                 darkkhaki, darkmagenta, darkolivegreen, darkorange,
 96 |                 darkorchid, darkred, darksalmon, darkseagreen,
 97 |                 darkslateblue, darkslategray, darkslategrey,
 98 |                 darkturquoise, darkviolet, deeppink, deepskyblue,
 99 |                 dimgray, dimgrey, dodgerblue, firebrick,
100 |                 floralwhite, forestgreen, fuchsia, gainsboro,
101 |                 ghostwhite, gold, goldenrod, gray, grey, green,
102 |                 greenyellow, honeydew, hotpink, indianred, indigo,
103 |                 ivory, khaki, lavender, lavenderblush, lawngreen,
104 |                 lemonchiffon, lightblue, lightcoral, lightcyan,
105 |                 lightgoldenrodyellow, lightgray, lightgrey,
106 |                 lightgreen, lightpink, lightsalmon, lightseagreen,
107 |                 lightskyblue, lightslategray, lightslategrey,
108 |                 lightsteelblue, lightyellow, lime, limegreen,
109 |                 linen, magenta, maroon, mediumaquamarine,
110 |                 mediumblue, mediumorchid, mediumpurple,
111 |                 mediumseagreen, mediumslateblue, mediumspringgreen,
112 |                 mediumturquoise, mediumvioletred, midnightblue,
113 |                 mintcream, mistyrose, moccasin, navajowhite, navy,
114 |                 oldlace, olive, olivedrab, orange, orangered,
115 |                 orchid, palegoldenrod, palegreen, paleturquoise,
116 |                 palevioletred, papayawhip, peachpuff, peru, pink,
117 |                 plum, powderblue, purple, red, rosybrown,
118 |                 royalblue, rebeccapurple, saddlebrown, salmon,
119 |                 sandybrown, seagreen, seashell, sienna, silver,
120 |                 skyblue, slateblue, slategray, slategrey, snow,
121 |                 springgreen, steelblue, tan, teal, thistle, tomato,
122 |                 turquoise, violet, wheat, white, whitesmoke,
123 |                 yellow, yellowgreen
124 |           - A number that will be interpreted as a color
125 |             according to scatter.marker.colorscale
126 |           - A list or array of any of the above
127 | 
128 |     `mode` str, list of str, nested lists of str:
129 |         default None: will set mode = "lines"
130 |         Note: str must be one of "lines", "markers", "lines+markers" which are self-explanatory
131 |         example of each option in order:
132 |             mode = "markers"
133 |             mode = ["lines+markers", "lines"]
134 |             mode = ["lines+markers",["lines","lines"]]
135 | 
136 |     `name` str, list of str, nested list of strs
137 |         same principle as above arguments
138 |         assigns the names of the individual data lines to be displayed in the legend
139 | 
140 |     `size` int/float, list of int/float or nested lists of int/float
141 |         same principle as above arguments
142 |         assigns the size of the individual data lines
143 |         if no value is passed, plotly will do it automatically.
144 | 
145 | 
146 |     >>>df1 = generate_ocv_pts("JMFM_12_SOC_OCV_Test_220411.txt", to_csv = False)
147 |     >>>df2 = ocv_estimate(df1, to_csv = False)
148 |     >>>data_plot(data = [df1,df2],
149 |           x=[ ["SOC-Chg","SOC-Dis"],"SOC" ],
150 |           y = [ ["OCV-Chg","OCV-Dis"], "OCV" ],
151 |           title = "JMFM-12 OCV vs. SOC Curve",
152 |           x_title = "SOC (%)",
153 |           y_title = "OCV (V)",
154 |           mode = [ ["markers","markers"] ],
155 |           color = [ ["violet","lightcoral"], "darkorchid"],
156 |           name = [ ["Charge-OCV","Discharge-OCV"], "OCV"],
157 |           size = [[4.5,4.5]]
158 |          )
159 |     figure...
160 |     '''
161 |     if type(data) == list and not pd.Series(
162 |         pd.Series([len(x), len(y)]) == len(data)
163 |     ).all():
164 |         return '''Error: x and y columns passed much match the number of DataFrames passed
165 |         Use nested lists for multiple columns from the same DataFrame
166 |         '''
167 | 
168 |     elif type(data) != list and not pd.Series(pd.Series([len(x), len(y)]) == 1).all():
169 |         return '''Error: x and y columns passed much match the number of DataFrames passed
170 |         Use nested lists for multiple columns from the same DataFrame
171 |         '''
172 | 
173 |     if "mode" in kwargs.keys():
174 |         if type(kwargs["mode"]) == list and len(kwargs["mode"]) > len(data):
175 |             return "Error: passed more modes than DataFrames"
176 | 
177 |     if "color" in kwargs.keys():
178 |         if type(kwargs["color"]) == list and len(kwargs["color"]) > len(data):
179 |             return "Error: passed more colors than DataFrames"
180 | 
181 |     if "name" in kwargs.keys():
182 |         if type(kwargs["name"]) == list and len(kwargs["name"]) > len(data):
183 |             return "Error: passed more names than DataFrames"
184 | 
185 |     if "size" in kwargs.keys():
186 |         if type(kwargs["size"]) == list and len(kwargs["size"]) > len(data):
187 |             return "Error: passed more sizes than DataFrames"
188 | 
189 |     frame = pd.DataFrame(data={"x": x, "y": y})
190 | 
191 |     for i in ["color", "mode", "name", "size"]:
192 |         frame = frame.join(
193 |             pd.Series(kwargs.get(i), name=i, dtype="object"),
194 |             how="outer")
195 | 
196 |     frame.fillna("None", inplace=True)
197 | 
198 |     figure = make_subplots(
199 |         x_title=x_title, y_title=y_title, subplot_titles=[title])
200 | 
201 |     for i in frame.index:
202 |         if type(data) == list:
203 |             use_data = data[i]
204 |         else:
205 |             use_data = data
206 | 
207 |         if type(frame["x"][i]) == list:  # y[i] must be a list
208 |             for j in range(len(x[i])):
209 |                 use_x = frame.loc[i, "x"][j]
210 |                 use_y = frame.loc[i, "y"][j]
211 | 
212 |                 use_color = helper(frame.loc[i, "color"], j)
213 |                 use_mode = helper(frame.loc[i, "mode"], j)
214 |                 use_name = helper(frame.loc[i, "name"], j)
215 |                 use_size = helper(frame.loc[i, "size"], j)
216 | 
217 |                 figure.add_trace(
218 |                     go.Scatter(
219 |                         x=use_data[use_x], y=use_data[use_y],
220 |                         mode=use_mode, marker={
221 |                             "size": use_size, "color": use_color},
222 |                         name=use_name)
223 |                 )
224 |         else:  # x[i] and y[i] are not lists
225 |             use_x = frame.loc[i, "x"]
226 |             use_y = frame.loc[i, "y"]
227 |             use_color = helper(frame.loc[i, "color"], 0)
228 |             use_mode = helper(frame.loc[i, "mode"], 0)
229 |             use_name = helper(frame.loc[i, "name"], 0)
230 |             use_size = helper(frame.loc[i, "size"], 0)
231 |             # zero is just a placholder
232 | 
233 |             figure.add_trace(
234 |                 go.Scatter(
235 |                     x=use_data[use_x], y=use_data[use_y],
236 |                     mode=use_mode, marker={
237 |                         "size": use_size, "color": use_color},
238 |                     name=use_name)
239 |             )
240 |     return figure
241 | 
242 | 
243 | # -------------------------------------------------------
244 | 
def normalize(data: pd.DataFrame, capacity: float):
    '''
    pd.DataFrame -> pd.DataFrame
    Precondition: "delta t" is removed from the DataFrame

    Normalizes the data:
    Voltage is scaled between 0 and 1 (sklearn.preprocessing.MinMaxScaler);
    Current is divided by `capacity` to become C-rate
    SOC is scaled between 0 and 1 (just divided by 100)

    The scaling is done on a copy, so the DataFrame passed in is left
    untouched; this also avoids pandas' SettingWithCopyWarning when a slice
    of another DataFrame is passed.

    Prints the per-column variance and mean of the scaled data.

    Parameters:
    `data` pd.DataFrame with "current", "voltage" and "soc" columns
    `capacity` float
        value the current is divided by
        (presumably the cell capacity in Ah -- confirm with the caller)

    Output:
        normalized pd.DataFrame
    '''
    scaled = data.copy()

    scaled["current"] = scaled["current"] / capacity
    # MinMaxScaler expects a 2-D (n_samples, 1) array; flatten the result
    # back to 1-D before storing it as a column.
    scaled["voltage"] = MinMaxScaler((0, 1)).fit_transform(
        scaled["voltage"].values.reshape(-1, 1)).ravel()
    scaled["soc"] = scaled["soc"] / 100.

    print(f'''Scaled stats:

variance:\n{scaled.var(axis = 0)},

mean:\n{scaled.mean(axis=0)}''')

    return scaled
270 | 
271 | # -------------------------------------------------------
272 | 
273 | 
def rolling_split_trial(df, window_size):
    '''
    implements rolling window sectioning

    Each row of the windowed input pairs the measured columns
    ("delta t" when present, "current", "voltage") at time t with "soc" at
    time t-1. The label of a window is the "soc" of the last time step in it.

    Returns:
        (x, y) float32 np.arrays; every window in x has window_size rows and
        y has one column
    '''
    measured = ["current", "voltage"]
    if "delta t" in df.columns:
        measured = ["delta t"] + measured

    # measurements staggered right by one, soc staggered left by one
    shifted_inputs = df[measured].iloc[1:].reset_index(drop=True)
    lagged_soc = df["soc"].iloc[:-1].reset_index(drop=True)
    table = shifted_inputs.join(lagged_soc, how="outer")

    roller = table.rolling(window=window_size,
                           min_periods=window_size - 2,
                           method="table")
    # Iterating yields one window per row; the first window_size of them
    # are skipped.
    windows = []
    for position, window in enumerate(roller):
        if position >= window_size:
            windows.append(window.values)

    # staggered right by one
    targets = df["soc"].iloc[window_size + 1:].values[:, np.newaxis]

    x_out = np.array(windows, dtype="float32")
    y_out = np.array(targets, dtype="float32")
    return x_out, y_out
300 | 
301 | 
def rolling_split(df, window_size, test_size=0.1, train=True):
    '''
    Precondition: "delta t" is not in the columns
    implements rolling window sectioning
    Three input features: current, voltage and soc, all taken from the time
    steps before the one being predicted.
    The prediction of SOC at time t uses no other information

    With train=True returns a shuffled and windowed dataset using
    sklearn.model_selection.train_test_split;
    with train=False returns the windows and labels in their original order
    as (x, y).

    Parameters:
    `window_size` int
        the number of consecutive data points needed to form a data window
    `test_size` float, greater than 0 and at most 0.2
        the ratio of data points allocated to the dev/test set
        Should never exceed 0.2
    `train` bool
        whether to shuffle and split into train and dev/test sets
    '''
    assert "delta t" not in df.columns
    assert isinstance(test_size, float)
    assert 0 < test_size <= 0.2

    # staggered left by one: the last row can only ever be a label
    history = df[["current", "voltage", "soc"]].iloc[:-1]
    roller = history.rolling(window=window_size,
                             min_periods=window_size - 2,
                             method="table")
    # Iterating yields one window per row; the first window_size of them
    # are skipped.
    windows = [window.values
               for position, window in enumerate(roller)
               if position >= window_size]

    labels = df["soc"].iloc[window_size + 1:].values

    features = np.array(windows, dtype="float32")
    targets = np.array(labels, dtype="float32")[:, np.newaxis]

    if not train:
        return (features, targets)

    return train_test_split(features,
                            targets,
                            test_size=test_size,
                            shuffle=True)
342 | 
343 | # ----------------------------------------------------------------
344 | # Validation
345 | 
def validate(model, dataloader, dev=True):
    '''
    tensorflow model, tensorflow DataSet -> pd.DataFrame, prints 2 floats and a Plotly plot

    !! Tensorflow version, not the original PyTorch version
    This function runs a tf.data.Dataset through the model and prints the max and min
    predicted SOC, it also prints a "Percent Accuracy" figure and shows a Plotly plot
    of the predictions versus the labels.
    This function outputs a pandas.DataFrame of the predictions with their corresponding labels.

    Parameters:
    `model`
        object with a `predict(dataloader, verbose=...)` method that returns a numpy array
    `dataloader`
        iterable of (features, label) batches where `label` has a `.numpy()` method;
        it is iterated a second time to collect the labels, so it has to
        yield the batches in the same order both times
    `dev` bool
        whether or not it's the developmental set
        use False if it's the entire dataset

    Output:
        pd.DataFrame with the columns "pred", "labels" and "point" (1-based row number)
    '''

    aggregate = model.predict(dataloader, verbose = 1)
    print("Max pred: ", aggregate.max(), "\tMin pred: ", aggregate.min())

    # Concatenate the label batches first and only then trim to the number
    # of predictions; slicing the list of batches did not trim any labels.
    np_labels = np.concatenate(
        [label.numpy() for _, label in dataloader], axis = 0)[:len(aggregate)]

    visualize = pd.DataFrame(data={"pred": aggregate.squeeze(),
                                   "labels": np_labels.squeeze()})

    if dev:  # if it is the dev set, the values need to be sorted by value
        visualize.sort_values("labels", inplace=True)
    # if it is the entire dataset, it is already sorted chronologically which is more important

    # reset_index returns a new DataFrame; keep it so the index follows the
    # (possibly sorted) row order instead of being silently discarded.
    visualize = visualize.reset_index(drop=True)

    visualize["point"] = list(range(1, len(visualize) + 1))
    # the 0.01 in the denominator guards against dividing by a label of 0
    print("Percent Accuracy:", np.mean(100.0 - abs((aggregate - np_labels))/(np_labels+0.01) * 100))

    fig = data_plot(data=visualize,
                    x=[["point", "point"]],
                    y=[["pred", "labels"]],
                    x_title="Data Point",
                    y_title="SOC",
                    title="Predicted vs Actual SOC",
                    name=[["predictions", "labels"]],
                    mode=[["lines", "lines"]],
                    color=[["red", "yellow"]]
                    )
    fig.show()
    return visualize
391 | 


--------------------------------------------------------------------------------
/transform_notebook.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {
   6 |     "id": "h_Iw1qCBlT-z"
   7 |    },
   8 |    "source": [
   9 |     "<a name='0'></a>\n",
  10 |     "## Import"
  11 |    ]
  12 |   },
  13 |   {
  14 |    "cell_type": "code",
  15 |    "execution_count": null,
  16 |    "metadata": {
  17 |     "id": "cqXowf9MlT-1"
  18 |    },
  19 |    "outputs": [],
  20 |    "source": [
  21 |     "from google.colab import drive\n",
  22 |     "drive.mount('/content/drive')"
  23 |    ]
  24 |   },
  25 |   {
  26 |    "cell_type": "code",
  27 |    "execution_count": null,
  28 |    "metadata": {
  29 |     "id": "eh1JdQmwlT-3"
  30 |    },
  31 |    "outputs": [],
  32 |    "source": [
  33 |     "!cp /content/drive/MyDrive/transformer_soc/rolling_and_plot_tf.py .\n",
  34 |     "!cp /content/drive/MyDrive/transformer_soc/sim_data.csv .\n",
  35 |     "!cp /content/drive/MyDrive/transformer_soc/transformer_helper.py ."
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "code",
  40 |    "execution_count": null,
  41 |    "metadata": {
  42 |     "id": "_OpwqWL2QH5G"
  43 |    },
  44 |    "outputs": [],
  45 |    "source": [
  46 |     "# from os import environ\n",
  47 |     "# environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"1\"\n",
  48 |     "# removes tensorflow warnings triggered because of Tensorflow incompatibility with my Apple M1 chip.\n",
  49 |     "# ignore this when using a non Apple Silicon device, ie. Google Colab or the likes.\n",
  50 |     "\n",
  51 |     "import tensorflow as tf\n",
  52 |     "from tensorflow.keras.layers import MultiHeadAttention, Dense, Input, Dropout, BatchNormalization\n",
  53 |     "import tensorflow.keras.backend as K\n",
  54 |     "from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n",
  55 |     "\n",
  56 |     "from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler\n",
  57 |     "from sklearn.model_selection import train_test_split\n",
  58 |     "\n",
  59 |     "from dataclasses import dataclass"
  60 |    ]
  61 |   },
  62 |   {
  63 |    "cell_type": "markdown",
  64 |    "metadata": {
  65 |     "id": "Z6BKLL9B3vIZ"
  66 |    },
  67 |    "source": [
   68 |     "The cell below is **only for TPUs**\n",
  69 |     "\n",
  70 |     "---\n",
  71 |     "\n"
  72 |    ]
  73 |   },
  74 |   {
  75 |    "cell_type": "code",
  76 |    "execution_count": null,
  77 |    "metadata": {
  78 |     "id": "WMA_zsLY3x6O"
  79 |    },
  80 |    "outputs": [],
  81 |    "source": [
  82 |     "# import os\n",
  83 |     "# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')\n",
  84 |     "# tf.config.experimental_connect_to_cluster(resolver)\n",
  85 |     "# # This is the TPU initialization code that has to be at the beginning.\n",
  86 |     "# tf.tpu.experimental.initialize_tpu_system(resolver)\n",
  87 |     "# print(\"All devices: \", tf.config.list_logical_devices('TPU'))\n",
  88 |     "\n",
  89 |     "# strategy = tf.distribute.TPUStrategy(resolver)"
  90 |    ]
  91 |   },
  92 |   {
  93 |    "cell_type": "markdown",
  94 |    "metadata": {
  95 |     "id": "0K8Ni6bD4Mge"
  96 |    },
  97 |    "source": [
  98 |     "\n",
  99 |     "\n",
 100 |     "---\n",
 101 |     "\n"
 102 |    ]
 103 |   },
 104 |   {
 105 |    "cell_type": "code",
 106 |    "execution_count": null,
 107 |    "metadata": {
 108 |     "id": "_DOA-JbhlT-4"
 109 |    },
 110 |    "outputs": [],
 111 |    "source": [
 112 |     "import numpy as np\n",
 113 |     "import pandas as pd\n",
 114 |     "\n",
 115 |     "!pip install jupyterplot\n",
 116 |     "from jupyterplot import ProgressPlot as PP\n",
 117 |     "\n",
 118 |     "from transformer_helper import *\n",
 119 |     "from rolling_and_plot_tf import data_plot, rolling_split, normalize, validate\n",
 120 |     "\n",
 121 |     "%reload_ext autoreload\n",
 122 |     "%autoreload 2"
 123 |    ]
 124 |   },
 125 |   {
 126 |    "cell_type": "markdown",
 127 |    "metadata": {
 128 |     "id": "pvorie1ElT-5"
 129 |    },
 130 |    "source": [
 131 |     "Will have to figure out how to set device to cuda in TensorFlow"
 132 |    ]
 133 |   },
 134 |   {
 135 |    "cell_type": "markdown",
 136 |    "metadata": {
 137 |     "id": "RUteRx9dlT-5"
 138 |    },
 139 |    "source": [
 140 |     "## Table of Contents\n",
 141 |     "\n",
 142 |     "- [Import](#0)\n",
 143 |     "- [JupyterPlot](#jup)\n",
 144 |     "- [Preprocessing](#win)\n",
 145 |     "- [Encoder](#enc)\n",
 146 |     "    - [Encoder Layer](#enc-lay)\n",
 147 |     "    - [Full Encoder](#full-enc)\n",
 148 |     "- [Transformer](#transform)\n",
 149 |     "- [Callbacks & Learn Rate Scheduler](#loss)\n",
 150 |     "- [Training](#train)\n",
 151 |     "- [Validate](#val)"
 152 |    ]
 153 |   },
 154 |   {
 155 |    "cell_type": "markdown",
 156 |    "metadata": {
 157 |     "id": "0EL21GdslT-5"
 158 |    },
 159 |    "source": [
 160 |     "# Literature:\n",
 161 |     "\n",
 162 |     "\n",
 163 |     "According to [A Transformer-based Framework for Multivariate Time Series Representation Learning](https://dl.acm.org/doi/abs/10.1145/3447548.3467401):\n",
 164 |     "Using **Batch Normalization is significantly more effective** for multivariate time-series than using the traditional Layer Normalization method found in NLP.\n",
 165 |     "\n",
 166 |     "In addition, according to [Deep learning approach towards accurate state of charge estimation for lithium-ion batteries using self-supervised transformer model](https://www.nature.com/articles/s41598-021-98915-8#Sec9):\n",
 167 |     "Using a transformer network while **forgoing the Decoder Layer** is more effective for the application of State-of-Charge estimation."
 168 |    ]
 169 |   },
 170 |   {
 171 |    "cell_type": "markdown",
 172 |    "metadata": {
 173 |     "id": "VG0gPyv0oDBi"
 174 |    },
 175 |    "source": [
 176 |     "$\\large{Self\\ Attention}$\n",
 177 |     "$$\n",
 178 |     "\\text { Attention }(Q, K, V)=\\operatorname{softmax}\\left(\\frac{Q K^{T}}{\\sqrt{d_{k}}}+{M}\\right) V\n",
 179 |     "$$"
 180 |    ]
 181 |   },
 182 |   {
 183 |    "cell_type": "markdown",
 184 |    "metadata": {
 185 |     "id": "k2DSwSOZlT-7"
 186 |    },
 187 |    "source": [
 188 |     "$\\large{Input}$\n",
 189 |     "\n",
 190 |     "Voltage, Current, SOC at times:\n",
 191 |     "$$t - window\\_size - 1 \\rightarrow t - 1 $$"
 192 |    ]
 193 |   },
 194 |   {
 195 |    "cell_type": "markdown",
 196 |    "metadata": {
 197 |     "id": "Bw-WpE1ulT-9"
 198 |    },
 199 |    "source": [
 200 |     "**Note**\n",
 201 |     "\n",
 202 |     "Cannot use embedding layers with battery data because of floating point values and negative values"
 203 |    ]
 204 |   },
 205 |   {
 206 |    "cell_type": "code",
 207 |    "execution_count": null,
 208 |    "metadata": {
 209 |     "id": "WStD-7ytlT-9"
 210 |    },
 211 |    "outputs": [],
 212 |    "source": [
 213 |     "@dataclass\n",
 214 |     "class G:\n",
 215 |     "    #preprocess\n",
 216 |     "    capacity = 18.02 # cell capacity in Ampere hours\n",
 217 |     "    window_time = 96 #seconds\n",
 218 |     "    window_size = 32\n",
 219 |     "    slicing = window_time // window_size\n",
 220 |     "    batch_size = 16\n",
 221 |     "    #network\n",
 222 |     "    dense_dim = 32\n",
 223 |     "    model_dim = 128\n",
 224 |     "    num_features = 3 # current, voltage, and soc at t minus G.window_size -> t minus 1\n",
 225 |     "    num_heads = 16\n",
 226 |     "    num_layers = 6\n",
 227 |     "    #learning_rate_scheduler\n",
 228 |     "    T_i = 1\n",
 229 |     "    T_mult = 2\n",
 230 |     "    T_cur = 0.0\n",
 231 |     "    #training\n",
 232 |     "    epochs = 256 #should be a power of T_mult because of cosine annealing with warm restarts scheduler\n",
 233 |     "    learning_rate = 0.0045\n",
 234 |     "    min_learning_rate = 6e-11\n",
 235 |     "#     weight_decay = 0.0 #No weight decay param in the the keras optimizers"
 236 |    ]
 237 |   },
 238 |   {
 239 |    "cell_type": "markdown",
 240 |    "metadata": {
 241 |     "id": "prIueTe-lT-9"
 242 |    },
 243 |    "source": [
 244 |     "<a id=\"win\"></a>\n",
 245 |     "# Preprocessing"
 246 |    ]
 247 |   },
 248 |   {
 249 |    "cell_type": "code",
 250 |    "execution_count": null,
 251 |    "metadata": {
 252 |     "id": "il6DI4Z7lT--"
 253 |    },
 254 |    "outputs": [],
 255 |    "source": [
 256 |     "# from google.colab import files\n",
 257 |     "file = pd.read_csv(\"/content/sim_data.csv\")\n",
 258 |     "#if using sim_data.csv:\n",
 259 |     "file[\"soc\"] *= 100.0"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "code",
 264 |    "execution_count": null,
 265 |    "metadata": {
 266 |     "id": "SLQrFOvrlT--"
 267 |    },
 268 |    "outputs": [],
 269 |    "source": [
 270 |     "data_plot(data = [file],\n",
 271 |     "          title=\"OCV v SOC\",\n",
 272 |     "          x = [\"test time (sec)\"],\n",
 273 |     "          y = [\"soc\"],\n",
 274 |     "          markers = \"lines\",\n",
 275 |     "          color = \"darkorchid\",\n",
 276 |     "          x_title = \"Test Time (sec)\",\n",
 277 |     "          y_title = \"SOC\"\n",
 278 |     "         )"
 279 |    ]
 280 |   },
 281 |   {
 282 |    "cell_type": "code",
 283 |    "execution_count": null,
 284 |    "metadata": {
 285 |     "id": "_f7QighFlT--"
 286 |    },
 287 |    "outputs": [],
 288 |    "source": [
 289 |     "file = normalize(file.loc[:,[\"current\",\"voltage\",\"soc\"]].iloc[::G.slicing], G.capacity)\n",
 290 |     "#uses sklearn.preprocessing"
 291 |    ]
 292 |   },
 293 |   {
 294 |    "cell_type": "code",
 295 |    "execution_count": null,
 296 |    "metadata": {
 297 |     "id": "x79KvZ3ilT--"
 298 |    },
 299 |    "outputs": [],
 300 |    "source": [
 301 |     "x_train, x_test, y_train, y_test = rolling_split(file, G.window_size, train=True)\n",
 302 |     "print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)\n",
 303 |     "#uses sklearn.model_selection\n",
 304 |     "\n",
 305 |     "x_train = tf.data.Dataset.from_tensor_slices(x_train)\n",
 306 |     "y_train = tf.data.Dataset.from_tensor_slices(y_train)\n",
 307 |     "x_test = tf.data.Dataset.from_tensor_slices(x_test)\n",
 308 |     "y_test = tf.data.Dataset.from_tensor_slices(y_test)\n",
 309 |     "\n",
 310 |     "train_dataloader = tf.data.Dataset.zip((x_train, y_train)).batch(G.batch_size, drop_remainder=True)\n",
 311 |     "test_dataloader = tf.data.Dataset.zip((x_test, y_test)).batch(G.batch_size, drop_remainder=True)"
 312 |    ]
 313 |   },
 314 |   {
 315 |    "cell_type": "code",
 316 |    "execution_count": null,
 317 |    "metadata": {
 318 |     "id": "yRmivoyVlT-_"
 319 |    },
 320 |    "outputs": [],
 321 |    "source": [
 322 |     "for x,y in train_dataloader:\n",
 323 |     "    print(f\"Shape of X [window, features]: {x.shape}\")\n",
 324 |     "    print(f\"Shape of y: {y.shape} {y.dtype}\")\n",
 325 |     "    break"
 326 |    ]
 327 |   },
 328 |   {
 329 |    "cell_type": "markdown",
 330 |    "metadata": {
 331 |     "id": "blS0pEpTqRVI"
 332 |    },
 333 |    "source": [
 334 |     "<a name='enc'></a>\n",
 335 |     "# Encoder"
 336 |    ]
 337 |   },
 338 |   {
 339 |    "cell_type": "code",
 340 |    "execution_count": null,
 341 |    "metadata": {
 342 |     "id": "sC5vJhz29vZR"
 343 |    },
 344 |    "outputs": [],
 345 |    "source": [
 346 |     "def FullyConnected():\n",
 347 |     "    return tf.keras.Sequential([\n",
 348 |     "        tf.keras.layers.Dense(G.dense_dim, activation='relu',\n",
 349 |     "                              kernel_initializer = tf.keras.initializers.HeNormal(),\n",
 350 |     "                              bias_initializer = tf.keras.initializers.RandomUniform(minval=0.005, maxval = 0.08)\n",
 351 |     "                             ),\n",
 352 |     "        # (G.batch_size, G.window_size, G.dense_dim)\n",
 353 |     "        tf.keras.layers.BatchNormalization(momentum = 0.98, epsilon=5e-4),\n",
 354 |     "        tf.keras.layers.Dense(G.dense_dim, activation='relu',\n",
 355 |     "                              kernel_initializer = tf.keras.initializers.HeNormal(),\n",
 356 |     "                              bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.01)\n",
 357 |     "                             ),\n",
 358 |     "        # (G.batch_size, G.window_size, G.dense_dim)\n",
 359 |     "        tf.keras.layers.BatchNormalization(momentum = 0.95, epsilon=5e-4)\n",
 360 |     "    ])"
 361 |    ]
 362 |   },
 363 |   {
 364 |    "cell_type": "markdown",
 365 |    "metadata": {
 366 |     "id": "R65WbX5wqYYH"
 367 |    },
 368 |    "source": [
 369 |     "<a name='enc-lay'></a>\n",
 370 |     "###  Encoder Layer"
 371 |    ]
 372 |   },
 373 |   {
 374 |    "cell_type": "code",
 375 |    "execution_count": null,
 376 |    "metadata": {
 377 |     "id": "tIufbrc-9_2u"
 378 |    },
 379 |    "outputs": [],
 380 |    "source": [
 381 |     "class EncoderLayer(tf.keras.layers.Layer):\n",
 382 |     "    \"\"\"\n",
 383 |     "    The encoder layer is composed by a multi-head self-attention mechanism,\n",
 384 |     "    followed by a simple, positionwise fully connected feed-forward network. \n",
 385 |     "    This architecture includes a residual connection around each of the two \n",
 386 |     "    sub-layers, followed by batch normalization.\n",
 387 |     "    \"\"\"\n",
 388 |     "    def __init__(self,\n",
 389 |     "                 num_heads,\n",
 390 |     "                 num_features,\n",
 391 |     "                 dense_dim,\n",
 392 |     "                 dropout_rate,\n",
 393 |     "                 batchnorm_eps):\n",
 394 |     "        super(EncoderLayer, self).__init__()\n",
 395 |     "\n",
 396 |     "        self.mha = MultiHeadAttention(\n",
 397 |     "            num_heads = num_heads,\n",
 398 |     "            key_dim = dense_dim,\n",
 399 |     "            dropout = dropout_rate,\n",
 400 |     "            kernel_initializer = tf.keras.initializers.HeNormal(),\n",
 401 |     "            # kernel_regularizer = tf.keras.regularizers.L2(1e-4),\n",
 402 |     "            bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.01)\n",
 403 |     "                                     )\n",
 404 |     "        \n",
 405 |     "        #feed-forward-network\n",
 406 |     "        self.ffn = FullyConnected()\n",
 407 |     "        \n",
 408 |     "        \n",
 409 |     "        self.batchnorm1 = BatchNormalization(momentum = 0.95, epsilon=batchnorm_eps)\n",
 410 |     "        self.batchnorm2 = BatchNormalization(momentum = 0.95, epsilon=batchnorm_eps)\n",
 411 |     "\n",
 412 |     "        self.dropout_ffn = Dropout(dropout_rate)\n",
 413 |     "    \n",
 414 |     "    def call(self, x, training):\n",
 415 |     "        \"\"\"\n",
 416 |     "        Forward pass for the Encoder Layer\n",
 417 |     "        \n",
 418 |     "        Arguments:\n",
 419 |     "            x -- Tensor of shape (G.batch_size, G.window_size, G.dense_dim)\n",
 420 |     "            training -- Boolean, set to true to activate\n",
 421 |     "                        the training mode for dropout layers\n",
 422 |     "        Returns:\n",
 423 |     "            encoder_layer_out -- Tensor of shape (G.batch_size, G.window_size, G.dense_dim)\n",
 424 |     "        \"\"\"\n",
 425 |     "        # Dropout is added by Keras automatically if the dropout parameter is non-zero during training\n",
 426 |     "        \n",
 427 |     "        attn_output = self.mha(query = x,\n",
 428 |     "                               value = x) # Self attention\n",
 429 |     "        \n",
 430 |     "        out1 = self.batchnorm1(tf.add(x, attn_output))  # (G.batch_size, G.window_size, G.dense_dim)\n",
 431 |     "        \n",
 432 |     "        ffn_output = self.ffn(out1)\n",
 433 |     "    \n",
 434 |     "        ffn_output = self.dropout_ffn(ffn_output) # (G.batch_size, G.window_size, G.dense_dim)\n",
 435 |     "        \n",
 436 |     "        encoder_layer_out = self.batchnorm2(tf.add(ffn_output, out1))\n",
 437 |     "        # (G.batch_size, G.window_size, G.dense_dim)\n",
 438 |     "        return encoder_layer_out"
 439 |    ]
 440 |   },
 441 |   {
 442 |    "cell_type": "markdown",
 443 |    "metadata": {
 444 |     "id": "IKgObFUUlT_B"
 445 |    },
 446 |    "source": [
 447 |     "<a name='full-enc'></a>\n",
 448 |     "### Full Encoder"
 449 |    ]
 450 |   },
 451 |   {
 452 |    "cell_type": "code",
 453 |    "execution_count": null,
 454 |    "metadata": {
 455 |     "id": "7j2Tjr0K0t0I"
 456 |    },
 457 |    "outputs": [],
 458 |    "source": [
 459 |     "class Encoder(tf.keras.layers.Layer):\n",
 460 |     "    \"\"\"\n",
 461 |     "    The entire Encoder starts by passing the input to an embedding layer \n",
 462 |     "    and using positional encoding to then pass the output through a stack of\n",
 463 |     "    encoder Layers\n",
 464 |     "        \n",
 465 |     "    \"\"\"  \n",
 466 |     "    def __init__(self,\n",
 467 |     "                 num_layers = G.num_layers,\n",
 468 |     "                 num_heads = G.num_heads,\n",
 469 |     "                 num_features = G.num_features,\n",
 470 |     "                 dense_dim = G.dense_dim,\n",
 471 |     "                 maximum_position_encoding = G.window_size,\n",
 472 |     "                 dropout_rate=0.15,\n",
 473 |     "                 batchnorm_eps=1e-4):\n",
 474 |     "        \n",
 475 |     "        super(Encoder, self).__init__()\n",
 476 |     "\n",
 477 |     "        self.num_layers = num_layers\n",
 478 |     "\n",
 479 |     "        #linear input layer\n",
 480 |     "        self.lin_input = tf.keras.layers.Dense(dense_dim, activation=\"relu\")\n",
 481 |     "        \n",
 482 |     "        self.pos_encoding = positional_encoding(maximum_position_encoding, \n",
 483 |     "                                                dense_dim)\n",
 484 |     "\n",
 485 |     "\n",
 486 |     "        self.enc_layers = [EncoderLayer(num_heads = num_heads,\n",
 487 |     "                                        num_features = num_features,\n",
 488 |     "                                        dense_dim = dense_dim,\n",
 489 |     "                                        dropout_rate = dropout_rate,\n",
 490 |     "                                        batchnorm_eps = batchnorm_eps) \n",
 491 |     "                           for _ in range(self.num_layers)]\n",
 492 |     "        \n",
 493 |     "    def call(self, x, training):\n",
 494 |     "        \"\"\"\n",
 495 |     "        Forward pass for the Encoder\n",
 496 |     "        \n",
 497 |     "        Arguments:\n",
 498 |     "            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features)\n",
 499 |     "            training -- Boolean, set to true to activate\n",
 500 |     "                        the training mode for dropout layers\n",
 501 |     "            mask -- Boolean mask to ensure that the padding is not \n",
 502 |     "                    treated as part of the input\n",
 503 |     "        Returns:\n",
 504 |     "            Tensor of shape (G.batch_size, G.dense_dim)\n",
 505 |     "        \"\"\"\n",
 506 |     "        x = self.lin_input(x)\n",
 507 |     "        seq_len = tf.shape(x)[1]\n",
 508 |     "        x += self.pos_encoding[:, :seq_len, :]\n",
 509 |     "        \n",
 510 |     "        for i in range(self.num_layers):\n",
 511 |     "            x = self.enc_layers[i](x, training)\n",
 512 |     "            \n",
 513 |     "        # only need the final time's data : time = t-1 from the window\n",
 514 |     "        # x has shape (G.batch_size, G.window_size, G.dense_dim)\n",
 515 |     "        # but I am only returning time t-1:\n",
 516 |     "        return x[:, -1, :] # (G.batch_size, G.dense_dim)"
 517 |    ]
 518 |   },
 519 |   {
 520 |    "cell_type": "markdown",
 521 |    "metadata": {
 522 |     "id": "_U2F58rnlT_C"
 523 |    },
 524 |    "source": [
 525 |     "<a name='transform'></a> \n",
 526 |     "# Transformer"
 527 |    ]
 528 |   },
 529 |   {
 530 |    "cell_type": "code",
 531 |    "execution_count": null,
 532 |    "metadata": {
 533 |     "id": "QHymPmaj-2ba"
 534 |    },
 535 |    "outputs": [],
 536 |    "source": [
 537 |     "class Transformer(tf.keras.Model):\n",
 538 |     "    \"\"\"\n",
 539 |     "    Encoder-only transformer: an Encoder followed by a dense output stack\n",
 540 |     "    \"\"\"\n",
 541 |     "    def __init__(self,\n",
 542 |     "                 num_layers = G.num_layers,\n",
 543 |     "                 num_heads = G.num_heads,\n",
 544 |     "                 dense_dim = G.dense_dim,\n",
 545 |     "                 max_positional_encoding_input = G.window_size,\n",
 546 |     "                 max_positional_encoding_target = G.window_size):\n",
 547 |     "        super(Transformer, self).__init__()\n",
 548 |     "\n",
 549 |     "\n",
 550 |     "        self.encoder = Encoder()\n",
 551 |     "\n",
 552 |     "        self.final_stack = tf.keras.Sequential([\n",
 553 |     "            tf.keras.layers.Dense(\n",
 554 |     "                dense_dim, activation = \"relu\",\n",
 555 |     "                kernel_initializer = tf.keras.initializers.HeNormal(),\n",
 556 |     "                bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.02)\n",
 557 |     "                                  ),\n",
 558 |     "            tf.keras.layers.BatchNormalization(momentum = 0.97, epsilon=5e-4),\n",
 559 |     "\n",
 560 |     "            tf.keras.layers.Dense(\n",
 561 |     "                1, activation = \"sigmoid\",\n",
 562 |     "                bias_initializer = tf.keras.initializers.RandomUniform(minval=0.001, maxval = 0.005)\n",
 563 |     "                                 )\n",
 564 |     "                                              ])\n",
 565 |     "    \n",
 566 |     "    def call(self, x, training):\n",
 567 |     "        \"\"\"\n",
 568 |     "        Forward pass for the entire Transformer\n",
 569 |     "        Arguments:\n",
 570 |     "            x -- Tensor of shape (G.batch_size, G.window_size, G.num_features)\n",
 571 |     "                 batched & windowed voltage, current and soc data (the soc targets are not passed to call)\n",
 572 |     "            training -- Boolean, set to true to activate\n",
 573 |     "                        the training mode for dropout and batchnorm layers\n",
 574 |     "        Returns:\n",
 575 |     "            final_output -- SOC prediction at time t\n",
 576 |     "        \n",
 577 |     "        \"\"\"\n",
 578 |     "        enc_output = self.encoder(x, training) # (G.batch_size, G.dense_dim)\n",
 579 |     "        \n",
 580 |     "        final_output = self.final_stack(enc_output) # (G.batch_size, 1)\n",
 581 |     "\n",
 582 |     "\n",
 583 |     "    \n",
 584 |     "        return final_output"
 585 |    ]
 586 |   },
 587 |   {
 588 |    "cell_type": "markdown",
 589 |    "metadata": {
 590 |     "id": "kiILRshLv9Bx"
 591 |    },
 592 |    "source": [
 593 |     "## Note:\n",
 594 |     "\n",
 595 |     "The `training` argument in the model and layer calls sets the `keras.backend.learning_phase()` value to the appropriate value for the use case.\n",
 596 |     "i.e.\n",
 597 |     "- If I am using the train_loop(), `training` is set to True, which means the Dropout layers are active and the BatchNormalization layers normalize with the current batch's statistics.\n",
 598 |     "- If I am using the test_loop(), `training` is set to False, which means the Dropout layers are inactive and the BatchNormalization layers normalize with their moving averages."
 599 |    ]
 600 |   },
 601 |   {
 602 |    "cell_type": "markdown",
 603 |    "metadata": {
 604 |     "id": "Q6IncgGX4z_9"
 605 |    },
 606 |    "source": [
 607 |     "If Using **TPUs** use the cell right below this text\n",
 608 |     "\n",
 609 |     "---\n",
 610 |     "\n"
 611 |    ]
 612 |   },
 613 |   {
 614 |    "cell_type": "code",
 615 |    "execution_count": null,
 616 |    "metadata": {
 617 |     "id": "un5xiWL644Uf"
 618 |    },
 619 |    "outputs": [],
 620 |    "source": [
 621 |     "# tf.keras.backend.clear_session()\n",
 622 |     "# with strategy.scope():\n",
 623 |     "#     model = Transformer()"
 624 |    ]
 625 |   },
 626 |   {
 627 |    "cell_type": "markdown",
 628 |    "metadata": {
 629 |     "id": "yeCjW7VP44fP"
 630 |    },
 631 |    "source": [
 632 |     "\n",
 633 |     "\n",
 634 |     "---\n",
 635 |     "\n"
 636 |    ]
 637 |   },
 638 |   {
 639 |    "cell_type": "markdown",
 640 |    "metadata": {
 641 |     "id": "nJ8bVUEh45Mj"
 642 |    },
 643 |    "source": [
 644 |     "If **not using TPUs**:\n",
 645 |     "\n",
 646 |     "---\n",
 647 |     "\n"
 648 |    ]
 649 |   },
 650 |   {
 651 |    "cell_type": "code",
 652 |    "execution_count": null,
 653 |    "metadata": {
 654 |     "id": "ovllyglWlT_C"
 655 |    },
 656 |    "outputs": [],
 657 |    "source": [
 658 |     "tf.keras.backend.clear_session()\n",
 659 |     "model = Transformer()\n",
 660 |     "model.build((G.batch_size, G.window_size, G.num_features))\n",
 661 |     "model.summary(expand_nested=True)"
 662 |    ]
 663 |   },
 664 |   {
 665 |    "cell_type": "markdown",
 666 |    "metadata": {
 667 |     "id": "SWtYX-8348Z1"
 668 |    },
 669 |    "source": [
 670 |     "\n",
 671 |     "\n",
 672 |     "---\n",
 673 |     "\n"
 674 |    ]
 675 |   },
 676 |   {
 677 |    "cell_type": "code",
 678 |    "execution_count": null,
 679 |    "metadata": {
 680 |     "id": "JUcLoUmWlT_D"
 681 |    },
 682 |    "outputs": [],
 683 |    "source": [
 684 |     "model.load_weights(\"/content/drive/MyDrive/transformer_soc/model_weights.tf\")"
 685 |    ]
 686 |   },
 687 |   {
 688 |    "cell_type": "markdown",
 689 |    "metadata": {
 690 |     "id": "yYtQv1TtlT_D"
 691 |    },
 692 |    "source": [
 693 |     "<a id = \"loss\"></a>\n",
 694 |     "# Callbacks and Scheduler"
 695 |    ]
 696 |   },
 697 |   {
 698 |    "cell_type": "markdown",
 699 |    "metadata": {
 700 |     "id": "aTN3TiSblT_D"
 701 |    },
 702 |    "source": [
 703 |     "**Learning Rate Scheduler**\n",
 704 |     "\n",
 705 |     "Cosine Annealing with Warm Restarts proposed by Loshchilov et al. in [SGDR: Stochastic Gradient Descent with Warm Restarts](https://doi.org/10.48550/arXiv.1608.03983)"
 706 |    ]
 707 |   },
 708 |   {
 709 |    "cell_type": "markdown",
 710 |    "metadata": {
 711 |     "id": "xWt1eUd9o6WA"
 712 |    },
 713 |    "source": [
 714 |     "$$\\mu_t = \\mu_{min} + \\frac{1}{2}(\\mu_{max} - \\mu_{min})\\cdot (1 + \\cos (\\frac{T_{cur}}{T_i}\\pi))$$\n",
 715 |     "\n",
 716 |     "Where:\n",
 717 |     " - $\\mu$ is the learning_rate, subscript $t$ is for time = $t$\n",
 718 |     " - $T_{cur}$ is the number of epochs since the last restart\n",
 719 |     " - $T_i$ is the number of epochs between two restarts\n",
 720 |     "\n",
 721 |     "Note:\n",
 722 |     " - When $T_{cur} = T_i \\rightarrow \\mu_t = \\mu_{min}$\n",
 723 |     " - When $T_{cur} = 0 \\rightarrow \\mu_t = \\mu_{max}$"
 724 |    ]
 725 |   },
 726 |   {
 727 |    "cell_type": "markdown",
 728 |    "metadata": {
 729 |     "id": "sLjZ7ICoSGif"
 730 |    },
 731 |    "source": [
 732 |     "---\n",
 733 |     "**The Cell below is for the LambdaCallback Class in keras in order to implement Cosine Annealing with Warm Restarts** ↓\n",
 734 |     "\n",
 735 |     "Used with callbacks in model.fit()\n",
 736 |     "\n",
 737 |     "---"
 738 |    ]
 739 |   },
 740 |   {
 741 |    "cell_type": "code",
 742 |    "execution_count": null,
 743 |    "metadata": {
 744 |     "id": "mZg1uSmDQMTZ"
 745 |    },
 746 |    "outputs": [],
 747 |    "source": [
 748 |     "def schedule(batch, logs):\n",
 749 |     "        '''\n",
 750 |     "        This is a dummy function for the LambdaCallback Class\n",
 751 |     "        I am trying to see if I can use the model.compile(), model.fit(), model.evaluate(), trio with\n",
 752 |     "        Cosine Annealing with Warm Restarts\n",
 753 |     "        Returns a new learning rate based on the schedule described below\n",
 754 |     "        \n",
 755 |     "        Call after every batch\n",
 756 |     "        '''\n",
 757 |     "        \n",
 758 |     "        mu_i = G.min_learning_rate + 0.5 * (\n",
 759 |     "                G.learning_rate - G.min_learning_rate) * (\n",
 760 |     "                    1 + tf.math.cos(np.pi * G.T_cur / G.T_i))\n",
 761 |     "        \n",
 762 |     "        G.T_cur += G.batch_size / len(x_train)\n",
 763 |     "        if np.isclose(G.T_cur, G.T_i):\n",
 764 |     "            G.T_i *= G.T_mult\n",
 765 |     "            G.T_cur = 0.0\n",
 766 |     "        K.set_value(model.optimizer.learning_rate, mu_i)"
 767 |    ]
 768 |   },
 769 |   {
 770 |    "cell_type": "markdown",
 771 |    "metadata": {
 772 |     "id": "bzZcCFve2o5O"
 773 |    },
 774 |    "source": [
 775 |     "**Progress Plot Callback**"
 776 |    ]
 777 |   },
 778 |   {
 779 |    "cell_type": "code",
 780 |    "execution_count": null,
 781 |    "metadata": {
 782 |     "id": "ZeomH0iN2o5O"
 783 |    },
 784 |    "outputs": [],
 785 |    "source": [
 786 |     "class ProgressCallback(tf.keras.callbacks.Callback):\n",
 787 |     "    def on_epoch_end(self, epoch, logs = None):\n",
 788 |     "        train_loss = logs[\"loss\"]\n",
 789 |     "        train_acc = 100.0 - logs[\"mean_absolute_percentage_error\"]\n",
 790 |     "        test_loss = logs[\"val_loss\"]\n",
 791 |     "        test_acc = 100.0 - logs[\"val_mean_absolute_percentage_error\"]\n",
 792 |     "        global pp\n",
 793 |     "        pp.update([[train_loss, test_loss],\n",
 794 |     "                   [train_acc, test_acc]])"
 795 |    ]
 796 |   },
 797 |   {
 798 |    "cell_type": "markdown",
 799 |    "metadata": {
 800 |     "id": "A699g9Sp2o5P"
 801 |    },
 802 |    "source": [
 803 |     "**Save Model Progress Callback**\n",
 804 |     "\n",
 805 |     "Does not work with TPUs"
 806 |    ]
 807 |   },
 808 |   {
 809 |    "cell_type": "code",
 810 |    "execution_count": null,
 811 |    "metadata": {
 812 |     "id": "dTdfb-br2o5P"
 813 |    },
 814 |    "outputs": [],
 815 |    "source": [
 816 |     "class SaveModel(tf.keras.callbacks.Callback):\n",
 817 |     "    def on_epoch_end(self, epoch, logs = None):\n",
 818 |     "        if epoch != 0 and epoch % 15 == 0:\n",
 819 |     "            self.model.save_weights(\"/content/drive/MyDrive/transformer_soc/model_weights.h5\")"
 820 |    ]
 821 |   },
 822 |   {
 823 |    "cell_type": "markdown",
 824 |    "metadata": {
 825 |     "id": "Mgkzjt8NReWS"
 826 |    },
 827 |    "source": [
 828 |     "**Early Stopping and Saving Best Model checkpoint Callbacks**"
 829 |    ]
 830 |   },
 831 |   {
 832 |    "cell_type": "code",
 833 |    "execution_count": null,
 834 |    "metadata": {
 835 |     "id": "_-Lh-49NRb_r"
 836 |    },
 837 |    "outputs": [],
 838 |    "source": [
 839 |     "model_options = tf.saved_model.SaveOptions(experimental_io_device=\"/job:localhost\")\n",
 840 |     "# earlystopping = EarlyStopping(monitor='val_mean_absolute_percentage_error', patience=150, verbose=0, mode='min')\n",
 841 |     "mcp_save = ModelCheckpoint('/content/drive/MyDrive/transformer_soc/tpu_model_weights', save_format = \"tf\", save_best_only=True, monitor='val_mean_absolute_percentage_error', mode='min', options = model_options)"
 842 |    ]
 843 |   },
 844 |   {
 845 |    "cell_type": "code",
 846 |    "execution_count": null,
 847 |    "metadata": {
 848 |     "id": "hg7FmZOHlT_E"
 849 |    },
 850 |    "outputs": [],
 851 |    "source": [
 852 |     "loss_object = tf.keras.losses.LogCosh()\n",
 853 |     "\n",
 854 |     "optimizer = tf.keras.optimizers.Adam(learning_rate = G.learning_rate,\n",
 855 |     "                                     beta_1 = 0.9,\n",
 856 |     "                                     beta_2 = 0.999\n",
 857 |     "                                    )\n",
 858 |     "\n",
 859 |     "#cos_anneal is for the model.fit() call\n",
 860 |     "cos_anneal = tf.keras.callbacks.LambdaCallback(on_batch_end = schedule)\n",
 861 |     "\n",
 862 |     "#progress plot callback\n",
 863 |     "pp_update = ProgressCallback()\n",
 864 |     "\n",
 865 |     "#model parameters save callback\n",
 866 |     "model_save = SaveModel() #This is optional"
 867 |    ]
 868 |   },
 869 |   {
 870 |    "cell_type": "markdown",
 871 |    "metadata": {
 872 |     "id": "45--3qknlT_H"
 873 |    },
 874 |    "source": [
 875 |     "<a id = \"train\"></a>\n",
 876 |     "# Training\n",
 877 |     "\n",
 878 |     "**There are two compile calls, one requires a TPU**"
 879 |    ]
 880 |   },
 881 |   {
 882 |    "cell_type": "code",
 883 |    "execution_count": null,
 884 |    "metadata": {
 885 |     "id": "ynnk3or6-FMd"
 886 |    },
 887 |    "outputs": [],
 888 |    "source": [
 889 |     "pp = PP(plot_names = [\"Mean Log Loss\", \"% Accuracy\"],\n",
 890 |     "        line_names = [\"Train Loop\", \"Test Loop\"],\n",
 891 |     "        x_label = \"epochs\"\n",
 892 |     "       )\n",
 893 |     "\n",
 894 |     "# ##### if using a TPU:\n",
 895 |     "# with strategy.scope():\n",
 896 |     "#     model.compile(optimizer, loss_object, steps_per_execution = 3, metrics=[\"mean_absolute_percentage_error\"])\n",
 897 |     "\n",
 898 |     "##### else:\n",
 899 |     "# model.compile(optimizer, loss_object, metrics=[\"mean_absolute_percentage_error\"])\n",
 900 |     "## Don't compile after training, it causes issues.\n",
 901 |     "\n",
 902 |     "#-----------------------------------------------------------------\n",
 903 |     "#Note: can add `model_save` to the callbacks list in model.fit()\n",
 904 |     "#      it saves the model params to the google drive every 15 epochs\n",
 905 |     "#-------------------------------------------------------------------\n",
 906 |     "\n",
 907 |     "steps_per_epoch = len(train_dataloader) // G.epochs\n",
 908 |     "validation_steps = len(test_dataloader) // G.epochs\n",
 909 |     "\n",
 910 |     "history = model.fit(train_dataloader,\n",
 911 |     "                    batch_size = G.batch_size,\n",
 912 |     "                    epochs = G.epochs,\n",
 913 |     "                    verbose = 1,\n",
 914 |     "                    steps_per_epoch = steps_per_epoch,\n",
 915 |     "                    callbacks = [cos_anneal, pp_update],\n",
 916 |     "                    validation_data = test_dataloader,\n",
 917 |     "                    validation_steps = validation_steps\n",
 918 |     "                    )"
 919 |    ]
 920 |   },
 921 |   {
 922 |    "cell_type": "code",
 923 |    "execution_count": null,
 924 |    "metadata": {
 925 |     "id": "9yF6RygxlT_I"
 926 |    },
 927 |    "outputs": [],
 928 |    "source": [
 929 |     "model.save(\"/content/drive/MyDrive/transformer_soc/tpu_model.h5\") #doesn't work with TPUs"
 930 |    ]
 931 |   },
 932 |   {
 933 |    "cell_type": "code",
 934 |    "execution_count": null,
 935 |    "metadata": {
 936 |     "id": "ljBF-U_vIjrL"
 937 |    },
 938 |    "outputs": [],
 939 |    "source": [
 940 |     "#works with TPUs\n",
 941 |     "checkpoint = tf.train.Checkpoint(model = model)\n",
 942 |     "options = tf.train.CheckpointOptions(experimental_io_device=\"/job:localhost\")\n",
 943 |     "checkpoint.save(\"/content/drive/MyDrive/transformer_soc/tpu_model/ckpt\", options=options)"
 944 |    ]
 945 |   },
 946 |   {
 947 |    "cell_type": "markdown",
 948 |    "metadata": {
 949 |     "id": "L5pSwH7QlT_I"
 950 |    },
 951 |    "source": [
 952 |     "<a id = \"val\"></a>\n",
 953 |     "# Validate"
 954 |    ]
 955 |   },
 956 |   {
 957 |    "cell_type": "markdown",
 958 |    "metadata": {
 959 |     "id": "JYr2y9eulT_I"
 960 |    },
 961 |    "source": [
 962 |     "**Dev Set**"
 963 |    ]
 964 |   },
 965 |   {
 966 |    "cell_type": "code",
 967 |    "execution_count": null,
 968 |    "metadata": {
 969 |     "id": "CuY9saCblT_I",
 970 |     "scrolled": true
 971 |    },
 972 |    "outputs": [],
 973 |    "source": [
 974 |     "visualize_dev = validate(model, test_dataloader, dev = True)"
 975 |    ]
 976 |   },
 977 |   {
 978 |    "cell_type": "markdown",
 979 |    "metadata": {
 980 |     "id": "v5uLkWkLlT_I"
 981 |    },
 982 |    "source": [
 983 |     "**Entire Dataset**"
 984 |    ]
 985 |   },
 986 |   {
 987 |    "cell_type": "code",
 988 |    "execution_count": null,
 989 |    "metadata": {
 990 |     "id": "gjvsvbIllT_I"
 991 |    },
 992 |    "outputs": [],
 993 |    "source": [
 994 |     "x_set, y_set = rolling_split(file, G.window_size, train = False)\n",
 995 |     "\n",
 996 |     "x_set = tf.data.Dataset.from_tensor_slices(x_set)\n",
 997 |     "y_set = tf.data.Dataset.from_tensor_slices(y_set)\n",
 998 |     "\n",
 999 |     "set_dataloader = tf.data.Dataset.zip((x_set, y_set)).batch(G.batch_size, drop_remainder=True)\n",
1000 |     "\n",
1001 |     "visualize = validate(model, set_dataloader, dev = False)"
1002 |    ]
1003 |   }
1004 |  ],
1005 |  "metadata": {
1006 |   "accelerator": "TPU",
1007 |   "colab": {
1008 |    "name": "transform_notebook.ipynb",
1009 |    "provenance": []
1010 |   },
1011 |   "gpuClass": "standard",
1012 |   "kernelspec": {
1013 |    "display_name": "Python 3 (ipykernel)",
1014 |    "language": "python",
1015 |    "name": "python3"
1016 |   },
1017 |   "language_info": {
1018 |    "codemirror_mode": {
1019 |     "name": "ipython",
1020 |     "version": 3
1021 |    },
1022 |    "file_extension": ".py",
1023 |    "mimetype": "text/x-python",
1024 |    "name": "python",
1025 |    "nbconvert_exporter": "python",
1026 |    "pygments_lexer": "ipython3",
1027 |    "version": "3.10.5"
1028 |   }
1029 |  },
1030 |  "nbformat": 4,
1031 |  "nbformat_minor": 1
1032 | }
1033 | 


--------------------------------------------------------------------------------