├── .gitattributes
├── README.md
├── lstm.py
├── main.py
└── transformer.py

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Deep learning based state estimation: incorporating Transformer and LSTM to Kalman Filter with EM algorithm

## Overview

- The Kalman filter solves optimal state estimation recursively, but it requires the true parameters of the model. The Expectation-Maximization (EM) algorithm can estimate the model parameters that are unavailable before Kalman filtering; this combination is the EM-KF algorithm.
- To improve the precision of the EM-KF algorithm, the author presents a state estimation method that combines the Long Short-Term Memory network (LSTM), the Transformer, and the EM-KF algorithm within the encoder-decoder framework of sequence-to-sequence (seq2seq) learning.
- Simulation on a linear mobile robot model demonstrates that the new method is more precise.
- Please read our paper on arXiv, [Incorporating Transformer and LSTM to Kalman Filter with EM algorithm for state estimation](https://arxiv.org/abs/2105.00250), for details of the theoretical analysis and experiments.

## Usage

```
python main.py
```

## Requirements

The code has been tested under Python 3, with PyTorch, NumPy, Matplotlib, pykalman and their dependencies installed.

## Methodology

We propose an encoder-decoder framework in seq2seq for state estimation, in which state estimation is equivalent to encoding and decoding the observations.

1. Previous works incorporating LSTM into KF adopt an LSTM encoder and a KF decoder. We propose LSTM-KF, which adopts an LSTM encoder and an EM-KF decoder.
2. Replacing the LSTM encoder with a Transformer encoder before the EM-KF decoder gives Transformer-KF.
3. Integrating the Transformer and LSTM encoders gives TL-KF.

Encoding the observations with the Transformer and LSTM before filtering makes it easier for the EM algorithm to estimate the parameters.

## Conclusions

1. Combining the Transformer and LSTM as an encoder-decoder framework for the observations can depict the state more effectively, attenuate noise interference, and weaken the assumptions of the Markov property of the states and the conditional independence of the observations. This enhances the precision and robustness of state estimation.
2. The Transformer, based on multi-head self-attention and residual connections, can capture long-term dependencies, while the LSTM encoder models the time series. TL-KF, a combination of Transformer, LSTM and EM-KF, is precise for state estimation in systems with unknown parameters.
3. The Kalman smoother can improve on the Kalman filter, but in TL-KF the filtering alone is already precise enough. Therefore, after offline training for parameter estimation, KF can be adopted for online estimation.
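
## Example: the EM-KF decoder step

A minimal sketch (not part of the repository) of the decoder step shared by EM-KF, LSTM-KF, Transformer-KF and TL-KF, as implemented in `main.py`: pykalman's EM routine estimates the parameters listed in `em_vars` from the (optionally encoder-processed) observation sequence, and the resulting filter/smoother is then run. The model and variable names below are illustrative placeholders, not the robot model from the paper.

```
import numpy as np
from pykalman import KalmanFilter

obs = np.random.randn(200)  # stand-in for the (encoded) 1-D observation sequence

kf = KalmanFilter(
    transition_matrices=np.eye(3),
    observation_matrices=np.array([[1., 0., 0.]]),
    em_vars=['transition_covariance', 'observation_covariance',
             'initial_state_mean', 'initial_state_covariance'],
)
kf = kf.em(X=obs, n_iter=10)                  # EM estimates the parameters in em_vars
filtered_mean, filtered_cov = kf.filter(obs)  # Kalman filtering with the learned parameters
smoothed_mean, smoothed_cov = kf.smooth(obs)  # optional Kalman smoothing
```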

## Citation

```
@article{shi2021kalman,
  author={Zhuangwei Shi},
  title={Incorporating Transformer and LSTM to Kalman Filter with EM algorithm for state estimation},
  journal={arXiv preprint arXiv:2105.00250},
  year={2021},
}
```

--------------------------------------------------------------------------------
/lstm.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable


def create_dataset(datas, look_back):
    # Scale the series by its range, then build sliding windows of length
    # `look_back` as inputs and the next value as the regression target.
    dataset = datas.astype('float32')
    max_value = np.max(dataset)
    min_value = np.min(dataset)
    scalar = max_value - min_value
    dataset = list(map(lambda x: x / scalar, dataset))

    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        a = dataset[i:(i + look_back)]
        dataX.append(a)
        dataY.append(dataset[i + look_back])
    return dataset, np.array(dataX), np.array(dataY)


class lstm_reg(nn.Module):
    """Multi-layer LSTM followed by a linear regression head."""

    def __init__(self, input_size, hidden_size, output_size=1, num_layers=3):
        super(lstm_reg, self).__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers)
        self.reg = nn.Linear(hidden_size, output_size)  # regression head

    def forward(self, x):
        x, _ = self.rnn(x)  # (seq, batch, hidden)
        s, b, h = x.shape
        x = x.view(s * b, h)
        x = self.reg(x)
        x = x.view(s, b, -1)
        return x


def train(datas, look_back=5, step=200, epochtime=100):
    dataset, data_X, data_Y = create_dataset(datas, look_back)
    # 90/10 train/test split along the time axis
    train_size = int(len(data_X) * 0.9)
    test_size = len(data_X) - train_size
    train_X = data_X[:train_size]
    train_Y = data_Y[:train_size]
    test_X = data_X[train_size:]
    test_Y = data_Y[train_size:]
    train_X = train_X.reshape(-1, 1, look_back)  # (seq, batch=1, features)
    train_Y = train_Y.reshape(-1, 1, 1)
    test_X = test_X.reshape(-1, 1, look_back)

    train_x = torch.from_numpy(train_X)
    train_y = torch.from_numpy(train_Y)
    test_x = torch.from_numpy(test_X)

    net = lstm_reg(look_back, 2 * look_back)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    loss_list = []
    for e in range(epochtime):
        var_x = Variable(train_x)
        var_y = Variable(train_y)
        out = net(var_x)
        loss = criterion(out, var_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        if (e + 1) % 10 == 0:
            print('Epoch: {}, Loss: {:.5f}'.format(e + 1, loss.item()))

    # Predict over the whole series and prepend the first `look_back` scaled
    # values so the output has the same length as the input.
    net = net.eval()
    data_X = data_X.reshape(-1, 1, look_back)
    data_X = torch.from_numpy(data_X)
    var_data = Variable(data_X)
    pred_test = net(var_data)
    pred_test = pred_test.view(-1).data.numpy()
    pred_test = np.concatenate((np.array(dataset)[:look_back], pred_test))

    return pred_test, loss_list

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from pykalman import KalmanFilter
import lstm
import transformer
import time
import torch

random_state = np.random.RandomState(0)
torch.manual_seed(0)
T = 0.01    # sampling period (s)
step = 200  # number of simulated time steps
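
# The simulated plant is a discrete-time constant-acceleration kinematic model:
# the state is x = [position, velocity, acceleration], A below is the standard
# kinematic transition matrix for sampling period T, and C = [1, 0, 0] observes
# the position only, with process noise covariance Q and measurement noise R.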

'''
Step 1: Provide a de facto (ground-truth) system
'''

A = np.array([[1, T, 0.5*T*T],
              [0, 1, T],
              [0, 0, 1]])
B = [0, 0, 0]
C = [1, 0, 0]
D = [0]
Q = 0.01*np.eye(3)
R = 0.005*np.eye(1)
m0 = [0, 0, 0.1]
P0 = 0.1*np.eye(3)

kft = KalmanFilter(
    A, C, Q, R, B, D, m0, P0,
    random_state=random_state
)  # the true model
state, observation = kft.sample(
    n_timesteps=step,
    initial_state=m0
)  # simulate ground-truth states and noisy observations
#filtered_state_estimatet, f_covt = kft.filter(observation)
#smoothed_state_estimatet, s_covt = kft.smooth(observation)

'''
Step 2: Initialize our model
'''

# specify initial parameter guesses (to be refined by EM)
transition_matrix = A
transition_offset = B
observation_matrix = C
observation_offset = D
transition_covariance = 0.02*np.eye(3)
observation_covariance = np.eye(1)
initial_state_mean = [0, 0, 1]
initial_state_covariance = 5*np.eye(3)

kf = KalmanFilter(
    transition_matrix, observation_matrix, transition_covariance,
    observation_covariance, transition_offset, observation_offset,
    initial_state_mean, initial_state_covariance,
    random_state=random_state,
    em_vars=[
        #'transition_matrices', 'observation_matrices',
        'transition_covariance', 'observation_covariance',
        #'transition_offsets', 'observation_offsets',
        'initial_state_mean', 'initial_state_covariance'
    ]
)
# sample observations from the initial model (training data for the encoders)
data = kf.sample(n_timesteps=step, initial_state=initial_state_mean)[1]
# baseline filter/smoother that uses the guessed parameters directly
filtered_state_estimater, nf_cov = kf.filter(observation)
smoothed_state_estimater, ns_cov = kf.smooth(observation)

'''
Step 3: Learn good values for parameters named in `em_vars` using the EM algorithm
'''

def compute_tr(a):
    # Approximate a covariance matrix by (trace(a)/n) * I, i.e. an isotropic
    # matrix with the same average variance.
    size = a.shape[0]
    return (np.trace(a)/size)*np.eye(size)

def test(data, method='TL', n_iteration=10):
    # Encode the data with the chosen encoder ('TL', 'T', 'L', or plain EM),
    # run EM on the encoded sequence to estimate the covariances and initial
    # state, then filter/smooth the raw observations with those estimates.
    t_start = time.process_time()
    if method == 'TL':
        print('----transformer+lstm----')
        data, loss_list = transformer.train(data, step)
        data, loss_list = lstm.train(data)
        labelfilter = 'TL-KF'
        labelsmooth = 'TL-KS'
    elif method == 'L':
        print('----lstm----')
        data, loss_list = lstm.train(data)
        labelfilter = 'LSTM-KF'
        labelsmooth = 'LSTM-KS'
    elif method == 'T':
        print('----transformer----')
        data, loss_list = transformer.train(data, step)
        labelfilter = 'Transformer-KF'
        labelsmooth = 'Transformer-KS'
    else:
        print('----EM----')
        labelfilter = 'EM-KF'
        labelsmooth = 'EM-KS'

    t_train = time.process_time()
    kfem = kf.em(X=data, n_iter=n_iteration)
    t_em = time.process_time()
    print('train-time/sec', t_train - t_start)
    print('em-time/sec', t_em - t_train)
    Qem = compute_tr(kfem.transition_covariance)
    Rem = compute_tr(kfem.observation_covariance)
    P0em = compute_tr(kfem.initial_state_covariance)
    m0em = [0, 0, np.abs(kfem.initial_state_mean[2])]
    print('Q=', Qem)
    print('R=', Rem)
    print('m0=', m0em)
    print('P0=', P0em)
    kfem = KalmanFilter(
        A, C, Qem, Rem, B, D, m0em, P0em,
        random_state=random_state
    )
    #obsem = kfem.sample(n_timesteps=step, initial_state=m0)[1]
    filtered_state_estimates, f_cov = kfem.filter(observation)
    smoothed_state_estimates, s_cov = kfem.smooth(observation)
    return filtered_state_estimates, f_cov, smoothed_state_estimates, s_cov, labelfilter, labelsmooth


# draw estimates
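# Compare the baseline KF/KS (guessed parameters from Step 2) against the
# filter/smoother with EM-estimated parameters obtained from the encoded data.
# The MSE reported below is ||x_est[:, 0] - x_true[:, 0]||^2 / step, i.e. the
# mean squared error of the position estimate.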
filtered_state_estimates, f_cov, smoothed_state_estimates, s_cov, labelfilter, labelsmooth = test(data[:, 0], n_iteration=10)
#print('emkf=', filtered_state_estimates[:, 0].tolist())
#print('emks=', smoothed_state_estimates[:, 0].tolist())

# position errors of each estimator against the ground truth
filtered_delta_estimater = filtered_state_estimater[:, 0] - state[:, 0]
smoothed_delta_estimater = smoothed_state_estimater[:, 0] - state[:, 0]
filtered_delta_estimates = filtered_state_estimates[:, 0] - state[:, 0]
smoothed_delta_estimates = smoothed_state_estimates[:, 0] - state[:, 0]

# mean squared error of the position estimates
msefr = np.linalg.norm(filtered_delta_estimater)**2/step
msesr = np.linalg.norm(smoothed_delta_estimater)**2/step
msefs = np.linalg.norm(filtered_delta_estimates)**2/step
msess = np.linalg.norm(smoothed_delta_estimates)**2/step
print('----MSE----')
print('KF', msefr)
print('KS', msesr)
print(labelfilter, msefs)
print(labelsmooth, msess)

# draw
taxis = np.linspace(0, step*T, step)
plt.figure()
lines_true = plt.scatter(taxis, state[:, 0], color='c', label='True')
lines_filter = plt.plot(taxis, filtered_state_estimater[:, 0], 'r', label='KF')
lines_smoother = plt.plot(taxis, smoothed_state_estimater[:, 0], 'r--', label='KS')
lines_filt = plt.plot(taxis, filtered_state_estimates[:, 0], 'b', label=labelfilter)
lines_smooth = plt.plot(taxis, smoothed_state_estimates[:, 0], 'b--', label=labelsmooth)
plt.xlim(0, step*T)
plt.xlabel('Time/s')
plt.ylabel('x/m')
plt.legend()
plt.grid()
plt.figure()
dlines_filter = plt.plot(taxis, filtered_delta_estimater, 'r', label='KF')
dlines_smoother = plt.plot(taxis, smoothed_delta_estimater, 'r--', label='KS')
dlines_filt = plt.plot(taxis, filtered_delta_estimates, 'b', label=labelfilter)
dlines_smooth = plt.plot(taxis, smoothed_delta_estimates, 'b--', label=labelsmooth)
#plt.plot(observation[:, 0] - state[:, 0], color='c')
plt.xlim(0, step*T)
plt.xlabel('Time/s')
plt.ylabel('Error/m')
plt.legend()
plt.grid()
plt.show()

--------------------------------------------------------------------------------
/transformer.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
import torch
import math
from torch.autograd import Variable


class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of the transformer
        :param attn_heads: number of attention heads
        :param feed_forward_hidden: feed-forward hidden size, usually 4*hidden
        :param dropout: dropout rate
        """
        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.hidden = hidden

    def forward(self, x, mask):
        # Add the (fixed, non-learned) positional encoding, then apply self-attention
        # and the feed-forward network, each inside a pre-norm residual sublayer.
        self.position = PositionalEmbedding(self.hidden, x.size()[1])
        x = x + self.position(x)
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)


class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_len=512):
        super().__init__()

        # Compute the sinusoidal positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return self.pe[:, :x.size(1)]


class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear layer.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.output_linear(x)


class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention'.
    """

    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) \
                 / math.sqrt(query.size(-1))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn


class PositionwiseFeedForward(nn.Module):
    "Implements the position-wise FFN equation."
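    # FFN(x) = w_2(dropout(GELU(w_1(x)))): the position-wise feed-forward network
    # of the Transformer, with GELU in place of ReLU (as in BERT).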

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))


class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is applied first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply a residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))


class LayerNorm(nn.Module):
    "Construct a layernorm module (see citation for details)."

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph: note that BERT uses GELU instead of ReLU.
    """

    def forward(self, x):
        # tanh approximation of the Gaussian Error Linear Unit
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def train(datas, hidden, sample_size=1, max_len=1, attn_heads=4, dropout=0.1, epochtime=500):
    # Train the Transformer block to reconstruct the observation sequence; with the
    # defaults used in main.py (sample_size=1, max_len=1), the whole length-`hidden`
    # sequence is treated as a single token of dimension `hidden`.
    net = TransformerBlock(hidden, attn_heads, hidden * 4, dropout)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
    loss_list = []
    datas = datas.reshape(sample_size, max_len, hidden)
    datas = torch.Tensor(datas)
    for e in range(epochtime):
        out = net(datas, mask=None)
        loss = criterion(out, datas)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_list.append(loss.item())
        if (e + 1) % 50 == 0:
            print('Epoch: {}, Loss: {:.5f}'.format(e + 1, loss.item()))

    net = net.eval()
    var_data = Variable(datas)
    pred_test = net(var_data, mask=None)
    pred_test = pred_test.view(-1).data.numpy().reshape(hidden)
    return pred_test, loss_list

--------------------------------------------------------------------------------