├── models
│   ├── utils
│   │   ├── gelu.py
│   │   ├── feed_forward.py
│   │   ├── layer_norm.py
│   │   └── sublayer.py
│   ├── attention
│   │   ├── single.py
│   │   └── multi_head.py
│   ├── embedding
│   │   ├── event_type.py
│   │   ├── event_embedding.py
│   │   └── position.py
│   ├── base.py
│   └── sahp.py
├── utils
│   ├── save_model.py
│   ├── atten_optimizer.py
│   ├── convert_realdata_syntheform.py
│   ├── load_synth_data.py
│   ├── evaluation.py
│   └── util.py
├── README.md
├── train_functions
│   └── train_sahp.py
└── main_func.py
--------------------------------------------------------------------------------
/models/embedding/event_type.py:
--------------------------------------------------------------------------------
import torch.nn as nn


class TypeEmbedding(nn.Embedding):
    def __init__(self, type_size, embed_size, padding_idx):
        # padding_idx is not necessarily 0 here
        super().__init__(type_size, embed_size, padding_idx=padding_idx)
--------------------------------------------------------------------------------
/models/utils/gelu.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import math


class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph: note that BERT uses GELU instead of ReLU.
    """

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
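
A quick sanity check (an editorial sketch, not part of the repository): the tanh formula above is the standard GELU approximation, so it should agree closely with PyTorch's exact `F.gelu`.

```python
import torch
import torch.nn.functional as F

x = torch.linspace(-3, 3, 7)
approx = GELU()(x)                    # tanh approximation from gelu.py
exact = F.gelu(x)                     # 0.5 * x * (1 + erf(x / sqrt(2)))
print((approx - exact).abs().max())   # small, on the order of 1e-3 or below
```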
--------------------------------------------------------------------------------
/models/utils/feed_forward.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from .gelu import GELU


class PositionwiseFeedForward(nn.Module):
    "Implements the FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = GELU()

    def forward(self, x):
        return self.w_2(self.dropout(self.activation(self.w_1(x))))
--------------------------------------------------------------------------------
/models/utils/layer_norm.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch


class LayerNorm(nn.Module):
    "Construct a layernorm module (see citation for details)."

    def __init__(self, size, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(size))
        self.b_2 = nn.Parameter(torch.zeros(size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
--------------------------------------------------------------------------------
/models/utils/sublayer.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from .layer_norm import LayerNorm


class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note: for code simplicity the norm is applied first, not last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply a residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))
--------------------------------------------------------------------------------
/models/attention/single.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch.nn.functional as F
import torch

import math
import numpy as np


class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention'.
    """
    def forward(self, query, key, value, mask=None, dropout=None):

        # standard scaled dot-product scores:
        # scores = torch.matmul(query, key.transpose(-2, -1)) \
        #          / math.sqrt(query.size(-1))

        # exponentiated variant used here:
        scores = torch.exp(torch.matmul(query, key.transpose(-2, -1))) \
                 / math.sqrt(query.size(-1))

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn
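
An editorial sketch (not from the repository) exercising `Attention` on random tensors, to make the expected shapes explicit: inputs are (batch, heads, length, head dim) and the returned attention weights are (batch, heads, length, length).

```python
import torch

attn = Attention()
q = k = v = torch.randn(2, 8, 5, 16)   # batch=2, heads=8, length=5, d_k=16
mask = torch.ones(2, 1, 5, 5)          # 1 = attend, 0 = blocked
out, weights = attn(q, k, v, mask=mask)
print(out.shape, weights.shape)        # torch.Size([2, 8, 5, 16]) torch.Size([2, 8, 5, 5])
```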
--------------------------------------------------------------------------------
/utils/save_model.py:
--------------------------------------------------------------------------------
import json
import os
import torch

SAVED_MODELS_PATH = "saved_models"


def save_model(model: torch.nn.Module, chosen_data_file, extra_tag, hidden_size, now_timestamp, model_name=None):
    if model_name is None:
        model_name = model.__class__.__name__
    filename_base = "{}-{}_hidden{}-{}".format(
        model_name, extra_tag,
        hidden_size, now_timestamp)
    filename_model_save = filename_base + ".pth"
    model_filepath = os.path.join(SAVED_MODELS_PATH, filename_model_save)
    print("Saving model to: {}".format(model_filepath))
    torch.save(model.state_dict(), model_filepath)

    file_correspondance = {
        "model_path": model_filepath,
        "data_path": chosen_data_file
    }
    print(file_correspondance)

    with open(os.path.join(SAVED_MODELS_PATH, "train_data_correspondance.jsonl"), "a") as f:
        json.dump(file_correspondance, f)
        f.write('\n')
--------------------------------------------------------------------------------
/models/embedding/event_embedding.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from .event_type import TypeEmbedding
from .position import PositionalEmbedding


class EventEmbedding(nn.Module):
    """
    Event embedding consisting of the following features:
        1. TypeEmbedding : normal embedding matrix
        2. PositionalEmbedding : adding positional information using sin, cos
        3. Time : TBD
    The sum of these features is the output of EventEmbedding.
    """

    def __init__(self, type_size, embed_size, dropout=0.1):
        """
        :param type_size: total number of event types
        :param embed_size: embedding size of the type embedding
        :param dropout: dropout rate
        """
        super().__init__()
        # self.type = TypeEmbedding(type_size=type_size, embed_size=embed_size)
        self.type = nn.Embedding(type_size + 1, embed_size)
        self.position = PositionalEmbedding(d_model=embed_size)
        self.dropout = nn.Dropout(p=dropout)
        self.embed_size = embed_size

    def forward(self, sequence):
        x = self.type(sequence) + self.position(sequence)
        return self.dropout(x)
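
An editorial usage sketch: `EventEmbedding` reserves one extra index (`type_size`) so that padding/BOS tokens can be embedded too; the sizes below are illustrative assumptions.

```python
import torch

emb = EventEmbedding(type_size=3, embed_size=16)    # types 0..2, index 3 for padding/BOS
seq = torch.tensor([[0, 2, 1, 3], [1, 1, 0, 3]])    # two sequences of length 4
print(emb(seq).shape)                               # torch.Size([2, 4, 16])
```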
--------------------------------------------------------------------------------
/utils/atten_optimizer.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt


class NoamOpt:
    "Optim wrapper that implements the warmup learning-rate schedule."

    def __init__(self, model_size, factor, warmup, initial_lr, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        self.initial_lr = initial_lr

    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        "Compute the learning rate for the given step."
        if step is None:
            step = self._step
        return self.initial_lr + self.factor * \
            (self.model_size ** (-0.5) *
             min(step ** (-0.5), step * self.warmup ** (-1.5)))


if __name__ == '__main__':
    opts = [NoamOpt(512, 1, 4000, 0.01, None),
            NoamOpt(512, 1, 8000, 0.01, None),
            NoamOpt(256, 1, 4000, 0.01, None)]
    plt.plot(np.arange(1, 20000), [[opt.rate(i) for opt in opts] for i in range(1, 20000)])
    plt.legend(["512:4000", "512:8000", "256:4000"])
    plt.show()
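
An editorial sketch of how `NoamOpt` wraps a real optimizer, mirroring the construction in `train_functions/train_sahp.py` (`factor=1`, `warmup=100`); the toy linear model is an assumption for illustration.

```python
import torch
from torch import nn, optim

toy_model = nn.Linear(16, 16)
base_opt = optim.Adam(toy_model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
model_opt = NoamOpt(model_size=16, factor=1, warmup=100, initial_lr=1e-4, optimizer=base_opt)

loss = toy_model(torch.randn(4, 16)).pow(2).mean()
base_opt.zero_grad()
loss.backward()
model_opt.step()           # sets the scheduled lr on all param groups, then steps
print(model_opt.rate())    # current learning rate
```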
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# SAHP

This is the repository for the [Self-Attentive Hawkes Processes](https://proceedings.icml.cc/static/paper_files/icml/2020/1421-Paper.pdf) paper, where self-attention is used to adapt the intensity function of a Hawkes process.

## Dataset
The real-world datasets are available on this [Google drive](https://drive.google.com/drive/folders/0BwqmV0EcoUc8UklIR1BKV25YR1U) while the synthetic dataset is at this [link](https://drive.google.com/file/d/1lRUIJx5UIPMx4TMwKy6GiAiP-k2vwvDc/view?usp=sharing). To run the model, you should download them to the parent directory of the source code, in a folder named `data`.

To make the data format consistent, it is necessary to run the script [convert_realdata_syntheform.py](utils/convert_realdata_syntheform.py) first.


## Package
The Python version should be at least 3.5 and the torch version can be 0.4.1.

## Scripts
`models` defines the self-attentive Hawkes model, multi-head attention and related modules.

`main_func.py` is the main function to run the experiments; hyper-parameters are provided here.

`utils` contains utility functions.

To run the model: `python main_func.py`

## Citation
```
@article{zhang2019self,
  title={Self-attentive Hawkes processes},
  author={Zhang, Qiang and Lipani, Aldo and Kirnap, Omer and Yilmaz, Emine},
  journal={arXiv preprint arXiv:1907.07561},
  year={2019}
}
```
--------------------------------------------------------------------------------
/models/attention/multi_head.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from .single import Attention


class MultiHeadedAttention(nn.Module):
    """
    Take in model size and number of heads.
    """

    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0

        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h

        self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model, bias=True) for _ in range(3)])
        self.output_linear = nn.Linear(d_model, d_model, bias=True)
        self.attention = Attention()

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            # the same mask applies to all heads;
            # unsqueeze inserts a dimension of size one at the given position
            mask = mask.unsqueeze(1)

        batch_size = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linear_layers, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)

        return self.output_linear(x)
--------------------------------------------------------------------------------
/models/embedding/position.py:
--------------------------------------------------------------------------------
import torch.nn as nn
import torch
import math


class PositionalEmbedding(nn.Module):

    def __init__(self, d_model, max_len=4096):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # support both batched (batch, length) and unbatched (length,) inputs
        length = x.size(1) if x.dim() > 1 else x.size(0)
        return self.pe[:, :length]


class BiasedPositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=4096):
        super().__init__()

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        self.register_buffer('position', position)
        self.register_buffer('div_term', div_term)

        self.Wt = nn.Linear(1, d_model // 2, bias=False)

    def forward(self, x, interval):
        phi = self.Wt(interval.unsqueeze(-1))
        length = x.size(1) if x.dim() > 1 else x.size(0)

        arc = (self.position[:length] * self.div_term).unsqueeze(0)

        pe_sin = torch.sin(arc + phi)
        pe_cos = torch.cos(arc + phi)
        pe = torch.cat([pe_sin, pe_cos], dim=-1)

        return pe
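
An editorial shape check for `BiasedPositionalEmbedding`: it combines a sinusoidal position term with a learned phase shift computed from the inter-event intervals.

```python
import torch

pos = BiasedPositionalEmbedding(d_model=16)
types = torch.randint(0, 3, (2, 5))    # (batch, length) event types
intervals = torch.rand(2, 5)           # matching inter-event times
print(pos(types, intervals).shape)     # torch.Size([2, 5, 16])
```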
--------------------------------------------------------------------------------
/utils/convert_realdata_syntheform.py:
--------------------------------------------------------------------------------
import pickle
import numpy as np


def convert_task_sub(task, sub):
    timestamps_list = []
    types_list = []
    lengths_list = []
    timeintervals_list = []

    file_path = '../../data/' + task + '/' + sub + '.pkl'
    with open(file_path, 'rb') as f:
        file = pickle.load(f, encoding='latin1')
        dim_process = file['dim_process']
        print('dim_process: {} for task: {}'.format(dim_process, task))
        seqs = file[sub]
        one_seq_num = 0
        for seq in seqs:
            timestamps = []
            types = []
            timeintervals = []
            for event in seq:
                event_type = event['type_event']
                event_timestamp = event['time_since_start']
                event_timeinterval = event['time_since_last_event']

                timestamps.append(event_timestamp)
                types.append(event_type)
                timeintervals.append(event_timeinterval)
            lengths = len(seq)
            if lengths == 1:  # skip one-event sequences
                one_seq_num += 1
                continue
            timestamps_list.append(np.asarray(timestamps))
            types_list.append(np.asarray(types))
            lengths_list.append(np.asarray(lengths))
            timeintervals_list.append(np.asarray(timeintervals))

    print('one_seq_num: {}'.format(one_seq_num))
    save_path = '../../data/' + task + '/' + sub + '_manifold_format.pkl'
    with open(save_path, "wb") as f:
        save_data_ = {'timestamps': np.asarray(timestamps_list),
                      'types': np.asarray(types_list),
                      'lengths': np.asarray(lengths_list),
                      'timeintervals': np.asarray(timeintervals_list)
                      }
        pickle.dump(save_data_, f)


if __name__ == '__main__':
    task_list = ['bookorder']
    sub_dataset = ['train', 'dev', 'test']

    for task in task_list:
        for sub in sub_dataset:
            convert_task_sub(task, sub)
--------------------------------------------------------------------------------
/utils/load_synth_data.py:
--------------------------------------------------------------------------------
import torch
from torch import Tensor, nn
from typing import Tuple
import pickle


def read_syn(file_name):
    with open(file_name, 'rb') as f:
        loaded_hawkes_data = pickle.load(f)

    mu = loaded_hawkes_data['mu']
    alpha = loaded_hawkes_data['alpha']
    decay = loaded_hawkes_data['decay']
    tmax = loaded_hawkes_data['tmax']

    print("Simulated Hawkes process parameters:")
    for label, val in [("mu", mu), ("alpha", alpha), ("decay", decay), ("tmax", tmax)]:
        print("{:<20}{}".format(label, val))

    return loaded_hawkes_data, tmax


def process_loaded_sequences(loaded_hawkes_data: dict, process_dim: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """
    Preprocess synthetic Hawkes data by padding the sequences.
    Args:
        loaded_hawkes_data: dict with per-sequence 'timestamps', 'types' and 'lengths'
        process_dim: number of event types
    Returns:
        sequence event times, event types, sequence lengths (dim0: batch size)
        and the maximum timestamp tmax
    """
    # Tensor of sequence lengths (with an additional BOS event)
    seq_lengths = torch.Tensor(loaded_hawkes_data['lengths']).int()

    event_times_list = loaded_hawkes_data['timestamps']
    event_types_list = loaded_hawkes_data['types']
    event_times_list = [torch.from_numpy(e) for e in event_times_list]
    event_types_list = [torch.from_numpy(e) for e in event_types_list]

    tmax = 0
    for tsr in event_times_list:
        if torch.max(tsr) > tmax:
            tmax = torch.max(tsr)

    # Build a data tensor by padding
    seq_times = nn.utils.rnn.pad_sequence(event_times_list, batch_first=True, padding_value=tmax).float()
    seq_times = torch.cat((torch.zeros_like(seq_times[:, :1]), seq_times), dim=1)  # add 0 to the sequence beginning

    seq_types = nn.utils.rnn.pad_sequence(event_types_list, batch_first=True, padding_value=process_dim)
    seq_types = torch.cat(
        (process_dim * torch.ones_like(seq_types[:, :1]), seq_types), dim=1).long()  # convert from FloatTensor to LongTensor

    return seq_times, seq_types, seq_lengths, tmax


def one_hot_embedding(labels: Tensor, num_classes: int) -> torch.Tensor:
    """Embed labels in one-hot form. Produces an easy-to-use mask to select components of the intensity.
    Args:
        labels: class labels, sized [N,].
        num_classes: number of classes.
    Returns:
        (tensor) encoded labels, sized [N, #classes].
    """
    device = labels.device
    y = torch.eye(num_classes).to(device)
    return y[labels]
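
An editorial sketch of `process_loaded_sequences` on a minimal hand-built dataset in the synthetic-data format (per-sequence numpy arrays); the values are illustrative.

```python
import numpy as np

toy = {
    'lengths': [3, 2],
    'timestamps': [np.array([0.5, 1.2, 2.0]), np.array([0.3, 0.9])],
    'types': [np.array([0, 1, 0]), np.array([1, 1])],
}
seq_times, seq_types, seq_lengths, tmax = process_loaded_sequences(toy, process_dim=2)
print(seq_times)   # a 0 is prepended to every sequence; padding uses tmax
print(seq_types)   # the BOS/pad positions carry the index process_dim = 2
```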
--------------------------------------------------------------------------------
/utils/evaluation.py:
--------------------------------------------------------------------------------
import numpy as np
import tqdm
from models.base import SeqGenerator


def generate_multiple_sequences(generator: SeqGenerator, tmax: float, n_gen_seq: int = 100):
    """
    Generate several sequences with the trained generator and collect length statistics.

    Args:
        generator: sequence generator wrapping a trained model
        tmax: end time for the simulations
        n_gen_seq: number of samples to take
    """
    print("tmax:", tmax)
    # Build statistics for the number of events
    gen_seq_lengths = []
    gen_seq_types_lengths = []
    for i in range(n_gen_seq):
        print('Generating the {} sequence'.format(i))
        generator.generate_sequence(tmax, record_intensity=False)
        gen_seq_times = generator.event_times
        gen_seq_types = np.array(generator.event_types)
        gen_seq_lengths.append(len(gen_seq_times))
        gen_seq_types_lengths.append([
            (gen_seq_types == i).sum() for i in range(generator.model.input_size)
        ])
    gen_seq_lengths = np.array(gen_seq_lengths)
    gen_seq_types_lengths = np.array(gen_seq_types_lengths)

    print("Mean generated sequence length: {}".format(gen_seq_lengths.mean()))
    print("Generated sequence length std. dev: {}".format(gen_seq_lengths.std()))
    return gen_seq_lengths, gen_seq_types_lengths


def predict_test(model, seq_times, seq_types, seq_lengths, pad, device='cpu',
                 hmax: float = 40., use_jupyter: bool = False, rnn: bool = True):
    """Run predictions on the testing dataset.

    Args:
        seq_lengths: sequence lengths
        seq_types: event types
        seq_times: event timestamps
        model: trained model
        hmax: time horizon of the prediction integral
        use_jupyter: use the notebook-friendly tqdm progress bar

    Returns:
        time-increment estimates and errors, real and estimated event types
    """
    incr_estimates = []
    incr_real = []
    incr_errors = []
    types_real = []
    types_estimates = []
    test_size = seq_times.shape[0]
    if use_jupyter:
        index_range_ = tqdm.tnrange(test_size)
    else:
        index_range_ = tqdm.trange(test_size)
    for index_ in index_range_:
        _seq_data = (seq_times[index_],
                     seq_types[index_],
                     seq_lengths[index_])
        if rnn:
            est, real_dt, err, real_type, est_type = model.read_predict(*_seq_data, hmax)
        else:
            est, real_dt, err, real_type, est_type = model.read_predict(*_seq_data, pad, device, hmax)

        if err != err:  # NaN check
            continue
        incr_estimates.append(est)
        incr_real.append(real_dt)
        incr_errors.append(err)
        types_real.append(real_type)
        types_estimates.append(est_type)

    incr_estimates = np.asarray(incr_estimates)
    incr_errors = np.asarray(incr_errors)
    types_real = np.asarray(types_real)
    types_estimates = np.asarray(types_estimates)
    return incr_estimates, incr_errors, types_real, types_estimates
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
'''
Utility functions for the CTLSTM model
'''

import torch
from torch import nn
from utils.load_synth_data import one_hot_embedding


def get_batch(batch_size, i_batch, model, seq_lengths, seq_times, seq_types, rnn=True):
    start_pos = i_batch
    end_pos = i_batch + batch_size
    batch_seq_lengths = seq_lengths[start_pos:end_pos]
    max_seq_length = batch_seq_lengths[0]
    batch_seq_times = seq_times[start_pos:end_pos, :max_seq_length + 1]
    batch_seq_types = seq_types[start_pos:end_pos, :max_seq_length + 1]
    # Inter-event time intervals
    batch_dt = batch_seq_times[:, 1:] - batch_seq_times[:, :-1]

    batch_onehot = one_hot_embedding(batch_seq_types, model.input_size)
    batch_onehot = batch_onehot[:, :, :model.process_dim]  # [1,0], [0,1], [0,0]

    if rnn:
        # Pack the sequences for the RNN
        packed_dt = nn.utils.rnn.pack_padded_sequence(batch_dt, batch_seq_lengths, batch_first=True)
        packed_types = nn.utils.rnn.pack_padded_sequence(batch_seq_types, batch_seq_lengths, batch_first=True)
        max_pack_batch_size = packed_dt.batch_sizes[0]
    else:
        # self-attention does not need packing
        packed_dt, packed_types, max_pack_batch_size = None, None, 0
    return batch_onehot, batch_seq_times, batch_dt, batch_seq_types, \
        max_pack_batch_size, packed_dt, packed_types, batch_seq_lengths


def generate_sim_interval_seqs(interval_seqs, seqs_length):
    ''' Generate simulated time-interval sequences from the original time-interval
    sequences, based on a uniform distribution.

    Args:
        interval_seqs: list of torch float tensors
        seqs_length: sequence lengths
    Returns:
        sim_interval_seqs: list of torch float tensors
        sim_index_seqs: list of torch long tensors
    '''
    sim_interval_seqs = torch.zeros((interval_seqs.size()[0], interval_seqs.size()[1] - 1)).float()
    sim_index_seqs = torch.zeros((interval_seqs.size()[0], interval_seqs.size()[1] - 1)).long()
    restore_interval_seqs, restore_sim_interval_seqs = [], []
    for idx, interval_seq in enumerate(interval_seqs):
        restore_interval_seq = torch.stack([torch.sum(interval_seq[0:i]) for i in range(1, seqs_length[idx] + 1)])
        # Generate N-1 time points (N includes the BOS event)
        restore_sim_interval_seq, _ = torch.sort(torch.empty(seqs_length[idx] - 1).uniform_(0, restore_interval_seq[-1]))

        sim_interval_seq = torch.zeros(seqs_length[idx] - 1)
        sim_index_seq = torch.zeros(seqs_length[idx] - 1).long()

        for idx_t, t in enumerate(restore_interval_seq):
            indices_to_update = restore_sim_interval_seq > t

            sim_interval_seq[indices_to_update] = restore_sim_interval_seq[indices_to_update] - t
            sim_index_seq[indices_to_update] = idx_t

        restore_interval_seqs.append(restore_interval_seq)
        restore_sim_interval_seqs.append(restore_sim_interval_seq)
        sim_interval_seqs[idx, :seqs_length[idx] - 1] = sim_interval_seq
        sim_index_seqs[idx, :seqs_length[idx] - 1] = sim_index_seq

    # return both tensors, as documented above and as the __main__ demo expects
    return sim_interval_seqs, sim_index_seqs


def pad_bos(batch_data, type_size):
    event_seqs, interval_seqs, total_interval_seqs, seqs_length = batch_data
    pad_event_seqs = torch.zeros((event_seqs.size()[0], event_seqs.size()[1] + 1)).long()
    pad_interval_seqs = torch.zeros((interval_seqs.size()[0], event_seqs.size()[1] + 1)).float()

    pad_event_seqs[:, 1:] = event_seqs.clone()
    pad_event_seqs[:, 0] = type_size  # the BOS event gets the extra type index
    pad_interval_seqs[:, 1:] = interval_seqs.clone()

    return pad_event_seqs, pad_interval_seqs, total_interval_seqs, seqs_length


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if __name__ == '__main__':
    a = torch.tensor([0., 1., 2., 3., 4., 5.])
    b = torch.tensor([0., 2., 4., 6., 0., 0.])
    sim_interval_seqs, sim_index_seqs = generate_sim_interval_seqs(torch.stack([a, b]), torch.LongTensor([6, 4]))
    print(sim_interval_seqs)
    print(sim_index_seqs)
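
An editorial sketch of `pad_bos`: it prepends a beginning-of-sequence (BOS) event, encoded with the extra type index `type_size`, and a zero time interval.

```python
import torch

event_seqs = torch.tensor([[0, 1, 1]])
interval_seqs = torch.tensor([[0.5, 0.2, 0.7]])
batch = (event_seqs, interval_seqs, None, torch.tensor([3]))
pad_events, pad_intervals, _, _ = pad_bos(batch, type_size=2)
print(pad_events)      # tensor([[2, 0, 1, 1]])
print(pad_intervals)   # tensor([[0.0000, 0.5000, 0.2000, 0.7000]])
```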
--------------------------------------------------------------------------------
/models/base.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn, Tensor


class SeqGenerator:
    def __init__(self, model: nn.Module, record_intensity: bool = False):
        self.model = model
        self.process_dim = model.input_size - 1  # process dimension
        print("Process model dim:\t{}\tHidden units:\t{}".format(self.process_dim, model.hidden_size))
        self.event_times = []
        self.event_types = []
        self.decay_hist = []
        self.hidden_hist = []
        self.intens_hist = []
        self._plot_times = []
        self.event_intens = []
        self.record_intensity: bool = record_intensity

    def _restart_sequence(self):
        self.event_times = []
        self.event_types = []
        self.event_intens = []
        self.decay_hist = []
        self.hidden_hist = []
        self.intens_hist = []
        self._plot_times = []

    def generate_sequence(self, tmax: float, record_intensity: bool):
        raise NotImplementedError

    def plot_events_and_intensity(self, model_name: str = None, debug=False):
        gen_seq_times = self.event_times
        gen_seq_types = self.event_types
        sequence_length = len(gen_seq_times)
        print("no. of events: {}".format(sequence_length))
        evt_times = np.array(gen_seq_times)
        evt_types = np.array(gen_seq_types)
        fig, ax = plt.subplots(1, 1, sharex='all', dpi=100,
                               figsize=(9, 4.5))
        ax: plt.Axes
        inpt_size = self.process_dim
        ax.set_xlabel('Time $t$ (s)')
        intens_hist = np.stack(self.intens_hist)[:, 0]
        labels = ["type {}".format(i) for i in range(self.process_dim)]
        for y, lab in zip(intens_hist.T, labels):
            # plot the intensity curve
            ax.plot(self._plot_times, y, linewidth=.7, label=lab)
        ax.set_ylabel(r"Intensities $\lambda^i_t$")
        title = "Event arrival times and intensities for generated sequence"
        if model_name is None:
            model_name = self.model.__class__.__name__
        title += " ({})".format(model_name)
        ax.set_title(title)
        ylims = ax.get_ylim()
        ts_y = np.stack(self.event_intens)[:, 0]
        for k in range(inpt_size):
            mask = evt_types == k
            print(k, end=': ')
            if k == self.process_dim:
                print("starter type")
                y = self.intens_hist[0].sum(axis=1)
            else:
                print("type {}".format(k))
                y = ts_y[mask, k]
            # plot a vertical line for each event time of this type
            ax.vlines(evt_times[mask], ylims[0], ylims[1], linewidth=0.2, linestyles='-', alpha=0.5)

        # Useful for debugging the sampling for the intensity curve.
        if debug:
            for s in self._plot_times:
                ax.vlines(s, ylims[0], ylims[1], linewidth=0.3, linestyles='--', alpha=0.6, colors='red')

        ax.set_ylim(*ylims)
        ax.legend()
        fig.tight_layout()
        return fig


def predict_from_hidden(model, h_t_vals, dt_vals, next_dt, next_type, plot, hmax: float = 40.,
                        n_samples=1000, print_info: bool = False):
    model.eval()
    timestep = hmax / n_samples

    intens_t_vals: Tensor = model.intensity_layer(h_t_vals)
    intens_t_vals_sum = intens_t_vals.sum(dim=1)
    # definite integral of the total intensity on the grid
    integral_ = torch.cumsum(timestep * intens_t_vals_sum, dim=0)
    # density for the time-until-next-event law
    density = intens_t_vals_sum * torch.exp(-integral_)
    # Check the density
    if print_info:
        print("sum of density:", (timestep * density).sum())
    t_pit = dt_vals * density  # integrand for the time estimator
    ratio = intens_t_vals / intens_t_vals_sum[:, None]
    prob_type = ratio * density[:, None]  # integrand for the types
    # trapezoid method
    estimate_dt = (timestep * 0.5 * (t_pit[1:] + t_pit[:-1])).sum()
    estimate_type_prob = (timestep * 0.5 * (prob_type[1:] + prob_type[:-1])).sum(dim=0)
    if print_info:
        print("type probabilities:", estimate_type_prob)
    estimate_type = torch.argmax(estimate_type_prob)
    next_dt += 1e-5
    error_dt = ((estimate_dt - next_dt) / next_dt) ** 2  # squared relative error
    if plot:
        process_dim = model.process_dim
        fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(10, 4), dpi=100)
        ax0.plot(dt_vals.numpy(), density.numpy(),
                 linestyle='-', linewidth=.8)
        ax0.set_title("Probability density $p_i(u)$\nof the next increment")
        ax0.set_xlabel("Time $u$")
        ax0.set_ylabel('density $p_i(u)$')
        ylims = ax0.get_ylim()
        ax0.vlines(estimate_dt.item(), *ylims,
                   linestyle='--', linewidth=.7, color='red',
                   label=r'estimate $\hat{t}_i - t_{i-1}$')
        ax0.vlines(next_dt.item(), *ylims,
                   linestyle='--', linewidth=.7, color='green',
                   label=r'true $t_i - t_{i-1}$')
        ax0.set_ylim(ylims)
        ax0.legend()
        ax1.plot(dt_vals.numpy(), intens_t_vals_sum.numpy(),
                 linestyle='-', linewidth=.7, label=r'total intensity $\bar\lambda$')
        for k in range(process_dim):
            ax1.plot(dt_vals.numpy(), intens_t_vals[:, k].numpy(),
                     label='type {}'.format(k),
                     linestyle='--', linewidth=.7)
        ax1.set_title("Intensities")
        ax1.set_xlabel("Time $t$")
        ax1.legend()
        return (estimate_dt, next_dt, error_dt, next_type, estimate_type), fig
    return estimate_dt, next_dt, error_dt, next_type, estimate_type
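
`predict_from_hidden` estimates the next inter-event time as E[dt] = ∫ u·p(u) du, where p(u) = λ̄(u)·exp(−∫₀ᵘ λ̄), evaluated by the trapezoid rule on a grid. An editorial standalone illustration of the same estimator, with a hand-chosen constant intensity so the true mean is 1/λ:

```python
import torch

lam, hmax, n_samples = 1.0, 40.0, 1000
timestep = hmax / n_samples
dt_vals = torch.linspace(0, hmax, n_samples + 1)
intens_sum = torch.full_like(dt_vals, lam)        # constant total intensity
integral_ = torch.cumsum(timestep * intens_sum, dim=0)
density = intens_sum * torch.exp(-integral_)      # approx. lam * exp(-lam * u)
t_pit = dt_vals * density
estimate_dt = (timestep * 0.5 * (t_pit[1:] + t_pit[:-1])).sum()
print(estimate_dt)   # ~0.96: close to 1 / lam = 1.0, slightly low because the
                     # cumsum is a right-Riemann approximation of the integral
```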
--------------------------------------------------------------------------------
/train_functions/train_sahp.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch import autograd

import numpy as np
import random

from models.sahp import SAHP
from utils import atten_optimizer
from utils import util


def make_model(nLayers=6, d_model=128, atten_heads=8, dropout=0.1, process_dim=10,
               device='cpu', pe='concat', max_sequence_length=4096):
    "helper: construct a model from hyperparameters"

    model = SAHP(nLayers, d_model, atten_heads, dropout=dropout, process_dim=process_dim,
                 device=device, max_sequence_length=max_sequence_length)

    # initialize parameters with Glorot / fan_avg
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model


def subsequent_mask(size):
    "mask out subsequent positions"
    atten_shape = (1, size, size)
    # np.triu: return a copy of a matrix with the elements below the k-th diagonal zeroed
    mask = np.triu(np.ones(atten_shape), k=1).astype('uint8')
    return torch.from_numpy(mask) == 0


class MaskBatch():
    "object for holding a batch of data with mask during training"

    def __init__(self, src, pad, device):
        self.src = src
        self.src_mask = self.make_std_mask(self.src, pad, device)

    @staticmethod
    def make_std_mask(tgt, pad, device):
        "create a mask to hide padding and future input"
        tgt_mask = (tgt != pad).unsqueeze(-2)
        tgt_mask = tgt_mask & Variable(subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)).to(device)
        return tgt_mask


def l1_loss(model):
    ## l1 loss
    l1 = 0
    for p in model.parameters():
        l1 = l1 + p.abs().sum()
    return l1


def eval_sahp(batch_size, loop_range, seq_lengths, seq_times, seq_types, model, device, lambda_l1=0):
    model.eval()
    epoch_loss = 0
    for i_batch in loop_range:
        batch_onehot, batch_seq_times, batch_dt, batch_seq_types, _, _, _, batch_seq_lengths = \
            util.get_batch(batch_size, i_batch, model, seq_lengths, seq_times, seq_types, rnn=False)
        batch_seq_types = batch_seq_types[:, 1:]

        masked_seq_types = MaskBatch(batch_seq_types, pad=model.process_dim, device=device)  # exclude the first added event
        model.forward(batch_dt, masked_seq_types.src, masked_seq_types.src_mask)
        nll = model.compute_loss(batch_seq_times, batch_onehot)

        loss = nll
        epoch_loss += loss.detach()
    event_num = torch.sum(seq_lengths).float()
    model.train()
    return event_num, epoch_loss


def train_eval_sahp(params):

    args, process_dim, device, tmax, \
        train_seq_times, train_seq_types, train_seq_lengths, \
        dev_seq_times, dev_seq_types, dev_seq_lengths, \
        test_seq_times, test_seq_types, test_seq_lengths, \
        batch_size, epoch_num, use_cuda = params

    ## Reorder each split by descending sequence length
    train_seq_lengths, reorder_indices_train = train_seq_lengths.sort(descending=True)
    train_seq_times = train_seq_times[reorder_indices_train]
    train_seq_types = train_seq_types[reorder_indices_train]

    dev_seq_lengths, reorder_indices_dev = dev_seq_lengths.sort(descending=True)
    dev_seq_times = dev_seq_times[reorder_indices_dev]
    dev_seq_types = dev_seq_types[reorder_indices_dev]

    test_seq_lengths, reorder_indices_test = test_seq_lengths.sort(descending=True)
    test_seq_times = test_seq_times[reorder_indices_test]
    test_seq_types = test_seq_types[reorder_indices_test]

    max_sequence_length = max(train_seq_lengths[0], dev_seq_lengths[0], test_seq_lengths[0])
    print('max_sequence_length: {}'.format(max_sequence_length))

    d_model = args.d_model
    atten_heads = args.atten_heads
    dropout = args.dropout

    model = make_model(nLayers=args.nLayers, d_model=d_model, atten_heads=atten_heads,
                       dropout=dropout, process_dim=process_dim, device=device, pe=args.pe,
                       max_sequence_length=max_sequence_length + 1).to(device)

    print("the number of trainable parameters: " + str(util.count_parameters(model)))

    optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9, weight_decay=args.lambda_l2)
    model_opt = atten_optimizer.NoamOpt(args.d_model, 1, 100, initial_lr=args.lr, optimizer=optimizer)

    ## Size of the training dataset
    train_size = train_seq_times.size(0)
    dev_size = dev_seq_times.size(0)
    test_size = test_seq_times.size(0)
    tr_loop_range = list(range(0, train_size, batch_size))
    de_loop_range = list(range(0, dev_size, batch_size))
    test_loop_range = list(range(0, test_size, batch_size))

    last_dev_loss = 0.0
    early_step = 0

    model.train()
    for epoch in range(epoch_num):
        epoch_train_loss = 0.0
        print('Epoch {} starts '.format(epoch))

        ## training
        random.shuffle(tr_loop_range)
        for i_batch in tr_loop_range:

            model_opt.optimizer.zero_grad()

            batch_onehot, batch_seq_times, batch_dt, batch_seq_types, _, _, _, batch_seq_lengths = \
                util.get_batch(batch_size, i_batch, model, train_seq_lengths, train_seq_times, train_seq_types, rnn=False)

            batch_seq_types = batch_seq_types[:, 1:]

            masked_seq_types = MaskBatch(batch_seq_types, pad=model.process_dim, device=device)  # exclude the first added event
            model.forward(batch_dt, masked_seq_types.src, masked_seq_types.src_mask)
            nll = model.compute_loss(batch_seq_times, batch_onehot)

            loss = nll

            loss.backward()
            model_opt.optimizer.step()

            if i_batch % 50 == 0:
                batch_event_num = torch.sum(batch_seq_lengths).float()
                print('Epoch {} Batch {}: Negative Log-Likelihood per event: {:5f} nats'
                      .format(epoch, i_batch, loss.item() / batch_event_num))
            epoch_train_loss += loss.detach()

        if epoch_train_loss < 0:
            break
        train_event_num = torch.sum(train_seq_lengths).float()
        print('---\nEpoch.{} Training set\nTrain Negative Log-Likelihood per event: {:5f} nats\n'
              .format(epoch, epoch_train_loss / train_event_num))

        ## dev
        dev_event_num, epoch_dev_loss = eval_sahp(batch_size, de_loop_range, dev_seq_lengths, dev_seq_times,
                                                  dev_seq_types, model, device, args.lambda_l2)
        print('Epoch.{} Development set\nDev Negative Log-Likelihood per event: {:5f} nats.\n'
              .format(epoch, epoch_dev_loss / dev_event_num))

        ## test
        test_event_num, epoch_test_loss = eval_sahp(batch_size, test_loop_range, test_seq_lengths, test_seq_times,
                                                    test_seq_types, model, device, args.lambda_l2)
        print('Epoch.{} Test set\nTest Negative Log-Likelihood per event: {:5f} nats.\n'
              .format(epoch, epoch_test_loss / test_event_num))

        ## early stopping
        gap = epoch_dev_loss / dev_event_num - last_dev_loss
        if abs(gap) < args.early_stop_threshold:
            early_step += 1
        last_dev_loss = epoch_dev_loss / dev_event_num

        if early_step >= 3:
            print('Early Stopping')
            break

    # prediction
    avg_rmse, types_predict_score = \
        prediction_evaluation(device, model, test_seq_lengths, test_seq_times, test_seq_types, test_size, tmax)

    return model


def prediction_evaluation(device, model, test_seq_lengths, test_seq_times, test_seq_types, test_size, tmax):
    model.eval()
    from utils import evaluation
    test_data = (test_seq_times, test_seq_types, test_seq_lengths)
    incr_estimates, incr_errors, types_real, types_estimates = \
        evaluation.predict_test(model, *test_data, pad=model.process_dim, device=device,
                                hmax=tmax, use_jupyter=False, rnn=False)
    if device != 'cpu':
        incr_errors = [incr_err.item() for incr_err in incr_errors]
        types_real = [types_rl.item() for types_rl in types_real]
        types_estimates = [types_esti.item() for types_esti in types_estimates]

    avg_rmse = np.sqrt(np.mean(incr_errors), dtype=np.float64)
    print("rmse", avg_rmse)
    mse_var = np.var(incr_errors, dtype=np.float64)

    delta_meth_stderr = 1 / test_size * mse_var / (4 * avg_rmse)

    from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
    types_predict_score = f1_score(types_real, types_estimates, average='micro')  # preferable under class imbalance
    print("Type prediction score:", types_predict_score)
    # print("Confusion matrix:\n", confusion_matrix(types_real, types_estimates))
    model.train()
    return avg_rmse, types_predict_score


if __name__ == "__main__":
    mode = 'train'

    if mode == 'train':
        # NOTE: train_eval_sahp expects the params tuple assembled in main_func.py;
        # calling it without arguments, as below, fails until that tuple is supplied.
        with autograd.detect_anomaly():
            train_eval_sahp()

    else:
        pass
    print("Done!")
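
An editorial sketch of the mask machinery above: `MaskBatch` combines a padding mask with `subsequent_mask`, so position i can attend only to non-padded positions ≤ i.

```python
import torch

src = torch.tensor([[0, 1, 2, 2]])   # pad index = 2, so the last two slots are padding
mb = MaskBatch(src, pad=2, device='cpu')
print(mb.src_mask.int())
# tensor([[[1, 0, 0, 0],
#          [1, 1, 0, 0],
#          [1, 1, 0, 0],
#          [1, 1, 0, 0]]], dtype=torch.int32)
```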
--------------------------------------------------------------------------------
/main_func.py:
--------------------------------------------------------------------------------
import argparse
import datetime
import glob
import os
import pickle
import numpy as np
import time

import torch
from torch import autograd

from utils.load_synth_data import process_loaded_sequences
from train_functions.train_sahp import make_model, train_eval_sahp

DEFAULT_BATCH_SIZE = 32
DEFAULT_HIDDEN_SIZE = 16
DEFAULT_LEARN_RATE = 5e-5

parser = argparse.ArgumentParser(description="Train the models.")
parser.add_argument('-e', '--epochs', type=int, default=1000,
                    help='number of epochs.')
parser.add_argument('-b', '--batch', type=int,
                    dest='batch_size', default=DEFAULT_BATCH_SIZE,
                    help='batch size. (default: {})'.format(DEFAULT_BATCH_SIZE))
parser.add_argument('--lr', default=DEFAULT_LEARN_RATE, type=float,
                    help="set the optimizer learning rate. (default {})".format(DEFAULT_LEARN_RATE))
parser.add_argument('--hidden', type=int,
                    dest='hidden_size', default=DEFAULT_HIDDEN_SIZE,
                    help='number of hidden units. (default: {})'.format(DEFAULT_HIDDEN_SIZE))
parser.add_argument('--d-model', type=int, default=DEFAULT_HIDDEN_SIZE)
parser.add_argument('--atten-heads', type=int, default=8)
parser.add_argument('--pe', type=str, default='add', help='concat, add')
parser.add_argument('--nLayers', type=int, default=4)
parser.add_argument('--dropout', type=float, default=0.1)
parser.add_argument('--cuda', type=int, default=0)
parser.add_argument('--train-ratio', type=float, default=0.8,
                    help='override the size of the training dataset.')
parser.add_argument('--lambda-l2', type=float, default=3e-4,
                    help='regularization loss.')
parser.add_argument('--dev-ratio', type=float, default=0.1,
                    help='override the size of the dev dataset.')
parser.add_argument('--early-stop-threshold', type=float, default=1e-2,
                    help='early_stop_threshold')
parser.add_argument('--log-dir', type=str,
                    dest='log_dir', default='logs',
                    help="training logs target directory.")
parser.add_argument('--save_model', default=False,
                    help="save the model state dict and loss history.")
parser.add_argument('--bias', default=False,
                    help="use bias on the activation (intensity) layer.")
parser.add_argument('--samples', default=10,
                    help="number of samples in the integral.")
parser.add_argument('-m', '--model', default='sahp',
                    type=str, choices=['sahp'],
                    help='choose which model to train.')
parser.add_argument('-t', '--task', type=str, default='retweet',
                    help='task type')
args = parser.parse_args()
print(args)

if torch.cuda.is_available():
    USE_CUDA = True
else:
    USE_CUDA = False

SYNTH_DATA_FILES = glob.glob("../data/simulated/*.pkl")
TYPE_SIZE_DICT = {'retweet': 3, 'bookorder': 8, 'meme': 5000, 'mimic': 75, 'stackOverflow': 22,
                  'synthetic': 2}
REAL_WORLD_TASKS = list(TYPE_SIZE_DICT.keys())[:5]
SYNTHETIC_TASKS = list(TYPE_SIZE_DICT.keys())[5:]

start_time = time.time()

if __name__ == '__main__':
    cuda_num = 'cuda:{}'.format(args.cuda)
    device = torch.device(cuda_num if USE_CUDA else 'cpu')
    print("Training on device {}".format(device))

    process_dim = TYPE_SIZE_DICT[args.task]
    print("Loading {}-dimensional process.".format(process_dim))

    if args.task in SYNTHETIC_TASKS:
        print("Available files:")
        for i, s in enumerate(SYNTH_DATA_FILES):
            print("{:<8}{:<8}".format(i, s))

        chosen_file_index = -1
        chosen_file = SYNTH_DATA_FILES[chosen_file_index]
        print('chosen file: {}'.format(chosen_file))

        with open(chosen_file, 'rb') as f:
            loaded_hawkes_data = pickle.load(f)

        mu = loaded_hawkes_data['mu']
        alpha = loaded_hawkes_data['alpha']
        decay = loaded_hawkes_data['decay']
        tmax = loaded_hawkes_data['tmax']
        print("Simulated Hawkes process parameters:")
        for label, val in [("mu", mu), ("alpha", alpha), ("decay", decay), ("tmax", tmax)]:
            print("{:<20}{}".format(label, val))

        seq_times, seq_types, seq_lengths, _ = process_loaded_sequences(loaded_hawkes_data, process_dim)

        seq_times = seq_times.to(device)
        seq_types = seq_types.to(device)
        seq_lengths = seq_lengths.to(device)

        total_sample_size = seq_times.size(0)
        print("Total sample size: {}".format(total_sample_size))

        train_ratio = args.train_ratio
        train_size = int(train_ratio * total_sample_size)
        dev_ratio = args.dev_ratio
        dev_size = int(dev_ratio * total_sample_size)
        print("Train sample size: {:}/{:}".format(train_size, total_sample_size))
        print("Dev sample size: {:}/{:}".format(dev_size, total_sample_size))

        # Define training data
        train_times_tensor = seq_times[:train_size]
        train_seq_types = seq_types[:train_size]
        train_seq_lengths = seq_lengths[:train_size]
        print("No. of event tokens in training subset:", train_seq_lengths.sum())

        # Define development data (could be seq_times[train_size:train_size + dev_size]
        # to hold out a separate test split)
        dev_times_tensor = seq_times[train_size:]
        dev_seq_types = seq_types[train_size:]
        dev_seq_lengths = seq_lengths[train_size:]
        print("No. of event tokens in development subset:", dev_seq_lengths.sum())

        # The test set reuses the development data here
        test_times_tensor = dev_times_tensor
        test_seq_types = dev_seq_types
        test_seq_lengths = dev_seq_lengths
        print("No. of event tokens in test subset:", test_seq_lengths.sum())
    elif args.task in REAL_WORLD_TASKS:
        train_path = '../data/' + args.task + '/train_manifold_format.pkl'
        dev_path = '../data/' + args.task + '/dev_manifold_format.pkl'
        test_path = '../data/' + args.task + '/test_manifold_format.pkl'

        chosen_file = args.task

        with open(train_path, 'rb') as f:
            train_hawkes_data = pickle.load(f)
        with open(dev_path, 'rb') as f:
            dev_hawkes_data = pickle.load(f)
        with open(test_path, 'rb') as f:
            test_hawkes_data = pickle.load(f)

        train_seq_times, train_seq_types, train_seq_lengths, train_tmax = \
            process_loaded_sequences(train_hawkes_data, process_dim)
        dev_seq_times, dev_seq_types, dev_seq_lengths, dev_tmax = \
            process_loaded_sequences(dev_hawkes_data, process_dim)
        test_seq_times, test_seq_types, test_seq_lengths, test_tmax = \
            process_loaded_sequences(test_hawkes_data, process_dim)

        tmax = max([train_tmax, dev_tmax, test_tmax])

        train_sample_size = train_seq_times.size(0)
        print("Train sample size: {}".format(train_sample_size))

        dev_sample_size = dev_seq_times.size(0)
        print("Dev sample size: {}".format(dev_sample_size))

        test_sample_size = test_seq_times.size(0)
        print("Test sample size: {}".format(test_sample_size))

        # Define training data
        train_times_tensor = train_seq_times.to(device)
        train_seq_types = train_seq_types.to(device)
        train_seq_lengths = train_seq_lengths.to(device)
        print("No. of event tokens in training subset:", train_seq_lengths.sum())

        # Define development data
        dev_times_tensor = dev_seq_times.to(device)
        dev_seq_types = dev_seq_types.to(device)
        dev_seq_lengths = dev_seq_lengths.to(device)
        print("No. of event tokens in development subset:", dev_seq_lengths.sum())

        # Define test data
        test_times_tensor = test_seq_times.to(device)
        test_seq_types = test_seq_types.to(device)
        test_seq_lengths = test_seq_lengths.to(device)
        print("No. of event tokens in test subset:", test_seq_lengths.sum())

    else:
        exit()

    MODEL_TOKEN = args.model
    print("Chose model {}".format(MODEL_TOKEN))
    hidden_size = args.hidden_size
    print("Hidden size: {}".format(hidden_size))
    learning_rate = args.lr
    # Training parameters
    BATCH_SIZE = args.batch_size
    EPOCHS = args.epochs

    model = None
    if MODEL_TOKEN == 'sahp':
        with autograd.detect_anomaly():
            params = args, process_dim, device, tmax, \
                train_times_tensor, train_seq_types, train_seq_lengths, \
                dev_times_tensor, dev_seq_types, dev_seq_lengths, \
                test_times_tensor, test_seq_types, test_seq_lengths, \
                BATCH_SIZE, EPOCHS, USE_CUDA
            model = train_eval_sahp(params)

    else:
        exit()

    if args.save_model:
        # Model file dump
        SAVED_MODELS_PATH = os.path.abspath('saved_models')
        os.makedirs(SAVED_MODELS_PATH, exist_ok=True)

        date_format = "%Y%m%d-%H%M%S"
        now_timestamp = datetime.datetime.now().strftime(date_format)
        extra_tag = "{}".format(args.task)
        filename_base = "{}-{}_hidden{}-{}".format(
            MODEL_TOKEN, extra_tag,
            hidden_size, now_timestamp)
        from utils.save_model import save_model
        save_model(model, chosen_file, extra_tag,
                   hidden_size, now_timestamp, MODEL_TOKEN)

    print('Done! time elapsed %.2f sec for %d epochs' % (time.time() - start_time, EPOCHS))
--------------------------------------------------------------------------------
/models/sahp.py:
--------------------------------------------------------------------------------
'''
self-attentive Hawkes process
'''

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
import math, copy

from models.embedding.event_type import TypeEmbedding
from models.embedding.position import PositionalEmbedding, BiasedPositionalEmbedding
from models.embedding.event_embedding import EventEmbedding
from models.attention.multi_head import MultiHeadedAttention
from models.utils.sublayer import SublayerConnection
from models.utils.feed_forward import PositionwiseFeedForward
from models.base import SeqGenerator, predict_from_hidden
from models.utils.gelu import GELU

from matplotlib import pyplot as plt


class SAHP(nn.Module):
    "Generic N-layer attentive Hawkes process with masking"

    def __init__(self, nLayers, d_model, atten_heads, dropout, process_dim, device, max_sequence_length):
        super(SAHP, self).__init__()
        self.nLayers = nLayers
        self.process_dim = process_dim
        self.input_size = process_dim + 1
        self.query_size = d_model // atten_heads
        self.device = device
        self.gelu = GELU()

        self.d_model = d_model
        self.type_emb = TypeEmbedding(self.input_size, d_model, padding_idx=self.process_dim)
        self.position_emb = BiasedPositionalEmbedding(d_model=d_model, max_len=max_sequence_length)

        self.attention = MultiHeadedAttention(h=atten_heads, d_model=self.d_model)
        self.feed_forward = PositionwiseFeedForward(d_model=self.d_model, d_ff=self.d_model * 4, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=self.d_model, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=self.d_model, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

        self.start_layer = nn.Sequential(
            nn.Linear(self.d_model, self.d_model, bias=True),
            self.gelu
        )

        self.converge_layer = nn.Sequential(
            nn.Linear(self.d_model, self.d_model, bias=True),
            self.gelu
        )

        self.decay_layer = nn.Sequential(
            nn.Linear(self.d_model, self.d_model, bias=True),
            nn.Softplus(beta=10.0)
        )

        self.intensity_layer = nn.Sequential(
            nn.Linear(self.d_model, self.process_dim, bias=True),
            nn.Softplus(beta=1.)
        )

    def state_decay(self, converge_point, start_point, omega, duration_t):
        # element-wise product: the state decays from start_point toward
        # converge_point at rate omega as duration_t grows
        cell_t = torch.tanh(converge_point + (start_point - converge_point) * torch.exp(-omega * duration_t))
        return cell_t
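    # Editorial note: state_decay implements
    #     cell(t) = tanh(c + (s - c) * exp(-omega * dt)),
    # so with s = 2.0, c = 0.5, omega = 1.0 (illustrative values) the state
    # equals tanh(2.0) ≈ 0.964 at dt = 0 and decays toward tanh(0.5) ≈ 0.462
    # as dt grows.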
    def forward(self, seq_dt, seq_types, src_mask):
        type_embedding = self.type_emb(seq_types) * math.sqrt(self.d_model)
        position_embedding = self.position_emb(seq_types, seq_dt)

        x = type_embedding + position_embedding
        for i in range(self.nLayers):
            x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=src_mask))
            x = self.dropout(self.output_sublayer(x, self.feed_forward))

        embed_info = x

        self.start_point = self.start_layer(embed_info)
        self.converge_point = self.converge_layer(embed_info)
        self.omega = self.decay_layer(embed_info)

    def compute_loss(self, seq_times, seq_onehot_types, n_mc_samples=20):
        """
        Compute the negative log-likelihood as a loss function.

        Args:
            seq_times: event occurrence timestamps
            seq_onehot_types: types of events in the sequence, one-hot encoded
            n_mc_samples: number of Monte Carlo samples per inter-event interval

        Returns:
            negative log-likelihood of the event times under the learned parameters

        Shape:
            one-element tensor
        """

        dt_seq = seq_times[:, 1:] - seq_times[:, :-1]
        cell_t = self.state_decay(self.converge_point, self.start_point, self.omega, dt_seq[:, :, None])

        n_batch = seq_times.size(0)
        n_times = seq_times.size(1) - 1
        device = dt_seq.device
        # Get the intensity process
        intens_at_evs = self.intensity_layer(cell_t)
        intens_at_evs = nn.utils.rnn.pad_sequence(
            intens_at_evs, padding_value=1.0, batch_first=True)  # pad with 1 so non-events contribute log(1) = 0
        log_intensities = intens_at_evs.log()  # log intensities
        seq_mask = seq_onehot_types[:, 1:]
        log_sum = (log_intensities * seq_mask).sum(dim=(2, 1))  # shape batch

        taus = torch.rand(n_batch, n_times, 1, n_mc_samples).to(device)  # one sample dim shared across types
        taus = dt_seq[:, :, None, None] * taus  # inter-event time samples

        cell_tau = self.state_decay(
            self.converge_point[:, :, :, None],
            self.start_point[:, :, :, None],
            self.omega[:, :, :, None],
            taus)
        cell_tau = cell_tau.transpose(2, 3)
        intens_at_samples = self.intensity_layer(cell_tau).transpose(2, 3)
        intens_at_samples = nn.utils.rnn.pad_sequence(
            intens_at_samples, padding_value=0.0, batch_first=True)

        total_intens_samples = intens_at_samples.sum(dim=2)  # shape batch * N * MC
        partial_integrals = dt_seq * total_intens_samples.mean(dim=2)

        integral_ = partial_integrals.sum(dim=1)

        res = torch.sum(-log_sum + integral_)
        return res
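    # Editorial note on the Monte Carlo term above: for each inter-event interval
    # of length dt, the compensator integral of the total intensity is estimated
    # as dt * mean(lambda(tau_k)) with taus drawn uniformly from [0, dt). As a
    # standalone sanity check with a toy decaying intensity lam(t) = exp(-t):
    #
    #     taus = 2.0 * torch.rand(100000)
    #     mc = 2.0 * torch.exp(-taus).mean()   # ~ 1 - exp(-2) ≈ 0.8647
    #
    # which matches the closed-form integral of exp(-t) over [0, 2].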

    def read_predict(self, seq_times, seq_types, seq_lengths, pad, device,
                     hmax=40, n_samples=1000, plot=False, print_info=False):
        """
        Read an event sequence and predict the next event time and type.

        Args:
            seq_times: event timestamps, starting with the added initial event at 0
            seq_types: event types of the sequence
            seq_lengths: number of events, excluding the added first event
            pad: padding index used to build the source mask
            device: device on which the candidate times are sampled
            hmax: temporal horizon over which candidate inter-event times are sampled
            n_samples: number of grid points on [0, hmax]
            plot: whether to plot the predicted intensity
            print_info: whether to print the last and next events

        Returns:
            estimate_dt, next_dt, error_dt, next_type, estimate_type
        """

        length = seq_lengths.item()  # excludes the first added event

        # remove the first added event
        dt_seq = seq_times[1:] - seq_times[:-1]
        last_t = seq_times[length - 1]
        next_t = seq_times[length]

        dt_seq_valid = dt_seq[:length]  # up to and including the next event's dt
        dt_seq_used = dt_seq_valid[:length - 1]  # exclude the dt to be predicted
        next_dt = dt_seq_valid[length - 1]

        seq_types_valid = seq_types[1:length + 1]  # drop the added first event; the last entry is the target type
        from train_functions.train_sahp import MaskBatch
        last_type = seq_types[length - 1]
        next_type = seq_types[length]
        if next_type == self.process_dim:
            print('Error: wrong next event type')
        seq_types_used = seq_types_valid[:-1]
        seq_types_valid_masked = MaskBatch(seq_types_used[None, :], pad, device)
        seq_types_used_mask = seq_types_valid_masked.src_mask

        with torch.no_grad():
            self.forward(dt_seq_used, seq_types_used, seq_types_used_mask)

            if self.omega.shape[1] == 0:  # empty history: nothing to condition on
                estimate_dt, next_dt, error_dt, next_type, estimate_type = 0, 0, 0, 0, 0
                return estimate_dt, next_dt, error_dt, next_type, estimate_type

            elif self.omega.shape[1] == 1:  # only one element in the history
                converge_point = torch.squeeze(self.converge_point)[None, :]
                start_point = torch.squeeze(self.start_point)[None, :]
                omega = torch.squeeze(self.omega)[None, :]
            else:
                converge_point = torch.squeeze(self.converge_point)[-1, :]
                start_point = torch.squeeze(self.start_point)[-1, :]
                omega = torch.squeeze(self.omega)[-1, :]

            dt_vals = torch.linspace(0, hmax, n_samples + 1).to(device)
            h_t_vals = self.state_decay(converge_point,
                                        start_point,
                                        omega,
                                        dt_vals[:, None])
            if print_info:
                print("last event: time {:.3f} type {:.3f}"
                      .format(last_t.item(), last_type.item()))
                print("next event: time {:.3f} type {:.3f}, in {:.3f}"
                      .format(next_t.item(), next_type.item(), next_dt.item()))

            return predict_from_hidden(self, h_t_vals, dt_vals, next_dt, next_type,
                                       plot, hmax, n_samples, print_info)
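
    # --------------------------------------------------------------------------
    # Illustrative sketch (not part of the original SAHP code): read_predict above
    # hands the decayed hidden states on the grid `dt_vals` to predict_from_hidden
    # (utils/evaluation.py). One standard way to turn a gridded intensity into a
    # point estimate -- shown here only as a hypothetical sketch, not necessarily
    # what predict_from_hidden implements -- uses the next-event-time density
    #     f(t) = lambda_total(t) * exp(-int_0^t lambda_total(s) ds)
    # and takes its mean on the grid.
    @staticmethod
    def _expected_dt_sketch(total_intens, dt_vals):
        """Hedged sketch: expected next inter-event time from a gridded intensity."""
        step = dt_vals[1] - dt_vals[0]                  # uniform grid spacing
        cum = torch.cumsum(total_intens, dim=0) * step  # rectangle-rule compensator
        density = total_intens * torch.exp(-cum)        # f(t) on the grid
        return (dt_vals * density).sum() * step         # E[t] ~= sum_t t * f(t) * step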

    def plot_estimated_intensity(self, timestamps, n_points=10000, plot_nodes=None,
                                 t_min=None, t_max=None,
                                 intensity_track_step=None, max_jumps=None,
                                 show=True, ax=None, qqplot=None):
        from simulation.simulate_hawkes import fuse_node_times
        event_timestamps, event_types = fuse_node_times(timestamps)

        event_timestamps = torch.from_numpy(event_timestamps)
        seq_times = torch.cat((torch.zeros_like(event_timestamps[:1]), event_timestamps),
                              dim=0).float()  # add a 0 at the beginning of the sequence
        dt_seq = seq_times[1:] - seq_times[:-1]

        seq_types = torch.from_numpy(event_types)
        seq_types = seq_types.long()  # convert from FloatTensor to LongTensor

        intens_at_evs_lst = []
        sample_times = np.linspace(t_min, t_max, n_points)
        for i in range(self.process_dim):
            intens_at_samples, intens_at_evs = self.intensity_per_type(
                seq_types, dt_seq, sample_times, timestamps[i], type=i)
            intens_at_evs_lst.append(intens_at_samples)
            if qqplot is None:
                self._plot_tick_intensity(timestamps[i], sample_times, intens_at_samples,
                                          intens_at_evs, ax[i], i, n_points)
        if qqplot is not None:
            return intens_at_evs_lst

    def intensity_per_type(self, seq_types, dt_seq, sample_times, timestamps, type):
        from train_functions.train_sahp import MaskBatch

        intens_at_samples = []
        with torch.no_grad():

            onetype_length = timestamps.size
            alltype_length = len(seq_types)

            type_idx = np.arange(alltype_length)[seq_types == type]

            event_types_masked = MaskBatch(seq_types[None, :], pad=self.process_dim, device='cpu')
            event_types_mask = event_types_masked.src_mask

            self.forward(dt_seq, seq_types, event_types_mask)
            converge_point = torch.squeeze(self.converge_point)
            start_point = torch.squeeze(self.start_point)
            omega = torch.squeeze(self.omega)

            cell_t = self.state_decay(converge_point,
                                      start_point,
                                      omega,
                                      dt_seq[:, None])

            intens_at_evs = torch.squeeze(self.intensity_layer(cell_t)).numpy()
            intens_at_evs = intens_at_evs[type_idx, type]

            event_idx = -1
            for t_time in sample_times:
                if t_time < timestamps[0]:
                    intens_at_samples.append(0)  # no event of this type has occurred yet
                    continue

                if event_idx < onetype_length - 1 and t_time >= timestamps[event_idx + 1]:
                    # a new event has been passed: recompute the decay parameters
                    event_idx += 1

                    dt_prefix = dt_seq[:event_idx + 1]
                    types_prefix = seq_types[:event_idx + 1]

                    event_types_masked = MaskBatch(types_prefix[None, :], pad=self.process_dim, device='cpu')
                    event_types_mask = event_types_masked.src_mask

                    self.forward(dt_prefix, types_prefix, event_types_mask)

                    converge_point = torch.squeeze(self.converge_point)
                    start_point = torch.squeeze(self.start_point)
                    omega = torch.squeeze(self.omega)

                    if omega.ndim == 2:
                        omega = omega[-1, :]
                        converge_point = converge_point[-1, :]
                        start_point = start_point[-1, :]
                cell_t = self.state_decay(converge_point,
                                          start_point,
                                          omega,
                                          t_time - timestamps[event_idx])

                intens = self.intensity_layer(cell_t).numpy()
                intens_at_samples.append(intens[type])

        return intens_at_samples, intens_at_evs

    def _plot_tick_intensity(self, timestamps_i, sample_times, intensity_i, intens_at_evs,
                             ax, label, n_points):
        x_intensity = np.linspace(sample_times.min(), sample_times.max(), n_points)
        y_intensity = intensity_i
        ax.plot(x_intensity, y_intensity)

        ax.set_title(label)


class SAHPGen(SeqGenerator):
    # sequence generator for the SAHP model

    def __init__(self, model, record_intensity=True):
        super(SAHPGen, self).__init__(model, record_intensity)
        self.lbda_ub = []  # intensity upper bounds recorded during generation

    def _restart_sequence(self):
        super(SAHPGen, self)._restart_sequence()
        self.lbda_ub = []
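
# ------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original SAHP code): the hidden
# state between events follows
#     h(t) = tanh( c + (s - c) * exp(-omega * (t - t_i)) ),
# decaying from the start point s towards the converge point c, exactly as in
# state_decay above. The numbers below are made up and only demonstrate that
# behaviour.
if __name__ == "__main__":
    start = torch.tensor([1.0])     # hypothetical start point s
    converge = torch.tensor([0.2])  # hypothetical converge point c
    omega = torch.tensor([2.0])     # hypothetical decay rate
    for dt in (0.0, 0.5, 5.0):
        h = torch.tanh(converge + (start - converge) * torch.exp(-omega * dt))
        print(f"dt={dt:4}: h={h.item():.4f}")  # moves from tanh(1.0) towards tanh(0.2)
--------------------------------------------------------------------------------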