├── FANLayer-tf.py ├── FANLayer.py ├── Image_Recognition ├── run_image_recognition.sh └── test_image_recognition.py ├── LICENSE ├── Periodicity_Modeling ├── architecture.py ├── generate_periodic_data.py ├── run.sh └── test.py ├── README.md ├── Sentiment_Analysis ├── get_dataloader.py ├── model │ ├── CustomBERT.py │ ├── Mamba.py │ ├── __init__.py │ └── build_model.py ├── scripts │ ├── Trans │ │ ├── test_baseline_trans.sh │ │ └── train_baseline_trans.sh │ ├── Trans_with_FAN │ │ ├── test_ours.sh │ │ ├── test_ours_withgate.sh │ │ ├── train_ours.sh │ │ └── train_ours_withgate.sh │ └── mamba │ │ ├── test_mamba.sh │ │ └── train_mamba.sh ├── test.py ├── train.py └── utils.py ├── Symbolic_Formula_Representation ├── gen_dataset.py ├── requirements.txt ├── run_train_fan.sh ├── run_train_kan.sh ├── run_train_mlp.sh ├── run_train_transformer.sh ├── train_fan.py ├── train_kan.py ├── train_mlp.py └── train_transformer.py ├── Timeseries_Forecasting ├── data_provider │ ├── __init__.py │ ├── data_factory.py │ └── data_loader.py ├── exp │ ├── __init__.py │ ├── exp_basic.py │ └── exp_main.py ├── layers │ ├── AutoCorrelation.py │ ├── Autoformer_EncDec.py │ ├── Embed.py │ ├── FANLayer.py │ ├── SelfAttention_Family.py │ ├── Transformer_EncDec.py │ └── __init__.py ├── models │ ├── Modified_Transformer.py │ └── __init__.py ├── requirements.txt ├── run.py ├── scripts │ ├── ETT_script │ │ ├── Transformer.sh │ │ ├── Transformer_setting_2.sh │ │ └── Transformer_setting_4.sh │ ├── Exchange_script │ │ ├── Modified_Transformer_setting_2.sh │ │ ├── Modified_Transformer_setting_4.sh │ │ └── Transformer.sh │ ├── Traffic_script │ │ ├── Modified_Transformer_setting_2.sh │ │ ├── Modified_Transformer_setting_4.sh │ │ └── Transformer.sh │ └── Weather_script │ │ └── Modified_Transformer.sh └── utils │ ├── __init__.py │ ├── download_data.py │ ├── masking.py │ ├── metrics.py │ ├── timefeatures.py │ └── tools.py └── img ├── FANLayer.jpg ├── IR.jpg ├── mod.jpg └── sin.jpg /FANLayer-tf.py: 
# NOTE(review): this span is part of a multi-file repository dump; FANLayer-tf.py
# is mangled onto the long lines below with "NN |" viewer artifacts. The original
# text is kept byte-identical; only these review comments are added.
# NOTE(review): the class docstring documents an `input_dim` argument, but
# __init__ takes no such parameter -- Keras Dense layers infer the input feature
# size at build time, so that docstring entry should be removed or reworded.
# NOTE(review): `Constraint` is imported but never used in this file's visible code.
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras import layers 3 | from tensorflow.keras import constraints, activations, initializers, regularizers 4 | from tensorflow.keras.constraints import NonNeg 5 | from tensorflow.keras.constraints import Constraint 6 | 7 | class FANLayer(tf.keras.layers.Layer): 8 | """ 9 | FANLayer: The layer used in FAN (https://arxiv.org/abs/2410.02675). 10 | 11 | Args: 12 | input_dim (int): The number of input features. 13 | output_dim (int): The number of output features. 14 | p_ratio (float): The ratio of output dimensions used for cosine and sine parts (default: 0.25). 15 | activation (str or callable): The activation function to apply to the g component (default: 'gelu'). 16 | use_p_bias (bool): If True, include bias in the linear transformations of the p component (default: True). 17 | gated (bool): If True, applies gating to the output. 18 | kernel_regularizer: Regularizer for kernel weights. 19 | bias_regularizer: Regularizer for bias weights. 
# NOTE(review): the gated branch calls add_weight in __init__ rather than in
# build(); this works because the gate shape (1,) is static, but build() is the
# conventional place for weight creation in custom Keras layers.
# NOTE(review): the NonNeg constraint on the raw gate is redundant with the
# tf.sigmoid applied in call() -- sigmoid already maps any real value into (0, 1).
# NOTE(review): g_output_dim = output_dim - 2 * p_output_dim, so cos(p), sin(p)
# and g always concatenate to exactly output_dim features -- this is correct.
20 | """ 21 | 22 | def __init__(self, 23 | output_dim, 24 | p_ratio=0.25, 25 | activation='gelu', 26 | use_p_bias=True, 27 | gated=False, 28 | kernel_regularizer=None, 29 | bias_regularizer=None, 30 | **kwargs): 31 | super(FANLayer, self).__init__(**kwargs) 32 | 33 | assert 0 < p_ratio < 0.5, "p_ratio must be between 0 and 0.5" 34 | 35 | self.p_ratio = p_ratio 36 | self.output_dim = output_dim 37 | self.activation = activations.get(activation) 38 | self.use_p_bias = use_p_bias 39 | self.gated = gated 40 | self.kernel_regularizer = regularizers.get(kernel_regularizer) 41 | self.bias_regularizer = regularizers.get(bias_regularizer) 42 | 43 | # Compute output dimensions for p and g components 44 | self.p_output_dim = int(output_dim * self.p_ratio) 45 | self.g_output_dim = output_dim - 2 * self.p_output_dim # Account for cosine and sine 46 | 47 | # Layers for linear transformations 48 | self.input_linear_p = layers.Dense(self.p_output_dim, 49 | use_bias=self.use_p_bias, 50 | kernel_regularizer=self.kernel_regularizer, 51 | bias_regularizer=self.bias_regularizer) 52 | self.input_linear_g = layers.Dense(self.g_output_dim, 53 | kernel_regularizer=self.kernel_regularizer, 54 | bias_regularizer=self.bias_regularizer) 55 | 56 | if self.gated: 57 | self.gate = self.add_weight(name='gate', 58 | shape=(1,), 59 | initializer=initializers.RandomNormal(), 60 | trainable=True, 61 | regularizer=None, 62 | constraint=NonNeg()) 63 | 64 | def call(self, inputs): 65 | # Apply the linear transformation followed by the activation for the g component 66 | g = self.activation(self.input_linear_g(inputs)) 67 | 68 | # Apply the linear transformation for the p component 69 | p = self.input_linear_p(inputs) 70 | 71 | if self.gated: 72 | gate = tf.sigmoid(self.gate) 73 | output = tf.concat([gate * tf.cos(p), gate * tf.sin(p), (1 - gate) * g], axis=-1) 74 | else: 75 | output = tf.concat([tf.cos(p), tf.sin(p), g], axis=-1) 76 | 77 | return output 78 | 79 | def get_config(self): 80 | config = 
# NOTE(review): get_config round-trips every constructor argument (activation and
# regularizers via their serialize helpers), so the layer survives model
# save/load; p_output_dim / g_output_dim are derived and correctly omitted.
# NOTE(review): below the separator, the PyTorch FANLayer.py begins; like the TF
# docstring above, it also documents `input_dim`, which here IS a real parameter.
super(FANLayer, self).get_config() 81 | config.update({ 82 | "output_dim": self.output_dim, 83 | "p_ratio": self.p_ratio, 84 | "activation": activations.serialize(self.activation), 85 | "use_p_bias": self.use_p_bias, 86 | "gated": self.gated, 87 | "kernel_regularizer": regularizers.serialize(self.kernel_regularizer), 88 | "bias_regularizer": regularizers.serialize(self.bias_regularizer) 89 | }) 90 | return config 91 | -------------------------------------------------------------------------------- /FANLayer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FANLayer(nn.Module): 6 | """ 7 | FANLayer: The layer used in FAN (https://arxiv.org/abs/2410.02675). 8 | 9 | Args: 10 | input_dim (int): The number of input features. 11 | output_dim (int): The number of output features. 12 | p_ratio (float): The ratio of output dimensions used for cosine and sine parts (default: 0.25). 13 | activation (str or callable): The activation function to apply to the g component. If a string is passed, 14 | the corresponding activation from torch.nn.functional is used (default: 'gelu'). 15 | use_p_bias (bool): If True, include bias in the linear transformations of p component (default: True). 16 | There is almost no difference between bias and non-bias in our experiments. 
# NOTE(review): continuation of the PyTorch FANLayer, then the head of
# Image_Recognition/run_image_recognition.sh from the same repository dump.
# NOTE(review): the forward() docstring advertises shape (batch_size, input_dim),
# but nn.Linear and torch.cat(dim=-1) both accept arbitrary leading dimensions,
# so (..., input_dim) -> (..., output_dim) would be the more accurate contract.
# NOTE(review): here g_output_dim = output_dim - 2 * p_output_dim, so the
# concatenated output is always exactly output_dim wide (unlike the copy of this
# layer in Periodicity_Modeling/architecture.py, which hard-codes //4 and //2).
# NOTE(review): when `activation` is a falsy non-string (None/0/""), the g branch
# becomes the identity via `lambda x: x`; passing a truthy non-callable would
# only fail later at call time -- acceptable, but worth knowing.
17 | """ 18 | 19 | def __init__(self, input_dim, output_dim, p_ratio=0.25, activation='gelu', use_p_bias=True): 20 | super(FANLayer, self).__init__() 21 | 22 | # Ensure the p_ratio is within a valid range 23 | assert 0 < p_ratio < 0.5, "p_ratio must be between 0 and 0.5" 24 | 25 | self.p_ratio = p_ratio 26 | p_output_dim = int(output_dim * self.p_ratio) 27 | g_output_dim = output_dim - p_output_dim * 2 # Account for cosine and sine terms 28 | 29 | # Linear transformation for the p component (for cosine and sine parts) 30 | self.input_linear_p = nn.Linear(input_dim, p_output_dim, bias=use_p_bias) 31 | 32 | # Linear transformation for the g component 33 | self.input_linear_g = nn.Linear(input_dim, g_output_dim) 34 | 35 | # Set the activation function 36 | if isinstance(activation, str): 37 | self.activation = getattr(F, activation) 38 | else: 39 | self.activation = activation if activation else lambda x: x 40 | 41 | def forward(self, src): 42 | """ 43 | Args: 44 | src (Tensor): Input tensor of shape (batch_size, input_dim). 45 | 46 | Returns: 47 | Tensor: Output tensor of shape (batch_size, output_dim), after applying the FAN layer. 48 | """ 49 | 50 | # Apply the linear transformation followed by the activation for the g component 51 | g = self.activation(self.input_linear_g(src)) 52 | 53 | # Apply the linear transformation for the p component 54 | p = self.input_linear_p(src) 55 | 56 | # Concatenate cos(p), sin(p), and activated g along the last dimension 57 | output = torch.cat((torch.cos(p), torch.sin(p), g), dim=-1) 58 | 59 | return output 60 | 61 | -------------------------------------------------------------------------------- /Image_Recognition/run_image_recognition.sh: -------------------------------------------------------------------------------- 1 | GPU=1 2 | LR=0.01 3 | Epoch=100 4 | Versions=(fan mlp) 5 | Dataset=MNIST #(MNIST MNIST-M Fashion-MNIST Fashion-MNIST-corrupted) 6 | logdirpath=result 7 | 8 | if [ ! 
# NOTE(review): tail of run_image_recognition.sh (launches one background python
# run per Version, logs to ./result, then waits), followed by the whole of
# Image_Recognition/test_image_recognition.py and the head of LICENSE.
# NOTE(review): argparse `type=bool` is a known pitfall -- bool("False") is True,
# so `--similarparameter False` still yields True; `action='store_true'` (or a
# str-to-bool converter) is the usual fix. Default True masks this in practice.
-d ./${logdirpath} ]; then 9 | mkdir ./${logdirpath} 10 | fi 11 | 12 | for Version in "${Versions[@]}"; do 13 | path=${Version} 14 | 15 | echo "running ${path}..." 16 | python3 -u ./test_image_recognition.py \ 17 | --gpu_id ${GPU} \ 18 | --lr ${LR} \ 19 | --epoch ${Epoch} \ 20 | --version ${Version} \ 21 | --dataset ${Dataset} \ 22 | > ./${logdirpath}/${Dataset}_${path}.log 2>&1 & 23 | done 24 | 25 | wait 26 | echo "End." 27 | -------------------------------------------------------------------------------- /Image_Recognition/test_image_recognition.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torchvision.transforms.functional as TF 5 | from torch.utils.data import DataLoader 6 | from torchvision import datasets, transforms 7 | import numpy as np 8 | import random 9 | from datasets import load_dataset 10 | 11 | import argparse 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--dataset', type=str, help='dataset', default='MNIST') 15 | parser.add_argument('--gpu_id', type=int, help='gpu_id', default=1) 16 | parser.add_argument('--lr', type=float, help='lr', default=0.01) 17 | parser.add_argument('--epoch', type=int, help='epoch', default=100) 18 | parser.add_argument('--version', type=str, help='version', default='fan') 19 | parser.add_argument('--similarparameter', type=bool, help='similarparameter', default=True) 20 | 21 | args = parser.parse_args() 22 | 23 | def set_seed(seed): 24 | torch.manual_seed(seed) 25 | torch.cuda.manual_seed(seed) 26 | torch.cuda.manual_seed_all(seed) 27 | np.random.seed(seed) 28 | random.seed(seed) 29 | torch.backends.cudnn.deterministic = True 30 | torch.backends.cudnn.benchmark = False 31 | 32 | set_seed(2023) 33 | 34 | 35 | from FANLayer import FANLayer 36 | 37 | class CNNModel(nn.Module): 38 | def __init__(self, input_dim=1, output_dim=10): 39 | super(CNNModel, self).__init__() 40 | 
# NOTE(review): `self.scalar` widens the FAN hidden layer by 4/3 so parameter
# counts roughly match the MLP baseline (FANLayer spends part of its width on
# the cos/sin branch); the mlp branch keeps 256 directly.
# NOTE(review): the FAN fc head omits the ReLU that the MLP head has after its
# hidden layer -- presumably intentional, since FANLayer applies GELU to its g
# component internally; worth confirming against the paper's setup.
self.conv_layer = nn.Sequential( 41 | 42 | nn.Conv2d(input_dim, 64, kernel_size=3, padding=1), 43 | nn.BatchNorm2d(64), 44 | nn.ReLU(inplace=True), 45 | nn.Conv2d(64, 64, kernel_size=3, padding=1), 46 | nn.BatchNorm2d(64), 47 | nn.ReLU(inplace=True), 48 | nn.MaxPool2d(2, 2), 49 | nn.Dropout(0.25), 50 | 51 | nn.Conv2d(64, 128, kernel_size=3, padding=1), 52 | nn.BatchNorm2d(128), 53 | nn.ReLU(inplace=True), 54 | nn.Conv2d(128, 128, kernel_size=3, padding=1), 55 | nn.BatchNorm2d(128), 56 | nn.ReLU(inplace=True), 57 | nn.MaxPool2d(2, 2), 58 | nn.Dropout(0.25), 59 | ) 60 | 61 | self.scalar = lambda x: x*4//3 if args.similarparameter else x 62 | 63 | if args.version == 'mlp': 64 | self.fc_layer = nn.Sequential( 65 | nn.Flatten(), 66 | nn.Linear(128 * 7 * 7, 256), 67 | nn.BatchNorm1d(256), 68 | nn.ReLU(inplace=True), 69 | nn.Dropout(0.5), 70 | nn.Linear(256, 10) 71 | ) 72 | else: 73 | self.fc_layer = nn.Sequential( 74 | nn.Flatten(), 75 | FANLayer(128 * 7 * 7, self.scalar(256)), #nn.Linear(128 * 7 * 7, 256), 76 | nn.BatchNorm1d(self.scalar(256)), 77 | nn.Dropout(0.5), 78 | nn.Linear(self.scalar(256), output_dim) 79 | ) 80 | 81 | def forward(self, x): 82 | x = self.conv_layer(x) 83 | x = self.fc_layer(x) 84 | return x 85 | 86 | 87 | device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu") 88 | 89 | model = CNNModel() 90 | model.to(device) 91 | 92 | num_epochs = args.epoch 93 | 94 | def run(model, train_loader, OOD_test_loader, test_loader, num_epochs, name): 95 | 96 | criterion = nn.CrossEntropyLoss() 97 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9) 98 | 99 | 100 | scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.8) 101 | 102 | best_accuracy = [0.0, 0.0] 103 | 104 | for epoch in range(1, num_epochs + 1): 105 | model.train() 106 | running_loss = 0.0 107 | 108 | for images, labels in train_loader: 109 | images = images.to(device) 110 | labels = labels.to(device) 111 | 112 | outputs = model(images) 113 | 
# NOTE(review): `OOD_accuracy` is only assigned inside the
# `if OOD_test_loader is not None:` branch, yet the best-accuracy update and the
# returned dict reference it unconditionally -- calling run() with
# OOD_test_loader=None would raise NameError on the first epoch. All four
# current call sites pass a loader, so this is latent; exact indentation cannot
# be recovered from this dump, so verify against the real file.
# NOTE(review): the epoch print labels `epoch_test_loss` as "Loss"; the computed
# training `epoch_loss` is never reported -- possibly intentional, but confusing.
loss = criterion(outputs, labels) 114 | 115 | optimizer.zero_grad() 116 | loss.backward() 117 | optimizer.step() 118 | 119 | running_loss += loss.item() * images.size(0) 120 | 121 | epoch_loss = running_loss / len(train_loader.dataset) 122 | 123 | model.eval() 124 | correct = 0 125 | total = 0 126 | testloss = 0.0 127 | 128 | with torch.no_grad(): 129 | for images, labels in test_loader: 130 | images = images.to(device) 131 | labels = labels.to(device) 132 | outputs = model(images) 133 | loss = criterion(outputs, labels) 134 | _, preds = torch.max(outputs.data, 1) 135 | total += labels.size(0) 136 | correct += (preds == labels).sum().item() 137 | testloss += loss.item() * images.size(0) 138 | 139 | epoch_accuracy = 100 * correct / total 140 | epoch_test_loss = testloss / len(test_loader.dataset) 141 | 142 | if OOD_test_loader is not None: 143 | model.eval() 144 | correct = 0 145 | total = 0 146 | 147 | with torch.no_grad(): 148 | for images, labels in OOD_test_loader: 149 | images = images.to(device) 150 | labels = labels.to(device) 151 | outputs = model(images) 152 | _, preds = torch.max(outputs.data, 1) 153 | total += labels.size(0) 154 | correct += (preds == labels).sum().item() 155 | 156 | OOD_accuracy = 100 * correct / total 157 | 158 | print(f'Epoch [{epoch}/{num_epochs}], Loss: {epoch_test_loss:.4f}, Accuracy: {epoch_accuracy:.2f}%, OOD Accuracy: {OOD_accuracy:.2f}%') 159 | 160 | scheduler.step() 161 | 162 | if epoch_accuracy > best_accuracy[0]: 163 | best_accuracy = [epoch_accuracy, best_accuracy[1]] 164 | if OOD_accuracy > best_accuracy[1]: 165 | best_accuracy = [best_accuracy[0], OOD_accuracy] 166 | 167 | return {'best_accuracy': best_accuracy[0], 'best_OOD_accuracy': best_accuracy[1],\ 168 | 'accuracy': epoch_accuracy, 'OOD_accuracy': OOD_accuracy} 169 | 170 | 171 | def get_dataloader(dataset, batch_size=256, shuffle=True, Train=True): 172 | def transform_m(example): 173 | example['image'] = TF.resize(example['image'], (28, 28)) 174 | example['image'] = 
# NOTE(review): transform_m resizes, converts to grayscale ('L'), tensorizes and
# normalizes, which keeps MNIST-M (RGB) compatible with CNNModel's input_dim=1.
# NOTE(review): collate_fn silently DROPS any item whose image or label is
# already a torch.Tensor (the `if not isinstance(...)` filters), and it filters
# images and labels independently -- if only one of the two were a Tensor the
# lists would desynchronize. It appears to rely on datasets.map() always
# yielding plain lists/ints here; fragile, worth confirming.
example['image'].convert('L') 175 | example['image'] = TF.to_tensor(example['image']) 176 | example['image'] = TF.normalize(example['image'], mean=(.5,), std=(.5,)) 177 | return example 178 | 179 | 180 | def collate_fn(batch): 181 | images = [torch.tensor(item['image']) for item in batch if not isinstance(item['image'], torch.Tensor)] 182 | labels = [torch.tensor(item['label']) for item in batch if not isinstance(item['label'], torch.Tensor)] 183 | 184 | images = torch.stack(images, dim=0) 185 | labels = torch.stack(labels, dim=0) 186 | 187 | return images, labels 188 | 189 | if Train: 190 | trainset = dataset['train'].map(transform_m) 191 | testset = dataset['test'].map(transform_m) 192 | 193 | train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn) 194 | test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn) 195 | return train_loader, test_loader 196 | else: 197 | testset = dataset['test'].map(transform_m) 198 | 199 | test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn) 200 | return test_loader 201 | 202 | 203 | transform = transforms.Compose([ 204 | transforms.RandomRotation(10), 205 | transforms.RandomAffine(0, translate=(0.1, 0.1)), 206 | transforms.ToTensor(), 207 | transforms.Normalize((0.1307,), (0.3081,)) 208 | ]) 209 | 210 | if args.dataset == 'MNIST': 211 | 212 | train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True) 213 | test_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True) 214 | 215 | train_loader = DataLoader(dataset=train_dataset, batch_size=256, shuffle=True) 216 | test_loader = DataLoader(dataset=test_dataset, batch_size=256, shuffle=False) 217 | 218 | OOD_test_loader = get_dataloader(load_dataset("Mike0307/MNIST-M"), Train=False) 219 | 220 | accuracy_checkpoints = run(model, train_loader, OOD_test_loader, test_loader, num_epochs, name='mnist') 
# NOTE(review): there is no else/fallthrough for an unrecognized --dataset value,
# so the final `print(f'{args.dataset}:', accuracy_checkpoints)` would raise
# NameError. An explicit `parser.add_argument(choices=[...])` or an else-raise
# would fail faster and clearer.
# NOTE(review): each in-distribution dataset uses the other corpus as its OOD
# evaluation set (MNIST<->MNIST-M, Fashion-MNIST<->corrupted), matching the
# paper's OOD protocol. The LICENSE text (MIT) follows below.
221 | 222 | elif args.dataset == 'MNIST-M': 223 | dataset = load_dataset("Mike0307/MNIST-M") 224 | 225 | train_loader, test_loader = get_dataloader(dataset) 226 | 227 | OOD_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True) 228 | OOD_test_loader = DataLoader(dataset=OOD_dataset, batch_size=256, shuffle=False) 229 | 230 | accuracy_checkpoints = run(model, train_loader, OOD_test_loader, test_loader, num_epochs, name='m_mnist') 231 | 232 | elif args.dataset == 'Fashion-MNIST': 233 | dataset = load_dataset("zalando-datasets/fashion_mnist") 234 | train_loader, test_loader = get_dataloader(dataset) 235 | 236 | OOD_test_loader = get_dataloader(load_dataset("mweiss/fashion_mnist_corrupted"), Train=False) 237 | 238 | accuracy_checkpoints = run(model, train_loader, OOD_test_loader, test_loader, num_epochs, name='f_mnist') 239 | 240 | elif args.dataset == 'Fashion-MNIST-corrupted': 241 | dataset = load_dataset("mweiss/fashion_mnist_corrupted") 242 | train_loader, test_loader = get_dataloader(dataset) 243 | 244 | OOD_test_loader = get_dataloader(load_dataset("zalando-datasets/fashion_mnist"), Train=False) 245 | 246 | accuracy_checkpoints = run(model, train_loader, OOD_test_loader, test_loader, num_epochs, name='fc_mnist') 247 | 248 | 249 | print(f'{args.dataset}:', accuracy_checkpoints) 250 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 YihongDong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject 
# NOTE(review): tail of the MIT LICENSE, then Periodicity_Modeling/architecture.py:
# a model registry plus FAN/MLP/Transformer/KAN model definitions. Verbatim below.
# NOTE(review): the f-string `f"No model found with model_name{model_name}."` is
# missing a space (and arguably quotes) before the interpolation; it is a runtime
# string, so it is flagged here rather than changed in a documentation pass.
to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Periodicity_Modeling/architecture.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | import math 5 | 6 | # Define a model registry 7 | model_registry = {} 8 | 9 | # Create a decorator to register models 10 | def register_model(model_name): 11 | def decorator(cls): 12 | model_registry[model_name] = cls 13 | return cls 14 | return decorator 15 | 16 | # Define a function to retrieve and instantiate the model class by model_name 17 | def get_model_by_name(model_name, *args, **kwargs): 18 | model_cls = model_registry.get(model_name) 19 | if model_cls is None: 20 | raise ValueError(f"No model found with model_name{model_name}.") 21 | return model_cls(*args, **kwargs) 22 | 23 | # Use the decorator to register the model class 24 | 25 | @register_model('FANLayer') 26 | class FANLayer(nn.Module): 27 | def __init__(self, input_dim, output_dim, bias=True): 28 | super(FANLayer, self).__init__() 29 | self.input_linear_p = nn.Linear(input_dim, output_dim//4, bias=bias) # There is almost no difference between bias and non-bias in our experiments. 
# NOTE(review): unlike the top-level FANLayer.py, this copy hard-codes p width to
# output_dim//4 and g width to output_dim - output_dim//2 INDEPENDENTLY, so the
# concatenated width 2*(output_dim//4) + (output_dim - output_dim//2) equals
# output_dim only when output_dim % 4 is 0 or 1 (e.g. d=6 -> 5, d=7 -> 6). With
# the hidden_dim=2048 used below this is exact, but passing a dim with
# remainder 2 or 3 would silently shrink the layer output -- worth unifying
# with the canonical FANLayer implementation.
# NOTE(review): FANLayerGated dispatches on `hasattr(self, 'gate')`, i.e. on
# whether gating was enabled at construction; the sigmoid keeps the learned
# gate in (0, 1) and splits weight between the periodic and g branches.
30 | self.input_linear_g = nn.Linear(input_dim, (output_dim-output_dim//2)) 31 | self.activation = nn.GELU() 32 | 33 | def forward(self, src): 34 | g = self.activation(self.input_linear_g(src)) 35 | p = self.input_linear_p(src) 36 | 37 | output = torch.cat((torch.cos(p), torch.sin(p), g), dim=-1) 38 | return output 39 | 40 | @register_model('FANLayerGated') 41 | class FANLayerGated(nn.Module): 42 | def __init__(self, input_dim, output_dim, bias=True, gated = True): 43 | super(FANLayerGated, self).__init__() 44 | self.input_linear_p = nn.Linear(input_dim, output_dim//4, bias=bias) 45 | self.input_linear_g = nn.Linear(input_dim, (output_dim-output_dim//2)) 46 | self.activation = nn.GELU() 47 | if gated: 48 | self.gate = nn.Parameter(torch.randn(1, dtype=torch.float32)) 49 | 50 | def forward(self, src): 51 | g = self.activation(self.input_linear_g(src)) 52 | p = self.input_linear_p(src) 53 | 54 | if not hasattr(self, 'gate'): 55 | output = torch.cat((torch.cos(p), torch.sin(p), g), dim=-1) 56 | else: 57 | gate = torch.sigmoid(self.gate) 58 | output = torch.cat((gate*torch.cos(p), gate*torch.sin(p), (1-gate)*g), dim=-1) 59 | return output 60 | 61 | @register_model('FAN') 62 | class FAN(nn.Module): 63 | def __init__(self, input_dim=1, output_dim=1, hidden_dim=2048, num_layers=3): 64 | super(FAN, self).__init__() 65 | self.embedding = nn.Linear(input_dim, hidden_dim) 66 | self.layers = nn.ModuleList() 67 | for _ in range(num_layers - 1): 68 | self.layers.append(FANLayer(hidden_dim, hidden_dim)) 69 | self.layers.append(nn.Linear(hidden_dim, output_dim)) 70 | 71 | def forward(self, src): 72 | output = self.embedding(src) 73 | for layer in self.layers: 74 | output = layer(output) 75 | return output 76 | 77 | @register_model('FANGated') 78 | class FANGated(nn.Module): 79 | def __init__(self, input_dim=1, output_dim=1, hidden_dim=2048, num_layers=3, gated = True): 80 | super(FANGated, self).__init__() 81 | self.embedding = nn.Linear(input_dim, hidden_dim) 82 | self.layers = 
# NOTE(review): despite its name, RoPEPositionalEncoding implements the classic
# ADDITIVE sinusoidal positional encoding (pe is added to x); rotary position
# embedding (RoPE) instead rotates query/key pairs inside attention. The name is
# misleading and should probably be SinusoidalPositionalEncoding.
nn.ModuleList() 83 | for _ in range(num_layers - 1): 84 | self.layers.append(FANLayerGated(hidden_dim, hidden_dim, gated = gated)) 85 | self.layers.append(nn.Linear(hidden_dim, output_dim)) 86 | 87 | def forward(self, src): 88 | output = self.embedding(src) 89 | for layer in self.layers: 90 | output = layer(output) 91 | return output 92 | 93 | @register_model('MLP') 94 | class MLPModel(nn.Module): 95 | def __init__(self, input_dim=1, output_dim=1, hidden_dim=2048, num_layers=3, use_embedding=True): 96 | super(MLPModel, self).__init__() 97 | self.activation = nn.GELU() 98 | self.layers = nn.ModuleList() 99 | if use_embedding: 100 | self.embedding = nn.Linear(input_dim, hidden_dim) 101 | self.layers.extend([nn.Linear(hidden_dim, hidden_dim), self.activation]) 102 | else: 103 | self.layers.extend([nn.Linear(input_dim, hidden_dim), self.activation]) 104 | 105 | for _ in range(num_layers - 2): 106 | self.layers.extend([nn.Linear(hidden_dim, hidden_dim), self.activation]) 107 | self.layers.append(nn.Linear(hidden_dim, output_dim)) 108 | 109 | def forward(self, src): 110 | output = self.embedding(src) if hasattr(self, 'embedding') else src 111 | for layer in self.layers: 112 | output = layer(output) 113 | return output 114 | 115 | 116 | class RoPEPositionalEncoding(torch.nn.Module): 117 | 118 | def __init__(self, d_model: int, max_len: int = 5000): 119 | super().__init__() 120 | 121 | position = torch.arange(max_len).unsqueeze(1) 122 | div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)) 123 | pe = torch.zeros(max_len, 1, d_model) 124 | pe[:, 0, 0::2] = torch.sin(position * div_term) 125 | pe[:, 0, 1::2] = torch.cos(position * div_term) 126 | self.register_buffer('pe', pe) 127 | 128 | def forward(self, x): 129 | """ 130 | Arguments: 131 | x: Tensor, shape ``[seq_len, batch_size, embedding_dim]`` 132 | """ 133 | x = x + self.pe[:x.size(0)] 134 | return x 135 | 136 | 137 | @register_model('Transformer') 138 | class TransformerModel(nn.Module): 
# NOTE(review): the third positional argument of nn.TransformerEncoderLayer /
# DecoderLayer is dim_feedforward, so the FFN width is set equal to hidden_dim
# (768) rather than the usual 4x -- presumably a deliberate parameter-budget
# choice; confirm against the experiments.
# NOTE(review): forward() does `unsqueeze(0)`, giving a sequence length of 1, so
# the positional encoding only ever adds position 0 and attention is over a
# single token -- consistent with this task feeding scalar samples, but it makes
# the encoder/decoder machinery act pointwise.
139 | def __init__(self, input_dim=1, output_dim=1, hidden_dim=768, num_layers=12, num_heads=12, norm_first = True, encoder_only=True, decoder_only=False): 140 | super(TransformerModel, self).__init__() 141 | self.embedding = nn.Linear(input_dim, hidden_dim) 142 | self.pos_encoder = RoPEPositionalEncoding(hidden_dim) 143 | self.encoder_only = encoder_only 144 | self.decoder_only = decoder_only 145 | assert not (self.encoder_only and self.decoder_only) 146 | if self.encoder_only: 147 | encoder_layers = nn.TransformerEncoderLayer(hidden_dim, num_heads, hidden_dim, norm_first = norm_first) 148 | self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers) 149 | elif self.decoder_only: 150 | decoder_layers = nn.TransformerDecoderLayer(hidden_dim, num_heads, hidden_dim, norm_first = norm_first) 151 | self.transformer_decoder = nn.TransformerDecoder(decoder_layers, num_layers) 152 | else: 153 | encoder_layers = nn.TransformerEncoderLayer(hidden_dim, num_heads, hidden_dim, norm_first = norm_first) 154 | self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers//2) 155 | decoder_layers = nn.TransformerDecoderLayer(hidden_dim, num_heads, hidden_dim, norm_first = norm_first) 156 | self.transformer_decoder = nn.TransformerDecoder(decoder_layers, num_layers//2) 157 | self.out = nn.Linear(hidden_dim, output_dim) 158 | 159 | def forward(self, src): 160 | src = self.embedding(src).unsqueeze(0) 161 | src = self.pos_encoder(src) 162 | if self.encoder_only: 163 | src = self.transformer_encoder(src) 164 | elif self.decoder_only: 165 | src = self.transformer_decoder(src, src) 166 | else: 167 | src = self.transformer_encoder(src) 168 | src = self.transformer_decoder(src, src) 169 | output = self.out(src) 170 | return output 171 | 172 | 173 | class KANLinear(torch.nn.Module): 174 | def __init__( 175 | self, 176 | in_features, 177 | out_features, 178 | grid_size=5, 179 | spline_order=3, 180 | scale_noise=0.1, 181 | scale_base=1.0, 182 | scale_spline=1.0, 
# NOTE(review): continuation of KANLinear -- this looks like the widely used
# "efficient-kan" implementation (B-spline bases + base/spline linear paths);
# verify provenance before changing numerics.
# NOTE(review): kaiming_uniform_ is called with a = sqrt(5) * scale, reusing the
# nn.Linear default-init convention scaled by the KAN scale factors.
183 | enable_standalone_scale_spline=True, 184 | base_activation=torch.nn.SiLU, 185 | grid_eps=0.02, 186 | grid_range=[-1, 1], 187 | ): 188 | super(KANLinear, self).__init__() 189 | self.in_features = in_features 190 | self.out_features = out_features 191 | self.grid_size = grid_size 192 | self.spline_order = spline_order 193 | 194 | h = (grid_range[1] - grid_range[0]) / grid_size 195 | grid = ( 196 | ( 197 | torch.arange(-spline_order, grid_size + spline_order + 1) * h 198 | + grid_range[0] 199 | ) 200 | .expand(in_features, -1) 201 | .contiguous() 202 | ) 203 | 204 | self.register_buffer("grid", grid) 205 | 206 | self.base_weight = torch.nn.Parameter(torch.Tensor(out_features, in_features)) 207 | self.spline_weight = torch.nn.Parameter( 208 | torch.Tensor(out_features, in_features, grid_size + spline_order) 209 | ) 210 | if enable_standalone_scale_spline: 211 | self.spline_scaler = torch.nn.Parameter( 212 | torch.Tensor(out_features, in_features) 213 | ) 214 | 215 | self.scale_noise = scale_noise 216 | self.scale_base = scale_base 217 | self.scale_spline = scale_spline 218 | self.enable_standalone_scale_spline = enable_standalone_scale_spline 219 | self.base_activation = base_activation() 220 | self.grid_eps = grid_eps 221 | 222 | self.reset_parameters() 223 | 224 | def reset_parameters(self): 225 | torch.nn.init.kaiming_uniform_(self.base_weight, a=math.sqrt(5) * self.scale_base) 226 | with torch.no_grad(): 227 | noise = ( 228 | ( 229 | torch.rand(self.grid_size + 1, self.in_features, self.out_features) 230 | - 1 / 2 231 | ) 232 | * self.scale_noise 233 | / self.grid_size 234 | ) 235 | self.spline_weight.data.copy_( 236 | (self.scale_spline if not self.enable_standalone_scale_spline else 1.0) 237 | * self.curve2coeff( 238 | self.grid.T[self.spline_order : -self.spline_order], 239 | noise, 240 | ) 241 | ) 242 | if self.enable_standalone_scale_spline: 243 | # torch.nn.init.constant_(self.spline_scaler, self.scale_spline) 244 | 
torch.nn.init.kaiming_uniform_(self.spline_scaler, a=math.sqrt(5) * self.scale_spline) 245 | 246 | def b_splines(self, x: torch.Tensor): 247 | """ 248 | Compute the B-spline bases for the given input tensor. 249 | 250 | Args: 251 | x (torch.Tensor): Input tensor of shape (batch_size, in_features). 252 | 253 | Returns: 254 | torch.Tensor: B-spline bases tensor of shape (batch_size, in_features, grid_size + spline_order). 255 | """ 256 | assert x.dim() == 2 and x.size(1) == self.in_features 257 | 258 | grid: torch.Tensor = ( 259 | self.grid 260 | ) # (in_features, grid_size + 2 * spline_order + 1) 261 | x = x.unsqueeze(-1) 262 | bases = ((x >= grid[:, :-1]) & (x < grid[:, 1:])).to(x.dtype) 263 | for k in range(1, self.spline_order + 1): 264 | bases = ( 265 | (x - grid[:, : -(k + 1)]) 266 | / (grid[:, k:-1] - grid[:, : -(k + 1)]) 267 | * bases[:, :, :-1] 268 | ) + ( 269 | (grid[:, k + 1 :] - x) 270 | / (grid[:, k + 1 :] - grid[:, 1:(-k)]) 271 | * bases[:, :, 1:] 272 | ) 273 | 274 | assert bases.size() == ( 275 | x.size(0), 276 | self.in_features, 277 | self.grid_size + self.spline_order, 278 | ) 279 | return bases.contiguous() 280 | 281 | def curve2coeff(self, x: torch.Tensor, y: torch.Tensor): 282 | """ 283 | Compute the coefficients of the curve that interpolates the given points. 284 | 285 | Args: 286 | x (torch.Tensor): Input tensor of shape (batch_size, in_features). 287 | y (torch.Tensor): Output tensor of shape (batch_size, in_features, out_features). 288 | 289 | Returns: 290 | torch.Tensor: Coefficients tensor of shape (out_features, in_features, grid_size + spline_order). 
# NOTE(review): curve2coeff fits spline coefficients via torch.linalg.lstsq;
# forward() flattens leading dims, applies base + spline linear paths, and
# restores the original shape. update_grid() re-fits the grid to the observed
# data distribution, blended by grid_eps between uniform and adaptive knots.
291 | """ 292 | assert x.dim() == 2 and x.size(1) == self.in_features 293 | assert y.size() == (x.size(0), self.in_features, self.out_features) 294 | 295 | A = self.b_splines(x).transpose( 296 | 0, 1 297 | ) # (in_features, batch_size, grid_size + spline_order) 298 | B = y.transpose(0, 1) # (in_features, batch_size, out_features) 299 | solution = torch.linalg.lstsq( 300 | A, B 301 | ).solution # (in_features, grid_size + spline_order, out_features) 302 | result = solution.permute( 303 | 2, 0, 1 304 | ) # (out_features, in_features, grid_size + spline_order) 305 | 306 | assert result.size() == ( 307 | self.out_features, 308 | self.in_features, 309 | self.grid_size + self.spline_order, 310 | ) 311 | return result.contiguous() 312 | 313 | @property 314 | def scaled_spline_weight(self): 315 | return self.spline_weight * ( 316 | self.spline_scaler.unsqueeze(-1) 317 | if self.enable_standalone_scale_spline 318 | else 1.0 319 | ) 320 | 321 | def forward(self, x: torch.Tensor): 322 | assert x.size(-1) == self.in_features 323 | original_shape = x.shape 324 | x = x.reshape(-1, self.in_features) 325 | 326 | base_output = F.linear(self.base_activation(x), self.base_weight) 327 | spline_output = F.linear( 328 | self.b_splines(x).view(x.size(0), -1), 329 | self.scaled_spline_weight.view(self.out_features, -1), 330 | ) 331 | output = base_output + spline_output 332 | 333 | output = output.reshape(*original_shape[:-1], self.out_features) 334 | return output 335 | 336 | @torch.no_grad() 337 | def update_grid(self, x: torch.Tensor, margin=0.01): 338 | assert x.dim() == 2 and x.size(1) == self.in_features 339 | batch = x.size(0) 340 | 341 | splines = self.b_splines(x) # (batch, in, coeff) 342 | splines = splines.permute(1, 0, 2) # (in, batch, coeff) 343 | orig_coeff = self.scaled_spline_weight # (out, in, coeff) 344 | orig_coeff = orig_coeff.permute(1, 2, 0) # (in, coeff, out) 345 | unreduced_spline_output = torch.bmm(splines, orig_coeff) # (in, batch, out) 346 | 
# NOTE(review): `torch.concatenate` is the NumPy-style alias added in torch 1.13;
# older torch needs torch.cat. Also, the entropy regularizer below computes
# p * p.log(), which yields NaN if any spline row's mean |weight| is exactly
# zero (p == 0) -- unlikely after random init, but a latent hazard.
unreduced_spline_output = unreduced_spline_output.permute( 347 | 1, 0, 2 348 | ) # (batch, in, out) 349 | 350 | # sort each channel individually to collect data distribution 351 | x_sorted = torch.sort(x, dim=0)[0] 352 | grid_adaptive = x_sorted[ 353 | torch.linspace( 354 | 0, batch - 1, self.grid_size + 1, dtype=torch.int64, device=x.device 355 | ) 356 | ] 357 | 358 | uniform_step = (x_sorted[-1] - x_sorted[0] + 2 * margin) / self.grid_size 359 | grid_uniform = ( 360 | torch.arange( 361 | self.grid_size + 1, dtype=torch.float32, device=x.device 362 | ).unsqueeze(1) 363 | * uniform_step 364 | + x_sorted[0] 365 | - margin 366 | ) 367 | 368 | grid = self.grid_eps * grid_uniform + (1 - self.grid_eps) * grid_adaptive 369 | grid = torch.concatenate( 370 | [ 371 | grid[:1] 372 | - uniform_step 373 | * torch.arange(self.spline_order, 0, -1, device=x.device).unsqueeze(1), 374 | grid, 375 | grid[-1:] 376 | + uniform_step 377 | * torch.arange(1, self.spline_order + 1, device=x.device).unsqueeze(1), 378 | ], 379 | dim=0, 380 | ) 381 | 382 | self.grid.copy_(grid.T) 383 | self.spline_weight.data.copy_(self.curve2coeff(x, unreduced_spline_output)) 384 | 385 | def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0): 386 | """ 387 | Compute the regularization loss. 388 | 389 | This is a dumb simulation of the original L1 regularization as stated in the 390 | paper, since the original one requires computing absolutes and entropy from the 391 | expanded (batch, in_features, out_features) intermediate tensor, which is hidden 392 | behind the F.linear function if we want an memory efficient implementation. 393 | 394 | The L1 regularization is now computed as mean absolute value of the spline 395 | weights. The authors implementation also includes this term in addition to the 396 | sample-based regularization. 
# NOTE(review): after the KAN wrapper, Periodicity_Modeling/generate_periodic_data.py
# begins. `sawtooth_wave(t, n)` ignores its `n` parameter entirely (the harmonic
# index is unused) and the function is not called anywhere in the visible span.
397 | """ 398 | l1_fake = self.spline_weight.abs().mean(-1) 399 | regularization_loss_activation = l1_fake.sum() 400 | p = l1_fake / regularization_loss_activation 401 | regularization_loss_entropy = -torch.sum(p * p.log()) 402 | return ( 403 | regularize_activation * regularization_loss_activation 404 | + regularize_entropy * regularization_loss_entropy 405 | ) 406 | 407 | 408 | @register_model('KAN') 409 | class KAN(nn.Module): 410 | def __init__( 411 | self, 412 | input_dim=1, 413 | output_dim=1, 414 | hidden_dim=128, 415 | num_layers=3, 416 | grid_size=50, 417 | spline_order=3, 418 | scale_noise=0.1, 419 | scale_base=1.0, 420 | scale_spline=1.0, 421 | base_activation=torch.nn.SiLU, 422 | grid_eps=0.02, 423 | grid_range=[-1, 1], 424 | ): 425 | super(KAN, self).__init__() 426 | self.grid_size = grid_size 427 | self.spline_order = spline_order 428 | layers_hidden=[input_dim] + [hidden_dim] * num_layers + [output_dim] 429 | 430 | self.layers = torch.nn.ModuleList() 431 | for in_features, out_features in zip(layers_hidden, layers_hidden[1:]): 432 | self.layers.append( 433 | KANLinear( 434 | in_features, 435 | out_features, 436 | grid_size=grid_size, 437 | spline_order=spline_order, 438 | scale_noise=scale_noise, 439 | scale_base=scale_base, 440 | scale_spline=scale_spline, 441 | base_activation=base_activation, 442 | grid_eps=grid_eps, 443 | grid_range=grid_range, 444 | ) 445 | ) 446 | 447 | def forward(self, x: torch.Tensor, update_grid=False): 448 | for layer in self.layers: 449 | if update_grid: 450 | layer.update_grid(x) 451 | x = layer(x) 452 | return x 453 | 454 | def regularization_loss(self, regularize_activation=1.0, regularize_entropy=1.0): 455 | return sum( 456 | layer.regularization_loss(regularize_activation, regularize_entropy) 457 | for layer in self.layers 458 | ) 459 | -------------------------------------------------------------------------------- /Periodicity_Modeling/generate_periodic_data.py: 
import numpy as np


def sawtooth_wave(t, n):
    """Sawtooth wave of period pi, ranging over [-0.5, 0.5).

    Note: ``n`` is unused; it is kept for backward compatibility with call
    sites that pass the harmonic index (e.g. ``sawtooth_wave(n * t, n)``).
    """
    return (t / np.pi) - np.floor(t / np.pi + 0.5)


def gen_periodic_data(periodic_type):
    """Build train/test data and training hyper-parameters for one periodic task.

    Args:
        periodic_type (str): one of 'sin', 'mod', 'complex_1' ... 'complex_6'.

    Returns:
        (t, data, t_test, data_test, PERIOD, BATCHSIZE, NUMEPOCH, PRINTEPOCH,
        lr, wd, y_uper, y_lower). The test inputs cover a domain twice as wide
        as the training inputs (three times for 'sin') so extrapolation can be
        evaluated.

    Raises:
        ValueError: if ``periodic_type`` is unknown (previously this fell
        through to the return statement and crashed with UnboundLocalError).
    """
    # Training hyper-parameters; identical across every task.
    BATCHSIZE = 32
    NUMEPOCH = 10000
    PRINTEPOCH = 50
    lr = 1e-5
    wd = 0.01

    if periodic_type == 'sin':
        # 'sin' is the one task whose domain really is PERIOD periods wide:
        # train on [-PERIOD*pi, PERIOD*pi], test on a domain three times wider.
        PERIOD = 6
        y_uper, y_lower = 1.5, -1.5
        print(f'generate data from the {periodic_type} function')
        t = np.linspace(-PERIOD * np.pi, PERIOD * np.pi, int(10000 * PERIOD))
        data = np.sin(t)
        t_test = np.linspace(-PERIOD * 3 * np.pi, PERIOD * 3 * np.pi, 4000)
        data_test = np.sin(t_test)
        return (t, data, t_test, data_test, PERIOD, BATCHSIZE, NUMEPOCH,
                PRINTEPOCH, lr, wd, y_uper, y_lower)

    def _sawtooth_series(t):
        # Truncated harmonic series of sawtooth waves (5 terms).
        acc = np.zeros_like(t)
        for k in range(1, 6):
            acc += (1 / k) * sawtooth_wave(k * t, k)
        return acc

    # For every other task, PERIOD only controls the number of training
    # samples (10000 * PERIOD); the sampling domain is the generators'
    # historical default: [-100, 100] for training, [-200, 200] for testing.
    # Mapping: periodic_type -> (PERIOD, y_uper, y_lower, waveform).
    tasks = {
        'mod': (20, 10, -5, lambda t: [i % 5 for i in t]),  # list output (original behavior)
        'complex_1': (4, 20, -20,
                      lambda t: np.exp(np.sin(np.pi * t) ** 2 + np.cos(t) + t % 3 - 1)),
        'complex_2': (4, 4, -4, lambda t: (1 + np.sin(t)) * np.sin(2 * t)),
        'complex_3': (4, 2, -2, lambda t: np.sin(t + np.sin(2 * t))),
        'complex_4': (4, 2, -2,
                      lambda t: np.sin(t) * np.cos(2 * t) ** 2 + np.cos(t) * np.sin(3 * t) ** 2),
        'complex_5': (4, 1, -1, _sawtooth_series),
        'complex_6': (4, 3, 0, lambda t: np.exp(np.sin(t)) / (1 + np.cos(2 * t) ** 2)),
    }
    if periodic_type not in tasks:
        raise ValueError(f'unknown periodic_type: {periodic_type}')

    PERIOD, y_uper, y_lower, waveform = tasks[periodic_type]
    print(f'generate data from the {periodic_type} function')
    DOMAIN = 100  # half-width of the training domain (generators' old default)
    t = np.linspace(-DOMAIN, DOMAIN, int(10000 * PERIOD))
    data = waveform(t)
    t_test = np.linspace(-2 * DOMAIN, 2 * DOMAIN, 4000)
    data_test = waveform(t_test)
    return (t, data, t_test, data_test, PERIOD, BATCHSIZE, NUMEPOCH,
            PRINTEPOCH, lr, wd, y_uper, y_lower)


def plot_periodic_data(t, data, t_test, data_test, result, args, epoch, path, y_uper, y_lower):
    """Overlay the model's test-set predictions on the train/test targets and
    save the figure to ``{path}/epoch{epoch}.png``.

    ``args`` is unused but kept for call-site compatibility.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(35, 5))
    plt.plot(t_test, data_test, label='Domain of Test Data', color='blue')
    plt.plot(t, data, label='Domain of Training Data', color='green')
    plt.plot(t_test, result, label='Model Predictions', color='red', linestyle='--')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.xlim(min(t_test), max(t_test))
    plt.ylim(y_lower, y_uper)
    # plt.legend()
    plt.savefig(f'{path}/epoch{epoch}.png')
def read_log_file(file_path):
    """Parse a training log and return ``(train_losses, test_losses)``.

    A line contributes its last whitespace-separated token (as a float) to the
    train list when it contains 'Train Loss', or to the test list when it
    contains 'Test Loss'; all other lines are ignored.
    """
    train_loss, test_loss = [], []
    with open(file_path, 'r') as handle:
        for line in handle:
            if 'Train Loss' in line:
                train_loss.append(float(line.split(' ')[-1].strip()))
            elif 'Test Loss' in line:
                test_loss.append(float(line.split(' ')[-1].strip()))
    return train_loss, test_loss


def plot_periodic_loss(log_file_path):
    """Plot the train/test loss curves parsed from a log file side by side and
    save them as ``<log stem>.pdf`` (losses are logged every 50 epochs)."""
    import matplotlib.pyplot as plt

    train_hist, test_hist = read_log_file(log_file_path)
    stem = log_file_path.split('.')[0]

    fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(10, 4))
    ax_train.plot(np.arange(0, len(train_hist) * 50, 50), train_hist, label=stem)
    ax_test.plot(np.arange(0, len(test_hist) * 50, 50), test_hist, label=stem)

    ax_train.set_xlabel('Epoch')
    ax_train.set_ylabel('Training Loss')
    ax_train.legend(loc='upper right')

    ax_test.set_xlabel('Epoch')
    ax_test.set_ylabel('Test Loss')
    ax_test.legend(loc='upper right')
    plt.savefig(f'{stem}.pdf')
import torch
import math

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

import os
import sys
import argparse

from torch.utils.data import TensorDataset, DataLoader

from generate_periodic_data import gen_periodic_data, plot_periodic_data
from architecture import get_model_by_name

# Train one model on one periodic task, periodically evaluating on a wider
# (extrapolation) test domain and plotting predictions.
model_names = ['FAN', 'FANGated', 'MLP', 'KAN', 'Transformer']
periodic_types = ['sin', 'mod', 'complex_1', 'complex_2', 'complex_3', 'complex_4', 'complex_5', 'complex_6']

parser = argparse.ArgumentParser()
parser.add_argument('--periodic_type', type=str, choices=periodic_types, help='periodic type', default='sin')
parser.add_argument('--path', type=str, help='output directory for plots and checkpoints')
parser.add_argument('--model_name', type=str, choices=model_names, help='model name', default='FAN')
args = parser.parse_args()

# Task data plus all training hyper-parameters come from the data module.
t, data, t_test, data_test, PERIOD, BATCHSIZE, NUMEPOCH, PRINTEPOCH, lr, wd, y_uper, y_lower = gen_periodic_data(args.periodic_type)

path = args.path
os.makedirs(path, exist_ok=True)  # replaces the exists()/makedirs() pair and duplicate `import os`

t_tensor = torch.tensor(t).float().unsqueeze(1)
data_tensor = torch.tensor(data).float().unsqueeze(1)
dataloader_train = DataLoader(TensorDataset(t_tensor, data_tensor), batch_size=BATCHSIZE, shuffle=True)

t_test_tensor = torch.tensor(t_test).float().unsqueeze(1)
data_test_tensor = torch.tensor(data_test).float().unsqueeze(1)
dataloader_test = DataLoader(TensorDataset(t_test_tensor, data_test_tensor), batch_size=BATCHSIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f'model name: {args.model_name}')
model = get_model_by_name(args.model_name, input_dim=1, output_dim=1, num_layers=3).to(device)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)

# ---- training loop ----
for epoch in range(NUMEPOCH):
    model.train()
    total_loss = 0
    for x, y in dataloader_train:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        output = model(x)
        # Fix: the original compared output.squeeze(0) against y, which would
        # mis-broadcast on a batch of size 1; shapes already match as-is.
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    if epoch % PRINTEPOCH == 0:
        print(f'Epoch {epoch}, Train Loss {total_loss / len(dataloader_train)}')
        model.eval()

        # Evaluate on the wider test domain and collect predictions for plotting.
        result = []
        total_test_loss = 0
        with torch.no_grad():
            for x, y in dataloader_test:
                x = x.to(device)
                y = y.to(device)
                predictions = model(x)
                result.extend(predictions.cpu().squeeze())
                test_loss = criterion(predictions, y)
                total_test_loss += test_loss.item()
        print(f'Epoch {epoch}, Test Loss {total_test_loss / len(dataloader_test)}')

        plot_periodic_data(t, data, t_test, data_test, result, args, epoch, path, y_uper, y_lower)

        # Fix: the checkpoint path was f'{args.model_name}/.pth', a hidden file
        # inside a directory that is never created; save into the run directory.
        torch.save(model.state_dict(), f'{path}/{args.model_name}.pth')

# Final evaluation after training completes.
model.eval()
total_test_loss = 0
with torch.no_grad():
    for x, y in dataloader_test:
        x = x.to(device)
        y = y.to(device)
        predictions = model(x)
        test_loss = criterion(predictions, y)
        total_test_loss += test_loss.item()
print(f'Final Epoch, Test Loss {total_test_loss / len(dataloader_test)}')
Fourier Analysis Networks 2 | [![arXiv](https://img.shields.io/badge/arXiv-2410.02675-b31b1b.svg)](https://arxiv.org/abs/2410.02675) 3 | 4 | FAN Layer 5 | 6 | | | MLP Layer | FAN layer | 7 | |---------------------------|:----------------------------------------------------------:|:------------------------------------------------------------------------:| 8 | | **Formula** | $\Phi(x) = \sigma(B_{m} + W_{m}x)$ | $\phi(x) = [\cos(W_px)\|\| \sin(W_px)\|\| \sigma(B_{\bar{p}} + W_{\bar{p}}x)]$ | 9 | | **Num of Params** | $(d_\text{input} \times d_\text{output}) + d_\text{output}$ | $(1-\frac{d_p}{d_\text{output}})\times((d_\text{input} \times d_\text{output}) + d_\text{output})$ | 10 | | **FLOPs** | $2\times(d_\text{input} \times d_\text{output})$
$+ d_\text{output} \times \text{FLOPs}_\text{non-linear}$ | $(1-\frac{d_p}{d_\text{output}})\times(2\times(d_\text{input} \times d_\text{output}))$
$+ d_\text{output} \times \text{FLOPs}_\text{non-linear} $ | 11 | 12 | 13 | ## Periodicity Modeling 14 | ```shell 15 | cd Periodicity_Modeling 16 | bash ./run.sh 17 | ``` 18 | ![sin](./img/sin.jpg) 19 | ![mod](./img/mod.jpg) 20 | 21 | ## Scaling Law 22 | Detailed implementations are available in [![FANformer Repo](https://img.shields.io/badge/GitHub-FANformer-blue)](https://github.com/YihongDong/FANformer). 23 | 24 | ## Sentiment Analysis 25 | The data can be automatically downloaded using the Huggingface Datasets `load_dataset` function in the `./Sentiment_Analysis/get_dataloader.py`. 26 | 27 | ```shell 28 | cd Sentiment_Analysis 29 | bash scripts/Trans_with_FAN/train_ours.sh 30 | bash scripts/Trans_with_FAN/test_ours.sh 31 | ``` 32 | 33 | ## Timeseries Forecasting 34 | You can obtain data from [Google Drive](https://drive.google.com/drive/folders/1v1uLx5zhGaNAOTIqHLHYMXtA-XFrKTxS?usp=sharing). All the datasets are well pre-processed and can be used easily. 35 | 36 | ```shell 37 | cd Timeseries_Forecasting 38 | bash scripts/Weather_script/Modified_Transformer.sh 39 | ``` 40 | 41 | ## Symbolic Formula Representation 42 | ```shell 43 | cd Symbolic_Formula_Representation 44 | python gen_dataset.py 45 | bash run_train_fan.sh 46 | ``` 47 | 48 | ## Image Recognition 49 | ```shell 50 | cd Image_Recognition 51 | bash run_image_recognition.sh 52 | ``` 53 | 54 | ## Citation 55 | ``` 56 | @article{dong2024fan, 57 | title={FAN: Fourier Analysis Networks}, 58 | author={Yihong Dong and Ge Li and Yongding Tao and Xue Jiang and Kechi Zhang and Jia Li and Jing Su and Jun Zhang and Jingjing Xu}, 59 | journal={arXiv preprint arXiv:2410.02675}, 60 | year={2024} 61 | } 62 | ``` 63 | -------------------------------------------------------------------------------- /Sentiment_Analysis/get_dataloader.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from transformers import BertTokenizer, AutoTokenizer 3 | from 
torch.utils.data import DataLoader, ConcatDataset 4 | 5 | class tokenized_dataloader: 6 | def __init__(self, args): 7 | if args.model == "mamba": 8 | self.tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf") 9 | else: 10 | self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 11 | 12 | self.dataset = args.dataset 13 | self.batch_size = args.batch_size 14 | 15 | def get_data_loaders(self, part_list=["train"]): 16 | dataset_list = [] 17 | for part in part_list: 18 | if part in ["train", "validation", "test"]: 19 | dataset = self.get_tokenized_dataset(dataset_name=self.dataset, part=part) 20 | dataset_list.append(dataset) 21 | else: 22 | raise ValueError("part must be one of 'train', 'validation', 'test'") 23 | if len(part_list) == 1 and part_list[0] in ["validation", "test"]: 24 | return DataLoader(dataset_list[0], batch_size=self.batch_size, shuffle=False, num_workers=4) 25 | else: 26 | dataset = ConcatDataset(dataset_list) 27 | dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4) 28 | return dataloader 29 | 30 | def get_tokenized_dataset(self, dataset_name, part="train"): 31 | if dataset_name == "sst2": 32 | dataset = load_dataset("glue", "sst2") 33 | dataset[part] = dataset[part].remove_columns(["idx"]) 34 | dataset[part] = dataset[part].rename_column("sentence", "text") 35 | elif dataset_name == "imdb": 36 | dataset = load_dataset("imdb") 37 | elif dataset_name == "sentiment140": 38 | dataset = load_dataset("adilbekovich/Sentiment140Twitter", encoding='ISO-8859-1') 39 | elif dataset_name == "amazon_polarity": 40 | dataset = load_dataset("amazon_polarity") 41 | def combine_title_content(batch): 42 | batch['text'] = [title + '. 
class FANLayer(nn.Module):
    """FAN layer (https://arxiv.org/abs/2410.02675).

    Output is ``[cos(W_p x) || sin(W_p x) || sigma(W_g x + b_g)]``, optionally
    mixed by a learned scalar gate.

    Args:
        input_dim (int): Number of input features.
        output_dim (int): Number of output features.
        bias (bool): Include bias in the periodic projection (default: True).
        with_gate (bool): Learn a scalar gate balancing the periodic (cos/sin)
            and non-periodic (g) parts (default: True).
        p_ratio (float): Fraction of ``output_dim`` given to each of the cos
            and sin parts (default: 0.25, which matches the previous
            hard-coded ``output_dim // 4`` whenever output_dim is a multiple
            of 4 — the only case used by the BERT dims in this repo).
    """

    def __init__(self, input_dim, output_dim, bias=True, with_gate=True, p_ratio=0.25):
        super(FANLayer, self).__init__()
        # Fix 1: sibling code (BertIntermediate_withFAN) passes p_ratio, which
        # the old signature rejected with a TypeError; accept it with a
        # backward-compatible default.
        # Fix 2: the old split (output_dim//4 for p, output_dim - output_dim//2
        # for g) only sums to output_dim when output_dim is divisible by 4;
        # deriving g's width as output_dim - 2*p_dim always sums correctly.
        p_dim = int(output_dim * p_ratio)
        self.input_linear_p = nn.Linear(input_dim, p_dim, bias=bias)
        self.input_linear_g = nn.Linear(input_dim, output_dim - 2 * p_dim)
        self.activation = nn.GELU()
        if with_gate:
            self.gate = nn.Parameter(torch.randn(1, dtype=torch.float32))

    def forward(self, src):
        """Return the concatenated periodic and gated-MLP features; the output's
        last dimension is ``output_dim``."""
        g = self.activation(self.input_linear_g(src))
        p = self.input_linear_p(src)

        if not hasattr(self, 'gate'):
            return torch.cat((torch.cos(p), torch.sin(p), g), dim=-1)
        gate = torch.sigmoid(self.gate)
        return torch.cat((gate * torch.cos(p), gate * torch.sin(p), (1 - gate) * g), dim=-1)
class BertIntermediate_withFAN(nn.Module):
    """Drop-in replacement for BERT's intermediate layer using a FANLayer.

    No separate activation is applied here because FANLayer already applies
    its own non-linearity to the g component.
    """

    def __init__(self, config):
        super().__init__()
        # Fix: the old code read config.p_ratio / config.with_gate, attributes a
        # plain BertConfig never defines, raising AttributeError — and forwarded
        # p_ratio, which this repo's FANLayer signature does not accept.
        # Use the FANLayer defaults instead (p_ratio 0.25 is its built-in split).
        # NOTE(review): with_gate defaults to False here, matching the
        # classifier's default; set config.with_gate upstream to enable gating.
        self.dense = FANLayer(
            config.hidden_size,
            config.intermediate_size,
            with_gate=getattr(config, 'with_gate', False),
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Project hidden states from hidden_size to intermediate_size."""
        hidden_states = self.dense(hidden_states)
        return hidden_states
class MambaForSequenceClassification(MambaPreTrainedModel):
    """Mamba backbone with a pooling head and a linear classifier on top.

    The head mirrors HuggingFace sequence-classification models: pool the last
    hidden state, apply dropout, then project to ``num_labels`` logits.
    """

    def __init__(self, config, num_labels=2, max_pooler=False):
        super(MambaForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.backbone = MambaModel(config)
        # Either max-pool over the sequence, or use the BERT-style
        # first-token pooler.
        self.pooler = Pooling(pool_type='max') if max_pooler else Pooler(config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_params: Optional[MambaCache] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        """Run the backbone, pool, classify, and — when labels are given —
        compute the loss matching ``config.problem_type``."""
        backbone_out = self.backbone(
            input_ids,
            cache_params=cache_params,
            inputs_embeds=inputs_embeds,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            use_cache=use_cache,
            cache_position=cache_position,
        )
        pooled = self.pooler(backbone_out.last_hidden_state)
        logits = self.classifier(self.dropout(pooled))

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the problem type once, exactly as HF classifiers do.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels.float())

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )
{}".format(config.hidden_size)) 16 | logging.info("num_hidden_layers: {}".format(config.num_hidden_layers)) 17 | model = MambaForSequenceClassification(config=config, num_labels=args.num_classes, max_pooler=args.max_pooler).to(args.device) 18 | print('model:', model) 19 | return model -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/Trans/test_baseline_trans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | prefix=baseline_trans 6 | dataset=amazon_polarity # (imdb, sentiment140, amazon_polarity) 7 | 8 | # create log dir 9 | log_dir=./logs/$prefix 10 | mkdir -p $log_dir 11 | 12 | python test.py \ 13 | --batch_size 128 \ 14 | --prefix $prefix \ 15 | --dataset $dataset \ 16 | --log_file $log_dir/test_on_$dataset.log # > logs/screen_$prefix.log 2>&1 17 | # --save_path checkpoints/$prefix # > logs/screen_$prefix.log 2>&1 18 | -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/Trans/train_baseline_trans.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | prefix=baseline_trans 6 | 7 | python train.py \ 8 | --batch_size 128 \ 9 | --epochs 10 \ 10 | --learning_rate 5e-5 \ 11 | --prefix $prefix \ 12 | --log_file logs/tmp.log 13 | # logs/$prefix.log # > logs/screen_$prefix.log 2>&1 14 | # --save_path checkpoints/$prefix # > logs/screen_$prefix.log 2>&1 -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/Trans_with_FAN/test_ours.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | num_hidden_layers=12 6 | prefix=ours_trans 7 | 8 | dataset=amazon_polarity # (imdb, sentiment140, amazon_polarity) 9 | 10 | # create log dir 11 | 
log_dir=./logs/$prefix 12 | mkdir -p $log_dir 13 | 14 | python test.py \ 15 | --batch_size 128 \ 16 | --replace_ffn \ 17 | --prefix $prefix \ 18 | --dataset $dataset \ 19 | --num_hidden_layers $num_hidden_layers \ 20 | --log_file $log_dir/test_on_$dataset.log # > logs/screen_$prefix.log 2>&1 21 | # --save_path checkpoints/$prefix # > logs/screen_$prefix.log 2>&1 -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/Trans_with_FAN/test_ours_withgate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | num_hidden_layers=12 6 | prefix=ours_trans_withgate 7 | 8 | datasets=("imdb") 9 | # ( "imdb" "sentiment140" "amazon_polarity") 10 | 11 | log_dir=./logs/$prefix 12 | mkdir -p $log_dir 13 | 14 | for dataset in "${datasets[@]}" 15 | do 16 | python test.py \ 17 | --batch_size 128 \ 18 | --replace_ffn \ 19 | --with_gate \ 20 | --prefix $prefix \ 21 | --dataset $dataset \ 22 | --num_hidden_layers $num_hidden_layers \ 23 | --log_file $log_dir/test_on_$dataset.log 24 | done 25 | -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/Trans_with_FAN/train_ours.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | num_hidden_layers=12 6 | prefix=ours_trans 7 | 8 | python train.py \ 9 | --batch_size 128 \ 10 | --epochs 50 \ 11 | --learning_rate 5e-5 \ 12 | --prefix $prefix \ 13 | --replace_ffn \ 14 | --num_hidden_layers $num_hidden_layers \ 15 | --log_file logs/$prefix.log # > logs/screen_$prefix.log 2>&1 16 | # --save_path checkpoints/$prefix # > logs/screen_$prefix.log 2>&1 -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/Trans_with_FAN/train_ours_withgate.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | prefix=ours_trans_withgate 6 | 7 | num_hidden_layers=12 8 | 9 | python train.py \ 10 | --batch_size 128 \ 11 | --epochs 50 \ 12 | --learning_rate 5e-5 \ 13 | --prefix $prefix \ 14 | --replace_ffn \ 15 | --with_gate \ 16 | --num_hidden_layers $num_hidden_layers \ 17 | --log_file logs/$prefix'_'hlayers_$num_hidden_layers.log # > logs/screen_$prefix.log 2>&1 18 | # --save_path checkpoints/$prefix # > logs/screen_$prefix.log 2>&1 -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/mamba/test_mamba.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | model=mamba 6 | 7 | hidden_layers=24 8 | hidden_size=768 9 | learning_rate=5e-5 10 | prefix=$model'_'hlayers_$hidden_layers'_'hsize_$hidden_size'_'lr_$learning_rate'_'maxpooler 11 | 12 | # dataset list 13 | datasets=("imdb" "sentiment140" "amazon_polarity") 14 | 15 | log_dir=./logs/$model 16 | mkdir -p $log_dir 17 | 18 | for dataset in "${datasets[@]}" 19 | do 20 | python test.py \ 21 | --batch_size 128 \ 22 | --prefix $prefix \ 23 | --dataset $dataset \ 24 | --num_hidden_layers $hidden_layers \ 25 | --hidden_size $hidden_size \ 26 | --max_pooler \ 27 | --model $model \ 28 | --log_file $log_dir/$prefix'_'test_on_$dataset.log 29 | done 30 | -------------------------------------------------------------------------------- /Sentiment_Analysis/scripts/mamba/train_mamba.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export CUDA_VISIBLE_DEVICES=0 4 | 5 | model=mamba 6 | 7 | hidden_layers=24 8 | hidden_size=768 9 | learning_rate=5e-5 10 | prefix=$model'_'hlayers_$hidden_layers'_'hsize_$hidden_size'_'lr_$learning_rate'_'maxpooler 11 | 12 | python train.py \ 13 | --batch_size 128 \ 14 
def test(args):
    """Evaluate a trained checkpoint on the test split(s) of ``args.dataset``.

    Loads the best checkpoint saved by train.py
    (``checkpoints/<prefix>_best.pth``), runs one pass over the selected
    splits and logs the average loss and accuracy.
    """
    setup_logging(args.log_file)
    logging.info("--------------- Start testing ---------------")

    dataloader = tokenized_dataloader(args=args)
    # IMDB ships a dedicated test split; for the other datasets every listed
    # split is evaluated.
    if args.dataset == "imdb":
        part_list = ["test"]
    else:
        part_list = ["train", "test"]
    test_loader = dataloader.get_data_loaders(part_list=part_list)
    logging.info("test data on {} of {}".format(part_list, args.dataset))

    model = build_model(args)
    print(model)
    model_path = os.path.join('checkpoints', args.prefix + "_best.pth")
    # Fix: map_location keeps a CUDA-saved checkpoint loadable on CPU-only
    # machines (plain torch.load would raise on deserializing CUDA tensors).
    model.load_state_dict(torch.load(model_path, map_location=device))

    progress_bar = tqdm(range(len(test_loader)))

    metric = evaluate.load("accuracy")
    # testing
    model.eval()
    total_loss = 0.0
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        logits = outputs.logits

        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        progress_bar.update(1)

    avg_loss = total_loss / len(test_loader)
    val_accuracy = metric.compute()['accuracy']

    logging.info(f"Test {args.prefix} Model on {args.dataset} part_list={part_list}")
    logging.info(f"Test Loss: {avg_loss:.4f}, Test Accuracy: {val_accuracy:.4f}")
def train(args):
    """Fine-tune the sentiment classifier and keep the best checkpoint.

    Trains for ``args.epochs`` epochs with Adam and a cosine-with-restarts
    schedule, evaluates on the validation split after every epoch, and saves
    the weights with the highest validation accuracy to
    ``checkpoints/<prefix>_best.pth``.
    """
    import os  # local: this module does not import os at file level

    torch.manual_seed(42)
    setup_logging(args.log_file)
    logging.info("--------------- Start training ---------------")

    dataloader = tokenized_dataloader(args=args)
    train_loader = dataloader.get_data_loaders(part_list=['train'])
    val_loader = dataloader.get_data_loaders(part_list=['validation'])

    logging.info("the model is: {}".format(args.model))

    model = build_model(args)

    params = view_params(model)
    logging.info(params)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    logging.info("Optimizer: {}".format(optimizer))
    logging.info(f"Learning rate: {args.learning_rate}")

    num_training_steps = args.epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="cosine_with_restarts",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    best_val_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(args.epochs):
        model.train()
        metric = evaluate.load("accuracy")

        total_loss = 0.0
        for batch in train_loader:
            batch = {k: v.to(args.device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            # The linear head emits a single score; other heads emit class logits.
            if args.model == 'linear':
                predictions = (logits > 0.5).long()
            else:
                predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        avg_loss = total_loss / len(train_loader)
        # NOTE(review): the same metric object is reused for validation below;
        # this relies on evaluate's compute() flushing the accumulated
        # batches — confirm against the evaluate library version in use.
        train_accuracy = metric.compute()['accuracy']
        logging.info(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

        # validation
        model.eval()
        total_loss = 0.0
        for batch in val_loader:
            batch = {k: v.to(args.device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()
            logits = outputs.logits
            if args.model == 'linear':
                predictions = (logits > 0.5).long()
            else:
                predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        avg_loss = total_loss / len(val_loader)
        val_accuracy = metric.compute()['accuracy']
        logging.info(f"Epoch {epoch + 1}, Validation Loss: {avg_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # save model with best validation accuracy
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # Fix: create the checkpoint directory up front so torch.save does
            # not fail with FileNotFoundError on a fresh clone.
            os.makedirs("checkpoints", exist_ok=True)
            save_directory = f"checkpoints/{args.prefix}_best.pth"
            torch.save(model.state_dict(), save_directory)
            logging.info("Save model with best validation accuracy")
class Pooling(nn.Module):
    """Collapse the sequence dimension of a (batch, seq, features) tensor.

    Args:
        pool_type: 'mean' for average pooling or 'max' for max pooling
            over the sequence axis.

    Raises:
        ValueError: if ``pool_type`` is neither 'mean' nor 'max'.
    """

    def __init__(self, pool_type='mean'):
        super(Pooling, self).__init__()
        self.pool_type = pool_type
        # Dispatch table instead of an if/elif chain.
        pool_classes = {
            'mean': nn.AdaptiveAvgPool1d,
            'max': nn.AdaptiveMaxPool1d,
        }
        if pool_type not in pool_classes:
            raise ValueError("pool_type must be either 'mean' or 'max'.")
        self.pool = pool_classes[pool_type](1)

    def forward(self, x):
        # (batch, seq, feat) -> (batch, feat, seq): 1d pooling acts on dim -1.
        channels_first = x.transpose(1, 2)
        # (batch, feat, 1) -> (batch, feat)
        return self.pool(channels_first).squeeze(2)
def produce_dataset(dataset_idx):
    """Generate one of the four symbolic-formula benchmark datasets.

    Args:
        dataset_idx: benchmark index in {0, 1, 2, 3}.

    Returns:
        The dict produced by kan's ``create_dataset`` (train/test inputs
        and labels), with 3000 training samples.

    Raises:
        ValueError: if ``dataset_idx`` is not in {0, 1, 2, 3}.
    """
    if dataset_idx == 0:
        # Fix: bessel_j0 already returns a tensor; wrapping it in
        # torch.tensor() made a redundant copy (and a warning).
        f = lambda x: torch.special.bessel_j0(20 * x[:, [0]])
        dataset = create_dataset(f, n_var=1, train_num=3000, device=device)
    elif dataset_idx == 1:
        def f(x):
            return torch.exp(torch.sin(torch.pi * x[:, [0]]) + x[:, [0]]**2)
        dataset = create_dataset(f, n_var=2, train_num=3000, device=device)
    elif dataset_idx == 2:
        f = lambda x: x[:, [0]] * x[:, [1]]
        dataset = create_dataset(f, n_var=2, train_num=3000, device=device)
    elif dataset_idx == 3:
        f = lambda x: torch.exp((torch.sin(torch.pi*(x[:,[0]]**2+x[:,[1]]**2))+torch.sin(torch.pi*(x[:,[2]]**2+x[:,[3]]**2)))/2)
        dataset = create_dataset(f, n_var=4, train_num=3000, device=device)
    else:
        # Fix: an unknown index previously fell through and raised an
        # opaque UnboundLocalError on `dataset`.
        raise ValueError(f'Invalid dataset index: {dataset_idx}')
    return dataset
#!/bin/bash
# Train the MLP baseline on each symbolic-formula dataset.

dataset_ids=(0 1 2 3)

for dataset_id in "${dataset_ids[@]}"
do
    # Fix: the script is train_mlp.py (lowercase); invoking train_MLP.py
    # fails with "No such file" on case-sensitive filesystems (Linux).
    # Also pass the full option name --dataset_idx instead of relying on
    # argparse prefix abbreviation.
    CUDA_VISIBLE_DEVICES=0 python train_mlp.py \
        --dataset_idx $dataset_id \
        --dataset_dir dataset \
        --save_dir mlp_checkpoint
done
class FANLayer(nn.Module):
    """Single FAN layer: output = [cos(Wp·x), sin(Wp·x), gelu(Wg·x)].

    The periodic branch ``p`` projects to ``output_dim // 4`` features and is
    duplicated by the cos/sin pair; the non-periodic branch ``g`` fills the
    remaining ``output_dim - output_dim // 2`` features, so the concatenation
    has exactly ``output_dim`` features whenever ``output_dim`` is divisible
    by 4.

    Args:
        input_dim: number of input features.
        output_dim: number of output features (should be divisible by 4).
        bias: whether the linear projections carry a bias term.
    """

    def __init__(self, input_dim, output_dim, bias=True):
        super(FANLayer, self).__init__()
        self.input_linear_p = nn.Linear(input_dim, output_dim//4, bias=bias)
        # Fix: honor `bias` for the g branch too — it was previously ignored
        # (nn.Linear default bias=True was always used).
        self.input_linear_g = nn.Linear(input_dim, (output_dim-output_dim//2), bias=bias)
        self.activation = nn.GELU()

    def forward(self, src):
        g = self.activation(self.input_linear_g(src))
        p = self.input_linear_p(src)

        # cos and sin of the same projection form the periodic components.
        output = torch.cat((torch.cos(p), torch.sin(p), g), dim=-1)
        return output
def train_with_test(model, dataset, ckpt_dir):
    """Fit ``model`` on dataset['train_*'] with LBFGS and return the test MSE.

    Runs 1800 LBFGS steps with strong-Wolfe line search, saves the final
    weights to ``<ckpt_dir>/model.pth``, then evaluates MSE on the test split.
    """
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    criterion = nn.MSELoss()
    optimizer = LBFGS(filter(lambda p: p.requires_grad, model.parameters()),
                      lr=0.0001,
                      history_size=40,
                      line_search_fn="strong_wolfe",
                      tolerance_grad=1e-32,
                      tolerance_change=1e-32,
                      tolerance_ys=1e-32)

    model.train()
    for _ in tqdm(range(1800)):
        def closure():
            # LBFGS re-evaluates the loss via this closure.
            optimizer.zero_grad()
            output = model(dataset['train_input'])
            loss = criterion(output, dataset['train_label'])
            loss.backward()
            return loss
        optimizer.step(closure)

    torch.save(model.state_dict(), f'{ckpt_dir}/model.pth')

    model.eval()
    with torch.no_grad():
        output = model(dataset['test_input'])
        test_loss = criterion(output, dataset['test_label']).item()
    return test_loss


if __name__ == '__main__':
    args = parse_args()
    if args.dataset_idx == 0:
        dataset = load_dataset(args, 0)
        input_size, output_size = 1, 1
    elif args.dataset_idx == 1:
        dataset = load_dataset(args, 1)
        input_size, output_size = 2, 1
    elif args.dataset_idx == 2:
        dataset = load_dataset(args, 2)
        input_size, output_size = 2, 1
    elif args.dataset_idx == 3:
        dataset = load_dataset(args, 3)
        input_size, output_size = 4, 1
    else:
        # Fix: unknown index previously fell through to an UnboundLocalError.
        raise ValueError(f'Invalid dataset index: {args.dataset_idx}')

    save_dir = f'{args.save_dir}/dataset_{args.dataset_idx}'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    log_file = open(f'{save_dir}/results.jsonl', 'w')

    for depth in [2, 3, 4, 5]:
        for hidden_size in [4, 8, 16, 32, 64, 128]:
            print(f'Depth: {depth}, Hidden size: {hidden_size}')
            model = FAN(input_dim=input_size, hidden_dim=hidden_size,
                        output_dim=output_size, num_layers=depth).to(device)
            # Fix: FAN defines no get_param_size(); the original re-assignment
            # `param_size = model.get_param_size()` raised AttributeError on
            # the first result write. Count parameters directly instead.
            param_size = sum(p.numel() for p in model.parameters())
            ckpt_dir = f'{save_dir}/depth_{depth}_hidden_{hidden_size}'
            test_loss = train_with_test(model, dataset, ckpt_dir)

            output_js = {
                'depth': depth,
                'hidden_size': hidden_size,
                'param_size': param_size,
                'test_loss': test_loss,
            }
            log_file.write(json.dumps(output_js) + '\n')
            log_file.flush()
    log_file.close()
elif args.dataset_idx == 1: 42 | dataset = load_dataset(args, 1) 43 | width = [2, 1, 1] 44 | elif args.dataset_idx == 2: 45 | dataset = load_dataset(args, 2) 46 | width = [2, 2, 1] 47 | elif args.dataset_idx == 3: 48 | dataset = load_dataset(args, 3) 49 | width = [4, 4, 2, 1] 50 | else: 51 | raise ValueError('Invalid dataset index') 52 | 53 | save_dir = f'{args.save_dir}/dataset_{args.dataset_idx}' 54 | if not os.path.exists(save_dir): 55 | os.makedirs(save_dir) 56 | log_file = open(f'{save_dir}/results.jsonl', 'a') 57 | 58 | grids = [3, 5, 10, 20, 50, 100, 200, 500, 1000] 59 | for i, grid in enumerate(grids): 60 | if i == 0: 61 | ckpt_dir = f'{save_dir}/ckpt' 62 | model = KAN(width=width, grid=grid, k=3, device=device, ckpt_path=ckpt_dir) 63 | else: 64 | model = model.refine(grid) 65 | results = model.fit(dataset, opt="LBFGS", steps=200, lr=0.01) 66 | 67 | output_js = {} 68 | output_js['grid'] = grid 69 | param_size = compute_kan_size(width, grid, 3) 70 | output_js['param_size'] = param_size 71 | output_js['train_loss'] = results['train_loss'][-1].item() 72 | output_js['test_loss'] = results['test_loss'][-1].item() 73 | log_file.write(json.dumps(output_js) + '\n') 74 | log_file.flush() 75 | log_file.close() -------------------------------------------------------------------------------- /Symbolic_Formula_Representation/train_mlp.py: -------------------------------------------------------------------------------- 1 | import torch, os, argparse, json 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import numpy as np 5 | from torch.utils.data import DataLoader, TensorDataset 6 | from tqdm import tqdm 7 | from kan import LBFGS 8 | 9 | 10 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description='Train MLP') 14 | parser.add_argument('--dataset_idx', type=int, default=0, help='Dataset index') 15 | parser.add_argument('--dataset_dir', type=str, default='dataset') 16 
class MLP(nn.Module):
    """Plain fully-connected ReLU network for scalar regression.

    Architecture: input -> [Linear -> ReLU] x depth -> Linear -> output,
    i.e. ``depth`` hidden layers of width ``hidden_size``.
    """

    def __init__(self, input_size, hidden_size, output_size, depth):
        super(MLP, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.depth = depth

        # Widths of consecutive hidden layers: input, then `depth` hiddens.
        widths = [input_size] + [hidden_size] * depth
        blocks = []
        for d_in, d_out in zip(widths[:-1], widths[1:]):
            blocks.append(nn.Linear(d_in, d_out))
            blocks.append(nn.ReLU())
        blocks.append(nn.Linear(hidden_size, output_size))

        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        return self.network(x)

    def get_param_size(self):
        """Closed-form count of weights + biases in the network."""
        first = (self.input_size + 1) * self.hidden_size
        hidden = (self.hidden_size + 1) * self.hidden_size * (self.depth - 1)
        last = (self.hidden_size + 1) * self.output_size
        return first + hidden + last
if __name__ == '__main__':
    # Sweep MLP depth x width over one benchmark dataset and log one JSON
    # line per configuration (param count + final test MSE).
    args = parse_args()
    if args.dataset_idx == 0:
        dataset = load_dataset(args, 0)
        input_size, output_size = 1, 1
    elif args.dataset_idx == 1:
        dataset = load_dataset(args, 1)
        input_size, output_size = 2, 1
    elif args.dataset_idx == 2:
        dataset = load_dataset(args, 2)
        input_size, output_size = 2, 1
    elif args.dataset_idx == 3:
        dataset = load_dataset(args, 3)
        input_size, output_size = 4, 1
    else:
        # Robustness fix: fail loudly instead of hitting an UnboundLocalError
        # on `dataset` below (same guard as train_kan.py).
        raise ValueError('Invalid dataset index')

    save_dir = f'{args.save_dir}/dataset_{args.dataset_idx}'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    log_file = open(f'{save_dir}/results.jsonl', 'w')

    for depth in [2, 3, 4, 5]:
        for hidden_size in [2, 4, 8, 16, 32, 64, 128]:
            print(f'Depth: {depth}, Hidden size: {hidden_size}')
            model = MLP(input_size=input_size, hidden_size=hidden_size,
                        output_size=output_size, depth=depth).to(device)
            ckpt_dir = f'{save_dir}/depth_{depth}_hidden_{hidden_size}'
            test_loss = train_with_test(model, dataset, ckpt_dir)

            output_js = {}
            output_js['depth'] = depth
            output_js['hidden_size'] = hidden_size
            output_js['param_size'] = model.get_param_size()
            output_js['test_loss'] = test_loss
            log_file.write(json.dumps(output_js) + '\n')
            log_file.flush()
    log_file.close()
class TransformerRegressor(nn.Module):
    """Transformer encoder that regresses a scalar from a feature vector.

    Each scalar input feature becomes one "token": the (batch, n_features)
    input is lifted to (batch, n_features, model_dim), encoded, mean-pooled
    over tokens and projected to a single output.
    """

    def __init__(self, model_dim=64, num_layers=2):
        super(TransformerRegressor, self).__init__()
        self.model_dim = model_dim
        self.embedding = nn.Linear(1, model_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=4, dim_feedforward=4 * model_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(model_dim, 1)

    def forward(self, x):
        # (batch, n_feat) -> (batch, n_feat, 1): one token per scalar feature.
        tokens = self.embedding(x.unsqueeze(2))
        # Encoder expects (seq, batch, dim) — batch_first is left at default.
        encoded = self.transformer_encoder(tokens.permute(1, 0, 2))
        # Mean over the token axis, then project to the scalar target.
        return self.fc_out(encoded.mean(dim=0))

    def get_param_size(self):
        """Total number of parameters in the model."""
        return sum(p.numel() for p in self.parameters())
def data_provider(args, flag):
    """Build the dataset and DataLoader for one experiment split.

    Args:
        args: experiment namespace carrying data name, paths, window sizes
            (seq_len/label_len/pred_len), features, target, embed, freq,
            batch_size and num_workers.
        flag: one of 'train', 'val', 'test', 'pred'.

    Returns:
        Tuple (data_set, data_loader) for the requested split.
    """
    Data = data_dict[args.data]
    # timeF embedding consumes numeric time features; otherwise raw date parts.
    timeenc = 0 if args.embed != 'timeF' else 1

    # Split-specific loader behavior.
    if flag == 'pred':
        # In-order, one sample at a time, over the dedicated prediction dataset.
        shuffle_flag, drop_last, batch_size = False, False, 1
        Data = Dataset_Pred
    elif flag == 'test':
        shuffle_flag, drop_last, batch_size = False, False, args.batch_size
    else:
        # Training/validation: shuffled, drop the ragged final batch.
        shuffle_flag, drop_last, batch_size = True, True, args.batch_size
    freq = args.freq

    data_set = Data(
        root_path=args.root_path,
        data_path=args.data_path,
        flag=flag,
        size=[args.seq_len, args.label_len, args.pred_len],
        features=args.features,
        target=args.target,
        timeenc=timeenc,
        freq=freq
    )
    print(flag, len(data_set))
    data_loader = DataLoader(
        data_set,
        batch_size=batch_size,
        shuffle=shuffle_flag,
        num_workers=args.num_workers,
        drop_last=drop_last)
    return data_set, data_loader
'MS': 51 | cols_data = df_raw.columns[1:] 52 | df_data = df_raw[cols_data] 53 | elif self.features == 'S': 54 | df_data = df_raw[[self.target]] 55 | 56 | if self.scale: 57 | train_data = df_data[border1s[0]:border2s[0]] 58 | self.scaler.fit(train_data.values) 59 | data = self.scaler.transform(df_data.values) 60 | else: 61 | data = df_data.values 62 | 63 | df_stamp = df_raw[['date']][border1:border2] 64 | df_stamp['date'] = pd.to_datetime(df_stamp.date) 65 | if self.timeenc == 0: 66 | df_stamp['month'] = df_stamp.date.apply(lambda row: row.month, 1) 67 | df_stamp['day'] = df_stamp.date.apply(lambda row: row.day, 1) 68 | df_stamp['weekday'] = df_stamp.date.apply(lambda row: row.weekday(), 1) 69 | df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1) 70 | data_stamp = df_stamp.drop(['date'], 1).values 71 | elif self.timeenc == 1: 72 | data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) 73 | data_stamp = data_stamp.transpose(1, 0) 74 | 75 | self.data_x = data[border1:border2] 76 | self.data_y = data[border1:border2] 77 | self.data_stamp = data_stamp 78 | 79 | def __getitem__(self, index): 80 | s_begin = index 81 | s_end = s_begin + self.seq_len 82 | r_begin = s_end - self.label_len 83 | r_end = r_begin + self.label_len + self.pred_len 84 | 85 | seq_x = self.data_x[s_begin:s_end] 86 | seq_y = self.data_y[r_begin:r_end] 87 | seq_x_mark = self.data_stamp[s_begin:s_end] 88 | seq_y_mark = self.data_stamp[r_begin:r_end] 89 | 90 | return seq_x, seq_y, seq_x_mark, seq_y_mark 91 | 92 | def __len__(self): 93 | return len(self.data_x) - self.seq_len - self.pred_len + 1 94 | 95 | def inverse_transform(self, data): 96 | return self.scaler.inverse_transform(data) 97 | 98 | 99 | class Dataset_Custom(Dataset): 100 | def __init__(self, root_path, flag='train', size=None, 101 | features='S', data_path='ETTh.csv', 102 | target='OT', scale=True, timeenc=0, freq='h'): 103 | # size [seq_len, label_len, pred_len] 104 | # info 105 | if size == None: 
class Dataset_Custom(Dataset):
    """Generic CSV dataset with a 70/10/20 chronological train/val/test split.

    Expects columns ``['date', ...features..., target]`` in any order;
    columns are reordered so the target is last. Window semantics match
    ``Dataset_ETT_hour``.

    Args mirror ``Dataset_ETT_hour``.
    """

    def __init__(self, root_path, flag='train', size=None,
                 features='S', data_path='ETTh.csv',
                 target='OT', scale=True, timeenc=0, freq='h'):
        # size [seq_len, label_len, pred_len]
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        assert flag in ['train', 'test', 'val']
        type_map = {'train': 0, 'val': 1, 'test': 2}
        self.set_type = type_map[flag]

        self.features = features
        self.target = target
        self.scale = scale
        self.timeenc = timeenc
        self.freq = freq

        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))

        # Reorder columns to ['date', ...(other features), target].
        cols = list(df_raw.columns)
        cols.remove(self.target)
        cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]

        # 70% train, 20% test, remainder validation; val/test borders
        # start seq_len early so their first window is complete.
        num_train = int(len(df_raw) * 0.7)
        num_test = int(len(df_raw) * 0.2)
        num_vali = len(df_raw) - num_train - num_test
        border1s = [0, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len]
        border2s = [num_train, num_train + num_vali, len(df_raw)]
        border1 = border1s[self.set_type]
        border2 = border2s[self.set_type]

        if self.features in ('M', 'MS'):
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            # Fit on the train slice only to avoid leakage.
            train_data = df_data[border1s[0]:border2s[0]]
            self.scaler.fit(train_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        df_stamp = df_raw[['date']][border1:border2]
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            # .dt accessors replace Series.apply(..., 1): the positional
            # convert_dtype argument is deprecated/removed in pandas 2.x,
            # as is the positional axis of DataFrame.drop.
            df_stamp['month'] = df_stamp.date.dt.month
            df_stamp['day'] = df_stamp.date.dt.day
            df_stamp['weekday'] = df_stamp.date.dt.weekday
            df_stamp['hour'] = df_stamp.date.dt.hour
            data_stamp = df_stamp.drop(columns=['date']).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError('timeenc must be 0 or 1, got {}'.format(self.timeenc))

        self.data_x = data[border1:border2]
        self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        seq_y = self.data_y[r_begin:r_end]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        return len(self.data_x) - self.seq_len - self.pred_len + 1

    def inverse_transform(self, data):
        """Map standardized values back to the original scale."""
        return self.scaler.inverse_transform(data)
class Dataset_Pred(Dataset):
    """Inference-time dataset: serves the CSV's last ``seq_len`` rows.

    Builds time-stamp features for both the observed window and the
    ``pred_len`` future dates extrapolated at frequency ``freq``.

    Args:
        cols: Optional explicit feature-column list (the target is removed
            from it automatically); defaults to every non-date column.
        inverse: If True, labels (``data_y``) are kept in the original
            (unscaled) value space.
        Remaining arguments mirror ``Dataset_Custom``.
    """

    def __init__(self, root_path, flag='pred', size=None,
                 features='S', data_path='ETTh.csv',
                 target='OT', scale=True, inverse=False, timeenc=0, freq='15min', cols=None):
        # size [seq_len, label_len, pred_len]
        if size is None:
            self.seq_len = 24 * 4 * 4
            self.label_len = 24 * 4
            self.pred_len = 24 * 4
        else:
            self.seq_len = size[0]
            self.label_len = size[1]
            self.pred_len = size[2]
        assert flag in ['pred']

        self.features = features
        self.target = target
        self.scale = scale
        self.inverse = inverse
        self.timeenc = timeenc
        self.freq = freq
        self.cols = cols
        self.root_path = root_path
        self.data_path = data_path
        self.__read_data__()

    def __read_data__(self):
        self.scaler = StandardScaler()
        df_raw = pd.read_csv(os.path.join(self.root_path,
                                          self.data_path))
        # Reorder to ['date', ...(other features), target].
        if self.cols:
            cols = self.cols.copy()
            cols.remove(self.target)
        else:
            cols = list(df_raw.columns)
            cols.remove(self.target)
            cols.remove('date')
        df_raw = df_raw[['date'] + cols + [self.target]]
        # Only the trailing input window is needed for prediction.
        border1 = len(df_raw) - self.seq_len
        border2 = len(df_raw)

        if self.features in ('M', 'MS'):
            cols_data = df_raw.columns[1:]
            df_data = df_raw[cols_data]
        elif self.features == 'S':
            df_data = df_raw[[self.target]]

        if self.scale:
            # No held-out split at inference: fit on all available data.
            self.scaler.fit(df_data.values)
            data = self.scaler.transform(df_data.values)
        else:
            data = df_data.values

        tmp_stamp = df_raw[['date']][border1:border2]
        tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date)
        # pred_len future stamps after the last observed date (the range
        # includes the last observed date itself, hence pred_len + 1 and
        # the [1:] slice below).
        pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=self.pred_len + 1, freq=self.freq)

        df_stamp = pd.DataFrame(columns=['date'])
        df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:])
        # Normalize dtype so the vectorized .dt accessors below are valid.
        df_stamp['date'] = pd.to_datetime(df_stamp.date)
        if self.timeenc == 0:
            # .dt accessors / drop(columns=...): the positional
            # convert_dtype of Series.apply and positional axis of drop
            # are deprecated/removed in pandas 2.x.
            df_stamp['month'] = df_stamp.date.dt.month
            df_stamp['day'] = df_stamp.date.dt.day
            df_stamp['weekday'] = df_stamp.date.dt.weekday
            df_stamp['hour'] = df_stamp.date.dt.hour
            df_stamp['minute'] = df_stamp.date.dt.minute
            # Bucket minutes into 15-minute slots.
            df_stamp['minute'] = df_stamp.minute.map(lambda x: x // 15)
            data_stamp = df_stamp.drop(columns=['date']).values
        elif self.timeenc == 1:
            data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq)
            data_stamp = data_stamp.transpose(1, 0)
        else:
            raise ValueError('timeenc must be 0 or 1, got {}'.format(self.timeenc))

        self.data_x = data[border1:border2]
        if self.inverse:
            self.data_y = df_data.values[border1:border2]
        else:
            self.data_y = data[border1:border2]
        self.data_stamp = data_stamp

    def __getitem__(self, index):
        s_begin = index
        s_end = s_begin + self.seq_len
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        seq_x = self.data_x[s_begin:s_end]
        if self.inverse:
            # NOTE(review): mirrors the original code, which slices the
            # scaled data_x here rather than the unscaled data_y —
            # preserved to keep behavior identical; confirm intent upstream.
            seq_y = self.data_x[r_begin:r_begin + self.label_len]
        else:
            seq_y = self.data_y[r_begin:r_begin + self.label_len]
        seq_x_mark = self.data_stamp[s_begin:s_end]
        seq_y_mark = self.data_stamp[r_begin:r_end]

        return seq_x, seq_y, seq_x_mark, seq_y_mark

    def __len__(self):
        # pred_len extends past the data's end, so only seq_len matters.
        return len(self.data_x) - self.seq_len + 1

    def inverse_transform(self, data):
        """Map standardized values back to the original scale."""
        return self.scaler.inverse_transform(data)
class Exp_Basic(object):
    """Base experiment: owns args, device selection, and the model.

    Subclasses must implement ``_build_model`` (return an ``nn.Module``)
    and override the data/train/eval hooks below.
    """

    def __init__(self, args):
        self.args = args
        self.device = self._acquire_device()
        self.model = self._build_model().to(self.device)

    def _build_model(self):
        # Subclasses return the nn.Module to train.
        # (Removed the original unreachable `return None` after the raise.)
        raise NotImplementedError

    def _acquire_device(self):
        """Pick the torch device; pins CUDA_VISIBLE_DEVICES for GPU runs."""
        if self.args.use_gpu:
            os.environ["CUDA_VISIBLE_DEVICES"] = str(
                self.args.gpu) if not self.args.use_multi_gpu else self.args.devices
            device = torch.device('cuda:{}'.format(self.args.gpu))
            print('Use GPU: cuda:{}'.format(self.args.gpu))
        else:
            device = torch.device('cpu')
            print('Use CPU')
        return device

    def _get_data(self):
        # Hook: return (dataset, dataloader) in subclasses.
        pass

    def vali(self):
        pass

    def train(self):
        pass

    def test(self):
        pass
                                 lr=self.args.learning_rate)
        return model_optim

    def _select_criterion(self):
        # MSE over the forecast horizon.
        criterion = nn.MSELoss()
        return criterion

    def _predict(self, batch_x, batch_y, batch_x_mark, batch_y_mark):
        # One forward pass; returns (outputs, batch_y) both trimmed to the
        # last pred_len steps (and to the target channel in 'MS' mode).
        # decoder input: label_len known steps followed by zeros for the
        # pred_len steps to be forecast.
        dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len:, :]).float()
        dec_inp = torch.cat([batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device)
        # encoder - decoder

        def _run_model():
            outputs = self.model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
            if self.args.output_attention:
                # model returns (forecast, attentions); keep the forecast
                outputs = outputs[0]
            return outputs

        if self.args.use_amp:
            with torch.cuda.amp.autocast():
                outputs = _run_model()
        else:
            outputs = _run_model()

        # 'MS' mode supervises only the last (target) channel.
        f_dim = -1 if self.args.features == 'MS' else 0
        outputs = outputs[:, -self.args.pred_len:, f_dim:]
        batch_y = batch_y[:, -self.args.pred_len:, f_dim:].to(self.device)

        return outputs, batch_y

    def vali(self, vali_data, vali_loader, criterion):
        # Average criterion over a loader with the model in eval mode;
        # restores train mode before returning.
        total_loss = []
        self.model.eval()
        with torch.no_grad():
            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(vali_loader):
                batch_x = batch_x.float().to(self.device)
                batch_y = batch_y.float()

                batch_x_mark = batch_x_mark.float().to(self.device)
                batch_y_mark = batch_y_mark.float().to(self.device)

                outputs, batch_y = self._predict(batch_x, batch_y, batch_x_mark, batch_y_mark)

                pred = outputs.detach().cpu()
                true = batch_y.detach().cpu()

                loss = criterion(pred, true)

                total_loss.append(loss)
        total_loss = np.average(total_loss)
        self.model.train()  # caller continues training
        return total_loss

    def train(self, setting):
        # Full training loop with early stopping on validation loss;
        # checkpoints are written under args.checkpoints/<setting>.
        train_data, train_loader = self._get_data(flag='train')
        vali_data, vali_loader = self._get_data(flag='val')
        test_data, test_loader = self._get_data(flag='test')

        path = os.path.join(self.args.checkpoints, setting)
        if not os.path.exists(path):
            os.makedirs(path)

        time_now = time.time()

        train_steps = len(train_loader)
        early_stopping = EarlyStopping(patience=self.args.patience, verbose=True)

        model_optim = self._select_optimizer()
        criterion = self._select_criterion()

        if self.args.use_amp:
            # gradient scaler for mixed-precision training
            scaler = torch.cuda.amp.GradScaler()

        for epoch in range(self.args.train_epochs):
            iter_count = 0
            train_loss = []

            self.model.train()
            epoch_time = time.time()
            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader):
                iter_count += 1
                model_optim.zero_grad()
                batch_x = batch_x.float().to(self.device)

                batch_y = batch_y.float().to(self.device)
                batch_x_mark = batch_x_mark.float().to(self.device)
                batch_y_mark = batch_y_mark.float().to(self.device)

                outputs, batch_y = self._predict(batch_x, batch_y, batch_x_mark, batch_y_mark)


                loss = criterion(outputs, batch_y)
                train_loss.append(loss.item())

                if (i + 1) % 100 == 0:
                    # periodic progress / ETA report
                    print("\titers: {0}, epoch: {1} | loss: {2:.7f}".format(i + 1, epoch + 1, loss.item()))
                    speed = (time.time() - time_now) / iter_count
                    left_time = speed * ((self.args.train_epochs - epoch) * train_steps - i)
                    print('\tspeed: {:.4f}s/iter; left time: {:.4f}s'.format(speed, left_time))
                    iter_count = 0
                    time_now = time.time()

                if self.args.use_amp:
                    # scaled backward/step for AMP; update() adjusts scale
                    scaler.scale(loss).backward()
                    scaler.step(model_optim)
                    scaler.update()
                else:
                    loss.backward()
                    model_optim.step()

            print("Epoch: {} cost time: {}".format(epoch + 1, time.time() - epoch_time))
            train_loss = np.average(train_loss)
            vali_loss = self.vali(vali_data, vali_loader, criterion)
            test_loss = self.vali(test_data, test_loader, criterion)

            print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format(
                epoch + 1, train_steps, train_loss, vali_loss, test_loss))
            # tracks validation loss and checkpoints the best model so far
            early_stopping(vali_loss, self.model, path)
            if early_stopping.early_stop:
                print("Early stopping")
                break

            adjust_learning_rate(model_optim, epoch + 1, self.args)

        # reload the best (early-stopped) checkpoint before returning
        best_model_path = path + '/' + 'checkpoint.pth'
        self.model.load_state_dict(torch.load(best_model_path))

        return

    def test(self, setting, test=0):
        # Evaluate on the test split; optionally reload the checkpoint
        # first (test=1). Saves metrics and sample forecast plots.
        test_data, test_loader = self._get_data(flag='test')
        if test:
            print('loading model')
            self.model.load_state_dict(torch.load(os.path.join('./checkpoints/' + setting, 'checkpoint.pth')))

        preds = []
        trues = []
        folder_path = './exp_figure/' + setting + '/'
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        self.model.eval()
        with torch.no_grad():
            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader):
                batch_x = batch_x.float().to(self.device)
                batch_y = batch_y.float().to(self.device)

                batch_x_mark = batch_x_mark.float().to(self.device)
                batch_y_mark = batch_y_mark.float().to(self.device)

                outputs, batch_y = self._predict(batch_x, batch_y, batch_x_mark, batch_y_mark)

                outputs = outputs.detach().cpu().numpy()
                batch_y = batch_y.detach().cpu().numpy()

                pred = outputs  # outputs.detach().cpu().numpy()  # .squeeze()
                true = batch_y  # batch_y.detach().cpu().numpy()  # .squeeze()

                preds.append(pred)
                trues.append(true)
                if i % 20 == 0:
                    # plot ground truth vs. forecast for the first sample
                    # of every 20th batch.
                    # NOTE(review): the local `pd` here shadows the pandas
                    # module import for the rest of this method.
                    input = batch_x.detach().cpu().numpy()
                    gt = np.concatenate((input[0, :, -1], true[0, :, -1]), axis=0)
                    pd = np.concatenate((input[0, :, -1], pred[0, :, -1]), axis=0)
                    visual(gt, pd, os.path.join(folder_path, str(i) + '.pdf'))

        preds = np.concatenate(preds, axis=0)
        trues = np.concatenate(trues, axis=0)
        print('test shape:', preds.shape, trues.shape)
        preds = preds.reshape(-1, preds.shape[-2], preds.shape[-1])
        trues = trues.reshape(-1, trues.shape[-2], trues.shape[-1])
        print('test shape:', preds.shape, trues.shape)

        # result save
        folder_path = './results/' + setting + '/'
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)

        mae, mse, rmse, mape, mspe = metric(preds, trues)
        print('mse:{}, mae:{}'.format(mse, mae))
        # append a summary line per run to a shared results file
        f = open("result.txt", 'a')
        f.write(setting + " \n")
        f.write('mse:{}, mae:{}'.format(mse, mae))
        f.write('\n')
        f.write('\n')
        f.close()

        np.save(folder_path + 'metrics.npy', np.array([mae, mse, rmse, mape, mspe]))
        # np.save(folder_path + 'pred.npy', preds)
        # np.save(folder_path + 'true.npy', trues)

        return

    def predict(self, setting, load=False):
        # Forecast beyond the dataset's end using Dataset_Pred windows;
        # optionally reload the checkpoint first.
        pred_data, pred_loader = self._get_data(flag='pred')

        if load:
            path = os.path.join(self.args.checkpoints, setting)
            best_model_path = path + '/' + 'checkpoint.pth'
            logging.info(best_model_path)
            self.model.load_state_dict(torch.load(best_model_path))

        preds = []

        self.model.eval()
        with torch.no_grad():
            for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(pred_loader):
                batch_x = batch_x.float().to(self.device)
                batch_y = batch_y.float()
                batch_x_mark = batch_x_mark.float().to(self.device)
                batch_y_mark = batch_y_mark.float().to(self.device)

                outputs, batch_y = self._predict(batch_x, batch_y, batch_x_mark, batch_y_mark)

                pred = outputs.detach().cpu().numpy()  # .squeeze()
                preds.append(pred)

        preds = np.array(preds)
        preds = preds.reshape(-1, preds.shape[-2], preds.shape[-1])

        # result save
        folder_path = './results/' + setting + '/'
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
            np.save(folder_path + 'real_prediction.npy', preds)

        return
# -------- Timeseries_Forecasting/layers/AutoCorrelation.py --------
import torch
import torch.nn as nn
import math


class AutoCorrelation(nn.Module):
    """
    AutoCorrelation Mechanism with the following two phases:
    (1) period-based dependencies discovery
    (2) time delay aggregation
    This block can replace the self-attention family mechanism seamlessly.
    """
    def __init__(self, mask_flag=True, factor=1, scale=None, attention_dropout=0.1, output_attention=False):
        super(AutoCorrelation, self).__init__()
        # factor scales how many lags are kept: k = int(factor * log(L)).
        # NOTE(review): mask_flag and scale are stored but not used in the
        # visible code below.
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def time_delay_agg_training(self, values, corr):
        """
        SpeedUp version of Autocorrelation (a batch-normalization style design)
        This is for the training phase.
        """
        # values / corr: [batch, head, channel, length]
        head = values.shape[1]
        channel = values.shape[2]
        length = values.shape[3]
        # find top k
        top_k = int(self.factor * math.log(length))
        # Average correlations over heads and channels, then over the
        # batch, so one shared set of top-k lags is used for the whole
        # batch (cheaper than per-sample indices during training).
        mean_value = torch.mean(torch.mean(corr, dim=1), dim=1)
        index = torch.topk(torch.mean(mean_value, dim=0), top_k, dim=-1)[1]
        weights = torch.stack([mean_value[:, index[i]] for i in range(top_k)], dim=-1)
        # update corr
        tmp_corr = torch.softmax(weights, dim=-1)
        # aggregation
        tmp_values = values
        delays_agg = torch.zeros_like(values).float()
        for i in range(top_k):
            # circularly shift the series by each selected lag and blend
            # with its softmax weight
            pattern = torch.roll(tmp_values, -int(index[i]), -1)
            delays_agg = delays_agg + pattern * \
                         (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length))
        return delays_agg

    def time_delay_agg_inference(self, values, corr):
        """
        SpeedUp version of Autocorrelation (a batch-normalization style design)
        This is for the inference phase.
        """
        batch = values.shape[0]
        head = values.shape[1]
        channel = values.shape[2]
        length = values.shape[3]
        # index init
        init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0)\
            .repeat(batch, head, channel, 1).to(values.device)
        # find top k
        top_k = int(self.factor * math.log(length))
        # Per-sample lags here (averaged over heads/channels only),
        # unlike the batch-shared lags used during training.
        mean_value = torch.mean(torch.mean(corr, dim=1), dim=1)
        weights, delay = torch.topk(mean_value, top_k, dim=-1)
        # update corr
        tmp_corr = torch.softmax(weights, dim=-1)
        # aggregation
        # Doubling the series along time lets gather() emulate a circular
        # shift without torch.roll's batch-uniform offset.
        tmp_values = values.repeat(1, 1, 1, 2)
        delays_agg = torch.zeros_like(values).float()
        for i in range(top_k):
            tmp_delay = init_index + delay[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length)
            pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay)
            delays_agg = delays_agg + pattern * \
                         (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length))
        return delays_agg

    def time_delay_agg_full(self, values, corr):
        """
        Standard version of Autocorrelation
        """
        batch = values.shape[0]
        head = values.shape[1]
        channel = values.shape[2]
        length = values.shape[3]
        # index init
        init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0)\
            .repeat(batch, head, channel, 1).to(values.device)
        # find top k
        top_k = int(self.factor * math.log(length))
        # Full version keeps distinct lags per (batch, head, channel).
        weights, delay = torch.topk(corr, top_k, dim=-1)
        # update corr
        tmp_corr = torch.softmax(weights, dim=-1)
        # aggregation
        tmp_values = values.repeat(1, 1, 1, 2)
        delays_agg = torch.zeros_like(values).float()
        for i in range(top_k):
            tmp_delay = init_index + delay[..., i].unsqueeze(-1)
            pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay)
            delays_agg = delays_agg + pattern * (tmp_corr[..., i].unsqueeze(-1))
        return delays_agg

    def forward(self, queries, keys, values, attn_mask):
        # queries: [B, L, H, E]; keys/values: [B, S, H, D].
        B, L, H, E = queries.shape
        _, S, _, D = values.shape
        # Align key/value length to the query length: zero-pad when the
        # queries are longer, truncate when they are shorter.
        if L > S:
            zeros = torch.zeros_like(queries[:, :(L - S), :]).float()
            values = torch.cat([values, zeros], dim=1)
            keys = torch.cat([keys, zeros], dim=1)
        else:
            values = values[:, :L, :, :]
            keys = keys[:, :L, :, :]

        # period-based dependencies
        # Cross-correlation via FFT (Wiener–Khinchin): corr[..., tau] is
        # the correlation between queries and keys at lag tau.
        q_fft = torch.fft.rfft(queries.permute(0, 2, 3, 1).contiguous(), dim=-1)
        k_fft = torch.fft.rfft(keys.permute(0, 2, 3, 1).contiguous(), dim=-1)
        res = q_fft * torch.conj(k_fft)
        corr = torch.fft.irfft(res, n=L, dim=-1)

        # time delay agg
        if self.training:
            V = self.time_delay_agg_training(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2)
        else:
            V = self.time_delay_agg_inference(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2)

        if self.output_attention:
            return (V.contiguous(), corr.permute(0, 3, 1, 2))
        else:
            return (V.contiguous(), None)
class AutoCorrelationLayer(nn.Module):
    """Multi-head wrapper around an auto-correlation mechanism.

    Projects queries/keys/values into per-head subspaces, delegates the
    sequence mixing to the wrapped ``correlation`` module, and projects
    the concatenated heads back to ``d_model``.
    """

    def __init__(self, correlation, d_model, n_heads, d_keys=None,
                 d_values=None):
        super(AutoCorrelationLayer, self).__init__()

        # Default per-head width: split d_model evenly across heads.
        if not d_keys:
            d_keys = d_model // n_heads
        if not d_values:
            d_values = d_model // n_heads

        self.inner_correlation = correlation
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask):
        batch, q_len, _ = queries.shape
        k_len = keys.shape[1]
        heads = self.n_heads

        # [B, L, d_model] -> [B, L, H, d_head] for each stream.
        q = self.query_projection(queries).view(batch, q_len, heads, -1)
        k = self.key_projection(keys).view(batch, k_len, heads, -1)
        v = self.value_projection(values).view(batch, k_len, heads, -1)

        mixed, attn = self.inner_correlation(
            q,
            k,
            v,
            attn_mask
        )

        # Merge the heads back together and project to d_model.
        merged = mixed.view(batch, q_len, -1)
        return self.out_projection(merged), attn
| """ 24 | def __init__(self, kernel_size, stride): 25 | super(moving_avg, self).__init__() 26 | self.kernel_size = kernel_size 27 | self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0) 28 | 29 | def forward(self, x): 30 | # padding on the both ends of time series 31 | front = x[:, 0:1, :].repeat(1, (self.kernel_size - 1) // 2, 1) 32 | end = x[:, -1:, :].repeat(1, (self.kernel_size - 1) // 2, 1) 33 | x = torch.cat([front, x, end], dim=1) 34 | x = self.avg(x.permute(0, 2, 1)) 35 | x = x.permute(0, 2, 1) 36 | return x 37 | 38 | 39 | class series_decomp(nn.Module): 40 | """ 41 | Series decomposition block 42 | """ 43 | def __init__(self, kernel_size): 44 | super(series_decomp, self).__init__() 45 | self.moving_avg = moving_avg(kernel_size, stride=1) 46 | 47 | def forward(self, x): 48 | moving_mean = self.moving_avg(x) 49 | res = x - moving_mean 50 | return res, moving_mean 51 | 52 | 53 | class EncoderLayer(nn.Module): 54 | """ 55 | Autoformer encoder layer with the progressive decomposition architecture 56 | """ 57 | def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): 58 | super(EncoderLayer, self).__init__() 59 | d_ff = d_ff or 4 * d_model 60 | self.attention = attention 61 | self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) 62 | self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) 63 | self.decomp1 = series_decomp(moving_avg) 64 | self.decomp2 = series_decomp(moving_avg) 65 | self.dropout = nn.Dropout(dropout) 66 | self.activation = F.relu if activation == "relu" else F.gelu 67 | 68 | def forward(self, x, attn_mask=None): 69 | new_x, attn = self.attention( 70 | x, x, x, 71 | attn_mask=attn_mask 72 | ) 73 | x = x + self.dropout(new_x) 74 | x, _ = self.decomp1(x) 75 | y = x 76 | y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) 77 | y = self.dropout(self.conv2(y).transpose(-1, 1)) 78 | res, _ = self.decomp2(x 
+ y) 79 | return res, attn 80 | 81 | 82 | class Encoder(nn.Module): 83 | """ 84 | Autoformer encoder 85 | """ 86 | def __init__(self, attn_layers, conv_layers=None, norm_layer=None): 87 | super(Encoder, self).__init__() 88 | self.attn_layers = nn.ModuleList(attn_layers) 89 | self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None 90 | self.norm = norm_layer 91 | 92 | def forward(self, x, attn_mask=None): 93 | attns = [] 94 | if self.conv_layers is not None: 95 | for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers): 96 | x, attn = attn_layer(x, attn_mask=attn_mask) 97 | x = conv_layer(x) 98 | attns.append(attn) 99 | x, attn = self.attn_layers[-1](x) 100 | attns.append(attn) 101 | else: 102 | for attn_layer in self.attn_layers: 103 | x, attn = attn_layer(x, attn_mask=attn_mask) 104 | attns.append(attn) 105 | 106 | if self.norm is not None: 107 | x = self.norm(x) 108 | 109 | return x, attns 110 | 111 | 112 | class DecoderLayer(nn.Module): 113 | """ 114 | Autoformer decoder layer with the progressive decomposition architecture 115 | """ 116 | def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, 117 | moving_avg=25, dropout=0.1, activation="relu"): 118 | super(DecoderLayer, self).__init__() 119 | d_ff = d_ff or 4 * d_model 120 | self.self_attention = self_attention 121 | self.cross_attention = cross_attention 122 | self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) 123 | self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) 124 | self.decomp1 = series_decomp(moving_avg) 125 | self.decomp2 = series_decomp(moving_avg) 126 | self.decomp3 = series_decomp(moving_avg) 127 | self.dropout = nn.Dropout(dropout) 128 | self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, 129 | padding_mode='circular', bias=False) 130 | self.activation = F.relu if activation == "relu" else F.gelu 131 | 
132 | def forward(self, x, cross, x_mask=None, cross_mask=None): 133 | x = x + self.dropout(self.self_attention( 134 | x, x, x, 135 | attn_mask=x_mask 136 | )[0]) 137 | x, trend1 = self.decomp1(x) 138 | x = x + self.dropout(self.cross_attention( 139 | x, cross, cross, 140 | attn_mask=cross_mask 141 | )[0]) 142 | x, trend2 = self.decomp2(x) 143 | y = x 144 | y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) 145 | y = self.dropout(self.conv2(y).transpose(-1, 1)) 146 | x, trend3 = self.decomp3(x + y) 147 | 148 | residual_trend = trend1 + trend2 + trend3 149 | residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) 150 | return x, residual_trend 151 | 152 | 153 | class Decoder(nn.Module): 154 | """ 155 | Autoformer encoder 156 | """ 157 | def __init__(self, layers, norm_layer=None, projection=None): 158 | super(Decoder, self).__init__() 159 | self.layers = nn.ModuleList(layers) 160 | self.norm = norm_layer 161 | self.projection = projection 162 | 163 | def forward(self, x, cross, x_mask=None, cross_mask=None, trend=None): 164 | for layer in self.layers: 165 | x, residual_trend = layer(x, cross, x_mask=x_mask, cross_mask=cross_mask) 166 | trend = trend + residual_trend 167 | 168 | if self.norm is not None: 169 | x = self.norm(x) 170 | 171 | if self.projection is not None: 172 | x = self.projection(x) 173 | return x, trend 174 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/layers/Embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | class DataEmbedding_inverted(nn.Module): 6 | def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): 7 | super(DataEmbedding_inverted, self).__init__() 8 | self.value_embedding = nn.Linear(c_in, d_model) 9 | self.dropout = nn.Dropout(p=dropout) 10 | 11 | def forward(self, x, x_mark): 12 | # x: [Batch Time Variate] 13 | 
def compared_version(ver1, ver2):
    """Compare two dotted version strings.

    Returns True when ver1 >= ver2 and False otherwise, so the result can be
    used directly as a boolean (e.g. ``1 if compared_version(...) else 2`` in
    TokenEmbedding's padding choice).

    Fix: the original returned -1/1 for less-than/greater-than — both truthy —
    so the boolean use always took the "newer torch" branch even on old torch.
    """
    list1 = str(ver1).split(".")
    list2 = str(ver2).split(".")

    # Compare component-wise over the shared prefix.
    for a, b in zip(list1, list2):
        if int(a) < int(b):
            return False
        if int(a) > int(b):
            return True

    # All shared components equal: keep the original tie-breaking —
    # equal length -> True, ver1 shorter than ver2 -> False.
    return len(list1) >= len(list2)


class PositionalEmbedding(nn.Module):
    """Fixed sinusoidal positional encoding (Vaswani et al., 2017).

    The table is precomputed once for max_len positions and registered as a
    buffer, so it moves with the module but is never trained.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEmbedding, self).__init__()
        pe = torch.zeros(max_len, d_model).float()
        # Fix of a no-op typo (`require_grad`); buffers are non-trainable anyway.
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Encoding for the first x.size(1) positions: [1, L, d_model].
        return self.pe[:, :x.size(1)]
class FixedEmbedding(nn.Module):
    """Embedding lookup backed by a frozen sinusoidal table (never trained)."""

    def __init__(self, c_in, d_model):
        super(FixedEmbedding, self).__init__()

        # Same sinusoid construction as PositionalEmbedding, one row per index.
        table = torch.zeros(c_in, d_model).float()
        pos = torch.arange(0, c_in).float().unsqueeze(1)
        freq = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        table[:, 0::2] = torch.sin(pos * freq)
        table[:, 1::2] = torch.cos(pos * freq)

        self.emb = nn.Embedding(c_in, d_model)
        self.emb.weight = nn.Parameter(table, requires_grad=False)

    def forward(self, x):
        # detach() keeps the lookup out of the autograd graph.
        return self.emb(x).detach()


class TemporalEmbedding(nn.Module):
    """Sum of calendar-feature embeddings.

    Expects x[..., 0..3] = month, day, weekday, hour indices; x[..., 4] is the
    minute bucket and is only consumed when freq == 't'.
    """

    def __init__(self, d_model, embed_type='fixed', freq='h'):
        super(TemporalEmbedding, self).__init__()

        minute_size = 4
        hour_size = 24
        weekday_size = 7
        day_size = 32
        month_size = 13

        # 'fixed' uses frozen sinusoid tables, anything else learns the tables.
        Embed = FixedEmbedding if embed_type == 'fixed' else nn.Embedding
        if freq == 't':
            self.minute_embed = Embed(minute_size, d_model)
        self.hour_embed = Embed(hour_size, d_model)
        self.weekday_embed = Embed(weekday_size, d_model)
        self.day_embed = Embed(day_size, d_model)
        self.month_embed = Embed(month_size, d_model)

    def forward(self, x):
        idx = x.long()

        # The minute embedding only exists for minutely data.
        minute_part = self.minute_embed(idx[:, :, 4]) if hasattr(self, 'minute_embed') else 0.
        hour_part = self.hour_embed(idx[:, :, 3])
        weekday_part = self.weekday_embed(idx[:, :, 2])
        day_part = self.day_embed(idx[:, :, 1])
        month_part = self.month_embed(idx[:, :, 0])

        return hour_part + weekday_part + day_part + month_part + minute_part


class TimeFeatureEmbedding(nn.Module):
    """Linear projection of continuous time features to d_model (no bias)."""

    def __init__(self, d_model, embed_type='timeF', freq='h'):
        super(TimeFeatureEmbedding, self).__init__()
        # Number of time features per frequency — presumably matches the
        # output of utils/timefeatures.py; verify against the data pipeline.
        freq_map = {'h': 4, 't': 5, 's': 6, 'm': 1, 'a': 1, 'w': 2, 'd': 3, 'b': 3}
        self.embed = nn.Linear(freq_map[freq], d_model, bias=False)

    def forward(self, x):
        return self.embed(x)


class DataEmbedding(nn.Module):
    """Token + temporal + positional embedding, followed by dropout."""

    def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1):
        super(DataEmbedding, self).__init__()

        self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)
        if embed_type != 'timeF':
            self.temporal_embedding = TemporalEmbedding(d_model=d_model,
                                                        embed_type=embed_type, freq=freq)
        else:
            self.temporal_embedding = TimeFeatureEmbedding(d_model=d_model,
                                                           embed_type=embed_type, freq=freq)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, x_mark):
        combined = self.value_embedding(x) + self.temporal_embedding(x_mark) + self.position_embedding(x)
        return self.dropout(combined)
class FANLayer(nn.Module):
    """FAN layer (https://arxiv.org/abs/2410.02675).

    The output concatenates a periodic part [cos(p) | sin(p)] with a
    non-periodic part g, where p and g are linear projections of the input;
    with a gate, the two parts are blended by a learned sigmoid scalar.

    Args:
        input_dim (int): number of input features.
        output_dim (int): number of output features.
        bias (bool): whether the p-projection uses a bias term.
        with_gate (bool): if True, blend periodic/non-periodic parts with a
            learned gate; otherwise concatenate them directly.
    """

    def __init__(self, input_dim, output_dim, bias=True, with_gate=True):
        super(FANLayer, self).__init__()
        p_dim = output_dim // 4
        # Fix: size g so that cos(p) + sin(p) + g is exactly output_dim even
        # when output_dim is not divisible by 4. The original used
        # output_dim - output_dim//2, which loses one feature when
        # output_dim % 4 is 2 or 3 (e.g. output_dim=6 produced 5 features);
        # identical to the original for all other widths.
        g_dim = output_dim - 2 * p_dim
        self.input_linear_p = nn.Linear(input_dim, p_dim, bias=bias)
        self.input_linear_g = nn.Linear(input_dim, g_dim)
        self.activation = nn.GELU()
        if with_gate:
            # Scalar gate, squashed through sigmoid in forward().
            self.gate = nn.Parameter(torch.randn(1, dtype=torch.float32))

    def forward(self, src):
        g = self.activation(self.input_linear_g(src))
        p = self.input_linear_p(src)

        if not hasattr(self, 'gate'):
            output = torch.cat((torch.cos(p), torch.sin(p), g), dim=-1)
        else:
            gate = torch.sigmoid(self.gate)
            output = torch.cat((gate * torch.cos(p), gate * torch.sin(p), (1 - gate) * g), dim=-1)
        return output
class ProbAttention(nn.Module):
    """ProbSparse attention (Informer): exact attention is computed only for
    the top-u queries ranked by a sparsity measure; the remaining queries
    fall back to a mean context (or a cumulative one in the causal case)."""

    def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False):
        super(ProbAttention, self).__init__()
        self.factor = factor
        self.scale = scale
        self.mask_flag = mask_flag
        self.output_attention = output_attention
        self.dropout = nn.Dropout(attention_dropout)

    def _prob_QK(self, Q, K, sample_k, n_top):
        """Score every query against a random key sample; keep n_top queries."""
        B, H, L_K, E = K.shape
        _, _, L_Q, _ = Q.shape

        # Sampled scores: each query vs sample_k randomly chosen keys.
        expanded_keys = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E)
        sampled_idx = torch.randint(L_K, (L_Q, sample_k))
        sampled_keys = expanded_keys[:, :, torch.arange(L_Q).unsqueeze(1), sampled_idx, :]
        # NOTE(review): .squeeze() would also drop the batch dim when B == 1 —
        # kept exactly as the original.
        sampled_scores = torch.matmul(Q.unsqueeze(-2), sampled_keys.transpose(-2, -1)).squeeze()

        # Sparsity measurement: max score minus mean score per query.
        sparsity = sampled_scores.max(-1)[0] - torch.div(sampled_scores.sum(-1), L_K)
        top_idx = sparsity.topk(n_top, sorted=False)[1]

        # Exact attention scores for the selected queries only.
        top_queries = Q[torch.arange(B)[:, None, None],
                        torch.arange(H)[None, :, None],
                        top_idx, :]
        top_scores = torch.matmul(top_queries, K.transpose(-2, -1))
        return top_scores, top_idx

    def _get_initial_context(self, V, L_Q):
        """Fallback context for queries outside the top-u set."""
        B, H, L_V, D = V.shape
        if not self.mask_flag:
            value_mean = V.mean(dim=-2)
            context = value_mean.unsqueeze(-2).expand(B, H, L_Q, value_mean.shape[-1]).clone()
        else:
            # Causal case: cumulative sum keeps each position's context causal.
            assert L_Q == L_V  # requires self-attention
            context = V.cumsum(dim=-2)
        return context

    def _update_context(self, context_in, V, scores, index, L_Q, attn_mask):
        """Overwrite the fallback context at the top-u query positions."""
        B, H, L_V, D = V.shape

        if self.mask_flag:
            attn_mask = ProbMask(B, H, L_Q, index, scores, device=V.device)
            scores.masked_fill_(attn_mask.mask, -np.inf)

        weights = torch.softmax(scores, dim=-1)
        context_in[torch.arange(B)[:, None, None],
                   torch.arange(H)[None, :, None],
                   index, :] = torch.matmul(weights, V).type_as(context_in)

        if self.output_attention:
            # Non-selected queries are reported as a uniform distribution.
            attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(weights).to(weights.device)
            attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = weights
            return (context_in, attns)
        return (context_in, None)

    def forward(self, queries, keys, values, attn_mask):
        B, L_Q, H, D = queries.shape
        _, L_K, _, _ = keys.shape

        # Reorder to [B, H, L, D] for the sparse scoring machinery.
        queries = queries.transpose(2, 1)
        keys = keys.transpose(2, 1)
        values = values.transpose(2, 1)

        U_part = self.factor * np.ceil(np.log(L_K)).astype('int').item()  # c*ln(L_k)
        u = self.factor * np.ceil(np.log(L_Q)).astype('int').item()       # c*ln(L_q)
        U_part = min(U_part, L_K)
        u = min(u, L_Q)

        scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u)

        # Scale before softmax, as in standard scaled dot-product attention.
        scale = self.scale or 1. / sqrt(D)
        if scale is not None:
            scores_top = scores_top * scale

        context = self._get_initial_context(values, L_Q)
        context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask)
        return context.contiguous(), attn


class AttentionLayer(nn.Module):
    """Multi-head wrapper: projects q/k/v per head, runs the wrapped attention
    mechanism, then merges heads and projects back to d_model."""

    def __init__(self, attention, d_model, n_heads, d_keys=None,
                 d_values=None):
        super(AttentionLayer, self).__init__()

        d_keys = d_keys or (d_model // n_heads)
        d_values = d_values or (d_model // n_heads)

        self.inner_attention = attention
        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_model, d_keys * n_heads)
        self.value_projection = nn.Linear(d_model, d_values * n_heads)
        self.out_projection = nn.Linear(d_values * n_heads, d_model)
        self.n_heads = n_heads

    def forward(self, queries, keys, values, attn_mask):
        batch, q_len, _ = queries.shape
        _, kv_len, _ = keys.shape
        heads = self.n_heads

        # Split projections into per-head views: [B, L, H, d_head].
        q = self.query_projection(queries).view(batch, q_len, heads, -1)
        k = self.key_projection(keys).view(batch, kv_len, heads, -1)
        v = self.value_projection(values).view(batch, kv_len, heads, -1)

        out, attn = self.inner_attention(q, k, v, attn_mask)
        # Merge heads back before the output projection.
        out = out.view(batch, q_len, -1)
        return self.out_projection(out), attn
class ConvLayer(nn.Module):
    """Distilling block: circular conv + batchnorm + ELU + stride-2 max-pool,
    roughly halving the sequence length between encoder layers."""

    def __init__(self, c_in):
        super(ConvLayer, self).__init__()
        self.downConv = nn.Conv1d(in_channels=c_in,
                                  out_channels=c_in,
                                  kernel_size=3,
                                  padding=2,
                                  padding_mode='circular')
        self.norm = nn.BatchNorm1d(c_in)
        self.activation = nn.ELU()
        self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

    def forward(self, x):
        # [B, L, C] -> [B, C, L] for Conv1d, back to [B, L', C] at the end.
        y = self.downConv(x.permute(0, 2, 1))
        y = self.activation(self.norm(y))
        y = self.maxPool(y)
        return y.transpose(1, 2)


class EncoderLayer(nn.Module):
    """Transformer encoder layer whose feed-forward can be swapped for FAN.

    exp_setting: 0 = plain two-layer MLP, 2 = FANLayer with gate,
    4 = FANLayer without gate.
    """

    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu", exp_setting=0):
        super(EncoderLayer, self).__init__()
        self.exp_setting = exp_setting
        d_ff = d_ff or 4 * d_model
        self.attention = attention
        assert exp_setting in [0, 2, 4]
        if exp_setting == 0:
            self.mlp1 = nn.Linear(d_model, d_ff)
            self.mlp2 = nn.Linear(d_ff, d_model)
        elif exp_setting == 2:
            self.mlp1 = FANLayer(input_dim=d_model, output_dim=d_ff, with_gate=True)
            self.mlp2 = FANLayer(input_dim=d_ff, output_dim=d_model, with_gate=True)
        else:  # exp_setting == 4
            self.mlp1 = FANLayer(input_dim=d_model, output_dim=d_ff, with_gate=False)
            self.mlp2 = FANLayer(input_dim=d_ff, output_dim=d_model, with_gate=False)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, attn_mask=None):
        # Self-attention sublayer with residual + post-norm.
        attn_out, attn = self.attention(x, x, x, attn_mask=attn_mask)
        x = self.norm1(x + self.dropout(attn_out))

        # Feed-forward sublayer: FAN layers come with their own activation.
        if self.exp_setting == 0:
            hidden = self.dropout(self.activation(self.mlp1(x)))
        else:
            hidden = self.dropout(self.mlp1(x))
        hidden = self.dropout(self.mlp2(hidden))

        return self.norm2(x + hidden), attn


class Encoder(nn.Module):
    """Stack of encoder layers, optionally interleaved with conv distilling."""

    def __init__(self, attn_layers, conv_layers=None, norm_layer=None):
        super(Encoder, self).__init__()
        self.attn_layers = nn.ModuleList(attn_layers)
        self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None
        self.norm = norm_layer

    def forward(self, x, attn_mask=None):
        # x: [B, L, D]
        attns = []
        if self.conv_layers is None:
            for attn_layer in self.attn_layers:
                x, attn = attn_layer(x, attn_mask=attn_mask)
                attns.append(attn)
        else:
            # Each conv layer shortens the sequence after its attention layer;
            # the final attention layer has no conv after it.
            for attn_layer, conv_layer in zip(self.attn_layers, self.conv_layers):
                x, attn = attn_layer(x, attn_mask=attn_mask)
                x = conv_layer(x)
                attns.append(attn)
            x, attn = self.attn_layers[-1](x)
            attns.append(attn)

        if self.norm is not None:
            x = self.norm(x)

        return x, attns
class DecoderLayer(nn.Module):
    """Transformer decoder layer (masked self-attn + cross-attn) whose
    feed-forward can be swapped for FAN.

    exp_setting: 0 = plain two-layer MLP, 2 = FANLayer with gate,
    4 = FANLayer without gate.
    """

    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
                 dropout=0.1, activation="relu", exp_setting=0):
        super(DecoderLayer, self).__init__()
        self.exp_setting = exp_setting
        d_ff = d_ff or 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        assert exp_setting in [0, 2, 4]
        if exp_setting == 0:
            self.mlp1 = nn.Linear(d_model, d_ff)
            self.mlp2 = nn.Linear(d_ff, d_model)
        elif exp_setting == 2:
            self.mlp1 = FANLayer(input_dim=d_model, output_dim=d_ff, with_gate=True)
            self.mlp2 = FANLayer(input_dim=d_ff, output_dim=d_model, with_gate=True)
        else:  # exp_setting == 4
            self.mlp1 = FANLayer(input_dim=d_model, output_dim=d_ff, with_gate=False)
            self.mlp2 = FANLayer(input_dim=d_ff, output_dim=d_model, with_gate=False)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.activation = F.relu if activation == "relu" else F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        # Masked self-attention with residual + post-norm.
        x = self.norm1(x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0]))

        # Cross-attention over the encoder output with residual.
        x = x + self.dropout(self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0])

        residual = x = self.norm2(x)
        # Feed-forward sublayer: FAN layers come with their own activation.
        if self.exp_setting == 0:
            hidden = self.dropout(self.activation(self.mlp1(residual)))
        else:
            hidden = self.dropout(self.mlp1(residual))
        hidden = self.dropout(self.mlp2(hidden))

        return self.norm3(x + hidden)


class Decoder(nn.Module):
    """Stack of decoder layers with optional final norm and output projection."""

    def __init__(self, layers, norm_layer=None, projection=None):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList(layers)
        self.norm = norm_layer
        self.projection = projection

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        for decoder_layer in self.layers:
            x = decoder_layer(x, cross, x_mask=x_mask, cross_mask=cross_mask)

        if self.norm is not None:
            x = self.norm(x)
        if self.projection is not None:
            x = self.projection(x)
        return x
class Model(nn.Module):
    """Vanilla encoder-decoder Transformer with O(L^2) attention, whose
    feed-forward blocks can be replaced by FAN layers via configs.exp_setting."""

    def __init__(self, configs):
        super(Model, self).__init__()
        self.pred_len = configs.pred_len
        self.output_attention = configs.output_attention

        # Input embeddings for the encoder and decoder streams.
        self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed,
                                           configs.freq, configs.dropout)
        self.dec_embedding = DataEmbedding(configs.dec_in, configs.d_model, configs.embed,
                                           configs.freq, configs.dropout)

        # Encoder: stacked non-causal self-attention layers.
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(
                        FullAttention(False, configs.factor, attention_dropout=configs.dropout,
                                      output_attention=configs.output_attention),
                        configs.d_model, configs.n_heads),
                    configs.d_model,
                    configs.d_ff,
                    dropout=configs.dropout,
                    activation=configs.activation,
                    exp_setting=configs.exp_setting,
                )
                for _ in range(configs.e_layers)
            ],
            norm_layer=torch.nn.LayerNorm(configs.d_model),
        )

        # Decoder: causal self-attention plus cross-attention to the encoder.
        self.decoder = Decoder(
            [
                DecoderLayer(
                    AttentionLayer(
                        FullAttention(True, configs.factor, attention_dropout=configs.dropout,
                                      output_attention=False),
                        configs.d_model, configs.n_heads),
                    AttentionLayer(
                        FullAttention(False, configs.factor, attention_dropout=configs.dropout,
                                      output_attention=False),
                        configs.d_model, configs.n_heads),
                    configs.d_model,
                    configs.d_ff,
                    dropout=configs.dropout,
                    activation=configs.activation,
                    exp_setting=configs.exp_setting,
                )
                for _ in range(configs.d_layers)
            ],
            norm_layer=torch.nn.LayerNorm(configs.d_model),
            projection=nn.Linear(configs.d_model, configs.c_out, bias=True),
        )

    def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec,
                enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)

        # Only the last pred_len steps constitute the forecast.
        if self.output_attention:
            return dec_out[:, -self.pred_len:, :], attns
        return dec_out[:, -self.pred_len:, :]  # [B, L, D]
-------------------------------------------------------------------------------- 1 | # The Code of this part is based on Autoformer (https://github.com/thuml/Autoformer). 2 | 3 | import argparse 4 | import os 5 | import torch 6 | from exp.exp_main import Exp_Main 7 | import random 8 | import numpy as np 9 | 10 | def get_setting_str(args, exp_iter): 11 | setting = [ 12 | 'expset{}'.format(args.exp_setting), 13 | '{}'.format(args.model_id), 14 | '{}'.format(args.model), 15 | '{}'.format(args.data), 16 | 'bs{}'.format(args.batch_size), 17 | 'drop{}'.format(args.dropout), 18 | 'lr{}'.format(args.learning_rate), 19 | 'ep{}'.format(args.train_epochs), 20 | 'pat{}'.format(args.patience), 21 | 'ft{}'.format(args.features), 22 | 'sl{}'.format(args.seq_len), 23 | 'll{}'.format(args.label_len), 24 | 'pl{}'.format(args.pred_len), 25 | 'dm{}'.format(args.d_model), 26 | 'nh{}'.format(args.n_heads), 27 | 'el{}'.format(args.e_layers), 28 | 'dl{}'.format(args.d_layers), 29 | 'df{}'.format(args.d_ff), 30 | 'fc{}'.format(args.factor), 31 | 'eb{}'.format(args.embed), 32 | 'dt{}'.format(args.distil), 33 | '{}'.format(args.des), 34 | '{}'.format(exp_iter) 35 | ] 36 | setting = '_'.join(setting) 37 | 38 | return setting 39 | 40 | 41 | def main(): 42 | fix_seed = 2021 43 | random.seed(fix_seed) 44 | torch.manual_seed(fix_seed) 45 | np.random.seed(fix_seed) 46 | 47 | parser = argparse.ArgumentParser(description='Autoformer & Transformer family for Time Series Forecasting') 48 | 49 | # basic config 50 | parser.add_argument('--is_training', type=int, required=True, default=1, help='status') 51 | parser.add_argument('--model_id', type=str, required=True, default='test', help='model id') 52 | parser.add_argument('--model', type=str, required=True, default='Transformer') 53 | 54 | # data loader 55 | parser.add_argument('--data', type=str, required=True, default='ETTh', help='dataset type') 56 | parser.add_argument('--root_path', type=str, default='./data/ETT/', help='root path of the data 
file') 57 | parser.add_argument('--data_path', type=str, default='ETTh.csv', help='data file') 58 | parser.add_argument('--features', type=str, default='M', 59 | help='forecasting task, options:[M, S, MS]; M:multivariate predict multivariate, S:univariate predict univariate, MS:multivariate predict univariate') 60 | parser.add_argument('--target', type=str, default='OT', help='target feature in S or MS task') 61 | parser.add_argument('--freq', type=str, default='h', 62 | help='freq for time features encoding, options:[s:secondly, t:minutely, h:hourly, d:daily, b:business days, w:weekly, m:monthly], you can also use more detailed freq like 15min or 3h') 63 | parser.add_argument('--checkpoints', type=str, default='./checkpoints/', help='location of model checkpoints') 64 | 65 | # forecasting task 66 | parser.add_argument('--seq_len', type=int, default=96, help='input sequence length') 67 | parser.add_argument('--label_len', type=int, default=48, help='start token length') 68 | parser.add_argument('--pred_len', type=int, default=96, help='prediction sequence length') 69 | 70 | # model define 71 | parser.add_argument('--bucket_size', type=int, default=4, help='for Reformer') 72 | parser.add_argument('--n_hashes', type=int, default=4, help='for Reformer') 73 | parser.add_argument('--enc_in', type=int, default=7, help='encoder input size') 74 | parser.add_argument('--dec_in', type=int, default=7, help='decoder input size') 75 | parser.add_argument('--c_out', type=int, default=7, help='output size') 76 | parser.add_argument('--d_model', type=int, default=512, help='dimension of model') 77 | parser.add_argument('--n_heads', type=int, default=8, help='num of heads') 78 | parser.add_argument('--e_layers', type=int, default=2, help='num of encoder layers') 79 | parser.add_argument('--d_layers', type=int, default=1, help='num of decoder layers') 80 | parser.add_argument('--d_ff', type=int, default=2048, help='dimension of fcn') 81 | parser.add_argument('--moving_avg', 
type=int, default=25, help='window size of moving average') 82 | parser.add_argument('--factor', type=int, default=1, help='attn factor') 83 | parser.add_argument('--distil', action='store_false', 84 | help='whether to use distilling in encoder, using this argument means not using distilling', 85 | default=True) 86 | parser.add_argument('--dropout', type=float, default=0.1, help='dropout') 87 | parser.add_argument('--embed', type=str, default='timeF', 88 | help='time features encoding, options:[timeF, fixed, learned]') 89 | parser.add_argument('--activation', type=str, default='gelu', help='activation') 90 | parser.add_argument('--output_attention', action='store_true', help='whether to output attention in encoder') 91 | parser.add_argument('--do_predict', action='store_true', help='whether to predict unseen future data') 92 | parser.add_argument('--exp_setting', type=int, default=0, help='experiment setting') 93 | parser.add_argument('--use_norm', type=int, default=True, help='use norm and denorm') 94 | 95 | # optimization 96 | parser.add_argument('--num_workers', type=int, default=10, help='data loader num workers') 97 | parser.add_argument('--itr', type=int, default=2, help='experiments times') 98 | parser.add_argument('--train_epochs', type=int, default=10, help='train epochs') 99 | parser.add_argument('--batch_size', type=int, default=32, help='batch size of train input data') 100 | parser.add_argument('--patience', type=int, default=10, help='early stopping patience') 101 | parser.add_argument('--learning_rate', type=float, default=0.0001, help='optimizer learning rate') 102 | parser.add_argument('--des', type=str, default='test', help='exp description') 103 | parser.add_argument('--loss', type=str, default='mse', help='loss function') 104 | parser.add_argument('--lradj', type=str, default='type1', help='adjust learning rate') 105 | parser.add_argument('--use_amp', action='store_true', help='use automatic mixed precision training', default=False) 106 | 107 | 
# GPU 108 | parser.add_argument('--use_gpu', type=bool, default=True, help='use gpu') 109 | parser.add_argument('--gpu', type=int, default=0, help='gpu') 110 | parser.add_argument('--use_multi_gpu', action='store_true', help='use multiple gpus', default=False) 111 | parser.add_argument('--devices', type=str, default='0,1,2,3', help='device ids of multile gpus') 112 | 113 | args = parser.parse_args() 114 | 115 | args.use_gpu = True if torch.cuda.is_available() and args.use_gpu else False 116 | 117 | if args.use_gpu and args.use_multi_gpu: 118 | args.devices = args.devices.replace(' ', '') 119 | device_ids = args.devices.split(',') 120 | args.device_ids = [int(id_) for id_ in device_ids] 121 | args.gpu = args.device_ids[0] 122 | 123 | print('Args in experiment:') 124 | print(args) 125 | 126 | Exp = Exp_Main 127 | 128 | if args.is_training: 129 | for ii in range(args.itr): 130 | setting = get_setting_str(args, ii) 131 | 132 | exp = Exp(args) # set experiments 133 | print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting)) 134 | exp.train(setting) 135 | 136 | print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting)) 137 | exp.test(setting) 138 | 139 | if args.do_predict: 140 | print('>>>>>>>predicting : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting)) 141 | exp.predict(setting, True) 142 | 143 | torch.cuda.empty_cache() 144 | else: 145 | ii = 0 146 | setting = get_setting_str(args, ii) 147 | 148 | exp = Exp(args) # set experiments 149 | print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting)) 150 | exp.test(setting, test=1) 151 | torch.cuda.empty_cache() 152 | 153 | 154 | if __name__ == "__main__": 155 | main() 156 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/ETT_script/Transformer.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | datasets=("ETTh") 4 | 
# --- Timeseries_Forecasting/scripts/ETT_script/Transformer_setting_2.sh ---
# Runs the Modified_Transformer with FAN feed-forward (gated, exp_setting=2)
# on ETTh for all prediction horizons.
export CUDA_VISIBLE_DEVICES=0

datasets=("ETTh")
pred_lens=(96 192 336 720)
model="Modified_Transformer"
exp_setting=2
learning_rate=1e-5

for dataset in "${datasets[@]}"; do
    # Fix: `mkdir -p` creates the whole logs/LongForecasting/<dataset> path;
    # the original bare `mkdir` failed when logs/LongForecasting did not exist,
    # and the log redirection below then failed too.
    mkdir -p "./logs/LongForecasting/$dataset"
    for pred_len in "${pred_lens[@]}"; do
        model_id="${dataset}_96_${pred_len}"
        echo "Running model: $model_id"
        python -u run.py \
          --is_training 1 \
          --root_path "./dataset/ETT-small/" \
          --data_path "${dataset}.csv" \
          --model_id "$model_id" \
          --model "$model" \
          --data "$dataset" \
          --features M \
          --seq_len 96 \
          --label_len 48 \
          --pred_len "$pred_len" \
          --e_layers 2 \
          --d_layers 1 \
          --enc_in 7 \
          --dec_in 7 \
          --c_out 7 \
          --des 'Exp' \
          --freq 't' \
          --learning_rate $learning_rate \
          --exp_setting $exp_setting \
          --itr 1 >logs/LongForecasting/$dataset/$model'_'$model_id'_'exp_setting_$exp_setting'_'lr$learning_rate.log 2>&1
    done
done

# --- Timeseries_Forecasting/scripts/ETT_script/Transformer_setting_4.sh ---
# Same experiment with the ungated FAN feed-forward (exp_setting=4).
export CUDA_VISIBLE_DEVICES=0

datasets=("ETTh")
pred_lens=(96 192 336 720)
model="Modified_Transformer"
exp_setting=4
learning_rate=1e-5

for dataset in "${datasets[@]}"; do
    # Fix: `mkdir -p` (idempotent, creates parents) replaces the bare mkdir.
    mkdir -p "./logs/LongForecasting/$dataset"
    for pred_len in "${pred_lens[@]}"; do
        model_id="${dataset}_96_${pred_len}"
        echo "Running model: $model_id"
        python -u run.py \
          --is_training 1 \
          --root_path "./dataset/ETT-small/" \
          --data_path "${dataset}.csv" \
          --model_id "$model_id" \
          --model "$model" \
          --data "$dataset" \
          --features M \
          --seq_len 96 \
          --label_len 48 \
          --pred_len "$pred_len" \
          --e_layers 2 \
          --d_layers 1 \
          --enc_in 7 \
          --dec_in 7 \
          --c_out 7 \
          --des 'Exp' \
          --freq 't' \
          --learning_rate $learning_rate \
          --exp_setting $exp_setting \
          --itr 1 >logs/LongForecasting/$dataset/$model'_'$model_id'_'exp_setting_$exp_setting'_'lr$learning_rate.log 2>&1
    done
done
-d "./logs/LongForecasting/exchange_rate" ]; then 2 | mkdir ./logs/LongForecasting/exchange_rate 3 | fi 4 | 5 | export CUDA_VISIBLE_DEVICES=0 6 | 7 | ROOT_PATH="./dataset/exchange_rate/" 8 | DATA_PATH="exchange_rate.csv" 9 | MODEL="Modified_Transformer" 10 | DATA="custom" 11 | FEATURES=M 12 | SEQ_LEN=96 13 | LABEL_LEN=48 14 | E_LAYERS=2 15 | D_LAYERS=1 16 | FACTOR=3 17 | ENC_IN=8 18 | DEC_IN=8 19 | C_OUT=8 20 | DES="Exp" 21 | ITR=1 22 | EXP_SETTING=2 23 | 24 | # 96 192 336 720 25 | for PRED_LEN in 96 192 336 720 26 | do 27 | MODEL_ID="ECL_${SEQ_LEN}_${PRED_LEN}" 28 | python -u run.py \ 29 | --is_training 1 \ 30 | --root_path $ROOT_PATH \ 31 | --data_path $DATA_PATH \ 32 | --model_id $MODEL_ID \ 33 | --model $MODEL \ 34 | --data $DATA \ 35 | --features $FEATURES \ 36 | --seq_len $SEQ_LEN \ 37 | --label_len $LABEL_LEN \ 38 | --pred_len $PRED_LEN \ 39 | --e_layers $E_LAYERS \ 40 | --d_layers $D_LAYERS \ 41 | --factor $FACTOR \ 42 | --enc_in $ENC_IN \ 43 | --dec_in $DEC_IN \ 44 | --c_out $C_OUT \ 45 | --des $DES \ 46 | --itr $ITR \ 47 | --exp_setting $EXP_SETTING # >logs/LongForecasting/exchange_rate/$MODEL'_'$MODEL_ID'_'exp_setting_$EXP_SETTING.log 2>&1 48 | done 49 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/Exchange_script/Modified_Transformer_setting_4.sh: -------------------------------------------------------------------------------- 1 | if [ ! 
-d "./logs/LongForecasting/exchange_rate" ]; then 2 | mkdir ./logs/LongForecasting/exchange_rate 3 | fi 4 | 5 | export CUDA_VISIBLE_DEVICES=0 6 | 7 | ROOT_PATH="./dataset/exchange_rate/" 8 | DATA_PATH="exchange_rate.csv" 9 | MODEL="Modified_Transformer" 10 | DATA="custom" 11 | FEATURES=M 12 | SEQ_LEN=96 13 | LABEL_LEN=48 14 | E_LAYERS=2 15 | D_LAYERS=1 16 | FACTOR=3 17 | ENC_IN=8 18 | DEC_IN=8 19 | C_OUT=8 20 | DES="Exp" 21 | ITR=1 22 | EXP_SETTING=4 23 | 24 | # 96 192 336 720 25 | for PRED_LEN in 96 192 336 720 26 | do 27 | MODEL_ID="ECL_${SEQ_LEN}_${PRED_LEN}" 28 | python -u run.py \ 29 | --is_training 1 \ 30 | --root_path $ROOT_PATH \ 31 | --data_path $DATA_PATH \ 32 | --model_id $MODEL_ID \ 33 | --model $MODEL \ 34 | --data $DATA \ 35 | --features $FEATURES \ 36 | --seq_len $SEQ_LEN \ 37 | --label_len $LABEL_LEN \ 38 | --pred_len $PRED_LEN \ 39 | --e_layers $E_LAYERS \ 40 | --d_layers $D_LAYERS \ 41 | --factor $FACTOR \ 42 | --enc_in $ENC_IN \ 43 | --dec_in $DEC_IN \ 44 | --c_out $C_OUT \ 45 | --des $DES \ 46 | --itr $ITR \ 47 | --exp_setting $EXP_SETTING >logs/LongForecasting/exchange_rate/$MODEL'_'$MODEL_ID'_'exp_setting_$EXP_SETTING.log 2>&1 48 | done 49 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/Exchange_script/Transformer.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | model_id_prefix="Exchange_96" 4 | 5 | for pred_len in 96 192 336 720; do 6 | model_id="${model_id_prefix}_${pred_len}" 7 | 8 | python -u run.py \ 9 | --is_training 1 \ 10 | --root_path ./dataset/exchange_rate/ \ 11 | --data_path exchange_rate.csv \ 12 | --model_id "$model_id" \ 13 | --model Modified_Transformer \ 14 | --data custom \ 15 | --features M \ 16 | --seq_len 96 \ 17 | --label_len 48 \ 18 | --pred_len "$pred_len" \ 19 | --e_layers 2 \ 20 | --d_layers 1 \ 21 | --factor 3 \ 22 | --enc_in 8 \ 23 | --dec_in 8 \ 24 | --c_out 8 \ 25 | --des 
'Exp' \ 26 | --exp_setting 0 \ 27 | --itr 1 >logs/LongForecasting/exchange_rate/baseline_$model_id.log 2>&1 28 | done -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/Traffic_script/Modified_Transformer_setting_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -d "./logs" ]; then 4 | mkdir ./logs 5 | fi 6 | 7 | if [ ! -d "./logs/LongForecasting" ]; then 8 | mkdir ./logs/LongForecasting 9 | fi 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | ROOT_PATH="./dataset/traffic/" 14 | DATA_PATH="traffic.csv" 15 | MODEL="Modified_Transformer" 16 | DATA="custom" 17 | FEATURES="M" 18 | SEQ_LEN=96 19 | LABEL_LEN=48 20 | E_LAYERS=2 21 | D_LAYERS=1 22 | FACTOR=3 23 | ENC_IN=862 24 | DEC_IN=862 25 | C_OUT=862 26 | DES="Exp" 27 | ITR=1 28 | EXP_SETTING=2 29 | TRAIN_EPOCHS=50 30 | learn_rates=(1e-3 5e-5 1e-5 1e-6) 31 | 32 | for LEARN_RATE in "${learn_rates[@]}" 33 | do 34 | for PRED_LEN in 336 720 35 | do 36 | MODEL_ID="traffic_${SEQ_LEN}_${PRED_LEN}" 37 | 38 | python -u run.py \ 39 | --is_training 1 \ 40 | --root_path $ROOT_PATH \ 41 | --data_path $DATA_PATH \ 42 | --model_id $MODEL_ID \ 43 | --model $MODEL \ 44 | --data $DATA \ 45 | --features $FEATURES \ 46 | --seq_len $SEQ_LEN \ 47 | --label_len $LABEL_LEN \ 48 | --pred_len $PRED_LEN \ 49 | --e_layers $E_LAYERS \ 50 | --d_layers $D_LAYERS \ 51 | --factor $FACTOR \ 52 | --enc_in $ENC_IN \ 53 | --dec_in $DEC_IN \ 54 | --c_out $C_OUT \ 55 | --des $DES \ 56 | --itr $ITR \ 57 | --train_epochs $TRAIN_EPOCHS \ 58 | --exp_setting $EXP_SETTING \ 59 | --learning_rate $LEARN_RATE >logs/LongForecasting/traffic/$MODEL'_'$MODEL_ID'_'expsetting$EXP_SETTING'_'learn_rate_$LEARN_RATE.log 2>&1 60 | done 61 | done 62 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/Traffic_script/Modified_Transformer_setting_4.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! -d "./logs" ]; then 4 | mkdir ./logs 5 | fi 6 | 7 | if [ ! -d "./logs/LongForecasting" ]; then 8 | mkdir ./logs/LongForecasting 9 | fi 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | # 定义变量 14 | ROOT_PATH="./dataset/traffic/" 15 | DATA_PATH="traffic.csv" 16 | MODEL="Modified_Transformer" 17 | DATA="custom" 18 | FEATURES="M" 19 | SEQ_LEN=96 20 | LABEL_LEN=48 21 | E_LAYERS=2 22 | D_LAYERS=1 23 | FACTOR=3 24 | ENC_IN=862 25 | DEC_IN=862 26 | C_OUT=862 27 | DES="Exp" 28 | ITR=1 29 | EXP_SETTING=4 30 | TRAIN_EPOCHS=50 31 | learn_rates=(1e-3 5e-5 1e-5 1e-6) 32 | 33 | # 循环运行脚本 34 | for LEARN_RATE in "${learn_rates[@]}" 35 | do 36 | for PRED_LEN in 336 720 37 | do 38 | MODEL_ID="traffic_${SEQ_LEN}_${PRED_LEN}" 39 | 40 | python -u run.py \ 41 | --is_training 1 \ 42 | --root_path $ROOT_PATH \ 43 | --data_path $DATA_PATH \ 44 | --model_id $MODEL_ID \ 45 | --model $MODEL \ 46 | --data $DATA \ 47 | --features $FEATURES \ 48 | --seq_len $SEQ_LEN \ 49 | --label_len $LABEL_LEN \ 50 | --pred_len $PRED_LEN \ 51 | --e_layers $E_LAYERS \ 52 | --d_layers $D_LAYERS \ 53 | --factor $FACTOR \ 54 | --enc_in $ENC_IN \ 55 | --dec_in $DEC_IN \ 56 | --c_out $C_OUT \ 57 | --des $DES \ 58 | --itr $ITR \ 59 | --train_epochs $TRAIN_EPOCHS \ 60 | --exp_setting $EXP_SETTING \ 61 | --learning_rate $LEARN_RATE >logs/LongForecasting/traffic/$MODEL'_'$MODEL_ID'_'expsetting$EXP_SETTING'_'learn_rate_$LEARN_RATE.log 2>&1 62 | done 63 | done 64 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/Traffic_script/Transformer.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0 2 | 3 | train_epochs=50 4 | 5 | for pred_len in 96 192 336 720 6 | do 7 | model_id="traffic_96_${pred_len}" 8 | 9 | python -u run.py \ 10 | --is_training 1 \ 11 | --root_path ./dataset/traffic/ \ 12 
| --data_path traffic.csv \ 13 | --model_id $model_id \ 14 | --model Modified_Transformer \ 15 | --data custom \ 16 | --features M \ 17 | --seq_len 96 \ 18 | --label_len 48 \ 19 | --pred_len $pred_len \ 20 | --e_layers 2 \ 21 | --d_layers 1 \ 22 | --factor 3 \ 23 | --enc_in 862 \ 24 | --dec_in 862 \ 25 | --c_out 862 \ 26 | --des 'Exp' \ 27 | --itr 1 \ 28 | --exp_setting 0 \ 29 | --train_epochs $train_epochs >logs/LongForecasting/traffic/baseline_$model_id.log 2>&1 30 | done 31 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/scripts/Weather_script/Modified_Transformer.sh: -------------------------------------------------------------------------------- 1 | if [ ! -d "./logs" ]; then 2 | mkdir ./logs 3 | fi 4 | 5 | if [ ! -d "./logs/LongForecasting" ]; then 6 | mkdir ./logs/LongForecasting 7 | fi 8 | 9 | export CUDA_VISIBLE_DEVICES=0 10 | 11 | ROOT_PATH="./dataset/weather/" 12 | DATA_PATH="weather.csv" 13 | MODEL="Modified_Transformer" 14 | DATA="custom" 15 | FEATURES="M" 16 | SEQ_LEN=96 17 | LABEL_LEN=48 18 | E_LAYERS=2 19 | D_LAYERS=1 20 | FACTOR=3 21 | ENC_IN=21 22 | DEC_IN=21 23 | C_OUT=21 24 | DES="Exp" 25 | ITR=1 26 | 27 | BATCH_SIZE=256 28 | DROP_OUT=0.05 29 | lr=1e-5 30 | EPOCHS=10 31 | PATIENCE=3 32 | # Transformer(baseline): 0, FANGated: 2, FAN: 4 33 | EXP_SETTING=4 34 | 35 | for PRED_LEN in 96 192 336 720 36 | do 37 | MODEL_ID="weather_${SEQ_LEN}_${PRED_LEN}" 38 | python -u run.py \ 39 | --is_training 1 \ 40 | --root_path $ROOT_PATH \ 41 | --data_path $DATA_PATH \ 42 | --model_id $MODEL_ID \ 43 | --model $MODEL \ 44 | --data $DATA \ 45 | --features $FEATURES \ 46 | --seq_len $SEQ_LEN \ 47 | --label_len $LABEL_LEN \ 48 | --pred_len $PRED_LEN \ 49 | --e_layers $E_LAYERS \ 50 | --d_layers $D_LAYERS \ 51 | --factor $FACTOR \ 52 | --enc_in $ENC_IN \ 53 | --dec_in $DEC_IN \ 54 | --c_out $C_OUT \ 55 | --des $DES \ 56 | --itr $ITR \ 57 | --batch_size $BATCH_SIZE \ 58 | --dropout $DROP_OUT \ 59 | 
--learning_rate $lr \ 60 | --train_epochs $EPOCHS \ 61 | --patience $PATIENCE \ 62 | --exp_setting $EXP_SETTING # > logs/LongForecasting/$MODEL'_'$MODEL_ID'_'exp_setting_$EXP_SETTING.log 2>&1 63 | done 64 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihongDong/FAN/7f1b16a1fdf2e36e8b123109d7f6b6987361a1fe/Timeseries_Forecasting/utils/__init__.py -------------------------------------------------------------------------------- /Timeseries_Forecasting/utils/download_data.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | if __name__=="__main__": 4 | source_url = 'https://cloud.tsinghua.edu.cn/d/e1ccfff39ad541908bae/files/?p=%2Fall_six_datasets.zip&dl=1' 5 | headers = {'User-Agent': 'Mozilla/5.0'} 6 | res = requests.get(source_url, headers=headers) 7 | 8 | with open('dataset/datasets.zip', 'wb') as f: 9 | f.write(res.content) 10 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/utils/masking.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class TriangularCausalMask(): 5 | def __init__(self, B, L, device="cpu"): 6 | mask_shape = [B, 1, L, L] 7 | with torch.no_grad(): 8 | self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device) 9 | 10 | @property 11 | def mask(self): 12 | return self._mask 13 | 14 | 15 | class ProbMask(): 16 | def __init__(self, B, H, L, index, scores, device="cpu"): 17 | _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1) 18 | _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1]) 19 | indicator = _mask_ex[torch.arange(B)[:, None, None], 20 | torch.arange(H)[None, :, None], 21 | index, :].to(device) 22 | 
self._mask = indicator.view(scores.shape).to(device) 23 | 24 | @property 25 | def mask(self): 26 | return self._mask 27 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def RSE(pred, true): 5 | return np.sqrt(np.sum((true - pred) ** 2)) / np.sqrt(np.sum((true - true.mean()) ** 2)) 6 | 7 | 8 | def CORR(pred, true): 9 | u = ((true - true.mean(0)) * (pred - pred.mean(0))).sum(0) 10 | d = np.sqrt(((true - true.mean(0)) ** 2).sum(0) * ((pred - pred.mean(0)) ** 2).sum(0)) 11 | return (u / d).mean(-1) 12 | 13 | 14 | def MAE(pred, true): 15 | return np.mean(np.abs(pred - true)) 16 | 17 | 18 | def MSE(pred, true): 19 | return np.mean((pred - true) ** 2) 20 | 21 | 22 | def RMSE(pred, true): 23 | return np.sqrt(MSE(pred, true)) 24 | 25 | 26 | def MAPE(pred, true): 27 | return np.mean(np.abs((pred - true) / true)) 28 | 29 | 30 | def MSPE(pred, true): 31 | return np.mean(np.square((pred - true) / true)) 32 | 33 | 34 | def metric(pred, true): 35 | mae = MAE(pred, true) 36 | mse = MSE(pred, true) 37 | rmse = RMSE(pred, true) 38 | mape = MAPE(pred, true) 39 | mspe = MSPE(pred, true) 40 | 41 | return mae, mse, rmse, mape, mspe 42 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/utils/timefeatures.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pandas.tseries import offsets 6 | from pandas.tseries.frequencies import to_offset 7 | 8 | 9 | class TimeFeature: 10 | def __init__(self): 11 | pass 12 | 13 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 14 | pass 15 | 16 | def __repr__(self): 17 | return self.__class__.__name__ + "()" 18 | 19 | 20 | class SecondOfMinute(TimeFeature): 21 | """Second of minute 
encoded as value between [-0.5, 0.5]""" 22 | 23 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 24 | return index.second / 59.0 - 0.5 25 | 26 | 27 | class MinuteOfHour(TimeFeature): 28 | """Minute of hour encoded as value between [-0.5, 0.5]""" 29 | 30 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 31 | return index.minute / 59.0 - 0.5 32 | 33 | 34 | class HourOfDay(TimeFeature): 35 | """Hour of day encoded as value between [-0.5, 0.5]""" 36 | 37 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 38 | return index.hour / 23.0 - 0.5 39 | 40 | 41 | class DayOfWeek(TimeFeature): 42 | """Day of week encoded as value between [-0.5, 0.5]""" 43 | 44 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 45 | return index.dayofweek / 6.0 - 0.5 46 | 47 | 48 | class DayOfMonth(TimeFeature): 49 | """Day of month encoded as value between [-0.5, 0.5]""" 50 | 51 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 52 | return (index.day - 1) / 30.0 - 0.5 53 | 54 | 55 | class DayOfYear(TimeFeature): 56 | """Day of year encoded as value between [-0.5, 0.5]""" 57 | 58 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 59 | return (index.dayofyear - 1) / 365.0 - 0.5 60 | 61 | 62 | class MonthOfYear(TimeFeature): 63 | """Month of year encoded as value between [-0.5, 0.5]""" 64 | 65 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 66 | return (index.month - 1) / 11.0 - 0.5 67 | 68 | 69 | class WeekOfYear(TimeFeature): 70 | """Week of year encoded as value between [-0.5, 0.5]""" 71 | 72 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 73 | return (index.isocalendar().week - 1) / 52.0 - 0.5 74 | 75 | 76 | def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]: 77 | """ 78 | Returns a list of time features that will be appropriate for the given frequency string. 
79 | Parameters 80 | ---------- 81 | freq_str 82 | Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc. 83 | """ 84 | 85 | features_by_offsets = { 86 | offsets.YearEnd: [], 87 | offsets.QuarterEnd: [MonthOfYear], 88 | offsets.MonthEnd: [MonthOfYear], 89 | offsets.Week: [DayOfMonth, WeekOfYear], 90 | offsets.Day: [DayOfWeek, DayOfMonth, DayOfYear], 91 | offsets.BusinessDay: [DayOfWeek, DayOfMonth, DayOfYear], 92 | offsets.Hour: [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear], 93 | offsets.Minute: [ 94 | MinuteOfHour, 95 | HourOfDay, 96 | DayOfWeek, 97 | DayOfMonth, 98 | DayOfYear, 99 | ], 100 | offsets.Second: [ 101 | SecondOfMinute, 102 | MinuteOfHour, 103 | HourOfDay, 104 | DayOfWeek, 105 | DayOfMonth, 106 | DayOfYear, 107 | ], 108 | } 109 | 110 | offset = to_offset(freq_str) 111 | 112 | for offset_type, feature_classes in features_by_offsets.items(): 113 | if isinstance(offset, offset_type): 114 | return [cls() for cls in feature_classes] 115 | 116 | supported_freq_msg = f""" 117 | Unsupported frequency {freq_str} 118 | The following frequencies are supported: 119 | Y - yearly 120 | alias: A 121 | M - monthly 122 | W - weekly 123 | D - daily 124 | B - business days 125 | H - hourly 126 | T - minutely 127 | alias: min 128 | S - secondly 129 | """ 130 | raise RuntimeError(supported_freq_msg) 131 | 132 | 133 | def time_features(dates, freq='h'): 134 | return np.vstack([feat(dates) for feat in time_features_from_frequency_str(freq)]) 135 | -------------------------------------------------------------------------------- /Timeseries_Forecasting/utils/tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import matplotlib.pyplot as plt 4 | 5 | plt.switch_backend('agg') 6 | 7 | # calc the number of parameters 8 | def count_parameters(model): 9 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 10 | 11 | def adjust_learning_rate(optimizer, 
epoch, args): 12 | # lr = args.learning_rate * (0.2 ** (epoch // 2)) 13 | if args.lradj == 'type1': 14 | lr_adjust = {epoch: args.learning_rate * (0.5 ** ((epoch - 1) // 1))} 15 | elif args.lradj == 'type2': 16 | lr_adjust = { 17 | 2: 5e-5, 4: 1e-5, 6: 5e-6, 8: 1e-6, 18 | 10: 5e-7, 15: 1e-7, 20: 5e-8 19 | } 20 | if epoch in lr_adjust.keys(): 21 | lr = lr_adjust[epoch] 22 | for param_group in optimizer.param_groups: 23 | param_group['lr'] = lr 24 | print('Updating learning rate to {}'.format(lr)) 25 | 26 | 27 | class EarlyStopping: 28 | def __init__(self, patience=7, verbose=False, delta=0): 29 | self.patience = patience 30 | self.verbose = verbose 31 | self.counter = 0 32 | self.best_score = None 33 | self.early_stop = False 34 | self.val_loss_min = np.inf 35 | self.delta = delta 36 | 37 | def __call__(self, val_loss, model, path): 38 | score = -val_loss 39 | if self.best_score is None: 40 | self.best_score = score 41 | self.save_checkpoint(val_loss, model, path) 42 | elif score < self.best_score + self.delta: 43 | self.counter += 1 44 | print(f'EarlyStopping counter: {self.counter} out of {self.patience}') 45 | if self.counter >= self.patience: 46 | self.early_stop = True 47 | else: 48 | self.best_score = score 49 | self.save_checkpoint(val_loss, model, path) 50 | self.counter = 0 51 | 52 | def save_checkpoint(self, val_loss, model, path): 53 | if self.verbose: 54 | print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). 
Saving model ...') 55 | torch.save(model.state_dict(), path + '/' + 'checkpoint.pth') 56 | self.val_loss_min = val_loss 57 | 58 | 59 | class dotdict(dict): 60 | """dot.notation access to dictionary attributes""" 61 | __getattr__ = dict.get 62 | __setattr__ = dict.__setitem__ 63 | __delattr__ = dict.__delitem__ 64 | 65 | 66 | class StandardScaler(): 67 | def __init__(self, mean, std): 68 | self.mean = mean 69 | self.std = std 70 | 71 | def transform(self, data): 72 | return (data - self.mean) / self.std 73 | 74 | def inverse_transform(self, data): 75 | return (data * self.std) + self.mean 76 | 77 | 78 | def visual(true, preds=None, name='./pic/test.pdf'): 79 | """ 80 | Results visualization 81 | """ 82 | plt.figure() 83 | plt.plot(true, label='GroundTruth', linewidth=2) 84 | if preds is not None: 85 | plt.plot(preds, label='Prediction', linewidth=2) 86 | plt.legend() 87 | plt.savefig(name, bbox_inches='tight') 88 | -------------------------------------------------------------------------------- /img/FANLayer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihongDong/FAN/7f1b16a1fdf2e36e8b123109d7f6b6987361a1fe/img/FANLayer.jpg -------------------------------------------------------------------------------- /img/IR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihongDong/FAN/7f1b16a1fdf2e36e8b123109d7f6b6987361a1fe/img/IR.jpg -------------------------------------------------------------------------------- /img/mod.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YihongDong/FAN/7f1b16a1fdf2e36e8b123109d7f6b6987361a1fe/img/mod.jpg -------------------------------------------------------------------------------- /img/sin.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YihongDong/FAN/7f1b16a1fdf2e36e8b123109d7f6b6987361a1fe/img/sin.jpg --------------------------------------------------------------------------------