├── ltsm ├── utils │ ├── __init__.py │ ├── .DS_Store │ ├── masking.py │ ├── dist.py │ ├── metrics.py │ └── timefeatures.py ├── sk_interface │ └── __init__.py ├── prompt_reader │ ├── soft_prompt │ │ └── README.md │ ├── stat_prompt │ │ ├── tsfel │ │ │ ├── __init__.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── progress_bar.py │ │ │ │ ├── signal_processing.py │ │ │ │ ├── calculate_complexity.py │ │ │ │ └── add_personal_features.py │ │ │ └── feature_extraction │ │ │ │ ├── __init__.py │ │ │ │ └── features_settings.py │ │ ├── README.md │ │ └── prompt_tsne.py │ └── text_prompt │ │ └── csv_prompt.json ├── common │ ├── base_splitter.py │ ├── base_reader.py │ ├── base_processor.py │ └── sklearn.py ├── data_provider │ ├── tokenizer │ │ ├── __init__.py │ │ └── standard_scaler.py │ ├── hf_train_data_loader.py │ ├── __init__.py │ └── data_splitter.py ├── data_pipeline │ └── __init__.py ├── models │ ├── ltsm_base.py │ ├── ltsm_ts_tokenizer.py │ ├── __init__.py │ ├── ltsm_stat_model.py │ ├── DLinear.py │ ├── PatchTST.py │ ├── base_config.py │ ├── Informer.py │ └── utils.py ├── data_reader │ ├── __init__.py │ └── database_reader.py └── layers │ ├── RevIN.py │ ├── Transformer_EncDec.py │ └── PatchTST_layers.py ├── tests ├── common │ └── __init__.py ├── models │ ├── __init__.py │ ├── init_test.py │ ├── DLinear_test.py │ ├── PatchTST_test.py │ └── Informer_test.py ├── data_provider │ ├── __init__.py │ ├── data_splitter_test.py │ ├── tokenizer │ │ └── standard_scaler_test.py │ └── prompt_generator_test.py ├── data_reader │ ├── __init__.py │ ├── dataloader_unittest_example.py │ ├── database_reader_test.py │ ├── train_database_reader_test.py │ └── npy_database_reader_test.py ├── test_scripts │ ├── anomaly_main_ltsm.py │ ├── main_tokenizer.py │ ├── prompt_generation_norm.sh │ ├── dlinear.json │ ├── ltsm.json │ ├── patchtst.json │ ├── train_ltsm_csv.sh │ ├── train_dlinear_csv.sh │ ├── train_patchtst_csv.sh │ ├── train_informer_csv.sh │ ├── informer.json │ ├── 
train_ltsm_tokenizer_csv.sh │ ├── train_anomaly_main_ltsm.sh │ ├── anomaly_config │ │ ├── config-1.json │ │ └── config.json │ ├── train_ltsm_textprompt_csv.sh │ ├── test_ltsm.sh │ ├── test_tokenizer_training.py │ ├── test_csv_lora.sh │ ├── main_ltsm.py │ └── test_pipeline_training.py ├── data_pipeline │ └── stat_pipeline_test.py └── evaluate_pipeline │ └── evaluation_pipeline_test.py ├── datasets └── README.md ├── multi_agents_pipeline ├── agents │ ├── __init__.py │ ├── custom_messages.py │ ├── TS_Agent.py │ ├── QA_Agent.py │ └── Planning_Agent.py ├── Agents.jpg ├── model_config.yaml ├── llm-server.py ├── Readme.md ├── ltsm_inference.py └── main.py ├── imgs ├── ltsm_model.png ├── stat_prompt.png └── prompt_csv_tsne.png ├── setup.py ├── .github └── workflows │ └── test.yml ├── .gitignore ├── requirements.txt └── tutorial └── README.md /ltsm/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ltsm/sk_interface/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data_provider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data_reader/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | # Training Dataset -------------------------------------------------------------------------------- /multi_agents_pipeline/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ltsm/prompt_reader/soft_prompt/README.md: -------------------------------------------------------------------------------- 1 | # Time Series Prompt Dataset -------------------------------------------------------------------------------- /imgs/ltsm_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/ltsm/HEAD/imgs/ltsm_model.png -------------------------------------------------------------------------------- /imgs/stat_prompt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/ltsm/HEAD/imgs/stat_prompt.png -------------------------------------------------------------------------------- /ltsm/utils/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/ltsm/HEAD/ltsm/utils/.DS_Store -------------------------------------------------------------------------------- /imgs/prompt_csv_tsne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/ltsm/HEAD/imgs/prompt_csv_tsne.png -------------------------------------------------------------------------------- /multi_agents_pipeline/Agents.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datamllab/ltsm/HEAD/multi_agents_pipeline/Agents.jpg 
class BaseReader:
    """Interface stub for dataset readers.

    The base class holds no state and performs no work; both methods are
    no-ops that return ``None``.
    """

    def __init__(self):
        """Create the reader; the base class has nothing to initialize."""

    def fetch(self):
        """Placeholder for the data-loading entry point.

        The original inline notes describe the intended contract as
        "input: path, output: DataFrame". The base implementation takes no
        arguments and returns ``None``.
        """
        return None
# Registry mapping each processor's ``module_id`` to the registered object.
processor_dict = {}


def register_processor(module):
    """Register a processor in ``processor_dict`` under its ``module_id``.

    Args:
        module: Object exposing a ``module_id`` attribute. The object itself
            is stored as the registry value.

    Raises:
        AssertionError: If ``module.module_id`` is already registered.
    """
    # The message previously read "alreader registered"; fixed so the error
    # is searchable and matches the wording used by the reader registry.
    assert module.module_id not in processor_dict, f"Processor {module.module_id} already registered"
    processor_dict[module.module_id] = module
"""Packaging script for the ``ltsm`` distribution."""
import setuptools

setuptools.setup(
    name="ltsm",
    version='1.0.0',
    author="Data Lab",
    author_email="daochen.zha@rice.edu",
    description="Large Time Series Model",
    url="XXXX",
    keywords=["Time Series"],
    packages=setuptools.find_packages(exclude=('tests',)),
    # setuptools recognizes ``python_requires``; the previous spelling
    # ``requires_python`` is an unknown option, so the interpreter
    # constraint was never written into the package metadata.
    python_requires='>=3.8',
    classifiers=[
        "Programming Language :: Python :: 3.8",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
)
@dataclass
class LTSMConfig(PretrainedConfig):
    """Configuration object whose attributes are set from keyword arguments.

    Every key passed in (through the constructor, ``update`` or a JSON file
    via ``load``) is stored as an attribute of the same name.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the parent constructor, then copy each onto ``self``."""
        super().__init__(**kwargs)

        for name in kwargs:
            setattr(self, name, kwargs[name])

    def update(self, **kwargs):
        """Set (or overwrite) one attribute per keyword argument."""
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def load(self, json_file):
        """Read ``json_file`` and set each top-level key as an attribute.

        Args:
            json_file: Path of a JSON file whose top level is an object.

        Returns:
            This instance, so that calls can be chained.
        """
        with open(json_file) as handle:
            loaded = json.load(handle)

        for name in loaded:
            setattr(self, name, loaded[name])

        return self
""" 24 | passed to Planner 25 | 26 | This message contains a text prompt and the filepath to the data file. 27 | """ 28 | description: str 29 | filepath: str -------------------------------------------------------------------------------- /tests/test_scripts/patchtst.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_params": { 3 | "model": "PatchTST", 4 | "model_name_or_path": "gpt2-medium", 5 | "des": "Exp", 6 | "train_epochs": 100, 7 | "patience": 10, 8 | "lradj": "TST", 9 | "pct_start": 0.2, 10 | "freeze": 0, 11 | "itr": 1, 12 | "learning_rate": 1e-3, 13 | "downsample_rate": 20, 14 | "features": "M" 15 | }, 16 | "model_config": { 17 | "pred_len": 96, 18 | "gradient_accumulation_steps": 64, 19 | "e_layers": 3, 20 | "n_heads": 16, 21 | "d_model": 128, 22 | "d_ff": 256, 23 | "dropout": 0.2, 24 | "fc_dropout": 0.2, 25 | "head_dropout": 0, 26 | "seq_len": 336, 27 | "patch_len": 16, 28 | "stride": 8 29 | } 30 | } -------------------------------------------------------------------------------- /tests/test_scripts/train_ltsm_csv.sh: -------------------------------------------------------------------------------- 1 | nohup bash -c ' 2 | data_paths="../../datasets/ETT-small/ETTh1.csv 3 | ../../datasets/ETT-small/ETTh2.csv 4 | ../../datasets/ETT-small/ETTm1.csv 5 | ../../datasets/ETT-small/ETTm2.csv 6 | ../../datasets/electricity/electricity.csv 7 | ../../datasets/traffic/traffic.csv 8 | ../../datasets/exchange_rate/exchange_rate.csv 9 | ../../datasets/weather/weather.csv" 10 | 11 | declare -a pred_len=(96 192 336 720) 12 | 13 | for index in "${!pred_len[@]}"; 14 | do 15 | CUDA_VISIBLE_DEVICES=0,1,2,3 python3 main_ltsm.py \ 16 | --config "ltsm.json" \ 17 | --data_path ${data_paths} \ 18 | --test_data_path_list ${data_paths} \ 19 | --pred_len ${pred_len[$index]} \ 20 | --output_dir output/ltsm_lr1e-3_loraFalse_down20_freeze0_e2_pred${pred_len[$index]}/, 21 | done 22 | ' > output.log 2>&1 & 23 | echo $! 
"""Registry of data readers, keyed by each reader's ``module_id``."""
from ltsm.data_reader.monash_reader import MonashReader
from ltsm.data_reader.csv_reader import CSVReader

# Registry mapping each reader's ``module_id`` to the registered object.
reader_dict = {}


def register_reader(module):
    """
    Registers a reader into the reader dictionary.

    The reader is stored under its own ``module_id`` attribute; there is no
    separate name argument.

    Args:
        module: A Python module or class that implements a BaseReader and
            exposes a ``module_id`` attribute.

    Raises:
        AssertionError: If a reader with the same ``module_id`` is already registered
    """
    assert module.module_id not in reader_dict, f"Reader {module.module_id} already registered"
    reader_dict[module.module_id] = module


register_reader(MonashReader)
register_reader(CSVReader)

# ``__all__`` must be a sequence of *names* (strings). The previous set of
# objects made ``from ltsm.data_reader import *`` fail with a TypeError.
__all__ = [
    "register_reader",
    "MonashReader",
    "CSVReader",
]
# code from https://github.com/yuqinie98/PatchTST, with minor modifications
class TriangularCausalMask():
    """Boolean mask of shape ``[B, 1, L, L]`` that is True strictly above the diagonal."""

    def __init__(self, B, L, device="cpu"):
        with torch.no_grad():
            all_true = torch.ones([B, 1, L, L], dtype=torch.bool)
            strictly_upper = torch.triu(all_true, diagonal=1)
            self._mask = strictly_upper.to(device)

    @property
    def mask(self):
        """The mask tensor built in the constructor."""
        return self._mask


class ProbMask():
    """Rows of a strictly-upper-triangular mask, gathered at the positions in ``index``.

    An ``L x scores.shape[-1]`` boolean matrix (True strictly above the
    diagonal) is broadcast over ``B`` and ``H``. For every batch/head pair the
    rows listed in ``index`` are picked out, and the result is reshaped to
    ``scores.shape``.
    """

    def __init__(self, B, H, L, index, scores, device="cpu"):
        key_len = scores.shape[-1]
        causal = torch.ones(L, key_len, dtype=torch.bool).to(device).triu(1)
        # Add leading batch/head axes, then broadcast without copying.
        broadcast = causal.unsqueeze(0).unsqueeze(0).expand(B, H, L, key_len)
        batch_ids = torch.arange(B).view(B, 1, 1)
        head_ids = torch.arange(H).view(1, H, 1)
        # presumably index is (B, H, n_selected) - TODO confirm against caller
        picked = broadcast[batch_ids, head_ids, index, :].to(device)
        self._mask = picked.view(scores.shape).to(device)

    @property
    def mask(self):
        """The mask tensor built in the constructor."""
        return self._mask
Dataset_Custom_List, 28 | Dataset_Custom_List_TS, 29 | Dataset_Custom_List_TS_TSF, 30 | SplitterByTimestamp, 31 | TSDataset, 32 | TSPromptDataset, 33 | TSTokenDataset, 34 | prompt_generate_split, 35 | prompt_normalization_split 36 | } -------------------------------------------------------------------------------- /tests/test_scripts/train_dlinear_csv.sh: -------------------------------------------------------------------------------- 1 | nohup bash -c ' 2 | declare -a data_paths=( 3 | "../../datasets/ETT-small/ETTh1.csv" 4 | "../../datasets/ETT-small/ETTh2.csv" 5 | "../../datasets/ETT-small/ETTm1.csv" 6 | "../../datasets/ETT-small/ETTm2.csv" 7 | "../../datasets/electricity/electricity.csv" 8 | "../../datasets/traffic/traffic.csv" 9 | "../../datasets/exchange_rate/exchange_rate.csv" 10 | "../../datasets/weather/weather.csv" 11 | ) 12 | 13 | declare -a data=( 14 | "ETTh1" 15 | "ETTh2" 16 | "ETTm1" 17 | "ETTm2" 18 | "custom" 19 | "custom" 20 | "custom" 21 | "custom" 22 | ) 23 | 24 | declare -a features=(7 7 7 7 321 862 8 21) 25 | 26 | declare -a batch_sizes=(128 128 128 128 32 24 128 128) 27 | 28 | for index in "${!data_paths[@]}"; 29 | do 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 python3 main_ltsm.py \ 31 | --config "dlinear.json" --data_path ${data_paths[$index]} \ 32 | --data ${data[$index]} \ 33 | --enc_in ${features[$index]} \ 34 | --batch_size ${batch_sizes[$index]} 35 | done 36 | ' > output.log 2>&1 & 37 | echo $! 
> save_pid.txt -------------------------------------------------------------------------------- /tests/test_scripts/train_patchtst_csv.sh: -------------------------------------------------------------------------------- 1 | nohup bash -c ' 2 | declare -a data_paths=( 3 | "../../datasets/ETT-small/ETTh1.csv" 4 | "../../datasets/ETT-small/ETTh2.csv" 5 | "../../datasets/ETT-small/ETTm1.csv" 6 | "../../datasets/ETT-small/ETTm2.csv" 7 | "../../datasets/electricity/electricity.csv" 8 | "../../datasets/traffic/traffic.csv" 9 | "../../datasets/exchange_rate/exchange_rate.csv" 10 | "../../datasets/weather/weather.csv" 11 | ) 12 | 13 | declare -a data=( 14 | "ETTh1" 15 | "ETTh2" 16 | "ETTm1" 17 | "ETTm2" 18 | "custom" 19 | "custom" 20 | "custom" 21 | "custom" 22 | ) 23 | 24 | declare -a features=(7 7 7 7 321 862 8 21) 25 | 26 | declare -a batch_sizes=(128 128 128 128 32 24 128 128) 27 | 28 | for index in "${!data_paths[@]}"; 29 | do 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 python3 main_ltsm.py \ 31 | --config "patchtst.json" \ 32 | --data_path ${data_paths[$index]} \ 33 | --data ${data[$index]} \ 34 | --enc_in ${features[$index]} \ 35 | --batch_size ${batch_sizes[$index]} 36 | done 37 | ' > output.log 2>&1 & 38 | echo $! 
def pairwise_dtw(x_batch, y_batch):
    """Compute pairwise DTW distances between two batches of sequences.

    Each sequence of ``x_batch`` is compared with each sequence of ``y_batch``
    using ``fastdtw`` with a Euclidean point distance.

    Args:
        x_batch: Tensor, [ Batchsize_x, Time, Dimension_x ]
        y_batch: Tensor, [ Batchsize_y, Time, Dimension_y ]

        The input tensors should have Dimension_x == Dimension_y.

    Returns:
        Pair-wise distance, CPU Tensor, [ Batchsize_x, Batchsize_y ]; entry
        ``(i, j)`` is the distance between ``x_batch[i]`` and ``y_batch[j]``.
    """
    batchsize_x = x_batch.shape[0]
    batchsize_y = y_batch.shape[0]
    dist_matrix = torch.zeros((batchsize_x, batchsize_y), device=torch.device("cpu"))
    for idx1, x in enumerate(x_batch):
        for idx2, y in enumerate(y_batch):
            if x_batch is y_batch and dist_matrix[idx2, idx1] > 0:
                # Same batch on both sides: the matrix is symmetric, so reuse
                # the mirrored entry if it has already been computed.
                dist_matrix[idx1, idx2] = dist_matrix[idx2, idx1]
            else:
                distance_xy, _ = fastdtw(x, y, dist=euclidean)
                dist_matrix[idx1, idx2] = distance_xy

    # The function previously fell off the end and returned None.
    return dist_matrix
128 128 128 32 24 128 128) 27 | 28 | for index in "${!data_paths[@]}"; 29 | do 30 | CUDA_VISIBLE_DEVICES=0,1,2,3 python3 main_ltsm.py \ 31 | --config "informer.json" \ 32 | --data_path ${data_paths[$index]} \ 33 | --data ${data[$index]} \ 34 | --enc_in ${features[$index]} \ 35 | --dec_in ${features[$index]} \ 36 | --c_out ${features[$index]} \ 37 | --batch_size ${batch_sizes[$index]} 38 | done 39 | ' > output.log 2>&1 & 40 | echo $! > save_pid.txt -------------------------------------------------------------------------------- /tests/test_scripts/informer.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_params": { 3 | "model": "Informer", 4 | "model_name_or_path": "gpt2-medium", 5 | "des": "Exp", 6 | "train_epochs": 100, 7 | "patience": 10, 8 | "lradj": "TST", 9 | "pct_start": 0.2, 10 | "freeze": 0, 11 | "itr": 1, 12 | "learning_rate": 1e-3, 13 | "downsample_rate": 20, 14 | "output_dir": "output/patchtst/", 15 | "eval": 0, 16 | "padding_patch": "end", 17 | "affine": 0, 18 | "subtract_last": 0, 19 | "decomposition": 0, 20 | "kernel_size": 25, 21 | "individual": 0, 22 | "embed": "timeF", 23 | "factor": 1, 24 | "features": "M", 25 | "local_pretrain": "None" 26 | }, 27 | "model_config":{ 28 | "pred_len": 96, 29 | "gradient_accumulation_steps": 512, 30 | "e_layers": 3, 31 | "d_layers": 1, 32 | "n_heads": 16, 33 | "d_model": 128, 34 | "d_ff": 256, 35 | "dropout": 0.2, 36 | "seq_len": 336, 37 | "activation": "gelu", 38 | "output_attention": 0, 39 | "embed_type": 0, 40 | "distil": 1 41 | } 42 | } -------------------------------------------------------------------------------- /tests/test_scripts/train_ltsm_tokenizer_csv.sh: -------------------------------------------------------------------------------- 1 | nohup bash -c ' 2 | TRAIN=" 3 | ../../datasets/exchange_rate/exchange_rate.csv 4 | ../../datasets/illness/national_illness.csv" 5 | 6 | TEST=" 7 | ../../datasets/exchange_rate/exchange_rate.csv 8 | 
def RSE(pred, true):
    """Root relative squared error: error norm over the norm of ``true`` minus its mean."""
    return np.sqrt(np.sum((true - pred) ** 2)) / np.sqrt(np.sum((true - true.mean()) ** 2))


def CORR(pred, true):
    """Pearson correlation along axis 0, averaged over the last remaining axis.

    The denominator is ``sqrt(sum(a**2) * sum(b**2))`` for the centered
    series ``a`` and ``b``. The earlier form summed the product of squares,
    which is not the correlation coefficient (``CORR(x, x)`` was not 1).
    """
    true_centered = true - true.mean(0)
    pred_centered = pred - pred.mean(0)
    u = (true_centered * pred_centered).sum(0)
    d = np.sqrt((true_centered ** 2).sum(0) * (pred_centered ** 2).sum(0))
    return (u / d).mean(-1)


def MAE(pred, true):
    """Mean absolute error."""
    return np.mean(np.abs(pred - true))


def MSE(pred, true):
    """Mean squared error."""
    return np.mean((pred - true) ** 2)


def RMSE(pred, true):
    """Root mean squared error."""
    return np.sqrt(MSE(pred, true))


def MAPE(pred, true):
    """Mean absolute percentage error (in percent); 1e-8 is added to ``true`` in the divisor."""
    return np.mean(np.abs(100 * (pred - true) / (true + 1e-8)))


def MSPE(pred, true):
    """Mean squared relative error; 1e-8 is added to ``true`` in the divisor."""
    return np.mean(np.square((pred - true) / (true + 1e-8)))


def SMAPE(pred, true):
    """Symmetric mean absolute percentage error (in percent)."""
    # Absolute values in the denominator keep it non-negative when pred and
    # true have opposite signs; 1e-8 guards against a zero denominator.
    return np.mean(200 * np.abs(pred - true) / (np.abs(pred) + np.abs(true) + 1e-8))


def ND(pred, true):
    """Normalized deviation: mean absolute error over mean absolute ``true``."""
    return np.mean(np.abs(true - pred)) / np.mean(np.abs(true))


def metric(pred, true):
    """Return ``(mae, mse, rmse, mape, mspe, smape, nd)`` for ``pred`` against ``true``."""
    mae = MAE(pred, true)
    mse = MSE(pred, true)
    rmse = RMSE(pred, true)
    mape = MAPE(pred, true)
    mspe = MSPE(pred, true)
    smape = SMAPE(pred, true)
    nd = ND(pred, true)

    return mae, mse, rmse, mape, mspe, smape, nd
Then run the following command for normalizing the prompts: 26 | ````angular2html 27 | python3 prompt_normalization_split.py --mode fit 28 | ```` 29 | 30 | **Step 4.** Run this command to export the prompts to the "./prompt_data_normalize_split" folder: 31 | ````angular2html 32 | python3 prompt_normalization_split.py --mode transform 33 | ```` -------------------------------------------------------------------------------- /tests/test_scripts/train_anomaly_main_ltsm.sh: -------------------------------------------------------------------------------- 1 | CONFIG_PATH="./anomaly_config/config.json" 2 | 3 | CUDA_VISIBLE_DEVICES=6,7 python3 anomaly_main_ltsm.py \ 4 | --config_path ${CONFIG_PATH} 5 | # #TRAIN="../../datasets/creditcard/creditcard.csv" 6 | # TRAIN="../../datasets/water_quality/water_quality.csv" 7 | # #TRAIN="../../datasets/multi-Synthetic/0.csv" 8 | 9 | 10 | # #TEST="../../datasets/creditcard/creditcard.csv" 11 | # TEST="../../datasets/water_quality/water_quality.csv" 12 | # #TEST="../../datasets/multi-Synthetic/0.csv" 13 | 14 | # PROMPT="../../prompt_bank/stat-prompt/prompt_data_normalize_split" 15 | 16 | # epoch=4 17 | # downsample_rate=20 18 | # freeze=0 19 | # lr=1e-7 20 | 21 | 22 | # for seq_len in 133 23 | # do 24 | # OUTPUT_PATH="output/ltsm_lr${lr}_loraFalse_down${downsample_rate}_freeze${freeze}_e${epoch}_pred${pred_len}_water_quality/" 25 | # echo "Current OUTPUT_PATH: ${OUTPUT_PATH}" 26 | # CUDA_VISIBLE_DEVICES=6,7 python3 anomaly_main_ltsm.py \ 27 | # --model LTSM \ 28 | # --model_name_or_path gpt2-medium \ 29 | # --train_epochs ${epoch} \ 30 | # --batch_size 100 \ 31 | # --seq_len ${seq_len} \ 32 | # --gradient_accumulation_steps 64 \ 33 | # --data_path ${TRAIN} \ 34 | # --test_data_path_list ${TEST} \ 35 | # --prompt_data_path ${PROMPT} \ 36 | # --freeze ${freeze} \ 37 | # --learning_rate ${lr} \ 38 | # --downsample_rate ${downsample_rate} \ 39 | # --output_dir ${OUTPUT_PATH}\ 40 | # --eval 0 41 | # done 
-------------------------------------------------------------------------------- /tests/test_scripts/anomaly_config/config-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_id": "test_run", 3 | "model_name_or_path": "gpt2-medium", 4 | "seed": 2024, 5 | "device": "cuda:0", 6 | "checkpoints": "./checkpoints/", 7 | "data_path": ["../../datasets/creditcard/creditcard.csv"], 8 | "test_data_path_list": ["../../datasets/creditcard/creditcard.csv"], 9 | "prompt_data_path": "../../prompt_bank/stat-prompt/prompt_data_normalize_split", 10 | "data_processing": "standard_scaler", 11 | "learning_rate": 1e-4, 12 | "batch_size": 8, 13 | "num_workers": 10, 14 | "train_epochs": 1, 15 | "train_ratio": 0.7, 16 | "val_ratio": 0.1, 17 | "do_anomaly": true, 18 | "seq_len": 113, 19 | "pred_len": 113, 20 | "prompt_len": 133, 21 | "lora": false, 22 | "lora_dim": 128, 23 | "gpt_layers": 3, 24 | "d_model": 1024, 25 | "n_heads": 16, 26 | "d_ff": 512, 27 | "dropout": 0.2, 28 | "enc_in": 1, 29 | "c_out": 862, 30 | "patch_size": 16, 31 | "pretrain": 1, 32 | "local_pretrain": "None", 33 | "freeze": 0, 34 | "model": "LTSM", 35 | "stride": 8, 36 | "tmax": 10, 37 | "eval": 0, 38 | "itr": 1, 39 | "output_dir_template": "output/ltsm_lr{learning_rate}_loraFalse_down{downsample_rate}_freeze{freeze}_e{train_epochs}_pred{pred_len}_creditcard_113_check_bsize=8/", 40 | "downsample_rate": 20, 41 | "llm_layers": 32, 42 | "decay_fac": 0.75, 43 | "lradj": "type1", 44 | "patience": 3, 45 | "gradient_accumulation_steps": 64 46 | } 47 | -------------------------------------------------------------------------------- /tests/test_scripts/train_ltsm_textprompt_csv.sh: -------------------------------------------------------------------------------- 1 | TRAIN="datasets/ETT-small/ETTh1.csv 2 | datasets/ETT-small/ETTh2.csv 3 | datasets/ETT-small/ETTm1.csv 4 | datasets/ETT-small/ETTm2.csv 5 | datasets/electricity/electricity.csv 6 | 
datasets/exchange_rate/exchange_rate.csv 7 | datasets/traffic/traffic.csv 8 | datasets/weather/weather.csv" 9 | 10 | TEST="datasets/ETT-small/ETTh1.csv 11 | datasets/ETT-small/ETTh2.csv 12 | datasets/ETT-small/ETTm1.csv 13 | datasets/ETT-small/ETTm2.csv 14 | datasets/electricity/electricity.csv 15 | datasets/exchange_rate/exchange_rate.csv 16 | datasets/traffic/traffic.csv 17 | datasets/weather/weather.csv" 18 | 19 | PROMPT="prompt_bank/text_prompt_data_csv/csv_prompt.json" 20 | epoch=1000 21 | downsample_rate=20 22 | freeze=0 23 | lr=1e-3 24 | 25 | 26 | for pred_len in 96 192 336 720 27 | do 28 | OUTPUT_PATH="output/ltsm_textprompt_lr${lr}_loraFalse_down${downsample_rate}_freeze${freeze}_e${epoch}_pred${pred_len}/" 29 | CUDA_VISIBLE_DEVICES=0,1,2,3 python3 main_ltsm.py \ 30 | --model LTSM_WordPrompt \ 31 | --model_name_or_path gpt2-medium \ 32 | --train_epochs ${epoch} \ 33 | --batch_size 10 \ 34 | --pred_len ${pred_len} \ 35 | --gradient_accumulation_steps 64 \ 36 | --data_path ${TRAIN} \ 37 | --test_data_path_list ${TEST} \ 38 | --prompt_data_path ${PROMPT} \ 39 | --freeze ${freeze} \ 40 | --learning_rate ${lr} \ 41 | --downsample_rate ${downsample_rate} \ 42 | --output_dir ${OUTPUT_PATH} \ 43 | --eval 0 44 | done 45 | -------------------------------------------------------------------------------- /tests/test_scripts/anomaly_config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_id": "test_run", 3 | "model_name_or_path": "gpt2-medium", 4 | "seed": 2024, 5 | "device": "cuda:0", 6 | "checkpoints": "./checkpoints/", 7 | "data_path": ["../../datasets/water_quality/water_quality.csv"], 8 | "test_data_path_list": ["../../datasets/water_quality/water_quality.csv"], 9 | "prompt_data_path": "../../prompt_bank/stat-prompt/prompt_data_normalize_split", 10 | "data_processing": "standard_scaler", 11 | "learning_rate": 2e-5, 12 | "batch_size": 8, 13 | "num_workers": 10, 14 | "train_epochs": 4, 15 | "train_ratio": 
0.7, 16 | "val_ratio": 0.1, 17 | "do_anomaly": true, 18 | "seq_len": 133, 19 | "pred_len": 133, 20 | "prompt_len": 133, 21 | "lora": false, 22 | "lora_dim": 128, 23 | "gpt_layers": 1, 24 | "d_model": 1024, 25 | "n_heads": 16, 26 | "d_ff": 512, 27 | "dropout": 0.2, 28 | "enc_in": 1, 29 | "c_out": 862, 30 | "patch_size": 16, 31 | "pretrain": 1, 32 | "local_pretrain": "None", 33 | "freeze": 0, 34 | "model": "LTSM", 35 | "stride": 8, 36 | "tmax": 10, 37 | "eval": 0, 38 | "itr": 1, 39 | "output_dir_template": "output/ltsm_lr{learning_rate}_loraFalse_down{downsample_rate}_freeze{freeze}_e{train_epochs}_pred{pred_len}_113_check_bsize=8_grad_accumulate=16_layer=1", 40 | "downsample_rate": 20, 41 | "llm_layers": 32, 42 | "decay_fac": 0.75, 43 | "lradj": "type1", 44 | "patience": 3, 45 | "gradient_accumulation_steps": 16 46 | } 47 | -------------------------------------------------------------------------------- /tests/data_provider/data_splitter_test.py: -------------------------------------------------------------------------------- 1 | from ltsm.data_provider.data_splitter import SplitterByTimestamp 2 | import pandas as pd 3 | import numpy as np 4 | import pytest 5 | import math 6 | 7 | def test_splitter_by_timestamp_get_csv_splits(): 8 | indices = ["cosine", "linear", "exponential"] 9 | test_df = pd.DataFrame([[math.cos(i) for i in range(100)], 10 | [2*i for i in range(100)], 11 | [math.exp(i) for i in range(100)]], 12 | index=indices) 13 | splitter = SplitterByTimestamp(seq_len=5, 14 | pred_len=1, 15 | train_ratio=0.7, 16 | val_ratio=0.1) 17 | train, val, test, buff = splitter.get_csv_splits(test_df) 18 | assert len(train) == 3 19 | assert len(val) == 3 20 | assert len(test) == 3 21 | assert len(buff) == 3 22 | assert buff == indices 23 | for i in range(3): 24 | assert len(train[i]) == 70 25 | assert len(val[i]) == 15 26 | assert len(test[i]) == 25 27 | 28 | def test_splitter_by_timestamp_get_csv_splits_invalid_ndim(): 29 | test_df = pd.DataFrame([np.array([1, 2, 3]), 
np.array([[4, 5], [6, 7], [8, 9]])]) 30 | splitter = SplitterByTimestamp(seq_len=5, 31 | pred_len=1, 32 | train_ratio=0.7, 33 | val_ratio=0.1) 34 | with pytest.raises(ValueError): 35 | train, val, test, buff = splitter.get_csv_splits(test_df) -------------------------------------------------------------------------------- /tests/test_scripts/test_ltsm.sh: -------------------------------------------------------------------------------- 1 | TRAIN=" 2 | all_six_datasets/ETT-small/ETTh1.csv 3 | all_six_datasets/ETT-small/ETTh2.csv 4 | all_six_datasets/ETT-small/ETTm1.csv 5 | all_six_datasets/ETT-small/ETTm2.csv 6 | all_six_datasets/electricity/electricity.csv 7 | all_six_datasets/exchange_rate/exchange_rate.csv 8 | all_six_datasets/traffic/traffic.csv 9 | all_six_datasets/weather/weather.csv" 10 | 11 | 12 | TEST=" 13 | all_six_datasets/ETT-small/ETTh1.csv 14 | all_six_datasets/ETT-small/ETTh2.csv 15 | all_six_datasets/ETT-small/ETTm1.csv 16 | all_six_datasets/ETT-small/ETTm2.csv 17 | all_six_datasets/electricity/electricity.csv 18 | all_six_datasets/exchange_rate/exchange_rate.csv 19 | all_six_datasets/traffic/traffic.csv 20 | all_six_datasets/weather/weather.csv" 21 | 22 | PROMPT="prompt_bank/prompt_data_normalize_csv_split" 23 | epoch=500 24 | downsample_rate=20 25 | freeze=0 26 | lr=1e-3 27 | 28 | 29 | for pred_len in 96 30 | do 31 | 32 | CUDA_VISIBLE_DEVICES=0,1 python3 main_ltsm.py \ 33 | --model LTSM \ 34 | --model_name_or_path gpt2-medium \ 35 | --local_pretrain LSC2204/LTSM-bundle \ 36 | --train_epochs ${epoch} \ 37 | --batch_size 800 \ 38 | --pred_len ${pred_len} \ 39 | --gradient_accumulation_steps 64 \ 40 | --data_path ${TRAIN} \ 41 | --test_data_path_list ${TEST} \ 42 | --prompt_data_path ${PROMPT} \ 43 | --freeze ${freeze} \ 44 | --learning_rate ${lr} \ 45 | --downsample_rate ${downsample_rate} \ 46 | --output_dir "output/ltsm_csv_medium_lr${lr}_loraFalse_down${downsample_rate}_freeze${freeze}_e${epoch}_pred${pred_len}/"\ 47 | --eval 1 48 | done 49 | 
def run():
    """Train a tokenizer-based LTSM model from command-line arguments.

    Parses the arguments, seeds the random number generators, builds the
    model, optionally wraps it with LoRA adapters or freezes its parameters,
    and hands the model, optimizer and scheduler to the training pipeline.
    """
    config = tokenizer_get_args()
    tokenizer_seed_all(config.seed)

    # get_model() requires the registered model name as its second argument.
    # NOTE(review): assumes the parsed arguments expose the `--model` flag as
    # `config.model` -- confirm against tokenizer_get_args().
    model = get_model(config, config.model)

    if config.lora:
        peft_config = LoraConfig(
            target_modules=["c_attn"],  # ["q", "v"],
            inference_mode=False,
            r=config.lora_dim,
            lora_alpha=32,
            lora_dropout=0.1
        )
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

    elif config.freeze:
        freeze_parameters(model)

    print_trainable_parameters(model)

    model_optim = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(model_optim, T_max=config.tmax, eta_min=1e-8)

    pipeline = TokenizerTrainingPipeline(config, model, model_optim, lr_scheduler)

    pipeline.run()
# LoRA fine-tuning sweep over prediction length, learning rate and LoRA rank
# on the CSV benchmark datasets.
TRAIN="datasets/ETT-small/ETTh1.csv
    datasets/ETT-small/ETTh2.csv
    datasets/ETT-small/ETTm1.csv
    datasets/ETT-small/ETTm2.csv
    datasets/electricity/electricity.csv
    datasets/exchange_rate/exchange_rate.csv
    datasets/traffic/traffic.csv
    datasets/weather/weather.csv"

TEST="datasets/ETT-small/ETTh1.csv
    datasets/ETT-small/ETTh2.csv
    datasets/ETT-small/ETTm1.csv
    datasets/ETT-small/ETTm2.csv
    datasets/electricity/electricity.csv
    datasets/exchange_rate/exchange_rate.csv
    datasets/traffic/traffic.csv
    datasets/weather/weather.csv"

PROMPT="prompt_bank/prompt_data_normalize_csv_split"

epoch=500
downsample_rate=20
freeze=0

for pred_len in 96 192 336 720
do
    for lr in 1e-3
    do
        for lora_dim in 32 64
        do
            # Build the output path inside the loops: it depends on the loop
            # variables, and each (pred_len, lr, lora_dim) run needs its own
            # directory so results are not overwritten.
            OUTPUT_PATH="output/test_ltsm_lr${lr}_lora${lora_dim}_down${downsample_rate}_freeze${freeze}_e${epoch}_pred${pred_len}/"
            # TRAIN and TEST are intentionally unquoted so that each path is
            # passed as a separate argument.
            CUDA_VISIBLE_DEVICES=1,2,3,4,5,6,7 python3 main_ltsm.py \
                --lora \
                --lora_dim ${lora_dim} \
                --model_id test_run \
                --train_epochs ${epoch} \
                --batch_size 800 \
                --pred_len ${pred_len} \
                --gradient_accumulation_steps 64 \
                --data_path ${TRAIN} \
                --test_data_path_list ${TEST} \
                --prompt_data_path ${PROMPT} \
                --freeze ${freeze} \
                --learning_rate ${lr} \
                --downsample_rate ${downsample_rate} \
                --output_dir "${OUTPUT_PATH}"
        done
    done
done
@type_subscription(topic_type="Planner-TS")  # for receiving task from Planner
@type_subscription(topic_type="Redo-TS")  # for receiving TS Feedback
class TSAgent(RoutedAgent):
    """Agent that runs LTSM inference on time-series tasks and publishes the result.

    It subscribes to the "Planner-TS" and "Redo-TS" topics and publishes its
    output on the "TS-Info" topic.
    """

    def __init__(self, name: str, model_client: Optional[ChatCompletionClient] = None):
        """Create the agent.

        Args:
            name: Agent name; used in the description and as the message source.
            model_client: Optional chat-completion client; stored but not used
                by the handler below.
        """
        super().__init__(description=f"{name} with LTSM Package support")
        self.name = name
        self._last_plan: Optional[str] = None
        self._model_client = model_client
        self._last_ts_response: Optional[str] = None  # for evaluation

    @message_handler
    async def handle_TS(self, message: TSMessage, ctx: MessageContext) -> None:
        """Handle the TS info given by the Planner.

        Runs LTSM inference on the file referenced by the message, remembers
        the result for later evaluation, and publishes it as a new TSMessage.
        """
        ts_response = inference(
            file=message.filepath,
            task_type=message.task_type
        )

        # Remember the latest result so get_last_response() can return it;
        # without this assignment the accessor always returned None.
        self._last_ts_response = ts_response

        # publish
        # NOTE(review): the outgoing task_type is fixed to "ts-classification"
        # regardless of the incoming task type -- confirm this is intended.
        await self.publish_message(
            TSMessage(source=self.name,
                      filepath=ts_response,
                      task_type="ts-classification"),
            TopicId(type="TS-Info", source=self.id.key))

    def get_last_response(self) -> Optional[str]:
        """Return the most recent inference result, or None if none was produced yet."""
        return self._last_ts_response
class RevIN(nn.Module):
    """Reversible instance normalization.

    In 'norm' mode the statistics of the input are recorded and the input is
    normalized with them; in 'denorm' mode the recorded statistics are used to
    undo that normalization.
    """

    def __init__(self, num_features: int, eps=1e-5, affine=True, subtract_last=False):
        """
        :param num_features: the number of features or channels
        :param eps: a value added for numerical stability
        :param affine: if True, RevIN has learnable affine parameters
        :param subtract_last: if True, center on the last time step instead of the mean
        """
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.affine = affine
        self.subtract_last = subtract_last
        if affine:
            self._init_params()

    def forward(self, x, mode: str):
        """Normalize (``mode='norm'``) or de-normalize (``mode='denorm'``) ``x``."""
        if mode == 'norm':
            self._get_statistics(x)
            return self._normalize(x)
        if mode == 'denorm':
            return self._denormalize(x)
        raise NotImplementedError

    def _init_params(self):
        """Create the learnable per-feature scale and shift, shape (C,)."""
        self.affine_weight = nn.Parameter(torch.ones(self.num_features))
        self.affine_bias = nn.Parameter(torch.zeros(self.num_features))

    def _get_statistics(self, x):
        """Record the centering value and standard deviation of ``x``."""
        # Reduce over every dimension between the first and the last one.
        reduce_dims = tuple(range(1, x.ndim - 1))
        if self.subtract_last:
            self.last = x[:, -1, :].unsqueeze(1)
        else:
            self.mean = torch.mean(x, dim=reduce_dims, keepdim=True).detach()
        # NOTE(review): the collapsed source does not show nesting; stdev is
        # computed on both paths because _normalize/_denormalize always use it.
        variance = torch.var(x, dim=reduce_dims, keepdim=True, unbiased=False)
        self.stdev = torch.sqrt(variance + self.eps).detach()

    def _normalize(self, x):
        """Center and scale ``x``, then apply the affine transform if enabled."""
        center = self.last if self.subtract_last else self.mean
        x = x - center
        x = x / self.stdev
        if self.affine:
            x = x * self.affine_weight
            x = x + self.affine_bias
        return x

    def _denormalize(self, x):
        """Invert ``_normalize`` using the recorded statistics."""
        if self.affine:
            x = x - self.affine_bias
            # eps*eps guards against division by a zero weight.
            x = x / (self.affine_weight + self.eps * self.eps)
        x = x * self.stdev
        offset = self.last if self.subtract_last else self.mean
        return x + offset
@app.post("/v1/chat/completions")
async def chat(request: ChatRequest):
    """Answer a chat-completion request with text generated by the loaded model.

    Only the last message of the conversation is used as the prompt. The
    response is a dict with `choices` and `usage` fields.
    """
    # for convenience, temporarily just use the last message.
    prompt = format_prompt_llama3(request.messages[-1]["content"])

    input_data = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = input_data["input_ids"].to(model.device)
    attention_mask = input_data["attention_mask"].to(model.device)

    # Sampling needs a strictly positive temperature; a request with
    # temperature 0 is served with greedy decoding instead of failing.
    do_sample = request.temperature > 0
    generation_kwargs = {
        "attention_mask": attention_mask,
        "max_new_tokens": request.max_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if do_sample:
        generation_kwargs["temperature"] = request.temperature

    with torch.no_grad():
        output = model.generate(input_ids, **generation_kwargs)

    # The generated sequence starts with the prompt; keep only what follows it.
    prompt_tokens = input_ids.shape[1]
    generated = output[0][prompt_tokens:]
    response_text = tokenizer.decode(generated, skip_special_tokens=True)

    # Count only newly generated tokens as completion tokens so the prompt is
    # not counted twice in the total.
    completion_tokens = len(generated)

    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1234567890,
        "model": request.model,
        "choices": [
            {
                "message": {"role": "assistant", "content": response_text},
                "finish_reason": "stop",
                "index": 0,
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens,
        }
    }
19:03:04,69.21224676096001,103.19,98.93125,109.73125,100.84,364.13170107648006,9.86,0.6,58.89,26.4,118.365625,118.6046875 13 | """ 14 | ) 15 | 16 | # Expected format of converted data 17 | self.expected_df = pd.DataFrame({ 18 | 0: [0, 1], 19 | 1: [61.712231658240015, 69.21224676096001], 20 | 2: [102.75, 103.19], 21 | 3: [98.340625, 98.93125], 22 | 4: [109.73125, 109.73125], 23 | 5: [100.84, 100.84], 24 | 6: [363.4015032, 364.13170107648006], 25 | 7: [9.9, 9.86], 26 | 8: [0.6, 0.6], 27 | 9: [58.22, 58.89], 28 | 10: [26.27, 26.4], 29 | 11: [118.6609375, 118.365625], 30 | 12: [118.8859375, 118.6046875] 31 | }) 32 | 33 | def test_data_transformation(self): 34 | # Read CSV data as a DataFrame 35 | input_df = pd.read_csv(self.input_csv, parse_dates=['Updated Time']) 36 | 37 | # Execute data conversion function 38 | transformed_df = transform_data(input_df) 39 | 40 | # Verify that the time column has been successfully converted to the 0, 1, 2... format 41 | self.assertTrue((transformed_df.iloc[0, :] == range(len(transformed_df.columns))).all(), 42 | "Time sequence conversion failed.") 43 | 44 | # Verify that the converted data structure meets expectations 45 | pd.testing.assert_frame_equal( 46 | transformed_df.iloc[1:, :].reset_index(drop=True), 47 | self.expected_df.reset_index(drop=True), 48 | check_dtype=False, 49 | err_msg="Data transformation did not produce the expected output." 
if __name__ == "__main__":
    # Two ways to load the configuration: from a JSON file or from command line arguments
    # First method
    config = get_args()

    # Second method
    # train_config = TrainingConfig.load("ltsm.json")

    seed = config.train_params["seed"]
    seed_all(seed)

    if config.train_params["model"] == "Informer":
        # Informer also consumes timestamp features, so it needs its own
        # batching, prediction and loss functions.

        def _model_device(model):
            """Return the device of ``model``, unwrapping a ``.module`` wrapper if present."""
            # Wrapped models (e.g. DataParallel) expose the real model as
            # `.module`; an unwrapped model has no such attribute.
            return getattr(model, "module", model).device

        def collate_fn(batch):
            """Stack a list of sample dicts into float32 tensors, one per key."""
            return {
                'input_data': torch.from_numpy(np.stack([x['input_data'] for x in batch])).type(torch.float32),
                'labels': torch.from_numpy(np.stack([x['labels'] for x in batch])).type(torch.float32),
                'timestamp_input': torch.from_numpy(np.stack([x['timestamp_input'] for x in batch])).type(torch.float32),
                'timestamp_labels': torch.from_numpy(np.stack([x['timestamp_labels'] for x in batch])).type(torch.float32)
            }

        def prediction_step(model, inputs, prediction_loss_only=False, ignore_keys=None):
            """Run a forward pass and return ``(loss, outputs, labels)`` with an MSE loss."""
            device = _model_device(model)
            labels = inputs["labels"].to(device)
            input_data_mark = inputs["timestamp_input"].to(device)
            label_mark = inputs["timestamp_labels"].to(device)
            input_data = inputs["input_data"].to(device)

            outputs = model(input_data, input_data_mark, labels, label_mark)
            loss = nn.functional.mse_loss(outputs, labels)
            return (loss, outputs, labels)

        def compute_loss(model, inputs, return_outputs=False):
            """Return the MSE training loss, plus the outputs when requested."""
            device = _model_device(model)
            input_data_mark = inputs["timestamp_input"].to(device)
            label_mark = inputs["timestamp_labels"].to(device)
            outputs = model(inputs["input_data"], input_data_mark, inputs["labels"], label_mark)

            loss = nn.functional.mse_loss(outputs, inputs["labels"])
            return (loss, outputs) if return_outputs else loss

        pipeline = StatisticalTrainingPipeline(config,
                                               collate_fn=collate_fn,
                                               prediction_step=prediction_step,
                                               compute_loss=compute_loss)
    else:
        pipeline = StatisticalTrainingPipeline(config)

    pipeline.run()
To explore the granularity on the Long sequence time-series forecasting (LSTF) problem, different subsets are created, {ETTh1, ETTh2} for 1-hour-level and ETTm1 for 15-minutes-level. Each data point consists of the target value ”oil temperature” and 6 power load features. The train/val/test is 12/4/4 months.", 5 | "3": "The Electricity Transformer Temperature (ETT) is a crucial indicator in the electric power long-term deployment. This dataset consists of 2 years data from two separated counties in China. To explore the granularity on the Long sequence time-series forecasting (LSTF) problem, different subsets are created, {ETTh1, ETTh2} for 1-hour-level and ETTm1 for 15-minutes-level. Each data point consists of the target value ”oil temperature” and 6 power load features. The train/val/test is 12/4/4 months.", 6 | "4": "Electricity contains electircity consumption of 321 clients from 2012 to 2014. And the data was converted to reflect hourly consumption.", 7 | "5": "Exchange rate is a collection of the daily exchange rates of eight foreign countries ranging from 1990 to 2016.", 8 | "6": "Traffic is a collection of hourly data from California Department of Transportation, which describes the road occupancy rates measured by different sensors on San Francisco Bay area freeways.", 9 | "7": "Weather is recorded every 10 minutes for the 2020 whole year, which contains 21 meteorological indicators, such as air temperature, humidity, etc." 
10 | } -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | checkpoints/ 2 | dataset/ 3 | TSForecasting/ 4 | *.swp 5 | output/ 6 | .idea/ 7 | 8 | *.pub 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | pip-wheel-metadata/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
def register_model(module, module_name: str):
    """
    Registers a PreTrainedModel class in the module-level ``model_dict`` registry.

    Args:
        module: The model class (or module) to store under ``module_name``.
        module_name (str): The key name for the module in the model dictionary.

    Raises:
        AssertionError: If a model with the same name is already registered.
    """
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # stripped under ``python -O``, which would let a duplicate registration
    # silently overwrite the existing entry. The exception type callers see
    # is unchanged.
    if module_name in model_dict:
        raise AssertionError(f"Model {module_name} already registered")
    model_dict[module_name] = module
# Public API of the package. ``__all__`` must be a sequence of *name strings*;
# listing the objects themselves (and in a set) makes
# ``from ltsm.models import *`` fail with a TypeError.
__all__ = [
    "register_model",
    "get_model",
    "PatchTST",
    "DLinear",
    "Informer",
    "LTSM",
    "LTSM_WordPrompt",
    "LTSM_Tokenizer",
]
def progress_bar_terminal(iteration, total, prefix="", suffix="", decimals=0, length=100, fill="█", printend="\r"):
    """Print one frame of a text progress bar; call it once per loop step.

    Parameters
    ----------
    iteration: int
        current iteration
    total: int
        total iterations
    prefix: str
        text printed before the bar
    suffix: str
        text printed after the percentage
    decimals: int
        number of decimals shown in the percentage
    length: int
        width of the bar in characters
    fill: str
        character used for the completed part of the bar
    printend: str
        end character passed to ``print`` (e.g. "\r", "\r\n")
    """
    # Percentage text first, then the number of filled cells (same evaluation
    # order as before, so a zero ``total`` fails in the same place).
    pct_text = "{0:.{1}f}".format(100 * (iteration / float(total)), decimals)
    done_cells = int(length * iteration // total)
    todo_cells = length - done_cells

    frame = "\r{} |{}| {}% {}".format(prefix, fill * done_cells + "-" * todo_cells, pct_text, suffix)
    print(frame, end=printend)

    # Move to a fresh line once the final iteration has been drawn.
    if iteration == total:
        print()

56 | Progress: {result}% Complete 57 |

def display_progress_bar(iteration, total, out):
    """Displays progress bar according to python interface.

    The notebook renderer is used only when the active IPython shell class is
    one of the recognised notebook shells AND a display handle was supplied;
    in every other case the terminal bar is printed.

    Parameters
    ----------
    iteration: int
        current (zero-based) iteration; ``iteration + 1`` is what gets drawn
    total: int
        total iterations
    out: progress bar notebook output
        object whose ``update`` method receives the rendered notebook bar,
        or None when no notebook output is available

    """
    shell_name = get_ipython().__class__.__name__
    in_notebook_shell = shell_name in ("ZMQInteractiveShell", "Shell")

    # The original condition read ``A or B and out is not None``. Because
    # ``and`` binds tighter than ``or`` this was ``A or (B and ...)``, so a
    # ZMQ shell with ``out=None`` crashed on ``out.update``. The ``out`` check
    # now guards both shell kinds.
    if in_notebook_shell and out is not None:
        out.update(progress_bar_notebook(iteration + 1, total))
    else:
        progress_bar_terminal(iteration + 1, total, prefix="Progress:", suffix="Complete", length=50)
= { 37 | "pred_len": 96, 38 | "enc_in": 1, 39 | "seq_len": 336, # Equal to the sequence length + the length of prompt 40 | "individual": 0, 41 | "embed": "timeF" 42 | } 43 | dlinear_config = DLinearConfig(**config) 44 | 45 | return TrainingConfig(model_config=dlinear_config, **train_params) 46 | 47 | def test_model_initialization(config): 48 | print(config.train_params["model"]) 49 | model = get_model(config.model_config, model_name=config.train_params["model"], local_pretrain=config.train_params["local_pretrain"]) 50 | assert model is not None 51 | assert isinstance(model, PreTrainedModel) 52 | 53 | 54 | def test_parameter_count(config): 55 | model =get_model(config.model_config, model_name=config.train_params["model"], local_pretrain=config.train_params["local_pretrain"]) 56 | param_count = sum([p.numel() for p in model.parameters() if p.requires_grad]) 57 | 58 | expected_param_count = 2*(config.model_config.seq_len*config.model_config.pred_len + config.model_config.pred_len) 59 | 60 | assert param_count == expected_param_count 61 | 62 | def test_forward_output_shape(config): 63 | torch.set_default_dtype(torch.float64) 64 | model = get_model(config.model_config, model_name=config.train_params["model"], local_pretrain=config.train_params["local_pretrain"]) 65 | batch_size = 32 66 | channel = 16 67 | input_length = config.model_config.seq_len 68 | input = torch.tensor(np.zeros((batch_size, input_length, channel))) 69 | output = model(input) 70 | assert output.size() == torch.Size([batch_size, config.model_config.pred_len, channel]) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.32.0 2 | aiofiles==24.1.0 3 | aiohappyeyeballs==2.4.4 4 | aiohttp==3.11.10 5 | aiosignal==1.3.2 6 | annotated-types==0.7.0 7 | anyio==4.6.2 8 | asttokens==3.0.0 9 | async-timeout==5.0.1 10 | attrs==24.3.0 11 | autogen-agentchat==0.5.1 12 | 
autogen-core==0.5.1 13 | autogen-ext==0.5.1 14 | certifi==2024.8.30 15 | charset-normalizer==3.1.0 16 | click==8.1.8 17 | cmake==3.26.3 18 | contourpy==1.0.7 19 | coverage==7.8.0 20 | cycler==0.11.0 21 | decorator==5.2.1 22 | Deprecated==1.2.18 23 | distro==1.9.0 24 | einops==0.6.0 25 | exceptiongroup==1.2.2 26 | executing==2.2.0 27 | fastapi==0.112.2 28 | filelock==3.12.0 29 | fonttools==4.39.3 30 | frozenlist==1.5.0 31 | fsspec==2025.3.2 32 | h11==0.14.0 33 | httpcore==1.0.8 34 | httpx==0.28.1 35 | huggingface-hub==0.30.2 36 | idna==3.4 37 | importlib-resources==5.12.0 38 | importlib_metadata==8.4.0 39 | iniconfig==2.0.0 40 | ipdb==0.13.13 41 | ipython==8.35.0 42 | jedi==0.19.2 43 | Jinja2==3.1.2 44 | jiter==0.9.0 45 | joblib==1.2.0 46 | jsonref==1.1.0 47 | kiwisolver==1.4.4 48 | lit==16.0.1 49 | MarkupSafe==2.1.2 50 | matplotlib==3.7.1 51 | matplotlib-inline==0.1.7 52 | mpmath==1.3.0 53 | multidict==6.4.3 54 | networkx==3.1 55 | numpy==1.24.2 56 | nvidia-cublas-cu11==11.10.3.66 57 | nvidia-cuda-cupti-cu11==11.7.101 58 | nvidia-cuda-nvrtc-cu11==11.7.99 59 | nvidia-cuda-runtime-cu11==11.7.99 60 | nvidia-cudnn-cu11==8.5.0.96 61 | nvidia-cufft-cu11==10.9.0.58 62 | nvidia-curand-cu11==10.2.10.91 63 | nvidia-cusolver-cu11==11.4.0.1 64 | nvidia-cusparse-cu11==11.7.4.91 65 | nvidia-nccl-cu11==2.14.3 66 | nvidia-nvtx-cu11==11.7.91 67 | openai==1.73.0 68 | opentelemetry-api==1.32.0 69 | packaging==23.1 70 | pandas==2.0.0 71 | parso==0.8.4 72 | peft==0.10.0 73 | pexpect==4.9.0 74 | pillow==11.1.0 75 | pluggy==1.5.0 76 | prompt_toolkit==3.0.50 77 | propcache==0.3.1 78 | protobuf==5.29.4 79 | psutil==6.1.0 80 | ptyprocess==0.7.0 81 | pure_eval==0.2.3 82 | pydantic==2.11.3 83 | pydantic_core==2.33.1 84 | Pygments==2.19.1 85 | pyparsing==3.0.9 86 | pytest==8.1.1 87 | pytest-cov==4.1.0 88 | pytest-mock==3.14.0 89 | python-dateutil==2.8.2 90 | pytz==2023.3 91 | PyYAML==6.0 92 | regex==2023.3.23 93 | requests==2.28.2 94 | safetensors==0.5.3 95 | scikit-learn==1.2.2 96 | 
scipy==1.10.1 97 | six==1.16.0 98 | sniffio==1.3.1 99 | stack-data==0.6.3 100 | starlette==0.38.6 101 | sympy==1.11.1 102 | taos-ws-py==0.3.3 103 | threadpoolctl==3.1.0 104 | tiktoken==0.9.0 105 | tokenizers==0.19.1 106 | tomli==2.0.2 107 | torch==2.0.0 108 | tqdm==4.65.0 109 | traitlets==5.14.3 110 | transformers==4.40.0 111 | triton==2.0.0 112 | typing-inspection==0.4.0 113 | typing_extensions==4.12.2 114 | tzdata==2023.3 115 | urllib3==1.26.15 116 | uvicorn==0.34.0 117 | wcwidth==0.2.13 118 | wrapt==1.17.2 119 | yarl==1.19.0 120 | zipp==3.15.0 121 | ipdb 122 | peft==0.10.0 -------------------------------------------------------------------------------- /multi_agents_pipeline/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # Quick Command 3 | 4 | ## Run the local LLM Server 5 | The command `CUDA_VISIBLE_DEVICES=1,2,3 uvicorn llm-server:app --port --reload` should be run in the `multi_agents_pipeline` directory. e.g. `CUDA_VISIBLE_DEVICES=2,3,4 uvicorn llm-server:app` will run the FastAPI app on http://127.0.0.1:8000/. 6 | 7 | ## Run the Pipeline 8 | To execute the full pipeline, go to the `multi_agents_pipeline` folder and run `python main.py`. 9 | 10 | > To use LLama-3-8B-Instruct, please check transformers >= 4.40! 
11 | 12 | # Messages and Communication 13 | 14 | ```python 15 | from pydantic import BaseModel 16 | from typing import Optional, List 17 | 18 | 19 | class TextMessage(BaseModel): 20 | """ 21 | pass QA related message""" 22 | source: str 23 | content: str 24 | 25 | class TSMessage(BaseModel): 26 | """ 27 | passed from Planner to TS Agent, and from TS Agent to QA Agent 28 | 29 | filepath should be a valid path to a csv/tsv file""" 30 | source: str 31 | filepath: str # TODO: Support more possible types 32 | task_type:Optional[str] = None 33 | description: Optional[str] = None 34 | 35 | class TSTaskMessage(BaseModel): 36 | """ 37 | passed to Planner 38 | 39 | This message contains a text prompt and the filepath to the data file. 40 | """ 41 | description: str 42 | filepath: str 43 | ``` 44 | | **Agent** | **Publishes** | **Subscribes** | 45 | |------------------|--------------------------------------------------------|--------------------------------------------------------| 46 | | **Planner** | `Planner-QA` (`TextMessage`)
`Planner-TS` (`TSMessage`) | `TSTaskMessage` | 47 | | **TS Agent** | `TS-Info` (`TSMessage`) | `Planner-TS` (`TSMessage`)
`Reward-TS` (`TSMessage`) | 48 | | **QA Agent** | `QA-Response` (`TextMessage`) | `Planner-QA` (`TextMessage`)
`TS-Info` (`TSMessage`)
`Reward-QA` (`TextMessage`) | 49 | | **Reward Agent** | `Reward-QA` (`TextMessage`)
`Reward-TS` (`TSMessage`) | `TS-Info` (`TSMessage`)
def inference(file: str, task_type: str = "ts-classification") -> str:
    """
    Run the LTSM model over one or more CSV files and save each prediction as CSV.

    Currently just a minimal working example: ``task_type`` is only logged and
    the LTSM model is always used.

    Task: according to different task requirements, select different models, and save inference results.

    Models can be selected:
    - LTSM : forecasting
    - DLinear
    - Informer
    - PatchTST

    Args:
        file: Whitespace-separated list of CSV paths. The string is split with
            ``str.split()``, so an individual path cannot contain whitespace.
        task_type: Task label of the request; it is echoed in the log line only.

    Returns:
        The paths of the written prediction files joined by single spaces, in
        input order. Files are named ``<index>.csv`` inside a ``cache``
        directory next to this module, so a later call overwrites the results
        of an earlier one at the same index.
    """

    #login(token="Hugging Face Token") # Login to Hugging Face Hub if needed
    config = LTSMConfig(seq_len=150, pred_len=150, prompt_len=0)
    #model = get_model(config, "LTSM", local_pretrain=None, hf_hub_model="LSC2204/LTSM-bundle")
    model = get_model(config, "LTSM", local_pretrain=None, hf_hub_model=None)
    # Evaluation mode is a property of the model, not of a single request:
    # switch once instead of on every loop iteration.
    model.eval()

    csv_paths = file.split()
    print(f"[TS Inferencer] Received inference request with task_type: {task_type}")

    output_paths = []
    base_path = os.path.join(os.path.dirname(__file__), "cache")
    os.makedirs(base_path, exist_ok=True)
    # A distinct loop variable keeps the ``file`` argument intact for the
    # whole call (the previous loop rebound it).
    for index, csv_path in enumerate(csv_paths):
        df = CSVReader(csv_path).fetch()
        processor = StandardScaler()
        # The same frame is supplied as raw/train/val/test data; only the
        # first returned split is used below.
        input_data, _, _ = processor.process(
            raw_data=df.to_numpy(),
            train_data=[df.to_numpy()],
            val_data=[df.to_numpy()],
            test_data=[df.to_numpy()],
            fit_train_only=True,  # Use the training data for scaling
            do_anomaly=False
        )
        input_data = np.array(input_data[0])
        if input_data.ndim == 1:
            # Promote a single series to a two-dimensional column.
            input_data = input_data.reshape(-1, 1)
        # Add a leading batch axis of size one.
        tensor_data = torch.tensor(input_data, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            output = model(tensor_data)

        output_np = output.squeeze(0).detach().numpy()
        output_path = os.path.join(base_path, f"{index}.csv")
        pd.DataFrame(output_np).to_csv(output_path, index=False)
        output_paths.append(output_path)

    return " ".join(output_paths)
class LTSM(PreTrainedModel):
    """Patch-based forecaster that runs time-series patches through a pretrained backbone.

    The input is normalised per series, cut into patches, projected to the
    backbone's hidden size by ``in_layer``, passed through the (layer-truncated)
    backbone via ``inputs_embeds``, and mapped to ``configs.pred_len`` values
    by ``out_layer``.
    """
    config_class = LTSMConfig
    def __init__(self, configs, *model_args, **model_kwargs):
        """Build the patching layers, load the backbone and create the projections.

        Args:
            configs: Configuration object; this constructor reads ``patch_size``,
                ``pretrain``, ``stride``, ``seq_len``, ``prompt_len``,
                ``model_name_or_path`` and ``pred_len`` (``gpt_layers`` is read
                by ``model_prune``).
            *model_args: Accepted but not used in this constructor.
            **model_kwargs: Accepted but not used in this constructor.

        Raises:
            NotImplementedError: If ``configs.pretrain`` is falsy, or if
                ``model_prune`` does not recognise ``configs.model_name_or_path``.
        """
        super().__init__(configs)
        self.patch_size = configs.patch_size
        self.pretrain = configs.pretrain
        self.stride = configs.stride
        # Number of patches a sliding window yields over seq_len + prompt_len points.
        self.patch_num = (configs.seq_len + configs.prompt_len - self.patch_size) // self.stride + 1
        # NOTE(review): d_type is stored here but forward() hard-codes
        # torch.bfloat16 instead of reading it — confirm which is intended.
        self.d_type = torch.bfloat16
        # Right-pads the series by `stride` points (replicating the last value) ...
        self.padding_patch_layer = nn.ReplicationPad1d((0, self.stride))
        # ... which produces exactly one additional patch.
        self.patch_num += 1
        self.configs = configs

        if configs.pretrain:
            print("Loading the pretraining weight.")
            self.llm_config = AutoConfig.from_pretrained(configs.model_name_or_path)
            self.llm = AutoModel.from_pretrained(configs.model_name_or_path)  # backbone weights come from configs.model_name_or_path
        else:
            raise NotImplementedError("You must load the pretraining weight.")

        # Keep only the first configs.gpt_layers backbone blocks.
        self.model_prune(configs)
        print("model = {}".format(self.llm))

        # Patch -> hidden-size embedding, and flattened hidden states -> forecast.
        self.in_layer = nn.Linear(configs.patch_size, self.llm_config.hidden_size)
        self.out_layer = nn.Linear(self.llm_config.hidden_size * self.patch_num, configs.pred_len)

        # NOTE(review): not read anywhere in this class — possibly a leftover counter.
        self.cnt = 0

    def model_prune(self, configs):
        """Truncate the backbone to its first ``configs.gpt_layers`` blocks.

        The attribute holding the blocks is chosen from substrings of
        ``configs.model_name_or_path``: ``h`` for names containing "gpt2",
        ``layers`` for names containing "phi", "llama" or "gemma".

        Raises:
            NotImplementedError: If the name matches none of the above.
        """
        if "gpt2" in configs.model_name_or_path:
            self.llm.h = self.llm.h[:configs.gpt_layers]
        elif "phi" in configs.model_name_or_path or "llama" in configs.model_name_or_path or "gemma" in configs.model_name_or_path:
            self.llm.layers = self.llm.layers[:configs.gpt_layers]
        else:
            raise NotImplementedError(f"No implementation in model prune for {self.llm}.")

    def forward(self, x):
        """Forecast ``configs.pred_len`` steps for every channel of ``x``.

        Args:
            x: Three-dimensional tensor unpacked as ``(B, L, M)``; statistics
                are taken over dimension 1 and the rearrange patterns label
                the axes ``b l m``.

        Returns:
            Tensor laid out as ``b l m`` where ``l`` is the output size of
            ``out_layer``, rescaled with the input's per-series mean and
            standard deviation.
        """
        B, L, M = x.shape

        # Per-series statistics over dimension 1, detached from the graph.
        means = x.mean(1, keepdim=True).detach()

        # `x - means` creates a new tensor, so the in-place division below
        # does not modify the caller's input.
        x = x - means
        stdev = torch.sqrt(torch.var(x, dim=1, keepdim=True, unbiased=False)+ 1e-5).detach()
        x /= stdev
        x = rearrange(x, 'b l m -> b m l')

        # Pad on the right, then slice the last axis into windows of
        # patch_size taken every `stride` points.
        x = self.padding_patch_layer(x)
        x = x.unfold(dimension=-1, size=self.patch_size, step=self.stride)
        # Fold channels into the batch axis so each channel is processed on its own.
        x = rearrange(x, 'b m n p -> (b m) n p')
        # NOTE(review): the cast to bfloat16 is hard-coded; presumably the
        # backbone is expected to run in bfloat16 — confirm against the
        # training/inference setup.
        outputs = self.in_layer(x).to(dtype=torch.bfloat16)

        outputs = self.llm(inputs_embeds=outputs).last_hidden_state
        # Back to the dtype of the patched input before the output projection.
        outputs = outputs.to(dtype=x.dtype)

        # Flatten all patch embeddings of a series into one vector.
        outputs = self.out_layer(outputs.reshape(B*M, -1))
        outputs = rearrange(outputs, '(b m) l -> b l m', b=B)

        # Undo the input normalisation.
        outputs = outputs * stdev
        outputs = outputs + means

        return outputs
26 | """ 27 | super().__init__() 28 | self.seq_len = seq_len 29 | self.pred_len = pred_len 30 | self.train_ratio = train_ratio 31 | self.val_ratio = val_ratio 32 | 33 | 34 | def get_csv_splits(self, df_data: pd.DataFrame, do_anomaly: bool=False) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray]]: 35 | """ 36 | Splits the .csv data into training-validation-training sets. 37 | 38 | Args: 39 | df_data (pd.DataFrame): A Pandas DataFrame containing the data to be split. 40 | 41 | Returns: 42 | Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray]]: 43 | A tuple containing fours lists of sequences for the training, validation, and test sets. 44 | The last list contains the row labels of these sequences. 45 | """ 46 | train_split, val_split, test_split, buff = [], [], [], [] 47 | raw_data = df_data.to_numpy() 48 | 49 | for index, sequence in zip(df_data.index, raw_data): 50 | if len(sequence) > 0 and isinstance(sequence[0], np.ndarray): 51 | logging.error("Time-series should be 1D.") 52 | raise ValueError("Time-series should be 1D.") 53 | 54 | num_train = int(len(sequence) * self.train_ratio) 55 | num_val = int(len(sequence) * self.val_ratio) 56 | 57 | if not do_anomaly: 58 | if num_train < self.seq_len + self.pred_len: 59 | continue 60 | else: 61 | if num_train < self.seq_len: 62 | continue 63 | 64 | 65 | # We also add the previous seq_len points to the val and test sets 66 | train_split.append(sequence[:num_train]) 67 | val_split.append(sequence[num_train-self.seq_len:num_train+num_val]) 68 | test_split.append(sequence[num_train+num_val-self.seq_len:]) 69 | buff.append(index) 70 | 71 | return train_split, val_split, test_split, buff 72 | -------------------------------------------------------------------------------- /tests/data_provider/tokenizer/standard_scaler_test.py: -------------------------------------------------------------------------------- 1 | from ltsm.data_provider.tokenizer.standard_scaler import 
StandardScaler 2 | 3 | import numpy as np 4 | import pytest 5 | import os 6 | 7 | @pytest.fixture 8 | def setup(): 9 | processor = StandardScaler() 10 | train_data = [np.array([x*i for i in range(100)]) for x in [1, 100, 10000]] 11 | val_data = [np.array([x*i for i in range(100)]) for x in [1, 100, 10000]] 12 | test_data = [np.array([x*i for i in range(100)]) for x in [1, 100, 10000]] 13 | raw_data = [np.concatenate((train_data[x], val_data[x], test_data[x])) for x in range(3)] 14 | 15 | new_train, new_val, new_test = processor.process(raw_data, train_data, val_data, test_data, fit_train_only=True) 16 | return new_train, new_val, new_test, train_data, val_data, test_data, raw_data, processor 17 | 18 | def test_standard_scaler_process_on_train_only(setup): 19 | new_train, new_val, new_test, train_data, val_data, test_data, raw_data, processor = setup 20 | 21 | assert len(new_train) == len(train_data) 22 | assert len(new_val) == len(val_data) 23 | assert len(new_test) == len(test_data) 24 | 25 | means = [np.mean(train_data[i]) for i in range(3)] 26 | stds = [np.std(train_data[i]) for i in range(3)] 27 | for i in range(3): 28 | assert new_train[i].shape == train_data[i].shape 29 | assert new_val[i].shape == val_data[i].shape 30 | assert new_test[i].shape == test_data[i].shape 31 | for j in range(100): 32 | assert new_train[i][j] == (train_data[i][j] - means[i]) / stds[i] 33 | assert new_val[i][j] == (val_data[i][j] - means[i]) / stds[i] 34 | assert new_test[i][j] == (test_data[i][j] - means[i]) / stds[i] 35 | 36 | def test_standard_scaler_process(setup): 37 | new_train, new_val, new_test, train_data, val_data, test_data, raw_data, processor = setup 38 | 39 | assert len(new_train) == len(train_data) 40 | assert len(new_val) == len(val_data) 41 | assert len(new_test) == len(test_data) 42 | 43 | means = [np.mean(raw_data[i]) for i in range(3)] 44 | stds = [np.std(raw_data[i]) for i in range(3)] 45 | for i in range(3): 46 | assert new_train[i].shape == train_data[i].shape 
def signal_window_splitter(signal, window_size, overlap=0):
    """Cut a signal into consecutive, optionally overlapping, windows.

    Parameters
    ----------
    signal : nd-array or pandas DataFrame
        input signal
    window_size : int
        number of points in each window
    overlap : float
        fraction of overlap between neighbouring windows, value between
        0 and 1 (exclusive)
        Default: 0
    Returns
    -------
    list
        list of signal windows; a trailing remainder shorter than
        ``window_size`` is not returned
    """
    if not isinstance(window_size, int):
        raise SystemExit('window_size must be an integer.')

    # Distance between the starts of two neighbouring windows.
    if overlap == 0:
        hop = int(round(window_size))
    else:
        hop = int(round(window_size * (1 - overlap)))
    if hop == 0:
        raise SystemExit('Invalid overlap. Choose a lower overlap value.')

    n_samples = len(signal)
    exact_fit = n_samples % window_size == 0 and overlap == 0
    # Last admissible start position (exclusive) for a full-length window.
    stop = n_samples if exact_fit else n_samples - window_size + 1

    windows = []
    for start in range(0, stop, hop):
        windows.append(signal[start:start + window_size])
    return windows
88 | to_drop = [column for column in upper.columns if any(upper[column] > threshold)] 89 | 90 | return to_drop 91 | -------------------------------------------------------------------------------- /tests/test_scripts/test_pipeline_training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | 5 | from ltsm.data_pipeline import StatisticalTrainingPipeline, get_args, seed_all 6 | from ltsm.data_provider.data_loader import HF_Dataset 7 | from ltsm.models.utils import freeze_parameters, print_trainable_parameters 8 | from peft import get_peft_config, get_peft_model, LoraConfig 9 | 10 | from transformers import ( 11 | EvalPrediction, 12 | ) 13 | 14 | def run(): 15 | config = get_args() 16 | seed = config.seed 17 | seed_all(seed) 18 | 19 | model = get_model(config) 20 | 21 | if config.lora: 22 | peft_config = LoraConfig( 23 | target_modules=["c_attn"], 24 | inference_mode=False, 25 | r=config.lora_dim, 26 | lora_alpha=32, 27 | lora_dropout=0.1 28 | ) 29 | model = get_peft_model(model, peft_config) 30 | model.print_trainable_parameters() 31 | 32 | elif config.freeze: 33 | freeze_parameters(model) 34 | 35 | print_trainable_parameters(model) 36 | 37 | # Optimizer settings 38 | model_optim = torch.optim.Adam(model.parameters(), lr=config.learning_rate) 39 | lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(model_optim, T_max=config.tmax, eta_min=1e-8) 40 | 41 | # Evaluation metrics 42 | def compute_metrics(p: EvalPrediction): 43 | preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions 44 | preds = np.squeeze(preds) 45 | if preds.shape != p.label_ids.shape: 46 | label_ids = np.squeeze(p.label_ids) 47 | else: 48 | label_ids = p.label_ids 49 | return { 50 | "mse": ((preds - label_ids) ** 2).mean().item(), 51 | "mae": (np.abs(preds - label_ids)).mean().item() 52 | } 53 | 54 | # Loss function 55 | def compute_loss(model, inputs, 
def get_default_hyperparameter(primitive, hyperparameter):
    """Return the primitive's default hyperparameters with user overrides applied.

    Parameters
    ----------
    primitive :
        Primitive class exposing ``metadata.get_hyperparams()``; the returned
        hyperparameter class must provide ``defaults()``.
    hyperparameter : dict
        User-supplied overrides keyed by hyperparameter name. May be empty.

    Returns
    -------
    The defaults object, or the result of ``defaults.replace(hyperparameter)``
    when overrides were supplied.

    Raises
    ------
    TypeError
        If ``hyperparameter`` contains a name the primitive does not define.

    """
    hyperparams_class = primitive.metadata.get_hyperparams()
    hyperparams = hyperparams_class.defaults()

    # Only names that are genuinely unknown are rejected. The previous
    # strict-superset comparison also rejected a call that supplied *every*
    # hyperparameter (and any call on a primitive without hyperparameters).
    unknown = set(hyperparameter) - set(hyperparams.keys())
    if unknown:
        raise TypeError(primitive.__name__ + ' got unexpected keyword argument ' + str(sorted(unknown)))

    if hyperparameter:
        hyperparams = hyperparams.replace(hyperparameter)

    return hyperparams
@type_subscription(topic_type="Planner-QA") # for receiving task from Planner
@type_subscription(topic_type="Redo-QA") # for receiving QA Feedback
@type_subscription(topic_type="TS-Info") # for receiving TS info from TS Agent
class QAAgent(RoutedAgent):
    """Agent that answers a planner task with an LLM, using time-series statistics.

    It remembers the most recent plan text it received, and when a time-series
    message arrives it summarises the referenced CSV, asks the model client to
    complete the task, and publishes the answer on the "QA-Response" topic.
    """

    def __init__(self, name: str, model_client: ChatCompletionClient):
        """Store the agent name and model client and set up the chat context.

        Parameters
        ----------
        name : str
            Name used in log output and as the ``source`` of emitted messages.
        model_client : ChatCompletionClient
            Client used to request completions.
        """
        super().__init__(description=f"{name} with LLM support")
        self.name = name
        # Most recent plan text received via handle_plan; None until one arrives.
        self._last_plan: Optional[str] = None
        self._model_client = model_client
        # Chat history buffer, created with buffer_size=5.
        self._model_context = BufferedChatCompletionContext(buffer_size=5)
        self._system_messages = [SystemMessage(content="You are a helpful AI assistant.")]

        self._last_llm_response: Optional[str] = None # for evaluation

    @message_handler
    async def handle_plan(self, message: TextMessage, ctx: MessageContext) -> None:
        """Remember the content of a text message as the current plan and log it."""
        self._last_plan = message.content
        print(f"[{self.name}] Stored plan from {message.source}: {message.content}")

    @message_handler
    async def handle_TS(self, message: TSMessage, ctx: MessageContext) -> None:
        """Answer the stored plan using the TS info given by the TS Agent.

        Reads the CSV at ``message.filepath``, builds a prompt from the stored
        plan and the ``describe()`` statistics of that file, sends it to the
        model client, records the reply, and publishes it as a TextMessage.
        """
        df = pd.read_csv(Path(message.filepath))
        stats = df.describe().to_string()

        # below is the prompt that combine the task and the TS Info.
        # TODO : Modify according to the task type and task description. Currently just a placeholder
        prompt = f"""
        You are a Time Series Expert.

        Here is a task given by the planner:
        {self._last_plan or "(no plan received)"}

        Here is the output of Time-Series Agent:
        {stats}

        Please finish the task based on the above information.
        """

        print(f"[{self.name}] Sending prompt to LLM...")

        # The prompt is added to the context first so it is part of the
        # messages sent in the request below.
        user_message = UserMessage(content=prompt, source=self.name)
        await self._model_context.add_message(user_message)

        # send to LLM for response
        llm_response = await self._model_client.create(
            self._system_messages + (await self._model_context.get_messages()),
            cancellation_token=ctx.cancellation_token,
        )

        # Only a plain-string reply is handled from here on.
        assert isinstance(llm_response.content, str)

        self._last_llm_response = llm_response.content

        # Record the reply in the chat context as the assistant's turn.
        await self._model_context.add_message(
            AssistantMessage(content=self._last_llm_response, source=self.name)
        )
        # publish the inference result of QA Agent
        await self.publish_message(
            TextMessage(source=self.name, content=self._last_llm_response, task = self._last_plan), # add task
            TopicId(type="QA-Response", source=self.id.key) # publish to a specific topic for QA response
        )

    def get_last_response(self) -> Optional[str]:
        """Return the most recent LLM reply, or None if no reply was recorded yet."""
        return self._last_llm_response
# curves
def n_squared(x, no):
    """Quadratic model function: ``no * x ** 2``."""
    return no * x ** 2


def n_nlog(x, no):
    """Linearithmic model function: ``no * x * log(x)``."""
    return no * x * np.log(x)


def n_linear(x, no):
    """Linear model function: ``no * x``."""
    return no * x


def n_log(x, no):
    """Logarithmic model function: ``no * log(x)``."""
    return no * np.log(x)


def n_constant(x, no):
    """Constant model function: ``no`` repeated ``len(x)`` times."""
    return np.zeros(len(x)) + no


def find_best_curve(t, signal):
    """Finds the best curve.

    Each candidate model is fitted to ``signal`` with a single scale
    parameter and the one with the smallest chi-square is selected.

    Parameters
    ----------
    t : nd-array
        Log space
    signal : nd-array
        Mean execution time array

    Returns
    -------
    str
        Best fit curve name: one of 'squared', 'nlog', 'linear', 'log',
        'constant'.

    """
    candidates = (n_squared, n_nlog, n_linear, n_log, n_constant)

    # Model parameters: the same fixed uncertainty is assumed for every point.
    stdev = 2
    sig = np.zeros(len(signal)) + stdev

    all_chisq = []
    for curve in candidates:
        popt, _ = curve_fit(curve, t, signal, sigma=sig, p0=1, absolute_sigma=True)

        # Chi-square of the fitted model against the measurements.
        residuals = signal - curve(t, *popt)
        all_chisq.append(np.sum((residuals / stdev) ** 2))

    best = candidates[int(np.argmin(all_chisq))]

    # The public name is the function name without its "n_" prefix. Reading
    # __name__ avoids depending on the textual repr of a function object.
    return best.__name__[len("n_"):]


def compute_complexity(feature, domain, json_path, **kwargs):
    """Computes the feature complexity.

    The feature is timed on sine waves of increasing length, the mean
    execution time per length is fitted against the candidate curves, and
    the best-fit curve name is stored under the feature's 'complexity' key.

    Parameters
    ----------
    feature : string
        Feature name
    domain : string
        Feature domain
    json_path: json
        Features json file
    **kwargs:
        See below:
            * *features_path* (``string``) --
                Directory of script with personal features

    Returns
    -------
    int
        Feature complexity: 1 for 'constant'/'log', 2 for 'linear',
        3 for 'nlog'/'squared', 0 otherwise.

    Writes complexity in json file

    """

    dictionary = load_json(json_path)

    features_path = kwargs.get('features_path', None)

    # The inputs from this function should be replaced by a dictionary
    one_feat_dict = {domain: {feature: dictionary[domain][feature]}}

    t = np.logspace(3.0, 5.0, 6)
    f = 0.05
    fs = 100
    x = np.arange(0, t[-1] + 1, 1)
    wave = np.sin(2 * np.pi * f * x / fs)

    signal = []
    for ti in t:
        # Timings are collected per signal length. Previously one list was
        # shared across all lengths, so each entry of `signal` was a running
        # mean over every shorter length as well, which distorts the curve.
        timings = []
        window = wave[:int(ti)]
        for _ in range(20):
            # perf_counter is monotonic and high resolution, unlike time.time.
            start = time.perf_counter()
            calc_window_features(one_feat_dict, window, fs, features_path=features_path)
            timings.append(time.perf_counter() - start)

        signal.append(np.mean(timings))

    curve_name = find_best_curve(t, signal)
    dictionary[domain][feature]['complexity'] = curve_name

    with open(json_path, "w") as write_file:
        json.dump(dictionary, write_file, indent=4, sort_keys=True)

    complexity_by_curve = {'constant': 1, 'log': 1, 'linear': 2, 'nlog': 3, 'squared': 3}
    return complexity_by_curve.get(curve_name, 0)
@pytest.fixture
def mock_config():
    """Build a TrainingConfig wrapping a small LTSMConfig for the pipeline tests."""
    model_config = LTSMConfig(
        gpt_layers=3,
        patch_size=16,
        pretrain=True,
        stride=2,
        seq_len=256,
        pred_len=12,
        prompt_len=8,
    )

    train_params = dict(
        model='LTSM',
        model_name_or_path='gpt2-medium',
        log_file='log.txt',
        data_path='./datasets',
        prompt_data_path='./prompt_bank',
        output_dir='./output',
        train_ratio=0.7,
        val_ratio=0.1,
        tmax=10,
        learning_rate=5e-5,
        downsample_rate=10,
        train_epochs=8,
        batch_size=100,
        eval=False,
        lora=False,
        freeze=False,
        data_processing='standard_scaler',
        gradient_accumulation_steps=1,
    )

    return TrainingConfig(model_config, **train_params)


@pytest.fixture
def pipeline(mock_config):
    """Create the StatisticalTrainingPipeline under test from the mock config."""
    return StatisticalTrainingPipeline(mock_config)


def test_initialization(pipeline, mock_config):
    """The pipeline keeps the config and mirrors its settings in training_args."""
    assert pipeline.config == mock_config

    # training_args attribute -> train_params key it must be copied from
    mirrored = (
        ("output_dir", "output_dir"),
        ("per_device_train_batch_size", "batch_size"),
        ("per_device_eval_batch_size", "batch_size"),
        ("num_train_epochs", "train_epochs"),
        ("learning_rate", "learning_rate"),
        ("gradient_accumulation_steps", "gradient_accumulation_steps"),
    )
    for attribute, key in mirrored:
        assert getattr(pipeline.training_args, attribute) == mock_config.train_params[key]
def test_run_training(mocker, pipeline):
    """A full run trains, saves the model, and evaluates each test set."""
    # Stub dataset loading and replace the Trainer class used by the pipeline.
    fake_datasets = (TSDataset([], 0, 0), TSDataset([], 0, 0), [None, None, None, None], None)
    get_datasets_stub = mocker.patch.object(pipeline, 'get_datasets', return_value=fake_datasets)
    trainer_cls = mocker.patch('ltsm.data_pipeline.stat_pipeline.Trainer')
    # NOTE(review): this configures `evaluate` on the patched class, not on the
    # instance (`return_value`) that the assertions below inspect - confirm intent.
    trainer_cls.evaluate.return_value = None

    pipeline.run()

    # Datasets must be loaded exactly once.
    get_datasets_stub.assert_called_once()

    # Training-side expectations only apply when eval-only mode is off.
    if pipeline.config.train_params["eval"]:
        return

    trainer = trainer_cls.return_value
    assert trainer.train.called
    assert trainer.save_model.called

    assert trainer.evaluate.call_count == 4
    assert trainer.save_metrics.call_count == 5
    assert trainer.log_metrics.call_count == 5


def test_run_evaluation_only(mocker, pipeline):
    """In eval-only mode the run skips training but still evaluates and saves metrics."""
    pipeline.config.train_params["eval"] = True  # Set eval-only mode

    # Stub dataset loading and replace the Trainer class used by the pipeline.
    fake_datasets = (TSDataset([], 0, 0), TSDataset([], 0, 0), [None, None, None, None], None)
    get_datasets_stub = mocker.patch.object(pipeline, 'get_datasets', return_value=fake_datasets)
    trainer_cls = mocker.patch('ltsm.data_pipeline.stat_pipeline.Trainer')

    pipeline.run()

    # Datasets must be loaded exactly once.
    get_datasets_stub.assert_called_once()

    trainer = trainer_cls.return_value
    assert not trainer.train.called
    assert trainer.evaluate.called
    assert trainer.save_metrics.called
@pytest.fixture
def config(tmp_path):
    """Build a TrainingConfig wrapping a PatchTSTConfig, with paths under tmp_path."""
    csv_path = tmp_path / "test.csv"
    prompt_dir = tmp_path / "prompt_normalize_split"
    prompt_dir.mkdir()
    # NOTE(review): the output directory is nested under the CSV file path,
    # not under tmp_path - kept as in the original; confirm this is intended.
    output_dir = csv_path / "output"

    model_config = PatchTSTConfig(
        pred_len=96,
        enc_in=1,
        seq_len=336,
        patch_len=16,
        decomposition=False,
        stride=8,
        e_layers=3,
        n_heads=16,
        d_model=128,
        d_ff=256,
        dropout=0.2,
        fc_dropout=0.2,
        head_dropout=0,
        revin=True,
        affine=True,
        subtract_last=False,
        individual=False,
    )

    train_params = dict(
        data_path=str(csv_path),
        model="PatchTST",
        model_name_or_path="gpt2-medium",
        gradient_accumulation_steps=64,
        test_data_path_list=[str(csv_path)],
        prompt_data_path=str(prompt_dir),
        train_epochs=1000,
        patience=10,
        lradj='TST',
        pct_start=0.2,
        freeze=0,
        itr=1,
        batch_size=32,
        learning_rate=1e-3,
        downsample_rate=20,
        output_dir=str(output_dir),
        des='Exp',
        eval=0,
    )

    return TrainingConfig(model_config, **train_params)


def test_model_initialization(config):
    """get_model returns a PreTrainedModel instance for the PatchTST config."""
    built = get_model(
        config.model_config,
        model_name=config.train_params["model"],
        local_pretrain=config.train_params["local_pretrain"],
    )
    assert built is not None
    assert isinstance(built, PreTrainedModel)


def test_parameter_count(config):
    """The number of trainable parameters matches the closed-form count."""
    built = get_model(
        config.model_config,
        model_name=config.train_params["model"],
        local_pretrain=config.train_params["local_pretrain"],
    )
    trainable = sum(p.numel() for p in built.parameters() if p.requires_grad)

    cfg = config.model_config
    d_model, d_ff = cfg.d_model, cfg.d_ff
    patch_num = int((cfg.seq_len - cfg.patch_len) / cfg.stride + 1)

    # Per encoder layer:
    # multi-head self-attention (W_Q, W_K, W_V, to_out), weights + biases
    attention = 4 * (d_model * d_model + d_model)
    # feed-forward network, two linear layers with biases
    feed_forward = 2 * d_model * d_ff + d_model + d_ff
    # layer norm parameters
    layer_norm = 4 * d_model
    encoder = (attention + feed_forward + layer_norm) * cfg.e_layers

    # Outside the encoder stack:
    input_encoding = cfg.patch_len * d_model + d_model
    positional_encoding = patch_num * d_model
    revin = 2
    flatten_head = d_model * patch_num * cfg.pred_len + cfg.pred_len

    expected = encoder + input_encoding + positional_encoding + revin + flatten_head

    assert trainable == expected


def test_forward_output_shape(config):
    """A forward pass maps (batch, seq_len, channel) to (batch, pred_len, channel)."""
    built = get_model(
        config.model_config,
        model_name=config.train_params["model"],
        local_pretrain=config.train_params["local_pretrain"],
    )
    batch_size, channel = 32, 16
    seq_len = config.model_config.seq_len
    zeros = torch.zeros((batch_size, seq_len, channel), dtype=torch.float32)
    prediction = built(zeros)
    assert prediction.size() == torch.Size([batch_size, config.model_config.pred_len, channel])
class DLinear(PreTrainedModel):
    """
    Decomposition-Linear

    The input is split into a trend (moving average) and a seasonal
    (residual) part; each part is projected from ``seq_len`` to ``pred_len``
    with a linear layer and the two projections are summed.
    """
    config_class = DLinearConfig

    def __init__(self, config: DLinearConfig, **kwargs):
        """Build the decomposition block and the linear projections.

        Reads ``seq_len``, ``pred_len``, ``individual`` and ``enc_in`` from
        ``config``. When ``individual`` is true, each of the ``enc_in``
        channels gets its own pair of linear layers; otherwise one pair is
        shared by all channels.
        """
        super().__init__(config)
        self.seq_len = config.seq_len
        self.pred_len = config.pred_len

        # Decompsition Kernel Size
        kernel_size = 25
        self.decompsition = series_decomp(kernel_size)
        self.individual = config.individual
        self.channels = config.enc_in

        if self.individual:
            self.Linear_Seasonal = nn.ModuleList()
            self.Linear_Trend = nn.ModuleList()

            for _ in range(self.channels):
                self.Linear_Seasonal.append(nn.Linear(self.seq_len, self.pred_len))
                self.Linear_Trend.append(nn.Linear(self.seq_len, self.pred_len))
        else:
            self.Linear_Seasonal = nn.Linear(self.seq_len, self.pred_len)
            self.Linear_Trend = nn.Linear(self.seq_len, self.pred_len)

    def forward(self, x: Tensor):
        """Forecast ``pred_len`` steps.

        Args:
            x: tensor of shape [Batch, Input length, Channel].

        Returns:
            Tensor of shape [Batch, Output length, Channel].
        """
        seasonal_init, trend_init = self.decompsition(x)
        # Move time to the last axis so the linear layers act along it.
        seasonal_init, trend_init = seasonal_init.permute(0, 2, 1), trend_init.permute(0, 2, 1)
        if self.individual:
            # Buffers are allocated directly on the input's device instead of
            # being created on the default device and copied over.
            seasonal_output = torch.zeros(
                [seasonal_init.size(0), seasonal_init.size(1), self.pred_len],
                dtype=seasonal_init.dtype, device=seasonal_init.device)
            trend_output = torch.zeros(
                [trend_init.size(0), trend_init.size(1), self.pred_len],
                dtype=trend_init.dtype, device=trend_init.device)
            for i in range(self.channels):
                seasonal_output[:, i, :] = self.Linear_Seasonal[i](seasonal_init[:, i, :])
                trend_output[:, i, :] = self.Linear_Trend[i](trend_init[:, i, :])
        else:
            seasonal_output = self.Linear_Seasonal(seasonal_init)
            trend_output = self.Linear_Trend(trend_init)

        x = seasonal_output + trend_output
        return x.permute(0, 2, 1)  # to [Batch, Output length, Channel]


class moving_avg(nn.Module):
    """
    Moving average block to highlight the trend of time series
    """
    def __init__(self, kernel_size, stride):
        """Create the average-pooling layer for the given window and stride."""
        super().__init__()
        self.kernel_size = kernel_size
        self.avg = nn.AvgPool1d(kernel_size=kernel_size, stride=stride, padding=0)

    def forward(self, x):
        """Smooth ``x`` along dim 1 (the pooled axis), replicating the edge values.

        A total of ``kernel_size - 1`` padded steps is added so that, with
        stride 1, the output has as many steps as the input. For an odd
        kernel the padding is split evenly; for an even kernel the front
        receives the extra step (previously an even kernel shortened the
        series by one step).
        """
        pad_total = self.kernel_size - 1
        pad_end = pad_total // 2
        pad_front = pad_total - pad_end
        # padding on the both ends of time series
        front = x[:, 0:1, :].repeat(1, pad_front, 1)
        end = x[:, -1:, :].repeat(1, pad_end, 1)
        x = torch.cat([front, x, end], dim=1)
        # AvgPool1d pools over the last axis, hence the permutes.
        x = self.avg(x.permute(0, 2, 1))
        x = x.permute(0, 2, 1)
        return x


class series_decomp(nn.Module):
    """
    Series decomposition block
    """
    def __init__(self, kernel_size):
        """Create the stride-1 moving average used to extract the trend."""
        super().__init__()
        self.moving_avg = moving_avg(kernel_size, stride=1)

    def forward(self, x):
        """Return ``(residual, trend)`` where ``residual = x - trend``."""
        moving_mean = self.moving_avg(x)
        res = x - moving_mean
        return res, moving_mean
# Path of the YAML file that describes the chat-completion client component.
QA_MODEL_CONFIG_PATH = "model_config.yaml"

async def get_model_client(model_config_path: str) -> ChatCompletionClient:
    """Load a ChatCompletionClient from the YAML component config at the given path."""
    async with aiofiles.open(model_config_path, "r") as file:
        model_config = yaml.safe_load(await file.read())
    return ChatCompletionClient.load_component(model_config)

async def main() -> None:
    """Register the agents, publish one sample plan and one sample TS message, and run until idle."""

    runtime = SingleThreadedAgentRuntime()

    # One model client instance is shared by every agent that takes one.
    model_client = await get_model_client(QA_MODEL_CONFIG_PATH)

    # Register the Planning Agent
    await PlanningAgent.register(
        runtime,
        "Planning_Agent",
        lambda: PlanningAgent(name="Planning_Agent", model_client=model_client),
    )

    # Register the QA Agent
    await QAAgent.register(
        runtime,
        "QA_Agent",
        lambda: QAAgent(name="QA_Agent", model_client=model_client),
    )

    # Register the TS Agent
    await TSAgent.register(
        runtime,
        "TS_Agent",
        lambda: TSAgent(name="TS_Agent"),
    )

    # Register the Reward Agent
    await RewardAgent.register(
        runtime,
        "Reward_Agent",
        lambda: RewardAgent(name="Reward_Agent", model_client=model_client, force_bad_score=True),
    )

    # Start the runtime only after every agent has been registered.
    runtime.start()

    # # mock a plan message from planner
    # await runtime.send_message(
    #     TextMessage(source="user", content="TS classification"),
    #     AgentId("QA_Agent", "default"),
    # )

    # # mock a TS Info message from TS Agent
    # await runtime.send_message(
    #     TSMessage(source="user", filepath="../datasets/UCR-gunpoint/sample_0000.csv",task_type="TS_classification", description="TS Data"),
    #     AgentId("TS_Agent", "default"),
    # )

    # Publish the plan text on the "Planner-QA" topic.
    await runtime.publish_message(
        TextMessage(source="Planner", content="TS classification"),
        topic_id=TopicId(
            type="Planner-QA", # This is the topic for Planner to send the initial plan
            source="Planner"
        )
    )

    # Publish the time-series request on the "Planner-TS" topic.
    await runtime.publish_message(
        TSMessage(
            source="Planner",
            filepath="../datasets/UCR-gunpoint/sample_0000.csv", # Example file path
            task_type="ts-classification", # Example task type
            description="TS Data"
        ),
        topic_id=TopicId(
            type="Planner-TS", # This is the topic for TS Agent to send the TS info
            source="Planner"
        )
    )


    # mock a TSTaskMessage from user
    # ts_task_message = TSTaskMessage(
    #     description="The file contains time series data of the hand motion of an actor raising their arm. From this data alone, tell me if the actor is raising a gun or pointing their finger.",
    #     filepath="../datasets/GunPointAgeSpan/GunPointAgeSpan_TRAIN.tsv"
    # )
    # await runtime.send_message(ts_task_message, AgentId("Planning_Agent", "default"))


    # Wait for the runtime to become idle, then stop it.
    await runtime.stop_when_idle()

if __name__ == "__main__":
    # Keep the root, framework, and server loggers at WARNING level.
    logging.basicConfig(level=logging.WARNING)
    logging.getLogger("autogen_core").setLevel(logging.WARNING)
    logging.getLogger("autogen_core.events").setLevel(logging.WARNING)
    logging.getLogger("autogen_core.runtime").setLevel(logging.WARNING)
    logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
    logging.getLogger(TRACE_LOGGER_NAME).setLevel(logging.WARNING)
    asyncio.run(main())
``` 6 | conda create -n ltsm python=3.8.0 7 | conda activate ltsm 8 | git clone git@github.com:daochenzha/ltsm.git 9 | cd ltsm 10 | pip3 install -e . 11 | pip3 install -r requirements.txt 12 | ``` 13 | 14 | 15 | ## :bookmark: Step 0: Collect Datasets and Time Series Prompts 16 | 17 | ### :cyclone: You can use our prepared dataset to on-board yourselves on LTSM-bundle 18 | 19 | ### Download training datasets 20 | ```bash 21 | cd datasets 22 | download: https://drive.google.com/drive/folders/1hLFbz0FRxdiDCzgFYtKCOPJYSBVvwW9P 23 | ``` 24 | 25 | ### Download time series prompts 26 | ```bash 27 | cd prompt_bank/propmt_data_csv 28 | download: https://drive.google.com/drive/folders/1hLFbz0FRxdiDCzgFYtKCOPJYSBVvwW9P 29 | ``` 30 | 31 | ### Check word prompts 32 | ```bash 33 | cd prompt_bank/text_prompt_data_csv/ 34 | check: csv_prompt.json 35 | ``` 36 | 37 | ## :bookmark: Step 1: Customize Datasets and Time Series Prompts 38 | 39 | ### :cyclone: If you prefer to build LTSM-bundle on your own dataset, please follow the 5-step instructions below: 40 | 41 | **Step 1-a.** Prepare your dataset. Make sure your local data folder looks like this: 42 | ````angular2html 43 | - ltsm/ 44 | - datasets/ 45 | DATA_1.csv/ 46 | DATA_2.csv/ 47 | ... 48 | ```` 49 | 50 | **Step 1-b.** Generating the time series prompts from training, validating, and testing datasets 51 | ````angular2html 52 | python3 prompt_generate_split.py 53 | ```` 54 | 55 | **Step 1-c.** Find the generated time series prompts in the './prompt_data_split' folder.
Then run the following command for normalizing the prompts: 56 | ````angular2html 57 | python3 prompt_normalization_split.py --mode fit 58 | ```` 59 | 60 | **Step 1-d.** Run this command to export the prompts to the "./prompt_data_normalize_split" folder: 61 | ````angular2html 62 | python3 prompt_normalization_split.py --mode transform 63 | ```` 64 | 65 | **Step 1-e.** Modify the word prompt based on your dataset description in "prompt_bank/text_prompt_data_csv/csv_prompt.json": 66 | ````angular2html 67 | vim prompt_bank/text_prompt_data_csv/csv_prompt.json 68 | ```` 69 | 70 | ## :bookmark: Step 2: Customize your own LTSM-bundle 71 | 72 | ### :cyclone: Now, it's time to build your own LTSM-bundle!! 73 | 74 | #### Option-(1) Explore [Word Prompt] and [Linear Tokenization] on gpt2-medium 75 | ```bash 76 | python3 main_ltsm.py \ 77 | --model LTSM_WordPrompt \ 78 | --model_name_or_path gpt2-medium \ 79 | --train_epochs 500 \ 80 | --batch_size 10 \ 81 | --pred_len 96 \ 82 | --data_path "datasets/ETT-small/ETTh1.csv" \ 83 | --test_data_path_list "datasets/ETT-small/ETTh1.csv" \ 84 | --prompt_data_path "prompt_bank/text_prompt_data_csv/csv_prompt.json" \ 85 | --freeze 0 \ 86 | --learning_rate 1e-3 \ 87 | --downsample_rate 20 \ 88 | --output_dir [Your_Output_Path] \ 89 | ``` 90 | 91 | #### Option-(2) Explore [Time Series Prompt] and [Linear Tokenization] on gpt2-medium 92 | ```bash 93 | python3 main_ltsm.py \ 94 | --model LTSM \ 95 | --model_name_or_path gpt2-medium \ 96 | --train_epochs 500 \ 97 | --batch_size 10 \ 98 | --pred_len 96 \ 99 | --data_path "datasets/ETT-small/ETTh1.csv" \ 100 | --test_data_path_list "datasets/ETT-small/ETTh1.csv" \ 101 | --prompt_data_path "prompt_bank/prompt_data_normalize_split" \ 102 | --freeze 0 \ 103 | --learning_rate 1e-3 \ 104 | --downsample_rate 20 \ 105 | --output_dir [Your_Output_Path] \ 106 | ``` 107 | 108 | #### Option-(3) Finetune your dataset based on pre-trained LTSM-bundle model: [Time Series Prompt] and [Linear Tokenization]
on gpt2-medium 109 | ```bash 110 | python3 main_ltsm.py \ 111 | --model LTSM \ 112 | --model_name_or_path gpt2-medium \ 113 | --local_pretrain LSC2204/LTSM-bundle \ # This model weight is for pred_len == 96 114 | --train_epochs 500 \ 115 | --batch_size 10 \ 116 | --pred_len 96 \ 117 | --data_path "datasets/ETT-small/ETTh1.csv" \ 118 | --test_data_path_list "datasets/ETT-small/ETTh1.csv" \ 119 | --prompt_data_path "prompt_bank/prompt_data_normalize_split" \ 120 | --freeze 0 \ 121 | --learning_rate 1e-3 \ 122 | --downsample_rate 20 \ 123 | --output_dir [Your_Output_Path] \ 124 | ``` 125 | 126 | -------------------------------------------------------------------------------- /ltsm/prompt_reader/stat_prompt/tsfel/utils/add_personal_features.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | import json 4 | import os 5 | import sys 6 | import warnings 7 | from inspect import getmembers, isfunction 8 | 9 | from tsfel.feature_extraction.features_settings import load_json 10 | from tsfel.utils.calculate_complexity import compute_complexity 11 | 12 | 13 | def add_feature_json(features_path, json_path): 14 | """Adds new feature to features.json. 15 | 16 | Parameters 17 | ---------- 18 | features_path: string 19 | Personal Python module directory containing new features implementation. 20 | 21 | json_path: string 22 | Personal .json file directory containing existing features from TSFEL. 23 | New customised features will be added to file in this directory. 
24 | 25 | """ 26 | 27 | sys.path.append(features_path[:-len(features_path.split(os.sep)[-1]) - 1]) 28 | exec("import " + features_path.split(os.sep)[-1][:-3]) 29 | 30 | # Reload module containing the new features 31 | importlib.reload(sys.modules[features_path.split(os.sep)[-1][:-3]]) 32 | exec("import " + features_path.split(os.sep)[-1][:-3] + " as pymodule") 33 | 34 | # Functions from module containing the new features 35 | functions_list = [o for o in getmembers(locals()['pymodule']) if isfunction(o[1])] 36 | function_names = [fname[0] for fname in functions_list] 37 | 38 | # Check if @set_domain was declared on features module 39 | vset_domain = False 40 | 41 | for fname, f in list(locals()['pymodule'].__dict__.items()): 42 | 43 | if getattr(f, "domain", None) is not None: 44 | 45 | vset_domain = True 46 | 47 | # Access to personal features.json 48 | feat_json = load_json(json_path) 49 | 50 | # Assign domain and tag 51 | domain = getattr(f, "domain", None) 52 | tag = getattr(f, "tag", None) 53 | 54 | # Feature specifications 55 | # Description 56 | if f.__doc__ is not None: 57 | descrip = f.__doc__.split("\n")[0] 58 | else: 59 | descrip = "" 60 | # Feature usage 61 | use = "yes" 62 | # Feature function arguments 63 | args_name = inspect.getfullargspec(f)[0] 64 | 65 | # Access feature parameters 66 | if args_name != "": 67 | # Retrieve default values of arguments 68 | spec = inspect.getfullargspec(f) 69 | defaults = dict(zip(spec.args[::-1], (spec.defaults or ())[::-1])) 70 | defaults.update(spec.kwonlydefaults or {}) 71 | 72 | for p in args_name[1:]: 73 | if p not in list(defaults.keys()): 74 | if p == 'fs': 75 | # Assigning a default value for fs if not given 76 | defaults[p] = 100 77 | else: 78 | defaults[p] = None 79 | if len(defaults) == 0: 80 | defaults = "" 81 | else: 82 | defaults = "" 83 | 84 | # Settings of new feature 85 | new_feature = {"description": descrip, 86 | "parameters": defaults, 87 | "function": fname, 88 | "use": use 89 | } 90 | 91 | # 
Check if domain exists 92 | try: 93 | feat_json[domain][fname] = new_feature 94 | except KeyError: 95 | feat_json[domain] = {fname: new_feature} 96 | 97 | # Insert tag if it is declared 98 | if tag is not None: 99 | feat_json[domain][fname]['tag'] = tag 100 | 101 | # Write new feature on json file 102 | with open(json_path, "w") as fout: 103 | json.dump(feat_json, fout, indent=" ") 104 | 105 | # Calculate feature complexity 106 | compute_complexity(fname, domain, json_path, features_path=features_path) 107 | print('Feature '+str(fname)+' was added.') 108 | 109 | if vset_domain is False: 110 | warnings.warn('No features were added. Please declare @set_domain.', stacklevel=2) 111 | 112 | 113 | -------------------------------------------------------------------------------- /ltsm/utils/timefeatures.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from pandas.tseries import offsets 6 | from pandas.tseries.frequencies import to_offset 7 | 8 | 9 | class TimeFeature: 10 | def __init__(self): 11 | pass 12 | 13 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 14 | pass 15 | 16 | def __repr__(self): 17 | return self.__class__.__name__ + "()" 18 | 19 | 20 | class SecondOfMinute(TimeFeature): 21 | """Second of minute encoded as value between [-0.5, 0.5]""" 22 | 23 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 24 | return index.second / 59.0 - 0.5 25 | 26 | 27 | class MinuteOfHour(TimeFeature): 28 | """Minute of hour encoded as value between [-0.5, 0.5]""" 29 | 30 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 31 | return index.minute / 59.0 - 0.5 32 | 33 | 34 | class HourOfDay(TimeFeature): 35 | """Hour of day encoded as value between [-0.5, 0.5]""" 36 | 37 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 38 | return index.hour / 23.0 - 0.5 39 | 40 | 41 | class DayOfWeek(TimeFeature): 42 | """Day of week 
encoded as value between [-0.5, 0.5]""" 43 | 44 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 45 | return index.dayofweek / 6.0 - 0.5 46 | 47 | 48 | class DayOfMonth(TimeFeature): 49 | """Day of month encoded as value between [-0.5, 0.5]""" 50 | 51 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 52 | return (index.day - 1) / 30.0 - 0.5 53 | 54 | 55 | class DayOfYear(TimeFeature): 56 | """Day of year encoded as value between [-0.5, 0.5]""" 57 | 58 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 59 | return (index.dayofyear - 1) / 365.0 - 0.5 60 | 61 | 62 | class MonthOfYear(TimeFeature): 63 | """Month of year encoded as value between [-0.5, 0.5]""" 64 | 65 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 66 | return (index.month - 1) / 11.0 - 0.5 67 | 68 | 69 | class WeekOfYear(TimeFeature): 70 | """Week of year encoded as value between [-0.5, 0.5]""" 71 | 72 | def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: 73 | return (index.isocalendar().week - 1) / 52.0 - 0.5 74 | 75 | 76 | def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]: 77 | """ 78 | Returns a list of time features that will be appropriate for the given frequency string. 79 | Parameters 80 | ---------- 81 | freq_str 82 | Frequency string of the form [multiple][granularity] such as "12H", "5min", "1D" etc. 
83 | """ 84 | 85 | features_by_offsets = { 86 | offsets.YearEnd: [], 87 | offsets.QuarterEnd: [MonthOfYear], 88 | offsets.MonthEnd: [MonthOfYear], 89 | offsets.Week: [DayOfMonth, WeekOfYear], 90 | offsets.Day: [DayOfWeek, DayOfMonth, DayOfYear], 91 | offsets.BusinessDay: [DayOfWeek, DayOfMonth, DayOfYear], 92 | offsets.Hour: [HourOfDay, DayOfWeek, DayOfMonth, DayOfYear], 93 | offsets.Minute: [ 94 | MinuteOfHour, 95 | HourOfDay, 96 | DayOfWeek, 97 | DayOfMonth, 98 | DayOfYear, 99 | ], 100 | offsets.Second: [ 101 | SecondOfMinute, 102 | MinuteOfHour, 103 | HourOfDay, 104 | DayOfWeek, 105 | DayOfMonth, 106 | DayOfYear, 107 | ], 108 | } 109 | 110 | offset = to_offset(freq_str) 111 | 112 | for offset_type, feature_classes in features_by_offsets.items(): 113 | if isinstance(offset, offset_type): 114 | return [cls() for cls in feature_classes] 115 | 116 | supported_freq_msg = f""" 117 | Unsupported frequency {freq_str} 118 | The following frequencies are supported: 119 | Y - yearly 120 | alias: A 121 | M - monthly 122 | W - weekly 123 | D - daily 124 | B - business days 125 | H - hourly 126 | T - minutely 127 | alias: min 128 | S - secondly 129 | """ 130 | raise RuntimeError(supported_freq_msg) 131 | 132 | 133 | def time_features(dates, freq='h'): 134 | return np.vstack([feat(dates) for feat in time_features_from_frequency_str(freq)]) 135 | -------------------------------------------------------------------------------- /ltsm/data_provider/tokenizer/standard_scaler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | from sklearn.preprocessing import StandardScaler as SKStandardScaler 5 | 6 | from ltsm.common.base_processor import BaseProcessor 7 | from typing import Tuple, List 8 | 9 | 10 | class StandardScaler(BaseProcessor): 11 | """ 12 | Represents a Standard Scaler object that uses Sklearn's Standard Scaler for data processing. 
13 | 14 | Attributes: 15 | module_id (str): The identifier for base processor objects. 16 | """ 17 | module_id = "standard_scaler" 18 | 19 | def __init__(self): 20 | self._scaler = None 21 | 22 | def process(self, raw_data: np.ndarray, train_data: List[np.ndarray], val_data: List[np.ndarray], test_data: List[np.ndarray], fit_train_only:bool=False, do_anomaly:bool=False)->Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: 23 | """ 24 | Standardizes the training, validation, and test sets by removing the mean and scaling to unit variance. 25 | 26 | Args: 27 | raw_data (np.ndarray): The raw data. 28 | train_data (List[np.ndarray]): The list of training sequences. 29 | val_data (List[np.ndarray]): The list of validation sequences. 30 | test_data (List[np.ndarray]): The list of test sequences. 31 | fit_train_only (bool): If True, each scaler is fitted on the training sequence only; otherwise it is fitted on the corresponding raw sequence. 32 | do_anomaly (bool): If True, the last sequence is treated as the anomaly label and is returned unscaled. 33 | Returns: 34 | Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: 35 | A tuple of three lists containing the processed training, validation, and test data. 
36 | """ 37 | scaled_train_data, scaled_val_data, scaled_test_data = [], [], [] 38 | for i, (raw_sequence, train_sequence, val_sequence, test_sequence) in enumerate(zip( 39 | raw_data, 40 | train_data, 41 | val_data, 42 | test_data, 43 | )): 44 | if do_anomaly and i == len(raw_data) - 1: # Skip anomaly label 45 | scaled_train_data.append(train_sequence) 46 | scaled_val_data.append(val_sequence) 47 | scaled_test_data.append(test_sequence) 48 | continue 49 | 50 | train_sequence = train_sequence.reshape(-1, 1) 51 | val_sequence = val_sequence.reshape(-1, 1) 52 | test_sequence = test_sequence.reshape(-1, 1) 53 | 54 | self._scaler = SKStandardScaler() # NOTE(review): a fresh scaler per sequence; only the last one stays in self._scaler 55 | 56 | if fit_train_only: 57 | self._scaler.fit(train_sequence) 58 | else: 59 | self._scaler.fit(raw_sequence.reshape(-1, 1)) 60 | 61 | scaled_train_data.append(self._scaler.transform(train_sequence).flatten()) 62 | scaled_val_data.append(self._scaler.transform(val_sequence).flatten()) 63 | scaled_test_data.append(self._scaler.transform(test_sequence).flatten()) 64 | 65 | return scaled_train_data, scaled_val_data, scaled_test_data 66 | 67 | def inverse_process(self, data: np.ndarray)->np.ndarray: 68 | """ 69 | Scales back the data to its original representation, using the scaler currently stored on the instance (after process(), that is the scaler fitted on the last scaled sequence). 70 | 71 | Args: 72 | data (np.ndarray): The data to scale back. 73 | 74 | Returns: 75 | np.ndarray: The scaled back data, with the same shape as the input. 76 | """ 77 | assert self._scaler is not None, "StandardScaler has not been fitted" 78 | raw_shape = data.shape 79 | data = self._scaler.inverse_transform(data.reshape(-1, 1)) 80 | 81 | return data.reshape(raw_shape) 82 | 83 | def save(self, save_dir: str): 84 | """ 85 | Saves the scaler to the save_dir directory as a Pickle file named processor.pkl. 86 | 87 | Args: 88 | save_dir (str): The directory where to store the scaler. 
89 | """ 90 | save_path = os.path.join(save_dir, "processor.pkl") 91 | with open(save_path, 'wb') as f: 92 | pickle.dump(self._scaler, f) 93 | 94 | def load(self, save_dir): 95 | """ 96 | Loads the scaler from the Pickle file processor.pkl in the save_dir directory. 97 | 98 | Args: 99 | save_dir (str): The directory where the scaler was saved. 100 | """ 101 | save_path = os.path.join(save_dir, "processor.pkl") 102 | with open(save_path, 'rb') as f: 103 | self._scaler = pickle.load(f) # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source 104 | 105 | -------------------------------------------------------------------------------- /tests/data_reader/train_database_reader_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | import numpy as np 4 | from io import StringIO 5 | from unittest.mock import MagicMock, patch 6 | import taosws 7 | 8 | # Functions under test, imported from the train_database_reader module 9 | from ltsm.data_reader.train_database_reader import create_connection, setup_database, setup_tables, insert_data_from_csv, retrieve_data_to_csv 10 | 11 | class TestDatabaseOperations(unittest.TestCase): 12 | 13 | def setUp(self): 14 | # Simulated database connection 15 | self.conn = MagicMock(spec=taosws.Connection) 16 | self.cursor = MagicMock() 17 | self.conn.cursor.return_value = self.cursor 18 | 19 | # A larger, complex CSV input data (1000 rows, 10 float columns, 1 int column as Label) 20 | num_rows = 1000 21 | num_features = 10 22 | np.random.seed(42) 23 | float_data = np.random.rand(num_rows, num_features) # Generate random float values between 0 and 1 24 | label_data = np.random.randint(0, 2, size=(num_rows, 1)) # Random integer values 0 or 1 for 'Label' 25 | 26 | # Combine float data and label column to create a full dataset 27 | data = np.hstack((float_data, label_data)) 28 | columns = [f'Feature{i + 1}' for i in range(num_features)] + ['Label'] 29 | 30 | # Create a Pandas DataFrame from the generated data 31 | self.df = pd.DataFrame(data, columns=columns) 
32 | 33 | # Ensure 'Label' is an integer type 34 | self.df['Label'] = self.df['Label'].astype(int) 35 | 36 | self.input_csv = StringIO(self.df.to_csv(index=False)) 37 | 38 | # Sample expected table creation schema 39 | self.expected_schema = ( 40 | "CREATE TABLE IF NOT EXISTS test_table (" 41 | "ts TIMESTAMP, " + 42 | ", ".join([f"`Feature{i + 1}` FLOAT" for i in range(num_features)]) + 43 | ", Label INT)" 44 | ) 45 | 46 | @patch('taosws.connect') 47 | def test_create_connection(self, mock_connect): 48 | # Test connection creation 49 | mock_connect.return_value = self.conn 50 | connection = create_connection() 51 | mock_connect.assert_called_once() 52 | self.assertIsNotNone(connection) 53 | 54 | def test_setup_database(self): 55 | # Test database setup 56 | setup_database(self.conn, 'test_database') 57 | self.cursor.execute.assert_called_with("CREATE DATABASE IF NOT EXISTS test_database") 58 | 59 | def test_setup_tables(self): 60 | # Test table creation 61 | setup_tables(self.conn, 'test_database', 'test_table', self.df) 62 | self.cursor.execute.assert_any_call(f"USE test_database") 63 | self.cursor.execute.assert_any_call(self.expected_schema) 64 | 65 | def test_insert_data_from_csv(self): 66 | # Test data insertion 67 | insert_data_from_csv(self.conn, 'test_database', self.input_csv, 'test_table') 68 | self.cursor.execute.assert_any_call(f"USE test_database") 69 | # Check that data is being inserted 70 | self.assertTrue( 71 | any("INSERT INTO test_table VALUES" in call[0][0] for call in self.cursor.execute.call_args_list), 72 | "Insert data command was not called correctly." 
73 | ) 74 | 75 | def test_retrieve_data_to_csv(self): 76 | # Mock fetched data and column descriptions 77 | self.cursor.fetchall.side_effect = [ 78 | [tuple(row) for row in self.df.values], # Fetched data as tuples 79 | [(f'Feature{i+1}',) for i in range(10)] + [('Label',)] # Column names 80 | ] 81 | 82 | output_file = "test_output.csv" 83 | retrieve_data_to_csv(self.conn, 'test_database', 'test_table', output_file) 84 | 85 | # Verify that the SELECT command was called 86 | self.cursor.execute.assert_any_call(f"SELECT * FROM test_table") 87 | 88 | # Check if output file is created and matches expected data structure 89 | result_df = pd.read_csv(output_file) 90 | self.assertEqual(len(result_df), 1000, "Output file does not have the expected number of rows.") 91 | for i in range(1, 11): 92 | self.assertTrue(f'Feature{i}' in result_df.columns, f"Expected column Feature{i} not found in output CSV.") 93 | self.assertTrue('Label' in result_df.columns, "Expected column 'Label' not found in output CSV.") 94 | 95 | if __name__ == '__main__': 96 | unittest.main() -------------------------------------------------------------------------------- /ltsm/prompt_reader/stat_prompt/tsfel/feature_extraction/features_settings.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tsfel 3 | import numpy as np 4 | 5 | 6 | def load_json(json_path): 7 | """Loads and parses the JSON file located at json_path. 8 | 9 | Parameters 10 | ---------- 11 | json_path : string 12 | Path to the JSON file 13 | 14 | Returns 15 | ------- 16 | Dict 17 | Parsed content of the file 18 | 19 | """ 20 | 21 | return json.load(open(json_path)) # NOTE(review): the file object is never closed explicitly; a with-block would release it deterministically 22 | 23 | 24 | def get_features_by_domain(domain=None, json_path=None): 25 | """Creates a dictionary with the features settings by domain. 26 | 27 | Parameters 28 | ---------- 29 | domain : string 30 | Available domains: "statistical"; "spectral"; "temporal" 31 | If domain equals None, then the features settings from all domains are returned. 
32 | json_path : string 33 | Directory of json file. Default: package features.json directory 34 | 35 | Returns 36 | ------- 37 | Dict 38 | Dictionary with the features settings 39 | 40 | """ 41 | 42 | if json_path is None: 43 | json_path = tsfel.__path__[0] + "/feature_extraction/features.json" 44 | 45 | if domain not in ['statistical', 'temporal', 'spectral', None]: 46 | raise SystemExit( 47 | 'No valid domain. Choose: statistical, temporal, spectral or None (for all feature settings).') 48 | 49 | dict_features = load_json(json_path) 50 | if domain is None: 51 | return dict_features 52 | else: 53 | return {domain: dict_features[domain]} 54 | 55 | 56 | def get_features_by_tag(tag=None, json_path=None): 57 | """Creates a dictionary with the features settings by tag. 58 | 59 | Parameters 60 | ---------- 61 | tag : string 62 | Available tags: "audio"; "inertial", "ecg"; "eeg"; "emg". 63 | If tag equals None then, all available features are returned. 64 | json_path : string 65 | Directory of json file. Default: package features.json directory 66 | 67 | Returns 68 | ------- 69 | Dict 70 | Dictionary with the features settings 71 | 72 | """ 73 | if json_path is None: 74 | json_path = tsfel.__path__[0] + "/feature_extraction/features.json" 75 | 76 | if tag not in ["audio", "inertial", "ecg", "eeg", "emg", None]: 77 | raise SystemExit( 78 | "No valid tag. 
Choose: audio, inertial, ecg, eeg, emg or None.") 79 | features_tag = {} 80 | dict_features = load_json(json_path) 81 | if tag is None: 82 | return dict_features 83 | else: 84 | for domain in dict_features: 85 | features_tag[domain] = {} 86 | for feat in dict_features[domain]: 87 | if dict_features[domain][feat]["use"] == "no": 88 | continue 89 | # Features without a "tag" entry raise KeyError below and are skipped 90 | try: 91 | js_tag = dict_features[domain][feat]["tag"] 92 | if isinstance(js_tag, list): 93 | if any([tag in js_t for js_t in js_tag]): # NOTE(review): `in` is a substring test if the list items are strings - confirm this is intended 94 | features_tag[domain].update({feat: dict_features[domain][feat]}) 95 | elif js_tag == tag: 96 | features_tag[domain].update({feat: dict_features[domain][feat]}) 97 | except KeyError: 98 | continue 99 | # To remove empty dicts 100 | return dict([[d, features_tag[d]] for d in list(features_tag.keys()) if bool(features_tag[d])]) 101 | 102 | 103 | def get_number_features(dict_features): 104 | """Count the total number of features based on input parameters of each feature, skipping features whose "use" setting is "no" 105 | 106 | Parameters 107 | ---------- 108 | dict_features : dict 109 | Dictionary with features settings 110 | 111 | Returns 112 | ------- 113 | int 114 | Feature vector size 115 | """ 116 | number_features = 0 117 | for domain in dict_features: 118 | for feat in dict_features[domain]: 119 | if dict_features[domain][feat]["use"] == "no": 120 | continue 121 | n_feat = dict_features[domain][feat]["n_features"] 122 | 123 | if isinstance(n_feat, int): 124 | number_features += n_feat 125 | else: 126 | n_feat_param = dict_features[domain][feat]["parameters"][n_feat] 127 | if isinstance(n_feat_param, int): 128 | number_features += n_feat_param 129 | else: 130 | number_features += eval("len(" + n_feat_param + ")") # NOTE(review): eval() executes a string taken from the settings dict; confirm the settings source is trusted 131 | 132 | return number_features 133 | -------------------------------------------------------------------------------- /tests/data_reader/npy_database_reader_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as 
np 3 | import pandas as pd 4 | from unittest.mock import MagicMock, patch 5 | import taosws 6 | import os 7 | 8 | # Functions under test, imported from the npy_database_reader module 9 | from ltsm.data_reader.npy_database_reader import create_connection, setup_database, setup_tables, insert_data_from_npy, retrieve_data_to_npy 10 | 11 | class TestDatabaseOperations(unittest.TestCase): 12 | 13 | def setUp(self): 14 | # Simulated database connection 15 | self.conn = MagicMock(spec=taosws.Connection) 16 | self.cursor = MagicMock() 17 | self.conn.cursor.return_value = self.cursor 18 | 19 | # Generate a large, complex synthetic NumPy array for testing (1000 rows, 50 columns) 20 | self.num_rows = 1000 21 | self.num_cols = 50 22 | np.random.seed(42) 23 | self.data = np.random.rand(self.num_rows, self.num_cols) # Random floats in [0, 1) 24 | 25 | # Save the array to test_data.npy in the current working directory 26 | self.test_npy_file = 'test_data.npy' 27 | np.save(self.test_npy_file, self.data) 28 | 29 | # Create a DataFrame from the NumPy array for table setup 30 | self.df = pd.DataFrame(self.data) 31 | self.table_name = 'test_table' 32 | 33 | @patch('taosws.connect') 34 | def test_create_connection(self, mock_connect): 35 | # Test the connection creation 36 | mock_connect.return_value = self.conn 37 | connection = create_connection() 38 | mock_connect.assert_called_once() 39 | self.assertIsNotNone(connection) 40 | 41 | def test_setup_database(self): 42 | # Test database setup 43 | setup_database(self.conn, 'test_database') 44 | self.cursor.execute.assert_called_with("CREATE DATABASE IF NOT EXISTS test_database") 45 | 46 | def test_setup_tables(self): 47 | # Test table creation with a large number of columns 48 | setup_tables(self.conn, 'test_database', self.table_name, self.df) 49 | self.cursor.execute.assert_any_call(f"USE test_database") 50 | self.cursor.execute.assert_any_call(f"DROP TABLE IF EXISTS {self.table_name}") 51 | 52 | # Check that the table creation schema was 
executed correctly 53 | expected_schema_columns = [f"`{i}` FLOAT" for i in range(self.num_cols)] 54 | expected_schema = f"CREATE TABLE IF NOT EXISTS {self.table_name} (ts TIMESTAMP, {', '.join(expected_schema_columns)})" 55 | self.cursor.execute.assert_any_call(expected_schema) 56 | 57 | def test_insert_data_from_npy(self): 58 | # Test data insertion from .npy file with batch processing 59 | insert_data_from_npy(self.conn, 'test_database', self.test_npy_file, self.table_name, batch_size=100) 60 | self.cursor.execute.assert_any_call(f"USE test_database") 61 | 62 | # Check that data is inserted in batches (NOTE(review): batch_size=100 is passed above, but the failure message below mentions 200) 63 | batch_inserts = [call for call in self.cursor.execute.call_args_list if "INSERT INTO" in call[0][0]] 64 | self.assertGreaterEqual(len(batch_inserts), 5, "Expected at least 5 batch insertions for 1000 rows with a batch size of 200.") 65 | 66 | def test_retrieve_data_to_npy(self): 67 | # Mock fetched data and column descriptions for the retrieval test 68 | self.cursor.fetchall.side_effect = [ 69 | [tuple(row) for row in self.data], # Mocked data returned as tuples 70 | [(f'{i}',) for i in range(self.num_cols)] # Mocked column names 71 | ] 72 | 73 | output_file = 'test_output.npy' 74 | retrieve_data_to_npy(self.conn, 'test_database', self.table_name, output_file) 75 | 76 | # Verify that the SELECT command was called 77 | self.cursor.execute.assert_any_call(f"SELECT * FROM {self.table_name}") 78 | 79 | # Load and check the output file 80 | result_array = np.load(output_file) 81 | self.assertEqual(result_array.shape, (self.num_rows, self.num_cols), # Full (rows, cols) shape of the mocked data; nothing is subtracted here 82 | "Output file shape does not match expected data shape.") 83 | np.testing.assert_array_almost_equal(self.data, result_array, decimal=5, 84 | err_msg="Output data does not match expected data.") 85 | 86 | # Clean up by deleting the generated .npy file 87 | if os.path.exists(output_file): 88 | os.remove(output_file) 89 | print(f"Cleaned up generated file: {output_file}") 90 | 91 | 
if os.path.exists('test_data.npy'): 92 | os.remove('test_data.npy') 93 | print(f"Cleaned up generated file: test_data.npy") 94 | 95 | if __name__ == '__main__': 96 | unittest.main() 97 | -------------------------------------------------------------------------------- /tests/data_provider/prompt_generator_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import pandas as pd 4 | import numpy as np 5 | import torch 6 | from ltsm.data_provider.prompt_generator import save_data, prompt_save 7 | 8 | @pytest.fixture 9 | def setup_prompt(mocker, tmp_path): 10 | """set up the test environment""" 11 | mocker.patch.dict('sys.modules', {'tsfel': mocker.MagicMock()}) 12 | 13 | sample_prompt_buf = { 14 | 'train': pd.DataFrame({ 15 | 'feature1': np.random.rand(10), 16 | 'feature2': np.random.rand(10) 17 | }), 18 | 'val': pd.DataFrame({ 19 | 'feature1': np.random.rand(5), 20 | 'feature2': np.random.rand(5) 21 | }), 22 | 'test': pd.DataFrame({ 23 | 'feature1': np.random.rand(5), 24 | 'feature2': np.random.rand(5) 25 | }) 26 | } 27 | 28 | output_path = str(tmp_path) 29 | data_name = "test_data" 30 | ifTest = False 31 | 32 | for split in ["train", "val", "test"]: 33 | split_dir = os.path.join(output_path, split) 34 | os.makedirs(split_dir, exist_ok=True) 35 | 36 | return prompt_save, sample_prompt_buf, output_path, data_name, ifTest 37 | 38 | @pytest.mark.parametrize("save_format", ["pth.tar", "csv", "npz"]) 39 | def test_prompt_save(setup_prompt, save_format): 40 | """test if the prompt data is saved correctly in different formats and loaded back correctly 41 | """ 42 | prompt_save, sample_prompt_buf, output_path, data_name, ifTest = setup_prompt 43 | prompt_save(sample_prompt_buf, output_path, data_name, save_format, ifTest) 44 | 45 | for split in ["train", "val", "test"]: 46 | split_dir = os.path.join(output_path, split) 47 | for index, col in sample_prompt_buf[split].T.iterrows(): 48 | file_name = 
f"{data_name}_{index}_prompt.{save_format}" 49 | file_path = os.path.join(split_dir, file_name) 50 | assert os.path.exists(file_path), f"File {file_path} does not exist" 51 | 52 | prompt_data = col 53 | prompt_data.columns = [index] 54 | prompt_data = prompt_data.T 55 | 56 | if save_format == "pth.tar": 57 | load_data = torch.load(file_path) 58 | elif save_format == "csv": 59 | load_data = pd.read_csv(file_path) 60 | if isinstance(load_data, pd.DataFrame): 61 | load_data = load_data.squeeze() 62 | elif save_format == "npz": 63 | loaded = np.load(file_path) 64 | load_data = pd.Series(data=loaded["data"], index=loaded["index"], name=loaded["name"].item()) 65 | if isinstance(load_data, pd.DataFrame): 66 | load_data = load_data.squeeze() 67 | else: 68 | raise ValueError(f"Unsupported save format: {save_format}") 69 | 70 | assert type(load_data) == type(prompt_data), f"Type mismatch: {type(load_data)} vs {type(prompt_data)}" 71 | assert load_data.shape == prompt_data.shape, f"Shape mismatch: {load_data.shape} vs {prompt_data.shape}" 72 | assert load_data.index.equals(prompt_data.index), "Index mismatch" 73 | assert load_data.name == prompt_data.name, f"Series names mismatch: {load_data.name} vs {prompt_data.name}" 74 | assert np.allclose(load_data.values, prompt_data.values, rtol=1e-8, atol=1e-8), "Data values mismatch" 75 | if save_format != "csv": 76 | assert load_data.equals(prompt_data), f"Data mismatch: {load_data} vs {prompt_data}" 77 | print(f"All tests passed for {file_path}") 78 | 79 | 80 | @pytest.fixture 81 | def setup_save(): 82 | """input data for testing""" 83 | data = pd.DataFrame([range(133)]) 84 | print(data.shape) 85 | return data 86 | 87 | @pytest.mark.parametrize("save_format", ["pth.tar", "csv", "npz"]) 88 | def test_save_data(tmpdir, setup_save, save_format): 89 | """test save_data function: save data in different formats and load it back to check if the data is saved correctly""" 90 | data_path = os.path.join(tmpdir, f"test_data.{save_format}") 91 
| 92 | save_data(setup_save, data_path, save_format) 93 | 94 | if save_format == "pth.tar": 95 | loaded_data = torch.load(data_path) 96 | elif save_format == "csv": 97 | loaded_data = pd.read_csv(data_path) 98 | loaded_data.columns = loaded_data.columns.astype(int) 99 | elif save_format == "npz": 100 | loaded = np.load(data_path) 101 | loaded_data = pd.DataFrame(data=loaded["data"]) 102 | 103 | assert isinstance(loaded_data, pd.DataFrame), "Loaded data should be a DataFrame" 104 | assert loaded_data.shape == setup_save.shape, f"Shape mismatch: {loaded_data.shape} vs {setup_save.shape}" 105 | assert loaded_data.columns.equals(setup_save.columns), "Columns mismatch" 106 | assert np.allclose(loaded_data.values, setup_save.values, rtol=1e-8, atol=1e-8), "Data values mismatch" 107 | 108 | 109 | -------------------------------------------------------------------------------- /tests/evaluate_pipeline/evaluation_pipeline_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from ltsm.evaluate_pipeline.evaluation_pipeline import EvaluationPipeline 3 | import random 4 | 5 | 6 | class TestEvaluationPipeline(unittest.TestCase): 7 | def setUp(self): 8 | # Complex data for testing 9 | # Predicted ranges with overlaps, gaps, and exact matches 10 | self.x_test = [ 11 | (0, 10), (15, 25), (30, 40), (45, 55), (60, 70) 12 | ] 13 | # True ranges with partial overlaps, non-overlapping ranges, and complete overlaps 14 | self.y_test = [ 15 | (5, 15), (20, 30), (35, 50), (65, 75) 16 | ] 17 | 18 | # Large, random test set 19 | self.x_test_large = self.generate_random_ranges(100, 0, 1000) 20 | self.y_test_large = self.generate_random_ranges(100, 0, 1000) 21 | 22 | @staticmethod 23 | def generate_random_ranges(count, range_start, range_end): 24 | """Generate random range pairs.""" 25 | ranges = [] 26 | for _ in range(count): 27 | start = random.randint(range_start, range_end - 10) 28 | end = start + random.randint(5, 20) 29 | 
ranges.append((start, end)) 30 | return ranges 31 | 32 | def test_overlap(self): 33 | # Test overlap size with varying ranges 34 | pipeline = EvaluationPipeline(self.x_test, self.y_test) 35 | 36 | # Overlap between first predicted and first true range 37 | overlap_1 = pipeline.overlap_size((0, 10), (5, 15)) 38 | self.assertAlmostEqual(overlap_1, 0.545, places=2) # Partial overlap 39 | 40 | overlap_2 = pipeline.overlap_size((17, 29), (23, 33)) 41 | self.assertAlmostEqual(overlap_2, 0.5384615384615384, places=2) # Partial overlap 42 | 43 | # Ranges that share only the endpoint 20 44 | overlap_3 = pipeline.overlap_size((16, 20), (20, 35)) 45 | self.assertAlmostEqual(overlap_3, 0.2, places=2) 46 | 47 | # No overlap 48 | overlap_4 = pipeline.overlap_size((15, 25), (35, 50)) 49 | self.assertEqual(overlap_4, 0) 50 | 51 | # Complete overlap 52 | overlap_5 = pipeline.overlap_size((30, 40), (30, 40)) 53 | self.assertEqual(overlap_5, 1.0) 54 | 55 | def test_cardinality_factor(self): 56 | # Test cardinality factor with overlapping ranges 57 | pipeline = EvaluationPipeline(self.x_test, self.y_test) 58 | 59 | # A range identical to the true range (35, 50) in y_test 60 | cardinality_1 = pipeline.cardinality_factor((35, 50), self.y_test) 61 | self.assertGreaterEqual(cardinality_1, 1.0) 62 | 63 | # A range with no overlaps 64 | cardinality_2 = pipeline.cardinality_factor((100, 110), self.y_test) 65 | self.assertEqual(cardinality_2, 1.0) 66 | 67 | def test_large_random_data(self): 68 | # Test pipeline with large randomized data (generated with unseeded `random`, so inputs differ between runs) 69 | pipeline = EvaluationPipeline(self.x_test_large, self.y_test_large) 70 | 71 | recall = pipeline.evaluate_recall_score() 72 | precision = pipeline.evaluate_precision_score() 73 | f1_score = pipeline.evaluate_f1_score() 74 | 75 | # Check scores are within bounds 76 | self.assertGreaterEqual(recall, 0.0) 77 | self.assertGreaterEqual(precision, 0.0) 78 | self.assertGreaterEqual(f1_score, 0.0) 79 | self.assertLessEqual(recall, 1.0) 80 | self.assertLessEqual(precision, 1.0) 81 | 
def test_f1_score_consistency(self):
    """F1 must match 2PR / (P + R), or be exactly 0.0 when P + R is not positive."""
    evaluator = EvaluationPipeline(self.x_test, self.y_test)
    recall_value = evaluator.evaluate_recall_score()
    precision_value = evaluator.evaluate_precision_score()
    f1_value = evaluator.evaluate_f1_score()

    denominator = precision_value + recall_value
    if not denominator > 0:
        # Nothing to average: the pipeline is expected to report a zero F1.
        self.assertEqual(f1_value, 0.0)
        return

    harmonic_mean = 2 * (precision_value * recall_value) / denominator
    self.assertAlmostEqual(f1_value, harmonic_mean, places=2)
class EncoderLayer(nn.Module):
    """Attention block followed by a kernel-size-1 convolutional feed-forward block.

    Args:
        attention: callable invoked as ``attention(q, k, v, attn_mask=...)`` that
            returns ``(output, attention_weights)``.
        d_model: size of the last dimension of the input.
        d_ff: hidden width of the feed-forward block; falls back to ``4 * d_model``
            when falsy.
        dropout: dropout probability shared by every dropout call in the layer.
        activation: ``"relu"`` selects ``F.relu``; any other value selects ``F.gelu``.
    """

    def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"):
        super(EncoderLayer, self).__init__()
        hidden_width = d_ff or 4 * d_model
        self.attention = attention
        # kernel_size=1 convolutions act position-wise over the channel dimension
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=hidden_width, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=hidden_width, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        if activation == "relu":
            self.activation = F.relu
        else:
            self.activation = F.gelu

    def forward(self, x, attn_mask=None):
        """Return ``(layer_output, attention_weights)`` for input ``x``."""
        attended, attn = self.attention(x, x, x, attn_mask=attn_mask)
        residual = self.norm1(x + self.dropout(attended))

        # Conv1d expects channels on dim 1, so swap dims 1 and -1 around the convolutions
        hidden = self.conv1(residual.transpose(-1, 1))
        hidden = self.dropout(self.activation(hidden))
        projected = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm2(residual + projected), attn
class DecoderLayer(nn.Module):
    """Self-attention, cross-attention and a kernel-size-1 convolutional feed-forward block.

    Args:
        self_attention: callable invoked as ``self_attention(x, x, x, attn_mask=...)``;
            only element ``[0]`` of its result is used.
        cross_attention: callable invoked as ``cross_attention(x, cross, cross, attn_mask=...)``;
            only element ``[0]`` of its result is used.
        d_model: size of the last dimension of the input.
        d_ff: hidden width of the feed-forward block; falls back to ``4 * d_model``
            when falsy.
        dropout: dropout probability shared by every dropout call in the layer.
        activation: ``"relu"`` selects ``F.relu``; any other value selects ``F.gelu``.
    """

    def __init__(self, self_attention, cross_attention, d_model, d_ff=None,
                 dropout=0.1, activation="relu"):
        super(DecoderLayer, self).__init__()
        hidden_width = d_ff or 4 * d_model
        self.self_attention = self_attention
        self.cross_attention = cross_attention
        self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=hidden_width, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=hidden_width, out_channels=d_model, kernel_size=1)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        if activation == "relu":
            self.activation = F.relu
        else:
            self.activation = F.gelu

    def forward(self, x, cross, x_mask=None, cross_mask=None):
        """Return the decoded representation of ``x`` attending to ``cross``."""
        # Self-attention sub-block with residual connection
        self_attended = self.self_attention(x, x, x, attn_mask=x_mask)[0]
        x = self.norm1(x + self.dropout(self_attended))

        # Cross-attention sub-block: queries from x, keys/values from the encoder output
        cross_attended = self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0]
        residual = self.norm2(x + self.dropout(cross_attended))

        # Conv1d expects channels on dim 1, so swap dims 1 and -1 around the convolutions
        hidden = self.conv1(residual.transpose(-1, 1))
        hidden = self.dropout(self.activation(hidden))
        projected = self.dropout(self.conv2(hidden).transpose(-1, 1))

        return self.norm3(residual + projected)
class Transpose(nn.Module):
    """Module wrapper around ``Tensor.transpose`` so it can be placed in a layer stack.

    Args:
        *dims: the dimensions forwarded to ``x.transpose(*dims)``.
        contiguous: when true, ``.contiguous()`` is called on the transposed tensor.
    """

    def __init__(self, *dims, contiguous=False):
        super().__init__()
        self.dims = dims
        self.contiguous = contiguous

    def forward(self, x):
        swapped = x.transpose(*self.dims)
        return swapped.contiguous() if self.contiguous else swapped
def Coord2dPosEncoding(q_len, d_model, exponential=False, normalize=True, eps=1e-3, verbose=False):
    """Build a 2-D coordinate positional encoding of shape ``(q_len, d_model)``.

    The encoding is ``2 * r**x * c**x - 1`` where ``r`` and ``c`` are evenly spaced
    grids on ``[0, 1]`` along the two axes. The exponent ``x`` starts at ``0.5``
    (``exponential=True``) or ``1`` and is nudged in steps of ``0.001`` for at most
    100 iterations, stopping early once the mean of the encoding is within ``eps``
    of zero.

    Args:
        q_len: number of rows of the encoding.
        d_model: number of columns of the encoding.
        exponential: start the exponent search from ``0.5`` instead of ``1``.
        normalize: subtract the mean and divide by ``10 * std`` before returning.
        eps: tolerance on the absolute mean that ends the exponent search.
        verbose: print the iteration index, exponent and mean at every step.

    Returns:
        torch.Tensor: the encoding from the last search iteration (optionally normalized).
    """
    x = .5 if exponential else 1
    # The coordinate grids do not depend on the exponent, so build them once.
    rows = torch.linspace(0, 1, q_len).reshape(-1, 1)
    cols = torch.linspace(0, 1, d_model).reshape(1, -1)
    for i in range(100):
        cpe = 2 * (rows ** x) * (cols ** x) - 1
        mean = cpe.mean()
        # The original called an undefined helper `pv`; print directly instead.
        if verbose:
            print(f'{i:4.0f} {x:5.3f} {mean:+6.3f}')
        if abs(mean) <= eps:
            break
        elif mean > eps:
            x += .001
        else:
            x -= .001
    if normalize:
        cpe = cpe - cpe.mean()
        cpe = cpe / (cpe.std() * 10)
    return cpe
class PatchTST(PreTrainedModel):
    """PatchTST forecaster wrapped as a Hugging Face ``PreTrainedModel``.

    When ``config.decomposition`` is truthy the input is split by ``series_decomp``
    into a residual and a trend component, each handled by its own backbone and
    summed; otherwise a single backbone processes the input directly.
    """
    config_class = PatchTSTConfig

    def __init__(self, config: PatchTSTConfig, **kwargs):
        super().__init__(config)

        self.decomposition = config.decomposition
        if self.decomposition:
            self.decomp_module = series_decomp(config.kernel_size)
            # Trend is built before residual to keep the original registration order.
            self.model_trend = self._build_backbone(config)
            self.model_res = self._build_backbone(config)
        else:
            self.model = self._build_backbone(config)

    @staticmethod
    def _build_backbone(config: PatchTSTConfig):
        """Create one ``PatchTST_backbone`` from ``config`` (positional order matters)."""
        return PatchTST_backbone(
            config.enc_in,
            config.seq_len,
            config.pred_len,
            config.patch_len,
            config.stride,
            config.max_seq_len,
            config.n_layers,
            config.d_model,
            config.n_heads,
            config.d_k,
            config.d_v,
            config.d_ff,
            config.norm,
            config.attn_dropout,
            config.dropout,
            config.activation,
            config.key_padding_mask,
            config.padding_var,
            config.attn_mask,
            config.res_attention,
            config.pre_norm,
            config.store_attn,
            config.pe,
            config.learn_pe,
            config.fc_dropout,
            config.head_dropout,
            config.padding_patch,
            config.pretrain_head,
            config.head_type,
            config.individual,
            config.revin,
            config.affine,
            config.subtract_last,
            config.verbose
        )

    def forward(self, x: Tensor):
        """Run the model on ``x``; dims 1 and 2 are swapped before and after the backbone."""
        if self.decomposition:
            res_init, trend_init = self.decomp_module(x)
            res_init, trend_init = res_init.permute(0, 2, 1), trend_init.permute(0, 2, 1)  # [Batch, Channel, Input length]
            res = self.model_res(res_init)
            trend = self.model_trend(trend_init)
            x = res + trend
            x = x.permute(0, 2, 1)  # back to [Batch, length, Channel]
        else:
            x = x.permute(0, 2, 1)  # [Batch, Channel, Input length]
            x = self.model(x)
            x = x.permute(0, 2, 1)  # back to [Batch, length, Channel]
        return x
model = get_model(config.model_config, model_name=config.train_params["model"], local_pretrain=config.train_params["local_pretrain"]) 66 | assert model is not None 67 | assert isinstance(model, PreTrainedModel) 68 | 69 | def test_parameter_count(config): 70 | model = get_model(config.model_config, model_name=config.train_params["model"], local_pretrain=config.train_params["local_pretrain"]) 71 | param_count = sum([p.numel() for p in model.parameters() if p.requires_grad]) 72 | 73 | # Encoder Embedding parameter count 74 | expected_param_count = config.model_config.d_model*config.model_config.enc_in*3 + 4*config.model_config.d_model 75 | 76 | # Decoder Embedding parameter count 77 | expected_param_count += config.model_config.d_model*config.model_config.dec_in*3 + 4*config.model_config.d_model 78 | 79 | # Encoder parameter count 80 | # Encoder layer Conv 81 | encoder_param_count = 2*config.model_config.d_model*config.model_config.d_ff + config.model_config.d_model + config.model_config.d_ff 82 | # Encoder Layer Norm 83 | encoder_param_count += 4*config.model_config.d_model 84 | # Attention Layer 85 | encoder_param_count += 4*(config.model_config.d_model*config.model_config.d_model + config.model_config.d_model) 86 | # Multiply by number of encoder layers 87 | encoder_param_count *= config.model_config.e_layers 88 | 89 | # Conv layer 90 | encoder_param_count += (config.model_config.e_layers-1)*(config.model_config.d_model*config.model_config.d_model*3 + 3*config.model_config.d_model) 91 | # Layer Norm 92 | encoder_param_count += 2*config.model_config.d_model 93 | 94 | expected_param_count += encoder_param_count 95 | 96 | # Decoder layer parameter count 97 | # Decoder Conv layers 98 | decoder_param_count = 2*config.model_config.d_model*config.model_config.d_ff + config.model_config.d_model + config.model_config.d_ff 99 | # Decoder Layer Norm 100 | decoder_param_count += 6*config.model_config.d_model 101 | # Attention Layer 102 | decoder_param_count += 
def test_forward_output_shape(config):
    """The model output must have shape ``(batch, pred_len, c_out)`` for zero inputs."""
    # The inputs are float64 numpy arrays, so the model is built with a float64 default
    # dtype. Restore the previous default afterwards so other tests are not affected
    # by this global setting.
    previous_dtype = torch.get_default_dtype()
    torch.set_default_dtype(torch.float64)
    try:
        model = get_model(config.model_config, model_name=config.train_params["model"], local_pretrain=config.train_params["local_pretrain"])
        batch_size = 32
        input_length = config.model_config.seq_len
        enc_inp = torch.tensor(np.zeros((batch_size, input_length, config.model_config.enc_in)))
        enc_mark = torch.tensor(np.zeros((batch_size, input_length, 4)))
        dec_inp = torch.tensor(np.zeros((batch_size, input_length, config.model_config.dec_in)))
        dec_mark = torch.tensor(np.zeros((batch_size, input_length, 4)))
        output = model(enc_inp, enc_mark, dec_inp, dec_mark)
        assert output.size() == torch.Size([batch_size, config.model_config.pred_len, config.model_config.c_out])
    finally:
        torch.set_default_dtype(previous_dtype)
@dataclass
class DLinearConfig(PretrainedConfig):
    """
    Configuration holder for the DLinear model.

    Stores ``seq_len``, ``pred_len``, ``individual`` and ``enc_in`` as attributes;
    any additional keyword arguments are forwarded to ``PretrainedConfig``.
    """

    def __init__(self, seq_len: int=336, pred_len: int=96, individual: bool=0, enc_in: int=1, **kwargs):
        super().__init__(**kwargs)
        # Assigned left to right, so attribute order matches the signature order.
        self.seq_len, self.pred_len = seq_len, pred_len
        self.individual, self.enc_in = individual, enc_in
@dataclass
class PatchTSTConfig(PretrainedConfig):
    """
    PatchTSTConfig is a configuration class for the PatchTST model.
    It contains all the necessary parameters to initialize the model.

    Every named argument is stored as an attribute of the same name, except
    ``act``, which is stored as ``self.activation``. Remaining keyword arguments
    are forwarded to ``PretrainedConfig``.
    """

    def __init__(self, seq_len=336, pred_len=96, enc_in=1, patch_len=16, stride=8, decomposition=False, max_seq_len:Optional[int]=1024,
                 n_layers:int=3, d_model=128, n_heads=16, d_k:Optional[int]=None, d_v:Optional[int]=None,
                 d_ff:int=256, norm:str='BatchNorm', attn_dropout:float=0., dropout:float=0., act:str="gelu", key_padding_mask:bool='auto',
                 padding_var:Optional[int]=None, attn_mask:Optional[Tensor]=None, res_attention:bool=True, pre_norm:bool=False, store_attn:bool=False,
                 pe:str='zeros', learn_pe:bool=True, fc_dropout:float=0., head_dropout = 0, padding_patch = None,
                 pretrain_head:bool=False, head_type = 'flatten', individual = False, revin = True, affine = True, subtract_last = False,
                 verbose:bool=False, embed='timeF', **kwargs):
        super().__init__(**kwargs)
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.enc_in = enc_in
        self.patch_len = patch_len
        self.stride = stride
        self.decomposition = decomposition
        self.max_seq_len = max_seq_len
        self.n_layers = n_layers
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_k
        self.d_v = d_v
        self.d_ff = d_ff
        self.norm = norm
        self.attn_dropout = attn_dropout
        self.dropout = dropout
        self.activation = act  # exposed under the name the model reads
        self.key_padding_mask = key_padding_mask
        self.padding_var = padding_var
        self.attn_mask = attn_mask
        self.res_attention = res_attention
        self.pre_norm = pre_norm
        self.store_attn = store_attn
        self.pe = pe
        self.learn_pe = learn_pe
        self.fc_dropout = fc_dropout
        self.head_dropout = head_dropout
        self.padding_patch = padding_patch
        self.pretrain_head = pretrain_head
        self.head_type = head_type
        self.individual = individual
        self.revin = revin
        self.affine = affine
        self.subtract_last = subtract_last
        # A trailing comma here previously stored the tuple ``(verbose,)``, which is
        # always truthy; store the flag itself.
        self.verbose = verbose
        self.embed = embed
class Informer(PreTrainedModel):
    """
    Informer with ProbSparse attention in O(LlogL) complexity

    The model embeds the encoder and decoder inputs, runs a ``ProbAttention``
    encoder (with ``ConvLayer`` distilling between layers when ``config.distil``
    is truthy) and a ``ProbAttention`` decoder, and projects to ``config.c_out``.

    Raises:
        ValueError: if ``config.embed_type`` is not one of 0, 1, 2, 3 or 4.
    """
    config_class = InformerConfig

    def __init__(self, config: InformerConfig, **kwargs):
        super().__init__(config)
        self.pred_len = config.pred_len
        self.output_attention = config.output_attention

        # Embedding: types 0 and 1 both map to the full DataEmbedding.
        embedding_by_type = {
            0: DataEmbedding,
            1: DataEmbedding,
            2: DataEmbedding_wo_pos,
            3: DataEmbedding_wo_temp,
            4: DataEmbedding_wo_pos_temp,
        }
        try:
            embedding_cls = embedding_by_type[config.embed_type]
        except KeyError:
            # Fail at construction time instead of leaving the embeddings undefined
            # and crashing later inside forward().
            raise ValueError(
                f"Unsupported embed_type {config.embed_type!r}; expected one of "
                f"{sorted(embedding_by_type)}"
            ) from None
        self.enc_embedding = embedding_cls(config.enc_in, config.d_model, config.embed, config.freq,
                                           config.dropout)
        self.dec_embedding = embedding_cls(config.dec_in, config.d_model, config.embed, config.freq,
                                           config.dropout)

        # Encoder
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(
                        ProbAttention(False, config.factor, attention_dropout=config.dropout,
                                      output_attention=config.output_attention),
                        config.d_model, config.n_heads),
                    config.d_model,
                    config.d_ff,
                    dropout=config.dropout,
                    activation=config.activation
                ) for _ in range(config.e_layers)
            ],
            [
                ConvLayer(
                    config.d_model
                ) for _ in range(config.e_layers - 1)
            ] if config.distil else None,
            norm_layer=torch.nn.LayerNorm(config.d_model)
        )

        # Decoder
        # NOTE(review): InformerConfig as visible here does not declare `d_layers`
        # (it only arrives through **kwargs), so fall back to a single decoder layer
        # when the attribute is absent -- confirm this default with the config owner.
        decoder_depth = getattr(config, "d_layers", 1)
        self.decoder = Decoder(
            [
                DecoderLayer(
                    AttentionLayer(
                        ProbAttention(True, config.factor, attention_dropout=config.dropout, output_attention=False),
                        config.d_model, config.n_heads),
                    AttentionLayer(
                        ProbAttention(False, config.factor, attention_dropout=config.dropout, output_attention=False),
                        config.d_model, config.n_heads),
                    config.d_model,
                    config.d_ff,
                    dropout=config.dropout,
                    activation=config.activation,
                )
                for _ in range(decoder_depth)
            ],
            norm_layer=torch.nn.LayerNorm(config.d_model),
            projection=nn.Linear(config.d_model, config.c_out, bias=True)
        )

    def forward(self, x_enc: Tensor, x_mark_enc: Tensor, x_dec: Tensor, x_mark_dec: Tensor,
                enc_self_mask: Tensor=None, dec_self_mask: Tensor=None, dec_enc_mask: Tensor=None):
        """Return the last ``pred_len`` decoder steps, plus encoder attentions if enabled.

        Args:
            x_enc: encoder input values.
            x_mark_enc: encoder input marks passed to the embedding alongside ``x_enc``.
            x_dec: decoder input values.
            x_mark_dec: decoder input marks passed to the embedding alongside ``x_dec``.
            enc_self_mask: attention mask forwarded to the encoder.
            dec_self_mask: self-attention mask forwarded to the decoder.
            dec_enc_mask: cross-attention mask forwarded to the decoder.

        Returns:
            ``dec_out[:, -pred_len:, :]``, or a ``(dec_out[:, -pred_len:, :], attns)``
            tuple when ``self.output_attention`` is truthy.
        """
        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask)

        if self.output_attention:
            return dec_out[:, -self.pred_len:, :], attns
        else:
            return dec_out[:, -self.pred_len:, :]  # [B, L, D]
def prompt_prune(pt):
    """Delete, in place, every entry of ``pt`` whose key starts with "0_FFT mean coefficient".

    ``pt`` must provide ``to_dict()`` and support ``del pt[key]``. The same object
    is returned after pruning.
    """
    # Snapshot the keys first so deleting from `pt` cannot disturb the iteration.
    for name in tuple(pt.to_dict()):
        if not name.startswith("0_FFT mean coefficient"):
            continue
        del pt[name]

    return pt
enumerate(data_path_buf): 94 | prompt_data = torch.load(dataset_path) 95 | prompt_data_buf.append(prompt_data) 96 | print("Import from {}".format(dataset_path)) 97 | # print(prompt_data) 98 | 99 | # if index == 100: 100 | # break 101 | 102 | # print(prompt_data_buf) 103 | # print(output_path_buf) 104 | 105 | prompt_data_all = pd.concat(prompt_data_buf, axis=0).values 106 | print(prompt_data_all.shape) 107 | # (3166, 133) 108 | 109 | # nan_index = np.where(np.isnan(prompt_data_all))[0] 110 | # prompt_data_all[nan_index] = 0 111 | 112 | # ipdb.set_trace() 113 | tsne = manifold.TSNE(n_components=2, init='pca', random_state=0) 114 | prompt_data_tsne = tsne.fit_transform(prompt_data_all) 115 | dataset_plot_buf = ["electricity"] 116 | color_buf = ["red", "blue", "black", "green", "pink", "brown"] 117 | marker_buf = [".", "^", "x"] 118 | for index, _ in enumerate(dataset_name): 119 | for sindex, split_fold in enumerate(split_buf): 120 | data_index = (np.array(dataset_dir_buf) == index) 121 | split_index = (np.array(dataset_split_buf) == sindex) 122 | plot_index = data_index & split_index 123 | plt.plot(prompt_data_tsne[plot_index, 0], prompt_data_tsne[plot_index, 1], linewidth=0, marker=marker_buf[sindex], label=str(dataset_name[index][0:8] + "-" + split_fold), color=color_buf[index]) 124 | # plt.text(prompt_data_tsne[data_index, 0].mean()-20, prompt_data_tsne[data_index, 1].mean(), str(dataset_name[index][0:8]), fontdict={'weight': 'bold', 'size': 9}) 125 | 126 | plt.legend(loc="right") 127 | plt.savefig("./figures/stat_prompt_tsne.png") 128 | plt.close() 129 | 130 | # ipdb.set_trace() 131 | # plt.xticks([]) 132 | # plt.yticks([]) 133 | 134 | # print(prompt_data_all) 135 | # , color = plt.cm.Set1(dataset_dir_buf[index]) 136 | # print(prompt_data_transform) 137 | # print(prompt_data_transform_array.mean(axis=0)) 138 | # print(prompt_data_transform_array.std(axis=0)) 139 | # print(prompt_data_transform.loc[5]) 140 | 141 | 142 | 143 | 144 | 145 | 146 | 
# --------------------------------------------------------------------------------
# /multi_agents_pipeline/agents/Planning_Agent.py:
# --------------------------------------------------------------------------------
from .custom_messages import TextMessage, TSTaskMessage, TSMessage
from typing import Optional, List
from autogen_core import RoutedAgent, default_subscription, message_handler, MessageContext, TopicId
from autogen_core.models import ChatCompletionClient, SystemMessage
from pydantic import ValidationError
from pydantic import BaseModel


@default_subscription
class PlanningAgent(RoutedAgent):
    """A planning agent that uses OpenAI API to generate tasks for a Time Series Agent and QA Agent.

    Args:
        name (str): The name of the agent.
        model_client (ChatCompletionClient): The ChatCompletion client.
    """
    def __init__(self, name: str, model_client: ChatCompletionClient) -> None:
        super().__init__("planning_agent")
        self.name = name
        self._model_client = model_client
        # Prepended to every request issued by send_message_to_openai().
        self._system_messages = [SystemMessage(content="You are a helpful AI assistant.")]

    async def send_message_to_openai(self, messages: List[SystemMessage], ctx: MessageContext, json_output: Optional[bool | type[BaseModel]] = False) -> str:
        """Sends messages to OpenAI and returns the response content.

        Args:
            messages (List[SystemMessage]): The list of messages to send to OpenAI.
            ctx (MessageContext): Context of the message being handled; its
                cancellation token is forwarded to the model client.
            json_output (bool | type[BaseModel], optional): Forwarded unchanged to
                the model client. Callers in this class pass a pydantic model class.

        Returns:
            str: The response content from OpenAI.

        Raises:
            ValueError: If the response content is not a string.
        """
        response = await self._model_client.create(
            messages=self._system_messages + messages,
            cancellation_token=ctx.cancellation_token,
            json_output=json_output)
        if isinstance(response.content, str):
            return response.content
        else:
            raise ValueError("Response content is not a valid JSON string")

    async def generate_ts_task(self, original_message: TSTaskMessage, ctx: MessageContext) -> TSMessage:
        """Generates a time series task message based on the original message.

        Args:
            original_message (TSTaskMessage): The original message containing the task description and filepath.
            ctx (MessageContext): Context of the message being handled.

        Returns:
            TSMessage: A new TSMessage with the task type and description.

        Raises:
            ValueError: If the model response cannot be validated as a TSMessage.
        """
        ts_message = SystemMessage(
            source="user",
            content=f"""The task for the time series analysis is: {original_message.description}.
            The time-series data is stored at {original_message.filepath}. Provide a detailed description of the data
            based on the task description. Also, provide what type of analysis would be required to complete the task among
            the following types: ["statistical forecasting", "anomaly detection"].
            """
        )

        response_content = await self.send_message_to_openai([ts_message], ctx, json_output=TSMessage)

        try:
            ts_task = TSMessage.model_validate_json(response_content)
            ts_task.source = "planner"  # Set the source to the Planning Agent
            ts_task.filepath = original_message.filepath  # Ensure the filepath is preserved
            # The caller (handle_ts_task_message) publishes the returned task.
            return ts_task
        except ValidationError as e:
            # The response is validated against TSMessage here, so name that type.
            raise ValueError(f"Response content is not a valid TSMessage: {e}") from e

    async def generate_qa_task(self, original_message: TSTaskMessage, ctx: MessageContext) -> TextMessage:
        """Generates a QA task message based on the original message.

        Args:
            original_message (TSTaskMessage): The original message containing the task description and filepath.
            ctx (MessageContext): Context of the message being handled.

        Returns:
            TextMessage: A new TextMessage with the task description.

        Raises:
            ValueError: If the model response cannot be validated as a TextMessage.
        """
        task_message = SystemMessage(
            source="user",
            content=f"""Write a descriptive task for the following prompt: {original_message.description}.
            The time-series data is stored at {original_message.filepath}.
            """
        )

        response_content = await self.send_message_to_openai([task_message], ctx, json_output=TextMessage)

        try:
            qa_task = TextMessage.model_validate_json(response_content)
            qa_task.source = "planner"  # Set the source to the Planning Agent
            # The caller (handle_ts_task_message) publishes the returned task.
            return qa_task
        except ValidationError as e:
            raise ValueError(f"Response content is not a valid TextMessage: {e}") from e

    @message_handler
    async def handle_ts_task_message(self, message: TSTaskMessage, ctx: MessageContext) -> None:
        """Handles incoming time series task messages and generates a response using the OpenAI Assistant API.

        The TS task is generated and published on the "Planner-TS" topic first,
        then the QA task is generated and published on the "Planner-QA" topic.

        Args:
            message (TSTaskMessage): The incoming message containing the user's query.
            ctx (MessageContext): Context of the message being handled.
        """
        ts_task = await self.generate_ts_task(message, ctx)
        print(f"[{self.name}] Sending TS task to TS Agent...")
        await self.publish_message(
            ts_task,
            TopicId(type="Planner-TS", source=self.id.key)
        )
        #await self.send_message(ts_task, AgentId("ts_agent", "default"))

        qa_task = await self.generate_qa_task(message, ctx)
        print(f"[{self.name}] Sending QA task to QA Agent...")
        await self.publish_message(
            qa_task,
            TopicId(type="Planner-QA", source=self.id.key)
        )
        #await self.send_message(qa_task, AgentId("qa_agent", "default"))
# --------------------------------------------------------------------------------
# /ltsm/models/utils.py:
# --------------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn
from math import sqrt
from transformers.modeling_utils import PreTrainedModel, PretrainedConfig

class Normalize(nn.Module):
    """Reversible normalization: 'norm' stores statistics, 'denorm' undoes it."""

    def __init__(self, num_features: int, eps=1e-5, affine=False, subtract_last=False, non_norm=False):
        """
        :param num_features: the number of features or channels
        :param eps: a value added for numerical stability
        :param affine: if True, RevIN has learnable affine parameters
        :param subtract_last: if True, center on the last time step instead of the mean
        :param non_norm: if True, _normalize/_denormalize return the input unchanged
        """
        super(Normalize, self).__init__()
        self.num_features = num_features
        self.eps = eps
        self.affine = affine
        self.subtract_last = subtract_last
        self.non_norm = non_norm
        if self.affine:
            self._init_params()

    def forward(self, x, mode: str):
        """Apply normalization (mode='norm') or its inverse (mode='denorm').

        'norm' recomputes and stores the statistics of ``x`` first; 'denorm'
        reuses whatever the most recent 'norm' call stored.

        :raises NotImplementedError: for any other ``mode``
        """
        if mode == 'norm':
            self._get_statistics(x)
            x = self._normalize(x)
        elif mode == 'denorm':
            x = self._denormalize(x)
        else:
            raise NotImplementedError
        return x

    def _init_params(self):
        # initialize RevIN params: (C,)
        self.affine_weight = nn.Parameter(torch.ones(self.num_features))
        self.affine_bias = nn.Parameter(torch.zeros(self.num_features))

    def _get_statistics(self, x):
        # Reduce over every dimension except the first and the last.
        dim2reduce = tuple(range(1, x.ndim - 1))
        if self.subtract_last:
            self.last = x[:, -1, :].unsqueeze(1)
        else:
            self.mean = torch.mean(x, dim=dim2reduce, keepdim=True).detach()
        # stdev is needed in both centering modes (see _normalize/_denormalize).
        self.stdev = torch.sqrt(torch.var(x, dim=dim2reduce, keepdim=True, unbiased=False) + self.eps).detach()

    def _normalize(self, x):
        if self.non_norm:
            return x
        if self.subtract_last:
            x = x - self.last
        else:
            x = x - self.mean
        x = x / self.stdev
        if self.affine:
            x = x * self.affine_weight
            x = x + self.affine_bias
        return x

    def _denormalize(self, x):
        if self.non_norm:
            return x
        if self.affine:
            x = x - self.affine_bias
            # eps**2 keeps the division finite if a learned weight reaches zero.
            x = x / (self.affine_weight + self.eps * self.eps)
        x = x * self.stdev
        if self.subtract_last:
            x = x + self.last
        else:
            x = x + self.mean
        return x


class FlattenHead(nn.Module):
    """Flatten the last two dimensions, then apply a linear layer and dropout."""

    def __init__(self, n_vars, nf, target_window, head_dropout=0):
        super().__init__()
        self.n_vars = n_vars
        self.flatten = nn.Flatten(start_dim=-2)
        self.linear = nn.Linear(nf, target_window)
        self.dropout = nn.Dropout(head_dropout)

    def forward(self, x):
        # After flattening, the last dimension must equal ``nf``.
        x = self.flatten(x)
        x = self.linear(x)
        x = self.dropout(x)
        return x

class ReprogrammingLayer(nn.Module):
    """Multi-head cross-attention from a batched target to an unbatched source.

    Queries come from ``target_embedding`` (B, L, d_model); keys and values come
    from 2-D ``source_embedding`` / ``value_embedding`` with last dimension
    ``d_llm``. The output is projected to ``d_llm``.
    """

    def __init__(self, d_model, n_heads, d_keys=None, d_llm=None, attention_dropout=0.1):
        super(ReprogrammingLayer, self).__init__()

        d_keys = d_keys or (d_model // n_heads)

        self.query_projection = nn.Linear(d_model, d_keys * n_heads)
        self.key_projection = nn.Linear(d_llm, d_keys * n_heads)
        self.value_projection = nn.Linear(d_llm, d_keys * n_heads)
        self.out_projection = nn.Linear(d_keys * n_heads, d_llm)
        self.n_heads = n_heads
        self.dropout = nn.Dropout(attention_dropout)

    def forward(self, target_embedding, source_embedding, value_embedding):
        B, L, _ = target_embedding.shape
        S, _ = source_embedding.shape
        H = self.n_heads

        # Split each projection into H heads.
        target_embedding = self.query_projection(target_embedding).view(B, L, H, -1)
        source_embedding = self.key_projection(source_embedding).view(S, H, -1)
        value_embedding = self.value_projection(value_embedding).view(S, H, -1)

        out = self.reprogramming(target_embedding, source_embedding, value_embedding)

        # Merge the heads back into a single feature dimension.
        out = out.reshape(B, L, -1)

        return self.out_projection(out)

    def reprogramming(self, target_embedding, source_embedding, value_embedding):
        """Scaled dot-product attention with dropout on the attention weights."""
        B, L, H, E = target_embedding.shape

        scale = 1. / sqrt(E)

        scores = torch.einsum("blhe,she->bhls", target_embedding, source_embedding)

        # Softmax over the source axis (last dimension of ``scores``).
        A = self.dropout(torch.softmax(scale * scores, dim=-1))
        reprogramming_embedding = torch.einsum("bhls,she->blhe", A, value_embedding)

        return reprogramming_embedding

def freeze_parameters(model: PreTrainedModel):
    """
    Sets certain model parameters to non-trainable, and specific parameters to trainable, based on predefined
    lists of layer names to freeze or keep trainable.

    Matching is by substring of the parameter name. The second pass runs over
    all parameters, so a name matching both lists ends up trainable.
    """
    freeze_param_buf = ["gpt2"]
    for n, p in model.named_parameters():
        if any(fp in n for fp in freeze_param_buf):
            p.requires_grad = False
            print(f"{n} has been freeezed")

    trainable_param_buf = ["ln", "wpe", "in_layer", "out_layer", "lora"]
    for n, p in model.named_parameters():
        if any(fp in n for fp in trainable_param_buf):
            p.requires_grad = True

def print_trainable_parameters(model):
    """
    Prints the names of parameters in the model that are trainable.
    """
    for n, p in model.named_parameters():
        if p.requires_grad:
            print(f"{n} is trainable...")
# --------------------------------------------------------------------------------
# /ltsm/data_reader/database_reader.py:
# --------------------------------------------------------------------------------
import taosws
import pandas as pd
import os

# change to your own
datapath = "original_upload"
output_folder = 'original_download'
database = "time_series_demo"
user = "root"
password = "taosdata"

# create_connection() function to connect to the database. (change host and port to your own)
def create_connection(host='35.153.211.255', port=6041):
    """Open a taosws connection using the module-level ``user``/``password``.

    Prints a status line either way and re-raises any connection error.
    """
    conn = None
    try:
        conn = taosws.connect(
            user=user,
            password=password,
            host=host,
            port=port,
        )
        print(f"Connected to {host}:{port} successfully.")
        return conn
    except Exception as err:
        print(f"Failed to connect to {host}:{port}, ErrMessage: {err}")
        raise err


# setup_database() function to create a new database if it doesn't exist.
def setup_database(conn, database):
    """Create ``database`` if it does not exist; print and re-raise on failure."""
    try:
        cursor = conn.cursor()
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database}")
        # Report the database actually created, not a hard-coded name.
        print(f"Database {database} set up successfully.")
    except Exception as err:
        print(f"Error setting up database: {err}")
        raise err


# setup_tables() function to create tables based on CSV column names and data types.
def setup_tables(conn, database, table_name, df):
    """Drop and recreate ``table_name`` with a schema derived from ``df``.

    The first DataFrame column is always declared as ``ts TIMESTAMP``. Every
    other column gets a SQL type chosen from its pandas dtype, and spaces in
    its name are replaced with underscores.

    Args:
        conn: Connection object exposing ``cursor()``.
        database: Name of the database to ``USE``.
        table_name: Name of the table to drop and recreate.
        df: DataFrame whose columns define the schema.

    Raises:
        Exception: Any error is printed and then re-raised.
    """
    # Ordered (dtype predicate, SQL type) rules; the first match wins and
    # anything unmatched (e.g. object/text) falls back to STRING.
    type_rules = (
        (pd.api.types.is_float_dtype, "FLOAT"),
        (pd.api.types.is_integer_dtype, "INT"),
        (pd.api.types.is_bool_dtype, "BOOL"),
        (pd.api.types.is_datetime64_any_dtype, "TIMESTAMP"),
    )
    try:
        cur = conn.cursor()
        cur.execute(f"USE {database}")
        cur.execute(f"DROP TABLE IF EXISTS {table_name}")

        column_defs = ["ts TIMESTAMP"]
        for name in df.columns[1:]:
            col_dtype = df[name].dtype
            sql_type = next(
                (sql for matches, sql in type_rules if matches(col_dtype)),
                "STRING",
            )
            column_defs.append(f"{name.replace(' ', '_')} {sql_type}")

        schema = "(" + ", ".join(column_defs) + ")"
        cur.execute(f"CREATE TABLE IF NOT EXISTS {table_name} {schema}")
        print(f"Table {table_name} set up successfully with schema: {schema}")
    except Exception as err:
        print(f"Error setting up database or table {table_name}: {err}")
        raise err


# insert_data_from_csv() function to insert data from CSV files into tables.
def _sql_literal(value):
    """Render one CSV cell as a SQL literal for an INSERT statement."""
    if pd.isna(value):
        return "NULL"
    if isinstance(value, str):
        # Escape backslashes first, then single quotes, so text such as
        # "O'Brien" cannot terminate the quoted literal early.
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"
    if isinstance(value, bool):
        return "true" if value else "false"
    return str(value)


def insert_data_from_csv(conn, database, csv_file, table_name):
    """Load ``csv_file`` and insert its rows into ``table_name``.

    The first CSV column is parsed as a timestamp (unparseable values become
    NaT). The table is dropped and recreated via ``setup_tables`` before the
    rows are inserted one statement at a time; each statement is printed.

    Args:
        conn: Connection object exposing ``cursor()``.
        database: Name of the database to ``USE``.
        csv_file: Path of the CSV file to read.
        table_name: Destination table name.

    Raises:
        Exception: Any error is printed and then re-raised.
    """
    try:
        cursor = conn.cursor()
        df = pd.read_csv(csv_file)

        # Ensure the first column is a timestamp
        ts_column = df.columns[0]
        df[ts_column] = pd.to_datetime(df[ts_column], errors='coerce')

        setup_tables(conn, database, table_name, df)
        cursor.execute(f"USE {database}")

        value_columns = df.columns[1:]
        for _, row in df.iterrows():
            values = [f"'{row[ts_column]}'"]  # Timestamp value
            values.extend(_sql_literal(row[col]) for col in value_columns)

            insert_query = f"INSERT INTO {table_name} VALUES({', '.join(values)})"
            print(insert_query)
            cursor.execute(insert_query)

        print(f"Data from {csv_file} inserted into table {table_name} successfully.")
    except Exception as err:
        print(f"Error inserting data from {csv_file} into {table_name}: {err}")
        raise err


# retrieve_data_to_csv() function to retrieve data from a table and save it to a CSV file.
def retrieve_data_to_csv(conn, database, table_name, output_file):
    """Dump every row of ``table_name`` to ``output_file`` as CSV.

    Column names are taken from the first field of each ``DESCRIBE`` row. The
    CSV is written without an index column.

    Args:
        conn: Connection object exposing ``cursor()``.
        database: Name of the database to ``USE``.
        table_name: Table to read.
        output_file: Destination CSV path.

    Raises:
        Exception: Any error is printed and then re-raised.
    """
    try:
        cur = conn.cursor()
        cur.execute(f"USE {database}")

        cur.execute(f"SELECT * FROM {table_name}")
        rows = cur.fetchall()

        cur.execute(f"DESCRIBE {table_name}")
        header = []
        for field in cur.fetchall():
            header.append(field[0])

        pd.DataFrame(rows, columns=header).to_csv(output_file, index=False)
        print(f"Data from {table_name} saved to {output_file}.")
    except Exception as err:
        print(f"Error retrieving data from {table_name}: {err}")
        raise err


# Example usage
if __name__ == "__main__":
    conn = create_connection()
    if conn:
        try:
            setup_database(conn, database)

            # Upload every CSV in ``datapath``; the table is named after the file stem.
            csv_files = []
            tables = []
            for entry in os.listdir(datapath):
                if entry.endswith('.csv'):
                    csv_files.append(os.path.join(datapath, entry))
                    tables.append(os.path.splitext(entry)[0])

            for csv_file, table_name in zip(csv_files, tables):
                insert_data_from_csv(conn, database, csv_file, table_name)

            # Download each table again into ``output_folder``.
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            for table_name in tables:
                output_file = os.path.join(output_folder, f"{table_name}.csv")
                retrieve_data_to_csv(conn, database, table_name, output_file)

        finally:
            conn.close()
# --------------------------------------------------------------------------------