├── gpt_model ├── generator │ ├── baseline │ │ ├── __init__.py │ │ └── markov.py │ ├── model_config.json │ ├── run_generator_finetuning.sh │ ├── run_generator_training.sh │ ├── trainer.py │ ├── run_generating.py │ ├── dataset.py │ └── train_generator.py ├── data_preparation │ ├── form_aux_dataset.sh │ ├── README.md │ ├── preprocess_target_pcaps.py │ ├── preprocess_pretraining_pcaps.py │ └── format_parsed_as_classification_dataset.py ├── classifier │ ├── run_evaluation_scenarios.sh │ ├── model.py │ ├── dataset.py │ └── train_classifier.py ├── README.md ├── tokenizer.py └── quantizer.py ├── flow_parsing ├── static │ ├── example.pcap │ └── ip_proto_map.csv ├── __init__.py ├── aux_raw_features_plugin.py ├── utils.py ├── features.py └── pcap_parser.py ├── sklearn_classifiers ├── utils.py ├── registered_classes.py ├── config.yaml.example ├── clf_utils.py ├── run_training.py ├── knn_cosine.py └── featurizer.py ├── tests ├── test_classifiers.py ├── test_gen_metrics.py ├── test_evaluator.py ├── static │ ├── classifiers_config.yaml │ ├── quantizer_checkpoint │ │ └── ids_to_tokens.json │ └── quantized_pkts.json ├── test_fsnet.py ├── test_markov.py ├── test_pcap_parser.py ├── conftest.py ├── test_distance.py └── test_tokenizer.py ├── .gitignore ├── requirements.yaml ├── fs_net ├── README.md ├── model.py ├── dataset.py └── train_fsnet.py ├── nn_classifiers ├── dataset.py └── models.py ├── .github └── workflows │ └── python-package-conda.yml ├── settings.py ├── evaluation_utils ├── classification.py └── modeling.py ├── README.md └── LICENSE /gpt_model/generator/baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from .markov import MarkovGenerator 2 | 3 | 4 | __all__ = [ 5 | 'MarkovGenerator' 6 | ] 7 | -------------------------------------------------------------------------------- /flow_parsing/static/example.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RadionBik/ML-based-network-traffic-classifier/HEAD/flow_parsing/static/example.pcap -------------------------------------------------------------------------------- /gpt_model/data_preparation/form_aux_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ls -1d ~/Applications/traffic_dumps/separated_iot_devices/* | xargs -n1 -t python ../flow_parser.py --raw -p 4 | python ../flow_parser.py -p ~/Applications/traffic_dumps/non_iot.pcap --raw -------------------------------------------------------------------------------- /sklearn_classifiers/utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | 4 | def iterate_batch_indexes(array, batch_size): 5 | iter_num = len(array) // batch_size 6 | for iteration in tqdm(range(iter_num + 1)): 7 | start_idx = iteration * batch_size 8 | end_idx = (iteration + 1) * batch_size 9 | yield start_idx, end_idx 10 | -------------------------------------------------------------------------------- /flow_parsing/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import read_dataset, check_filename_in_patterns, save_dataset 2 | from .pcap_parser import parse_pcap_to_csv, parse_pcap_to_dataframe, init_streamer 3 | 4 | 5 | __all__ = [ 6 | read_dataset, 7 | save_dataset, 8 | check_filename_in_patterns, 9 | parse_pcap_to_dataframe, 10 | parse_pcap_to_csv, 11 | init_streamer, 12 | ] 13 | 
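For reference, a minimal usage sketch of the helpers exported above. The keyword arguments mirror the calls in `tests/test_pcap_parser.py`, and the bundled capture path comes from `tests/conftest.py`; treat it as an illustrative sketch rather than the canonical CLI entry point.

```python
# Sketch: parse the bundled example capture into flow dataframes and persist one of them.
# Keyword arguments follow tests/test_pcap_parser.py; the save location is the default one.
from flow_parsing import parse_pcap_to_dataframe, save_dataset
import settings

pcap_path = (settings.BASE_DIR / 'flow_parsing/static/example.pcap').as_posix()

# offline parsing with derivative per-flow statistics (default feature set)
flows = parse_pcap_to_dataframe(pcap_path, online_mode=False)

# raw per-packet features only, limited to the first 20 packets of each flow
raw_flows = parse_pcap_to_dataframe(pcap_path,
                                    derivative_features=False,
                                    raw_features=20,
                                    online_mode=False)

save_dataset(flows)  # writes datasets/dataset_<hash>.csv by default
```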
-------------------------------------------------------------------------------- /gpt_model/generator/model_config.json: -------------------------------------------------------------------------------- 1 | {"vocab_size": 9906, 2 | "n_positions": 128, 3 | "n_ctx": 128, 4 | "n_embd": 512, 5 | "n_layer": 6, 6 | "n_head": 8, 7 | "activation_function": "gelu_new", 8 | "resid_pdrop": 0.1, 9 | "embd_pdrop": 0.1, 10 | "attn_pdrop": 0.1, 11 | "layer_norm_epsilon": 1e-5, 12 | "initializer_range": 0.02, 13 | "bos_token_id": -10, 14 | "eos_token_id": -10, 15 | "pad_token_id": -10 16 | } -------------------------------------------------------------------------------- /tests/test_classifiers.py: -------------------------------------------------------------------------------- 1 | from sklearn_classifiers import clf_utils 2 | import settings 3 | 4 | 5 | def test_config_parsing(classif_config): 6 | cfg = clf_utils.read_classifier_settings(settings.TEST_STATIC_DIR / 'classifiers_config.yaml') 7 | assert cfg == classif_config 8 | 9 | 10 | def test_init_clfs(classif_config): 11 | clfs = clf_utils.initialize_classifiers(classif_config) 12 | assert clfs 13 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/README.md: -------------------------------------------------------------------------------- 1 | This is rather a collection of various functions that were used 2 | during dataset creation. The code was highly experimental and the exact steps 3 | has not been documented and saved, although it is not that 4 | difficult to repeat. 5 | 6 | The only module that can run as expected is `format_parsed_as_classification_dataset.py` 7 | 8 | After all, it should not be a problem, since the resulting datasets are 9 | accessible via the minio client as described in the upper-level [README.md](../README.md) -------------------------------------------------------------------------------- /tests/test_gen_metrics.py: -------------------------------------------------------------------------------- 1 | from evaluation_utils.modeling import get_ks_stat, get_wasserstein_distance_pdf 2 | import numpy as np 3 | 4 | 5 | def test_scale_invariance(): 6 | 7 | def check(f): 8 | m = f(orig, gen) 9 | m_l = f(orig * 100, gen * 100) 10 | m_a = f(orig + 10000, gen + 10000) 11 | assert np.isclose(m, m_l) 12 | assert np.isclose(m, m_a, atol=1e-2) 13 | 14 | orig = np.random.random(1000) 15 | gen = np.random.normal(size=1000) - .1 16 | check(get_ks_stat) 17 | check(get_wasserstein_distance_pdf) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *$py.class 4 | .ipynb_checkpoints/ 5 | *.ipynb 6 | 7 | #python workspaces 8 | *.res 9 | .~* 10 | #figures/ 11 | figures/*.* 12 | reports/ 13 | trained_classifiers/*.* 14 | #old code 15 | py/ 16 | .idea 17 | .directory 18 | 19 | #results 20 | *.txt 21 | *.csv 22 | *.docx 23 | *.doc 24 | *.odt 25 | *.pdf 26 | 27 | # config 28 | *.yaml 29 | 30 | venv 31 | config.ini 32 | csv_files 33 | bin 34 | *.pcap 35 | !flow_parsing/static/example.pcap 36 | 37 | # gpt_model artifacts 38 | trial_txt_gpt2 39 | trained_quantizers -------------------------------------------------------------------------------- /tests/test_evaluator.py: -------------------------------------------------------------------------------- 1 | from evaluation_utils.modeling import convert_ipt_to_iat, 
evaluate_generated_traffic, flows_to_packets 2 | 3 | 4 | def test_splitting_by_directions(raw_dataset): 5 | raw_dataset = raw_dataset.values 6 | ipt_ds = convert_ipt_to_iat(raw_dataset) 7 | assert ipt_ds.shape == raw_dataset.shape 8 | ipt_packets = flows_to_packets(ipt_ds) 9 | source_packets = flows_to_packets(raw_dataset) 10 | assert (ipt_packets[:, 0] == source_packets[:, 0]).all() 11 | 12 | 13 | def test_smoke_evaluate_generated_traffic(raw_dataset): 14 | results = evaluate_generated_traffic(raw_dataset.values, raw_dataset.values) 15 | assert all(value == 0 for key, value in results.items() if key.startswith('KL')) 16 | -------------------------------------------------------------------------------- /requirements.yaml: -------------------------------------------------------------------------------- 1 | name: classifier 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - pandas=1.0.3 8 | - numpy=1.18.4 9 | - matplotlib=3.2.1 10 | - seaborn=0.11.0 11 | - scikit-learn=0.23.1 12 | - neptune-client 13 | - psutil 14 | - pytorch::pytorch=1.6 # -- built from source due to CUDA 11.0 and apex 15 | - dpkt=1.9.2 16 | - hmmlearn=0.2.3 17 | - pytest=5.4.2 18 | - pytest-mock=3.1.1 19 | - pyyaml=5.3.1 20 | - gitpython=3.1.3 21 | - jupyterlab>=2.1 22 | - jupytext 23 | - tqdm 24 | - ipywidgets 25 | - nodejs 26 | - pip: 27 | - pytorch-lightning==0.8.5 28 | - nfstream==6.1.3 29 | - sh==1.13.1 30 | - pandarallel==1.4.8 31 | - transformers==3.0.2 32 | - ngt==1.12.2 33 | -------------------------------------------------------------------------------- /gpt_model/generator/run_generator_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=../.. python train_generator.py \ 4 | --model_name_or_path=/media/raid_store/pretrained_traffic/gpt2_model_2epochs_classes \ 5 | --finetune_on_class=Telegram \ 6 | --output_dir=/media/raid_store/pretrained_traffic/gpt2_model_telegram \ 7 | --do_train \ 8 | --train_data_file=../../datasets/train_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv \ 9 | --do_eval \ 10 | --eval_data_file=../../datasets/test_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv \ 11 | --overwrite_output_dir \ 12 | --per_device_train_batch_size=128 \ 13 | --per_device_eval_batch_size=224 \ 14 | --fp16 \ 15 | --fp16_opt_level=O2 \ 16 | --logging_steps=1 \ 17 | --save_steps=1000 \ 18 | --eval_steps=1000 \ 19 | --gradient_accumulation_steps=30 \ 20 | --num_train_epochs=10 \ 21 | --learning_rate=0.00005 \ 22 | --save_total_limit=10 \ 23 | --logging_dir=fine_runs/7 -------------------------------------------------------------------------------- /tests/static/classifiers_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | SVM: 3 | type: OneVsOneClassifier 4 | params: 5 | estimator: 6 | type: LinearSVC 7 | params: 8 | tol: 1.0e-5 9 | n_jobs: -1 10 | param_search_space: 11 | estimator__C: [0.1, 1, 10] 12 | estimator__loss: ['squared_hinge'] 13 | estimator__dual: [True, False] 14 | DecTree: 15 | type: DecisionTreeClassifier 16 | param_search_space: 17 | max_depth: 18 | from: 6 19 | till: 20 20 | step: 3 21 | max_features: 22 | from: 10 23 | till: 40 24 | step: 10 25 | criterion: 26 | - entropy 27 | GradBoost: 28 | type: GradientBoostingClassifier 29 | param_search_space: 30 | n_estimators: 31 | - 50 32 | max_depth: [2,3,4,5] 33 | learning_rate: 34 | - 0.01 35 | - 0.05 36 | - 0.1 
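The `from`/`till`/`step` blocks in `param_search_space` above are shorthand for inclusive integer ranges; the expanded values can be seen in the `classif_config` fixture in `tests/conftest.py`. A small sketch of that expansion is shown below — the helper name `expand_search_space` is hypothetical, the real parsing lives in `sklearn_classifiers/clf_utils.read_classifier_settings`:

```python
# Hypothetical helper illustrating how from/till/step entries expand into inclusive
# integer ranges; the expected values match the classif_config fixture in tests/conftest.py.
def expand_search_space(space: dict) -> dict:
    expanded = {}
    for param, values in space.items():
        if isinstance(values, dict) and {'from', 'till', 'step'} <= set(values):
            expanded[param] = list(range(values['from'], values['till'] + 1, values['step']))
        else:
            expanded[param] = values
    return expanded


dec_tree_space = {
    'max_depth': {'from': 6, 'till': 20, 'step': 3},
    'max_features': {'from': 10, 'till': 40, 'step': 10},
    'criterion': ['entropy'],
}
assert expand_search_space(dec_tree_space) == {
    'max_depth': [6, 9, 12, 15, 18],
    'max_features': [10, 20, 30, 40],
    'criterion': ['entropy'],
}
```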
-------------------------------------------------------------------------------- /gpt_model/generator/run_generator_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=../.. python train_generator.py \ 4 | --config_name=model_config.json \ 5 | --quantizer_path=trained_quantizers/quantizer_2^14_train_shuffled_0 \ 6 | --output_dir=/media/raid_store/pretrained_traffic/gpt2_model_4epochs_classes_external \ 7 | --do_train \ 8 | --train_data_file=/media/raid_store/pretrained_traffic/train_csv \ 9 | --do_eval \ 10 | --eval_data_file=/media/raid_store/pretrained_traffic/val_csv \ 11 | --overwrite_output_dir \ 12 | --per_device_train_batch_size=128 \ 13 | --per_device_eval_batch_size=224 \ 14 | --fp16 \ 15 | --fp16_opt_level=O2 \ 16 | --logging_steps=1 \ 17 | --save_steps=150 \ 18 | --eval_steps=1000 \ 19 | --gradient_accumulation_steps=30 \ 20 | --num_train_epochs=4 \ 21 | --warmup_steps=200 \ 22 | --learning_rate=0.001 \ 23 | --save_total_limit=10 \ 24 | --file_patterns_to_exclude=mawi_iot_home \ 25 | --train_with_targets 26 | #--model_name_or_path=/media/raid_store/pretrained_traffic/gpt2_model_4epochs_classes_home_iot \ -------------------------------------------------------------------------------- /sklearn_classifiers/registered_classes.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.multiclass import OneVsOneClassifier 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.neural_network import MLPClassifier 6 | from sklearn.svm import LinearSVC 7 | from sklearn.tree import DecisionTreeClassifier 8 | 9 | from sklearn_classifiers.knn_cosine import ( 10 | KNeighborsCosineClassifier, 11 | KNeighborsPuffinnClassifier, 12 | KNeighborsNGTClassifier 13 | ) 14 | 15 | REGISTERED_CLASSES = { 16 | cls.__name__: cls for cls in [ 17 | MLPClassifier, 18 | LinearSVC, 19 | DecisionTreeClassifier, 20 | RandomForestClassifier, 21 | GradientBoostingClassifier, 22 | LogisticRegression, 23 | OneVsOneClassifier, 24 | KNeighborsClassifier, 25 | KNeighborsCosineClassifier, 26 | KNeighborsPuffinnClassifier, 27 | KNeighborsNGTClassifier 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /fs_net/README.md: -------------------------------------------------------------------------------- 1 | ## FS-NET 2 | 3 | Reimplementation of FS-NET model without reconstruction loss, 4 | which harmed the performance according to the reported results in the original 5 | paper: 6 | 7 | ``` 8 | @inproceedings{LiuHXCL19, 9 | author = {Chang Liu and 10 | Longtao He and 11 | Gang Xiong and 12 | Zigang Cao and 13 | Zhen Li}, 14 | title = {FS-Net: {A} Flow Sequence Network For Encrypted Traffic Classification}, 15 | booktitle = {{IEEE} Conference on Computer Communications (INFOCOM), 2019}, 16 | pages = {1171--1179}, 17 | year = {2019} 18 | } 19 | ``` 20 | From my point of view, there is some inconsistency between the paper's 21 | description and implementation found in https://github.com/WSPTTH/FS-Net, 22 | particularly regarding the presence of Selu activation for the final 23 | output in eq. 17. 
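To make the point of contention concrete, here is a minimal sketch of the two possible classifier heads, with and without a SELU on the final dense output. This repo's `FSNETClassifier` (see `fs_net/model.py` further below) returns plain logits; the SELU variant is shown only to illustrate the alternative reading of eq. 17.

```python
import torch

# Variant used in this repo (fs_net/model.py): raw linear logits.
def head_without_selu(classifier: torch.nn.Linear, z_c: torch.Tensor) -> torch.Tensor:
    return classifier(z_c)

# Alternative reading of eq. 17: a SELU applied on top of the final dense layer.
def head_with_selu(classifier: torch.nn.Linear, z_c: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.selu(classifier(z_c))
```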
24 | 25 | As a bonus, the training script has 2 options for the model's input: 26 | (i) either packet size sequences (as in the paper), or (ii) K-Means centroids 27 | for (PS, IPT) features (similarly to the transformer model in this repo). -------------------------------------------------------------------------------- /nn_classifiers/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Tuple 3 | 4 | import torch 5 | from sklearn.model_selection import train_test_split 6 | from torch.utils.data import TensorDataset 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def get_train_val_test_datasets(X_train, y_train, X_test, y_test, device='cpu', val_part=0.9) \ 12 | -> Tuple[TensorDataset, TensorDataset, TensorDataset]: 13 | 14 | """ 15 | converts sklearn-like dataset into torch-compatible one 16 | """ 17 | def _tensor_dataset(X, y): 18 | return TensorDataset(torch.as_tensor(X, device=device, dtype=torch.float), 19 | torch.as_tensor(y, device=device, dtype=torch.long)) 20 | 21 | X_train, X_val, y_train, y_val = train_test_split( 22 | X_train, 23 | y_train, 24 | train_size=val_part, 25 | stratify=y_train, 26 | random_state=1 27 | ) 28 | train_dataset = _tensor_dataset(X_train, y_train) 29 | val_dataset = _tensor_dataset(X_val, y_val) 30 | test_dataset = _tensor_dataset(X_test, y_test) 31 | 32 | return train_dataset, val_dataset, test_dataset 33 | 34 | 35 | -------------------------------------------------------------------------------- /flow_parsing/aux_raw_features_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import dpkt 4 | import nfstream 5 | import numpy as np 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class AuxRawFeatures(nfstream.NFPlugin): 11 | @staticmethod 12 | def _fill_flow_stats(flow, packet, counter=0): 13 | flow.udps.bulk[counter] = packet.payload_size 14 | if packet.protocol == 6 and packet.ip_version == 4: 15 | decoded_packet = dpkt.ip.IP(packet.ip_packet) 16 | try: 17 | flow.udps.tcp_window[counter] = decoded_packet.data.win 18 | flow.udps.tcp_flag[counter] = decoded_packet.data.flags 19 | except AttributeError: 20 | logger.warning(f'unexpected packet format: {decoded_packet}') 21 | 22 | def on_init(self, packet, flow): 23 | flow.udps.bulk = np.ones(self.packet_limit) * -1 24 | flow.udps.tcp_window = np.zeros(self.packet_limit) 25 | flow.udps.tcp_flag = np.zeros(self.packet_limit) 26 | 27 | self._fill_flow_stats(flow, packet) 28 | 29 | def on_update(self, packet, flow): 30 | if flow.bidirectional_packets <= self.packet_limit: 31 | self._fill_flow_stats(flow, packet, flow.bidirectional_packets - 1) 32 | -------------------------------------------------------------------------------- /.github/workflows/python-package-conda.yml: -------------------------------------------------------------------------------- 1 | name: Python Package using Conda 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 3.7 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.7 17 | - name: Install dependencies 18 | run: | 19 | # $CONDA is an environment variable pointing to the root of the miniconda directory 20 | $CONDA/bin/conda env update --file requirements.yaml -n base 21 | git clone https://github.com/puffinn/puffinn /opt/puffinn && \ 22 | cd /opt/puffinn && \ 23 | 
$CONDA/bin/python3.7 setup.py build && \ 24 | $CONDA/bin/python3.7 setup.py install && \ 25 | cd - 26 | 27 | - name: Lint with flake8 28 | run: | 29 | $CONDA/bin/conda install flake8 30 | # stop the build if there are Python syntax errors or undefined names 31 | $CONDA/bin/flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | $CONDA/bin/flake8 . --count --exit-zero --max-complexity=15 --max-line-length=127 --statistics 34 | - name: Test with pytest 35 | run: | 36 | export PYTHONPATH=/home/runner/work/ML-based-network-traffic-classifier/ML-based-network-traffic-classifier 37 | $CONDA/bin/pytest 38 | -------------------------------------------------------------------------------- /gpt_model/classifier/run_evaluation_scenarios.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=../.. 4 | export PRETRAINED_MODEL_PATH=/media/raid_store/pretrained_traffic/gpt2_model_4epochs_classes_external 5 | 6 | export TRAIN_DATASET=../../datasets/train_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv 7 | export TEST_DATASET=../../datasets/test_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv 8 | 9 | 10 | python train_classifier.py \ 11 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 12 | --train_dataset=$TRAIN_DATASET \ 13 | --test_dataset=$TEST_DATASET \ 14 | --freeze_pretrained_model \ 15 | --mask_first_token 16 | 17 | python train_classifier.py \ 18 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 19 | --train_dataset=$TRAIN_DATASET \ 20 | --test_dataset=$TEST_DATASET \ 21 | --mask_first_token 22 | 23 | python train_classifier.py \ 24 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 25 | --train_dataset=$TRAIN_DATASET \ 26 | --test_dataset=$TEST_DATASET \ 27 | --freeze_pretrained_model 28 | 29 | python train_classifier.py \ 30 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 31 | --train_dataset=$TRAIN_DATASET \ 32 | --test_dataset=$TEST_DATASET \ 33 | 34 | python train.py \ 35 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 36 | --train_dataset=$TRAIN_DATASET \ 37 | --test_dataset=$TEST_DATASET \ 38 | --freeze_pretrained_model \ 39 | --reinitialize 40 | 41 | python train_classifier.py \ 42 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 43 | --train_dataset=$TRAIN_DATASET \ 44 | --test_dataset=$TEST_DATASET \ 45 | --reinitialize 46 | -------------------------------------------------------------------------------- /tests/test_fsnet.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import torch 3 | 4 | from fs_net.dataset import SimpleClassificationQuantizedDataset, ClassificationPacketSizeDataset 5 | from fs_net.model import FSNETClassifier 6 | 7 | 8 | def test_packet_ds(raw_dataset_file): 9 | ds = ClassificationPacketSizeDataset(raw_dataset_file, max_size_range=100, target_column='ndpi_app') 10 | loader = DataLoader(ds, batch_size=4, drop_last=True) 11 | for flow, target in loader: 12 | assert flow.min() == torch.tensor(1) and \ 13 | flow.max() == torch.tensor(99) and \ 14 | flow.shape == torch.Size([4, 20]) 15 | 16 | 17 | def test_forward_packet_ds(raw_dataset_file): 18 | ds = ClassificationPacketSizeDataset(raw_dataset_file, max_size_range=100, target_column='ndpi_app') 19 | loader = DataLoader(ds, batch_size=4, drop_last=True) 20 | n_classes = len(ds.target_encoder.classes_) 21 | model = FSNETClassifier({}, ds.target_encoder.classes_, 100) 22 | 
for flow, target in loader: 23 | output = model(flow) 24 | assert output.shape == torch.Size([4, n_classes]) 25 | 26 | 27 | def test_forward(tokenizer, raw_dataset_file): 28 | """ simple smoke-test """ 29 | ds = SimpleClassificationQuantizedDataset(tokenizer, dataset_path=raw_dataset_file, target_column='ndpi_app') 30 | loader = DataLoader(ds, batch_size=4, drop_last=True) 31 | n_classes = len(ds.target_encoder.classes_) 32 | model = FSNETClassifier({}, ds.target_encoder.classes_, len(tokenizer)) 33 | for flow in loader: 34 | output = model(flow[0]) 35 | assert output.shape == torch.Size([4, n_classes]) 36 | -------------------------------------------------------------------------------- /flow_parsing/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import pathlib 3 | 4 | import logging 5 | import pandas as pd 6 | 7 | from settings import BASE_DIR 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def read_dataset(filename, fill_na=False) -> pd.DataFrame: 13 | """ a simple wrapper for pandas """ 14 | dataset = pd.read_csv(filename, na_filter=True) 15 | if fill_na: 16 | dataset = dataset.fillna(0) 17 | logger.info(f'read {len(dataset)} flows from {filename}') 18 | return dataset 19 | 20 | 21 | def check_filename_in_patterns(file, patterns): 22 | if isinstance(file, pathlib.Path): 23 | file = file.name 24 | 25 | if patterns and any(pattern in file for pattern in patterns): 26 | logger.info(f'file {file} matches a pattern') 27 | return True 28 | return False 29 | 30 | 31 | def get_df_hash(df): 32 | return hashlib.md5(pd.util.hash_pandas_object(df, index=True).values).hexdigest() 33 | 34 | 35 | def get_hash(df): 36 | 37 | def _get_current_commit_hash(): 38 | """ get commit hash at HEAD """ 39 | from git import Repo 40 | repo = Repo(BASE_DIR) 41 | return repo.head.commit.hexsha 42 | 43 | try: 44 | df_hash = _get_current_commit_hash() 45 | except Exception: 46 | df_hash = get_df_hash(df) 47 | return df_hash 48 | 49 | 50 | def save_dataset(dataset, save_to=None): 51 | """ simple data tracking/versioning via hash suffixes """ 52 | 53 | if save_to is None: 54 | save_to = BASE_DIR / f'datasets/dataset_{get_hash(dataset)}.csv' 55 | dataset.to_csv(save_to, index=False) 56 | logger.info(f'saved dataset to {save_to}') 57 | return save_to 58 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import os 4 | from dataclasses import dataclass 5 | 6 | import pandas as pd 7 | 8 | logging.basicConfig(level=logging.INFO, 9 | format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s') 10 | logger = logging.getLogger() 11 | 12 | 13 | def _read_protocol_mapping() -> dict: 14 | map_file = BASE_DIR / 'flow_parsing/static/ip_proto_map.csv' 15 | pairs = pd.read_csv(map_file, header=None) 16 | return dict(pairs.values.tolist()) 17 | 18 | 19 | BASE_DIR = pathlib.Path(__file__).resolve().parent 20 | TEST_STATIC_DIR = BASE_DIR / 'tests' / 'static' 21 | DATASET_DIR = BASE_DIR / 'datasets' 22 | 23 | PCAP_OUTPUT_DIR = BASE_DIR / 'csv_files' 24 | REPORT_DIR = BASE_DIR / 'reports' 25 | CACHE_DIR = pathlib.Path('/tmp') 26 | 27 | IP_PROTO_MAPPING = _read_protocol_mapping() 28 | RANDOM_SEED = 1 29 | 30 | DEFAULT_PACKET_LIMIT_PER_FLOW = int(os.getenv('DEFAULT_PACKET_LIMIT_PER_FLOW', 20)) 31 | LOWER_BOUND_CLASS_OCCURRENCE = int(os.getenv('LOWER_BOUND_CLASS_OCCURRENCE', 50)) 32 
| 33 | # customize, if needed 34 | TARGET_CLASS_COLUMN = 'target_class' 35 | 36 | # nfstream params 37 | # the idle timeout follows many papers on traffic identification (JOY has 10 sec) 38 | IDLE_TIMEOUT = 60 39 | # active timeouts are set similarly, (Cisco's JOY tool has 30 sec) 40 | ACTIVE_TIMEOUT_ONLINE = 60 41 | ACTIVE_TIMEOUT_OFFLINE = 10**6 42 | 43 | NEPTUNE_PROJECT = 'radion/traffic-classifier' 44 | 45 | 46 | @dataclass 47 | class FilePatterns: 48 | mawi: tuple = ('202004',) 49 | mawi_unswnb_iscxvpn: tuple = ('raw_csv', '202004') 50 | iot_home: tuple = ('electronics', 'camera', '2020', 'environment', 'healthcare', 'home', 'hub', 'light', 'trigger') 51 | mawi_iot_home: tuple = ('electronics', 'camera', '2020', 'environment', 'healthcare', 'home', 'hub', 'light', 52 | 'trigger', '202004') 53 | -------------------------------------------------------------------------------- /gpt_model/classifier/model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | from torch.optim.lr_scheduler import ReduceLROnPlateau 5 | from transformers import GPT2Model, GPT2Config 6 | from transformers.optimization import AdamW 7 | 8 | from nn_classifiers.models import BaseClassifier 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class GPT2Classifier(BaseClassifier): 14 | def __init__( 15 | self, 16 | config, 17 | class_labels, 18 | pretrained_model_path, 19 | dropout=0.1, 20 | freeze_pretrained_part=True, 21 | reinitialize=False, 22 | n_layers=6, 23 | ): 24 | super().__init__(config, class_labels) 25 | 26 | if reinitialize: 27 | logger.info('resetting model weights') 28 | config = GPT2Config.from_json_file(pretrained_model_path + '/config.json') 29 | config = config.to_dict() 30 | config['n_layer'] = n_layers 31 | config = GPT2Config.from_dict(config) 32 | self.gpt2 = GPT2Model(config) 33 | else: 34 | self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path) 35 | 36 | self.dropout = torch.nn.Dropout(dropout) 37 | self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim) 38 | if freeze_pretrained_part: 39 | for param in self.gpt2.parameters(): 40 | param.requires_grad = False 41 | 42 | def forward(self, x): 43 | output = self.gpt2(**x) 44 | output = output[0] # last hidden state (batch_size, sequence_length, hidden_size) 45 | # average over temporal dimension 46 | output = output.mean(dim=1) 47 | output = self.dropout(output) 48 | return self.fc(output) 49 | 50 | def configure_optimizers(self): 51 | optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate) 52 | scheduler = ReduceLROnPlateau(optimizer, patience=self.hparams.es_patience // 2) 53 | return [optimizer], [scheduler] 54 | -------------------------------------------------------------------------------- /flow_parsing/static/ip_proto_map.csv: -------------------------------------------------------------------------------- 1 | 1,ICMP 2 | 2,IGMP 3 | 3,GGP 4 | 4,IP-in-IP 5 | 5,ST 6 | 6,TCP 7 | 7,CBT 8 | 8,EGP 9 | 9,IGP 10 | 10,BBN-RCC-MON 11 | 11,NVP-II 12 | 12,PUP 13 | 13,ARGUS 14 | 14,EMCON 15 | 15,XNET 16 | 16,CHAOS 17 | 17,UDP 18 | 18,MUX 19 | 19,DCN-MEAS 20 | 20,HMP 21 | 21,PRM 22 | 22,XNS-IDP 23 | 23,TRUNK-1 24 | 24,TRUNK-2 25 | 25,LEAF-1 26 | 26,LEAF-2 27 | 27,RDP 28 | 28,IRTP 29 | 29,ISO-TP4 30 | 30,NETBLT 31 | 31,MFE-NSP 32 | 32,MERIT-INP 33 | 33,DCCP 34 | 34,3PC 35 | 35,IDPR 36 | 36,XTP 37 | 37,DDP 38 | 38,IDPR-CMTP 39 | 39,TP++ 40 | 40,IL 41 | 41,IPv6 42 | 42,SDRP 43 | 43,IPv6-Route 44 | 44,IPv6-Frag 45 | 45,IDRP 46 | 46,RSVP 47 | 
47,GREs 48 | 48,DSR 49 | 49,BNA 50 | 50,ESP 51 | 51,AH 52 | 52,I-NLSP 53 | 53,SwIPe 54 | 54,NARP 55 | 55,MOBILE 56 | 56,TLSP 57 | 57,SKIP 58 | 58,IPv6-ICMP 59 | 59,IPv6-NoNxt 60 | 60,IPv6-Opts 61 | 61,Any host internal proto 62 | 62,CFTP 63 | 63,Any local network 64 | 64,SAT-EXPAK 65 | 65,KRYPTOLAN 66 | 66,RVD 67 | 67,IPPC 68 | 68,Any distributed file system 69 | 69,SAT-MON 70 | 70,VISA 71 | 71,IPCU 72 | 72,CPNX 73 | 73,CPHB 74 | 74,WSN 75 | 75,PVP 76 | 76,BR-SAT-MON 77 | 77,SUN-ND 78 | 78,WB-MON 79 | 79,WB-EXPAK 80 | 80,ISO-IP 81 | 81,VMTP 82 | 82,SECURE-VMTP 83 | 83,VINES 84 | 84,TTP |IPTM 85 | 85,NSFNET-IGP 86 | 86,DGP 87 | 87,TCF 88 | 88,EIGRP 89 | 89,OSPF 90 | 90,Sprite-RPC 91 | 91,LARP 92 | 92,MTP 93 | 93,AX.25 94 | 94,OS 95 | 95,MICP 96 | 96,SCC-SP 97 | 97,ETHERIP 98 | 98,ENCAP 99 | 99,Any private encryption scheme 100 | 100,GMTP 101 | 101,IFMP 102 | 102,PNNI 103 | 103,PIM 104 | 104,ARIS 105 | 105,SCPS 106 | 106,QNX 107 | 107,A/N 108 | 108,IPComp 109 | 109,SNP 110 | 110,Compaq-Peer 111 | 111,IPX-in-IP 112 | 112,VRRP 113 | 113,PGM 114 | 114,Any 0-hop protocol 115 | 115,L2TP 116 | 116,DDX 117 | 117,IATP 118 | 118,STP 119 | 119,SRP 120 | 120,UTI 121 | 121,SMP 122 | 122,SM 123 | 123,PTP 124 | 124,IS-IS over IPv4 125 | 125,FIRE 126 | 126,CRTP 127 | 127,CRUDP 128 | 128,SSCOPMCE 129 | 129,IPLT 130 | 130,SPS 131 | 131,PIPE 132 | 132,SCTP 133 | 133,FC 134 | 134,RSVP-E2E-IGNORE 135 | 135,Mobility Header 136 | 136,UDPLite 137 | 137,MPLS-in-IP 138 | 138,manet 139 | 139,HIP 140 | 140,Shim6 141 | 141,WESP 142 | 142,ROHC 143 | 143,Ethernet -------------------------------------------------------------------------------- /sklearn_classifiers/config.yaml.example: -------------------------------------------------------------------------------- 1 | # control the algorithms' use putting/removing comment mark # 2 | 3 | LogRegr: 4 | type: LogisticRegression 5 | params: 6 | solver: saga 7 | max_iter: 500 8 | n_jobs: -1 9 | multi_class: multinomial 10 | param_search_space: 11 | C: [0.1, 1, 10] 12 | tol: [0.00001,0.0001,0.001], 13 | 14 | LogRegrCost: 15 | type: LogisticRegression 16 | params: 17 | solver: saga 18 | max_iter: 500 19 | n_jobs: 4 20 | class_weight: balanced 21 | multi_class: multinomial 22 | param_search_space: 23 | C: [0.1, 1, 10] 24 | 25 | LogRegrOVR: 26 | type: LogisticRegression 27 | params: 28 | solver: saga 29 | max_iter: 500 30 | n_jobs: -1 31 | multi_class: ovr 32 | param_search_space: 33 | C: [0.1, 1, 10] 34 | 35 | SVM_OvO: 36 | type: OneVsOneClassifier 37 | norandom: true 38 | params: 39 | estimator: 40 | type: LinearSVC 41 | params: 42 | tol: 1.0e-5 43 | n_jobs: -1 44 | param_search_space: 45 | estimator__C: [0.1, 1, 10] 46 | estimator__loss: ['squared_hinge'] 47 | 48 | DecTree: 49 | type: DecisionTreeClassifier 50 | param_search_space: 51 | max_depth: 52 | from: 6 53 | till: 20 54 | step: 3 55 | max_features: 56 | from: 10 57 | till: 40 58 | step: 10 59 | criterion: 60 | - entropy 61 | RandomForest: 62 | type: RandomForestClassifier 63 | params: 64 | n_estimators: 10 65 | n_jobs: -1 66 | param_search_space: 67 | n_estimators: 68 | from: 10 69 | till: 100 70 | step: 30 71 | 72 | GradBoost: 73 | type: GradientBoostingClassifier 74 | param_search_space: 75 | n_estimators: 76 | - 50 77 | max_depth: [2,3,4,5] 78 | learning_rate: 79 | - 0.01 80 | - 0.05 81 | - 0.1 82 | 83 | MLP: 84 | type: MLPClassifier 85 | params: 86 | max_iter: 300 87 | param_search_space: 88 | hidden_layer_sizes: 89 | - [80, 80] 90 | - [120, 120] 91 | alpha: 92 | - 0.0001 93 | - 0.001 94 | - 0.01 95 | 
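Each entry's `type` string in the config above refers to a class registered in `sklearn_classifiers/registered_classes.py`. The sketch below is a hypothetical illustration of how a `type`/`params` pair can be turned into an estimator, including the nested-estimator case; the repo's actual wiring lives in `sklearn_classifiers.clf_utils.initialize_classifiers` and may differ in details.

```python
# Hypothetical sketch of instantiating a config entry via REGISTERED_CLASSES;
# the real logic is in sklearn_classifiers.clf_utils.initialize_classifiers.
from sklearn_classifiers.registered_classes import REGISTERED_CLASSES


def build_estimator(entry: dict):
    params = dict(entry.get('params', {}))
    # nested estimators (e.g. OneVsOneClassifier wrapping LinearSVC) are configs themselves
    for key, value in params.items():
        if isinstance(value, dict) and 'type' in value:
            params[key] = build_estimator(value)
    return REGISTERED_CLASSES[entry['type']](**params)


svm_entry = {
    'type': 'OneVsOneClassifier',
    'params': {'estimator': {'type': 'LinearSVC', 'params': {'tol': 1.0e-5}}, 'n_jobs': -1},
}
clf = build_estimator(svm_entry)  # OneVsOneClassifier(estimator=LinearSVC(tol=1e-05), n_jobs=-1)
```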
-------------------------------------------------------------------------------- /tests/test_markov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gpt_model.generator.baseline import markov 4 | 5 | 6 | def test_norm(): 7 | x = np.array([np.inf, 0, 0, 1]).reshape(2, -1) 8 | n_x = markov._normalize_by_rows(x) 9 | assert (n_x == [[1, 0], [0, 1]]).all() 10 | 11 | x = np.array([[10, 0, ], [4, 16]]) 12 | 13 | n_x = markov._normalize_by_rows(x) 14 | exp_x = np.array([[1., 0.], [0.2, 0.8]]) 15 | assert np.isclose(exp_x, n_x, rtol=1e-3).all() 16 | 17 | 18 | def test_calc_transition_matrix(quantized_packets): 19 | trans_matrix = markov._calc_transition_matrix( 20 | seq_matrix=quantized_packets, 21 | state_numb=np.unique(quantized_packets).size 22 | ) 23 | # 0 is the reccurent state 24 | assert np.isclose(trans_matrix[0, 0], 1, atol=1e-6) 25 | 26 | 27 | def test_priors(quantized_packets): 28 | priors = markov._calc_prior_probas(quantized_packets, 29 | np.unique(quantized_packets).size) 30 | 31 | assert np.isclose(priors[10], 0.7541, rtol=1e-3) 32 | 33 | 34 | def test_markov_generator(quantized_packets): 35 | gener = markov.MarkovGenerator() 36 | gener.fit(quantized_packets*-1) 37 | sampled = gener.sample(1000) 38 | new_gener = markov.MarkovGenerator() 39 | new_gener.fit(sampled) 40 | assert np.isclose(gener.init_priors, new_gener.init_priors, atol=0.1).all() 41 | # accumulated error < 1. for 114x114 matrix is OK 42 | tr_matrix_frob_norm = np.linalg.norm(gener.transition_matrix - new_gener.transition_matrix, ord='fro') 43 | assert tr_matrix_frob_norm < 1. 44 | 45 | 46 | def test_markov_kmeans_augmenter(raw_dataset): 47 | def _calc_hist_like_pmf(packet_vector): 48 | pmf = np.histogram(packet_vector, bins=50, range=(0, 1000), density=True)[0] 49 | return pmf 50 | 51 | raw_packets = raw_dataset.filter(regex='raw_packet').fillna(0) 52 | gener = markov.MarkovQuantizedGenerator() 53 | gener.fit(raw_packets.values) 54 | output = gener.sample(raw_packets.shape[0]) 55 | priors_distrs_norm = np.linalg.norm( 56 | _calc_hist_like_pmf(output[:, 0]) - 57 | _calc_hist_like_pmf(raw_packets.iloc[:, 0]), 58 | ord=1) 59 | assert priors_distrs_norm < 0.015 60 | 61 | total_distr_norm = np.linalg.norm( 62 | _calc_hist_like_pmf(output.flatten()) - 63 | _calc_hist_like_pmf(raw_packets.values.flatten()), 64 | ord=1) 65 | 66 | assert total_distr_norm < 0.01 67 | -------------------------------------------------------------------------------- /tests/test_pcap_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd 4 | 5 | from flow_parsing import pcap_parser 6 | 7 | 8 | def test_feature_persistence(pcap_example_path): 9 | features = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, online_mode=False). \ 10 | sort_values('flow_id', axis=0). \ 11 | reset_index(drop=True) 12 | features2 = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, online_mode=False). \ 13 | sort_values('flow_id', axis=0). \ 14 | reset_index(drop=True) 15 | pd.testing.assert_frame_equal(features, features2) 16 | 17 | 18 | def _serialize_tcp_flag(x): 19 | indexer = x.index.str.endswith('tcp_flags') 20 | x.iloc[indexer] = x.iloc[indexer].apply(json.dumps) 21 | return x 22 | 23 | 24 | def test_parser_output(dataset, pcap_example_path): 25 | parsed_features = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, online_mode=False). \ 26 | sort_values('flow_id', axis=0). 
\ 27 | reset_index(drop=True) 28 | 29 | parsed_features = parsed_features.apply(_serialize_tcp_flag, axis=1) 30 | dataset = dataset.astype(parsed_features.dtypes). \ 31 | sort_values('flow_id', axis=0). \ 32 | reset_index(drop=True) 33 | pd.testing.assert_frame_equal(parsed_features, dataset, 34 | check_less_precise=2, 35 | check_like=True, 36 | check_categorical=False) 37 | 38 | 39 | def test_raw_parser_output(raw_dataset_with_targets, pcap_example_path): 40 | parsed_features = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, 41 | derivative_features=False, 42 | raw_features=20, 43 | online_mode=False) 44 | parsed_features = parsed_features. \ 45 | sort_values('flow_id', axis=0). \ 46 | reset_index(drop=True). \ 47 | filter(regex='raw') 48 | 49 | raw_dataset = raw_dataset_with_targets. \ 50 | sort_values('flow_id', axis=0). \ 51 | reset_index(drop=True). \ 52 | filter(regex='raw') 53 | raw_dataset = raw_dataset.astype(parsed_features.dtypes) 54 | pd.testing.assert_frame_equal(parsed_features, raw_dataset, 55 | check_less_precise=2, 56 | check_like=True, 57 | check_categorical=False) 58 | -------------------------------------------------------------------------------- /fs_net/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from nn_classifiers.models import BaseClassifier 4 | 5 | 6 | class FSNETClassifier(BaseClassifier): 7 | def __init__(self, 8 | config, 9 | class_labels, 10 | n_tokens, 11 | embedding_dim=128, 12 | hidden_size=128, 13 | n_layers=2, 14 | dropout=0.3): 15 | super().__init__(config, class_labels) 16 | 17 | self.embeddings = torch.nn.Embedding(num_embeddings=n_tokens, embedding_dim=embedding_dim) 18 | self.dropout = torch.nn.Dropout(p=dropout) 19 | self.activation = torch.nn.SELU() 20 | self.encoder = torch.nn.GRU( 21 | embedding_dim, 22 | hidden_size, 23 | num_layers=n_layers, 24 | batch_first=True, 25 | dropout=dropout, 26 | bidirectional=True 27 | ) 28 | self.encoder_hidden_dim = 2 * n_layers * hidden_size 29 | self.compound_dim = self.encoder_hidden_dim * 4 30 | self.decoder = torch.nn.GRU( 31 | self.encoder_hidden_dim, 32 | hidden_size, 33 | num_layers=n_layers, 34 | batch_first=True, 35 | dropout=dropout, 36 | bidirectional=True 37 | ) 38 | self.compressor = torch.nn.Sequential( 39 | torch.nn.Linear(self.compound_dim, 2 * hidden_size), 40 | self.activation, 41 | self.dropout, 42 | ) 43 | self.classifier = torch.nn.Linear(2 * hidden_size, self.output_dim) 44 | 45 | @staticmethod 46 | def _concat_hidden_states(hidden_states, batch_size): 47 | return hidden_states.permute([1, 0, 2]).reshape(batch_size, -1) # (batch_size, 2*n_layers*hidden_size) 48 | 49 | def forward(self, x): 50 | encoder_in = self.embeddings(x.squeeze_(1)) # (batch_size, embedding_dim) 51 | batch_size, seq_len = x.shape[0], x.shape[1] 52 | 53 | _, enc_states = self.encoder(encoder_in) 54 | # "concatenate the final hidden states of both forward and backward directions of all the layers" 55 | z_e = self._concat_hidden_states(enc_states, batch_size) 56 | 57 | # "the encoder-based feature vector ze is input into the decoder at each time step t", so we just repeat it 58 | decoder_in = z_e.unsqueeze(1).repeat(1, seq_len, 1) # (batch_size, seq_len, encoder_hidden_dim) 59 | _, dec_states = self.decoder(decoder_in) 60 | z_d = self._concat_hidden_states(dec_states, batch_size) 61 | # compound feature vector 62 | z = torch.cat([z_e, z_d, z_e * z_d, torch.abs(z_e - z_d)], dim=1) 63 | z_c = self.compressor(z) 64 | return self.classifier(z_c) 65 
| -------------------------------------------------------------------------------- /tests/static/quantizer_checkpoint/ids_to_tokens.json: -------------------------------------------------------------------------------- 1 | {"9902": "[EOF]", "9903": "[PAD]", "9904": "[UNK]", "9905": "[BOF]", "9906": "Free90", "9907": "GoogleMaps", "9908": "RDP", "9909": "WhatsAppFiles", "9910": "Zabbix", "9911": "Starcraft", "9912": "DoH_DoT", "9913": "Zoom", "9914": "DNP3", "9915": "ICMP", "9916": "DHCP", "9917": "WindowsUpdate", "9918": "Playstation", "9919": "GTP", "9920": "Facebook", "9921": "IoT_trigger", "9922": "MS_OneDrive", "9923": "LDAP", "9924": "SoundCloud", "9925": "Spotify", "9926": "Oracle", "9927": "HTTP_Proxy", "9928": "POPS", "9929": "SMBv1", "9930": "CiscoSkinny", "9931": "Github", "9932": "Redis", "9933": "NTP", "9934": "Unknown", "9935": "Syslog", "9936": "WireGuard", "9937": "Messenger", "9938": "SAP", "9939": "Xbox", "9940": "SOCKS", "9941": "NestLogSink", "9942": "CiscoVPN", "9943": "MDNS", "9944": "Ayiya", "9945": "GRE", "9946": "CHECKMK", "9947": "QUIC", "9948": "PostgreSQL", "9949": "Citrix", "9950": "IoT_hub", "9951": "Corba", "9952": "STUN", "9953": "GoogleDocs", "9954": "Targus Dataspeed", "9955": "NetFlix", "9956": "CNN", "9957": "SkypeCall", "9958": "GoogleServices", "9959": "IMAPS", "9960": "WeChat", "9961": "Cloudflare", "9962": "Microsoft365", "9963": "SCTP", "9964": "DCE_RPC", "9965": "BitTorrent", "9966": "LLMNR", "9967": "FTP_CONTROL", "9968": "RTSP", "9969": "VRRP", "9970": "H323", "9971": "TeamViewer", "9972": "Steam", "9973": "PlayStore", "9974": "OSPF", "9975": "YouTube", "9976": "IMO", "9977": "SMTP", "9978": "DRDA", "9979": "SMTPS", "9980": "MsSQL-TDS", "9981": "TLS", "9982": "Memcached", "9983": "EGP", "9984": "IoT_camera", "9985": "Instagram", "9986": "Teredo", "9987": "HTTP", "9988": "RemoteScan", "9989": "AJP", "9990": "Skype", "9991": "Amazon", "9992": "SOMEIP", "9993": "SMBv23", "9994": "Modbus", "9995": "WebSocket", "9996": "TFTP", "9997": "Usenet", "9998": "RTP", "9999": "eDonkey", "10000": "NFS", "10001": "Viber", "10002": "Dropbox", "10003": "SSDP", "10004": "Telegram", "10005": "LinkedIn", "10006": "DHCPV6", "10007": "IoT_healthcare", "10008": "IRC", "10009": "WhatsAppCall", "10010": "IPsec", "10011": "IoT_environment", "10012": "AFP", "10013": "OpenVPN", "10014": "WhatsApp", "10015": "BJNP", "10016": "NetBIOS", "10017": "Telnet", "10018": "Whois-DAS", "10019": "Mining", "10020": "PPTP", "10021": "IAX", "10022": "SIP", "10023": "Nats", "10024": "VNC", "10025": "UbuntuONE", "10026": "Google", "10027": "SSH", "10028": "s7comm", "10029": "Kerberos", "10030": "Twitter", "10031": "Radius", "10032": "DNS", "10033": "IGMP", "10034": "ICMPV6", "10035": "GooglePlus", "10036": "MQTT", "10037": "RTMP", "10038": "BGP", "10039": "Wikipedia", "10040": "Microsoft", "10041": "LotusNotes", "10042": "Yahoo", "10043": "UPnP", "10044": "IEC60870", "10045": "SNMP", "10046": "Git", "10047": "POP3", "10048": "Slack", "10049": "MySQL", "10050": "GMail", "10051": "IoT_electronics", "10052": "Diameter", "10053": "IMAP", "10054": "RX", "10055": "AMQP", "10056": "CAPWAP"} -------------------------------------------------------------------------------- /gpt_model/generator/trainer.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch.utils.data import DataLoader, Dataset 4 | from torch.utils.data.distributed import DistributedSampler 5 | from transformers import Trainer 6 | from transformers.trainer 
import SequentialDistributedSampler 7 | 8 | 9 | class SeqTrainer(Trainer): 10 | def get_train_dataloader(self) -> DataLoader: 11 | """ 12 | Returns the training :class:`~torch.utils.data.DataLoader`. 13 | """ 14 | if self.train_dataset is None: 15 | raise ValueError("Trainer: training requires a train_dataset.") 16 | else: 17 | train_sampler = ( 18 | None 19 | if self.args.local_rank == -1 20 | else DistributedSampler(self.train_dataset) 21 | ) 22 | 23 | data_loader = DataLoader( 24 | self.train_dataset, 25 | batch_size=self.args.train_batch_size, 26 | sampler=train_sampler, 27 | collate_fn=self.data_collator, 28 | drop_last=self.args.dataloader_drop_last, 29 | ) 30 | 31 | return data_loader 32 | 33 | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: 34 | """ 35 | Returns the evaluation :class:`~torch.utils.data.DataLoader`. 36 | 37 | Args: 38 | eval_dataset (:obj:`Dataset`, `optional`): 39 | If provided, will override `self.eval_dataset`. 40 | """ 41 | if eval_dataset is None and self.eval_dataset is None: 42 | raise ValueError("Trainer: evaluation requires an eval_dataset.") 43 | 44 | eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset 45 | if self.args.local_rank != -1: 46 | sampler = SequentialDistributedSampler(eval_dataset) 47 | else: 48 | sampler = None 49 | 50 | data_loader = DataLoader( 51 | eval_dataset, 52 | sampler=sampler, 53 | batch_size=self.args.eval_batch_size, 54 | collate_fn=self.data_collator, 55 | drop_last=self.args.dataloader_drop_last, 56 | ) 57 | 58 | return data_loader 59 | 60 | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: 61 | """ 62 | Returns the test :class:`~torch.utils.data.DataLoader`. 63 | 64 | Args: 65 | test_dataset (obj:`Dataset`): The test dataset to use. 66 | """ 67 | # We use the same batch_size as for eval. 
68 | if self.args.local_rank != -1: 69 | sampler = SequentialDistributedSampler(test_dataset) 70 | else: 71 | sampler = None 72 | 73 | data_loader = DataLoader( 74 | test_dataset, 75 | sampler=sampler, 76 | batch_size=self.args.eval_batch_size, 77 | collate_fn=self.data_collator, 78 | drop_last=self.args.dataloader_drop_last, 79 | ) 80 | 81 | return data_loader 82 | -------------------------------------------------------------------------------- /gpt_model/classifier/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from functools import partial 4 | from typing import Dict, List, Tuple 5 | 6 | import logging 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | from sklearn.preprocessing import LabelEncoder 11 | from torch.utils.data import Dataset 12 | 13 | from gpt_model.tokenizer import PacketTokenizer 14 | from settings import TARGET_CLASS_COLUMN 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class ClassificationQuantizedDataset(Dataset): 20 | def __init__( 21 | self, tokenizer: PacketTokenizer, 22 | dataset_path: str, 23 | label_encoder: LabelEncoder = None, 24 | target_column=TARGET_CLASS_COLUMN 25 | ): 26 | assert os.path.isfile(dataset_path) 27 | 28 | dataset_path = pathlib.Path(dataset_path) 29 | self.source_file = dataset_path 30 | logger.info("initializing dataset from %s", dataset_path) 31 | 32 | self.tokenizer = tokenizer 33 | 34 | raw_flows = pd.read_csv(self.source_file, 35 | usecols=self.tokenizer.packet_quantizer.raw_columns + [target_column]) 36 | 37 | if label_encoder is None: 38 | self.target_encoder = LabelEncoder().fit(raw_flows[target_column].values) 39 | else: 40 | self.target_encoder = label_encoder 41 | 42 | self.targets = self.target_encoder.transform(raw_flows[target_column].values) 43 | self.raw_flows = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].values 44 | logger.info('initialized dataset') 45 | 46 | def __len__(self): 47 | return len(self.raw_flows) 48 | 49 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 50 | enc_flow = self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 51 | add_special_tokens=True, 52 | return_attention_mask=True).data 53 | 54 | enc_flow.update({'target': torch.as_tensor(self.targets[i], dtype=torch.long)}) 55 | return enc_flow 56 | 57 | @classmethod 58 | def get_collator(cls, mask_first_token): 59 | return partial(classification_quantized_collator, mask_first_token=mask_first_token) 60 | 61 | 62 | def classification_quantized_collator(examples: List[Dict[str, torch.Tensor]], mask_first_token=True) -> \ 63 | Tuple[Dict[str, torch.Tensor], torch.Tensor]: 64 | """ Data collator used for traffic classification """ 65 | 66 | length_of_first = examples[0]['input_ids'].size(0) 67 | are_tensors_same_length = all(x['input_ids'].size(0) == length_of_first for x in examples) 68 | assert are_tensors_same_length 69 | 70 | input_ids = torch.cat([item['input_ids'] for item in examples], dim=0) 71 | attention_masks = torch.cat([item['attention_mask'] for item in examples], dim=0) 72 | if mask_first_token: 73 | attention_masks[:, 0] = 0 74 | targets = torch.cat([item['target'].view(1) for item in examples]) 75 | return {"input_ids": input_ids, "attention_mask": attention_masks}, targets 76 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 
2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from flow_parsing import features 8 | import settings 9 | from gpt_model.tokenizer import PacketTokenizer 10 | 11 | 12 | @pytest.fixture 13 | def dataset(): 14 | return pd.read_csv(settings.TEST_STATIC_DIR / 'example_20packets.csv', na_filter=False) 15 | 16 | 17 | @pytest.fixture 18 | def raw_dataset_folder(): 19 | return settings.TEST_STATIC_DIR / 'raw_csv' 20 | 21 | 22 | @pytest.fixture 23 | def raw_dataset_file(raw_dataset_folder): 24 | return raw_dataset_folder / 'example_raw_20packets.csv' 25 | 26 | 27 | @pytest.fixture 28 | def raw_dataset(raw_dataset_folder): 29 | return pd.read_csv(raw_dataset_folder / 'example_raw_20packets.csv', na_filter=False).\ 30 | filter(regex='raw').\ 31 | astype(np.float64) 32 | 33 | 34 | @pytest.fixture 35 | def raw_dataset_with_targets(raw_dataset_folder): 36 | df = pd.read_csv(raw_dataset_folder / 'example_raw_20packets.csv', na_filter=False) 37 | df.filter(regex='raw').astype(np.float64, copy=False) 38 | return df 39 | 40 | 41 | @pytest.fixture 42 | def classif_config(): 43 | return {'SVM': {'type': 'OneVsOneClassifier', 44 | 'params': {'estimator': {'type': 'LinearSVC', 45 | 'params': {'tol': 1e-05}}, 'n_jobs': -1}, 46 | 'param_search_space': {'estimator__C': [0.1, 1, 10], 'estimator__loss': ['squared_hinge'], 47 | 'estimator__dual': [True, False]}}, 48 | 'DecTree': {'type': 'DecisionTreeClassifier', 49 | 'param_search_space': {'max_depth': [6, 9, 12, 15, 18], 'max_features': [10, 20, 30, 40], 50 | 'criterion': ['entropy']}}, 51 | 'GradBoost': {'type': 'GradientBoostingClassifier', 52 | 'param_search_space': {'n_estimators': [50], 'max_depth': [2, 3, 4, 5], 53 | 'learning_rate': [0.01, 0.05, 0.1]}}} 54 | 55 | 56 | @pytest.fixture 57 | def raw_matrix(): 58 | size = 10 59 | raw_feature_matrix = np.zeros((size, 7)) 60 | raw_feature_matrix[:, features.RMI.TIMESTAMP] = np.array(range(12312, size + 12312)) 61 | raw_feature_matrix[:, features.RMI.IP_LEN] = np.array([13, 54, 345, 43, 44, 990, 1000, 23, 555, 1400]) 62 | raw_feature_matrix[:, features.RMI.IS_CLIENT] = np.array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0]) 63 | return raw_feature_matrix 64 | 65 | 66 | @pytest.fixture 67 | def quantized_packets(): 68 | with open(settings.TEST_STATIC_DIR / 'quantized_pkts.json', 'r') as js: 69 | pkts = json.load(js) 70 | return np.array(pkts).reshape(-1, 20) 71 | 72 | 73 | @pytest.fixture 74 | def quantizer_checkpoint(): 75 | return settings.TEST_STATIC_DIR / 'quantizer_checkpoint' 76 | 77 | 78 | @pytest.fixture 79 | def pcap_example_path(): 80 | return (settings.BASE_DIR / 'flow_parsing/static/example.pcap').as_posix() 81 | 82 | 83 | @pytest.fixture() 84 | def tokenizer(quantizer_checkpoint): 85 | return PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 86 | -------------------------------------------------------------------------------- /fs_net/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from typing import Tuple 4 | 5 | import numpy as np 6 | import torch 7 | from sklearn.preprocessing import LabelEncoder 8 | from torch.utils.data import Dataset 9 | import logging 10 | import pandas as pd 11 | from flow_parsing.features import generate_raw_feature_names 12 | from gpt_model.classifier.dataset import ClassificationQuantizedDataset 13 | from settings import TARGET_CLASS_COLUMN, DEFAULT_PACKET_LIMIT_PER_FLOW 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class 
SimpleClassificationQuantizedDataset(ClassificationQuantizedDataset): 20 | """ no attention mask and no dict-like output """ 21 | def __getitem__(self, i: int) -> Tuple[torch.Tensor, torch.Tensor]: 22 | enc_flow = self.tokenizer.batch_encode_packets(self.raw_flows.reshape(1, -1).astype(np.float64), 23 | add_special_tokens=False, 24 | return_attention_mask=False).data 25 | X = enc_flow['input_ids'] 26 | y = torch.as_tensor(self.targets[i], dtype=torch.long) 27 | return X, y 28 | 29 | 30 | class ClassificationPacketSizeDataset(Dataset): 31 | """ 32 | the sequences are expected to be passed through embedding layer first, thus they are encoded to be positive and 33 | the modified PS itself will serve as an index 34 | 35 | max_size_range sets max dynamic range for PS parameter and implicitly sets Embedding layer dim 36 | """ 37 | def __init__( 38 | self, 39 | dataset_path: str, 40 | max_size_range=5000, 41 | label_encoder: LabelEncoder = None, 42 | target_column=TARGET_CLASS_COLUMN, 43 | flow_size=DEFAULT_PACKET_LIMIT_PER_FLOW 44 | ): 45 | assert os.path.isfile(dataset_path) 46 | 47 | dataset_path = pathlib.Path(dataset_path) 48 | self.source_file = dataset_path 49 | logger.info("initializing dataset from %s", dataset_path) 50 | 51 | self.packet_columns = generate_raw_feature_names(flow_size, base_features=('packet',)) 52 | raw_flows = pd.read_csv(self.source_file, 53 | usecols=self.packet_columns + [target_column]) 54 | 55 | if label_encoder is None: 56 | self.target_encoder = LabelEncoder().fit(raw_flows[target_column].values) 57 | else: 58 | self.target_encoder = label_encoder 59 | 60 | self.targets = self.target_encoder.transform(raw_flows[target_column].values) 61 | raw_flows = raw_flows.loc[:, self.packet_columns] 62 | raw_flows = raw_flows.fillna(0) 63 | # truncate values outside the range 64 | offset = max_size_range // 2 65 | raw_flows[raw_flows <= -offset] = -offset + 1 66 | raw_flows[raw_flows >= offset] = offset - 1 67 | self.raw_flows = raw_flows + offset 68 | logger.info('initialized dataset') 69 | 70 | def __len__(self): 71 | return len(self.raw_flows) 72 | 73 | def __getitem__(self, i: int) -> Tuple[torch.Tensor, torch.Tensor]: 74 | X = torch.as_tensor(self.raw_flows.values[i], dtype=torch.long) 75 | y = torch.as_tensor(self.targets[i], dtype=torch.long) 76 | return X, y 77 | -------------------------------------------------------------------------------- /evaluation_utils/classification.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report 8 | 9 | from settings import REPORT_DIR 10 | 11 | 12 | class Reporter: 13 | def __init__(self, true, predicted, 14 | classifier_name: str, 15 | target_classes: Optional[list] = None, 16 | report_dir=REPORT_DIR): 17 | self.true = true 18 | self.predicted = predicted 19 | self.target_classes = target_classes if len(target_classes) > 0 else list(range(max(true) + 1)) 20 | self.classifier_name = classifier_name 21 | self.save_dir = pathlib.Path(report_dir) 22 | self.save_dir.mkdir(exist_ok=True) 23 | 24 | def scores(self): 25 | return { 26 | 'Accuracy': accuracy_score(self.true, self.predicted), 27 | 'F1 macro': f1_score(self.true, self.predicted, average='macro'), 28 | 'F1 weighted': f1_score(self.true, self.predicted, average='weighted') 29 | } 30 | 31 | def clf_report(self, 
as_dict=False, save_to=None): 32 | def to_df(report): 33 | return pd.DataFrame(report).T 34 | 35 | report = classification_report(self.true, self.predicted, 36 | target_names=self.target_classes, 37 | digits=3, 38 | output_dict=True) 39 | 40 | if save_to is not None: 41 | to_df(report).to_csv(self.save_dir / save_to, index=True) 42 | 43 | if as_dict: 44 | return report 45 | return to_df(report) 46 | 47 | def conf_matrix(self, normalize=None): 48 | return pd.DataFrame(confusion_matrix(self.true, self.predicted, normalize=normalize), 49 | columns=self.target_classes, 50 | index=self.target_classes) 51 | 52 | def plot_conf_matrix(self, normalize=None) -> plt.figure: 53 | 54 | cm = self.conf_matrix(normalize).values 55 | classes = self.target_classes 56 | fig_size = int(len(classes) * 0.7) 57 | fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(fig_size, fig_size)) 58 | 59 | im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) 60 | ax.set_title('CM of {} classifier'.format(self.classifier_name)) 61 | fig.colorbar(im, aspect=30, shrink=0.8, ax=ax) 62 | 63 | tick_marks = np.arange(len(classes)) 64 | ax.set_xticks(tick_marks) 65 | ax.set_xticklabels(list(classes)) 66 | plt.setp(ax.get_xticklabels(), rotation=45) 67 | ax.set_yticks(tick_marks) 68 | ax.set_yticklabels(list(classes)) 69 | 70 | fmt = '.2f' if normalize else 'd' 71 | thresh = cm.max() / 2. 72 | for i in range(cm.shape[0]): 73 | for j in range(cm.shape[1]): 74 | ax.text(j, i, format(cm[i, j], fmt), 75 | horizontalalignment="center", 76 | color="white" if cm[i, j] > thresh else "black") 77 | 78 | ax.set_ylabel('True label') 79 | ax.set_xlabel('Predicted label') 80 | # fig.tight_layout() 81 | plt.show() 82 | return fig 83 | -------------------------------------------------------------------------------- /gpt_model/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Transformer-based network traffic generator and classifier 3 | 4 | ### Introduction 5 | 6 | Given currents trends in web-protocol development (e.g. eSNI, DNS-over-*), 7 | plain text information in traffic sessions is disappearing. In order 8 | to classify the flows, one of few options is to use statistical discriminators 9 | based on packet size (PS) and inter-packet time (IPT) features. 10 | Moreover, exactly the same features are usually produced by traffic 11 | flow generators. 12 | 13 | That gives an idea to develop a common neural network framework for 14 | creating statistical generators and classifiers. A reasonable choice 15 | can be Transformer architecture that showed SOTA on numerous NLP benchmarks. 16 | Since we need a generative model, GPT-2 seems to be a good option to start 17 | with, luckily, `huggingface` did all the dirty stuff implementing it. 18 | 19 | In order to use the models, the initial packet feature space (PS + IPT) 20 | has to be quantized into discrete sequences. I used K-Means for this 21 | purpose and given the expected dataset size (millions of flows), 22 | the libKMCUDA's implementation was adopted to transform prior scaled 23 | packet features into integer sequences of cluster numbers (see 24 | `quantizer.py`). 25 | 26 | Generative pretraining is a viable option to get a powerful classifier without 27 | having much target data. We can pretrain the model in the following ways: 28 | 1. Using unlabeled data. Allows to further use the model as a feature 29 | extractor for various classifiers (e.g linear, K-nn, uSVM) or to be completely 30 | fine-tuned on a classification task. 31 | 2. 
Using labeled data. The model is trained with first sequence tokens 32 | denoting traffic class that afterwards allows to sample class-specific 33 | packet clusters. Moreover, the same benefits as above are preserved. 34 | 35 | ### Pre-trained models and datasets: 36 | 37 | It is necessary to download a MinIO client to your computer as per: 38 | https://docs.min.io/docs/minio-client-quickstart-guide.html 39 | 40 | To get the data, execute the following commands: 41 | ``` 42 | ./mc alias set ext-anon http://195.201.38.68:9000 43 | ./mc ls ext-anon/traffic-classifier 44 | ./mc cp ext-anon/traffic-classifier . 45 | ``` 46 | 47 | where the first command will prompt you for user credentials: 48 | ``` 49 | Access Key: gpt_research 50 | Secret Key: mbmug8VDbRu5hqJ 51 | ``` 52 | 53 | 54 | *Note: opening the URL in a browser leads to the administrator 55 | console. To access the datasets and models you have to install MinIO client 56 | as mentioned above.* 57 | 58 | 59 | ### Publications 60 | 61 | More details can be found in the following papers (please, cite the first one): 62 | ``` 63 | @article{Bikmukhamedov2021MultiClassNT, 64 | title={Multi-Class Network Traffic Generators and Classifiers Based on Neural Networks}, 65 | author={R. Bikmukhamedov and A. Nadeev}, 66 | journal={2021 Systems of Signals Generating and Processing in the Field of on Board Communications}, 67 | year={2021}, 68 | pages={1-7}, 69 | url = {https://doi.org/10.1109/IEEECONF51389.2021.9416067} 70 | } 71 | 72 | @article{bikmukhamedov2020, 73 | author = {Bikmukhamedov, R. F. and Nadeev, A.F.}, 74 | title = {Generative transformer framework for network traffic generation and classification}, 75 | journal = {T-Comm}, 76 | year = {2020}, 77 | number = {11}, 78 | vol = {14}, 79 | pages = {64--71}, 80 | url = {http://media-publisher.ru/wp-content/uploads/Nom-11-2020-s.pdf} 81 | } 82 | ``` 83 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/preprocess_target_pcaps.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import pathlib 3 | 4 | import sh 5 | 6 | 7 | """ provided here for the sake of reproducibility of own research """ 8 | 9 | 10 | class Device(typing.NamedTuple): 11 | mac: str 12 | name: str 13 | category: str 14 | 15 | 16 | IOT_DEVICES = [ 17 | Device('d0:52:a8:00:67:5e', 'Smart Things', 'hub'), 18 | Device('44:65:0d:56:cc:d3', 'Amazon Echo', 'hub'), 19 | 20 | Device('70:ee:50:18:34:43', 'Netatmo Welcome', 'camera'), 21 | Device('f4:f2:6d:93:51:f1', 'TP-Link Day Night Cloud camera', 'camera'), 22 | Device('00:16:6c:ab:6b:88', 'Samsung SmartCam', 'camera'), 23 | Device('30:8c:fb:2f:e4:b2', 'Dropcam', 'camera'), 24 | Device('00:62:6e:51:27:2e', 'Insteon (wired)', 'camera'), 25 | Device('e8:ab:fa:19:de:4f', 'Insteon (wireless)', 'camera'), 26 | Device('00:24:e4:11:18:a8', 'Withings Smart Baby Monitor', 'camera'), 27 | 28 | Device('ec:1a:59:79:f4:89', 'Belkin Wemo', 'trigger'), 29 | Device('ec:1a:59:83:28:11', 'Belkin Wemo Motion sensor', 'trigger'), 30 | Device('50:c7:bf:00:56:39', 'TP-Link Smart Plug', 'trigger'), 31 | Device('74:c6:3b:29:d7:1d', 'iHome', 'trigger'), 32 | 33 | Device('18:b4:30:25:be:e4', 'NEST Protect smoke alarm', 'environment'), 34 | Device('70:ee:50:03:b8:ac', 'Netatmo weather station', 'environment'), 35 | 36 | Device('00:24:e4:1b:6f:96', 'Withings Smart scale', 'healthcare'), 37 | Device('00:24:e4:20:28:c6', 'Withings Aura smart sleep sensor', 'healthcare'), 38 | 
Device('74:6a:89:00:2e:25', 'Blipcare Blood Pressure meter', 'healthcare'), 39 | 40 | Device('d0:73:d5:01:83:08', 'LiFX Smart Bulb', 'light_bulb'), 41 | 42 | Device('18:b7:9e:02:20:44', 'Triby Speaker', 'electronics'), 43 | Device('e0:76:d0:33:bb:85', 'PIX-STAR photo-frame', 'electronics'), 44 | Device('70:5a:0f:e4:9b:c0', 'HP Printer', 'electronics'), 45 | ] 46 | 47 | 48 | TCPDUMP_BASE_FILTER = 'not arp and not icmp and not icmp6 and not broadcast and not multicast and not net 127.0.0.0/8' 49 | 50 | 51 | def _merge_pcaps(pcaps_to_merge: list, to_file): 52 | exec = sh.Command('mergecap') 53 | exec('-w', to_file, '-Fpcap', *pcaps_to_merge) 54 | 55 | 56 | def _split_by_devices(source_pcap): 57 | exec = sh.Command('/usr/sbin/tcpdump') 58 | target_dir = source_pcap.parent / 'separated_iot_devices' 59 | target_dir.mkdir(exist_ok=True) 60 | for device in IOT_DEVICES: 61 | target_file = target_dir / f'{device.category}_{device.name.lower().replace(" ", "_")}.pcap' 62 | filter_str = f"ether host {device.mac} and not (dst net 192.168.1.0/24 and src net 192.168.1.0/24) " \ 63 | f"and {TCPDUMP_BASE_FILTER}" 64 | exec(['-r', source_pcap, filter_str, '-w', target_file]) 65 | 66 | 67 | def _filter_non_iot_dump(source_pcap): 68 | target_file = source_pcap.parent / 'non_iot.pcap' 69 | filter_str = f"not (dst net 192.168.88.0/24 and src net 192.168.88.0/24) and {TCPDUMP_BASE_FILTER}" 70 | exec = sh.Command('/usr/sbin/tcpdump') 71 | exec(['-r', source_pcap, filter_str, '-w', target_file]) 72 | 73 | 74 | def main(): 75 | dump_root_dir = pathlib.Path('/media/raid_store/pretrained_traffic') 76 | merged_pcap = dump_root_dir / 'total.pcap' 77 | # merge all .pcap files from https://iotanalytics.unsw.edu.au/iottraces 78 | pcaps = pathlib.Path(dump_root_dir / 'iot_downloads').glob('*.pcap') 79 | _merge_pcaps(pcaps, merged_pcap) 80 | _split_by_devices(merged_pcap) 81 | _filter_non_iot_dump(dump_root_dir / 'home.pcap') 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network traffic classifier based on statistical properties of application flows 2 | 3 | UPDATE 18/03/2019: Refactored in OOP-style, more flexibility and features! 4 | 5 | UPDATE 23/05/2020: Replaced custom flow-parsing mechanism with NFStream 6 | 7 | UPDATE 17/09/2020: Added pytorch classifiers, including transformer-based one 8 | 9 | UPDATE 30/10/2020: ANN classifiers (NGT, LSH), FS-NET baseline 10 | ## Key features 11 | 12 | * Configurable feature extraction from network flows via `NFStream`. 13 | 14 | * Possibility to test arbitrary sklearn algorithms (e.g. SVM, Random Forest, 15 | etc.) and configure their parameter search space via `.yaml` configs. 16 | 17 | * Basic examples of pytorch classifiers and new generative transformer 18 | framework that can be used for building traffic generators and 19 | classifiers. 20 | 21 | * Option for experiment tracking with Neptune. 22 | 23 | ## Project structure 24 | 25 | * `flow_parsing` contains scripts for parsing flow features and labels 26 | from `.pcap` into `.csv` via `NFStream`. It can be 27 | used for exporting raw per-flow packet-features (e.g. packet/payload 28 | sizes, timestamps, various packet-fields) in a numpy array, as well as 29 | derivative statistics, such as feature percentiles, etc. 30 | 31 | * `evaluation_utils` contains utilities for evaluation of traffic 32 | classifiers and generators. 
33 | 34 | * `fs_net` is a reimplementation of FS-NET classifier 35 | 36 | * `sklearn_classifiers` contains wrapper for sklearn-like classifiers 37 | and example pipeline script. Used models and their parameters are specified 38 | via the `.yaml` configuration file. Check and modify `utils.py:REGISTERED_CLASSES` 39 | to support the needed models. 40 | 41 | * `nn_classifers` includes base class for pytorch-lightning classifier and 42 | some basic derivatives. 43 | 44 | * `gpt_model` has all the code required for building your own 45 | transformer-based traffic generator and classifier, along with a link to 46 | model checkpoints. See the package for more info. 47 | 48 | ## Usage example for sklearn-based classifiers 49 | 50 | 1. A feature file has to be prepared before running model training, so 51 | make sure to create a `.csv` dataset by running, for example: 52 | 53 | ```PYTHONPATH=. python flow_parsing/pcap_parser.py -p flow_parsing/static/example.pcap --online_mode``` 54 | 55 | 2. OPTIONAL. Postprocess parsed `.csv` as needed, e.g. split into train-test, 56 | reassign target columns. 57 | 58 | 3. Create own version of `config.yaml` to experiment with and 59 | test classifiers: 60 | 61 | ``` 62 | PYTHONPATH=. python sklearn_classifiers/run_training.py 63 | --train_dataset csv_files/example_20packets.csv 64 | --target_column ndpi_category 65 | --continuous 66 | ``` 67 | 68 | ## Publications 69 | 70 | If you find the code or datasets useful for your research, please, 71 | cite one of the following papers: 72 | 73 | ``` 74 | @article{Bikmukhamedov2021MultiClassNT, 75 | title={Multi-Class Network Traffic Generators and Classifiers Based on Neural Networks}, 76 | author={Bikmukhamedov, Radion and Nadeev, Adel}, 77 | journal={2021 Systems of Signals Generating and Processing in the Field of on Board Communications}, 78 | year={2021}, 79 | pages={1-7}, 80 | url = {https://doi.org/10.1109/IEEECONF51389.2021.9416067} 81 | } 82 | 83 | @CONFERENCE{bikmukhamedov2019, 84 | author = {Bikmukhamedov, R. F. and Nadeev, A. 
F.}, 85 | title = {Lightweight Machine Learning Classifiers of IoT Traffic Flows}, 86 | booktitle = {2019 Systems of Signal Synchronization, Generating and Processing in Telecommunications}, 87 | year = {2019}, 88 | } 89 | ``` -------------------------------------------------------------------------------- /tests/test_distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.preprocessing import LabelEncoder, normalize 7 | from transformers import set_seed, GPT2Model, GPT2Config 8 | 9 | from settings import RANDOM_SEED 10 | from sklearn_classifiers.knn_cosine import ( 11 | cos_dist, 12 | top_k_cosine_similar, 13 | batch_voter, 14 | KNeighborsCosineClassifier, 15 | KNeighborsPuffinnClassifier, 16 | KNeighborsNGTClassifier 17 | ) 18 | 19 | 20 | @pytest.fixture() 21 | def keys(): 22 | return np.array([[1, 0, 0], [0.9, -0.1, 0], [1, 0, 0], [0, 1, 1]]) 23 | 24 | 25 | def test_cos_dist(keys): 26 | query = np.array([0, 0, 1]) 27 | sim = cos_dist(query, keys) 28 | assert np.isclose(sim, np.array([1., 1., 1., 0.29289322])).all() 29 | 30 | 31 | @pytest.mark.parametrize( 32 | 'query,idx,top_k', 33 | [ 34 | (np.array([0, 0, 1]), [[3]], 1), 35 | (np.array([1, 0, 0]), [[0, 2]], 2), 36 | (np.array([1, -0.1, 0]), [[1, 0]], 2), 37 | (np.array([[1, -0.1, 0], [1, 0, 0]]), [[1, 0], [0, 2]], 2) 38 | ] 39 | ) 40 | def test_cos_top_k(query, idx, top_k, keys): 41 | top = top_k_cosine_similar(query, keys, top_k) 42 | assert top.tolist() == idx 43 | 44 | 45 | def test_target_assignment(keys): 46 | targets = np.array([2, 2, 0, 1]) 47 | top_2_for_3_queries = np.array([[1, 0], [2, 0], [1, 2]]) 48 | votes = batch_voter(targets[top_2_for_3_queries]) 49 | assert votes.tolist() == [2, 0, 2] 50 | 51 | 52 | def test_knn_cos(keys): 53 | targets = np.array([2, 0, 2, 1]) 54 | for classifier_class in [KNeighborsCosineClassifier, KNeighborsPuffinnClassifier, KNeighborsNGTClassifier]: 55 | clf = classifier_class(2) 56 | clf.fit(keys, targets) 57 | X_test = np.array([[0.9, 0, 0]]) 58 | pred = clf.predict(X_test) 59 | assert pred.tolist() == [2] 60 | 61 | 62 | @pytest.fixture() 63 | def dummy_gpt2(): 64 | set_seed(RANDOM_SEED) 65 | 66 | config = { 67 | "vocab_size": 9906, 68 | "n_positions": 128, 69 | "n_ctx": 128, 70 | "n_embd": 512, 71 | "n_layer": 6, 72 | "n_head": 8, 73 | } 74 | config = GPT2Config(**config) 75 | model = GPT2Model(config) 76 | return model 77 | 78 | 79 | def test_ann_deviation(raw_dataset_with_targets, raw_dataset, tokenizer, dummy_gpt2): 80 | 81 | y = LabelEncoder().fit_transform(raw_dataset_with_targets['ndpi_app']) 82 | encoded = tokenizer.batch_encode_packets(raw_dataset) 83 | with torch.no_grad(): 84 | features = dummy_gpt2(**encoded)[0] 85 | 86 | X = features.mean(dim=1).numpy() 87 | X = normalize(X) 88 | 89 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True, random_state=1) 90 | ref_preds = KNeighborsCosineClassifier(n_neighbors=1).fit(X_train, y_train).predict(X_test) 91 | # ref_preds = KNeighborsClassifier(n_neighbors=1, algorithm='brute').fit(X_train, y_train).predict(X_test) 92 | 93 | accuracy = accuracy_score(y_test, ref_preds) 94 | 95 | pfn_preds = KNeighborsPuffinnClassifier(n_neighbors=1).fit(X_train, y_train).predict(X_test) 96 | 97 | pfn_acc = accuracy_score(y_test, pfn_preds) 98 | assert accuracy == pfn_acc 99 | 100 | assert 
accuracy_score(ref_preds, pfn_preds) == 1.0 101 | 102 | ngt_preds = KNeighborsNGTClassifier(n_neighbors=1, 103 | search_epsilon=0.2, 104 | optimize_n_edges=False, 105 | optimize_search_params=False 106 | ).fit(X_train, y_train).predict(X_test) 107 | 108 | assert accuracy_score(ref_preds, ngt_preds) == 1.0 109 | assert accuracy_score(ngt_preds, pfn_preds) == 1.0 110 | -------------------------------------------------------------------------------- /sklearn_classifiers/clf_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import typing 5 | from time import time 6 | 7 | import yaml 8 | from sklearn import metrics 9 | from sklearn.metrics import make_scorer 10 | from sklearn.model_selection import GridSearchCV, StratifiedKFold 11 | 12 | import settings 13 | from .registered_classes import REGISTERED_CLASSES 14 | 15 | logger = logging.getLogger(__file__) 16 | 17 | 18 | class ClassifierHolder: 19 | """ simple dataclass """ 20 | def __init__(self, classifier, param_search_space, shortcut_name=None): 21 | self.classifier = classifier 22 | self.name = type(classifier).__name__ if not shortcut_name else shortcut_name 23 | self.param_search_space = param_search_space 24 | 25 | def __repr__(self): 26 | repr_str = repr(self.classifier) 27 | if self.param_search_space: 28 | repr_str += f'\n\tsearch_space: {self.param_search_space}' 29 | return repr_str 30 | 31 | 32 | def _read_config_file(config_path) -> dict: 33 | """ simple wrapper around yaml.load """ 34 | with open(config_path) as f: 35 | settings = yaml.load(f, Loader=yaml.SafeLoader) 36 | return settings 37 | 38 | 39 | def _process_settings(settings: dict) -> None: 40 | """ In-place settings transform for ranges""" 41 | for key, params in settings.items(): 42 | if 'param_search_space' in params: 43 | ssp = params.get('param_search_space') 44 | for pname, pvalue in ssp.items(): 45 | if isinstance(pvalue, dict) and 'from' in pvalue: 46 | step = pvalue.get('step', 1) 47 | ssp[pname] = list(range(pvalue['from'], pvalue['till']+1, step)) 48 | 49 | 50 | def read_classifier_settings(config_path=None): 51 | if config_path is None: 52 | config_path = settings.BASE_DIR / 'sklearn_classifiers/config.yaml' 53 | config = _read_config_file(config_path) 54 | _process_settings(config) 55 | return config 56 | 57 | 58 | def initialize_classifiers(config: dict, 59 | random_seed: int = settings.RANDOM_SEED, 60 | classes: typing.Dict[str, type] = REGISTERED_CLASSES) -> typing.Dict[str, ClassifierHolder]: 61 | 62 | result = {} 63 | for key, params in config.items(): 64 | kwargs = params.get('params', {}) 65 | 66 | logger.info(f'Instantiating {params["type"]} with params {kwargs}') 67 | if 'estimator' in kwargs: # this works only on one level deeper. 
No recursion 68 | sub_kwargs = {'random_state': random_seed} 69 | kwargs['estimator'] = classes[kwargs['estimator']['type']](**sub_kwargs) 70 | else: 71 | kwargs['random_state'] = random_seed 72 | 73 | if params['type'].startswith('KNeighbors'): 74 | kwargs.pop('random_state') 75 | classifier = classes[params['type']](**kwargs) 76 | holder = ClassifierHolder(classifier, params.get('param_search_space', {}), shortcut_name=key) 77 | result[key] = holder 78 | return result 79 | 80 | 81 | def fit_optimal_classifier(classifier: ClassifierHolder, X_train, y_train, n_folds=2): 82 | """ searches through pre-defined parameter space from the .yaml, and fits classifier with found parameters """ 83 | logger.info('Searching parameters for {} through {}'.format(classifier.name, classifier.param_search_space)) 84 | search = GridSearchCV(classifier.classifier, 85 | param_grid=classifier.param_search_space, 86 | n_jobs=-1, 87 | scoring=make_scorer(metrics.f1_score, average='macro'), 88 | cv=StratifiedKFold(n_folds, shuffle=True, random_state=settings.RANDOM_SEED), 89 | refit=True, 90 | verbose=1) 91 | 92 | start = time() 93 | search.fit(X_train, y_train) 94 | logger.info('Search took {:.2f} seconds'.format(time() - start)) 95 | logger.info('Best parameters are {} with score {:.4f}'.format(search.best_params_, search.best_score_)) 96 | classifier.classifier = search.best_estimator_ 97 | return classifier 98 | -------------------------------------------------------------------------------- /tests/static/quantized_pkts.json: -------------------------------------------------------------------------------- 1 | [10, 77, 26, 70, 34, 17, 17, 17, 18, 26, 14, 28, 26, 95, 31, 34, 32, 26, 0, 0, 10, 77, 26, 99, 34, 17, 17, 24, 26, 14, 28, 26, 96, 98, 34, 49, 26, 26, 0, 0, 10, 77, 26, 99, 34, 17, 17, 17, 18, 26, 26, 26, 14, 28, 26, 48, 29, 34, 19, 81, 10, 12, 33, 104, 52, 17, 47, 33, 56, 33, 46, 16, 33, 62, 85, 33, 59, 79, 39, 73, 10, 12, 33, 37, 52, 69, 11, 33, 56, 33, 46, 16, 33, 62, 85, 33, 105, 39, 73, 33, 10, 12, 33, 104, 52, 1, 1, 58, 56, 33, 33, 33, 46, 16, 33, 62, 85, 33, 103, 79, 10, 12, 33, 37, 52, 17, 17, 11, 56, 33, 33, 46, 16, 33, 62, 85, 33, 94, 79, 33, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 91, 52, 1, 50, 56, 33, 33, 46, 54, 16, 78, 33, 100, 85, 33, 38, 88, 10, 12, 33, 91, 52, 1, 1, 58, 56, 33, 33, 46, 16, 33, 100, 85, 33, 89, 1, 112, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 52, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 52, 26, 70, 34, 1, 1, 1, 23, 26, 26, 14, 28, 26, 3, 31, 34, 1, 9, 26, 10, 12, 33, 110, 52, 1, 50, 33, 1, 15, 33, 33, 33, 46, 68, 33, 62, 85, 33, 67, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 14, 28, 26, 106, 36, 34, 2, 75, 1, 92, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 77, 26, 70, 26, 34, 17, 17, 17, 18, 26, 26, 26, 34, 26, 0, 0, 0, 0, 0, 10, 77, 26, 99, 34, 17, 17, 17, 18, 26, 14, 28, 26, 13, 29, 34, 19, 81, 25, 17, 10, 12, 33, 91, 33, 52, 17, 17, 53, 17, 26, 26, 26, 26, 43, 17, 26, 26, 52, 26, 10, 12, 33, 37, 52, 17, 17, 53, 17, 78, 33, 78, 33, 56, 33, 33, 46, 16, 33, 62, 10, 12, 33, 84, 52, 1, 1, 1, 63, 33, 33, 33, 33, 46, 64, 33, 21, 52, 1, 1, 10, 77, 26, 70, 34, 17, 17, 17, 18, 26, 26, 26, 14, 28, 26, 108, 31, 34, 113, 26, 10, 77, 26, 99, 34, 17, 17, 17, 18, 26, 26, 14, 28, 26, 13, 98, 34, 49, 26, 26, 73, 33, 52, 52, 83, 52, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 10, 12, 33, 104, 52, 1, 50, 56, 33, 33, 33, 46, 16, 33, 100, 85, 33, 45, 45, 93, 10, 12, 33, 91, 52, 17, 17, 53, 17, 33, 33, 78, 33, 43, 17, 78, 33, 46, 16, 33, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 14, 28, 26, 102, 36, 34, 1, 1, 75, 1, 10, 12, 33, 110, 52, 1, 1, 58, 1, 15, 33, 33, 46, 68, 33, 62, 85, 33, 30, 52, 10, 77, 26, 70, 34, 17, 17, 17, 18, 26, 26, 14, 28, 26, 101, 31, 34, 8, 26, 74, 10, 12, 33, 5, 52, 71, 58, 33, 33, 20, 33, 33, 46, 68, 33, 100, 85, 33, 22, 22, 10, 12, 33, 5, 52, 1, 50, 33, 33, 20, 33, 33, 46, 68, 33, 100, 6, 76, 80, 40, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 52, 26, 70, 34, 1, 1, 61, 26, 26, 14, 28, 26, 101, 31, 34, 1, 35, 26, 0, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 26, 26, 14, 28, 26, 102, 98, 34, 49, 26, 10, 52, 26, 99, 34, 2, 1, 23, 26, 26, 14, 28, 26, 87, 29, 34, 4, 66, 26, 26, 10, 12, 33, 37, 52, 1, 1, 58, 33, 33, 33, 56, 33, 33, 46, 16, 33, 62, 85, 33, 10, 12, 33, 37, 52, 69, 11, 56, 33, 33, 46, 16, 33, 62, 85, 33, 42, 39, 73, 33, 10, 12, 33, 91, 52, 1, 1, 58, 56, 33, 33, 46, 16, 33, 62, 85, 33, 111, 51, 39, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 104, 52, 1, 1, 58, 56, 33, 33, 33, 33, 46, 16, 33, 100, 85, 33, 7, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 104, 52, 1, 1, 58, 56, 33, 33, 46, 16, 33, 100, 85, 33, 86, 57, 72, 10, 12, 33, 91, 52, 65, 33, 33, 56, 33, 33, 46, 16, 33, 62, 85, 33, 97, 51, 39, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 14, 41, 28, 33, 26, 87, 98, 34, 49, 26, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 26, 26, 14, 28, 26, 107, 29, 34, 1, 92, 10, 12, 33, 91, 52, 1, 1, 58, 33, 33, 33, 54, 1, 78, 33, 46, 16, 33, 62, 85, 10, 12, 33, 91, 52, 71, 58, 33, 33, 33, 54, 1, 78, 33, 46, 16, 33, 62, 73, 33, 10, 12, 33, 84, 52, 1, 1, 1, 63, 33, 33, 33, 46, 64, 33, 21, 1, 1, 1, 1, 10, 12, 33, 84, 52, 1, 1, 27, 33, 33, 33, 46, 64, 33, 109, 52, 1, 1, 71, 33, 10, 52, 26, 70, 34, 1, 1, 1, 23, 26, 14, 41, 28, 33, 26, 82, 31, 34, 8, 26, 73, 33, 52, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 91, 52, 1, 1, 58, 54, 1, 33, 33, 78, 33, 46, 16, 33, 100, 85, 33, 73, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 91, 52, 65, 56, 33, 33, 33, 46, 16, 33, 100, 6, 76, 90, 33, 60, 44, 10, 52, 26, 70, 26, 34, 1, 1, 1, 23, 26, 26, 26, 34, 26, 0, 0, 0, 0, 0] -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy as np 4 | from torch.utils.data import DataLoader 5 | 6 | from gpt_model.generator.dataset import PretrainIterDataset, PretrainCollator, PretrainDataset, PretrainDatasetWithClasses 7 | from gpt_model.quantizer import PacketScaler, init_sklearn_kmeans_from_checkpoint, PacketQuantizer 8 | from gpt_model.tokenizer import PacketTokenizer 9 | 10 | np.random.seed(1) 11 | 12 | 13 | def test_packet_scaler(): 14 | n_packets = 1000 15 | pack_lens = np.random.uniform(-1500, 1500, n_packets) 16 | iats = np.random.gamma(0, scale=1e4, size=n_packets) 17 | indices = np.random.choice(np.arange(iats.size), replace=False, size=int(iats.size * 0.2)) 18 | iats[indices] = 0. 
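    # the synthetic flow pairs signed packet sizes with gamma-distributed inter-arrival
    # times; ~20% of the IATs are zeroed above, presumably to mimic back-to-back packets.
    # The assertions below verify that PacketScaler round-trips these pairs, i.e. that
    # inverse_transform(transform(packets)) recovers the original values.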
19 | 20 | packets = np.stack([pack_lens, iats], axis=1) 21 | transformer = PacketScaler() 22 | transf_packets = transformer.transform(packets.copy()) 23 | reverted_packets = transformer.inverse_transform(transf_packets) 24 | assert np.isclose(packets, reverted_packets, atol=10e-9).all() 25 | 26 | 27 | def test_loading_quantizer(quantizer_checkpoint): 28 | q = init_sklearn_kmeans_from_checkpoint(quantizer_checkpoint) 29 | cluster = q.predict(np.array([[-1, 0]])) 30 | assert cluster[0] == 8 31 | 32 | 33 | def test_saving_tokenizer(quantizer_checkpoint): 34 | q = PacketTokenizer.from_pretrained(quantizer_checkpoint) 35 | q.save_pretrained('/tmp/') 36 | assert pathlib.Path('/tmp/clusters.json').is_file() 37 | assert pathlib.Path('/tmp/ids_to_tokens.json').is_file() 38 | 39 | 40 | def _estimate_normalized_packet_difference(raw_packets, reverted_packets): 41 | norm_diff = (reverted_packets - raw_packets) / reverted_packets 42 | norm_diff[np.isnan(norm_diff) | np.isinf(norm_diff)] = 0 43 | return norm_diff.mean() 44 | 45 | 46 | def test_quantizer_transform(quantizer_checkpoint, raw_dataset): 47 | 48 | q = PacketQuantizer.from_checkpoint(quantizer_checkpoint, flow_size=20) 49 | # assert proper column ordering with packet features 50 | raw_packets = raw_dataset[q.raw_columns].values 51 | quantized = q.transform(raw_packets) 52 | assert quantized.shape == (raw_dataset.shape[0], 20) 53 | assert np.isnan(raw_packets).sum() == (quantized == -1).sum() * 2 54 | 55 | # test invariance 56 | assert np.isclose(quantized, q.transform(raw_packets)).all() 57 | 58 | # test inverting 59 | reverted_packets = q.inverse_transform(quantized) 60 | assert reverted_packets.shape == raw_packets.shape 61 | assert np.isnan(reverted_packets).sum() == np.isnan(raw_packets).sum() 62 | 63 | assert _estimate_normalized_packet_difference(raw_packets, reverted_packets) < 0.0003 64 | 65 | 66 | def test_tokenize_detokenize(quantizer_checkpoint, raw_dataset): 67 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint) 68 | encoded = tokenizer.batch_encode_packets(raw_dataset) 69 | tokens = encoded['input_ids'] 70 | # since the model limit 128 > 20 in raw_features, we do not expect truncating 71 | decoded = tokenizer.batch_decode_packets(tokens) 72 | assert _estimate_normalized_packet_difference(raw_dataset.values, decoded) < 0.0003 73 | 74 | 75 | def test_flow_loader(raw_dataset_folder, quantizer_checkpoint): 76 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 77 | ds = PretrainIterDataset(tokenizer, folder_path=raw_dataset_folder) 78 | loader = DataLoader(ds, batch_size=4, collate_fn=PretrainCollator(tokenizer), drop_last=True) 79 | for flow in loader: 80 | assert flow['input_ids'].shape == (4, 22) 81 | 82 | 83 | def test_flowlight_loader(raw_dataset_folder, quantizer_checkpoint): 84 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 85 | ds = PretrainDataset(tokenizer, folder_path=raw_dataset_folder) 86 | loader = DataLoader(ds, batch_size=4, collate_fn=PretrainCollator(tokenizer), drop_last=True) 87 | for flow in loader: 88 | assert flow['input_ids'].shape == (4, 22) 89 | 90 | 91 | def test_dataset_with_classes(raw_dataset_folder, quantizer_checkpoint): 92 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 93 | ds = PretrainDatasetWithClasses(tokenizer, folder_path=raw_dataset_folder) 94 | loader = DataLoader(ds, batch_size=4, collate_fn=PretrainCollator(tokenizer), drop_last=True) 95 | for flow in loader: 96 | assert 
flow['input_ids'].shape == (4, 22) 97 | # 9905 is the last non-flow-label token ID 98 | assert (flow['input_ids'][:, 0] > 9905).all().tolist() 99 | -------------------------------------------------------------------------------- /flow_parsing/features.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from typing import Tuple, Union, Optional 4 | 5 | import numpy as np 6 | from nfstream.flow import NFlow 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | FEATURE_FUNCTIONS = { 12 | '0': lambda feature_slice: _safe_vector_getter(feature_slice, 0), 13 | '1': lambda feature_slice: _safe_vector_getter(feature_slice, 1), 14 | '_max': np.max, 15 | '_min': np.min, 16 | '_avg': np.mean, 17 | '_median': np.median, 18 | '_25q': lambda feature_slice: np.percentile(feature_slice, 25), 19 | '_75q': lambda feature_slice: np.percentile(feature_slice, 75), 20 | '_sum': np.sum, 21 | # counting non-empty bulks (packets with payload) 22 | '_number': lambda feature_slice: feature_slice[feature_slice > 0].shape[0] 23 | } 24 | 25 | # These are not complete subsets of handcrafted features 26 | CONTINUOUS_NAMES = tuple(base + feature for feature in FEATURE_FUNCTIONS.keys() for base in ['bulk', 'packet']) 27 | CONTINUOUS_NAMES += ('tcp_window_avg', ) 28 | 29 | CATEGORICAL_NAMES = ( 30 | 'found_tcp_flags', 31 | ) 32 | 33 | FEATURE_NAMES = CONTINUOUS_NAMES + CATEGORICAL_NAMES 34 | 35 | 36 | class FEATURE_PREFIX: 37 | client = 'client_' 38 | server = 'server_' 39 | 40 | 41 | @functools.lru_cache(maxsize=2) 42 | def create_empty_features(prefix: str, feature_list=FEATURE_NAMES) -> dict: 43 | return {prefix + feature: 0. for feature in feature_list} 44 | 45 | 46 | def _safe_vector_getter(vector, indexer) -> Union[int, float]: 47 | try: 48 | return vector[indexer] 49 | except IndexError: 50 | return np.nan 51 | 52 | 53 | def calc_parameter_stats(feature_slice, prefix, feature_name) -> dict: 54 | return {prefix + feature_name + feature: func(feature_slice) for feature, func in FEATURE_FUNCTIONS.items()} 55 | 56 | 57 | def inter_packet_times_from_timestamps(timestamps): 58 | if len(timestamps) == 0: 59 | return timestamps 60 | next_timestamps = np.roll(timestamps, 1) 61 | ipt = timestamps - next_timestamps 62 | ipt[0] = 0 63 | return ipt 64 | 65 | 66 | def generate_raw_feature_names(flow_size, base_features: Tuple[str] = ('packet', 'iat')) -> list: 67 | return [f'raw_{feature}{index}' 68 | for index in range(flow_size) 69 | for feature in base_features] 70 | 71 | 72 | def calc_raw_features(flow: NFlow) -> dict: 73 | """ selects PS and IPT features """ 74 | packet_limit = len(flow.splt_ps) 75 | features = dict.fromkeys(generate_raw_feature_names(packet_limit)) 76 | for index in range(packet_limit): 77 | ps = flow.splt_ps[index] 78 | ipt = flow.splt_piat_ms[index] 79 | 80 | if flow.splt_direction[index] == 1: 81 | ps = flow.splt_ps[index] * -1 82 | elif flow.splt_direction[index] == -1: 83 | ps = np.nan 84 | ipt = np.nan 85 | 86 | features['raw_packet' + str(index)] = ps 87 | features['raw_iat' + str(index)] = ipt 88 | 89 | return features 90 | 91 | 92 | def _calc_unidirectional_flow_features(flow: NFlow, direction_idxs, prefix='', features: Optional[list] = None) -> dict: 93 | # this asserts using of the listed features 94 | if features is None: 95 | features = create_empty_features(prefix) 96 | 97 | features.update(calc_parameter_stats(np.array(flow.splt_ps)[direction_idxs], prefix, 'packet')) 98 | 99 | features[prefix + 
'found_tcp_flags'] = sorted(set(flow.udps.tcp_flag[direction_idxs])) 100 | features[prefix + 'tcp_window_avg'] = np.mean(flow.udps.tcp_window[direction_idxs]) 101 | features.update(calc_parameter_stats(flow.udps.bulk[direction_idxs], prefix, 'bulk')) 102 | 103 | return features 104 | 105 | 106 | def calc_stat_features(flow: NFlow) -> dict: 107 | """ estimates derivative discriminative features for flow classification from: 108 | packet size, payload size, TCP window, TCP flag 109 | """ 110 | direction = np.array(flow.splt_direction) 111 | client_idxs = direction == 0 112 | server_idxs = direction == 1 113 | 114 | if client_idxs.sum() > 0: 115 | client_features = _calc_unidirectional_flow_features(flow, client_idxs, prefix=FEATURE_PREFIX.client) 116 | else: 117 | client_features = create_empty_features(prefix=FEATURE_PREFIX.client) 118 | 119 | if server_idxs.sum() > 0: 120 | server_features = _calc_unidirectional_flow_features(flow, server_idxs, prefix=FEATURE_PREFIX.server) 121 | else: 122 | server_features = create_empty_features(prefix=FEATURE_PREFIX.server) 123 | 124 | total_features = dict(**client_features, **server_features) 125 | return total_features 126 | -------------------------------------------------------------------------------- /gpt_model/generator/baseline/markov.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from sklearn.cluster import KMeans 5 | from sklearn.preprocessing import normalize 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def _normalize_by_rows(x: np.array): 11 | safe_x = x.copy() 12 | safe_x[safe_x == np.inf] = 10e6 13 | return normalize(safe_x, axis=1, norm='l1') 14 | 15 | 16 | def _calc_transition_matrix(seq_matrix, state_numb): 17 | """ here the states are expected to be integers in [0, state_numb) """ 18 | # init with values close-to-zero for smoothing 19 | transition_matrix = np.ones((state_numb, state_numb)) * 1e-6 20 | for row_iter in range(seq_matrix.shape[0]): 21 | state_seq = seq_matrix[row_iter, :] 22 | # count number of each possible transition 23 | for t in range(len(state_seq) - 1): 24 | j = state_seq[t] 25 | k = state_seq[t + 1] 26 | transition_matrix[j, k] += 1 27 | 28 | norm_trans_matrix = _normalize_by_rows(transition_matrix) 29 | logger.info(f'estimated transition matrix for {norm_trans_matrix.shape[0]} states') 30 | return norm_trans_matrix 31 | 32 | 33 | def _calc_prior_probas(seq_matrix, state_numb): 34 | counts = np.zeros(state_numb) 35 | for state in range(state_numb): 36 | counts[state] = np.count_nonzero(seq_matrix[:, 0] == state) 37 | priors = counts / np.linalg.norm(counts, ord=1) 38 | logger.info('estimated vector of priors') 39 | return priors 40 | 41 | 42 | class BaseGenerator: 43 | def fit(self, X): 44 | raise NotImplementedError 45 | 46 | def sample(self, n_sequences): 47 | raise NotImplementedError 48 | 49 | 50 | class MarkovGenerator(BaseGenerator): 51 | def __init__(self): 52 | self.n_states = None 53 | self.transition_matrix = None 54 | self.init_priors = None 55 | self.index2value = {} 56 | self.value2index = {} 57 | self._seq_len = None 58 | self._states = None 59 | logger.info('init MarkovGenerator') 60 | 61 | def _map_values_to_indexes(self, X): 62 | orig_values = X.flatten() 63 | self.value2index = {value: index for index, value in enumerate(np.unique(orig_values))} 64 | self.index2value = {index: value for index, value in enumerate(np.unique(orig_values))} 65 | X_mapped = np.array([self.value2index[val] for val in 
orig_values]).reshape(-1, self._seq_len) 66 | return X_mapped 67 | 68 | def _map_indexes_to_values(self, X_mapped): 69 | mapped_values = X_mapped.flatten() 70 | X = np.array([self.index2value[val] for val in mapped_values]).reshape(-1, self._seq_len) 71 | return X 72 | 73 | def fit(self, X): 74 | self._seq_len = X.shape[1] 75 | n_states = np.unique(X).size 76 | self._states = np.arange(n_states) 77 | 78 | X_mapped = self._map_values_to_indexes(X) 79 | 80 | self.transition_matrix = _calc_transition_matrix(X_mapped, n_states) 81 | self.init_priors = _calc_prior_probas(X_mapped, n_states) 82 | return self 83 | 84 | def sample(self, n_sequences): 85 | assert n_sequences > 0 86 | logger.info(f'started generating {n_sequences} sequences') 87 | sampled_matrix = np.zeros((n_sequences, self._seq_len), dtype=int) 88 | for seq_index in range(n_sequences): 89 | sampled_matrix[seq_index, :] = self._sample_sequence() 90 | return self._map_indexes_to_values(sampled_matrix) 91 | 92 | def _sample_sequence(self): 93 | sampled = np.zeros(self._seq_len, dtype=int) 94 | sampled[0] = np.random.choice(self._states, p=self.init_priors) 95 | for index in range(1, self._seq_len): 96 | sampled[index] = np.random.choice(self._states, p=self.transition_matrix[sampled[index-1], :]) 97 | return sampled 98 | 99 | 100 | class MarkovQuantizedGenerator(BaseGenerator): 101 | def __init__(self, cluster_limit=200): 102 | self.cluster_limit = cluster_limit 103 | self.quantizer = None 104 | self.generator = MarkovGenerator() 105 | 106 | def _get_cluster_number(self, X): 107 | unique_points = np.unique(X).size 108 | cluster_number = self.cluster_limit if unique_points > self.cluster_limit else unique_points 109 | logger.info(f'selected {cluster_number} clusters for quantization') 110 | return cluster_number 111 | 112 | def fit(self, X): 113 | cluster_number = self._get_cluster_number(X) 114 | self.quantizer = KMeans(n_clusters=cluster_number) 115 | X_quantized = self.quantizer.fit_predict(X.flatten().reshape(-1, 1)).reshape(X.shape) 116 | logger.info('quantized input') 117 | self.generator.fit(X_quantized) 118 | 119 | def sample(self, n_sequences): 120 | X_gen = self.generator.sample(n_sequences) 121 | X_restored = self.quantizer.cluster_centers_[X_gen][:, :, 0] 122 | logger.info('dequantized output') 123 | return X_restored 124 | -------------------------------------------------------------------------------- /nn_classifiers/models.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import logging 4 | import torch 5 | from pytorch_lightning import LightningModule 6 | from torch.nn import functional as F 7 | from torch.optim.lr_scheduler import ReduceLROnPlateau 8 | from transformers.trainer_utils import set_seed 9 | 10 | from evaluation_utils.classification import Reporter 11 | from settings import RANDOM_SEED 12 | 13 | set_seed(RANDOM_SEED) 14 | logger = logging.getLogger(__file__) 15 | 16 | 17 | class BaseClassifier(LightningModule): 18 | def __init__(self, config, class_labels: Optional[List[str]], *args, **kwargs): 19 | super().__init__() 20 | self.hparams = config 21 | self.class_labels = class_labels 22 | self.output_dim = len(class_labels) 23 | 24 | def forward(self, x): 25 | return self.net(x) 26 | 27 | def training_step(self, batch, batch_idx): 28 | x, y = batch 29 | y_hat = self(x) 30 | loss = F.cross_entropy(y_hat, y) 31 | logs = {'train_loss': loss} 32 | return {'loss': loss, 'log': logs} 33 | 34 | def validation_step(self, batch, 
batch_idx): 35 | x, y = batch 36 | y_hat = self(x) 37 | return {'val_loss': F.cross_entropy(y_hat, y)} 38 | 39 | def validation_epoch_end(self, outputs): 40 | avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() 41 | logs = {'val_loss': avg_loss} 42 | return {'val_loss': avg_loss, 'log': logs} 43 | 44 | def test_step(self, batch, batch_idx): 45 | x, y = batch 46 | y_hat = self(x) 47 | predictions = y_hat.max(axis=1)[1] 48 | loss = F.cross_entropy(y_hat, y) 49 | logs = {'test_loss': loss} 50 | return {'test_loss': loss, 51 | 'predictions': predictions.to('cpu'), 52 | 'targets': y.to('cpu'), 53 | 'log': logs} 54 | 55 | def test_epoch_end(self, outputs): 56 | avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() 57 | predictions = torch.cat([x['predictions'] for x in outputs]).to('cpu').numpy() 58 | targets = torch.cat([x['targets'] for x in outputs]).to('cpu').numpy() 59 | rpt = Reporter(targets, predictions, self.__class__.__name__, target_classes=self.class_labels) 60 | self.logger.experiment.log_image('confusion_matrix', rpt.plot_conf_matrix()) 61 | 62 | report_file = f'report_{self.__class__.__name__}.csv' 63 | clf_report = rpt.clf_report(save_to=report_file) 64 | print(clf_report) 65 | self.logger.experiment.log_artifact((rpt.save_dir / report_file).as_posix()) 66 | 67 | logs = rpt.scores() 68 | logs.update({'test_loss': avg_loss}) 69 | return {'test_loss': avg_loss, 'log': logs} 70 | 71 | def configure_optimizers(self): 72 | optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) 73 | scheduler = ReduceLROnPlateau(optimizer, patience=self.hparams.es_patience // 2) 74 | return [optimizer], [scheduler] 75 | 76 | 77 | class DenseClassifier(BaseClassifier): 78 | def __init__(self, config, class_labels, input_size, hidden_size=40, activation=torch.nn.LeakyReLU, dropout=0.1): 79 | super().__init__(config, class_labels) 80 | 81 | self.net = torch.nn.Sequential(torch.nn.Linear(input_size, hidden_size), 82 | activation(), 83 | torch.nn.Dropout(dropout), 84 | torch.nn.Linear(hidden_size, hidden_size), 85 | activation(), 86 | torch.nn.Dropout(dropout), 87 | torch.nn.Linear(hidden_size, hidden_size), 88 | activation(), 89 | torch.nn.Dropout(dropout), 90 | torch.nn.Linear(hidden_size, self.output_dim)) 91 | 92 | 93 | class BiGRUClassifier(BaseClassifier): 94 | def __init__(self, config, class_labels, input_size, num_layers=3, hidden_size=None, dropout=0.1, bidirectional=True): 95 | super().__init__(config, class_labels) 96 | 97 | if not hidden_size: 98 | hidden_size = self.output_dim 99 | 100 | self.gru = torch.nn.GRU(input_size, 101 | hidden_size, 102 | num_layers=num_layers, 103 | batch_first=True, 104 | dropout=dropout, 105 | bidirectional=bidirectional) 106 | 107 | self.activation = torch.nn.LeakyReLU() 108 | gru_out_size = 2*hidden_size if bidirectional else hidden_size 109 | self.layer_norm = torch.nn.LayerNorm(gru_out_size) 110 | self.fc = torch.nn.Linear(gru_out_size, self.output_dim) 111 | 112 | def forward(self, x): 113 | gru_out, hidden_state = self.gru(x.unsqueeze_(2)) 114 | out = self.activation(gru_out.max(axis=1)[0]) 115 | out = self.layer_norm(out) 116 | return self.fc(out) 117 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/preprocess_pretraining_pcaps.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import nfstream 4 | import pandas as pd 5 | import sh 6 | 7 | import settings 8 | from flow_parsing import 
parse_pcap_to_csv 9 | 10 | 11 | def parse_flow_sizes(pcap_folder, target_folder): 12 | 13 | for pcap_file in pcap_folder.glob('*.pcap'): 14 | print(f'parsing {pcap_file}') 15 | dest_file = target_folder / (pcap_file.stem + '.csv') 16 | 17 | streamer = nfstream.NFStreamer( 18 | source=pcap_file.as_posix(), 19 | statistical_analysis=True, 20 | idle_timeout=settings.IDLE_TIMEOUT, 21 | active_timeout=settings.ACTIVE_TIMEOUT_ONLINE, 22 | accounting_mode=1, # IP size, 23 | ) 24 | print(f'saving to {dest_file}') 25 | streamer.to_csv(path=dest_file) 26 | 27 | 28 | def parse_raw_features_from_pcaps(pcap_folder, target_folder): 29 | for pcap_file in pcap_folder.glob('*.pcap'): 30 | target_csv = target_folder / (pcap_file.stem + '.csv') 31 | if target_csv.exists(): 32 | continue 33 | print(f'started parsing file {pcap_file}') 34 | 35 | # raw_features are set via analysis of packet number distribution within sessions 36 | # @ mawi.wide.ad.jp/mawi/ditl/ditl2020/ pcaps, such that the limit is close to .99 percentile 37 | 38 | parse_pcap_to_csv(pcap_file.as_posix(), 39 | target_csv.as_posix(), 40 | derivative_features=False, 41 | raw_features=128, 42 | provide_labels=True) 43 | 44 | 45 | def record_session_lengths(target_folder): 46 | dfs = [] 47 | for csv in target_folder.glob('*.csv'): 48 | df = pd.read_csv(csv, usecols=['bidirectional_packets']) 49 | dfs.append(df) 50 | 51 | dfs = pd.concat(dfs, axis=0) 52 | counts = dfs.bidirectional_packets.value_counts() 53 | norm_counts = counts.sort_index().cumsum() / dfs.shape[0] 54 | norm_counts.to_json(target_folder.parent / 'pkt_len_norm_counts.json') 55 | 56 | norm_counts_no_1packet_flows = (counts.sort_index().cumsum() - counts[1]) / (dfs.shape[0] - counts[1]) 57 | norm_counts_no_1packet_flows.to_json(target_folder.parent / 'pkt_len_norm_counts_no_1_packet.json') 58 | 59 | 60 | def rm_icmp_from_pcaps(source_pcap_folder, target_pcap_folder): 61 | for source_pcap in source_pcap_folder.glob('*.pcap'): 62 | target_pcap = target_pcap_folder / (source_pcap.stem + 'no_icmp.pcap') 63 | exec = sh.Command('/usr/sbin/tcpdump') 64 | exec(['-r', source_pcap, 'not icmp', '-w', target_pcap]) 65 | 66 | 67 | def split_pcaps_into_smaller(source_folder, dest_folder, size_limit=2000): 68 | for source_pcap in source_folder.glob('*.pcap'): 69 | target_pcaps = dest_folder / source_pcap.stem 70 | exec = sh.Command('/usr/sbin/tcpdump') 71 | exec(['-r', source_pcap, '-w', target_pcaps, '-C', size_limit]) 72 | 73 | 74 | def pcapng_to_pcap(pcap_folder): 75 | for source_pcap in pcap_folder.glob('*.pcapng'): 76 | target_pcap = pcap_folder / (source_pcap.stem + '.pcap') 77 | exec = sh.Command('tshark') 78 | exec(['-F', 'pcap', '-r', source_pcap, '-w', target_pcap]) 79 | 80 | 81 | def add_pcap_suffix(folder): 82 | for file in folder.glob('*'): 83 | file.replace(file.parent / (file.stem + '.pcap')) 84 | 85 | 86 | def uncompress_and_split_pcaps(source_folder, target_folder): 87 | """ 88 | bash script: 89 | 90 | for f in *.gz; do 91 | STEM_with_pcap=$(basename "${f}" .gz) 92 | STEM=$(basename "${STEM_with_pcap}" .pcap) 93 | # gunzip -c "${f}" > /media/raid_store/pretrained_traffic/mawi_pcaps/"${STEM}" 94 | gunzip -c "${f}" | tcpdump -w /media/raid_store/pretrained_traffic/mawi_pcaps/"${STEM}" -C 2000 -r - 95 | done 96 | 97 | :param folder: 98 | :return: 99 | """ 100 | gunzip = sh.Command('gunzip') 101 | tcpdump = sh.Command('tcpdump') 102 | target_folder = pathlib.Path(target_folder) 103 | for file in source_folder.glob('*.gz'): 104 | stem = file.stem.split('.pcap')[0] 105 | target = 
target_folder / stem 106 | # not tested :) see https://amoffat.github.io/sh/sections/piping.html#piping 107 | tcpdump(gunzip('-c', file), '-w', target, '-C', 2000, '-r', '-') 108 | 109 | 110 | if __name__ == '__main__': 111 | source_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/separated_iot_pcaps') 112 | 113 | # no_icmp_pcaps = pathlib.Path('/media/raid_store/pretrained_traffic/MAWI_no_icmp') 114 | # rm_icmp_from_pcaps(source_pcap_folder, no_icmp_pcaps) 115 | 116 | # clean_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/pcaps') 117 | # clean_pcap_folder.mkdir(exist_ok=True) 118 | # split_pcaps_into_smaller(clean_pcap_folder, split_pcap_folder, 2000) 119 | 120 | # split_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/ISCXVPN2016') 121 | # split_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/pcaps') 122 | # split_pcap_folder.mkdir(exist_ok=True) 123 | 124 | # add_pcap_suffix(source_pcap_folder) 125 | # parse_flow_sizes(split_pcap_folder, target_csv_folder_w_lengths) 126 | 127 | # target_csv_folder_w_lengths = pathlib.Path('/media/raid_store/pretrained_traffic/raw_csv_len') 128 | # target_csv_folder_w_lengths.mkdir(exist_ok=True) 129 | # record_session_lengths(target_csv_folder_w_lengths.parent) 130 | 131 | target_csv_folder = pathlib.Path('/media/raid_store/pretrained_traffic/raw_csv_iot_devices') 132 | target_csv_folder.mkdir(exist_ok=True) 133 | 134 | # pcapng_to_pcap(split_pcap_folder) 135 | 136 | parse_raw_features_from_pcaps(source_pcap_folder, target_csv_folder) 137 | -------------------------------------------------------------------------------- /gpt_model/classifier/train_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from pytorch_lightning import Trainer 6 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint 7 | from pytorch_lightning.callbacks import LearningRateLogger 8 | from pytorch_lightning.loggers import NeptuneLogger 9 | from torch.utils.data import DataLoader 10 | from torch.utils.data import random_split 11 | 12 | from gpt_model.classifier.model import GPT2Classifier 13 | from gpt_model.classifier.dataset import ClassificationQuantizedDataset 14 | from gpt_model.tokenizer import PacketTokenizer 15 | from settings import BASE_DIR, DEFAULT_PACKET_LIMIT_PER_FLOW, NEPTUNE_PROJECT 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | '--train_dataset', 22 | help='path to preprocessed .csv dataset', 23 | ) 24 | parser.add_argument( 25 | '--test_dataset', 26 | help='path to preprocessed .csv dataset', 27 | ) 28 | parser.add_argument( 29 | '--pretrained_path', 30 | ) 31 | parser.add_argument( 32 | '--freeze_pretrained_model', 33 | action='store_true', 34 | default=False, 35 | ) 36 | parser.add_argument( 37 | '--mask_first_token', 38 | action='store_true', 39 | default=False, 40 | ) 41 | parser.add_argument( 42 | '--batch_size', 43 | default=256, 44 | ) 45 | parser.add_argument( 46 | '--es_patience', 47 | default=5, 48 | type=int, 49 | ) 50 | parser.add_argument( 51 | '--learning_rate', 52 | default=None 53 | ) 54 | parser.add_argument( 55 | '--fc_dropout', 56 | default=0.0, 57 | ) 58 | parser.add_argument( 59 | '--reinitialize', 60 | action='store_true', 61 | default=False 62 | ) 63 | parser.add_argument( 64 | '--n_layers', 65 | default=6, 66 | type=int, 67 | help='number of transformer layers to use, only in use when --reinitialize is 
provided' 68 | ) 69 | parser.add_argument( 70 | '--log_neptune', 71 | dest='log_neptune', 72 | action='store_true', 73 | default=False 74 | ) 75 | parser.add_argument( 76 | '--neptune_experiment_name', 77 | dest='neptune_experiment_name', 78 | default='gpt2_class_pretrained' 79 | ) 80 | 81 | args = parser.parse_args() 82 | if args.learning_rate is None: 83 | args.learning_rate = 0.0005 if args.freeze_pretrained_model else 0.00002 84 | 85 | print(args) 86 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 87 | 88 | tokenizer = PacketTokenizer.from_pretrained(args.pretrained_path, flow_size=DEFAULT_PACKET_LIMIT_PER_FLOW) 89 | 90 | train_val_dataset = ClassificationQuantizedDataset(tokenizer, 91 | dataset_path=args.train_dataset) 92 | train_part_len = int(len(train_val_dataset) * 0.9) 93 | train_dataset, val_dataset = random_split(train_val_dataset, 94 | [train_part_len, len(train_val_dataset) - train_part_len]) 95 | 96 | test_dataset = ClassificationQuantizedDataset(tokenizer, 97 | dataset_path=args.test_dataset, 98 | label_encoder=train_val_dataset.target_encoder) 99 | 100 | collator = ClassificationQuantizedDataset.get_collator(mask_first_token=args.mask_first_token) 101 | 102 | cpu_counter = os.cpu_count() 103 | train_dataloader = DataLoader(train_dataset, 104 | batch_size=args.batch_size, 105 | drop_last=False, 106 | shuffle=False, 107 | collate_fn=collator, 108 | num_workers=cpu_counter) 109 | 110 | val_dataloader = DataLoader(val_dataset, 111 | batch_size=args.batch_size, 112 | drop_last=False, 113 | shuffle=False, 114 | collate_fn=collator, 115 | num_workers=cpu_counter 116 | ) 117 | 118 | test_dataloader = DataLoader(test_dataset, 119 | batch_size=args.batch_size, 120 | drop_last=False, 121 | collate_fn=collator, 122 | num_workers=cpu_counter) 123 | 124 | class_labels = train_val_dataset.target_encoder.classes_ 125 | 126 | nn_classifier = GPT2Classifier( 127 | args, 128 | class_labels, 129 | pretrained_model_path=args.pretrained_path, 130 | dropout=args.fc_dropout, 131 | freeze_pretrained_part=args.freeze_pretrained_model, 132 | reinitialize=args.reinitialize, 133 | n_layers=args.n_layers 134 | ) 135 | 136 | early_stop_callback = EarlyStopping( 137 | monitor='val_loss', 138 | min_delta=1e-4, 139 | patience=args.es_patience, 140 | verbose=False, 141 | mode='min' 142 | ) 143 | 144 | logger = NeptuneLogger( 145 | offline_mode=not args.log_neptune, 146 | close_after_fit=False, 147 | project_name=NEPTUNE_PROJECT, 148 | experiment_name=args.neptune_experiment_name, 149 | params=vars(args), 150 | upload_source_files=[(BASE_DIR / 'gpt_model/classifier/model.py').as_posix()] 151 | ) 152 | 153 | checkpoint_dir = f'{nn_classifier.__class__.__name__}_checkpoints' 154 | model_checkpoint = ModelCheckpoint( 155 | filepath=checkpoint_dir + '/{epoch}-{val_loss:.2f}-{other_metric:.2f}' 156 | ) 157 | 158 | trainer = Trainer( 159 | early_stop_callback=early_stop_callback, 160 | callbacks=[LearningRateLogger()], 161 | checkpoint_callback=model_checkpoint, 162 | auto_lr_find=False, 163 | logger=logger, 164 | gpus=int(device == 'cuda'), 165 | ) 166 | 167 | trainer.fit(nn_classifier, train_dataloader, val_dataloader) 168 | trainer.test(nn_classifier, test_dataloader) 169 | logger.experiment.log_artifact(model_checkpoint.best_model_path) 170 | logger.experiment.stop() 171 | 172 | 173 | if __name__ == '__main__': 174 | main() 175 | -------------------------------------------------------------------------------- /gpt_model/generator/run_generating.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from transformers import GPT2LMHeadModel 9 | 10 | from flow_parsing import save_dataset 11 | from evaluation_utils.modeling import evaluate_generated_traffic, save_metrics 12 | from gpt_model.generator.dataset import load_modeling_data_with_classes 13 | from gpt_model.generator.baseline import MarkovGenerator 14 | from gpt_model.tokenizer import PacketTokenizer 15 | from settings import FilePatterns, REPORT_DIR 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def generate_packets(protocol, n_samples, model: GPT2LMHeadModel, tokenizer, device='cpu', batch_limit=1024): 21 | logger.info(f'generating {n_samples} flows of "{protocol}"...') 22 | 23 | generated_flows = [] 24 | tokens_to_sample = [batch_limit] * (n_samples // batch_limit) 25 | if n_samples % batch_limit != 0: 26 | # add the remainder 27 | tokens_to_sample += [n_samples % batch_limit] 28 | 29 | counter = 0 30 | for batch_size in tokens_to_sample: 31 | input_ids = torch.tensor([tokenizer.tokens_to_ids[protocol]] * batch_size, dtype=torch.long 32 | ).view(batch_size, -1).to(device) 33 | 34 | # no_repeat_ngram_size=1 is a dirty hack to fix duplicating pairs for 2-packet protocols 35 | out = model.generate( 36 | input_ids, 37 | eos_token_id=tokenizer.eos_token_id, 38 | pad_token_id=tokenizer.pad_token_id, 39 | max_length=128, 40 | do_sample=True, 41 | num_return_sequences=1, 42 | top_k=len(tokenizer), 43 | no_repeat_ngram_size=int(protocol in ['DNS', 'NTP']), 44 | use_cache=True, 45 | ).cpu() 46 | torch.cuda.empty_cache() 47 | packets = tokenizer.batch_decode_packets(out) 48 | generated_flows.append(packets) 49 | counter += batch_size 50 | logger.info(f'generated {counter} flows') 51 | 52 | target_dim_size = max(x.shape[1] for x in generated_flows) 53 | # pad arrays to equal out their 2nd dim 54 | generated_flows = list(map(lambda x: np.pad(x, ((0, 0), (0, target_dim_size - x.shape[1])), constant_values=np.nan), 55 | generated_flows)) 56 | generated_flows = np.concatenate(generated_flows, axis=0) 57 | return generated_flows 58 | 59 | 60 | def main(): 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument( 63 | '--source_dataset', 64 | help='path to preprocessed .csv dataset', 65 | default='/media/raid_store/pretrained_traffic/train_csv' 66 | ) 67 | parser.add_argument( 68 | '--pretrained_path', 69 | default='/media/raid_store/pretrained_traffic/gpt2_model_4_6epochs_classes_home_iot' 70 | ) 71 | parser.add_argument( 72 | '--flow_limit_per_app', 73 | default=20000, 74 | type=int, 75 | ) 76 | parser.add_argument( 77 | '--filename_patterns_to_exclude', 78 | default='mawi', 79 | help='see settings.py::FilePatterns for the options' 80 | ) 81 | parser.add_argument( 82 | '--evaluate', 83 | action='store_true', 84 | default=False, 85 | ) 86 | 87 | parser.add_argument( 88 | '--markov_model', 89 | action='store_true', 90 | default=False, 91 | ) 92 | 93 | args = parser.parse_args() 94 | filename_patterns_to_exclude = getattr(FilePatterns, args.filename_patterns_to_exclude) 95 | source_dataset_folder = pathlib.Path(args.source_dataset) 96 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 97 | 98 | all_source_flows, classes = load_modeling_data_with_classes( 99 | source_dataset_folder, 100 | filename_patterns_to_exclude=filename_patterns_to_exclude 101 | ) 102 | source_class_counts = classes.value_counts() 103 | 
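    # generation below is class-conditioned: for GPT-2 the class token id seeds
    # model.generate(), while the Markov baseline is re-fitted on the source flows of
    # each class; per-class sample counts follow the source class counts, capped by
    # --flow_limit_per_app.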
104 | pretrained_path = pathlib.Path(args.pretrained_path) 105 | tokenizer = PacketTokenizer.from_pretrained(pretrained_path) 106 | if not args.markov_model: 107 | model = GPT2LMHeadModel.from_pretrained(pretrained_path).to(device) 108 | 109 | generated_flows_path = pretrained_path.parent / ('generated_flows_' + pretrained_path.stem) 110 | if args.markov_model: 111 | generated_flows_path = generated_flows_path.parent / (generated_flows_path.name + '_markov') 112 | generated_flows_path.mkdir(exist_ok=True) 113 | metrics = {} 114 | for proto in tokenizer.tokens_to_ids.keys(): 115 | # skip special tokens 116 | if proto.startswith('['): 117 | continue 118 | try: 119 | source_class_count = source_class_counts[proto] 120 | except KeyError: 121 | logger.error(f'could not find target class "{proto}" in dataset, skipping') 122 | continue 123 | 124 | n_flows_to_generate = source_class_count \ 125 | if source_class_count < args.flow_limit_per_app \ 126 | else args.flow_limit_per_app 127 | 128 | src_flows = all_source_flows[classes == proto] 129 | 130 | if args.markov_model: 131 | markov = MarkovGenerator() 132 | X = tokenizer.batch_encode_packets(src_flows.values.astype(np.float64), 133 | target_class=proto, 134 | add_special_tokens=True, 135 | return_attention_mask=False, 136 | return_tensors='np')['input_ids'] 137 | 138 | markov.fit(X) 139 | gen_tokens = markov.sample(n_flows_to_generate) 140 | gen_flows = tokenizer.batch_decode_packets(gen_tokens) 141 | else: 142 | gen_flows = generate_packets(proto, n_flows_to_generate, model, tokenizer, device) 143 | 144 | gen_flows = pd.DataFrame(gen_flows, columns=tokenizer.packet_quantizer.raw_columns[:gen_flows.shape[1]]) 145 | save_dataset(gen_flows, save_to=generated_flows_path / f'{proto}.csv') 146 | 147 | if args.evaluate: 148 | results = evaluate_generated_traffic(src_flows.values, gen_flows.values) 149 | metrics[proto] = results 150 | if args.evaluate: 151 | save_metrics(metrics, REPORT_DIR / ('report_' + generated_flows_path.stem + '.csv')) 152 | 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /fs_net/train_fsnet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | from functools import partial 5 | from pprint import pprint 6 | 7 | import torch 8 | from pytorch_lightning import Trainer 9 | from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateLogger, EarlyStopping 10 | from pytorch_lightning.loggers import NeptuneLogger 11 | from torch.utils.data import DataLoader, random_split 12 | 13 | from gpt_model.tokenizer import PacketTokenizer 14 | from fs_net.dataset import SimpleClassificationQuantizedDataset, ClassificationPacketSizeDataset 15 | from fs_net.model import FSNETClassifier 16 | from settings import BASE_DIR, DEFAULT_PACKET_LIMIT_PER_FLOW, NEPTUNE_PROJECT, TARGET_CLASS_COLUMN 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def _parse_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | '--train_dataset', 25 | help='path to preprocessed .csv dataset', 26 | required=True 27 | ) 28 | parser.add_argument( 29 | '--test_dataset', 30 | help='path to preprocessed .csv dataset', 31 | ) 32 | parser.add_argument( 33 | '--target_column', 34 | help='column within the .csv denoting target variable', 35 | default=TARGET_CLASS_COLUMN 36 | ) 37 | parser.add_argument( 38 | "--packet_num", 39 | dest='packet_num', 40 | type=int, 41 | 
help="specify the first N packets to use for classification, " 42 | "defaults to settings.py:DEFAULT_PACKET_LIMIT_PER_FLOW,", 43 | default=DEFAULT_PACKET_LIMIT_PER_FLOW 44 | ) 45 | parser.add_argument( 46 | "--use_packet_size_only", 47 | dest='use_packet_size_only', 48 | action='store_true', 49 | help="set to use only (truncated) packet size sequences instead of quantized (PS, IPT)", 50 | default=False 51 | ) 52 | parser.add_argument( 53 | "--dynamic_ps_range", 54 | dest='dynamic_ps_range', 55 | help="dynamic range for PS parameter which implicitly sets Embedding layer dim, effective only along" 56 | "with --use_packet_size_only option", 57 | type=int, 58 | default=5000 59 | ) 60 | parser.add_argument( 61 | '--tokenizer_path', 62 | help='path to the tokenizer checkpoint, defaults to the one used for tests ooops :)', 63 | default=BASE_DIR / 'tests/static/quantizer_checkpoint' 64 | ) 65 | parser.add_argument( 66 | '--neptune_experiment_name', 67 | dest='neptune_experiment_name', 68 | default='FS-NET' 69 | ) 70 | parser.add_argument( 71 | '--log_neptune', 72 | dest='log_neptune', 73 | action='store_true', 74 | default=False 75 | ) 76 | parser.add_argument( 77 | '--learning_rate', 78 | default=0.0005 79 | ) 80 | parser.add_argument( 81 | '--batch_size', 82 | default=256, 83 | ) 84 | parser.add_argument( 85 | '--es_patience', 86 | default=5, 87 | type=int, 88 | ) 89 | args = parser.parse_args() 90 | return args 91 | 92 | 93 | def main(): 94 | args = _parse_args() 95 | pprint(args) 96 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 97 | cpu_counter = os.cpu_count() 98 | 99 | if args.use_packet_size_only: 100 | n_tokens = args.dynamic_ps_range 101 | ds_class = partial(ClassificationPacketSizeDataset, max_size_range=n_tokens) 102 | else: 103 | tokenizer = PacketTokenizer.from_pretrained(args.tokenizer_path, 104 | flow_size=args.packet_num) 105 | n_tokens = len(tokenizer) 106 | ds_class = partial(SimpleClassificationQuantizedDataset, tokenizer=tokenizer) 107 | 108 | train_val_dataset = ds_class(dataset_path=args.train_dataset, 109 | target_column=args.target_column) 110 | train_part_len = int(len(train_val_dataset) * 0.9) 111 | train_dataset, val_dataset = random_split(train_val_dataset, 112 | [train_part_len, len(train_val_dataset) - train_part_len]) 113 | 114 | test_dataset = ds_class(dataset_path=args.test_dataset, 115 | label_encoder=train_val_dataset.target_encoder, 116 | target_column=args.target_column) 117 | 118 | train_dataloader = DataLoader(train_dataset, 119 | batch_size=args.batch_size, 120 | drop_last=False, 121 | shuffle=False, 122 | num_workers=cpu_counter) 123 | 124 | val_dataloader = DataLoader(val_dataset, 125 | batch_size=args.batch_size, 126 | drop_last=False, 127 | shuffle=False, 128 | num_workers=cpu_counter) 129 | 130 | test_dataloader = DataLoader(test_dataset, 131 | batch_size=args.batch_size, 132 | drop_last=False, 133 | num_workers=cpu_counter) 134 | 135 | class_labels = train_val_dataset.target_encoder.classes_ 136 | 137 | nn_classifier = FSNETClassifier(args, class_labels=class_labels, n_tokens=n_tokens) 138 | 139 | early_stop_callback = EarlyStopping( 140 | monitor='val_loss', 141 | min_delta=1e-4, 142 | patience=args.es_patience, 143 | verbose=False, 144 | mode='min' 145 | ) 146 | 147 | exp_logger = NeptuneLogger( 148 | offline_mode=not args.log_neptune, 149 | close_after_fit=False, 150 | project_name=NEPTUNE_PROJECT, 151 | experiment_name=args.neptune_experiment_name, 152 | params=vars(args), 153 | upload_source_files=[(BASE_DIR / 
'fs_net/model.py').as_posix()] 154 | ) 155 | 156 | checkpoint_dir = f'{nn_classifier.__class__.__name__}_checkpoints' 157 | model_checkpoint = ModelCheckpoint( 158 | filepath=checkpoint_dir + '/{epoch}-{val_loss:.2f}-{other_metric:.2f}' 159 | ) 160 | 161 | trainer = Trainer( 162 | early_stop_callback=early_stop_callback, 163 | callbacks=[LearningRateLogger()], 164 | checkpoint_callback=model_checkpoint, 165 | auto_lr_find=False, 166 | logger=exp_logger, 167 | gpus=int(device == 'cuda'), 168 | ) 169 | 170 | trainer.fit(nn_classifier, train_dataloader, val_dataloader) 171 | trainer.test(nn_classifier, test_dataloader) 172 | exp_logger.experiment.log_artifact(model_checkpoint.best_model_path) 173 | exp_logger.experiment.stop() 174 | 175 | 176 | if __name__ == '__main__': 177 | main() 178 | -------------------------------------------------------------------------------- /sklearn_classifiers/run_training.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import neptune 5 | from sklearn.model_selection import train_test_split 6 | from flow_parsing import read_dataset 7 | from evaluation_utils.classification import Reporter 8 | from sklearn_classifiers.featurizer import Featurizer, TransformerFeatureExtractor 9 | from sklearn_classifiers.clf_utils import read_classifier_settings, initialize_classifiers, fit_optimal_classifier 10 | from settings import BASE_DIR, DEFAULT_PACKET_LIMIT_PER_FLOW, NEPTUNE_PROJECT, TARGET_CLASS_COLUMN, RANDOM_SEED 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def _parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "-c", "--config", 19 | help="configuration file, defaults to config.yaml", 20 | default=BASE_DIR / 'sklearn_classifiers/config.yaml') 21 | 22 | parser.add_argument( 23 | '--train_dataset', 24 | help='path to preprocessed .csv dataset', 25 | required=True 26 | ) 27 | parser.add_argument( 28 | '--test_dataset', 29 | help='path to preprocessed .csv dataset, if not specified, 1/4 of the training dataset is selected in ' 30 | 'stratified manner', 31 | ) 32 | parser.add_argument( 33 | '--target_column', 34 | help='column within the .csv denoting target variable', 35 | default=TARGET_CLASS_COLUMN 36 | ) 37 | parser.add_argument( 38 | "--packet_num", 39 | dest='packet_num', 40 | type=int, 41 | help="specify the first N packets to use for classification, " 42 | "defaults to settings.py:DEFAULT_PACKET_LIMIT_PER_FLOW,", 43 | default=DEFAULT_PACKET_LIMIT_PER_FLOW 44 | ) 45 | parser.add_argument( 46 | '--continuous', 47 | dest='continuous', 48 | action='store_true', 49 | help="when enabled, continuous derivative features from dataset are accounted for, " 50 | "e.g. percentiles, sums, etc. of packet size. Defaults to False", 51 | default=False 52 | ) 53 | parser.add_argument( 54 | '--categorical', 55 | dest='categorical', 56 | action='store_true', 57 | help="when enabled, categorical feature from dataset are accounted for, " 58 | "e.g. IP protocol. 
Defaults to False", 59 | default=False 60 | ) 61 | parser.add_argument( 62 | "--raw", 63 | dest='raw', 64 | action='store_true', 65 | help="when enabled, raw packet sequences are used for classification", 66 | default=False 67 | ) 68 | parser.add_argument( 69 | '--use_iat', 70 | help='set to use inter-packet time features, as raw features and/or their derivatives', 71 | action='store_true', 72 | default=False 73 | ) 74 | parser.add_argument( 75 | '--transformer_model_path', 76 | help='path to the pretrained transformer, if specified, shadows other feature-related arguments except' 77 | 'for the number of packets to use' 78 | ) 79 | parser.add_argument( 80 | '--mask_first_token', 81 | help='masks first sequence token when extracting features from transformer model, useful when the model was' 82 | 'pretrained with class-specific first tokens', 83 | action='store_true', 84 | default=False 85 | ) 86 | parser.add_argument( 87 | '--reinitialize', 88 | action='store_true', 89 | default=False 90 | ) 91 | 92 | parser.add_argument('--search_hyper_parameters', dest='search_hyper_parameters', action='store_true', default=False) 93 | 94 | parser.add_argument('--log_neptune', dest='log_neptune', action='store_true', default=False) 95 | args = parser.parse_args() 96 | return args 97 | 98 | 99 | def main(): 100 | """ basic training loop example """ 101 | args = _parse_args() 102 | 103 | logger.info('Loading csv file..') 104 | 105 | df_train = read_dataset(args.train_dataset, fill_na=True) 106 | if args.test_dataset: 107 | df_test = read_dataset(args.test_dataset, fill_na=True) 108 | else: 109 | df_train, df_test = train_test_split(df_train, 110 | stratify=df_train[args.target_column], 111 | test_size=1 / 4, 112 | random_state=RANDOM_SEED) 113 | 114 | if args.transformer_model_path: 115 | featurizer = TransformerFeatureExtractor( 116 | args.transformer_model_path, 117 | args.packet_num, 118 | mask_first_token=args.mask_first_token, 119 | reinitialize=args.reinitialize 120 | ) 121 | else: 122 | featurizer = Featurizer( 123 | packet_num=args.packet_num, 124 | cont_features=None if args.continuous else [], 125 | categorical_features=None if args.categorical else [], 126 | consider_raw_features=args.raw, 127 | consider_j3a=False, 128 | consider_tcp_flags=False, 129 | consider_iat_features=args.use_iat, 130 | target_column=args.target_column, 131 | ) 132 | 133 | X_train, y_train = featurizer.fit_transform_encode(df_train) 134 | X_test, y_test = featurizer.transform_encode(df_test) 135 | 136 | classifier_settings = read_classifier_settings(args.config) 137 | clfs = initialize_classifiers(classifier_settings) 138 | 139 | for model_name, model_holder in clfs.items(): 140 | if args.search_hyper_parameters: 141 | fit_optimal_classifier(model_holder, X_train, y_train) 142 | else: 143 | model_holder.classifier.fit(X_train, y_train) 144 | y_pred = model_holder.classifier.predict(X_test) 145 | reporter = Reporter(y_test, y_pred, model_holder.name, featurizer.target_encoder.classes_) 146 | 147 | report_file = f'report_{model_holder.name}.csv' 148 | report = reporter.clf_report(save_to=report_file) 149 | print(report) 150 | 151 | if args.log_neptune: 152 | neptune.init(NEPTUNE_PROJECT) 153 | parameters = vars(args) 154 | parameters.update({'classifier': model_name}) 155 | parameters.update(model_holder.classifier.get_params(deep=False)) 156 | 157 | neptune.create_experiment(name='sklearn', params=parameters) 158 | neptune.log_artifact((reporter.save_dir / report_file).as_posix()) 159 | 
neptune.log_image('confusion_matrix', reporter.plot_conf_matrix()) 160 | for metric_name, metric_value in reporter.scores().items(): 161 | neptune.log_metric(metric_name, metric_value) 162 | 163 | neptune.stop() 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /flow_parsing/pcap_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import logging 4 | from typing import Optional 5 | 6 | import nfstream 7 | import pandas as pd 8 | 9 | import settings 10 | from flow_parsing.features import calc_raw_features, calc_stat_features 11 | from flow_parsing.aux_raw_features_plugin import AuxRawFeatures 12 | 13 | logger = logging.getLogger('flow_parser') 14 | 15 | 16 | def init_streamer(source, 17 | derivative_features: bool, 18 | online_mode: bool = False, 19 | packet_limit: int = settings.DEFAULT_PACKET_LIMIT_PER_FLOW): 20 | # since we decide and set routing policy upon first occurrence of a flow we don't care about its re-export 21 | active_timeout = settings.ACTIVE_TIMEOUT_ONLINE if online_mode else settings.ACTIVE_TIMEOUT_OFFLINE 22 | plugins = [AuxRawFeatures(packet_limit=packet_limit)] if derivative_features else [] 23 | logger.info(f'mode set to {"online" if online_mode else "offline"}') 24 | 25 | return nfstream.NFStreamer( 26 | source=source, 27 | statistical_analysis=False, 28 | idle_timeout=settings.IDLE_TIMEOUT, 29 | active_timeout=active_timeout, 30 | splt_analysis=packet_limit, 31 | accounting_mode=1, # IP size, 32 | udps=plugins, 33 | ) 34 | 35 | 36 | def get_ip_protocol_by_int(proto: int) -> str: 37 | try: 38 | return settings.IP_PROTO_MAPPING[proto] 39 | except KeyError: 40 | logger.warning(f'encountered unknown IP proto number: {proto}') 41 | return 'UNKNOWN' 42 | 43 | 44 | def flow_processor(source, 45 | derivative_features: bool = True, 46 | raw_features: Optional[int] = None, 47 | provide_labels=True, 48 | online_mode=True 49 | ) -> dict: 50 | def _make_flow_id(): 51 | return f'{get_ip_protocol_by_int(entry.protocol)} ' \ 52 | f'{entry.src_ip}:{entry.src_port} ' \ 53 | f'{entry.dst_ip}:{entry.dst_port}' 54 | 55 | streamer = init_streamer( 56 | source, 57 | derivative_features, 58 | online_mode=online_mode, 59 | packet_limit=raw_features if raw_features is not None else settings.DEFAULT_PACKET_LIMIT_PER_FLOW 60 | ) 61 | for flow_number, entry in enumerate(streamer): 62 | flow_ids = { 63 | 'flow_id': _make_flow_id(), 64 | 'ip_proto': get_ip_protocol_by_int(entry.protocol)} 65 | 66 | ndpi_features = { 67 | 'ndpi_app': entry.application_name, 68 | 'ndpi_category': entry.application_category_name, 69 | 'ndpi_client_info': entry.user_agent, 70 | 'ndpi_server_info': entry.requested_server_name, 71 | 'ndpi_j3ac': entry.client_fingerprint, 72 | 'ndpi_j3as': entry.server_fingerprint, 73 | } if provide_labels else {} 74 | 75 | raw_packets = calc_raw_features(entry) if raw_features else {} 76 | 77 | flow_features = calc_stat_features(entry) if derivative_features else {} 78 | 79 | if flow_number > 0 == flow_number % 5000: 80 | logger.info(f'processed {flow_number} flows...') 81 | yield dict(**flow_ids, **ndpi_features, **flow_features, **raw_packets) 82 | 83 | 84 | def parse_pcap_to_csv(pcap_file_path, 85 | target_csv_path, 86 | derivative_features: bool = True, 87 | raw_features: Optional[int] = None, 88 | provide_labels=True, 89 | online_mode=True): 90 | logger.info(f'started parsing file {pcap_file_path}') 91 | 
logger.info(f'saving to {target_csv_path}') 92 | with open(target_csv_path, 'w', newline='') as f: 93 | writer = csv.writer(f) 94 | for index, flow in enumerate(flow_processor(pcap_file_path, 95 | derivative_features=derivative_features, 96 | raw_features=raw_features, 97 | provide_labels=provide_labels, 98 | online_mode=online_mode)): 99 | if index == 0: 100 | writer.writerow(flow.keys()) 101 | writer.writerow(flow.values()) 102 | 103 | 104 | def parse_pcap_to_dataframe(pcap_file: str, 105 | derivative_features: bool = True, 106 | raw_features: Optional[int] = None, 107 | provide_labels=True, 108 | online_mode=True) -> pd.DataFrame: 109 | flows = [] 110 | logger.info(f'started parsing file {pcap_file}') 111 | for flow in flow_processor(pcap_file, 112 | derivative_features=derivative_features, 113 | raw_features=raw_features, 114 | provide_labels=provide_labels, 115 | online_mode=online_mode): 116 | flows.append(flow) 117 | return pd.DataFrame(flows) 118 | 119 | 120 | def _get_output_csv_filename(args) -> str: 121 | core_name = args.pcapfile.split('/')[-1].split('.')[0] 122 | if args.raw: 123 | core_name = core_name + '_raw' 124 | pkt_lim = args.raw if args.raw else settings.DEFAULT_PACKET_LIMIT_PER_FLOW 125 | output_csv = settings.PCAP_OUTPUT_DIR / f'{core_name}_{pkt_lim}packets.csv' 126 | return output_csv 127 | 128 | 129 | def main(): 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument( 132 | "-p", "--pcapfile", 133 | help="pcap file", 134 | default=(settings.BASE_DIR / 'flow_parsing/static/example.pcap').as_posix(), 135 | ) 136 | parser.add_argument( 137 | "-o", "--output", 138 | help="output .csv file destination", 139 | ) 140 | 141 | parser.add_argument( 142 | "--raw", 143 | dest='raw', 144 | type=int, 145 | help="when provided, in addition to feature statistics, specified N number of raw features " 146 | "(packet lengths and IATs) for first N packets are exported, which are used by traffic augmenters/models.", 147 | default=None 148 | ) 149 | parser.add_argument('--derivative', dest='derivative', action='store_true', 150 | help="when enabled, derivative feature statistics " 151 | "(e.g. such as percentiles, sums, etc. of packet size) " 152 | "of first DEFAULT_PACKET_LIMIT_PER_FLOW or provided via arg '--raw' packets are exported") 153 | parser.add_argument('--no-derivative', dest='derivative', action='store_false') 154 | parser.set_defaults(derivative=True) 155 | 156 | parser.add_argument('--online_mode', dest='online_mode', action='store_true', 157 | help="when enabled, active flow expiration timeout is decreased to the one defined in settings." 
158 | "In offline mode, active timeout is set to be large " 159 | "enough to avoid flow fragmentation", 160 | default=False) 161 | 162 | args = parser.parse_args() 163 | 164 | output_csv = args.output if args.output else _get_output_csv_filename(args) 165 | parse_pcap_to_csv(args.pcapfile, 166 | target_csv_path=output_csv, 167 | derivative_features=args.derivative, 168 | raw_features=args.raw, 169 | online_mode=args.online_mode) 170 | 171 | 172 | if __name__ == '__main__': 173 | main() 174 | -------------------------------------------------------------------------------- /evaluation_utils/modeling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from functools import partial 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import scipy 8 | 9 | from flow_parsing.features import inter_packet_times_from_timestamps 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def flows_to_packets(flows): 15 | return flows[~np.isnan(flows)].reshape(-1, 2) 16 | 17 | 18 | def convert_ipt_to_iat(flows): 19 | """ converts inter-packet time (IPT - timing between 2 any packets) to 20 | inter-arrival time (IAT - timing between 2 consecutive packets within 1 direction) """ 21 | 22 | def ipt_to_iat(flow): 23 | """ 24 | :param flow: source flow of size (packet_num, feature_num=2) 25 | :return: 26 | """ 27 | timestamps_like = np.cumsum(flow[:, 1]) 28 | direction_from_mask = flow[:, 0] > 0 29 | direction_to_mask = flow[:, 0] < 0 30 | 31 | iat_flow = np.full(flow.shape, np.nan) 32 | iat_flow[direction_from_mask, 1] = inter_packet_times_from_timestamps(timestamps_like[direction_from_mask]) 33 | iat_flow[direction_to_mask, 1] = inter_packet_times_from_timestamps(timestamps_like[direction_to_mask]) 34 | iat_flow[:, 0] = flow[:, 0] 35 | return iat_flow 36 | 37 | source_shape = flows.shape 38 | raw_packets = flows.reshape(-1, 2) # per-packet view 39 | raw_packets = raw_packets.reshape(-1, source_shape[1] // 2, 2) # (n_flows, n_packets, features) 40 | iat_packets = np.empty_like(raw_packets) 41 | for i in range(source_shape[0]): 42 | iat_packets[i, :, :] = ipt_to_iat(raw_packets[i]) 43 | iat_packets = iat_packets.reshape(source_shape) 44 | return iat_packets 45 | 46 | 47 | def plot_packets(packet_features, limit_packet_scale=False, save_to=None, ru_lang=False): 48 | if isinstance(packet_features, pd.DataFrame): 49 | packet_features = packet_features.values 50 | 51 | fig, ax = plt.subplots(figsize=(12, 7)) 52 | plt.scatter(packet_features[:, 0], packet_features[:, 1], alpha=0.3) 53 | ax.set_title(f'Число кластеров: {packet_features.shape[0]}' if ru_lang else 54 | f'Number of items: {packet_features.shape[0]}') 55 | if limit_packet_scale: 56 | ax.set_xlim(-1, 1) 57 | ax.grid(True) 58 | ax.set_xlabel('размер пакета, байт / 1500' if ru_lang else 59 | 'packet size, bytes / 1500') 60 | ax.set_ylabel('log10(межпакетный интервал, µs)' if ru_lang else 61 | 'log10(inter-packet time, µs)') 62 | if save_to: 63 | plt.savefig(save_to, dpi=300) 64 | 65 | 66 | def packets_per_flow(flows): 67 | non_packet_mask = ~np.isnan(flows) 68 | return non_packet_mask.sum(1) / 2 69 | 70 | 71 | def handle_estimation_exceptions(func): 72 | def real_decorator(*args, **kwargs): 73 | try: 74 | return func(*args, **kwargs) 75 | except Exception as e: 76 | logger.error(f'{func.__name__}: {e}') 77 | return np.nan 78 | 79 | return real_decorator 80 | 81 | 82 | @handle_estimation_exceptions 83 | def estimate_pdf(samples): 84 | x_values = np.linspace(0, 
max(samples), 100) 85 | kde = scipy.stats.gaussian_kde(samples)(x_values) 86 | kde /= sum(kde) 87 | return kde 88 | 89 | 90 | @handle_estimation_exceptions 91 | def get_kl_divergence_continuous(orig_values, gen_values): 92 | kde_orig = estimate_pdf(orig_values) 93 | kde_gen = estimate_pdf(gen_values) 94 | return scipy.stats.entropy(kde_orig, kde_gen) 95 | 96 | 97 | @handle_estimation_exceptions 98 | def get_wasserstein_distance_pdf(orig_values, gen_values): 99 | kde_orig = estimate_pdf(orig_values) 100 | kde_gen = estimate_pdf(gen_values) 101 | return scipy.stats.wasserstein_distance(kde_orig, kde_gen) 102 | 103 | 104 | @handle_estimation_exceptions 105 | def get_ks_stat(orig_values, gen_values): 106 | ks = scipy.stats.ks_2samp(orig_values, gen_values) 107 | return ks.statistic 108 | 109 | 110 | def scaled_diff(orig, gen): 111 | return np.abs(orig - gen) / orig 112 | 113 | 114 | @handle_estimation_exceptions 115 | def scaled_diff_at_percentile(orig, gen, percentile): 116 | o = np.percentile(orig, percentile) 117 | g = np.percentile(gen, percentile) 118 | return scaled_diff(o, g) 119 | 120 | 121 | def packets_to_throughput(packets, resolution='1S'): 122 | # replace indexes with DateTime format 123 | df = pd.Series( 124 | packets[:, 0], 125 | index=pd.to_datetime(np.cumsum(packets[:, 1]), unit='ms') 126 | ) 127 | throughput = df.resample(resolution).sum() 128 | return throughput.values 129 | 130 | 131 | def evaluate_generated_traffic(src_flows: np.ndarray, gen_flows: np.ndarray) -> dict: 132 | logger.info('starting evaluation of flows...') 133 | src_packets = flows_to_packets(convert_ipt_to_iat(src_flows)) 134 | gen_packets = flows_to_packets(convert_ipt_to_iat(gen_flows)) 135 | 136 | client_src_mask = src_packets[:, 0] > 0 137 | client_gen_mask = gen_packets[:, 0] > 0 138 | 139 | client_src_packets = src_packets[client_src_mask] 140 | server_src_packets = src_packets[~client_src_mask] 141 | 142 | client_gen_packets = gen_packets[client_gen_mask] 143 | server_gen_packets = gen_packets[~client_gen_mask] 144 | 145 | throughput = { 146 | 'src_avg_throughput_bytes_per_s_client': np.mean(packets_to_throughput(client_src_packets)), 147 | 'gen_avg_throughput_bytes_per_s_client': np.mean(packets_to_throughput(client_gen_packets)), 148 | 'src_avg_throughput_bytes_per_s_server': np.mean(packets_to_throughput(server_src_packets)), 149 | 'gen_avg_throughput_bytes_per_s_server': np.mean(packets_to_throughput(server_gen_packets)), 150 | } 151 | 152 | metrics = {} 153 | 154 | for metric_name, metric_function in [ 155 | ('KL', get_kl_divergence_continuous), 156 | # ('Wasserstein', get_wasserstein_distance_pdf), 157 | ('KS_2sample', get_ks_stat), 158 | ('10th_percentile', partial(scaled_diff_at_percentile, percentile=10)), 159 | # ('25th_percentile', partial(scaled_diff_at_percentile, percentile=25)), 160 | ('50th_percentile', partial(scaled_diff_at_percentile, percentile=50)), 161 | # ('75th_percentile', partial(scaled_diff_at_percentile, percentile=75)), 162 | ('90th_percentile', partial(scaled_diff_at_percentile, percentile=90)) 163 | 164 | ]: 165 | metrics.update({ 166 | metric_name + '_packets_per_flow': metric_function(packets_per_flow(src_flows), packets_per_flow(gen_flows)), 167 | metric_name + '_PS_client': metric_function(client_src_packets[:, 0], client_gen_packets[:, 0]), 168 | metric_name + '_IAT_client': metric_function(client_src_packets[:, 1], client_gen_packets[:, 1]), 169 | metric_name + '_PS_server': metric_function(server_src_packets[:, 0], server_gen_packets[:, 0]), 170 | metric_name + 
'_IAT_server': metric_function(server_src_packets[:, 1], server_gen_packets[:, 1]), 171 | metric_name + '_thrpt_client': metric_function(packets_to_throughput(client_src_packets), 172 | packets_to_throughput(client_gen_packets)), 173 | f'{metric_name}_thrpt_server': metric_function(packets_to_throughput(server_src_packets), 174 | packets_to_throughput(server_gen_packets)), 175 | }) 176 | 177 | common_metrics = { 178 | 'n_flows': min(src_flows.shape[0], gen_flows.shape[0]) 179 | } 180 | return dict(**common_metrics, **metrics, **throughput) 181 | 182 | 183 | def save_metrics(metrics: dict, save_to): 184 | pd.DataFrame(metrics).T.to_csv(save_to) 185 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/format_parsed_as_classification_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | from typing import Iterable, Optional 4 | 5 | import pandas as pd 6 | 7 | from flow_parsing.utils import get_hash, read_dataset, check_filename_in_patterns, save_dataset 8 | from gpt_model.data_preparation.preprocess_target_pcaps import IOT_DEVICES 9 | from settings import TARGET_CLASS_COLUMN, LOWER_BOUND_CLASS_OCCURRENCE, FilePatterns, DATASET_DIR 10 | 11 | """ 12 | task-specific module, provided for the sake of reproducibility 13 | formats labels from outputs of nDPI and in case of IoT traffic, assigns labels from filenames 14 | """ 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | # signalling protos are common among all devices and it doesn't make sense to treat them separately 19 | COMMON_PROTOCOLS = ['DNS', 'NTP', 'STUN'] 20 | GARBAGE_PROTOCOLS = ['ICMP', 'ICMPV6', 'DHCPV6', 'DHCP', 'Unknown', 'IGMP', 'SSDP'] 21 | 22 | 23 | def _load_parsed_results(dir_with_parsed_csvs, filename_patterns_to_exclude: Optional[Iterable[str]]): 24 | dir_with_parsed_csvs = pathlib.Path(dir_with_parsed_csvs) 25 | 26 | parsed_csvs = list(dir_with_parsed_csvs.glob('*.csv')) 27 | 28 | iot_datasets = [] 29 | usual_traffic = [] 30 | 31 | iot_categories = set(item.category for item in IOT_DEVICES) 32 | for csv_file in parsed_csvs: 33 | # skip non-home and IoT files 34 | if check_filename_in_patterns(csv_file, filename_patterns_to_exclude): 35 | continue 36 | 37 | traffic_df = read_dataset(csv_file) 38 | 39 | if csv_file.name.startswith('train'): 40 | base_name = csv_file.name.split('train_')[-1] 41 | elif csv_file.name.startswith('val'): 42 | base_name = csv_file.name.split('val_')[-1] 43 | elif csv_file.name.startswith('test'): 44 | base_name = csv_file.name.split('test_')[-1] 45 | else: 46 | base_name = csv_file.name 47 | 48 | traffic_df['source_file'] = base_name 49 | 50 | if base_name.split('_')[0] in iot_categories: 51 | iot_datasets.append(traffic_df) 52 | else: 53 | usual_traffic.append(traffic_df) 54 | 55 | try: 56 | iot_traffic = pd.concat(iot_datasets, ignore_index=True) 57 | except ValueError: 58 | iot_traffic = pd.DataFrame([]) 59 | logger.warning('no IoT files were found!') 60 | usual_traffic = pd.concat(usual_traffic, ignore_index=True) 61 | logger.info(f'found: {len(iot_traffic)} IoT flows, and {len(usual_traffic)} usual') 62 | return iot_traffic, usual_traffic 63 | 64 | 65 | def _set_common_protos_targets(dataset): 66 | for proto in COMMON_PROTOCOLS: 67 | dataset.loc[dataset['ndpi_app'].str.startswith(proto), TARGET_CLASS_COLUMN] = proto 68 | return dataset 69 | 70 | 71 | def _set_iot_devices_targets(dataset): 72 | """ assigns target class according to the category of an 
IoT device """ 73 | common_indexer = dataset[TARGET_CLASS_COLUMN].isin(COMMON_PROTOCOLS) 74 | iot_category = dataset.loc[~common_indexer, 'source_file'].str.split('_').apply(lambda x: 'IoT_' + x[0]) 75 | dataset.loc[~common_indexer, TARGET_CLASS_COLUMN] = iot_category 76 | logger.info(str(dataset[TARGET_CLASS_COLUMN].value_counts())) 77 | return dataset 78 | 79 | 80 | def _set_application_targets(dataset): 81 | """ assigns target class according to the 'Y' application from nDPI's 'X.Y' label """ 82 | common_indexer = dataset[TARGET_CLASS_COLUMN].isin(COMMON_PROTOCOLS) 83 | cleaned_up_applications = dataset.loc[~common_indexer, 'ndpi_app'].str.split('.').apply(lambda x: x[-1]) 84 | dataset.loc[~common_indexer, TARGET_CLASS_COLUMN] = cleaned_up_applications 85 | logger.info(str(dataset[TARGET_CLASS_COLUMN].value_counts())) 86 | return dataset 87 | 88 | 89 | def _rm_garbage(dataset, garbage: list = None, column_from='ndpi_app'): 90 | """ rm irrelevant targets for classification at an upstream device """ 91 | if garbage is None: 92 | garbage = GARBAGE_PROTOCOLS 93 | garbage_indexer = dataset[column_from].isin(garbage) 94 | logger.info(f'found {garbage_indexer.sum()} objects of garbage protos') 95 | return dataset[~garbage_indexer] 96 | 97 | 98 | def prune_targets(dataset, lower_bound=LOWER_BOUND_CLASS_OCCURRENCE, underrepresented_protos: list = None): 99 | """ rm infrequent targets """ 100 | proto_counts = dataset[TARGET_CLASS_COLUMN].value_counts() 101 | if underrepresented_protos is None: 102 | underrepresented_protos = proto_counts[proto_counts < lower_bound].index.tolist() 103 | if underrepresented_protos: 104 | logger.info(f'pruning the following targets: {underrepresented_protos}') 105 | dataset = dataset.loc[~dataset[TARGET_CLASS_COLUMN].isin(underrepresented_protos)] 106 | return dataset.reset_index(drop=True), underrepresented_protos 107 | 108 | 109 | def delete_duplicating_flows(dataset): 110 | def to_session_id(flow_id): 111 | proto, conn1, conn2 = flow_id.split(' ') 112 | return proto, frozenset([conn1, conn2]) 113 | 114 | dataset['session_id'] = dataset['flow_id'].apply(to_session_id) 115 | dataset = dataset.drop_duplicates(subset=['session_id']) 116 | dataset.drop(columns='session_id', inplace=True) 117 | logger.info(f'{dataset.shape[0]} flows left after deduplication') 118 | return dataset 119 | 120 | 121 | def prepare_classification_data(csv_dir, remove_garbage=True, filename_patterns_to_exclude=None): 122 | """ the order of operations matters """ 123 | iot_traffic, usual_traffic = _load_parsed_results(csv_dir, filename_patterns_to_exclude) 124 | 125 | if len(iot_traffic) > 0: 126 | iot_traffic = _set_common_protos_targets(iot_traffic) 127 | iot_traffic = _set_iot_devices_targets(iot_traffic) 128 | if remove_garbage: 129 | iot_traffic = _rm_garbage(iot_traffic, 130 | column_from='ndpi_app') 131 | 132 | usual_traffic = _set_common_protos_targets(usual_traffic) 133 | usual_traffic = _set_application_targets(usual_traffic) 134 | 135 | if remove_garbage: 136 | usual_traffic = _rm_garbage(usual_traffic, 137 | garbage=GARBAGE_PROTOCOLS + ['Amazon'], 138 | column_from=TARGET_CLASS_COLUMN) 139 | 140 | merged_traffic = pd.concat([usual_traffic, iot_traffic], ignore_index=True) 141 | return merged_traffic 142 | 143 | 144 | def main(): 145 | pattern_name = 'mawi_unswnb_iscxvpn' 146 | excluded_patterns = getattr(FilePatterns, pattern_name) 147 | train_df = prepare_classification_data(DATASET_DIR / 'pretraining/train_csv', 148 | filename_patterns_to_exclude=excluded_patterns) 149 | eval_df 
= prepare_classification_data(DATASET_DIR / 'pretraining/val_csv', 150 | filename_patterns_to_exclude=excluded_patterns) 151 | test_df = prepare_classification_data(DATASET_DIR / 'pretraining/test_csv', 152 | filename_patterns_to_exclude=excluded_patterns) 153 | tr_val_df = pd.concat([train_df, eval_df], ignore_index=True) 154 | tr_val_df = delete_duplicating_flows(tr_val_df) 155 | tr_val_df, underrepresented_protos = prune_targets(tr_val_df) 156 | 157 | test_df = delete_duplicating_flows(test_df) 158 | test_df, _ = prune_targets(test_df, underrepresented_protos=underrepresented_protos) 159 | 160 | suffix = get_hash(tr_val_df) 161 | save_dataset(tr_val_df, save_to=DATASET_DIR / f'train_{suffix}_no_{pattern_name}.csv') 162 | save_dataset(test_df, save_to=DATASET_DIR / f'test_{suffix}_no_{pattern_name}.csv') 163 | 164 | 165 | if __name__ == '__main__': 166 | main() 167 | -------------------------------------------------------------------------------- /gpt_model/tokenizer.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import pathlib 4 | from functools import partial 5 | from typing import Optional, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | from transformers import PreTrainedTokenizerBase 11 | from transformers.tokenization_utils_base import TensorType, BatchEncoding 12 | 13 | from settings import logger 14 | from .quantizer import PacketQuantizer 15 | 16 | 17 | class PacketTokenizer(PreTrainedTokenizerBase): 18 | max_model_input_sizes = 128 19 | model_input_names = ["attention_mask"] 20 | 21 | def __init__(self, 22 | packet_quantizer: PacketQuantizer, 23 | unk_token="[UNK]", 24 | bos_token="[BOF]", 25 | eos_token="[EOF]", 26 | pad_token="[PAD]", 27 | **kwargs 28 | ): 29 | super().__init__( 30 | unk_token=unk_token, 31 | bos_token=bos_token, 32 | eos_token=eos_token, 33 | pad_token=pad_token, 34 | **kwargs, 35 | ) 36 | self.packet_quantizer = packet_quantizer 37 | self.cluster_num = packet_quantizer.n_clusters 38 | # special token ids have indexes larger than all packet clusters (which start at 0) 39 | ids_to_tokens = kwargs.get('ids_to_tokens') 40 | if ids_to_tokens: 41 | self.ids_to_tokens = ids_to_tokens 42 | else: 43 | self.ids_to_tokens = collections.OrderedDict([(ids + self.cluster_num, tok) 44 | for ids, tok in enumerate(self.all_special_tokens)]) 45 | 46 | self.tokens_to_ids = {v: k for k, v in self.ids_to_tokens.items()} 47 | logger.info('initialized PacketTokenizer') 48 | 49 | def add_class_tokens(self, class_names: list): 50 | classes_to_add = set(class_names) - set(self.tokens_to_ids.keys()) 51 | 52 | ids_to_classes = collections.OrderedDict([(ids + len(self), tok) for ids, tok in enumerate(classes_to_add)]) 53 | classes_to_ids = {v: k for k, v in ids_to_classes.items()} 54 | 55 | self.ids_to_tokens.update(ids_to_classes) 56 | self.tokens_to_ids.update(classes_to_ids) 57 | 58 | @classmethod 59 | def from_pretrained(cls, pretrained_model_name_or_path, flow_size=None): 60 | path_dir = pathlib.Path(pretrained_model_name_or_path) 61 | flow_size = cls.max_model_input_sizes if flow_size is None else flow_size 62 | 63 | token_map_file = path_dir / 'ids_to_tokens.json' 64 | if token_map_file.is_file(): 65 | with open(token_map_file, 'r') as jf: 66 | ids_to_tokens = json.load(jf) 67 | ids_to_tokens = {int(k): v for k, v in ids_to_tokens.items()} 68 | logger.info('loaded special tokens map from "ids_to_tokens.json"') 69 | else: 70 | ids_to_tokens = {} 71 | 
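            # Added note (an assumption inferred from __init__ above, not part of the original file):
            # when the map has to be recreated, special-token ids are laid out right after the
            # packet-cluster ids, so a freshly written "ids_to_tokens.json" would look roughly like
            #   {"<n_clusters>": "[UNK]", "<n_clusters + 1>": "[BOF]", "<n_clusters + 2>": "[EOF]", "<n_clusters + 3>": "[PAD]"}
            # with any class tokens introduced later via add_class_tokens() appended after these.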
logger.warning('special tokens map "ids_to_tokens.json" was not found, will attempt to recreate one') 72 | 73 | quantizer = PacketQuantizer.from_checkpoint(path_dir, flow_size=flow_size) 74 | return cls( 75 | packet_quantizer=quantizer, 76 | ids_to_tokens=ids_to_tokens, 77 | ) 78 | 79 | def save_pretrained(self, save_directory): 80 | save_directory = pathlib.Path(save_directory) 81 | with open(save_directory / 'ids_to_tokens.json', 'w') as jf: 82 | json.dump(self.ids_to_tokens, jf) 83 | 84 | self.packet_quantizer.save_checkpoint(save_directory) 85 | 86 | def convert_ids_to_tokens(self, index): 87 | if isinstance(index, int): 88 | # exception indicates the bug 89 | return self.ids_to_tokens[index] 90 | else: 91 | raise NotImplementedError 92 | 93 | def convert_tokens_to_ids(self, tokens): 94 | if isinstance(tokens, str): 95 | return self.tokens_to_ids[tokens] 96 | else: 97 | raise NotImplementedError 98 | 99 | def _pad_flow(self, flow: np.ndarray) -> np.ndarray: 100 | non_packets_mask = flow == self.packet_quantizer.non_packet_value 101 | flow[non_packets_mask] = self.pad_token_id 102 | return flow 103 | 104 | def _expand_with_special_tokens(self, flow: np.ndarray, first_token) -> np.ndarray: 105 | # truncate to account for the tokens 106 | flow = flow[:self.max_model_input_sizes - 2] 107 | flow = np.insert(flow, 0, first_token) 108 | non_packets_mask = flow == self.packet_quantizer.non_packet_value 109 | flow[non_packets_mask] = self.pad_token_id 110 | # we either pick index of the first True value or append 111 | end_of_flow = non_packets_mask.argmax() if non_packets_mask.any() else len(flow) 112 | flow = np.insert(flow, end_of_flow, self.eos_token_id) 113 | return flow 114 | 115 | def batch_encode_packets( 116 | self, 117 | flows: Union[pd.DataFrame, np.ndarray], 118 | target_class: Optional[str] = None, 119 | add_special_tokens: bool = True, 120 | return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, 121 | return_attention_mask: Optional[bool] = True, 122 | ) -> BatchEncoding: 123 | 124 | if isinstance(flows, pd.DataFrame): 125 | flows = flows.values 126 | 127 | if flows.shape[1] // 2 != self.max_model_input_sizes: 128 | logger.debug(f'input number of features ({flows.shape[1] // 2}) does not match ' 129 | f'max_model_input_sizes ({self.max_model_input_sizes})') 130 | clusters = self.packet_quantizer.transform(flows) 131 | 132 | if add_special_tokens: 133 | first_token = self.convert_tokens_to_ids(target_class) if target_class is not None else self.bos_token_id 134 | expander = partial(self._expand_with_special_tokens, first_token=first_token) 135 | clusters = np.apply_along_axis(expander, axis=1, arr=clusters) 136 | else: 137 | clusters = np.apply_along_axis(self._pad_flow, axis=1, arr=clusters) 138 | 139 | result = {'input_ids': clusters.astype(np.int64)} 140 | 141 | if return_attention_mask: 142 | token_mask = (clusters != self.pad_token_id).astype(np.int64) 143 | result.update({'attention_mask': token_mask}) 144 | 145 | return BatchEncoding(result, tensor_type=TensorType(return_tensors), prepend_batch_axis=False) 146 | 147 | def _remove_special_tokens(self, flow): 148 | # rm first token 149 | flow = flow[1:] 150 | try: 151 | flow_end_idx = np.where(flow == self.eos_token_id)[0][0] 152 | except IndexError: 153 | flow_end_idx = flow.shape[0] - 1 154 | logger.warning('could not find EOS token, removing the last one') 155 | 156 | if flow_end_idx == flow.shape[0] - 1: 157 | flow = flow[:-1] 158 | else: 159 | flow = flow[:-1] 160 | # replace pad token with quantizer's non 
packet value for consistency 161 | flow[flow_end_idx:] = self.packet_quantizer.non_packet_value 162 | return flow 163 | 164 | def batch_decode_packets(self, tokenized_flows) -> np.ndarray: 165 | if isinstance(tokenized_flows, torch.Tensor): 166 | tokenized_flows = tokenized_flows.numpy() 167 | clusters_only = np.apply_along_axis(self._remove_special_tokens, axis=1, arr=tokenized_flows) 168 | packet_features = self.packet_quantizer.inverse_transform(clusters_only) 169 | return packet_features 170 | 171 | def __len__(self): 172 | return self.cluster_num + len(self.tokens_to_ids) 173 | 174 | @property 175 | def max_len(self): 176 | return self.max_model_input_sizes 177 | -------------------------------------------------------------------------------- /sklearn_classifiers/knn_cosine.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import Counter 3 | 4 | import ngtpy 5 | import numpy as np 6 | import pandas as pd 7 | import puffinn 8 | from scipy.spatial.distance import cdist 9 | from sklearn.base import BaseEstimator 10 | from sklearn.preprocessing import normalize 11 | 12 | from sklearn_classifiers.utils import iterate_batch_indexes 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def cos_dist(query, keys): 18 | # if got vector 19 | if len(query.shape) == 1: 20 | query = query.reshape(1, -1) 21 | return cdist(keys, query, 'cosine').T 22 | 23 | 24 | def top_k_cosine_similar(query, keys, k=1): 25 | distances = cos_dist(query, keys) 26 | top_k = np.argpartition(distances, k)[:, :k] 27 | return top_k 28 | 29 | 30 | def voter(obj_votes): 31 | top_count = Counter(obj_votes).most_common(1) 32 | # return the top key 33 | return top_count[0][0] 34 | 35 | 36 | def batch_voter(class_votes): 37 | """ 38 | returns vector with the most occurring values within a `class_votes` row, if tie -- selects the first one 39 | :param class_votes: is a (objects, votes) matrix 40 | :return: 41 | """ 42 | top_or_first = np.apply_along_axis(voter, axis=1, arr=class_votes) 43 | return top_or_first 44 | 45 | 46 | class KNeighborsCosineClassifier(BaseEstimator): 47 | """ 48 | custom K-nn based on cosine similarity 49 | 50 | time: 2h18m 51 | perf: 52 | accuracy 0.981014 0.981014 0.981014 0.981014 53 | macro avg 0.862088 0.861645 0.859317 96705.000000 54 | weighted avg 0.981303 0.981014 0.981095 96705.000000 55 | 56 | """ 57 | 58 | def __init__(self, n_neighbors=3): 59 | self.n_neighbors = n_neighbors 60 | self.target_keys: np.ndarray = np.nan 61 | self.target_classes: np.ndarray = np.nan 62 | 63 | def fit(self, X, y): 64 | X_train = X.values if isinstance(X, pd.DataFrame) else X 65 | y_train = y.values if isinstance(y, pd.Series) else y 66 | assert X_train.shape[0] == y_train.shape[0], 'X and y length must match!' 
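        # Illustrative usage sketch (added comment, not in the original source; X_train/X_test and
        # y_train follow the conventions of sklearn_classifiers/run_training.py):
        #   clf = KNeighborsCosineClassifier(n_neighbors=3).fit(X_train, y_train)
        #   y_pred = clf.predict(X_test, batch_size=1024)
        # predict() walks the queries in batches via iterate_batch_indexes(), ranks training rows
        # with top_k_cosine_similar() and takes a majority vote over their labels with batch_voter().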
67 | # assure the values are of np.ndarray type after all 68 | self.target_keys = np.array(X_train) 69 | self.target_classes = np.array(y_train) 70 | logger.info('fit KNeighborsCosineClassifier') 71 | return self 72 | 73 | def predict(self, X, batch_size=1024): 74 | X = X.values if isinstance(X, pd.DataFrame) else X 75 | X = np.array(X) 76 | predictions = np.empty(X.shape[0], dtype=np.int) 77 | for start_idx, end_idx in iterate_batch_indexes(X, batch_size): 78 | top_indexes = top_k_cosine_similar(query=X[start_idx:end_idx], keys=self.target_keys, k=self.n_neighbors) 79 | predictions[start_idx:end_idx] = batch_voter(self.target_classes[top_indexes]) 80 | return predictions 81 | 82 | 83 | class KNeighborsLshClassifier(BaseEstimator): 84 | 85 | def __init__(self, n_neighbors=1): 86 | self.target_classes: np.ndarray = np.nan 87 | self.n_neighbors = n_neighbors 88 | self.lsh_table = None 89 | 90 | def _construct_table(self, dataset: np.ndarray): 91 | raise NotImplementedError 92 | 93 | def _check_set_features(self, X): 94 | X = X.values if isinstance(X, pd.DataFrame) else np.array(X) 95 | X = X.astype(np.float32) 96 | normalize(X, copy=False) 97 | return X 98 | 99 | def fit(self, X, y): 100 | X_train = self._check_set_features(X) 101 | self.target_classes = y.values if isinstance(y, pd.Series) else np.array(y) 102 | self._construct_table(X_train) 103 | logger.info(f'fit {self.__class__.__name__}') 104 | return self 105 | 106 | def _predict(self, X): 107 | raise NotImplementedError 108 | 109 | def predict(self, X): 110 | X = self._check_set_features(X) 111 | return self._predict(X) 112 | 113 | 114 | class KNeighborsPuffinnClassifier(KNeighborsLshClassifier): 115 | """ 116 | PUFFINN - Parameterless and Universal Fast Finding of Nearest Neighbors 117 | https://arxiv.org/pdf/1906.12211.pdf 118 | 119 | time: 12m 120 | perf: 121 | accuracy 0.981759 0.981759 0.981759 0.981759 122 | macro avg 0.865334 0.861639 0.860683 96705.000000 123 | weighted avg 0.981953 0.981759 0.981810 96705.000000 124 | 125 | it is really close to the perf of grid-search K-nn approach but much faster 126 | """ 127 | 128 | def __init__(self, n_neighbors=1, search_recall=0.995, memory_limit=1 * 1024 ** 3): 129 | super().__init__(n_neighbors) 130 | self.memory_limit = memory_limit 131 | self.search_recall = search_recall 132 | self.lsh_table: puffinn.Index 133 | 134 | def _construct_table(self, dataset: np.ndarray): 135 | self.lsh_table = puffinn.Index('angular', dataset.shape[1], self.memory_limit) 136 | for v in dataset: 137 | self.lsh_table.insert(v.tolist()) 138 | logger.info('building index table...') 139 | self.lsh_table.rebuild() 140 | 141 | def _predict(self, X): 142 | def query_predictor(query): 143 | top_indexes = self.lsh_table.search(query.tolist(), self.n_neighbors, self.search_recall) 144 | return voter(self.target_classes[top_indexes]) 145 | 146 | predictions = np.apply_along_axis(query_predictor, axis=1, arr=X) 147 | return predictions 148 | 149 | 150 | class KNeighborsNGTClassifier(KNeighborsLshClassifier): 151 | """ 152 | ONNG-NGT (https://github.com/yahoojapan/NGT/wiki) 153 | 154 | better keep optimize_* args as defaults, it doesn't work as expected 155 | """ 156 | 157 | def __init__( 158 | self, 159 | n_neighbors=1, 160 | search_epsilon=0.1, 161 | optimize_n_edges=False, 162 | optimize_search_params=False, 163 | index_path='/tmp/knn_ngt_index' 164 | ): 165 | super().__init__(n_neighbors) 166 | self.index_path = index_path 167 | self.optimize_n_edges = optimize_n_edges 168 | self.optimize_search_params = 
optimize_search_params 169 | self.search_epsilon = search_epsilon 170 | 171 | def _construct_table(self, dataset: np.ndarray): 172 | # when data is normalized row-wise, the L2 distance metric is similar to the cosine 173 | ngtpy.create(self.index_path, dataset.shape[1], distance_type='L2') 174 | index = ngtpy.Index(self.index_path) # open the index 175 | index.batch_insert(dataset) 176 | if self.optimize_n_edges: 177 | logger.info('optimizing number of edges...') 178 | index.save() 179 | optimizer = ngtpy.Optimizer(log_disabled=True) 180 | try: 181 | optimizer.optimize_number_of_edges_for_anng(self.index_path) 182 | except RuntimeError as e: 183 | logger.error(f'skipping optimization due to: {e}') 184 | if self.optimize_search_params: 185 | optimizer = ngtpy.Optimizer(log_disabled=True) 186 | optimizer.set_processing_modes( 187 | search_parameter_optimization=True, 188 | prefetch_parameter_optimization=True, 189 | accuracy_table_generation=True) 190 | optimizer.optimize_search_parameters(self.index_path) 191 | logger.info('building index table...') 192 | index.build_index() # build index 193 | index.save() 194 | self.lsh_table = index 195 | 196 | def _predict(self, X): 197 | def query_predictor(query): 198 | if self.optimize_search_params: 199 | top_indexes = self.lsh_table.search(query, size=self.n_neighbors, expected_accuracy=0.99) 200 | else: 201 | top_indexes = self.lsh_table.search(query, size=self.n_neighbors, epsilon=self.search_epsilon) 202 | 203 | top_indexes = [i[0] for i in top_indexes] 204 | return voter(self.target_classes[top_indexes]) 205 | 206 | predictions = np.apply_along_axis(query_predictor, axis=1, arr=X) 207 | return predictions 208 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. 
A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 | -------------------------------------------------------------------------------- /gpt_model/generator/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from dataclasses import dataclass 4 | from functools import lru_cache, partial 5 | from typing import Dict, List, Tuple 6 | 7 | import logging 8 | import numpy as np 9 | import pandas as pd 10 | import sh 11 | import torch 12 | from torch.utils.data.dataset import IterableDataset, Dataset 13 | from transformers import BatchEncoding 14 | 15 | from flow_parsing import check_filename_in_patterns 16 | from settings import TARGET_CLASS_COLUMN 17 | from gpt_model.tokenizer import PacketTokenizer 18 | from gpt_model.data_preparation.format_parsed_as_classification_dataset import prepare_classification_data 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class PretrainIterDataset(IterableDataset): 24 | 25 | def __init__(self, tokenizer: PacketTokenizer, folder_path: str, train_mode=True): 26 | assert os.path.isdir(folder_path) 27 | # TODO feature caching, multiple workers?, filter out one-packet flows 28 | 29 | self.source_files = list(pathlib.Path(folder_path).glob('*.csv')) 30 | logger.info("initializing dataset from %s with %s files", folder_path, len(self.source_files)) 31 | 32 | self.tokenizer = tokenizer 33 | self.train_mode = train_mode 34 | 35 | def __iter__(self) -> BatchEncoding: 36 | assert torch.utils.data.get_worker_info() is None 37 | for csv in self.source_files: 38 | # not really the best way, reading is gonna be slow 39 | logger.info(f'FlowDataset: reading {csv}') 40 | reader = pd.read_csv(csv, chunksize=1, 41 | usecols=self.tokenizer.packet_quantizer.raw_columns, 42 | dtype=float) 43 | for raw_flow in reader: 44 | # skip 1-packet and empty flows 45 | if self.train_mode and pd.isna(raw_flow.iloc[:, 3]).any(): 46 | continue 47 | 48 | encoded = self.tokenizer.batch_encode_packets(raw_flow, 49 | add_special_tokens=True, 50 | return_attention_mask=True) 51 | yield encoded 52 | 53 | @lru_cache(maxsize=2) 54 | def __len__(self): 55 | """ the files are too large to count their size via Python """ 56 | line_counter = sh.Command('sed') 57 | total = 0 58 | for filename in self.source_files: 59 | found_lines = line_counter("-n", "$=", filename) 60 | # do not count .csv header 61 | total += int(found_lines) - 1 62 | return total 63 | 64 | 65 | class PretrainDataset(Dataset): 66 | def __init__(self, tokenizer: PacketTokenizer, folder_path: str, filename_patterns_to_exclude: tuple = ()): 67 | assert os.path.isdir(folder_path) 68 | # TODO feature caching, multiple workers?, filter out one-packet flows 69 | 70 | source_files = list(pathlib.Path(folder_path).glob('*.csv')) 71 | file_matcher = partial(check_filename_in_patterns, patterns=filename_patterns_to_exclude) 72 | source_files = list(file for file in source_files if not file_matcher(file)) 73 | print(source_files) 74 | self.source_files = source_files 75 | logger.info("initializing dataset from %s with %s files", folder_path, len(self.source_files)) 76 | 77 | self.tokenizer = tokenizer 78 | # load as 32-bit to save RAM 79 | raw_flows = pd.concat((pd.read_csv(csv, usecols=self.tokenizer.packet_quantizer.raw_columns, dtype=np.float32) 80 | for csv in self.source_files), ignore_index=True) 81 | 82 | raw_flows = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].sample(frac=1, random_state=1) 83 | 84 | logger.info('concatenated dataframes within the folder') 85 | # skip 1-packet and empty 
flows 86 | raw_flows.dropna(axis=0, subset=['raw_packet0', 'raw_packet1'], inplace=True, how='any') 87 | self.raw_flows = raw_flows.values 88 | logger.info('initialized dataset') 89 | 90 | def __len__(self): 91 | return len(self.raw_flows) 92 | 93 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 94 | return self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 95 | add_special_tokens=True, 96 | return_attention_mask=True).data 97 | 98 | 99 | def load_modeling_data_with_classes( 100 | folder_path, 101 | shuffle=True, 102 | filename_patterns_to_exclude=None 103 | ) -> Tuple[pd.DataFrame, pd.Series]: 104 | assert os.path.isdir(folder_path) 105 | logger.info(f"initializing dataset from {folder_path}, excluding {filename_patterns_to_exclude}") 106 | folder_path = pathlib.Path(folder_path) 107 | 108 | raw_flows = prepare_classification_data(folder_path, 109 | remove_garbage=False, 110 | filename_patterns_to_exclude=filename_patterns_to_exclude) 111 | # skip 1-packet and empty flows 112 | raw_flows.dropna(axis=0, subset=['raw_packet0', 'raw_packet1'], inplace=True, how='any') 113 | if shuffle: 114 | raw_flows = raw_flows.sample(frac=1, random_state=1) 115 | 116 | return raw_flows.filter(regex='raw_'), raw_flows[TARGET_CLASS_COLUMN] 117 | 118 | 119 | class PretrainDatasetWithClasses(Dataset): 120 | def __init__(self, tokenizer: PacketTokenizer, folder_path: str, filename_patterns_to_exclude: tuple = ()): 121 | self.tokenizer = tokenizer 122 | 123 | raw_flows, targets = load_modeling_data_with_classes(folder_path, 124 | filename_patterns_to_exclude=filename_patterns_to_exclude) 125 | 126 | self.raw_flows: np.ndarray = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].values 127 | self.targets: np.ndarray = targets.values 128 | logger.info('initialized dataset') 129 | tokenizer.add_class_tokens(self.target_classes) 130 | logger.info('added special tokens representing classes') 131 | 132 | @property 133 | def target_classes(self) -> list: 134 | return np.unique(self.targets).tolist() 135 | 136 | def __len__(self): 137 | return len(self.raw_flows) 138 | 139 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 140 | return self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 141 | target_class=self.targets[i], 142 | add_special_tokens=True, 143 | return_attention_mask=True).data 144 | 145 | 146 | @dataclass 147 | class PretrainCollator: 148 | """ 149 | Data collator used for traffic flow modeling. 150 | - collates batches of equal-length encoded flows into single input and attention-mask tensors 151 | - builds labels for causal language modeling by masking the tokenizer's pad_token_id with -100 152 | """ 153 | 154 | tokenizer: PacketTokenizer 155 | 156 | def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: 157 | """ 158 | Data collator used for packet modeling.
159 | - collates batches of tensors 160 | """ 161 | 162 | length_of_first = examples[0]['input_ids'].size(0) 163 | are_tensors_same_length = all(x['input_ids'].size(0) == length_of_first for x in examples) 164 | assert are_tensors_same_length 165 | 166 | input_ids = torch.cat([item['input_ids'] for item in examples], dim=0) 167 | attention_masks = torch.cat([item['attention_mask'] for item in examples], dim=0) 168 | labels = input_ids.clone().detach() 169 | labels[labels == self.tokenizer.pad_token_id] = -100 170 | return { 171 | "input_ids": input_ids, 172 | "attention_mask": attention_masks, 173 | "labels": labels, 174 | } 175 | 176 | 177 | class FinetuningDataset(Dataset): 178 | def __init__(self, tokenizer: PacketTokenizer, dataset_path: str, target_class: str, target_column: str = None): 179 | assert os.path.isfile(dataset_path) 180 | 181 | dataset_path = pathlib.Path(dataset_path) 182 | self.source_file = dataset_path 183 | logger.info("initializing dataset from %s with '%s' target class", dataset_path, target_class) 184 | 185 | self.tokenizer = tokenizer 186 | 187 | self.target_column = TARGET_CLASS_COLUMN if target_column is None else target_column 188 | 189 | raw_flows = pd.read_csv(self.source_file, 190 | usecols=self.tokenizer.packet_quantizer.raw_columns + [self.target_column]) 191 | raw_flows = raw_flows[raw_flows.loc[:, self.target_column] == target_class] 192 | 193 | self.raw_flows = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].values 194 | logger.info('initialized dataset') 195 | 196 | def __len__(self): 197 | return len(self.raw_flows) 198 | 199 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 200 | return self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 201 | add_special_tokens=True, 202 | return_attention_mask=True).data 203 | -------------------------------------------------------------------------------- /gpt_model/quantizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import pathlib 4 | from typing import Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.cluster import KMeans 9 | 10 | from flow_parsing.features import generate_raw_feature_names 11 | 12 | try: 13 | from libKMCUDA import kmeans_cuda 14 | except ImportError: 15 | print('libKMCUDA was not found: calling fit() for PacketQuantizer is not possible, kmcuda must be installed') 16 | 17 | from settings import logger, BASE_DIR 18 | 19 | 20 | def get_kmeans_mae(original, restored): 21 | s = np.abs(original - restored).sum() 22 | mae = np.abs(original - restored).mean() 23 | logger.info(f'MAE: {mae}, cumulative error: {s}') 24 | return mae 25 | 26 | 27 | def drop_nan_packets(packet_features): 28 | return packet_features[~np.isnan(packet_features) & ~np.isinf(packet_features)].reshape(-1, 2) 29 | 30 | 31 | def init_sklearn_kmeans_from_checkpoint(checkpoint_path): 32 | checkpoint_path = pathlib.Path(checkpoint_path) 33 | with open(checkpoint_path / 'clusters.json', 'rb') as jf: 34 | clusters = np.array(json.load(jf)) 35 | 36 | clusters = drop_nan_packets(clusters) 37 | # make KMeans think it was fitted 38 | quantizer = KMeans(n_clusters=clusters.shape[0]) 39 | quantizer._n_threads = 1 40 | quantizer.cluster_centers_ = clusters 41 | logger.info(f'init sklearn KMeans from checkpoint: {checkpoint_path}') 42 | return quantizer 43 | 44 | 45 | class PacketScaler: 46 | def __init__(self, max_packet_len=1500): 47 | self.max_packet_len = max_packet_len 48 | 
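    # Worked example (illustrative values): with the default max_packet_len of 1500,
    # transform() maps the pair (750, 0.01) to (0.5, -2.0): the packet length is scaled
    # into [0, 1] and the non-zero IAT is replaced by its log10, while zero IATs stay 0.
    # inverse_transform() maps (0.5, -2.0) back to (750.0, 0.01).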
49 | def transform(self, packet_pairs): 50 | """ 51 | :param packet_pairs: (N, 2), 0 -- packet_len, 1 -- IAT 52 | :return: transformed_packets (N, 2) 53 | """ 54 | packet_pairs[:, 0] = packet_pairs[:, 0] / self.max_packet_len 55 | # avoids warning and -inf values. the scale here is in microseconds (?) 56 | zero_iats = np.isclose(packet_pairs[:, 1], 0.) 57 | packet_pairs[:, 1][zero_iats] = 0 58 | packet_pairs[:, 1][~zero_iats] = np.log10(packet_pairs[:, 1][~zero_iats]) 59 | return packet_pairs 60 | 61 | def inverse_transform(self, packet_pairs): 62 | packet_pairs[:, 0] = packet_pairs[:, 0] * self.max_packet_len 63 | # to correctly rescale, we need to know which were initially zeros 64 | zero_iats = np.isclose(packet_pairs[:, 1], 0., atol=1e-3) 65 | packet_pairs[:, 1][zero_iats] = 0 66 | packet_pairs[:, 1][~zero_iats] = 10 ** packet_pairs[:, 1][~zero_iats] 67 | return packet_pairs 68 | 69 | 70 | class PacketQuantizer: 71 | """ 72 | You can init PacketQuantizer for transform() and inverse_transform() only after loading from checkpoint 73 | """ 74 | 75 | def __init__(self, 76 | n_clusters=16384, 77 | flow_size=128, 78 | packet_scaler=PacketScaler, 79 | kmeans_clusterizer: Optional[KMeans] = None): 80 | self.n_clusters = n_clusters 81 | # hard-coded to the expected dataframe format (as in features.py) 82 | self.iat_columns = generate_raw_feature_names(flow_size, base_features=('iat',)) 83 | self.packet_columns = generate_raw_feature_names(flow_size, base_features=('packet',)) 84 | self.raw_columns = generate_raw_feature_names(flow_size) 85 | self.scaler = packet_scaler() 86 | self._cluster_centers = None 87 | self.kmeans = kmeans_clusterizer 88 | self.non_packet_value = -1 89 | 90 | def fit(self, raw_batch): 91 | """ 92 | https://github.com/src-d/kmcuda#python-api 93 | due to performance reasons, uses kmcuda instead of sklearn's KMeans. 
94 | :param raw_batch: 95 | :return: 96 | """ 97 | # do not consider single-packet flows 98 | raw_batch = raw_batch[raw_batch.raw_packet1 != 0] 99 | # form matrix (n_packet x (packet_size, IAT)) 100 | packet_features = raw_batch[self.raw_columns].values.reshape(-1, 2) 101 | # omit non_packet values 102 | packet_features = drop_nan_packets(packet_features) 103 | init_clusters = "k-means++" if self._cluster_centers is None else self._cluster_centers 104 | logger.info('fitting on {} packets, init clusters from data: {}'.format(packet_features.shape[0], 105 | isinstance(init_clusters, str))) 106 | packet_features = self.scaler.transform(packet_features) 107 | 108 | cluster_centers_, assignments = kmeans_cuda( 109 | samples=packet_features, 110 | clusters=self.n_clusters, 111 | tolerance=0.01, 112 | init=init_clusters, 113 | yinyang_t=0, 114 | metric="L2", 115 | average_distance=False, 116 | seed=1, device=0, verbosity=1 117 | ) 118 | self._cluster_centers = cluster_centers_ 119 | self._evaluate(packet_features, cluster_centers_[assignments]) 120 | 121 | def _evaluate(self, packet_features, restored): 122 | n_unique_clusters = len(self._cluster_centers[~np.isnan(self._cluster_centers)]) / 2 123 | logger.info(f'found {n_unique_clusters} unique clusters') 124 | get_kmeans_mae(packet_features, restored) 125 | 126 | def save_checkpoint(self, save_directory): 127 | save_directory = pathlib.Path(save_directory) 128 | save_directory.mkdir(exist_ok=True) 129 | quantizer_path = save_directory / 'clusters.json' 130 | with open(quantizer_path, 'w') as qf: 131 | try: 132 | json.dump(self._cluster_centers.tolist(), qf) 133 | except AttributeError: 134 | # account for the case when saving not during training 135 | json.dump(self.kmeans.cluster_centers_.tolist(), qf) 136 | logger.info(f'saving checkpoint to {quantizer_path}') 137 | return quantizer_path.as_posix() 138 | 139 | @classmethod 140 | def from_checkpoint(cls, checkpoint_path, *args, **kwargs): 141 | kmeans = init_sklearn_kmeans_from_checkpoint(checkpoint_path) 142 | return cls(n_clusters=kmeans.n_clusters, kmeans_clusterizer=kmeans, *args, **kwargs) 143 | 144 | def transform(self, raw_packet_batch): 145 | """ transforms raw packet matrix of size (n_flows, packets*2) 146 | (where 2 is due to features - PS, IAT) to packet clusters matrix 147 | of size (n_flows, packets). Non-packet values in the source matrix 148 | MUST BE NaN, and in the cluster matrix they correspond to -1. 
149 | """ 150 | if self.kmeans is None: 151 | raise Exception('the class must be init with an sklearn KMeans instance first!') 152 | 153 | if isinstance(raw_packet_batch, pd.DataFrame): 154 | # assert correct order 155 | raw_packet_batch = raw_packet_batch[self.raw_columns].values 156 | 157 | batch_size = raw_packet_batch.shape[0] 158 | # reshape to form (n_samples, n_features) for PacketScaler and KMeans 159 | raw_packet_batch = raw_packet_batch.reshape(-1, 2) 160 | non_packet_mask = np.isnan(raw_packet_batch) | np.isinf(raw_packet_batch) 161 | # temp fill to allow for predicting 162 | raw_packet_batch[non_packet_mask] = 0 163 | clusters = self.kmeans.predict(self.scaler.transform(raw_packet_batch)) 164 | # set non_packet clusters to NaN 165 | non_packet_cluster_mask = non_packet_mask.sum(axis=1).astype(bool) 166 | clusters[non_packet_cluster_mask] = self.non_packet_value 167 | # reshape back to batch form 168 | clusters = clusters.reshape(batch_size, -1) 169 | return clusters 170 | 171 | def inverse_transform(self, cluster_batch): 172 | flat_clusters = cluster_batch.flatten() 173 | non_packet_cluster_mask = flat_clusters == self.non_packet_value 174 | # assign temp cluster value 175 | flat_clusters[non_packet_cluster_mask] = 0 176 | outbound_cluster_mask = flat_clusters >= self.n_clusters 177 | n_outbound_clusters = outbound_cluster_mask.sum() 178 | if n_outbound_clusters > 0: 179 | logger.warning(f'found {n_outbound_clusters} outbounding cluster values') 180 | flat_clusters[outbound_cluster_mask] = 0 181 | reverted_packets = self.scaler.inverse_transform(self.kmeans.cluster_centers_[flat_clusters]) 182 | # make NaN non-packets 183 | reverted_packets[non_packet_cluster_mask] = np.nan 184 | reverted_packets = reverted_packets.reshape(-1, cluster_batch.shape[1] * 2) 185 | return reverted_packets 186 | 187 | 188 | def main(): 189 | parser = argparse.ArgumentParser() 190 | parser.add_argument( 191 | "-s", "--source", 192 | help="source folder with .csv files. the recommended way is to create a folder with all the training data, that" 193 | "was merged and shuffled beforehand (e.g. via pandas)" 194 | ) 195 | args = parser.parse_args() 196 | 197 | quantizer = PacketQuantizer(n_clusters=16384, flow_size=128) 198 | raw_csv_dir = pathlib.Path(args.source) 199 | 200 | flow_limit = 1_000_000 201 | for file_idx, csv in enumerate(raw_csv_dir.glob('*.csv')): 202 | logger.info(f'processing {csv}') 203 | reader = pd.read_csv(csv, chunksize=flow_limit, usecols=quantizer.raw_columns, dtype=np.float32) 204 | for batch, raw_packets in enumerate(reader): 205 | quantizer.fit(raw_packets) 206 | if batch % 10 == 0: 207 | quantizer.save_checkpoint( 208 | BASE_DIR / f'gpt_model/trained_quantizers/quantizer_2^14_{csv.stem}_{batch}') 209 | 210 | quantizer.save_checkpoint(BASE_DIR / f'gpt_model/trained_quantizers/quantizer_2^14_{csv.stem}_final') 211 | 212 | 213 | if __name__ == '__main__': 214 | main() 215 | -------------------------------------------------------------------------------- /gpt_model/generator/train_generator.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 20 | """ 21 | 22 | 23 | import logging 24 | import math 25 | import os 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from transformers import ( 30 | GPT2Config, 31 | GPT2LMHeadModel, 32 | HfArgumentParser, 33 | TrainingArguments, 34 | set_seed, AutoModelForCausalLM, Trainer, 35 | ) 36 | 37 | from gpt_model.generator.dataset import PretrainCollator, PretrainDataset, FinetuningDataset, PretrainDatasetWithClasses 38 | from gpt_model.tokenizer import PacketTokenizer 39 | from settings import FilePatterns 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | 44 | @dataclass 45 | class ModelArguments: 46 | """ 47 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 48 | """ 49 | 50 | model_name_or_path: Optional[str] = field( 51 | default=None, 52 | metadata={ 53 | "help": "The model checkpoint for weights initialization. " 54 | "Leave None if you want to train a model from scratch." 55 | }, 56 | ) 57 | quantizer_path: Optional[str] = field( 58 | default=None, 59 | metadata={ 60 | "help": "The quantizer checkpoint for weights initialization. Must be provided when the model" 61 | "is trained from scratch. Not used, when the model is initialized from checkpoint" 62 | }, 63 | ) 64 | model_type: Optional[str] = field( 65 | default=None, 66 | ) 67 | config_name: Optional[str] = field( 68 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 69 | ) 70 | tokenizer_name: Optional[str] = field( 71 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 72 | ) 73 | cache_dir: Optional[str] = field( 74 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} 75 | ) 76 | 77 | 78 | @dataclass 79 | class DataTrainingArguments: 80 | """ 81 | Arguments pertaining to what data we are going to input our model for training and eval. 82 | """ 83 | 84 | train_data_file: Optional[str] = field( 85 | default=None, metadata={"help": "The input training data file (a text file)."} 86 | ) 87 | eval_data_file: Optional[str] = field( 88 | default=None, 89 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 90 | ) 91 | block_size: int = field( 92 | default=-1, 93 | metadata={ 94 | "help": "Optional input sequence length after tokenization." 95 | "The training dataset will be truncated in block of this size for training." 96 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
97 | }, 98 | ) 99 | overwrite_cache: bool = field( 100 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 101 | ) 102 | finetune_on_class: str = field( 103 | default=None, 104 | metadata={"help": "specifies flow subset within the DF's target column to fine-tune the packet model on"} 105 | ) 106 | train_with_targets: bool = field( 107 | default=False, 108 | metadata={"help": "specifies whether to include flow label as the first special token or to use a generic BOS"} 109 | ) 110 | file_patterns_to_exclude: str = field( 111 | default='mawi', 112 | metadata={"help": "specifies which file patterns from the data folder to exclude, defaults to empty," 113 | " see settings.py::FilePatterns for used combinations"} 114 | ) 115 | 116 | 117 | def get_dataset(args: DataTrainingArguments, tokenizer: PacketTokenizer, evaluate=False): 118 | file_path = args.eval_data_file if evaluate else args.train_data_file 119 | logger.info(f'block_size is {args.block_size} and likely unused') 120 | file_patterns = getattr(FilePatterns, args.file_patterns_to_exclude) 121 | if args.finetune_on_class: 122 | return FinetuningDataset(tokenizer=tokenizer, 123 | dataset_path=file_path, 124 | target_class=args.finetune_on_class) 125 | 126 | if args.train_with_targets: 127 | return PretrainDatasetWithClasses(tokenizer=tokenizer, 128 | folder_path=file_path, 129 | filename_patterns_to_exclude=file_patterns) 130 | 131 | return PretrainDataset(tokenizer=tokenizer, folder_path=file_path, filename_patterns_to_exclude=file_patterns) 132 | 133 | 134 | def main(): 135 | # See all possible arguments in src/transformers/training_args.py 136 | # or by passing the --help flag to this script. 137 | # We now keep distinct sets of args, for a cleaner separation of concerns. 138 | 139 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 140 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 141 | 142 | if data_args.eval_data_file is None and training_args.do_eval: 143 | raise ValueError( 144 | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " 145 | "or remove the --do_eval argument." 146 | ) 147 | 148 | if ( 149 | os.path.exists(training_args.output_dir) 150 | and os.listdir(training_args.output_dir) 151 | and training_args.do_train 152 | and not training_args.overwrite_output_dir 153 | ): 154 | raise ValueError( 155 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 156 | f"Use --overwrite_output_dir to overcome." 
157 | ) 158 | 159 | if data_args.finetune_on_class and data_args.train_with_targets: 160 | raise ValueError("Pretraining with flow labels and fine-tuning on the class simultaneously not supported.") 161 | 162 | if not model_args.model_name_or_path and not model_args.quantizer_path: 163 | raise ValueError("Either model or quantizer checkpoint path must be specified") 164 | 165 | # Setup logging 166 | logging.basicConfig( 167 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 168 | datefmt="%m/%d/%Y %H:%M:%S", 169 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 170 | ) 171 | logger.warning( 172 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 173 | training_args.local_rank, 174 | training_args.device, 175 | training_args.n_gpu, 176 | bool(training_args.local_rank != -1), 177 | training_args.fp16, 178 | ) 179 | logger.info("Training/evaluation parameters %s", training_args) 180 | 181 | # Set seed 182 | set_seed(training_args.seed) 183 | 184 | # Load pretrained model and tokenizer 185 | # 186 | # Distributed training: 187 | # The .from_pretrained methods guarantee that only one local process can concurrently 188 | # download model & vocab. 189 | 190 | if model_args.model_name_or_path: 191 | config = GPT2Config.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) 192 | else: 193 | config = GPT2Config.from_json_file(model_args.config_name) 194 | logger.warning("You are instantiating a new config instance from scratch.") 195 | 196 | if model_args.model_name_or_path: 197 | tokenizer = PacketTokenizer.from_pretrained(model_args.model_name_or_path) 198 | else: 199 | tokenizer = PacketTokenizer.from_pretrained(model_args.quantizer_path) 200 | 201 | if model_args.model_name_or_path: 202 | model = GPT2LMHeadModel.from_pretrained( 203 | model_args.model_name_or_path, 204 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 205 | config=config, 206 | cache_dir=model_args.cache_dir, 207 | ) 208 | else: 209 | logger.info("Training new model from scratch") 210 | model = AutoModelForCausalLM.from_config(config) 211 | 212 | if data_args.block_size <= 0: 213 | data_args.block_size = tokenizer.max_len 214 | # Our input block size will be the max possible for the model 215 | else: 216 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 217 | 218 | # Get datasets 219 | train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None 220 | eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None 221 | model.resize_token_embeddings(len(tokenizer)) 222 | print(model) 223 | 224 | # Initialize our Trainer 225 | trainer = Trainer( 226 | model=model, 227 | args=training_args, 228 | data_collator=PretrainCollator(tokenizer), 229 | train_dataset=train_dataset, 230 | eval_dataset=eval_dataset, 231 | prediction_loss_only=True, 232 | ) 233 | 234 | # Training 235 | if training_args.do_train: 236 | model_path = ( 237 | model_args.model_name_or_path 238 | if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) 239 | else None 240 | ) 241 | trainer.train(model_path=model_path) 242 | trainer.save_model() 243 | # For convenience, we also re-save the tokenizer to the same directory, 244 | # so that you can share your model easily on huggingface.co/models =) 245 | if trainer.is_world_master(): 246 | tokenizer.save_pretrained(training_args.output_dir) 247 | 248 | # Evaluation 249 
| results = {} 250 | if training_args.do_eval: 251 | logger.info("*** Evaluate ***") 252 | 253 | eval_output = trainer.evaluate() 254 | 255 | perplexity = math.exp(eval_output["eval_loss"]) 256 | result = {"perplexity": perplexity} 257 | 258 | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") 259 | if trainer.is_world_master(): 260 | with open(output_eval_file, "w") as writer: 261 | logger.info("***** Eval results *****") 262 | for key in sorted(result.keys()): 263 | logger.info(" %s = %s", key, str(result[key])) 264 | writer.write("%s = %s\n" % (key, str(result[key]))) 265 | 266 | results.update(result) 267 | 268 | return results 269 | 270 | 271 | def _mp_fn(index): 272 | # For xla_spawn (TPUs) 273 | main() 274 | 275 | 276 | if __name__ == "__main__": 277 | main() 278 | -------------------------------------------------------------------------------- /sklearn_classifiers/featurizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | from typing import Tuple, List 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from pandarallel import pandarallel 9 | from sklearn.compose import ColumnTransformer 10 | from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder 11 | from transformers import GPT2Model 12 | 13 | from evaluation_utils.modeling import flows_to_packets 14 | from flow_parsing.features import ( 15 | FEATURE_PREFIX, 16 | FEATURE_FUNCTIONS, 17 | CONTINUOUS_NAMES, 18 | generate_raw_feature_names, 19 | calc_parameter_stats 20 | ) 21 | from flow_parsing.utils import get_df_hash, save_dataset, read_dataset 22 | from gpt_model.tokenizer import PacketTokenizer 23 | from settings import TARGET_CLASS_COLUMN, DEFAULT_PACKET_LIMIT_PER_FLOW, CACHE_DIR 24 | from .utils import iterate_batch_indexes 25 | 26 | logger = logging.getLogger(__name__) 27 | pandarallel.initialize() 28 | 29 | 30 | class BaseFeaturizer: 31 | def __init__(self, packet_num, consider_iat_features=True, target_column=TARGET_CLASS_COLUMN): 32 | self.target_encoder = LabelEncoder() 33 | self.target_column = target_column 34 | 35 | self.raw_features: List[str] = generate_raw_feature_names( 36 | packet_num, 37 | base_features=('packet', 'iat') if consider_iat_features else ('packet',) 38 | ) 39 | 40 | def encode_targets(self, data: pd.DataFrame) -> np.ndarray: 41 | return self.target_encoder.transform(data[self.target_column]) 42 | 43 | def fit_target_encoder(self, data: pd.DataFrame) -> np.ndarray: 44 | return self.target_encoder.fit_transform(data[self.target_column]) 45 | 46 | def fit_transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 47 | raise NotImplementedError 48 | 49 | def transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 50 | raise NotImplementedError 51 | 52 | 53 | class TransformerFeatureExtractor(BaseFeaturizer): 54 | def __init__( 55 | self, 56 | transformer_pretrained_path, 57 | packet_num, 58 | mask_first_token=False, 59 | reinitialize=False, 60 | device=None 61 | ): 62 | super().__init__(packet_num, consider_iat_features=True) 63 | assert packet_num > 0, 'raw packet sequence length must be > 0' 64 | self._pretrained_path = pathlib.Path(transformer_pretrained_path) 65 | self.tokenizer = PacketTokenizer.from_pretrained(transformer_pretrained_path, 66 | flow_size=packet_num) 67 | # fall back to automatic device selection only when no device is given explicitly 68 | self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu') 69 | 70 | feature_extractor =
GPT2Model.from_pretrained(transformer_pretrained_path).to(self.device) 71 | self.reinitialize = reinitialize 72 | if self.reinitialize: 73 | logger.info('resetting model weights') 74 | feature_extractor.init_weights() 75 | self.feature_extractor = feature_extractor.eval() 76 | self.mask_first_token = mask_first_token 77 | 78 | def _get_transformer_features(self, df, batch_size=1024): 79 | filename = (get_df_hash(df) + 80 | self._pretrained_path.stem + 81 | ('_mask_first' if self.mask_first_token else '') + 82 | ('_reinitialize' if self.reinitialize else '')) 83 | tmp_path = CACHE_DIR / filename 84 | if tmp_path.is_file(): 85 | logger.info(f'found cached transformer features, loading {tmp_path}...') 86 | return read_dataset(tmp_path, True) 87 | 88 | logger.info(f'did not find cached transformer features at {tmp_path}, processing...') 89 | merged_tensor = np.empty((len(df), self.feature_extractor.config.hidden_size)) 90 | for start_idx, end_idx in iterate_batch_indexes(df, batch_size): 91 | raw_subset = df[self.raw_features].iloc[start_idx:end_idx] 92 | encoded_flows = self.tokenizer.batch_encode_packets(raw_subset).to(self.device) 93 | if self.mask_first_token: 94 | encoded_flows['attention_mask'][:, 0] = 0 95 | with torch.no_grad(): 96 | output = self.feature_extractor(**encoded_flows) 97 | output = output[0].to('cpu') # last hidden state (batch_size, sequence_length, hidden_size) 98 | # average over temporal dimension 99 | output = output.mean(dim=1).numpy() 100 | merged_tensor[start_idx:end_idx, :] = output 101 | 102 | save_dataset(pd.DataFrame(merged_tensor), tmp_path) 103 | return merged_tensor 104 | 105 | def fit_transform_encode(self, data): 106 | X_feat = self._get_transformer_features(data) 107 | y = self.fit_target_encoder(data) 108 | return X_feat, y 109 | 110 | def transform_encode(self, data): 111 | X_feat = self._get_transformer_features(data) 112 | y = self.encode_targets(data) 113 | return X_feat, y 114 | 115 | 116 | class Featurizer(BaseFeaturizer): 117 | """ 118 | Featurizer processes features from a pandas object by merging results from scalers, one-hot encoders 119 | and encodes target labels 120 | """ 121 | 122 | def __init__(self, 123 | packet_num, 124 | cont_features=None, 125 | categorical_features=None, 126 | consider_tcp_flags=True, 127 | consider_j3a=True, 128 | consider_raw_features=True, 129 | consider_iat_features=False, 130 | target_column=TARGET_CLASS_COLUMN): 131 | super().__init__(packet_num, consider_iat_features, target_column) 132 | 133 | self.column_converter = None 134 | if not consider_raw_features: 135 | self.raw_features = [] 136 | self.consider_iat_features = consider_iat_features 137 | self.consider_tcp_flags = consider_tcp_flags 138 | self.consider_j3a = consider_j3a 139 | 140 | self.categorical_features = ['ip_proto'] if categorical_features is None else categorical_features 141 | self.cont_features = self._get_cont_features() if cont_features is None else cont_features 142 | self.try_extract_derivative_features = cont_features is None 143 | 144 | if self.consider_j3a: 145 | self.categorical_features.extend(['ndpi_j3ac', 'ndpi_j3as']) 146 | 147 | if self.consider_tcp_flags: 148 | self.categorical_features.extend([f'{FEATURE_PREFIX.client}found_tcp_flags', 149 | f'{FEATURE_PREFIX.server}found_tcp_flags']) 150 | 151 | def _get_cont_features(self): 152 | # here we expect features to be consistent with flow_parser's 153 | base_features = ['bulk', 'packet'] 154 | if self.consider_iat_features: 155 | base_features.append('iat') 156 | 157 | 
cont_features = [] 158 | for prefix in [FEATURE_PREFIX.client, FEATURE_PREFIX.server]: 159 | for derivative in list(FEATURE_FUNCTIONS.keys()): 160 | for base in base_features: 161 | cont_features.append(prefix + base + derivative) 162 | return cont_features 163 | 164 | def _filter_non_existing_features(self, data: pd.DataFrame): 165 | data_features = set(data.columns) 166 | 167 | if set(self.raw_features) - data_features: 168 | found_features = list(set(self.raw_features) & data_features) 169 | logger.warning(f'skipping the following raw features: {set(self.raw_features) - data_features}') 170 | self.raw_features = found_features 171 | 172 | if set(self.cont_features) - data_features: 173 | found_features = list(set(self.cont_features) & data_features) 174 | logger.warning(f'skipping the following continuous features: {set(self.cont_features) - data_features}') 175 | self.cont_features = found_features 176 | 177 | if set(self.categorical_features) - data_features: 178 | found_features = list(set(self.categorical_features) & data_features) 179 | logger.warning(f'skipping the following categorical features: ' 180 | f'{set(self.categorical_features) - data_features}') 181 | self.categorical_features = found_features 182 | 183 | def _parse_derivatives_if_needed(self, data): 184 | if self.try_extract_derivative_features: 185 | data = self.calc_packets_stats_from_raw(data) 186 | return data 187 | 188 | def _fit_transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 189 | """ init transformers upon actual fitting to check for non-existing columns """ 190 | data = self._parse_derivatives_if_needed(data) 191 | 192 | self._filter_non_existing_features(data) 193 | feature_set = [] 194 | 195 | if self.cont_features: 196 | feature_set.append(("scaler", StandardScaler(), 197 | self.cont_features)) 198 | 199 | if self.categorical_features: 200 | feature_set.append(("one_hot", OneHotEncoder(handle_unknown='ignore', sparse=False), 201 | self.categorical_features)), 202 | 203 | if self.raw_features: 204 | # TODO replace with PacketScaler 205 | feature_set.append(('raw_features', StandardScaler(), self.raw_features)) 206 | 207 | self.column_converter = ColumnTransformer(feature_set) 208 | 209 | X_train = self.column_converter.fit_transform(data) 210 | y_train = self.fit_target_encoder(data) 211 | logger.info(f'{X_train.shape[0]} train samples with {self.n_classes} classes') 212 | return X_train, y_train 213 | 214 | def fit_transform(self, data: pd.DataFrame) -> np.ndarray: 215 | return self._fit_transform_encode(data)[0] 216 | 217 | def fit_transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 218 | return self._fit_transform_encode(data) 219 | 220 | def transform(self, data: pd.DataFrame) -> np.ndarray: 221 | data = self._parse_derivatives_if_needed(data) 222 | X_test = self.column_converter.transform(data) 223 | return X_test 224 | 225 | def transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 226 | data = self._parse_derivatives_if_needed(data) 227 | X_test = self.column_converter.transform(data) 228 | y_test = self.encode_targets(data) 229 | return X_test, y_test 230 | 231 | @property 232 | def n_classes(self): 233 | return len(self.target_encoder.classes_) 234 | 235 | def calc_packets_stats_from_raw(self, data: pd.DataFrame): 236 | def calc_flow_packet_stats(flow: np.ndarray): 237 | subflow = flow[:2 * DEFAULT_PACKET_LIMIT_PER_FLOW] 238 | packets = flows_to_packets(subflow) 239 | from_idx = packets[:, 0] > 0 240 | to_idx = packets[:, 
0] < 0 241 | 242 | stats = {} 243 | for direction, packet_idx in zip( 244 | (FEATURE_PREFIX.server, FEATURE_PREFIX.client), 245 | (from_idx, to_idx) 246 | ): 247 | try: 248 | ps_derivatives = calc_parameter_stats(np.abs(packets[packet_idx, 0]), direction, 'packet') 249 | stats.update(ps_derivatives) 250 | except ValueError: 251 | continue 252 | 253 | if self.consider_iat_features: 254 | try: 255 | iat_derivatives = calc_parameter_stats(packets[packet_idx, 1], direction, 'iat') 256 | stats.update(iat_derivatives) 257 | except ValueError: 258 | continue 259 | 260 | return stats 261 | 262 | if any(FEATURE_PREFIX.server + feature in data.columns for feature in CONTINUOUS_NAMES): 263 | logger.warning('packet stats has been found in dataframe, skipping calculation') 264 | return data 265 | 266 | tmp_path = CACHE_DIR / (get_df_hash(data) + '_iat_' + str(self.consider_iat_features)) 267 | if tmp_path.is_file(): 268 | logger.info('found cached dataset version, loading...') 269 | return read_dataset(tmp_path, True) 270 | 271 | raw = data.filter(regex='raw_') 272 | packet_stats = raw.parallel_apply(calc_flow_packet_stats, axis=1, raw=True, result_type='expand').tolist() 273 | packet_stats = pd.DataFrame(packet_stats).fillna(0) 274 | logger.info('calculated the derivatives from raw features') 275 | data = data.join(packet_stats) 276 | save_dataset(data, save_to=tmp_path) 277 | return data 278 | --------------------------------------------------------------------------------
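A minimal usage sketch for the Featurizer defined above. This is illustrative only: the CSV file names, the packet_num value and the LogisticRegression classifier are placeholder assumptions, and the input is assumed to be a parsed flow dataframe that contains the raw_* packet columns and the target class column expected by settings.TARGET_CLASS_COLUMN.

import pandas as pd
from sklearn.linear_model import LogisticRegression

from sklearn_classifiers.featurizer import Featurizer

# hypothetical, already parsed flow datasets with raw_* columns and a target column
train_df = pd.read_csv('train_flows.csv')
test_df = pd.read_csv('test_flows.csv')

# with no explicit continuous features given, derivative packet statistics are
# computed from the raw_* columns (and cached) before scaling / one-hot encoding
featurizer = Featurizer(packet_num=20)
X_train, y_train = featurizer.fit_transform_encode(train_df)
X_test, y_test = featurizer.transform_encode(test_df)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))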