├── gpt_model ├── generator │ ├── baseline │ │ ├── __init__.py │ │ └── markov.py │ ├── model_config.json │ ├── run_generator_finetuning.sh │ ├── run_generator_training.sh │ ├── trainer.py │ ├── run_generating.py │ ├── dataset.py │ └── train_generator.py ├── data_preparation │ ├── form_aux_dataset.sh │ ├── README.md │ ├── preprocess_target_pcaps.py │ ├── preprocess_pretraining_pcaps.py │ └── format_parsed_as_classification_dataset.py ├── classifier │ ├── run_evaluation_scenarios.sh │ ├── model.py │ ├── dataset.py │ └── train_classifier.py ├── README.md ├── tokenizer.py └── quantizer.py ├── flow_parsing ├── static │ ├── example.pcap │ └── ip_proto_map.csv ├── __init__.py ├── aux_raw_features_plugin.py ├── utils.py ├── features.py └── pcap_parser.py ├── sklearn_classifiers ├── utils.py ├── registered_classes.py ├── config.yaml.example ├── clf_utils.py ├── run_training.py ├── knn_cosine.py └── featurizer.py ├── tests ├── test_classifiers.py ├── test_gen_metrics.py ├── test_evaluator.py ├── static │ ├── classifiers_config.yaml │ ├── quantizer_checkpoint │ │ └── ids_to_tokens.json │ └── quantized_pkts.json ├── test_fsnet.py ├── test_markov.py ├── test_pcap_parser.py ├── conftest.py ├── test_distance.py └── test_tokenizer.py ├── .gitignore ├── requirements.yaml ├── fs_net ├── README.md ├── model.py ├── dataset.py └── train_fsnet.py ├── nn_classifiers ├── dataset.py └── models.py ├── .github └── workflows │ └── python-package-conda.yml ├── settings.py ├── evaluation_utils ├── classification.py └── modeling.py ├── README.md └── LICENSE /gpt_model/generator/baseline/__init__.py: -------------------------------------------------------------------------------- 1 | from .markov import MarkovGenerator 2 | 3 | 4 | __all__ = [ 5 | 'MarkovGenerator' 6 | ] 7 | -------------------------------------------------------------------------------- /flow_parsing/static/example.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RadionBik/ML-based-network-traffic-classifier/HEAD/flow_parsing/static/example.pcap -------------------------------------------------------------------------------- /gpt_model/data_preparation/form_aux_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ls -1d ~/Applications/traffic_dumps/separated_iot_devices/* | xargs -n1 -t python ../flow_parser.py --raw -p 4 | python ../flow_parser.py -p ~/Applications/traffic_dumps/non_iot.pcap --raw -------------------------------------------------------------------------------- /sklearn_classifiers/utils.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | 3 | 4 | def iterate_batch_indexes(array, batch_size): 5 | iter_num = len(array) // batch_size 6 | for iteration in tqdm(range(iter_num + 1)): 7 | start_idx = iteration * batch_size 8 | end_idx = (iteration + 1) * batch_size 9 | yield start_idx, end_idx 10 | -------------------------------------------------------------------------------- /flow_parsing/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import read_dataset, check_filename_in_patterns, save_dataset 2 | from .pcap_parser import parse_pcap_to_csv, parse_pcap_to_dataframe, init_streamer 3 | 4 | 5 | __all__ = [ 6 | read_dataset, 7 | save_dataset, 8 | check_filename_in_patterns, 9 | parse_pcap_to_dataframe, 10 | parse_pcap_to_csv, 11 | init_streamer, 12 | ] 13 | 
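For reference, a minimal usage sketch of the helpers exported above. The keyword arguments mirror the calls in `tests/test_pcap_parser.py`, and the bundled capture path comes from `tests/conftest.py`; treat it as an illustrative sketch rather than the canonical CLI entry point.

```python
# Sketch: parse the bundled example capture into flow dataframes and persist one of them.
# Keyword arguments follow tests/test_pcap_parser.py; the save location is the default one.
from flow_parsing import parse_pcap_to_dataframe, save_dataset
import settings

pcap_path = (settings.BASE_DIR / 'flow_parsing/static/example.pcap').as_posix()

# offline parsing with derivative per-flow statistics (default feature set)
flows = parse_pcap_to_dataframe(pcap_path, online_mode=False)

# raw per-packet features only, limited to the first 20 packets of each flow
raw_flows = parse_pcap_to_dataframe(pcap_path,
                                    derivative_features=False,
                                    raw_features=20,
                                    online_mode=False)

save_dataset(flows)  # writes datasets/dataset_<hash>.csv by default
```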
-------------------------------------------------------------------------------- /gpt_model/generator/model_config.json: -------------------------------------------------------------------------------- 1 | {"vocab_size": 9906, 2 | "n_positions": 128, 3 | "n_ctx": 128, 4 | "n_embd": 512, 5 | "n_layer": 6, 6 | "n_head": 8, 7 | "activation_function": "gelu_new", 8 | "resid_pdrop": 0.1, 9 | "embd_pdrop": 0.1, 10 | "attn_pdrop": 0.1, 11 | "layer_norm_epsilon": 1e-5, 12 | "initializer_range": 0.02, 13 | "bos_token_id": -10, 14 | "eos_token_id": -10, 15 | "pad_token_id": -10 16 | } -------------------------------------------------------------------------------- /tests/test_classifiers.py: -------------------------------------------------------------------------------- 1 | from sklearn_classifiers import clf_utils 2 | import settings 3 | 4 | 5 | def test_config_parsing(classif_config): 6 | cfg = clf_utils.read_classifier_settings(settings.TEST_STATIC_DIR / 'classifiers_config.yaml') 7 | assert cfg == classif_config 8 | 9 | 10 | def test_init_clfs(classif_config): 11 | clfs = clf_utils.initialize_classifiers(classif_config) 12 | assert clfs 13 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/README.md: -------------------------------------------------------------------------------- 1 | This is rather a collection of various functions that were used 2 | during dataset creation. The code was highly experimental and the exact steps 3 | has not been documented and saved, although it is not that 4 | difficult to repeat. 5 | 6 | The only module that can run as expected is `format_parsed_as_classification_dataset.py` 7 | 8 | After all, it should not be a problem, since the resulting datasets are 9 | accessible via the minio client as described in the upper-level [README.md](../README.md) -------------------------------------------------------------------------------- /tests/test_gen_metrics.py: -------------------------------------------------------------------------------- 1 | from evaluation_utils.modeling import get_ks_stat, get_wasserstein_distance_pdf 2 | import numpy as np 3 | 4 | 5 | def test_scale_invariance(): 6 | 7 | def check(f): 8 | m = f(orig, gen) 9 | m_l = f(orig * 100, gen * 100) 10 | m_a = f(orig + 10000, gen + 10000) 11 | assert np.isclose(m, m_l) 12 | assert np.isclose(m, m_a, atol=1e-2) 13 | 14 | orig = np.random.random(1000) 15 | gen = np.random.normal(size=1000) - .1 16 | check(get_ks_stat) 17 | check(get_wasserstein_distance_pdf) 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *$py.class 4 | .ipynb_checkpoints/ 5 | *.ipynb 6 | 7 | #python workspaces 8 | *.res 9 | .~* 10 | #figures/ 11 | figures/*.* 12 | reports/ 13 | trained_classifiers/*.* 14 | #old code 15 | py/ 16 | .idea 17 | .directory 18 | 19 | #results 20 | *.txt 21 | *.csv 22 | *.docx 23 | *.doc 24 | *.odt 25 | *.pdf 26 | 27 | # config 28 | *.yaml 29 | 30 | venv 31 | config.ini 32 | csv_files 33 | bin 34 | *.pcap 35 | !flow_parsing/static/example.pcap 36 | 37 | # gpt_model artifacts 38 | trial_txt_gpt2 39 | trained_quantizers -------------------------------------------------------------------------------- /tests/test_evaluator.py: -------------------------------------------------------------------------------- 1 | from evaluation_utils.modeling import convert_ipt_to_iat, 
evaluate_generated_traffic, flows_to_packets 2 | 3 | 4 | def test_splitting_by_directions(raw_dataset): 5 | raw_dataset = raw_dataset.values 6 | ipt_ds = convert_ipt_to_iat(raw_dataset) 7 | assert ipt_ds.shape == raw_dataset.shape 8 | ipt_packets = flows_to_packets(ipt_ds) 9 | source_packets = flows_to_packets(raw_dataset) 10 | assert (ipt_packets[:, 0] == source_packets[:, 0]).all() 11 | 12 | 13 | def test_smoke_evaluate_generated_traffic(raw_dataset): 14 | results = evaluate_generated_traffic(raw_dataset.values, raw_dataset.values) 15 | assert all(value == 0 for key, value in results.items() if key.startswith('KL')) 16 | -------------------------------------------------------------------------------- /requirements.yaml: -------------------------------------------------------------------------------- 1 | name: classifier 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.7 7 | - pandas=1.0.3 8 | - numpy=1.18.4 9 | - matplotlib=3.2.1 10 | - seaborn=0.11.0 11 | - scikit-learn=0.23.1 12 | - neptune-client 13 | - psutil 14 | - pytorch::pytorch=1.6 # -- built from source due to CUDA 11.0 and apex 15 | - dpkt=1.9.2 16 | - hmmlearn=0.2.3 17 | - pytest=5.4.2 18 | - pytest-mock=3.1.1 19 | - pyyaml=5.3.1 20 | - gitpython=3.1.3 21 | - jupyterlab>=2.1 22 | - jupytext 23 | - tqdm 24 | - ipywidgets 25 | - nodejs 26 | - pip: 27 | - pytorch-lightning==0.8.5 28 | - nfstream==6.1.3 29 | - sh==1.13.1 30 | - pandarallel==1.4.8 31 | - transformers==3.0.2 32 | - ngt==1.12.2 33 | -------------------------------------------------------------------------------- /gpt_model/generator/run_generator_finetuning.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=../.. python train_generator.py \ 4 | --model_name_or_path=/media/raid_store/pretrained_traffic/gpt2_model_2epochs_classes \ 5 | --finetune_on_class=Telegram \ 6 | --output_dir=/media/raid_store/pretrained_traffic/gpt2_model_telegram \ 7 | --do_train \ 8 | --train_data_file=../../datasets/train_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv \ 9 | --do_eval \ 10 | --eval_data_file=../../datasets/test_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv \ 11 | --overwrite_output_dir \ 12 | --per_device_train_batch_size=128 \ 13 | --per_device_eval_batch_size=224 \ 14 | --fp16 \ 15 | --fp16_opt_level=O2 \ 16 | --logging_steps=1 \ 17 | --save_steps=1000 \ 18 | --eval_steps=1000 \ 19 | --gradient_accumulation_steps=30 \ 20 | --num_train_epochs=10 \ 21 | --learning_rate=0.00005 \ 22 | --save_total_limit=10 \ 23 | --logging_dir=fine_runs/7 -------------------------------------------------------------------------------- /tests/static/classifiers_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | SVM: 3 | type: OneVsOneClassifier 4 | params: 5 | estimator: 6 | type: LinearSVC 7 | params: 8 | tol: 1.0e-5 9 | n_jobs: -1 10 | param_search_space: 11 | estimator__C: [0.1, 1, 10] 12 | estimator__loss: ['squared_hinge'] 13 | estimator__dual: [True, False] 14 | DecTree: 15 | type: DecisionTreeClassifier 16 | param_search_space: 17 | max_depth: 18 | from: 6 19 | till: 20 20 | step: 3 21 | max_features: 22 | from: 10 23 | till: 40 24 | step: 10 25 | criterion: 26 | - entropy 27 | GradBoost: 28 | type: GradientBoostingClassifier 29 | param_search_space: 30 | n_estimators: 31 | - 50 32 | max_depth: [2,3,4,5] 33 | learning_rate: 34 | - 0.01 35 | - 0.05 36 | - 0.1 
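The `from`/`till`/`step` blocks in `param_search_space` above are shorthand for inclusive integer ranges; the expanded values can be seen in the `classif_config` fixture in `tests/conftest.py`. A small sketch of that expansion is shown below — the helper name `expand_search_space` is hypothetical, the real parsing lives in `sklearn_classifiers/clf_utils.read_classifier_settings`:

```python
# Hypothetical helper illustrating how from/till/step entries expand into inclusive
# integer ranges; the expected values match the classif_config fixture in tests/conftest.py.
def expand_search_space(space: dict) -> dict:
    expanded = {}
    for param, values in space.items():
        if isinstance(values, dict) and {'from', 'till', 'step'} <= set(values):
            expanded[param] = list(range(values['from'], values['till'] + 1, values['step']))
        else:
            expanded[param] = values
    return expanded


dec_tree_space = {
    'max_depth': {'from': 6, 'till': 20, 'step': 3},
    'max_features': {'from': 10, 'till': 40, 'step': 10},
    'criterion': ['entropy'],
}
assert expand_search_space(dec_tree_space) == {
    'max_depth': [6, 9, 12, 15, 18],
    'max_features': [10, 20, 30, 40],
    'criterion': ['entropy'],
}
```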
-------------------------------------------------------------------------------- /gpt_model/generator/run_generator_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=../.. python train_generator.py \ 4 | --config_name=model_config.json \ 5 | --quantizer_path=trained_quantizers/quantizer_2^14_train_shuffled_0 \ 6 | --output_dir=/media/raid_store/pretrained_traffic/gpt2_model_4epochs_classes_external \ 7 | --do_train \ 8 | --train_data_file=/media/raid_store/pretrained_traffic/train_csv \ 9 | --do_eval \ 10 | --eval_data_file=/media/raid_store/pretrained_traffic/val_csv \ 11 | --overwrite_output_dir \ 12 | --per_device_train_batch_size=128 \ 13 | --per_device_eval_batch_size=224 \ 14 | --fp16 \ 15 | --fp16_opt_level=O2 \ 16 | --logging_steps=1 \ 17 | --save_steps=150 \ 18 | --eval_steps=1000 \ 19 | --gradient_accumulation_steps=30 \ 20 | --num_train_epochs=4 \ 21 | --warmup_steps=200 \ 22 | --learning_rate=0.001 \ 23 | --save_total_limit=10 \ 24 | --file_patterns_to_exclude=mawi_iot_home \ 25 | --train_with_targets 26 | #--model_name_or_path=/media/raid_store/pretrained_traffic/gpt2_model_4epochs_classes_home_iot \ -------------------------------------------------------------------------------- /sklearn_classifiers/registered_classes.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.multiclass import OneVsOneClassifier 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.neural_network import MLPClassifier 6 | from sklearn.svm import LinearSVC 7 | from sklearn.tree import DecisionTreeClassifier 8 | 9 | from sklearn_classifiers.knn_cosine import ( 10 | KNeighborsCosineClassifier, 11 | KNeighborsPuffinnClassifier, 12 | KNeighborsNGTClassifier 13 | ) 14 | 15 | REGISTERED_CLASSES = { 16 | cls.__name__: cls for cls in [ 17 | MLPClassifier, 18 | LinearSVC, 19 | DecisionTreeClassifier, 20 | RandomForestClassifier, 21 | GradientBoostingClassifier, 22 | LogisticRegression, 23 | OneVsOneClassifier, 24 | KNeighborsClassifier, 25 | KNeighborsCosineClassifier, 26 | KNeighborsPuffinnClassifier, 27 | KNeighborsNGTClassifier 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /fs_net/README.md: -------------------------------------------------------------------------------- 1 | ## FS-NET 2 | 3 | Reimplementation of FS-NET model without reconstruction loss, 4 | which harmed the performance according to the reported results in the original 5 | paper: 6 | 7 | ``` 8 | @inproceedings{LiuHXCL19, 9 | author = {Chang Liu and 10 | Longtao He and 11 | Gang Xiong and 12 | Zigang Cao and 13 | Zhen Li}, 14 | title = {FS-Net: {A} Flow Sequence Network For Encrypted Traffic Classification}, 15 | booktitle = {{IEEE} Conference on Computer Communications (INFOCOM), 2019}, 16 | pages = {1171--1179}, 17 | year = {2019} 18 | } 19 | ``` 20 | From my point of view, there is some inconsistency between the paper's 21 | description and implementation found in https://github.com/WSPTTH/FS-Net, 22 | particularly regarding the presence of Selu activation for the final 23 | output in eq. 17. 
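To make the point of contention concrete, here is a minimal sketch of the two possible classifier heads, with and without a SELU on the final dense output. This repo's `FSNETClassifier` (see `fs_net/model.py` further below) returns plain logits; the SELU variant is shown only to illustrate the alternative reading of eq. 17.

```python
import torch

# Variant used in this repo (fs_net/model.py): raw linear logits.
def head_without_selu(classifier: torch.nn.Linear, z_c: torch.Tensor) -> torch.Tensor:
    return classifier(z_c)

# Alternative reading of eq. 17: a SELU applied on top of the final dense layer.
def head_with_selu(classifier: torch.nn.Linear, z_c: torch.Tensor) -> torch.Tensor:
    return torch.nn.functional.selu(classifier(z_c))
```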
24 | 25 | As a bonus, the training script has 2 options for the model's input: 26 | (i) either packet size sequences (as in the paper), or (ii) K-Means centroids 27 | for (PS, IPT) features (similarly to the transformer model in this repo). -------------------------------------------------------------------------------- /nn_classifiers/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Tuple 3 | 4 | import torch 5 | from sklearn.model_selection import train_test_split 6 | from torch.utils.data import TensorDataset 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def get_train_val_test_datasets(X_train, y_train, X_test, y_test, device='cpu', val_part=0.9) \ 12 | -> Tuple[TensorDataset, TensorDataset, TensorDataset]: 13 | 14 | """ 15 | converts sklearn-like dataset into torch-compatible one 16 | """ 17 | def _tensor_dataset(X, y): 18 | return TensorDataset(torch.as_tensor(X, device=device, dtype=torch.float), 19 | torch.as_tensor(y, device=device, dtype=torch.long)) 20 | 21 | X_train, X_val, y_train, y_val = train_test_split( 22 | X_train, 23 | y_train, 24 | train_size=val_part, 25 | stratify=y_train, 26 | random_state=1 27 | ) 28 | train_dataset = _tensor_dataset(X_train, y_train) 29 | val_dataset = _tensor_dataset(X_val, y_val) 30 | test_dataset = _tensor_dataset(X_test, y_test) 31 | 32 | return train_dataset, val_dataset, test_dataset 33 | 34 | 35 | -------------------------------------------------------------------------------- /flow_parsing/aux_raw_features_plugin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import dpkt 4 | import nfstream 5 | import numpy as np 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class AuxRawFeatures(nfstream.NFPlugin): 11 | @staticmethod 12 | def _fill_flow_stats(flow, packet, counter=0): 13 | flow.udps.bulk[counter] = packet.payload_size 14 | if packet.protocol == 6 and packet.ip_version == 4: 15 | decoded_packet = dpkt.ip.IP(packet.ip_packet) 16 | try: 17 | flow.udps.tcp_window[counter] = decoded_packet.data.win 18 | flow.udps.tcp_flag[counter] = decoded_packet.data.flags 19 | except AttributeError: 20 | logger.warning(f'unexpected packet format: {decoded_packet}') 21 | 22 | def on_init(self, packet, flow): 23 | flow.udps.bulk = np.ones(self.packet_limit) * -1 24 | flow.udps.tcp_window = np.zeros(self.packet_limit) 25 | flow.udps.tcp_flag = np.zeros(self.packet_limit) 26 | 27 | self._fill_flow_stats(flow, packet) 28 | 29 | def on_update(self, packet, flow): 30 | if flow.bidirectional_packets <= self.packet_limit: 31 | self._fill_flow_stats(flow, packet, flow.bidirectional_packets - 1) 32 | -------------------------------------------------------------------------------- /.github/workflows/python-package-conda.yml: -------------------------------------------------------------------------------- 1 | name: Python Package using Conda 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build-linux: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | max-parallel: 5 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 3.7 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.7 17 | - name: Install dependencies 18 | run: | 19 | # $CONDA is an environment variable pointing to the root of the miniconda directory 20 | $CONDA/bin/conda env update --file requirements.yaml -n base 21 | git clone https://github.com/puffinn/puffinn /opt/puffinn && \ 22 | cd /opt/puffinn && \ 23 | 
$CONDA/bin/python3.7 setup.py build && \ 24 | $CONDA/bin/python3.7 setup.py install && \ 25 | cd - 26 | 27 | - name: Lint with flake8 28 | run: | 29 | $CONDA/bin/conda install flake8 30 | # stop the build if there are Python syntax errors or undefined names 31 | $CONDA/bin/flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 32 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 33 | $CONDA/bin/flake8 . --count --exit-zero --max-complexity=15 --max-line-length=127 --statistics 34 | - name: Test with pytest 35 | run: | 36 | export PYTHONPATH=/home/runner/work/ML-based-network-traffic-classifier/ML-based-network-traffic-classifier 37 | $CONDA/bin/pytest 38 | -------------------------------------------------------------------------------- /gpt_model/classifier/run_evaluation_scenarios.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PYTHONPATH=../.. 4 | export PRETRAINED_MODEL_PATH=/media/raid_store/pretrained_traffic/gpt2_model_4epochs_classes_external 5 | 6 | export TRAIN_DATASET=../../datasets/train_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv 7 | export TEST_DATASET=../../datasets/test_4c93174d7808b1487aa3288084365d76_no_mawi_unswnb_iscxvpn.csv 8 | 9 | 10 | python train_classifier.py \ 11 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 12 | --train_dataset=$TRAIN_DATASET \ 13 | --test_dataset=$TEST_DATASET \ 14 | --freeze_pretrained_model \ 15 | --mask_first_token 16 | 17 | python train_classifier.py \ 18 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 19 | --train_dataset=$TRAIN_DATASET \ 20 | --test_dataset=$TEST_DATASET \ 21 | --mask_first_token 22 | 23 | python train_classifier.py \ 24 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 25 | --train_dataset=$TRAIN_DATASET \ 26 | --test_dataset=$TEST_DATASET \ 27 | --freeze_pretrained_model 28 | 29 | python train_classifier.py \ 30 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 31 | --train_dataset=$TRAIN_DATASET \ 32 | --test_dataset=$TEST_DATASET \ 33 | 34 | python train.py \ 35 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 36 | --train_dataset=$TRAIN_DATASET \ 37 | --test_dataset=$TEST_DATASET \ 38 | --freeze_pretrained_model \ 39 | --reinitialize 40 | 41 | python train_classifier.py \ 42 | --pretrained_path=$PRETRAINED_MODEL_PATH \ 43 | --train_dataset=$TRAIN_DATASET \ 44 | --test_dataset=$TEST_DATASET \ 45 | --reinitialize 46 | -------------------------------------------------------------------------------- /tests/test_fsnet.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import torch 3 | 4 | from fs_net.dataset import SimpleClassificationQuantizedDataset, ClassificationPacketSizeDataset 5 | from fs_net.model import FSNETClassifier 6 | 7 | 8 | def test_packet_ds(raw_dataset_file): 9 | ds = ClassificationPacketSizeDataset(raw_dataset_file, max_size_range=100, target_column='ndpi_app') 10 | loader = DataLoader(ds, batch_size=4, drop_last=True) 11 | for flow, target in loader: 12 | assert flow.min() == torch.tensor(1) and \ 13 | flow.max() == torch.tensor(99) and \ 14 | flow.shape == torch.Size([4, 20]) 15 | 16 | 17 | def test_forward_packet_ds(raw_dataset_file): 18 | ds = ClassificationPacketSizeDataset(raw_dataset_file, max_size_range=100, target_column='ndpi_app') 19 | loader = DataLoader(ds, batch_size=4, drop_last=True) 20 | n_classes = len(ds.target_encoder.classes_) 21 | model = FSNETClassifier({}, ds.target_encoder.classes_, 100) 22 | 
for flow, target in loader: 23 | output = model(flow) 24 | assert output.shape == torch.Size([4, n_classes]) 25 | 26 | 27 | def test_forward(tokenizer, raw_dataset_file): 28 | """ simple smoke-test """ 29 | ds = SimpleClassificationQuantizedDataset(tokenizer, dataset_path=raw_dataset_file, target_column='ndpi_app') 30 | loader = DataLoader(ds, batch_size=4, drop_last=True) 31 | n_classes = len(ds.target_encoder.classes_) 32 | model = FSNETClassifier({}, ds.target_encoder.classes_, len(tokenizer)) 33 | for flow in loader: 34 | output = model(flow[0]) 35 | assert output.shape == torch.Size([4, n_classes]) 36 | -------------------------------------------------------------------------------- /flow_parsing/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import pathlib 3 | 4 | import logging 5 | import pandas as pd 6 | 7 | from settings import BASE_DIR 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def read_dataset(filename, fill_na=False) -> pd.DataFrame: 13 | """ a simple wrapper for pandas """ 14 | dataset = pd.read_csv(filename, na_filter=True) 15 | if fill_na: 16 | dataset = dataset.fillna(0) 17 | logger.info(f'read {len(dataset)} flows from {filename}') 18 | return dataset 19 | 20 | 21 | def check_filename_in_patterns(file, patterns): 22 | if isinstance(file, pathlib.Path): 23 | file = file.name 24 | 25 | if patterns and any(pattern in file for pattern in patterns): 26 | logger.info(f'file {file} matches a pattern') 27 | return True 28 | return False 29 | 30 | 31 | def get_df_hash(df): 32 | return hashlib.md5(pd.util.hash_pandas_object(df, index=True).values).hexdigest() 33 | 34 | 35 | def get_hash(df): 36 | 37 | def _get_current_commit_hash(): 38 | """ get commit hash at HEAD """ 39 | from git import Repo 40 | repo = Repo(BASE_DIR) 41 | return repo.head.commit.hexsha 42 | 43 | try: 44 | df_hash = _get_current_commit_hash() 45 | except Exception: 46 | df_hash = get_df_hash(df) 47 | return df_hash 48 | 49 | 50 | def save_dataset(dataset, save_to=None): 51 | """ simple data tracking/versioning via hash suffixes """ 52 | 53 | if save_to is None: 54 | save_to = BASE_DIR / f'datasets/dataset_{get_hash(dataset)}.csv' 55 | dataset.to_csv(save_to, index=False) 56 | logger.info(f'saved dataset to {save_to}') 57 | return save_to 58 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | import os 4 | from dataclasses import dataclass 5 | 6 | import pandas as pd 7 | 8 | logging.basicConfig(level=logging.INFO, 9 | format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s') 10 | logger = logging.getLogger() 11 | 12 | 13 | def _read_protocol_mapping() -> dict: 14 | map_file = BASE_DIR / 'flow_parsing/static/ip_proto_map.csv' 15 | pairs = pd.read_csv(map_file, header=None) 16 | return dict(pairs.values.tolist()) 17 | 18 | 19 | BASE_DIR = pathlib.Path(__file__).resolve().parent 20 | TEST_STATIC_DIR = BASE_DIR / 'tests' / 'static' 21 | DATASET_DIR = BASE_DIR / 'datasets' 22 | 23 | PCAP_OUTPUT_DIR = BASE_DIR / 'csv_files' 24 | REPORT_DIR = BASE_DIR / 'reports' 25 | CACHE_DIR = pathlib.Path('/tmp') 26 | 27 | IP_PROTO_MAPPING = _read_protocol_mapping() 28 | RANDOM_SEED = 1 29 | 30 | DEFAULT_PACKET_LIMIT_PER_FLOW = int(os.getenv('DEFAULT_PACKET_LIMIT_PER_FLOW', 20)) 31 | LOWER_BOUND_CLASS_OCCURRENCE = int(os.getenv('LOWER_BOUND_CLASS_OCCURRENCE', 50)) 32 
| 33 | # customize, if needed 34 | TARGET_CLASS_COLUMN = 'target_class' 35 | 36 | # nfstream params 37 | # the idle timeout follows many papers on traffic identification (JOY has 10 sec) 38 | IDLE_TIMEOUT = 60 39 | # active timeouts are set similarly, (Cisco's JOY tool has 30 sec) 40 | ACTIVE_TIMEOUT_ONLINE = 60 41 | ACTIVE_TIMEOUT_OFFLINE = 10**6 42 | 43 | NEPTUNE_PROJECT = 'radion/traffic-classifier' 44 | 45 | 46 | @dataclass 47 | class FilePatterns: 48 | mawi: tuple = ('202004',) 49 | mawi_unswnb_iscxvpn: tuple = ('raw_csv', '202004') 50 | iot_home: tuple = ('electronics', 'camera', '2020', 'environment', 'healthcare', 'home', 'hub', 'light', 'trigger') 51 | mawi_iot_home: tuple = ('electronics', 'camera', '2020', 'environment', 'healthcare', 'home', 'hub', 'light', 52 | 'trigger', '202004') 53 | -------------------------------------------------------------------------------- /gpt_model/classifier/model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch 4 | from torch.optim.lr_scheduler import ReduceLROnPlateau 5 | from transformers import GPT2Model, GPT2Config 6 | from transformers.optimization import AdamW 7 | 8 | from nn_classifiers.models import BaseClassifier 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class GPT2Classifier(BaseClassifier): 14 | def __init__( 15 | self, 16 | config, 17 | class_labels, 18 | pretrained_model_path, 19 | dropout=0.1, 20 | freeze_pretrained_part=True, 21 | reinitialize=False, 22 | n_layers=6, 23 | ): 24 | super().__init__(config, class_labels) 25 | 26 | if reinitialize: 27 | logger.info('resetting model weights') 28 | config = GPT2Config.from_json_file(pretrained_model_path + '/config.json') 29 | config = config.to_dict() 30 | config['n_layer'] = n_layers 31 | config = GPT2Config.from_dict(config) 32 | self.gpt2 = GPT2Model(config) 33 | else: 34 | self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path) 35 | 36 | self.dropout = torch.nn.Dropout(dropout) 37 | self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim) 38 | if freeze_pretrained_part: 39 | for param in self.gpt2.parameters(): 40 | param.requires_grad = False 41 | 42 | def forward(self, x): 43 | output = self.gpt2(**x) 44 | output = output[0] # last hidden state (batch_size, sequence_length, hidden_size) 45 | # average over temporal dimension 46 | output = output.mean(dim=1) 47 | output = self.dropout(output) 48 | return self.fc(output) 49 | 50 | def configure_optimizers(self): 51 | optimizer = AdamW(self.parameters(), lr=self.hparams.learning_rate) 52 | scheduler = ReduceLROnPlateau(optimizer, patience=self.hparams.es_patience // 2) 53 | return [optimizer], [scheduler] 54 | -------------------------------------------------------------------------------- /flow_parsing/static/ip_proto_map.csv: -------------------------------------------------------------------------------- 1 | 1,ICMP 2 | 2,IGMP 3 | 3,GGP 4 | 4,IP-in-IP 5 | 5,ST 6 | 6,TCP 7 | 7,CBT 8 | 8,EGP 9 | 9,IGP 10 | 10,BBN-RCC-MON 11 | 11,NVP-II 12 | 12,PUP 13 | 13,ARGUS 14 | 14,EMCON 15 | 15,XNET 16 | 16,CHAOS 17 | 17,UDP 18 | 18,MUX 19 | 19,DCN-MEAS 20 | 20,HMP 21 | 21,PRM 22 | 22,XNS-IDP 23 | 23,TRUNK-1 24 | 24,TRUNK-2 25 | 25,LEAF-1 26 | 26,LEAF-2 27 | 27,RDP 28 | 28,IRTP 29 | 29,ISO-TP4 30 | 30,NETBLT 31 | 31,MFE-NSP 32 | 32,MERIT-INP 33 | 33,DCCP 34 | 34,3PC 35 | 35,IDPR 36 | 36,XTP 37 | 37,DDP 38 | 38,IDPR-CMTP 39 | 39,TP++ 40 | 40,IL 41 | 41,IPv6 42 | 42,SDRP 43 | 43,IPv6-Route 44 | 44,IPv6-Frag 45 | 45,IDRP 46 | 46,RSVP 47 | 
47,GREs 48 | 48,DSR 49 | 49,BNA 50 | 50,ESP 51 | 51,AH 52 | 52,I-NLSP 53 | 53,SwIPe 54 | 54,NARP 55 | 55,MOBILE 56 | 56,TLSP 57 | 57,SKIP 58 | 58,IPv6-ICMP 59 | 59,IPv6-NoNxt 60 | 60,IPv6-Opts 61 | 61,Any host internal proto 62 | 62,CFTP 63 | 63,Any local network 64 | 64,SAT-EXPAK 65 | 65,KRYPTOLAN 66 | 66,RVD 67 | 67,IPPC 68 | 68,Any distributed file system 69 | 69,SAT-MON 70 | 70,VISA 71 | 71,IPCU 72 | 72,CPNX 73 | 73,CPHB 74 | 74,WSN 75 | 75,PVP 76 | 76,BR-SAT-MON 77 | 77,SUN-ND 78 | 78,WB-MON 79 | 79,WB-EXPAK 80 | 80,ISO-IP 81 | 81,VMTP 82 | 82,SECURE-VMTP 83 | 83,VINES 84 | 84,TTP |IPTM 85 | 85,NSFNET-IGP 86 | 86,DGP 87 | 87,TCF 88 | 88,EIGRP 89 | 89,OSPF 90 | 90,Sprite-RPC 91 | 91,LARP 92 | 92,MTP 93 | 93,AX.25 94 | 94,OS 95 | 95,MICP 96 | 96,SCC-SP 97 | 97,ETHERIP 98 | 98,ENCAP 99 | 99,Any private encryption scheme 100 | 100,GMTP 101 | 101,IFMP 102 | 102,PNNI 103 | 103,PIM 104 | 104,ARIS 105 | 105,SCPS 106 | 106,QNX 107 | 107,A/N 108 | 108,IPComp 109 | 109,SNP 110 | 110,Compaq-Peer 111 | 111,IPX-in-IP 112 | 112,VRRP 113 | 113,PGM 114 | 114,Any 0-hop protocol 115 | 115,L2TP 116 | 116,DDX 117 | 117,IATP 118 | 118,STP 119 | 119,SRP 120 | 120,UTI 121 | 121,SMP 122 | 122,SM 123 | 123,PTP 124 | 124,IS-IS over IPv4 125 | 125,FIRE 126 | 126,CRTP 127 | 127,CRUDP 128 | 128,SSCOPMCE 129 | 129,IPLT 130 | 130,SPS 131 | 131,PIPE 132 | 132,SCTP 133 | 133,FC 134 | 134,RSVP-E2E-IGNORE 135 | 135,Mobility Header 136 | 136,UDPLite 137 | 137,MPLS-in-IP 138 | 138,manet 139 | 139,HIP 140 | 140,Shim6 141 | 141,WESP 142 | 142,ROHC 143 | 143,Ethernet -------------------------------------------------------------------------------- /sklearn_classifiers/config.yaml.example: -------------------------------------------------------------------------------- 1 | # control the algorithms' use putting/removing comment mark # 2 | 3 | LogRegr: 4 | type: LogisticRegression 5 | params: 6 | solver: saga 7 | max_iter: 500 8 | n_jobs: -1 9 | multi_class: multinomial 10 | param_search_space: 11 | C: [0.1, 1, 10] 12 | tol: [0.00001,0.0001,0.001], 13 | 14 | LogRegrCost: 15 | type: LogisticRegression 16 | params: 17 | solver: saga 18 | max_iter: 500 19 | n_jobs: 4 20 | class_weight: balanced 21 | multi_class: multinomial 22 | param_search_space: 23 | C: [0.1, 1, 10] 24 | 25 | LogRegrOVR: 26 | type: LogisticRegression 27 | params: 28 | solver: saga 29 | max_iter: 500 30 | n_jobs: -1 31 | multi_class: ovr 32 | param_search_space: 33 | C: [0.1, 1, 10] 34 | 35 | SVM_OvO: 36 | type: OneVsOneClassifier 37 | norandom: true 38 | params: 39 | estimator: 40 | type: LinearSVC 41 | params: 42 | tol: 1.0e-5 43 | n_jobs: -1 44 | param_search_space: 45 | estimator__C: [0.1, 1, 10] 46 | estimator__loss: ['squared_hinge'] 47 | 48 | DecTree: 49 | type: DecisionTreeClassifier 50 | param_search_space: 51 | max_depth: 52 | from: 6 53 | till: 20 54 | step: 3 55 | max_features: 56 | from: 10 57 | till: 40 58 | step: 10 59 | criterion: 60 | - entropy 61 | RandomForest: 62 | type: RandomForestClassifier 63 | params: 64 | n_estimators: 10 65 | n_jobs: -1 66 | param_search_space: 67 | n_estimators: 68 | from: 10 69 | till: 100 70 | step: 30 71 | 72 | GradBoost: 73 | type: GradientBoostingClassifier 74 | param_search_space: 75 | n_estimators: 76 | - 50 77 | max_depth: [2,3,4,5] 78 | learning_rate: 79 | - 0.01 80 | - 0.05 81 | - 0.1 82 | 83 | MLP: 84 | type: MLPClassifier 85 | params: 86 | max_iter: 300 87 | param_search_space: 88 | hidden_layer_sizes: 89 | - [80, 80] 90 | - [120, 120] 91 | alpha: 92 | - 0.0001 93 | - 0.001 94 | - 0.01 95 | 
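Each entry's `type` string in the config above refers to a class registered in `sklearn_classifiers/registered_classes.py`. The sketch below is a hypothetical illustration of how a `type`/`params` pair can be turned into an estimator, including the nested-estimator case; the repo's actual wiring lives in `sklearn_classifiers.clf_utils.initialize_classifiers` and may differ in details.

```python
# Hypothetical sketch of instantiating a config entry via REGISTERED_CLASSES;
# the real logic is in sklearn_classifiers.clf_utils.initialize_classifiers.
from sklearn_classifiers.registered_classes import REGISTERED_CLASSES


def build_estimator(entry: dict):
    params = dict(entry.get('params', {}))
    # nested estimators (e.g. OneVsOneClassifier wrapping LinearSVC) are configs themselves
    for key, value in params.items():
        if isinstance(value, dict) and 'type' in value:
            params[key] = build_estimator(value)
    return REGISTERED_CLASSES[entry['type']](**params)


svm_entry = {
    'type': 'OneVsOneClassifier',
    'params': {'estimator': {'type': 'LinearSVC', 'params': {'tol': 1.0e-5}}, 'n_jobs': -1},
}
clf = build_estimator(svm_entry)  # OneVsOneClassifier(estimator=LinearSVC(tol=1e-05), n_jobs=-1)
```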
-------------------------------------------------------------------------------- /tests/test_markov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gpt_model.generator.baseline import markov 4 | 5 | 6 | def test_norm(): 7 | x = np.array([np.inf, 0, 0, 1]).reshape(2, -1) 8 | n_x = markov._normalize_by_rows(x) 9 | assert (n_x == [[1, 0], [0, 1]]).all() 10 | 11 | x = np.array([[10, 0, ], [4, 16]]) 12 | 13 | n_x = markov._normalize_by_rows(x) 14 | exp_x = np.array([[1., 0.], [0.2, 0.8]]) 15 | assert np.isclose(exp_x, n_x, rtol=1e-3).all() 16 | 17 | 18 | def test_calc_transition_matrix(quantized_packets): 19 | trans_matrix = markov._calc_transition_matrix( 20 | seq_matrix=quantized_packets, 21 | state_numb=np.unique(quantized_packets).size 22 | ) 23 | # 0 is the reccurent state 24 | assert np.isclose(trans_matrix[0, 0], 1, atol=1e-6) 25 | 26 | 27 | def test_priors(quantized_packets): 28 | priors = markov._calc_prior_probas(quantized_packets, 29 | np.unique(quantized_packets).size) 30 | 31 | assert np.isclose(priors[10], 0.7541, rtol=1e-3) 32 | 33 | 34 | def test_markov_generator(quantized_packets): 35 | gener = markov.MarkovGenerator() 36 | gener.fit(quantized_packets*-1) 37 | sampled = gener.sample(1000) 38 | new_gener = markov.MarkovGenerator() 39 | new_gener.fit(sampled) 40 | assert np.isclose(gener.init_priors, new_gener.init_priors, atol=0.1).all() 41 | # accumulated error < 1. for 114x114 matrix is OK 42 | tr_matrix_frob_norm = np.linalg.norm(gener.transition_matrix - new_gener.transition_matrix, ord='fro') 43 | assert tr_matrix_frob_norm < 1. 44 | 45 | 46 | def test_markov_kmeans_augmenter(raw_dataset): 47 | def _calc_hist_like_pmf(packet_vector): 48 | pmf = np.histogram(packet_vector, bins=50, range=(0, 1000), density=True)[0] 49 | return pmf 50 | 51 | raw_packets = raw_dataset.filter(regex='raw_packet').fillna(0) 52 | gener = markov.MarkovQuantizedGenerator() 53 | gener.fit(raw_packets.values) 54 | output = gener.sample(raw_packets.shape[0]) 55 | priors_distrs_norm = np.linalg.norm( 56 | _calc_hist_like_pmf(output[:, 0]) - 57 | _calc_hist_like_pmf(raw_packets.iloc[:, 0]), 58 | ord=1) 59 | assert priors_distrs_norm < 0.015 60 | 61 | total_distr_norm = np.linalg.norm( 62 | _calc_hist_like_pmf(output.flatten()) - 63 | _calc_hist_like_pmf(raw_packets.values.flatten()), 64 | ord=1) 65 | 66 | assert total_distr_norm < 0.01 67 | -------------------------------------------------------------------------------- /tests/test_pcap_parser.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd 4 | 5 | from flow_parsing import pcap_parser 6 | 7 | 8 | def test_feature_persistence(pcap_example_path): 9 | features = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, online_mode=False). \ 10 | sort_values('flow_id', axis=0). \ 11 | reset_index(drop=True) 12 | features2 = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, online_mode=False). \ 13 | sort_values('flow_id', axis=0). \ 14 | reset_index(drop=True) 15 | pd.testing.assert_frame_equal(features, features2) 16 | 17 | 18 | def _serialize_tcp_flag(x): 19 | indexer = x.index.str.endswith('tcp_flags') 20 | x.iloc[indexer] = x.iloc[indexer].apply(json.dumps) 21 | return x 22 | 23 | 24 | def test_parser_output(dataset, pcap_example_path): 25 | parsed_features = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, online_mode=False). \ 26 | sort_values('flow_id', axis=0). 
\ 27 | reset_index(drop=True) 28 | 29 | parsed_features = parsed_features.apply(_serialize_tcp_flag, axis=1) 30 | dataset = dataset.astype(parsed_features.dtypes). \ 31 | sort_values('flow_id', axis=0). \ 32 | reset_index(drop=True) 33 | pd.testing.assert_frame_equal(parsed_features, dataset, 34 | check_less_precise=2, 35 | check_like=True, 36 | check_categorical=False) 37 | 38 | 39 | def test_raw_parser_output(raw_dataset_with_targets, pcap_example_path): 40 | parsed_features = pcap_parser.parse_pcap_to_dataframe(pcap_example_path, 41 | derivative_features=False, 42 | raw_features=20, 43 | online_mode=False) 44 | parsed_features = parsed_features. \ 45 | sort_values('flow_id', axis=0). \ 46 | reset_index(drop=True). \ 47 | filter(regex='raw') 48 | 49 | raw_dataset = raw_dataset_with_targets. \ 50 | sort_values('flow_id', axis=0). \ 51 | reset_index(drop=True). \ 52 | filter(regex='raw') 53 | raw_dataset = raw_dataset.astype(parsed_features.dtypes) 54 | pd.testing.assert_frame_equal(parsed_features, raw_dataset, 55 | check_less_precise=2, 56 | check_like=True, 57 | check_categorical=False) 58 | -------------------------------------------------------------------------------- /fs_net/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from nn_classifiers.models import BaseClassifier 4 | 5 | 6 | class FSNETClassifier(BaseClassifier): 7 | def __init__(self, 8 | config, 9 | class_labels, 10 | n_tokens, 11 | embedding_dim=128, 12 | hidden_size=128, 13 | n_layers=2, 14 | dropout=0.3): 15 | super().__init__(config, class_labels) 16 | 17 | self.embeddings = torch.nn.Embedding(num_embeddings=n_tokens, embedding_dim=embedding_dim) 18 | self.dropout = torch.nn.Dropout(p=dropout) 19 | self.activation = torch.nn.SELU() 20 | self.encoder = torch.nn.GRU( 21 | embedding_dim, 22 | hidden_size, 23 | num_layers=n_layers, 24 | batch_first=True, 25 | dropout=dropout, 26 | bidirectional=True 27 | ) 28 | self.encoder_hidden_dim = 2 * n_layers * hidden_size 29 | self.compound_dim = self.encoder_hidden_dim * 4 30 | self.decoder = torch.nn.GRU( 31 | self.encoder_hidden_dim, 32 | hidden_size, 33 | num_layers=n_layers, 34 | batch_first=True, 35 | dropout=dropout, 36 | bidirectional=True 37 | ) 38 | self.compressor = torch.nn.Sequential( 39 | torch.nn.Linear(self.compound_dim, 2 * hidden_size), 40 | self.activation, 41 | self.dropout, 42 | ) 43 | self.classifier = torch.nn.Linear(2 * hidden_size, self.output_dim) 44 | 45 | @staticmethod 46 | def _concat_hidden_states(hidden_states, batch_size): 47 | return hidden_states.permute([1, 0, 2]).reshape(batch_size, -1) # (batch_size, 2*n_layers*hidden_size) 48 | 49 | def forward(self, x): 50 | encoder_in = self.embeddings(x.squeeze_(1)) # (batch_size, embedding_dim) 51 | batch_size, seq_len = x.shape[0], x.shape[1] 52 | 53 | _, enc_states = self.encoder(encoder_in) 54 | # "concatenate the final hidden states of both forward and backward directions of all the layers" 55 | z_e = self._concat_hidden_states(enc_states, batch_size) 56 | 57 | # "the encoder-based feature vector ze is input into the decoder at each time step t", so we just repeat it 58 | decoder_in = z_e.unsqueeze(1).repeat(1, seq_len, 1) # (batch_size, seq_len, encoder_hidden_dim) 59 | _, dec_states = self.decoder(decoder_in) 60 | z_d = self._concat_hidden_states(dec_states, batch_size) 61 | # compound feature vector 62 | z = torch.cat([z_e, z_d, z_e * z_d, torch.abs(z_e - z_d)], dim=1) 63 | z_c = self.compressor(z) 64 | return self.classifier(z_c) 65 
| -------------------------------------------------------------------------------- /tests/static/quantizer_checkpoint/ids_to_tokens.json: -------------------------------------------------------------------------------- 1 | {"9902": "[EOF]", "9903": "[PAD]", "9904": "[UNK]", "9905": "[BOF]", "9906": "Free90", "9907": "GoogleMaps", "9908": "RDP", "9909": "WhatsAppFiles", "9910": "Zabbix", "9911": "Starcraft", "9912": "DoH_DoT", "9913": "Zoom", "9914": "DNP3", "9915": "ICMP", "9916": "DHCP", "9917": "WindowsUpdate", "9918": "Playstation", "9919": "GTP", "9920": "Facebook", "9921": "IoT_trigger", "9922": "MS_OneDrive", "9923": "LDAP", "9924": "SoundCloud", "9925": "Spotify", "9926": "Oracle", "9927": "HTTP_Proxy", "9928": "POPS", "9929": "SMBv1", "9930": "CiscoSkinny", "9931": "Github", "9932": "Redis", "9933": "NTP", "9934": "Unknown", "9935": "Syslog", "9936": "WireGuard", "9937": "Messenger", "9938": "SAP", "9939": "Xbox", "9940": "SOCKS", "9941": "NestLogSink", "9942": "CiscoVPN", "9943": "MDNS", "9944": "Ayiya", "9945": "GRE", "9946": "CHECKMK", "9947": "QUIC", "9948": "PostgreSQL", "9949": "Citrix", "9950": "IoT_hub", "9951": "Corba", "9952": "STUN", "9953": "GoogleDocs", "9954": "Targus Dataspeed", "9955": "NetFlix", "9956": "CNN", "9957": "SkypeCall", "9958": "GoogleServices", "9959": "IMAPS", "9960": "WeChat", "9961": "Cloudflare", "9962": "Microsoft365", "9963": "SCTP", "9964": "DCE_RPC", "9965": "BitTorrent", "9966": "LLMNR", "9967": "FTP_CONTROL", "9968": "RTSP", "9969": "VRRP", "9970": "H323", "9971": "TeamViewer", "9972": "Steam", "9973": "PlayStore", "9974": "OSPF", "9975": "YouTube", "9976": "IMO", "9977": "SMTP", "9978": "DRDA", "9979": "SMTPS", "9980": "MsSQL-TDS", "9981": "TLS", "9982": "Memcached", "9983": "EGP", "9984": "IoT_camera", "9985": "Instagram", "9986": "Teredo", "9987": "HTTP", "9988": "RemoteScan", "9989": "AJP", "9990": "Skype", "9991": "Amazon", "9992": "SOMEIP", "9993": "SMBv23", "9994": "Modbus", "9995": "WebSocket", "9996": "TFTP", "9997": "Usenet", "9998": "RTP", "9999": "eDonkey", "10000": "NFS", "10001": "Viber", "10002": "Dropbox", "10003": "SSDP", "10004": "Telegram", "10005": "LinkedIn", "10006": "DHCPV6", "10007": "IoT_healthcare", "10008": "IRC", "10009": "WhatsAppCall", "10010": "IPsec", "10011": "IoT_environment", "10012": "AFP", "10013": "OpenVPN", "10014": "WhatsApp", "10015": "BJNP", "10016": "NetBIOS", "10017": "Telnet", "10018": "Whois-DAS", "10019": "Mining", "10020": "PPTP", "10021": "IAX", "10022": "SIP", "10023": "Nats", "10024": "VNC", "10025": "UbuntuONE", "10026": "Google", "10027": "SSH", "10028": "s7comm", "10029": "Kerberos", "10030": "Twitter", "10031": "Radius", "10032": "DNS", "10033": "IGMP", "10034": "ICMPV6", "10035": "GooglePlus", "10036": "MQTT", "10037": "RTMP", "10038": "BGP", "10039": "Wikipedia", "10040": "Microsoft", "10041": "LotusNotes", "10042": "Yahoo", "10043": "UPnP", "10044": "IEC60870", "10045": "SNMP", "10046": "Git", "10047": "POP3", "10048": "Slack", "10049": "MySQL", "10050": "GMail", "10051": "IoT_electronics", "10052": "Diameter", "10053": "IMAP", "10054": "RX", "10055": "AMQP", "10056": "CAPWAP"} -------------------------------------------------------------------------------- /gpt_model/generator/trainer.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from torch.utils.data import DataLoader, Dataset 4 | from torch.utils.data.distributed import DistributedSampler 5 | from transformers import Trainer 6 | from transformers.trainer 
import SequentialDistributedSampler 7 | 8 | 9 | class SeqTrainer(Trainer): 10 | def get_train_dataloader(self) -> DataLoader: 11 | """ 12 | Returns the training :class:`~torch.utils.data.DataLoader`. 13 | """ 14 | if self.train_dataset is None: 15 | raise ValueError("Trainer: training requires a train_dataset.") 16 | else: 17 | train_sampler = ( 18 | None 19 | if self.args.local_rank == -1 20 | else DistributedSampler(self.train_dataset) 21 | ) 22 | 23 | data_loader = DataLoader( 24 | self.train_dataset, 25 | batch_size=self.args.train_batch_size, 26 | sampler=train_sampler, 27 | collate_fn=self.data_collator, 28 | drop_last=self.args.dataloader_drop_last, 29 | ) 30 | 31 | return data_loader 32 | 33 | def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: 34 | """ 35 | Returns the evaluation :class:`~torch.utils.data.DataLoader`. 36 | 37 | Args: 38 | eval_dataset (:obj:`Dataset`, `optional`): 39 | If provided, will override `self.eval_dataset`. 40 | """ 41 | if eval_dataset is None and self.eval_dataset is None: 42 | raise ValueError("Trainer: evaluation requires an eval_dataset.") 43 | 44 | eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset 45 | if self.args.local_rank != -1: 46 | sampler = SequentialDistributedSampler(eval_dataset) 47 | else: 48 | sampler = None 49 | 50 | data_loader = DataLoader( 51 | eval_dataset, 52 | sampler=sampler, 53 | batch_size=self.args.eval_batch_size, 54 | collate_fn=self.data_collator, 55 | drop_last=self.args.dataloader_drop_last, 56 | ) 57 | 58 | return data_loader 59 | 60 | def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: 61 | """ 62 | Returns the test :class:`~torch.utils.data.DataLoader`. 63 | 64 | Args: 65 | test_dataset (obj:`Dataset`): The test dataset to use. 66 | """ 67 | # We use the same batch_size as for eval. 
68 | if self.args.local_rank != -1: 69 | sampler = SequentialDistributedSampler(test_dataset) 70 | else: 71 | sampler = None 72 | 73 | data_loader = DataLoader( 74 | test_dataset, 75 | sampler=sampler, 76 | batch_size=self.args.eval_batch_size, 77 | collate_fn=self.data_collator, 78 | drop_last=self.args.dataloader_drop_last, 79 | ) 80 | 81 | return data_loader 82 | -------------------------------------------------------------------------------- /gpt_model/classifier/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from functools import partial 4 | from typing import Dict, List, Tuple 5 | 6 | import logging 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | from sklearn.preprocessing import LabelEncoder 11 | from torch.utils.data import Dataset 12 | 13 | from gpt_model.tokenizer import PacketTokenizer 14 | from settings import TARGET_CLASS_COLUMN 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class ClassificationQuantizedDataset(Dataset): 20 | def __init__( 21 | self, tokenizer: PacketTokenizer, 22 | dataset_path: str, 23 | label_encoder: LabelEncoder = None, 24 | target_column=TARGET_CLASS_COLUMN 25 | ): 26 | assert os.path.isfile(dataset_path) 27 | 28 | dataset_path = pathlib.Path(dataset_path) 29 | self.source_file = dataset_path 30 | logger.info("initializing dataset from %s", dataset_path) 31 | 32 | self.tokenizer = tokenizer 33 | 34 | raw_flows = pd.read_csv(self.source_file, 35 | usecols=self.tokenizer.packet_quantizer.raw_columns + [target_column]) 36 | 37 | if label_encoder is None: 38 | self.target_encoder = LabelEncoder().fit(raw_flows[target_column].values) 39 | else: 40 | self.target_encoder = label_encoder 41 | 42 | self.targets = self.target_encoder.transform(raw_flows[target_column].values) 43 | self.raw_flows = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].values 44 | logger.info('initialized dataset') 45 | 46 | def __len__(self): 47 | return len(self.raw_flows) 48 | 49 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 50 | enc_flow = self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 51 | add_special_tokens=True, 52 | return_attention_mask=True).data 53 | 54 | enc_flow.update({'target': torch.as_tensor(self.targets[i], dtype=torch.long)}) 55 | return enc_flow 56 | 57 | @classmethod 58 | def get_collator(cls, mask_first_token): 59 | return partial(classification_quantized_collator, mask_first_token=mask_first_token) 60 | 61 | 62 | def classification_quantized_collator(examples: List[Dict[str, torch.Tensor]], mask_first_token=True) -> \ 63 | Tuple[Dict[str, torch.Tensor], torch.Tensor]: 64 | """ Data collator used for traffic classification """ 65 | 66 | length_of_first = examples[0]['input_ids'].size(0) 67 | are_tensors_same_length = all(x['input_ids'].size(0) == length_of_first for x in examples) 68 | assert are_tensors_same_length 69 | 70 | input_ids = torch.cat([item['input_ids'] for item in examples], dim=0) 71 | attention_masks = torch.cat([item['attention_mask'] for item in examples], dim=0) 72 | if mask_first_token: 73 | attention_masks[:, 0] = 0 74 | targets = torch.cat([item['target'].view(1) for item in examples]) 75 | return {"input_ids": input_ids, "attention_mask": attention_masks}, targets 76 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 
2 | 3 | import numpy as np 4 | import pandas as pd 5 | import pytest 6 | 7 | from flow_parsing import features 8 | import settings 9 | from gpt_model.tokenizer import PacketTokenizer 10 | 11 | 12 | @pytest.fixture 13 | def dataset(): 14 | return pd.read_csv(settings.TEST_STATIC_DIR / 'example_20packets.csv', na_filter=False) 15 | 16 | 17 | @pytest.fixture 18 | def raw_dataset_folder(): 19 | return settings.TEST_STATIC_DIR / 'raw_csv' 20 | 21 | 22 | @pytest.fixture 23 | def raw_dataset_file(raw_dataset_folder): 24 | return raw_dataset_folder / 'example_raw_20packets.csv' 25 | 26 | 27 | @pytest.fixture 28 | def raw_dataset(raw_dataset_folder): 29 | return pd.read_csv(raw_dataset_folder / 'example_raw_20packets.csv', na_filter=False).\ 30 | filter(regex='raw').\ 31 | astype(np.float64) 32 | 33 | 34 | @pytest.fixture 35 | def raw_dataset_with_targets(raw_dataset_folder): 36 | df = pd.read_csv(raw_dataset_folder / 'example_raw_20packets.csv', na_filter=False) 37 | df.filter(regex='raw').astype(np.float64, copy=False) 38 | return df 39 | 40 | 41 | @pytest.fixture 42 | def classif_config(): 43 | return {'SVM': {'type': 'OneVsOneClassifier', 44 | 'params': {'estimator': {'type': 'LinearSVC', 45 | 'params': {'tol': 1e-05}}, 'n_jobs': -1}, 46 | 'param_search_space': {'estimator__C': [0.1, 1, 10], 'estimator__loss': ['squared_hinge'], 47 | 'estimator__dual': [True, False]}}, 48 | 'DecTree': {'type': 'DecisionTreeClassifier', 49 | 'param_search_space': {'max_depth': [6, 9, 12, 15, 18], 'max_features': [10, 20, 30, 40], 50 | 'criterion': ['entropy']}}, 51 | 'GradBoost': {'type': 'GradientBoostingClassifier', 52 | 'param_search_space': {'n_estimators': [50], 'max_depth': [2, 3, 4, 5], 53 | 'learning_rate': [0.01, 0.05, 0.1]}}} 54 | 55 | 56 | @pytest.fixture 57 | def raw_matrix(): 58 | size = 10 59 | raw_feature_matrix = np.zeros((size, 7)) 60 | raw_feature_matrix[:, features.RMI.TIMESTAMP] = np.array(range(12312, size + 12312)) 61 | raw_feature_matrix[:, features.RMI.IP_LEN] = np.array([13, 54, 345, 43, 44, 990, 1000, 23, 555, 1400]) 62 | raw_feature_matrix[:, features.RMI.IS_CLIENT] = np.array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0]) 63 | return raw_feature_matrix 64 | 65 | 66 | @pytest.fixture 67 | def quantized_packets(): 68 | with open(settings.TEST_STATIC_DIR / 'quantized_pkts.json', 'r') as js: 69 | pkts = json.load(js) 70 | return np.array(pkts).reshape(-1, 20) 71 | 72 | 73 | @pytest.fixture 74 | def quantizer_checkpoint(): 75 | return settings.TEST_STATIC_DIR / 'quantizer_checkpoint' 76 | 77 | 78 | @pytest.fixture 79 | def pcap_example_path(): 80 | return (settings.BASE_DIR / 'flow_parsing/static/example.pcap').as_posix() 81 | 82 | 83 | @pytest.fixture() 84 | def tokenizer(quantizer_checkpoint): 85 | return PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 86 | -------------------------------------------------------------------------------- /fs_net/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from typing import Tuple 4 | 5 | import numpy as np 6 | import torch 7 | from sklearn.preprocessing import LabelEncoder 8 | from torch.utils.data import Dataset 9 | import logging 10 | import pandas as pd 11 | from flow_parsing.features import generate_raw_feature_names 12 | from gpt_model.classifier.dataset import ClassificationQuantizedDataset 13 | from settings import TARGET_CLASS_COLUMN, DEFAULT_PACKET_LIMIT_PER_FLOW 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class 
SimpleClassificationQuantizedDataset(ClassificationQuantizedDataset): 20 | """ no attention mask and no dict-like output """ 21 | def __getitem__(self, i: int) -> Tuple[torch.Tensor, torch.Tensor]: 22 | enc_flow = self.tokenizer.batch_encode_packets(self.raw_flows.reshape(1, -1).astype(np.float64), 23 | add_special_tokens=False, 24 | return_attention_mask=False).data 25 | X = enc_flow['input_ids'] 26 | y = torch.as_tensor(self.targets[i], dtype=torch.long) 27 | return X, y 28 | 29 | 30 | class ClassificationPacketSizeDataset(Dataset): 31 | """ 32 | the sequences are expected to be passed through embedding layer first, thus they are encoded to be positive and 33 | the modified PS itself will serve as an index 34 | 35 | max_size_range sets max dynamic range for PS parameter and implicitly sets Embedding layer dim 36 | """ 37 | def __init__( 38 | self, 39 | dataset_path: str, 40 | max_size_range=5000, 41 | label_encoder: LabelEncoder = None, 42 | target_column=TARGET_CLASS_COLUMN, 43 | flow_size=DEFAULT_PACKET_LIMIT_PER_FLOW 44 | ): 45 | assert os.path.isfile(dataset_path) 46 | 47 | dataset_path = pathlib.Path(dataset_path) 48 | self.source_file = dataset_path 49 | logger.info("initializing dataset from %s", dataset_path) 50 | 51 | self.packet_columns = generate_raw_feature_names(flow_size, base_features=('packet',)) 52 | raw_flows = pd.read_csv(self.source_file, 53 | usecols=self.packet_columns + [target_column]) 54 | 55 | if label_encoder is None: 56 | self.target_encoder = LabelEncoder().fit(raw_flows[target_column].values) 57 | else: 58 | self.target_encoder = label_encoder 59 | 60 | self.targets = self.target_encoder.transform(raw_flows[target_column].values) 61 | raw_flows = raw_flows.loc[:, self.packet_columns] 62 | raw_flows = raw_flows.fillna(0) 63 | # truncate values outside the range 64 | offset = max_size_range // 2 65 | raw_flows[raw_flows <= -offset] = -offset + 1 66 | raw_flows[raw_flows >= offset] = offset - 1 67 | self.raw_flows = raw_flows + offset 68 | logger.info('initialized dataset') 69 | 70 | def __len__(self): 71 | return len(self.raw_flows) 72 | 73 | def __getitem__(self, i: int) -> Tuple[torch.Tensor, torch.Tensor]: 74 | X = torch.as_tensor(self.raw_flows.values[i], dtype=torch.long) 75 | y = torch.as_tensor(self.targets[i], dtype=torch.long) 76 | return X, y 77 | -------------------------------------------------------------------------------- /evaluation_utils/classification.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from typing import Optional 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report 8 | 9 | from settings import REPORT_DIR 10 | 11 | 12 | class Reporter: 13 | def __init__(self, true, predicted, 14 | classifier_name: str, 15 | target_classes: Optional[list] = None, 16 | report_dir=REPORT_DIR): 17 | self.true = true 18 | self.predicted = predicted 19 | self.target_classes = target_classes if len(target_classes) > 0 else list(range(max(true) + 1)) 20 | self.classifier_name = classifier_name 21 | self.save_dir = pathlib.Path(report_dir) 22 | self.save_dir.mkdir(exist_ok=True) 23 | 24 | def scores(self): 25 | return { 26 | 'Accuracy': accuracy_score(self.true, self.predicted), 27 | 'F1 macro': f1_score(self.true, self.predicted, average='macro'), 28 | 'F1 weighted': f1_score(self.true, self.predicted, average='weighted') 29 | } 30 | 31 | def clf_report(self, 
as_dict=False, save_to=None): 32 | def to_df(report): 33 | return pd.DataFrame(report).T 34 | 35 | report = classification_report(self.true, self.predicted, 36 | target_names=self.target_classes, 37 | digits=3, 38 | output_dict=True) 39 | 40 | if save_to is not None: 41 | to_df(report).to_csv(self.save_dir / save_to, index=True) 42 | 43 | if as_dict: 44 | return report 45 | return to_df(report) 46 | 47 | def conf_matrix(self, normalize=None): 48 | return pd.DataFrame(confusion_matrix(self.true, self.predicted, normalize=normalize), 49 | columns=self.target_classes, 50 | index=self.target_classes) 51 | 52 | def plot_conf_matrix(self, normalize=None) -> plt.figure: 53 | 54 | cm = self.conf_matrix(normalize).values 55 | classes = self.target_classes 56 | fig_size = int(len(classes) * 0.7) 57 | fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(fig_size, fig_size)) 58 | 59 | im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues) 60 | ax.set_title('CM of {} classifier'.format(self.classifier_name)) 61 | fig.colorbar(im, aspect=30, shrink=0.8, ax=ax) 62 | 63 | tick_marks = np.arange(len(classes)) 64 | ax.set_xticks(tick_marks) 65 | ax.set_xticklabels(list(classes)) 66 | plt.setp(ax.get_xticklabels(), rotation=45) 67 | ax.set_yticks(tick_marks) 68 | ax.set_yticklabels(list(classes)) 69 | 70 | fmt = '.2f' if normalize else 'd' 71 | thresh = cm.max() / 2. 72 | for i in range(cm.shape[0]): 73 | for j in range(cm.shape[1]): 74 | ax.text(j, i, format(cm[i, j], fmt), 75 | horizontalalignment="center", 76 | color="white" if cm[i, j] > thresh else "black") 77 | 78 | ax.set_ylabel('True label') 79 | ax.set_xlabel('Predicted label') 80 | # fig.tight_layout() 81 | plt.show() 82 | return fig 83 | -------------------------------------------------------------------------------- /gpt_model/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Transformer-based network traffic generator and classifier 3 | 4 | ### Introduction 5 | 6 | Given currents trends in web-protocol development (e.g. eSNI, DNS-over-*), 7 | plain text information in traffic sessions is disappearing. In order 8 | to classify the flows, one of few options is to use statistical discriminators 9 | based on packet size (PS) and inter-packet time (IPT) features. 10 | Moreover, exactly the same features are usually produced by traffic 11 | flow generators. 12 | 13 | That gives an idea to develop a common neural network framework for 14 | creating statistical generators and classifiers. A reasonable choice 15 | can be Transformer architecture that showed SOTA on numerous NLP benchmarks. 16 | Since we need a generative model, GPT-2 seems to be a good option to start 17 | with, luckily, `huggingface` did all the dirty stuff implementing it. 18 | 19 | In order to use the models, the initial packet feature space (PS + IPT) 20 | has to be quantized into discrete sequences. I used K-Means for this 21 | purpose and given the expected dataset size (millions of flows), 22 | the libKMCUDA's implementation was adopted to transform prior scaled 23 | packet features into integer sequences of cluster numbers (see 24 | `quantizer.py`). 25 | 26 | Generative pretraining is a viable option to get a powerful classifier without 27 | having much target data. We can pretrain the model in the following ways: 28 | 1. Using unlabeled data. Allows to further use the model as a feature 29 | extractor for various classifiers (e.g linear, K-nn, uSVM) or to be completely 30 | fine-tuned on a classification task. 31 | 2. 
Using labeled data. The model is trained with first sequence tokens 32 | denoting traffic class that afterwards allows to sample class-specific 33 | packet clusters. Moreover, the same benefits as above are preserved. 34 | 35 | ### Pre-trained models and datasets: 36 | 37 | It is necessary to download a MinIO client to your computer as per: 38 | https://docs.min.io/docs/minio-client-quickstart-guide.html 39 | 40 | To get the data, execute the following commands: 41 | ``` 42 | ./mc alias set ext-anon http://195.201.38.68:9000 43 | ./mc ls ext-anon/traffic-classifier 44 | ./mc cp ext-anon/traffic-classifier . 45 | ``` 46 | 47 | where the first command will prompt you for user credentials: 48 | ``` 49 | Access Key: gpt_research 50 | Secret Key: mbmug8VDbRu5hqJ 51 | ``` 52 | 53 | 54 | *Note: opening the URL in a browser leads to the administrator 55 | console. To access the datasets and models you have to install MinIO client 56 | as mentioned above.* 57 | 58 | 59 | ### Publications 60 | 61 | More details can be found in the following papers (please, cite the first one): 62 | ``` 63 | @article{Bikmukhamedov2021MultiClassNT, 64 | title={Multi-Class Network Traffic Generators and Classifiers Based on Neural Networks}, 65 | author={R. Bikmukhamedov and A. Nadeev}, 66 | journal={2021 Systems of Signals Generating and Processing in the Field of on Board Communications}, 67 | year={2021}, 68 | pages={1-7}, 69 | url = {https://doi.org/10.1109/IEEECONF51389.2021.9416067} 70 | } 71 | 72 | @article{bikmukhamedov2020, 73 | author = {Bikmukhamedov, R. F. and Nadeev, A.F.}, 74 | title = {Generative transformer framework for network traffic generation and classification}, 75 | journal = {T-Comm}, 76 | year = {2020}, 77 | number = {11}, 78 | vol = {14}, 79 | pages = {64--71}, 80 | url = {http://media-publisher.ru/wp-content/uploads/Nom-11-2020-s.pdf} 81 | } 82 | ``` 83 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/preprocess_target_pcaps.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import pathlib 3 | 4 | import sh 5 | 6 | 7 | """ provided here for the sake of reproducibility of own research """ 8 | 9 | 10 | class Device(typing.NamedTuple): 11 | mac: str 12 | name: str 13 | category: str 14 | 15 | 16 | IOT_DEVICES = [ 17 | Device('d0:52:a8:00:67:5e', 'Smart Things', 'hub'), 18 | Device('44:65:0d:56:cc:d3', 'Amazon Echo', 'hub'), 19 | 20 | Device('70:ee:50:18:34:43', 'Netatmo Welcome', 'camera'), 21 | Device('f4:f2:6d:93:51:f1', 'TP-Link Day Night Cloud camera', 'camera'), 22 | Device('00:16:6c:ab:6b:88', 'Samsung SmartCam', 'camera'), 23 | Device('30:8c:fb:2f:e4:b2', 'Dropcam', 'camera'), 24 | Device('00:62:6e:51:27:2e', 'Insteon (wired)', 'camera'), 25 | Device('e8:ab:fa:19:de:4f', 'Insteon (wireless)', 'camera'), 26 | Device('00:24:e4:11:18:a8', 'Withings Smart Baby Monitor', 'camera'), 27 | 28 | Device('ec:1a:59:79:f4:89', 'Belkin Wemo', 'trigger'), 29 | Device('ec:1a:59:83:28:11', 'Belkin Wemo Motion sensor', 'trigger'), 30 | Device('50:c7:bf:00:56:39', 'TP-Link Smart Plug', 'trigger'), 31 | Device('74:c6:3b:29:d7:1d', 'iHome', 'trigger'), 32 | 33 | Device('18:b4:30:25:be:e4', 'NEST Protect smoke alarm', 'environment'), 34 | Device('70:ee:50:03:b8:ac', 'Netatmo weather station', 'environment'), 35 | 36 | Device('00:24:e4:1b:6f:96', 'Withings Smart scale', 'healthcare'), 37 | Device('00:24:e4:20:28:c6', 'Withings Aura smart sleep sensor', 'healthcare'), 38 | 
Device('74:6a:89:00:2e:25', 'Blipcare Blood Pressure meter', 'healthcare'), 39 | 40 | Device('d0:73:d5:01:83:08', 'LiFX Smart Bulb', 'light_bulb'), 41 | 42 | Device('18:b7:9e:02:20:44', 'Triby Speaker', 'electronics'), 43 | Device('e0:76:d0:33:bb:85', 'PIX-STAR photo-frame', 'electronics'), 44 | Device('70:5a:0f:e4:9b:c0', 'HP Printer', 'electronics'), 45 | ] 46 | 47 | 48 | TCPDUMP_BASE_FILTER = 'not arp and not icmp and not icmp6 and not broadcast and not multicast and not net 127.0.0.0/8' 49 | 50 | 51 | def _merge_pcaps(pcaps_to_merge: list, to_file): 52 | exec = sh.Command('mergecap') 53 | exec('-w', to_file, '-Fpcap', *pcaps_to_merge) 54 | 55 | 56 | def _split_by_devices(source_pcap): 57 | exec = sh.Command('/usr/sbin/tcpdump') 58 | target_dir = source_pcap.parent / 'separated_iot_devices' 59 | target_dir.mkdir(exist_ok=True) 60 | for device in IOT_DEVICES: 61 | target_file = target_dir / f'{device.category}_{device.name.lower().replace(" ", "_")}.pcap' 62 | filter_str = f"ether host {device.mac} and not (dst net 192.168.1.0/24 and src net 192.168.1.0/24) " \ 63 | f"and {TCPDUMP_BASE_FILTER}" 64 | exec(['-r', source_pcap, filter_str, '-w', target_file]) 65 | 66 | 67 | def _filter_non_iot_dump(source_pcap): 68 | target_file = source_pcap.parent / 'non_iot.pcap' 69 | filter_str = f"not (dst net 192.168.88.0/24 and src net 192.168.88.0/24) and {TCPDUMP_BASE_FILTER}" 70 | exec = sh.Command('/usr/sbin/tcpdump') 71 | exec(['-r', source_pcap, filter_str, '-w', target_file]) 72 | 73 | 74 | def main(): 75 | dump_root_dir = pathlib.Path('/media/raid_store/pretrained_traffic') 76 | merged_pcap = dump_root_dir / 'total.pcap' 77 | # merge all .pcap files from https://iotanalytics.unsw.edu.au/iottraces 78 | pcaps = pathlib.Path(dump_root_dir / 'iot_downloads').glob('*.pcap') 79 | _merge_pcaps(pcaps, merged_pcap) 80 | _split_by_devices(merged_pcap) 81 | _filter_non_iot_dump(dump_root_dir / 'home.pcap') 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network traffic classifier based on statistical properties of application flows 2 | 3 | UPDATE 18/03/2019: Refactored in OOP-style, more flexibility and features! 4 | 5 | UPDATE 23/05/2020: Replaced custom flow-parsing mechanism with NFStream 6 | 7 | UPDATE 17/09/2020: Added pytorch classifiers, including transformer-based one 8 | 9 | UPDATE 30/10/2020: ANN classifiers (NGT, LSH), FS-NET baseline 10 | ## Key features 11 | 12 | * Configurable feature extraction from network flows via `NFStream`. 13 | 14 | * Possibility to test arbitrary sklearn algorithms (e.g. SVM, Random Forest, 15 | etc.) and configure their parameter search space via `.yaml` configs. 16 | 17 | * Basic examples of pytorch classifiers and new generative transformer 18 | framework that can be used for building traffic generators and 19 | classifiers. 20 | 21 | * Option for experiment tracking with Neptune. 22 | 23 | ## Project structure 24 | 25 | * `flow_parsing` contains scripts for parsing flow features and labels 26 | from `.pcap` into `.csv` via `NFStream`. It can be 27 | used for exporting raw per-flow packet-features (e.g. packet/payload 28 | sizes, timestamps, various packet-fields) in a numpy array, as well as 29 | derivative statistics, such as feature percentiles, etc. 30 | 31 | * `evaluation_utils` contains utilities for evaluation of traffic 32 | classifiers and generators. 
33 | 34 | * `fs_net` is a reimplementation of FS-NET classifier 35 | 36 | * `sklearn_classifiers` contains wrapper for sklearn-like classifiers 37 | and example pipeline script. Used models and their parameters are specified 38 | via the `.yaml` configuration file. Check and modify `utils.py:REGISTERED_CLASSES` 39 | to support the needed models. 40 | 41 | * `nn_classifers` includes base class for pytorch-lightning classifier and 42 | some basic derivatives. 43 | 44 | * `gpt_model` has all the code required for building your own 45 | transformer-based traffic generator and classifier, along with a link to 46 | model checkpoints. See the package for more info. 47 | 48 | ## Usage example for sklearn-based classifiers 49 | 50 | 1. A feature file has to be prepared before running model training, so 51 | make sure to create a `.csv` dataset by running, for example: 52 | 53 | ```PYTHONPATH=. python flow_parsing/pcap_parser.py -p flow_parsing/static/example.pcap --online_mode``` 54 | 55 | 2. OPTIONAL. Postprocess parsed `.csv` as needed, e.g. split into train-test, 56 | reassign target columns. 57 | 58 | 3. Create own version of `config.yaml` to experiment with and 59 | test classifiers: 60 | 61 | ``` 62 | PYTHONPATH=. python sklearn_classifiers/run_training.py 63 | --train_dataset csv_files/example_20packets.csv 64 | --target_column ndpi_category 65 | --continuous 66 | ``` 67 | 68 | ## Publications 69 | 70 | If you find the code or datasets useful for your research, please, 71 | cite one of the following papers: 72 | 73 | ``` 74 | @article{Bikmukhamedov2021MultiClassNT, 75 | title={Multi-Class Network Traffic Generators and Classifiers Based on Neural Networks}, 76 | author={Bikmukhamedov, Radion and Nadeev, Adel}, 77 | journal={2021 Systems of Signals Generating and Processing in the Field of on Board Communications}, 78 | year={2021}, 79 | pages={1-7}, 80 | url = {https://doi.org/10.1109/IEEECONF51389.2021.9416067} 81 | } 82 | 83 | @CONFERENCE{bikmukhamedov2019, 84 | author = {Bikmukhamedov, R. F. and Nadeev, A. 
F.}, 85 | title = {Lightweight Machine Learning Classifiers of IoT Traffic Flows}, 86 | booktitle = {2019 Systems of Signal Synchronization, Generating and Processing in Telecommunications}, 87 | year = {2019}, 88 | } 89 | ``` -------------------------------------------------------------------------------- /tests/test_distance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import torch 4 | from sklearn.metrics import accuracy_score 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.preprocessing import LabelEncoder, normalize 7 | from transformers import set_seed, GPT2Model, GPT2Config 8 | 9 | from settings import RANDOM_SEED 10 | from sklearn_classifiers.knn_cosine import ( 11 | cos_dist, 12 | top_k_cosine_similar, 13 | batch_voter, 14 | KNeighborsCosineClassifier, 15 | KNeighborsPuffinnClassifier, 16 | KNeighborsNGTClassifier 17 | ) 18 | 19 | 20 | @pytest.fixture() 21 | def keys(): 22 | return np.array([[1, 0, 0], [0.9, -0.1, 0], [1, 0, 0], [0, 1, 1]]) 23 | 24 | 25 | def test_cos_dist(keys): 26 | query = np.array([0, 0, 1]) 27 | sim = cos_dist(query, keys) 28 | assert np.isclose(sim, np.array([1., 1., 1., 0.29289322])).all() 29 | 30 | 31 | @pytest.mark.parametrize( 32 | 'query,idx,top_k', 33 | [ 34 | (np.array([0, 0, 1]), [[3]], 1), 35 | (np.array([1, 0, 0]), [[0, 2]], 2), 36 | (np.array([1, -0.1, 0]), [[1, 0]], 2), 37 | (np.array([[1, -0.1, 0], [1, 0, 0]]), [[1, 0], [0, 2]], 2) 38 | ] 39 | ) 40 | def test_cos_top_k(query, idx, top_k, keys): 41 | top = top_k_cosine_similar(query, keys, top_k) 42 | assert top.tolist() == idx 43 | 44 | 45 | def test_target_assignment(keys): 46 | targets = np.array([2, 2, 0, 1]) 47 | top_2_for_3_queries = np.array([[1, 0], [2, 0], [1, 2]]) 48 | votes = batch_voter(targets[top_2_for_3_queries]) 49 | assert votes.tolist() == [2, 0, 2] 50 | 51 | 52 | def test_knn_cos(keys): 53 | targets = np.array([2, 0, 2, 1]) 54 | for classifier_class in [KNeighborsCosineClassifier, KNeighborsPuffinnClassifier, KNeighborsNGTClassifier]: 55 | clf = classifier_class(2) 56 | clf.fit(keys, targets) 57 | X_test = np.array([[0.9, 0, 0]]) 58 | pred = clf.predict(X_test) 59 | assert pred.tolist() == [2] 60 | 61 | 62 | @pytest.fixture() 63 | def dummy_gpt2(): 64 | set_seed(RANDOM_SEED) 65 | 66 | config = { 67 | "vocab_size": 9906, 68 | "n_positions": 128, 69 | "n_ctx": 128, 70 | "n_embd": 512, 71 | "n_layer": 6, 72 | "n_head": 8, 73 | } 74 | config = GPT2Config(**config) 75 | model = GPT2Model(config) 76 | return model 77 | 78 | 79 | def test_ann_deviation(raw_dataset_with_targets, raw_dataset, tokenizer, dummy_gpt2): 80 | 81 | y = LabelEncoder().fit_transform(raw_dataset_with_targets['ndpi_app']) 82 | encoded = tokenizer.batch_encode_packets(raw_dataset) 83 | with torch.no_grad(): 84 | features = dummy_gpt2(**encoded)[0] 85 | 86 | X = features.mean(dim=1).numpy() 87 | X = normalize(X) 88 | 89 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True, random_state=1) 90 | ref_preds = KNeighborsCosineClassifier(n_neighbors=1).fit(X_train, y_train).predict(X_test) 91 | # ref_preds = KNeighborsClassifier(n_neighbors=1, algorithm='brute').fit(X_train, y_train).predict(X_test) 92 | 93 | accuracy = accuracy_score(y_test, ref_preds) 94 | 95 | pfn_preds = KNeighborsPuffinnClassifier(n_neighbors=1).fit(X_train, y_train).predict(X_test) 96 | 97 | pfn_acc = accuracy_score(y_test, pfn_preds) 98 | assert accuracy == pfn_acc 99 | 100 | assert 
accuracy_score(ref_preds, pfn_preds) == 1.0 101 | 102 | ngt_preds = KNeighborsNGTClassifier(n_neighbors=1, 103 | search_epsilon=0.2, 104 | optimize_n_edges=False, 105 | optimize_search_params=False 106 | ).fit(X_train, y_train).predict(X_test) 107 | 108 | assert accuracy_score(ref_preds, ngt_preds) == 1.0 109 | assert accuracy_score(ngt_preds, pfn_preds) == 1.0 110 | -------------------------------------------------------------------------------- /sklearn_classifiers/clf_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import logging 4 | import typing 5 | from time import time 6 | 7 | import yaml 8 | from sklearn import metrics 9 | from sklearn.metrics import make_scorer 10 | from sklearn.model_selection import GridSearchCV, StratifiedKFold 11 | 12 | import settings 13 | from .registered_classes import REGISTERED_CLASSES 14 | 15 | logger = logging.getLogger(__file__) 16 | 17 | 18 | class ClassifierHolder: 19 | """ simple dataclass """ 20 | def __init__(self, classifier, param_search_space, shortcut_name=None): 21 | self.classifier = classifier 22 | self.name = type(classifier).__name__ if not shortcut_name else shortcut_name 23 | self.param_search_space = param_search_space 24 | 25 | def __repr__(self): 26 | repr_str = repr(self.classifier) 27 | if self.param_search_space: 28 | repr_str += f'\n\tsearch_space: {self.param_search_space}' 29 | return repr_str 30 | 31 | 32 | def _read_config_file(config_path) -> dict: 33 | """ simple wrapper around yaml.load """ 34 | with open(config_path) as f: 35 | settings = yaml.load(f, Loader=yaml.SafeLoader) 36 | return settings 37 | 38 | 39 | def _process_settings(settings: dict) -> None: 40 | """ In-place settings transform for ranges""" 41 | for key, params in settings.items(): 42 | if 'param_search_space' in params: 43 | ssp = params.get('param_search_space') 44 | for pname, pvalue in ssp.items(): 45 | if isinstance(pvalue, dict) and 'from' in pvalue: 46 | step = pvalue.get('step', 1) 47 | ssp[pname] = list(range(pvalue['from'], pvalue['till']+1, step)) 48 | 49 | 50 | def read_classifier_settings(config_path=None): 51 | if config_path is None: 52 | config_path = settings.BASE_DIR / 'sklearn_classifiers/config.yaml' 53 | config = _read_config_file(config_path) 54 | _process_settings(config) 55 | return config 56 | 57 | 58 | def initialize_classifiers(config: dict, 59 | random_seed: int = settings.RANDOM_SEED, 60 | classes: typing.Dict[str, type] = REGISTERED_CLASSES) -> typing.Dict[str, ClassifierHolder]: 61 | 62 | result = {} 63 | for key, params in config.items(): 64 | kwargs = params.get('params', {}) 65 | 66 | logger.info(f'Instantiating {params["type"]} with params {kwargs}') 67 | if 'estimator' in kwargs: # this works only on one level deeper. 
No recursion 68 | sub_kwargs = {'random_state': random_seed} 69 | kwargs['estimator'] = classes[kwargs['estimator']['type']](**sub_kwargs) 70 | else: 71 | kwargs['random_state'] = random_seed 72 | 73 | if params['type'].startswith('KNeighbors'): 74 | kwargs.pop('random_state') 75 | classifier = classes[params['type']](**kwargs) 76 | holder = ClassifierHolder(classifier, params.get('param_search_space', {}), shortcut_name=key) 77 | result[key] = holder 78 | return result 79 | 80 | 81 | def fit_optimal_classifier(classifier: ClassifierHolder, X_train, y_train, n_folds=2): 82 | """ searches through pre-defined parameter space from the .yaml, and fits classifier with found parameters """ 83 | logger.info('Searching parameters for {} through {}'.format(classifier.name, classifier.param_search_space)) 84 | search = GridSearchCV(classifier.classifier, 85 | param_grid=classifier.param_search_space, 86 | n_jobs=-1, 87 | scoring=make_scorer(metrics.f1_score, average='macro'), 88 | cv=StratifiedKFold(n_folds, shuffle=True, random_state=settings.RANDOM_SEED), 89 | refit=True, 90 | verbose=1) 91 | 92 | start = time() 93 | search.fit(X_train, y_train) 94 | logger.info('Search took {:.2f} seconds'.format(time() - start)) 95 | logger.info('Best parameters are {} with score {:.4f}'.format(search.best_params_, search.best_score_)) 96 | classifier.classifier = search.best_estimator_ 97 | return classifier 98 | -------------------------------------------------------------------------------- /tests/static/quantized_pkts.json: -------------------------------------------------------------------------------- 1 | [10, 77, 26, 70, 34, 17, 17, 17, 18, 26, 14, 28, 26, 95, 31, 34, 32, 26, 0, 0, 10, 77, 26, 99, 34, 17, 17, 24, 26, 14, 28, 26, 96, 98, 34, 49, 26, 26, 0, 0, 10, 77, 26, 99, 34, 17, 17, 17, 18, 26, 26, 26, 14, 28, 26, 48, 29, 34, 19, 81, 10, 12, 33, 104, 52, 17, 47, 33, 56, 33, 46, 16, 33, 62, 85, 33, 59, 79, 39, 73, 10, 12, 33, 37, 52, 69, 11, 33, 56, 33, 46, 16, 33, 62, 85, 33, 105, 39, 73, 33, 10, 12, 33, 104, 52, 1, 1, 58, 56, 33, 33, 33, 46, 16, 33, 62, 85, 33, 103, 79, 10, 12, 33, 37, 52, 17, 17, 11, 56, 33, 33, 46, 16, 33, 62, 85, 33, 94, 79, 33, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 91, 52, 1, 50, 56, 33, 33, 46, 54, 16, 78, 33, 100, 85, 33, 38, 88, 10, 12, 33, 91, 52, 1, 1, 58, 56, 33, 33, 46, 16, 33, 100, 85, 33, 89, 1, 112, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 52, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 52, 26, 70, 34, 1, 1, 1, 23, 26, 26, 14, 28, 26, 3, 31, 34, 1, 9, 26, 10, 12, 33, 110, 52, 1, 50, 33, 1, 15, 33, 33, 33, 46, 68, 33, 62, 85, 33, 67, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 14, 28, 26, 106, 36, 34, 2, 75, 1, 92, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 77, 26, 70, 26, 34, 17, 17, 17, 18, 26, 26, 26, 34, 26, 0, 0, 0, 0, 0, 10, 77, 26, 99, 34, 17, 17, 17, 18, 26, 14, 28, 26, 13, 29, 34, 19, 81, 25, 17, 10, 12, 33, 91, 33, 52, 17, 17, 53, 17, 26, 26, 26, 26, 43, 17, 26, 26, 52, 26, 10, 12, 33, 37, 52, 17, 17, 53, 17, 78, 33, 78, 33, 56, 33, 33, 46, 16, 33, 62, 10, 12, 33, 84, 52, 1, 1, 1, 63, 33, 33, 33, 33, 46, 64, 33, 21, 52, 1, 1, 10, 77, 26, 70, 34, 17, 17, 17, 18, 26, 26, 26, 14, 28, 26, 108, 31, 34, 113, 26, 10, 77, 26, 99, 34, 17, 17, 17, 18, 26, 26, 14, 28, 26, 13, 98, 34, 49, 26, 26, 73, 33, 52, 52, 83, 52, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 10, 12, 33, 104, 52, 1, 50, 56, 33, 33, 33, 46, 16, 33, 100, 85, 33, 45, 45, 93, 10, 12, 33, 91, 52, 17, 17, 53, 17, 33, 33, 78, 33, 43, 17, 78, 33, 46, 16, 33, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 14, 28, 26, 102, 36, 34, 1, 1, 75, 1, 10, 12, 33, 110, 52, 1, 1, 58, 1, 15, 33, 33, 46, 68, 33, 62, 85, 33, 30, 52, 10, 77, 26, 70, 34, 17, 17, 17, 18, 26, 26, 14, 28, 26, 101, 31, 34, 8, 26, 74, 10, 12, 33, 5, 52, 71, 58, 33, 33, 20, 33, 33, 46, 68, 33, 100, 85, 33, 22, 22, 10, 12, 33, 5, 52, 1, 50, 33, 33, 20, 33, 33, 46, 68, 33, 100, 6, 76, 80, 40, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 52, 26, 70, 34, 1, 1, 61, 26, 26, 14, 28, 26, 101, 31, 34, 1, 35, 26, 0, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 26, 26, 14, 28, 26, 102, 98, 34, 49, 26, 10, 52, 26, 99, 34, 2, 1, 23, 26, 26, 14, 28, 26, 87, 29, 34, 4, 66, 26, 26, 10, 12, 33, 37, 52, 1, 1, 58, 33, 33, 33, 56, 33, 33, 46, 16, 33, 62, 85, 33, 10, 12, 33, 37, 52, 69, 11, 56, 33, 33, 46, 16, 33, 62, 85, 33, 42, 39, 73, 33, 10, 12, 33, 91, 52, 1, 1, 58, 56, 33, 33, 46, 16, 33, 62, 85, 33, 111, 51, 39, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 104, 52, 1, 1, 58, 56, 33, 33, 33, 33, 46, 16, 33, 100, 85, 33, 7, 55, 26, 34, 34, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 104, 52, 1, 1, 58, 56, 33, 33, 46, 16, 33, 100, 85, 33, 86, 57, 72, 10, 12, 33, 91, 52, 65, 33, 33, 56, 33, 33, 46, 16, 33, 62, 85, 33, 97, 51, 39, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 14, 41, 28, 33, 26, 87, 98, 34, 49, 26, 10, 52, 26, 99, 34, 1, 1, 1, 23, 26, 26, 26, 14, 28, 26, 107, 29, 34, 1, 92, 10, 12, 33, 91, 52, 1, 1, 58, 33, 33, 33, 54, 1, 78, 33, 46, 16, 33, 62, 85, 10, 12, 33, 91, 52, 71, 58, 33, 33, 33, 54, 1, 78, 33, 46, 16, 33, 62, 73, 33, 10, 12, 33, 84, 52, 1, 1, 1, 63, 33, 33, 33, 46, 64, 33, 21, 1, 1, 1, 1, 10, 12, 33, 84, 52, 1, 1, 27, 33, 33, 33, 46, 64, 33, 109, 52, 1, 1, 71, 33, 10, 52, 26, 70, 34, 1, 1, 1, 23, 26, 14, 41, 28, 33, 26, 82, 31, 34, 8, 26, 73, 33, 52, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 91, 52, 1, 1, 58, 54, 1, 33, 33, 78, 33, 46, 16, 33, 100, 85, 33, 73, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 33, 91, 52, 65, 56, 33, 33, 33, 46, 16, 33, 100, 6, 76, 90, 33, 60, 44, 10, 52, 26, 70, 26, 34, 1, 1, 1, 23, 26, 26, 26, 34, 26, 0, 0, 0, 0, 0] -------------------------------------------------------------------------------- /tests/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy as np 4 | from torch.utils.data import DataLoader 5 | 6 | from gpt_model.generator.dataset import PretrainIterDataset, PretrainCollator, PretrainDataset, PretrainDatasetWithClasses 7 | from gpt_model.quantizer import PacketScaler, init_sklearn_kmeans_from_checkpoint, PacketQuantizer 8 | from gpt_model.tokenizer import PacketTokenizer 9 | 10 | np.random.seed(1) 11 | 12 | 13 | def test_packet_scaler(): 14 | n_packets = 1000 15 | pack_lens = np.random.uniform(-1500, 1500, n_packets) 16 | iats = np.random.gamma(0, scale=1e4, size=n_packets) 17 | indices = np.random.choice(np.arange(iats.size), replace=False, size=int(iats.size * 0.2)) 18 | iats[indices] = 0. 
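    # the synthetic flow pairs signed packet sizes with gamma-distributed inter-arrival
    # times; ~20% of the IATs are zeroed above, presumably to mimic back-to-back packets.
    # The assertions below verify that PacketScaler round-trips these pairs, i.e. that
    # inverse_transform(transform(packets)) recovers the original values.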
19 | 20 | packets = np.stack([pack_lens, iats], axis=1) 21 | transformer = PacketScaler() 22 | transf_packets = transformer.transform(packets.copy()) 23 | reverted_packets = transformer.inverse_transform(transf_packets) 24 | assert np.isclose(packets, reverted_packets, atol=10e-9).all() 25 | 26 | 27 | def test_loading_quantizer(quantizer_checkpoint): 28 | q = init_sklearn_kmeans_from_checkpoint(quantizer_checkpoint) 29 | cluster = q.predict(np.array([[-1, 0]])) 30 | assert cluster[0] == 8 31 | 32 | 33 | def test_saving_tokenizer(quantizer_checkpoint): 34 | q = PacketTokenizer.from_pretrained(quantizer_checkpoint) 35 | q.save_pretrained('/tmp/') 36 | assert pathlib.Path('/tmp/clusters.json').is_file() 37 | assert pathlib.Path('/tmp/ids_to_tokens.json').is_file() 38 | 39 | 40 | def _estimate_normalized_packet_difference(raw_packets, reverted_packets): 41 | norm_diff = (reverted_packets - raw_packets) / reverted_packets 42 | norm_diff[np.isnan(norm_diff) | np.isinf(norm_diff)] = 0 43 | return norm_diff.mean() 44 | 45 | 46 | def test_quantizer_transform(quantizer_checkpoint, raw_dataset): 47 | 48 | q = PacketQuantizer.from_checkpoint(quantizer_checkpoint, flow_size=20) 49 | # assert proper column ordering with packet features 50 | raw_packets = raw_dataset[q.raw_columns].values 51 | quantized = q.transform(raw_packets) 52 | assert quantized.shape == (raw_dataset.shape[0], 20) 53 | assert np.isnan(raw_packets).sum() == (quantized == -1).sum() * 2 54 | 55 | # test invariance 56 | assert np.isclose(quantized, q.transform(raw_packets)).all() 57 | 58 | # test inverting 59 | reverted_packets = q.inverse_transform(quantized) 60 | assert reverted_packets.shape == raw_packets.shape 61 | assert np.isnan(reverted_packets).sum() == np.isnan(raw_packets).sum() 62 | 63 | assert _estimate_normalized_packet_difference(raw_packets, reverted_packets) < 0.0003 64 | 65 | 66 | def test_tokenize_detokenize(quantizer_checkpoint, raw_dataset): 67 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint) 68 | encoded = tokenizer.batch_encode_packets(raw_dataset) 69 | tokens = encoded['input_ids'] 70 | # since the model limit 128 > 20 in raw_features, we do not expect truncating 71 | decoded = tokenizer.batch_decode_packets(tokens) 72 | assert _estimate_normalized_packet_difference(raw_dataset.values, decoded) < 0.0003 73 | 74 | 75 | def test_flow_loader(raw_dataset_folder, quantizer_checkpoint): 76 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 77 | ds = PretrainIterDataset(tokenizer, folder_path=raw_dataset_folder) 78 | loader = DataLoader(ds, batch_size=4, collate_fn=PretrainCollator(tokenizer), drop_last=True) 79 | for flow in loader: 80 | assert flow['input_ids'].shape == (4, 22) 81 | 82 | 83 | def test_flowlight_loader(raw_dataset_folder, quantizer_checkpoint): 84 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 85 | ds = PretrainDataset(tokenizer, folder_path=raw_dataset_folder) 86 | loader = DataLoader(ds, batch_size=4, collate_fn=PretrainCollator(tokenizer), drop_last=True) 87 | for flow in loader: 88 | assert flow['input_ids'].shape == (4, 22) 89 | 90 | 91 | def test_dataset_with_classes(raw_dataset_folder, quantizer_checkpoint): 92 | tokenizer = PacketTokenizer.from_pretrained(quantizer_checkpoint, flow_size=20) 93 | ds = PretrainDatasetWithClasses(tokenizer, folder_path=raw_dataset_folder) 94 | loader = DataLoader(ds, batch_size=4, collate_fn=PretrainCollator(tokenizer), drop_last=True) 95 | for flow in loader: 96 | assert 
flow['input_ids'].shape == (4, 22) 97 | # 9905 is the last non-flow-label token ID 98 | assert (flow['input_ids'][:, 0] > 9905).all().tolist() 99 | -------------------------------------------------------------------------------- /flow_parsing/features.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | from typing import Tuple, Union, Optional 4 | 5 | import numpy as np 6 | from nfstream.flow import NFlow 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | FEATURE_FUNCTIONS = { 12 | '0': lambda feature_slice: _safe_vector_getter(feature_slice, 0), 13 | '1': lambda feature_slice: _safe_vector_getter(feature_slice, 1), 14 | '_max': np.max, 15 | '_min': np.min, 16 | '_avg': np.mean, 17 | '_median': np.median, 18 | '_25q': lambda feature_slice: np.percentile(feature_slice, 25), 19 | '_75q': lambda feature_slice: np.percentile(feature_slice, 75), 20 | '_sum': np.sum, 21 | # counting non-empty bulks (packets with payload) 22 | '_number': lambda feature_slice: feature_slice[feature_slice > 0].shape[0] 23 | } 24 | 25 | # These are not complete subsets of handcrafted features 26 | CONTINUOUS_NAMES = tuple(base + feature for feature in FEATURE_FUNCTIONS.keys() for base in ['bulk', 'packet']) 27 | CONTINUOUS_NAMES += ('tcp_window_avg', ) 28 | 29 | CATEGORICAL_NAMES = ( 30 | 'found_tcp_flags', 31 | ) 32 | 33 | FEATURE_NAMES = CONTINUOUS_NAMES + CATEGORICAL_NAMES 34 | 35 | 36 | class FEATURE_PREFIX: 37 | client = 'client_' 38 | server = 'server_' 39 | 40 | 41 | @functools.lru_cache(maxsize=2) 42 | def create_empty_features(prefix: str, feature_list=FEATURE_NAMES) -> dict: 43 | return {prefix + feature: 0. for feature in feature_list} 44 | 45 | 46 | def _safe_vector_getter(vector, indexer) -> Union[int, float]: 47 | try: 48 | return vector[indexer] 49 | except IndexError: 50 | return np.nan 51 | 52 | 53 | def calc_parameter_stats(feature_slice, prefix, feature_name) -> dict: 54 | return {prefix + feature_name + feature: func(feature_slice) for feature, func in FEATURE_FUNCTIONS.items()} 55 | 56 | 57 | def inter_packet_times_from_timestamps(timestamps): 58 | if len(timestamps) == 0: 59 | return timestamps 60 | next_timestamps = np.roll(timestamps, 1) 61 | ipt = timestamps - next_timestamps 62 | ipt[0] = 0 63 | return ipt 64 | 65 | 66 | def generate_raw_feature_names(flow_size, base_features: Tuple[str] = ('packet', 'iat')) -> list: 67 | return [f'raw_{feature}{index}' 68 | for index in range(flow_size) 69 | for feature in base_features] 70 | 71 | 72 | def calc_raw_features(flow: NFlow) -> dict: 73 | """ selects PS and IPT features """ 74 | packet_limit = len(flow.splt_ps) 75 | features = dict.fromkeys(generate_raw_feature_names(packet_limit)) 76 | for index in range(packet_limit): 77 | ps = flow.splt_ps[index] 78 | ipt = flow.splt_piat_ms[index] 79 | 80 | if flow.splt_direction[index] == 1: 81 | ps = flow.splt_ps[index] * -1 82 | elif flow.splt_direction[index] == -1: 83 | ps = np.nan 84 | ipt = np.nan 85 | 86 | features['raw_packet' + str(index)] = ps 87 | features['raw_iat' + str(index)] = ipt 88 | 89 | return features 90 | 91 | 92 | def _calc_unidirectional_flow_features(flow: NFlow, direction_idxs, prefix='', features: Optional[list] = None) -> dict: 93 | # this asserts using of the listed features 94 | if features is None: 95 | features = create_empty_features(prefix) 96 | 97 | features.update(calc_parameter_stats(np.array(flow.splt_ps)[direction_idxs], prefix, 'packet')) 98 | 99 | features[prefix + 
'found_tcp_flags'] = sorted(set(flow.udps.tcp_flag[direction_idxs])) 100 | features[prefix + 'tcp_window_avg'] = np.mean(flow.udps.tcp_window[direction_idxs]) 101 | features.update(calc_parameter_stats(flow.udps.bulk[direction_idxs], prefix, 'bulk')) 102 | 103 | return features 104 | 105 | 106 | def calc_stat_features(flow: NFlow) -> dict: 107 | """ estimates derivative discriminative features for flow classification from: 108 | packet size, payload size, TCP window, TCP flag 109 | """ 110 | direction = np.array(flow.splt_direction) 111 | client_idxs = direction == 0 112 | server_idxs = direction == 1 113 | 114 | if client_idxs.sum() > 0: 115 | client_features = _calc_unidirectional_flow_features(flow, client_idxs, prefix=FEATURE_PREFIX.client) 116 | else: 117 | client_features = create_empty_features(prefix=FEATURE_PREFIX.client) 118 | 119 | if server_idxs.sum() > 0: 120 | server_features = _calc_unidirectional_flow_features(flow, server_idxs, prefix=FEATURE_PREFIX.server) 121 | else: 122 | server_features = create_empty_features(prefix=FEATURE_PREFIX.server) 123 | 124 | total_features = dict(**client_features, **server_features) 125 | return total_features 126 | -------------------------------------------------------------------------------- /gpt_model/generator/baseline/markov.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | from sklearn.cluster import KMeans 5 | from sklearn.preprocessing import normalize 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def _normalize_by_rows(x: np.array): 11 | safe_x = x.copy() 12 | safe_x[safe_x == np.inf] = 10e6 13 | return normalize(safe_x, axis=1, norm='l1') 14 | 15 | 16 | def _calc_transition_matrix(seq_matrix, state_numb): 17 | """ here the states are expected to be integers in [0, state_numb) """ 18 | # init with values close-to-zero for smoothing 19 | transition_matrix = np.ones((state_numb, state_numb)) * 1e-6 20 | for row_iter in range(seq_matrix.shape[0]): 21 | state_seq = seq_matrix[row_iter, :] 22 | # count number of each possible transition 23 | for t in range(len(state_seq) - 1): 24 | j = state_seq[t] 25 | k = state_seq[t + 1] 26 | transition_matrix[j, k] += 1 27 | 28 | norm_trans_matrix = _normalize_by_rows(transition_matrix) 29 | logger.info(f'estimated transition matrix for {norm_trans_matrix.shape[0]} states') 30 | return norm_trans_matrix 31 | 32 | 33 | def _calc_prior_probas(seq_matrix, state_numb): 34 | counts = np.zeros(state_numb) 35 | for state in range(state_numb): 36 | counts[state] = np.count_nonzero(seq_matrix[:, 0] == state) 37 | priors = counts / np.linalg.norm(counts, ord=1) 38 | logger.info('estimated vector of priors') 39 | return priors 40 | 41 | 42 | class BaseGenerator: 43 | def fit(self, X): 44 | raise NotImplementedError 45 | 46 | def sample(self, n_sequences): 47 | raise NotImplementedError 48 | 49 | 50 | class MarkovGenerator(BaseGenerator): 51 | def __init__(self): 52 | self.n_states = None 53 | self.transition_matrix = None 54 | self.init_priors = None 55 | self.index2value = {} 56 | self.value2index = {} 57 | self._seq_len = None 58 | self._states = None 59 | logger.info('init MarkovGenerator') 60 | 61 | def _map_values_to_indexes(self, X): 62 | orig_values = X.flatten() 63 | self.value2index = {value: index for index, value in enumerate(np.unique(orig_values))} 64 | self.index2value = {index: value for index, value in enumerate(np.unique(orig_values))} 65 | X_mapped = np.array([self.value2index[val] for val in 
orig_values]).reshape(-1, self._seq_len) 66 | return X_mapped 67 | 68 | def _map_indexes_to_values(self, X_mapped): 69 | mapped_values = X_mapped.flatten() 70 | X = np.array([self.index2value[val] for val in mapped_values]).reshape(-1, self._seq_len) 71 | return X 72 | 73 | def fit(self, X): 74 | self._seq_len = X.shape[1] 75 | n_states = np.unique(X).size 76 | self._states = np.arange(n_states) 77 | 78 | X_mapped = self._map_values_to_indexes(X) 79 | 80 | self.transition_matrix = _calc_transition_matrix(X_mapped, n_states) 81 | self.init_priors = _calc_prior_probas(X_mapped, n_states) 82 | return self 83 | 84 | def sample(self, n_sequences): 85 | assert n_sequences > 0 86 | logger.info(f'started generating {n_sequences} sequences') 87 | sampled_matrix = np.zeros((n_sequences, self._seq_len), dtype=int) 88 | for seq_index in range(n_sequences): 89 | sampled_matrix[seq_index, :] = self._sample_sequence() 90 | return self._map_indexes_to_values(sampled_matrix) 91 | 92 | def _sample_sequence(self): 93 | sampled = np.zeros(self._seq_len, dtype=int) 94 | sampled[0] = np.random.choice(self._states, p=self.init_priors) 95 | for index in range(1, self._seq_len): 96 | sampled[index] = np.random.choice(self._states, p=self.transition_matrix[sampled[index-1], :]) 97 | return sampled 98 | 99 | 100 | class MarkovQuantizedGenerator(BaseGenerator): 101 | def __init__(self, cluster_limit=200): 102 | self.cluster_limit = cluster_limit 103 | self.quantizer = None 104 | self.generator = MarkovGenerator() 105 | 106 | def _get_cluster_number(self, X): 107 | unique_points = np.unique(X).size 108 | cluster_number = self.cluster_limit if unique_points > self.cluster_limit else unique_points 109 | logger.info(f'selected {cluster_number} clusters for quantization') 110 | return cluster_number 111 | 112 | def fit(self, X): 113 | cluster_number = self._get_cluster_number(X) 114 | self.quantizer = KMeans(n_clusters=cluster_number) 115 | X_quantized = self.quantizer.fit_predict(X.flatten().reshape(-1, 1)).reshape(X.shape) 116 | logger.info('quantized input') 117 | self.generator.fit(X_quantized) 118 | 119 | def sample(self, n_sequences): 120 | X_gen = self.generator.sample(n_sequences) 121 | X_restored = self.quantizer.cluster_centers_[X_gen][:, :, 0] 122 | logger.info('dequantized output') 123 | return X_restored 124 | -------------------------------------------------------------------------------- /nn_classifiers/models.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import logging 4 | import torch 5 | from pytorch_lightning import LightningModule 6 | from torch.nn import functional as F 7 | from torch.optim.lr_scheduler import ReduceLROnPlateau 8 | from transformers.trainer_utils import set_seed 9 | 10 | from evaluation_utils.classification import Reporter 11 | from settings import RANDOM_SEED 12 | 13 | set_seed(RANDOM_SEED) 14 | logger = logging.getLogger(__file__) 15 | 16 | 17 | class BaseClassifier(LightningModule): 18 | def __init__(self, config, class_labels: Optional[List[str]], *args, **kwargs): 19 | super().__init__() 20 | self.hparams = config 21 | self.class_labels = class_labels 22 | self.output_dim = len(class_labels) 23 | 24 | def forward(self, x): 25 | return self.net(x) 26 | 27 | def training_step(self, batch, batch_idx): 28 | x, y = batch 29 | y_hat = self(x) 30 | loss = F.cross_entropy(y_hat, y) 31 | logs = {'train_loss': loss} 32 | return {'loss': loss, 'log': logs} 33 | 34 | def validation_step(self, batch, 
batch_idx): 35 | x, y = batch 36 | y_hat = self(x) 37 | return {'val_loss': F.cross_entropy(y_hat, y)} 38 | 39 | def validation_epoch_end(self, outputs): 40 | avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean() 41 | logs = {'val_loss': avg_loss} 42 | return {'val_loss': avg_loss, 'log': logs} 43 | 44 | def test_step(self, batch, batch_idx): 45 | x, y = batch 46 | y_hat = self(x) 47 | predictions = y_hat.max(axis=1)[1] 48 | loss = F.cross_entropy(y_hat, y) 49 | logs = {'test_loss': loss} 50 | return {'test_loss': loss, 51 | 'predictions': predictions.to('cpu'), 52 | 'targets': y.to('cpu'), 53 | 'log': logs} 54 | 55 | def test_epoch_end(self, outputs): 56 | avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean() 57 | predictions = torch.cat([x['predictions'] for x in outputs]).to('cpu').numpy() 58 | targets = torch.cat([x['targets'] for x in outputs]).to('cpu').numpy() 59 | rpt = Reporter(targets, predictions, self.__class__.__name__, target_classes=self.class_labels) 60 | self.logger.experiment.log_image('confusion_matrix', rpt.plot_conf_matrix()) 61 | 62 | report_file = f'report_{self.__class__.__name__}.csv' 63 | clf_report = rpt.clf_report(save_to=report_file) 64 | print(clf_report) 65 | self.logger.experiment.log_artifact((rpt.save_dir / report_file).as_posix()) 66 | 67 | logs = rpt.scores() 68 | logs.update({'test_loss': avg_loss}) 69 | return {'test_loss': avg_loss, 'log': logs} 70 | 71 | def configure_optimizers(self): 72 | optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate) 73 | scheduler = ReduceLROnPlateau(optimizer, patience=self.hparams.es_patience // 2) 74 | return [optimizer], [scheduler] 75 | 76 | 77 | class DenseClassifier(BaseClassifier): 78 | def __init__(self, config, class_labels, input_size, hidden_size=40, activation=torch.nn.LeakyReLU, dropout=0.1): 79 | super().__init__(config, class_labels) 80 | 81 | self.net = torch.nn.Sequential(torch.nn.Linear(input_size, hidden_size), 82 | activation(), 83 | torch.nn.Dropout(dropout), 84 | torch.nn.Linear(hidden_size, hidden_size), 85 | activation(), 86 | torch.nn.Dropout(dropout), 87 | torch.nn.Linear(hidden_size, hidden_size), 88 | activation(), 89 | torch.nn.Dropout(dropout), 90 | torch.nn.Linear(hidden_size, self.output_dim)) 91 | 92 | 93 | class BiGRUClassifier(BaseClassifier): 94 | def __init__(self, config, class_labels, input_size, num_layers=3, hidden_size=None, dropout=0.1, bidirectional=True): 95 | super().__init__(config, class_labels) 96 | 97 | if not hidden_size: 98 | hidden_size = self.output_dim 99 | 100 | self.gru = torch.nn.GRU(input_size, 101 | hidden_size, 102 | num_layers=num_layers, 103 | batch_first=True, 104 | dropout=dropout, 105 | bidirectional=bidirectional) 106 | 107 | self.activation = torch.nn.LeakyReLU() 108 | gru_out_size = 2*hidden_size if bidirectional else hidden_size 109 | self.layer_norm = torch.nn.LayerNorm(gru_out_size) 110 | self.fc = torch.nn.Linear(gru_out_size, self.output_dim) 111 | 112 | def forward(self, x): 113 | gru_out, hidden_state = self.gru(x.unsqueeze_(2)) 114 | out = self.activation(gru_out.max(axis=1)[0]) 115 | out = self.layer_norm(out) 116 | return self.fc(out) 117 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/preprocess_pretraining_pcaps.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import nfstream 4 | import pandas as pd 5 | import sh 6 | 7 | import settings 8 | from flow_parsing import 
parse_pcap_to_csv 9 | 10 | 11 | def parse_flow_sizes(pcap_folder, target_folder): 12 | 13 | for pcap_file in pcap_folder.glob('*.pcap'): 14 | print(f'parsing {pcap_file}') 15 | dest_file = target_folder / (pcap_file.stem + '.csv') 16 | 17 | streamer = nfstream.NFStreamer( 18 | source=pcap_file.as_posix(), 19 | statistical_analysis=True, 20 | idle_timeout=settings.IDLE_TIMEOUT, 21 | active_timeout=settings.ACTIVE_TIMEOUT_ONLINE, 22 | accounting_mode=1, # IP size, 23 | ) 24 | print(f'saving to {dest_file}') 25 | streamer.to_csv(path=dest_file) 26 | 27 | 28 | def parse_raw_features_from_pcaps(pcap_folder, target_folder): 29 | for pcap_file in pcap_folder.glob('*.pcap'): 30 | target_csv = target_folder / (pcap_file.stem + '.csv') 31 | if target_csv.exists(): 32 | continue 33 | print(f'started parsing file {pcap_file}') 34 | 35 | # raw_features are set via analysis of packet number distribution within sessions 36 | # @ mawi.wide.ad.jp/mawi/ditl/ditl2020/ pcaps, such that the limit is close to .99 percentile 37 | 38 | parse_pcap_to_csv(pcap_file.as_posix(), 39 | target_csv.as_posix(), 40 | derivative_features=False, 41 | raw_features=128, 42 | provide_labels=True) 43 | 44 | 45 | def record_session_lengths(target_folder): 46 | dfs = [] 47 | for csv in target_folder.glob('*.csv'): 48 | df = pd.read_csv(csv, usecols=['bidirectional_packets']) 49 | dfs.append(df) 50 | 51 | dfs = pd.concat(dfs, axis=0) 52 | counts = dfs.bidirectional_packets.value_counts() 53 | norm_counts = counts.sort_index().cumsum() / dfs.shape[0] 54 | norm_counts.to_json(target_folder.parent / 'pkt_len_norm_counts.json') 55 | 56 | norm_counts_no_1packet_flows = (counts.sort_index().cumsum() - counts[1]) / (dfs.shape[0] - counts[1]) 57 | norm_counts_no_1packet_flows.to_json(target_folder.parent / 'pkt_len_norm_counts_no_1_packet.json') 58 | 59 | 60 | def rm_icmp_from_pcaps(source_pcap_folder, target_pcap_folder): 61 | for source_pcap in source_pcap_folder.glob('*.pcap'): 62 | target_pcap = target_pcap_folder / (source_pcap.stem + 'no_icmp.pcap') 63 | exec = sh.Command('/usr/sbin/tcpdump') 64 | exec(['-r', source_pcap, 'not icmp', '-w', target_pcap]) 65 | 66 | 67 | def split_pcaps_into_smaller(source_folder, dest_folder, size_limit=2000): 68 | for source_pcap in source_folder.glob('*.pcap'): 69 | target_pcaps = dest_folder / source_pcap.stem 70 | exec = sh.Command('/usr/sbin/tcpdump') 71 | exec(['-r', source_pcap, '-w', target_pcaps, '-C', size_limit]) 72 | 73 | 74 | def pcapng_to_pcap(pcap_folder): 75 | for source_pcap in pcap_folder.glob('*.pcapng'): 76 | target_pcap = pcap_folder / (source_pcap.stem + '.pcap') 77 | exec = sh.Command('tshark') 78 | exec(['-F', 'pcap', '-r', source_pcap, '-w', target_pcap]) 79 | 80 | 81 | def add_pcap_suffix(folder): 82 | for file in folder.glob('*'): 83 | file.replace(file.parent / (file.stem + '.pcap')) 84 | 85 | 86 | def uncompress_and_split_pcaps(source_folder, target_folder): 87 | """ 88 | bash script: 89 | 90 | for f in *.gz; do 91 | STEM_with_pcap=$(basename "${f}" .gz) 92 | STEM=$(basename "${STEM_with_pcap}" .pcap) 93 | # gunzip -c "${f}" > /media/raid_store/pretrained_traffic/mawi_pcaps/"${STEM}" 94 | gunzip -c "${f}" | tcpdump -w /media/raid_store/pretrained_traffic/mawi_pcaps/"${STEM}" -C 2000 -r - 95 | done 96 | 97 | :param folder: 98 | :return: 99 | """ 100 | gunzip = sh.Command('gunzip') 101 | tcpdump = sh.Command('tcpdump') 102 | target_folder = pathlib.Path(target_folder) 103 | for file in source_folder.glob('*.gz'): 104 | stem = file.stem.split('.pcap')[0] 105 | target = 
target_folder / stem 106 | # not tested :) see https://amoffat.github.io/sh/sections/piping.html#piping 107 | tcpdump(gunzip('-c', file), '-w', target, '-C', 2000, '-r', '-') 108 | 109 | 110 | if __name__ == '__main__': 111 | source_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/separated_iot_pcaps') 112 | 113 | # no_icmp_pcaps = pathlib.Path('/media/raid_store/pretrained_traffic/MAWI_no_icmp') 114 | # rm_icmp_from_pcaps(source_pcap_folder, no_icmp_pcaps) 115 | 116 | # clean_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/pcaps') 117 | # clean_pcap_folder.mkdir(exist_ok=True) 118 | # split_pcaps_into_smaller(clean_pcap_folder, split_pcap_folder, 2000) 119 | 120 | # split_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/ISCXVPN2016') 121 | # split_pcap_folder = pathlib.Path('/media/raid_store/pretrained_traffic/pcaps') 122 | # split_pcap_folder.mkdir(exist_ok=True) 123 | 124 | # add_pcap_suffix(source_pcap_folder) 125 | # parse_flow_sizes(split_pcap_folder, target_csv_folder_w_lengths) 126 | 127 | # target_csv_folder_w_lengths = pathlib.Path('/media/raid_store/pretrained_traffic/raw_csv_len') 128 | # target_csv_folder_w_lengths.mkdir(exist_ok=True) 129 | # record_session_lengths(target_csv_folder_w_lengths.parent) 130 | 131 | target_csv_folder = pathlib.Path('/media/raid_store/pretrained_traffic/raw_csv_iot_devices') 132 | target_csv_folder.mkdir(exist_ok=True) 133 | 134 | # pcapng_to_pcap(split_pcap_folder) 135 | 136 | parse_raw_features_from_pcaps(source_pcap_folder, target_csv_folder) 137 | -------------------------------------------------------------------------------- /gpt_model/classifier/train_classifier.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | from pytorch_lightning import Trainer 6 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint 7 | from pytorch_lightning.callbacks import LearningRateLogger 8 | from pytorch_lightning.loggers import NeptuneLogger 9 | from torch.utils.data import DataLoader 10 | from torch.utils.data import random_split 11 | 12 | from gpt_model.classifier.model import GPT2Classifier 13 | from gpt_model.classifier.dataset import ClassificationQuantizedDataset 14 | from gpt_model.tokenizer import PacketTokenizer 15 | from settings import BASE_DIR, DEFAULT_PACKET_LIMIT_PER_FLOW, NEPTUNE_PROJECT 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument( 21 | '--train_dataset', 22 | help='path to preprocessed .csv dataset', 23 | ) 24 | parser.add_argument( 25 | '--test_dataset', 26 | help='path to preprocessed .csv dataset', 27 | ) 28 | parser.add_argument( 29 | '--pretrained_path', 30 | ) 31 | parser.add_argument( 32 | '--freeze_pretrained_model', 33 | action='store_true', 34 | default=False, 35 | ) 36 | parser.add_argument( 37 | '--mask_first_token', 38 | action='store_true', 39 | default=False, 40 | ) 41 | parser.add_argument( 42 | '--batch_size', 43 | default=256, 44 | ) 45 | parser.add_argument( 46 | '--es_patience', 47 | default=5, 48 | type=int, 49 | ) 50 | parser.add_argument( 51 | '--learning_rate', 52 | default=None 53 | ) 54 | parser.add_argument( 55 | '--fc_dropout', 56 | default=0.0, 57 | ) 58 | parser.add_argument( 59 | '--reinitialize', 60 | action='store_true', 61 | default=False 62 | ) 63 | parser.add_argument( 64 | '--n_layers', 65 | default=6, 66 | type=int, 67 | help='number of transformer layers to use, only in use when --reinitialize is 
provided' 68 | ) 69 | parser.add_argument( 70 | '--log_neptune', 71 | dest='log_neptune', 72 | action='store_true', 73 | default=False 74 | ) 75 | parser.add_argument( 76 | '--neptune_experiment_name', 77 | dest='neptune_experiment_name', 78 | default='gpt2_class_pretrained' 79 | ) 80 | 81 | args = parser.parse_args() 82 | if args.learning_rate is None: 83 | args.learning_rate = 0.0005 if args.freeze_pretrained_model else 0.00002 84 | 85 | print(args) 86 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 87 | 88 | tokenizer = PacketTokenizer.from_pretrained(args.pretrained_path, flow_size=DEFAULT_PACKET_LIMIT_PER_FLOW) 89 | 90 | train_val_dataset = ClassificationQuantizedDataset(tokenizer, 91 | dataset_path=args.train_dataset) 92 | train_part_len = int(len(train_val_dataset) * 0.9) 93 | train_dataset, val_dataset = random_split(train_val_dataset, 94 | [train_part_len, len(train_val_dataset) - train_part_len]) 95 | 96 | test_dataset = ClassificationQuantizedDataset(tokenizer, 97 | dataset_path=args.test_dataset, 98 | label_encoder=train_val_dataset.target_encoder) 99 | 100 | collator = ClassificationQuantizedDataset.get_collator(mask_first_token=args.mask_first_token) 101 | 102 | cpu_counter = os.cpu_count() 103 | train_dataloader = DataLoader(train_dataset, 104 | batch_size=args.batch_size, 105 | drop_last=False, 106 | shuffle=False, 107 | collate_fn=collator, 108 | num_workers=cpu_counter) 109 | 110 | val_dataloader = DataLoader(val_dataset, 111 | batch_size=args.batch_size, 112 | drop_last=False, 113 | shuffle=False, 114 | collate_fn=collator, 115 | num_workers=cpu_counter 116 | ) 117 | 118 | test_dataloader = DataLoader(test_dataset, 119 | batch_size=args.batch_size, 120 | drop_last=False, 121 | collate_fn=collator, 122 | num_workers=cpu_counter) 123 | 124 | class_labels = train_val_dataset.target_encoder.classes_ 125 | 126 | nn_classifier = GPT2Classifier( 127 | args, 128 | class_labels, 129 | pretrained_model_path=args.pretrained_path, 130 | dropout=args.fc_dropout, 131 | freeze_pretrained_part=args.freeze_pretrained_model, 132 | reinitialize=args.reinitialize, 133 | n_layers=args.n_layers 134 | ) 135 | 136 | early_stop_callback = EarlyStopping( 137 | monitor='val_loss', 138 | min_delta=1e-4, 139 | patience=args.es_patience, 140 | verbose=False, 141 | mode='min' 142 | ) 143 | 144 | logger = NeptuneLogger( 145 | offline_mode=not args.log_neptune, 146 | close_after_fit=False, 147 | project_name=NEPTUNE_PROJECT, 148 | experiment_name=args.neptune_experiment_name, 149 | params=vars(args), 150 | upload_source_files=[(BASE_DIR / 'gpt_model/classifier/model.py').as_posix()] 151 | ) 152 | 153 | checkpoint_dir = f'{nn_classifier.__class__.__name__}_checkpoints' 154 | model_checkpoint = ModelCheckpoint( 155 | filepath=checkpoint_dir + '/{epoch}-{val_loss:.2f}-{other_metric:.2f}' 156 | ) 157 | 158 | trainer = Trainer( 159 | early_stop_callback=early_stop_callback, 160 | callbacks=[LearningRateLogger()], 161 | checkpoint_callback=model_checkpoint, 162 | auto_lr_find=False, 163 | logger=logger, 164 | gpus=int(device == 'cuda'), 165 | ) 166 | 167 | trainer.fit(nn_classifier, train_dataloader, val_dataloader) 168 | trainer.test(nn_classifier, test_dataloader) 169 | logger.experiment.log_artifact(model_checkpoint.best_model_path) 170 | logger.experiment.stop() 171 | 172 | 173 | if __name__ == '__main__': 174 | main() 175 | -------------------------------------------------------------------------------- /gpt_model/generator/run_generating.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from transformers import GPT2LMHeadModel 9 | 10 | from flow_parsing import save_dataset 11 | from evaluation_utils.modeling import evaluate_generated_traffic, save_metrics 12 | from gpt_model.generator.dataset import load_modeling_data_with_classes 13 | from gpt_model.generator.baseline import MarkovGenerator 14 | from gpt_model.tokenizer import PacketTokenizer 15 | from settings import FilePatterns, REPORT_DIR 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def generate_packets(protocol, n_samples, model: GPT2LMHeadModel, tokenizer, device='cpu', batch_limit=1024): 21 | logger.info(f'generating {n_samples} flows of "{protocol}"...') 22 | 23 | generated_flows = [] 24 | tokens_to_sample = [batch_limit] * (n_samples // batch_limit) 25 | if n_samples % batch_limit != 0: 26 | # add the remainder 27 | tokens_to_sample += [n_samples % batch_limit] 28 | 29 | counter = 0 30 | for batch_size in tokens_to_sample: 31 | input_ids = torch.tensor([tokenizer.tokens_to_ids[protocol]] * batch_size, dtype=torch.long 32 | ).view(batch_size, -1).to(device) 33 | 34 | # no_repeat_ngram_size=1 is a dirty hack to fix duplicating pairs for 2-packet protocols 35 | out = model.generate( 36 | input_ids, 37 | eos_token_id=tokenizer.eos_token_id, 38 | pad_token_id=tokenizer.pad_token_id, 39 | max_length=128, 40 | do_sample=True, 41 | num_return_sequences=1, 42 | top_k=len(tokenizer), 43 | no_repeat_ngram_size=int(protocol in ['DNS', 'NTP']), 44 | use_cache=True, 45 | ).cpu() 46 | torch.cuda.empty_cache() 47 | packets = tokenizer.batch_decode_packets(out) 48 | generated_flows.append(packets) 49 | counter += batch_size 50 | logger.info(f'generated {counter} flows') 51 | 52 | target_dim_size = max(x.shape[1] for x in generated_flows) 53 | # pad arrays to equal out their 2nd dim 54 | generated_flows = list(map(lambda x: np.pad(x, ((0, 0), (0, target_dim_size - x.shape[1])), constant_values=np.nan), 55 | generated_flows)) 56 | generated_flows = np.concatenate(generated_flows, axis=0) 57 | return generated_flows 58 | 59 | 60 | def main(): 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument( 63 | '--source_dataset', 64 | help='path to preprocessed .csv dataset', 65 | default='/media/raid_store/pretrained_traffic/train_csv' 66 | ) 67 | parser.add_argument( 68 | '--pretrained_path', 69 | default='/media/raid_store/pretrained_traffic/gpt2_model_4_6epochs_classes_home_iot' 70 | ) 71 | parser.add_argument( 72 | '--flow_limit_per_app', 73 | default=20000, 74 | type=int, 75 | ) 76 | parser.add_argument( 77 | '--filename_patterns_to_exclude', 78 | default='mawi', 79 | help='see settings.py::FilePatterns for the options' 80 | ) 81 | parser.add_argument( 82 | '--evaluate', 83 | action='store_true', 84 | default=False, 85 | ) 86 | 87 | parser.add_argument( 88 | '--markov_model', 89 | action='store_true', 90 | default=False, 91 | ) 92 | 93 | args = parser.parse_args() 94 | filename_patterns_to_exclude = getattr(FilePatterns, args.filename_patterns_to_exclude) 95 | source_dataset_folder = pathlib.Path(args.source_dataset) 96 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 97 | 98 | all_source_flows, classes = load_modeling_data_with_classes( 99 | source_dataset_folder, 100 | filename_patterns_to_exclude=filename_patterns_to_exclude 101 | ) 102 | source_class_counts = classes.value_counts() 103 | 
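    # generation below is class-conditioned: for GPT-2 the class token id seeds
    # model.generate(), while the Markov baseline is re-fitted on the source flows of
    # each class; per-class sample counts follow the source class counts, capped by
    # --flow_limit_per_app.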
104 | pretrained_path = pathlib.Path(args.pretrained_path) 105 | tokenizer = PacketTokenizer.from_pretrained(pretrained_path) 106 | if not args.markov_model: 107 | model = GPT2LMHeadModel.from_pretrained(pretrained_path).to(device) 108 | 109 | generated_flows_path = pretrained_path.parent / ('generated_flows_' + pretrained_path.stem) 110 | if args.markov_model: 111 | generated_flows_path = generated_flows_path.parent / (generated_flows_path.name + '_markov') 112 | generated_flows_path.mkdir(exist_ok=True) 113 | metrics = {} 114 | for proto in tokenizer.tokens_to_ids.keys(): 115 | # skip special tokens 116 | if proto.startswith('['): 117 | continue 118 | try: 119 | source_class_count = source_class_counts[proto] 120 | except KeyError: 121 | logger.error(f'could not find target class "{proto}" in dataset, skipping') 122 | continue 123 | 124 | n_flows_to_generate = source_class_count \ 125 | if source_class_count < args.flow_limit_per_app \ 126 | else args.flow_limit_per_app 127 | 128 | src_flows = all_source_flows[classes == proto] 129 | 130 | if args.markov_model: 131 | markov = MarkovGenerator() 132 | X = tokenizer.batch_encode_packets(src_flows.values.astype(np.float64), 133 | target_class=proto, 134 | add_special_tokens=True, 135 | return_attention_mask=False, 136 | return_tensors='np')['input_ids'] 137 | 138 | markov.fit(X) 139 | gen_tokens = markov.sample(n_flows_to_generate) 140 | gen_flows = tokenizer.batch_decode_packets(gen_tokens) 141 | else: 142 | gen_flows = generate_packets(proto, n_flows_to_generate, model, tokenizer, device) 143 | 144 | gen_flows = pd.DataFrame(gen_flows, columns=tokenizer.packet_quantizer.raw_columns[:gen_flows.shape[1]]) 145 | save_dataset(gen_flows, save_to=generated_flows_path / f'{proto}.csv') 146 | 147 | if args.evaluate: 148 | results = evaluate_generated_traffic(src_flows.values, gen_flows.values) 149 | metrics[proto] = results 150 | if args.evaluate: 151 | save_metrics(metrics, REPORT_DIR / ('report_' + generated_flows_path.stem + '.csv')) 152 | 153 | 154 | if __name__ == '__main__': 155 | main() 156 | -------------------------------------------------------------------------------- /fs_net/train_fsnet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | from functools import partial 5 | from pprint import pprint 6 | 7 | import torch 8 | from pytorch_lightning import Trainer 9 | from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateLogger, EarlyStopping 10 | from pytorch_lightning.loggers import NeptuneLogger 11 | from torch.utils.data import DataLoader, random_split 12 | 13 | from gpt_model.tokenizer import PacketTokenizer 14 | from fs_net.dataset import SimpleClassificationQuantizedDataset, ClassificationPacketSizeDataset 15 | from fs_net.model import FSNETClassifier 16 | from settings import BASE_DIR, DEFAULT_PACKET_LIMIT_PER_FLOW, NEPTUNE_PROJECT, TARGET_CLASS_COLUMN 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def _parse_args(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | '--train_dataset', 25 | help='path to preprocessed .csv dataset', 26 | required=True 27 | ) 28 | parser.add_argument( 29 | '--test_dataset', 30 | help='path to preprocessed .csv dataset', 31 | ) 32 | parser.add_argument( 33 | '--target_column', 34 | help='column within the .csv denoting target variable', 35 | default=TARGET_CLASS_COLUMN 36 | ) 37 | parser.add_argument( 38 | "--packet_num", 39 | dest='packet_num', 40 | type=int, 41 | 
help="specify the first N packets to use for classification, " 42 | "defaults to settings.py:DEFAULT_PACKET_LIMIT_PER_FLOW,", 43 | default=DEFAULT_PACKET_LIMIT_PER_FLOW 44 | ) 45 | parser.add_argument( 46 | "--use_packet_size_only", 47 | dest='use_packet_size_only', 48 | action='store_true', 49 | help="set to use only (truncated) packet size sequences instead of quantized (PS, IPT)", 50 | default=False 51 | ) 52 | parser.add_argument( 53 | "--dynamic_ps_range", 54 | dest='dynamic_ps_range', 55 | help="dynamic range for PS parameter which implicitly sets Embedding layer dim, effective only along" 56 | "with --use_packet_size_only option", 57 | type=int, 58 | default=5000 59 | ) 60 | parser.add_argument( 61 | '--tokenizer_path', 62 | help='path to the tokenizer checkpoint, defaults to the one used for tests ooops :)', 63 | default=BASE_DIR / 'tests/static/quantizer_checkpoint' 64 | ) 65 | parser.add_argument( 66 | '--neptune_experiment_name', 67 | dest='neptune_experiment_name', 68 | default='FS-NET' 69 | ) 70 | parser.add_argument( 71 | '--log_neptune', 72 | dest='log_neptune', 73 | action='store_true', 74 | default=False 75 | ) 76 | parser.add_argument( 77 | '--learning_rate', 78 | default=0.0005 79 | ) 80 | parser.add_argument( 81 | '--batch_size', 82 | default=256, 83 | ) 84 | parser.add_argument( 85 | '--es_patience', 86 | default=5, 87 | type=int, 88 | ) 89 | args = parser.parse_args() 90 | return args 91 | 92 | 93 | def main(): 94 | args = _parse_args() 95 | pprint(args) 96 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 97 | cpu_counter = os.cpu_count() 98 | 99 | if args.use_packet_size_only: 100 | n_tokens = args.dynamic_ps_range 101 | ds_class = partial(ClassificationPacketSizeDataset, max_size_range=n_tokens) 102 | else: 103 | tokenizer = PacketTokenizer.from_pretrained(args.tokenizer_path, 104 | flow_size=args.packet_num) 105 | n_tokens = len(tokenizer) 106 | ds_class = partial(SimpleClassificationQuantizedDataset, tokenizer=tokenizer) 107 | 108 | train_val_dataset = ds_class(dataset_path=args.train_dataset, 109 | target_column=args.target_column) 110 | train_part_len = int(len(train_val_dataset) * 0.9) 111 | train_dataset, val_dataset = random_split(train_val_dataset, 112 | [train_part_len, len(train_val_dataset) - train_part_len]) 113 | 114 | test_dataset = ds_class(dataset_path=args.test_dataset, 115 | label_encoder=train_val_dataset.target_encoder, 116 | target_column=args.target_column) 117 | 118 | train_dataloader = DataLoader(train_dataset, 119 | batch_size=args.batch_size, 120 | drop_last=False, 121 | shuffle=False, 122 | num_workers=cpu_counter) 123 | 124 | val_dataloader = DataLoader(val_dataset, 125 | batch_size=args.batch_size, 126 | drop_last=False, 127 | shuffle=False, 128 | num_workers=cpu_counter) 129 | 130 | test_dataloader = DataLoader(test_dataset, 131 | batch_size=args.batch_size, 132 | drop_last=False, 133 | num_workers=cpu_counter) 134 | 135 | class_labels = train_val_dataset.target_encoder.classes_ 136 | 137 | nn_classifier = FSNETClassifier(args, class_labels=class_labels, n_tokens=n_tokens) 138 | 139 | early_stop_callback = EarlyStopping( 140 | monitor='val_loss', 141 | min_delta=1e-4, 142 | patience=args.es_patience, 143 | verbose=False, 144 | mode='min' 145 | ) 146 | 147 | exp_logger = NeptuneLogger( 148 | offline_mode=not args.log_neptune, 149 | close_after_fit=False, 150 | project_name=NEPTUNE_PROJECT, 151 | experiment_name=args.neptune_experiment_name, 152 | params=vars(args), 153 | upload_source_files=[(BASE_DIR / 
'fs_net/model.py').as_posix()] 154 | ) 155 | 156 | checkpoint_dir = f'{nn_classifier.__class__.__name__}_checkpoints' 157 | model_checkpoint = ModelCheckpoint( 158 | filepath=checkpoint_dir + '/{epoch}-{val_loss:.2f}-{other_metric:.2f}' 159 | ) 160 | 161 | trainer = Trainer( 162 | early_stop_callback=early_stop_callback, 163 | callbacks=[LearningRateLogger()], 164 | checkpoint_callback=model_checkpoint, 165 | auto_lr_find=False, 166 | logger=exp_logger, 167 | gpus=int(device == 'cuda'), 168 | ) 169 | 170 | trainer.fit(nn_classifier, train_dataloader, val_dataloader) 171 | trainer.test(nn_classifier, test_dataloader) 172 | exp_logger.experiment.log_artifact(model_checkpoint.best_model_path) 173 | exp_logger.experiment.stop() 174 | 175 | 176 | if __name__ == '__main__': 177 | main() 178 | -------------------------------------------------------------------------------- /sklearn_classifiers/run_training.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import neptune 5 | from sklearn.model_selection import train_test_split 6 | from flow_parsing import read_dataset 7 | from evaluation_utils.classification import Reporter 8 | from sklearn_classifiers.featurizer import Featurizer, TransformerFeatureExtractor 9 | from sklearn_classifiers.clf_utils import read_classifier_settings, initialize_classifiers, fit_optimal_classifier 10 | from settings import BASE_DIR, DEFAULT_PACKET_LIMIT_PER_FLOW, NEPTUNE_PROJECT, TARGET_CLASS_COLUMN, RANDOM_SEED 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def _parse_args(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "-c", "--config", 19 | help="configuration file, defaults to config.yaml", 20 | default=BASE_DIR / 'sklearn_classifiers/config.yaml') 21 | 22 | parser.add_argument( 23 | '--train_dataset', 24 | help='path to preprocessed .csv dataset', 25 | required=True 26 | ) 27 | parser.add_argument( 28 | '--test_dataset', 29 | help='path to preprocessed .csv dataset, if not specified, 1/4 of the training dataset is selected in ' 30 | 'stratified manner', 31 | ) 32 | parser.add_argument( 33 | '--target_column', 34 | help='column within the .csv denoting target variable', 35 | default=TARGET_CLASS_COLUMN 36 | ) 37 | parser.add_argument( 38 | "--packet_num", 39 | dest='packet_num', 40 | type=int, 41 | help="specify the first N packets to use for classification, " 42 | "defaults to settings.py:DEFAULT_PACKET_LIMIT_PER_FLOW,", 43 | default=DEFAULT_PACKET_LIMIT_PER_FLOW 44 | ) 45 | parser.add_argument( 46 | '--continuous', 47 | dest='continuous', 48 | action='store_true', 49 | help="when enabled, continuous derivative features from dataset are accounted for, " 50 | "e.g. percentiles, sums, etc. of packet size. Defaults to False", 51 | default=False 52 | ) 53 | parser.add_argument( 54 | '--categorical', 55 | dest='categorical', 56 | action='store_true', 57 | help="when enabled, categorical feature from dataset are accounted for, " 58 | "e.g. IP protocol. 
Defaults to False", 59 | default=False 60 | ) 61 | parser.add_argument( 62 | "--raw", 63 | dest='raw', 64 | action='store_true', 65 | help="when enabled, raw packet sequences are used for classification", 66 | default=False 67 | ) 68 | parser.add_argument( 69 | '--use_iat', 70 | help='set to use inter-packet time features, as raw features and/or their derivatives', 71 | action='store_true', 72 | default=False 73 | ) 74 | parser.add_argument( 75 | '--transformer_model_path', 76 | help='path to the pretrained transformer, if specified, shadows other feature-related arguments except' 77 | 'for the number of packets to use' 78 | ) 79 | parser.add_argument( 80 | '--mask_first_token', 81 | help='masks first sequence token when extracting features from transformer model, useful when the model was' 82 | 'pretrained with class-specific first tokens', 83 | action='store_true', 84 | default=False 85 | ) 86 | parser.add_argument( 87 | '--reinitialize', 88 | action='store_true', 89 | default=False 90 | ) 91 | 92 | parser.add_argument('--search_hyper_parameters', dest='search_hyper_parameters', action='store_true', default=False) 93 | 94 | parser.add_argument('--log_neptune', dest='log_neptune', action='store_true', default=False) 95 | args = parser.parse_args() 96 | return args 97 | 98 | 99 | def main(): 100 | """ basic training loop example """ 101 | args = _parse_args() 102 | 103 | logger.info('Loading csv file..') 104 | 105 | df_train = read_dataset(args.train_dataset, fill_na=True) 106 | if args.test_dataset: 107 | df_test = read_dataset(args.test_dataset, fill_na=True) 108 | else: 109 | df_train, df_test = train_test_split(df_train, 110 | stratify=df_train[args.target_column], 111 | test_size=1 / 4, 112 | random_state=RANDOM_SEED) 113 | 114 | if args.transformer_model_path: 115 | featurizer = TransformerFeatureExtractor( 116 | args.transformer_model_path, 117 | args.packet_num, 118 | mask_first_token=args.mask_first_token, 119 | reinitialize=args.reinitialize 120 | ) 121 | else: 122 | featurizer = Featurizer( 123 | packet_num=args.packet_num, 124 | cont_features=None if args.continuous else [], 125 | categorical_features=None if args.categorical else [], 126 | consider_raw_features=args.raw, 127 | consider_j3a=False, 128 | consider_tcp_flags=False, 129 | consider_iat_features=args.use_iat, 130 | target_column=args.target_column, 131 | ) 132 | 133 | X_train, y_train = featurizer.fit_transform_encode(df_train) 134 | X_test, y_test = featurizer.transform_encode(df_test) 135 | 136 | classifier_settings = read_classifier_settings(args.config) 137 | clfs = initialize_classifiers(classifier_settings) 138 | 139 | for model_name, model_holder in clfs.items(): 140 | if args.search_hyper_parameters: 141 | fit_optimal_classifier(model_holder, X_train, y_train) 142 | else: 143 | model_holder.classifier.fit(X_train, y_train) 144 | y_pred = model_holder.classifier.predict(X_test) 145 | reporter = Reporter(y_test, y_pred, model_holder.name, featurizer.target_encoder.classes_) 146 | 147 | report_file = f'report_{model_holder.name}.csv' 148 | report = reporter.clf_report(save_to=report_file) 149 | print(report) 150 | 151 | if args.log_neptune: 152 | neptune.init(NEPTUNE_PROJECT) 153 | parameters = vars(args) 154 | parameters.update({'classifier': model_name}) 155 | parameters.update(model_holder.classifier.get_params(deep=False)) 156 | 157 | neptune.create_experiment(name='sklearn', params=parameters) 158 | neptune.log_artifact((reporter.save_dir / report_file).as_posix()) 159 | 
neptune.log_image('confusion_matrix', reporter.plot_conf_matrix()) 160 | for metric_name, metric_value in reporter.scores().items(): 161 | neptune.log_metric(metric_name, metric_value) 162 | 163 | neptune.stop() 164 | 165 | 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /flow_parsing/pcap_parser.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import csv 3 | import logging 4 | from typing import Optional 5 | 6 | import nfstream 7 | import pandas as pd 8 | 9 | import settings 10 | from flow_parsing.features import calc_raw_features, calc_stat_features 11 | from flow_parsing.aux_raw_features_plugin import AuxRawFeatures 12 | 13 | logger = logging.getLogger('flow_parser') 14 | 15 | 16 | def init_streamer(source, 17 | derivative_features: bool, 18 | online_mode: bool = False, 19 | packet_limit: int = settings.DEFAULT_PACKET_LIMIT_PER_FLOW): 20 | # since we decide and set routing policy upon first occurrence of a flow we don't care about its re-export 21 | active_timeout = settings.ACTIVE_TIMEOUT_ONLINE if online_mode else settings.ACTIVE_TIMEOUT_OFFLINE 22 | plugins = [AuxRawFeatures(packet_limit=packet_limit)] if derivative_features else [] 23 | logger.info(f'mode set to {"online" if online_mode else "offline"}') 24 | 25 | return nfstream.NFStreamer( 26 | source=source, 27 | statistical_analysis=False, 28 | idle_timeout=settings.IDLE_TIMEOUT, 29 | active_timeout=active_timeout, 30 | splt_analysis=packet_limit, 31 | accounting_mode=1, # IP size, 32 | udps=plugins, 33 | ) 34 | 35 | 36 | def get_ip_protocol_by_int(proto: int) -> str: 37 | try: 38 | return settings.IP_PROTO_MAPPING[proto] 39 | except KeyError: 40 | logger.warning(f'encountered unknown IP proto number: {proto}') 41 | return 'UNKNOWN' 42 | 43 | 44 | def flow_processor(source, 45 | derivative_features: bool = True, 46 | raw_features: Optional[int] = None, 47 | provide_labels=True, 48 | online_mode=True 49 | ) -> dict: 50 | def _make_flow_id(): 51 | return f'{get_ip_protocol_by_int(entry.protocol)} ' \ 52 | f'{entry.src_ip}:{entry.src_port} ' \ 53 | f'{entry.dst_ip}:{entry.dst_port}' 54 | 55 | streamer = init_streamer( 56 | source, 57 | derivative_features, 58 | online_mode=online_mode, 59 | packet_limit=raw_features if raw_features is not None else settings.DEFAULT_PACKET_LIMIT_PER_FLOW 60 | ) 61 | for flow_number, entry in enumerate(streamer): 62 | flow_ids = { 63 | 'flow_id': _make_flow_id(), 64 | 'ip_proto': get_ip_protocol_by_int(entry.protocol)} 65 | 66 | ndpi_features = { 67 | 'ndpi_app': entry.application_name, 68 | 'ndpi_category': entry.application_category_name, 69 | 'ndpi_client_info': entry.user_agent, 70 | 'ndpi_server_info': entry.requested_server_name, 71 | 'ndpi_j3ac': entry.client_fingerprint, 72 | 'ndpi_j3as': entry.server_fingerprint, 73 | } if provide_labels else {} 74 | 75 | raw_packets = calc_raw_features(entry) if raw_features else {} 76 | 77 | flow_features = calc_stat_features(entry) if derivative_features else {} 78 | 79 | if flow_number > 0 == flow_number % 5000: 80 | logger.info(f'processed {flow_number} flows...') 81 | yield dict(**flow_ids, **ndpi_features, **flow_features, **raw_packets) 82 | 83 | 84 | def parse_pcap_to_csv(pcap_file_path, 85 | target_csv_path, 86 | derivative_features: bool = True, 87 | raw_features: Optional[int] = None, 88 | provide_labels=True, 89 | online_mode=True): 90 | logger.info(f'started parsing file {pcap_file_path}') 91 | 
logger.info(f'saving to {target_csv_path}') 92 | with open(target_csv_path, 'w', newline='') as f: 93 | writer = csv.writer(f) 94 | for index, flow in enumerate(flow_processor(pcap_file_path, 95 | derivative_features=derivative_features, 96 | raw_features=raw_features, 97 | provide_labels=provide_labels, 98 | online_mode=online_mode)): 99 | if index == 0: 100 | writer.writerow(flow.keys()) 101 | writer.writerow(flow.values()) 102 | 103 | 104 | def parse_pcap_to_dataframe(pcap_file: str, 105 | derivative_features: bool = True, 106 | raw_features: Optional[int] = None, 107 | provide_labels=True, 108 | online_mode=True) -> pd.DataFrame: 109 | flows = [] 110 | logger.info(f'started parsing file {pcap_file}') 111 | for flow in flow_processor(pcap_file, 112 | derivative_features=derivative_features, 113 | raw_features=raw_features, 114 | provide_labels=provide_labels, 115 | online_mode=online_mode): 116 | flows.append(flow) 117 | return pd.DataFrame(flows) 118 | 119 | 120 | def _get_output_csv_filename(args) -> str: 121 | core_name = args.pcapfile.split('/')[-1].split('.')[0] 122 | if args.raw: 123 | core_name = core_name + '_raw' 124 | pkt_lim = args.raw if args.raw else settings.DEFAULT_PACKET_LIMIT_PER_FLOW 125 | output_csv = settings.PCAP_OUTPUT_DIR / f'{core_name}_{pkt_lim}packets.csv' 126 | return output_csv 127 | 128 | 129 | def main(): 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument( 132 | "-p", "--pcapfile", 133 | help="pcap file", 134 | default=(settings.BASE_DIR / 'flow_parsing/static/example.pcap').as_posix(), 135 | ) 136 | parser.add_argument( 137 | "-o", "--output", 138 | help="output .csv file destination", 139 | ) 140 | 141 | parser.add_argument( 142 | "--raw", 143 | dest='raw', 144 | type=int, 145 | help="when provided, in addition to feature statistics, specified N number of raw features " 146 | "(packet lengths and IATs) for first N packets are exported, which are used by traffic augmenters/models.", 147 | default=None 148 | ) 149 | parser.add_argument('--derivative', dest='derivative', action='store_true', 150 | help="when enabled, derivative feature statistics " 151 | "(e.g. such as percentiles, sums, etc. of packet size) " 152 | "of first DEFAULT_PACKET_LIMIT_PER_FLOW or provided via arg '--raw' packets are exported") 153 | parser.add_argument('--no-derivative', dest='derivative', action='store_false') 154 | parser.set_defaults(derivative=True) 155 | 156 | parser.add_argument('--online_mode', dest='online_mode', action='store_true', 157 | help="when enabled, active flow expiration timeout is decreased to the one defined in settings." 
158 | "In offline mode, active timeout is set to be large " 159 | "enough to avoid flow fragmentation", 160 | default=False) 161 | 162 | args = parser.parse_args() 163 | 164 | output_csv = args.output if args.output else _get_output_csv_filename(args) 165 | parse_pcap_to_csv(args.pcapfile, 166 | target_csv_path=output_csv, 167 | derivative_features=args.derivative, 168 | raw_features=args.raw, 169 | online_mode=args.online_mode) 170 | 171 | 172 | if __name__ == '__main__': 173 | main() 174 | -------------------------------------------------------------------------------- /evaluation_utils/modeling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from functools import partial 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | import scipy 8 | 9 | from flow_parsing.features import inter_packet_times_from_timestamps 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def flows_to_packets(flows): 15 | return flows[~np.isnan(flows)].reshape(-1, 2) 16 | 17 | 18 | def convert_ipt_to_iat(flows): 19 | """ converts inter-packet time (IPT - timing between 2 any packets) to 20 | inter-arrival time (IAT - timing between 2 consecutive packets within 1 direction) """ 21 | 22 | def ipt_to_iat(flow): 23 | """ 24 | :param flow: source flow of size (packet_num, feature_num=2) 25 | :return: 26 | """ 27 | timestamps_like = np.cumsum(flow[:, 1]) 28 | direction_from_mask = flow[:, 0] > 0 29 | direction_to_mask = flow[:, 0] < 0 30 | 31 | iat_flow = np.full(flow.shape, np.nan) 32 | iat_flow[direction_from_mask, 1] = inter_packet_times_from_timestamps(timestamps_like[direction_from_mask]) 33 | iat_flow[direction_to_mask, 1] = inter_packet_times_from_timestamps(timestamps_like[direction_to_mask]) 34 | iat_flow[:, 0] = flow[:, 0] 35 | return iat_flow 36 | 37 | source_shape = flows.shape 38 | raw_packets = flows.reshape(-1, 2) # per-packet view 39 | raw_packets = raw_packets.reshape(-1, source_shape[1] // 2, 2) # (n_flows, n_packets, features) 40 | iat_packets = np.empty_like(raw_packets) 41 | for i in range(source_shape[0]): 42 | iat_packets[i, :, :] = ipt_to_iat(raw_packets[i]) 43 | iat_packets = iat_packets.reshape(source_shape) 44 | return iat_packets 45 | 46 | 47 | def plot_packets(packet_features, limit_packet_scale=False, save_to=None, ru_lang=False): 48 | if isinstance(packet_features, pd.DataFrame): 49 | packet_features = packet_features.values 50 | 51 | fig, ax = plt.subplots(figsize=(12, 7)) 52 | plt.scatter(packet_features[:, 0], packet_features[:, 1], alpha=0.3) 53 | ax.set_title(f'Число кластеров: {packet_features.shape[0]}' if ru_lang else 54 | f'Number of items: {packet_features.shape[0]}') 55 | if limit_packet_scale: 56 | ax.set_xlim(-1, 1) 57 | ax.grid(True) 58 | ax.set_xlabel('размер пакета, байт / 1500' if ru_lang else 59 | 'packet size, bytes / 1500') 60 | ax.set_ylabel('log10(межпакетный интервал, µs)' if ru_lang else 61 | 'log10(inter-packet time, µs)') 62 | if save_to: 63 | plt.savefig(save_to, dpi=300) 64 | 65 | 66 | def packets_per_flow(flows): 67 | non_packet_mask = ~np.isnan(flows) 68 | return non_packet_mask.sum(1) / 2 69 | 70 | 71 | def handle_estimation_exceptions(func): 72 | def real_decorator(*args, **kwargs): 73 | try: 74 | return func(*args, **kwargs) 75 | except Exception as e: 76 | logger.error(f'{func.__name__}: {e}') 77 | return np.nan 78 | 79 | return real_decorator 80 | 81 | 82 | @handle_estimation_exceptions 83 | def estimate_pdf(samples): 84 | x_values = np.linspace(0, 
max(samples), 100) 85 | kde = scipy.stats.gaussian_kde(samples)(x_values) 86 | kde /= sum(kde) 87 | return kde 88 | 89 | 90 | @handle_estimation_exceptions 91 | def get_kl_divergence_continuous(orig_values, gen_values): 92 | kde_orig = estimate_pdf(orig_values) 93 | kde_gen = estimate_pdf(gen_values) 94 | return scipy.stats.entropy(kde_orig, kde_gen) 95 | 96 | 97 | @handle_estimation_exceptions 98 | def get_wasserstein_distance_pdf(orig_values, gen_values): 99 | kde_orig = estimate_pdf(orig_values) 100 | kde_gen = estimate_pdf(gen_values) 101 | return scipy.stats.wasserstein_distance(kde_orig, kde_gen) 102 | 103 | 104 | @handle_estimation_exceptions 105 | def get_ks_stat(orig_values, gen_values): 106 | ks = scipy.stats.ks_2samp(orig_values, gen_values) 107 | return ks.statistic 108 | 109 | 110 | def scaled_diff(orig, gen): 111 | return np.abs(orig - gen) / orig 112 | 113 | 114 | @handle_estimation_exceptions 115 | def scaled_diff_at_percentile(orig, gen, percentile): 116 | o = np.percentile(orig, percentile) 117 | g = np.percentile(gen, percentile) 118 | return scaled_diff(o, g) 119 | 120 | 121 | def packets_to_throughput(packets, resolution='1S'): 122 | # replace indexes with DateTime format 123 | df = pd.Series( 124 | packets[:, 0], 125 | index=pd.to_datetime(np.cumsum(packets[:, 1]), unit='ms') 126 | ) 127 | throughput = df.resample(resolution).sum() 128 | return throughput.values 129 | 130 | 131 | def evaluate_generated_traffic(src_flows: np.ndarray, gen_flows: np.ndarray) -> dict: 132 | logger.info('starting evaluation of flows...') 133 | src_packets = flows_to_packets(convert_ipt_to_iat(src_flows)) 134 | gen_packets = flows_to_packets(convert_ipt_to_iat(gen_flows)) 135 | 136 | client_src_mask = src_packets[:, 0] > 0 137 | client_gen_mask = gen_packets[:, 0] > 0 138 | 139 | client_src_packets = src_packets[client_src_mask] 140 | server_src_packets = src_packets[~client_src_mask] 141 | 142 | client_gen_packets = gen_packets[client_gen_mask] 143 | server_gen_packets = gen_packets[~client_gen_mask] 144 | 145 | throughput = { 146 | 'src_avg_throughput_bytes_per_s_client': np.mean(packets_to_throughput(client_src_packets)), 147 | 'gen_avg_throughput_bytes_per_s_client': np.mean(packets_to_throughput(client_gen_packets)), 148 | 'src_avg_throughput_bytes_per_s_server': np.mean(packets_to_throughput(server_src_packets)), 149 | 'gen_avg_throughput_bytes_per_s_server': np.mean(packets_to_throughput(server_gen_packets)), 150 | } 151 | 152 | metrics = {} 153 | 154 | for metric_name, metric_function in [ 155 | ('KL', get_kl_divergence_continuous), 156 | # ('Wasserstein', get_wasserstein_distance_pdf), 157 | ('KS_2sample', get_ks_stat), 158 | ('10th_percentile', partial(scaled_diff_at_percentile, percentile=10)), 159 | # ('25th_percentile', partial(scaled_diff_at_percentile, percentile=25)), 160 | ('50th_percentile', partial(scaled_diff_at_percentile, percentile=50)), 161 | # ('75th_percentile', partial(scaled_diff_at_percentile, percentile=75)), 162 | ('90th_percentile', partial(scaled_diff_at_percentile, percentile=90)) 163 | 164 | ]: 165 | metrics.update({ 166 | metric_name + '_packets_per_flow': metric_function(packets_per_flow(src_flows), packets_per_flow(gen_flows)), 167 | metric_name + '_PS_client': metric_function(client_src_packets[:, 0], client_gen_packets[:, 0]), 168 | metric_name + '_IAT_client': metric_function(client_src_packets[:, 1], client_gen_packets[:, 1]), 169 | metric_name + '_PS_server': metric_function(server_src_packets[:, 0], server_gen_packets[:, 0]), 170 | metric_name + 
'_IAT_server': metric_function(server_src_packets[:, 1], server_gen_packets[:, 1]), 171 | metric_name + '_thrpt_client': metric_function(packets_to_throughput(client_src_packets), 172 | packets_to_throughput(client_gen_packets)), 173 | f'{metric_name}_thrpt_server': metric_function(packets_to_throughput(server_src_packets), 174 | packets_to_throughput(server_gen_packets)), 175 | }) 176 | 177 | common_metrics = { 178 | 'n_flows': min(src_flows.shape[0], gen_flows.shape[0]) 179 | } 180 | return dict(**common_metrics, **metrics, **throughput) 181 | 182 | 183 | def save_metrics(metrics: dict, save_to): 184 | pd.DataFrame(metrics).T.to_csv(save_to) 185 | -------------------------------------------------------------------------------- /gpt_model/data_preparation/format_parsed_as_classification_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | from typing import Iterable, Optional 4 | 5 | import pandas as pd 6 | 7 | from flow_parsing.utils import get_hash, read_dataset, check_filename_in_patterns, save_dataset 8 | from gpt_model.data_preparation.preprocess_target_pcaps import IOT_DEVICES 9 | from settings import TARGET_CLASS_COLUMN, LOWER_BOUND_CLASS_OCCURRENCE, FilePatterns, DATASET_DIR 10 | 11 | """ 12 | task-specific module, provided for the sake of reproducibility 13 | formats labels from outputs of nDPI and in case of IoT traffic, assigns labels from filenames 14 | """ 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | # signalling protos are common among all devices and it doesn't make sense to treat them separately 19 | COMMON_PROTOCOLS = ['DNS', 'NTP', 'STUN'] 20 | GARBAGE_PROTOCOLS = ['ICMP', 'ICMPV6', 'DHCPV6', 'DHCP', 'Unknown', 'IGMP', 'SSDP'] 21 | 22 | 23 | def _load_parsed_results(dir_with_parsed_csvs, filename_patterns_to_exclude: Optional[Iterable[str]]): 24 | dir_with_parsed_csvs = pathlib.Path(dir_with_parsed_csvs) 25 | 26 | parsed_csvs = list(dir_with_parsed_csvs.glob('*.csv')) 27 | 28 | iot_datasets = [] 29 | usual_traffic = [] 30 | 31 | iot_categories = set(item.category for item in IOT_DEVICES) 32 | for csv_file in parsed_csvs: 33 | # skip non-home and IoT files 34 | if check_filename_in_patterns(csv_file, filename_patterns_to_exclude): 35 | continue 36 | 37 | traffic_df = read_dataset(csv_file) 38 | 39 | if csv_file.name.startswith('train'): 40 | base_name = csv_file.name.split('train_')[-1] 41 | elif csv_file.name.startswith('val'): 42 | base_name = csv_file.name.split('val_')[-1] 43 | elif csv_file.name.startswith('test'): 44 | base_name = csv_file.name.split('test_')[-1] 45 | else: 46 | base_name = csv_file.name 47 | 48 | traffic_df['source_file'] = base_name 49 | 50 | if base_name.split('_')[0] in iot_categories: 51 | iot_datasets.append(traffic_df) 52 | else: 53 | usual_traffic.append(traffic_df) 54 | 55 | try: 56 | iot_traffic = pd.concat(iot_datasets, ignore_index=True) 57 | except ValueError: 58 | iot_traffic = pd.DataFrame([]) 59 | logger.warning('no IoT files were found!') 60 | usual_traffic = pd.concat(usual_traffic, ignore_index=True) 61 | logger.info(f'found: {len(iot_traffic)} IoT flows, and {len(usual_traffic)} usual') 62 | return iot_traffic, usual_traffic 63 | 64 | 65 | def _set_common_protos_targets(dataset): 66 | for proto in COMMON_PROTOCOLS: 67 | dataset.loc[dataset['ndpi_app'].str.startswith(proto), TARGET_CLASS_COLUMN] = proto 68 | return dataset 69 | 70 | 71 | def _set_iot_devices_targets(dataset): 72 | """ assigns target class according to the category of an 
IoT device """ 73 | common_indexer = dataset[TARGET_CLASS_COLUMN].isin(COMMON_PROTOCOLS) 74 | iot_category = dataset.loc[~common_indexer, 'source_file'].str.split('_').apply(lambda x: 'IoT_' + x[0]) 75 | dataset.loc[~common_indexer, TARGET_CLASS_COLUMN] = iot_category 76 | logger.info(str(dataset[TARGET_CLASS_COLUMN].value_counts())) 77 | return dataset 78 | 79 | 80 | def _set_application_targets(dataset): 81 | """ assigns target class according to the 'Y' application from nDPI's 'X.Y' label """ 82 | common_indexer = dataset[TARGET_CLASS_COLUMN].isin(COMMON_PROTOCOLS) 83 | cleaned_up_applications = dataset.loc[~common_indexer, 'ndpi_app'].str.split('.').apply(lambda x: x[-1]) 84 | dataset.loc[~common_indexer, TARGET_CLASS_COLUMN] = cleaned_up_applications 85 | logger.info(str(dataset[TARGET_CLASS_COLUMN].value_counts())) 86 | return dataset 87 | 88 | 89 | def _rm_garbage(dataset, garbage: list = None, column_from='ndpi_app'): 90 | """ rm irrelevant targets for classification at an upstream device """ 91 | if garbage is None: 92 | garbage = GARBAGE_PROTOCOLS 93 | garbage_indexer = dataset[column_from].isin(garbage) 94 | logger.info(f'found {garbage_indexer.sum()} objects of garbage protos') 95 | return dataset[~garbage_indexer] 96 | 97 | 98 | def prune_targets(dataset, lower_bound=LOWER_BOUND_CLASS_OCCURRENCE, underrepresented_protos: list = None): 99 | """ rm infrequent targets """ 100 | proto_counts = dataset[TARGET_CLASS_COLUMN].value_counts() 101 | if underrepresented_protos is None: 102 | underrepresented_protos = proto_counts[proto_counts < lower_bound].index.tolist() 103 | if underrepresented_protos: 104 | logger.info(f'pruning the following targets: {underrepresented_protos}') 105 | dataset = dataset.loc[~dataset[TARGET_CLASS_COLUMN].isin(underrepresented_protos)] 106 | return dataset.reset_index(drop=True), underrepresented_protos 107 | 108 | 109 | def delete_duplicating_flows(dataset): 110 | def to_session_id(flow_id): 111 | proto, conn1, conn2 = flow_id.split(' ') 112 | return proto, frozenset([conn1, conn2]) 113 | 114 | dataset['session_id'] = dataset['flow_id'].apply(to_session_id) 115 | dataset = dataset.drop_duplicates(subset=['session_id']) 116 | dataset.drop(columns='session_id', inplace=True) 117 | logger.info(f'{dataset.shape[0]} flows left after deduplication') 118 | return dataset 119 | 120 | 121 | def prepare_classification_data(csv_dir, remove_garbage=True, filename_patterns_to_exclude=None): 122 | """ the order of operations matters """ 123 | iot_traffic, usual_traffic = _load_parsed_results(csv_dir, filename_patterns_to_exclude) 124 | 125 | if len(iot_traffic) > 0: 126 | iot_traffic = _set_common_protos_targets(iot_traffic) 127 | iot_traffic = _set_iot_devices_targets(iot_traffic) 128 | if remove_garbage: 129 | iot_traffic = _rm_garbage(iot_traffic, 130 | column_from='ndpi_app') 131 | 132 | usual_traffic = _set_common_protos_targets(usual_traffic) 133 | usual_traffic = _set_application_targets(usual_traffic) 134 | 135 | if remove_garbage: 136 | usual_traffic = _rm_garbage(usual_traffic, 137 | garbage=GARBAGE_PROTOCOLS + ['Amazon'], 138 | column_from=TARGET_CLASS_COLUMN) 139 | 140 | merged_traffic = pd.concat([usual_traffic, iot_traffic], ignore_index=True) 141 | return merged_traffic 142 | 143 | 144 | def main(): 145 | pattern_name = 'mawi_unswnb_iscxvpn' 146 | excluded_patterns = getattr(FilePatterns, pattern_name) 147 | train_df = prepare_classification_data(DATASET_DIR / 'pretraining/train_csv', 148 | filename_patterns_to_exclude=excluded_patterns) 149 | eval_df 
= prepare_classification_data(DATASET_DIR / 'pretraining/val_csv', 150 | filename_patterns_to_exclude=excluded_patterns) 151 | test_df = prepare_classification_data(DATASET_DIR / 'pretraining/test_csv', 152 | filename_patterns_to_exclude=excluded_patterns) 153 | tr_val_df = pd.concat([train_df, eval_df], ignore_index=True) 154 | tr_val_df = delete_duplicating_flows(tr_val_df) 155 | tr_val_df, underrepresented_protos = prune_targets(tr_val_df) 156 | 157 | test_df = delete_duplicating_flows(test_df) 158 | test_df, _ = prune_targets(test_df, underrepresented_protos=underrepresented_protos) 159 | 160 | suffix = get_hash(tr_val_df) 161 | save_dataset(tr_val_df, save_to=DATASET_DIR / f'train_{suffix}_no_{pattern_name}.csv') 162 | save_dataset(test_df, save_to=DATASET_DIR / f'test_{suffix}_no_{pattern_name}.csv') 163 | 164 | 165 | if __name__ == '__main__': 166 | main() 167 | -------------------------------------------------------------------------------- /gpt_model/tokenizer.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import pathlib 4 | from functools import partial 5 | from typing import Optional, Union 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import torch 10 | from transformers import PreTrainedTokenizerBase 11 | from transformers.tokenization_utils_base import TensorType, BatchEncoding 12 | 13 | from settings import logger 14 | from .quantizer import PacketQuantizer 15 | 16 | 17 | class PacketTokenizer(PreTrainedTokenizerBase): 18 | max_model_input_sizes = 128 19 | model_input_names = ["attention_mask"] 20 | 21 | def __init__(self, 22 | packet_quantizer: PacketQuantizer, 23 | unk_token="[UNK]", 24 | bos_token="[BOF]", 25 | eos_token="[EOF]", 26 | pad_token="[PAD]", 27 | **kwargs 28 | ): 29 | super().__init__( 30 | unk_token=unk_token, 31 | bos_token=bos_token, 32 | eos_token=eos_token, 33 | pad_token=pad_token, 34 | **kwargs, 35 | ) 36 | self.packet_quantizer = packet_quantizer 37 | self.cluster_num = packet_quantizer.n_clusters 38 | # special token ids have indexes larger than all packet clusters (which start at 0) 39 | ids_to_tokens = kwargs.get('ids_to_tokens') 40 | if ids_to_tokens: 41 | self.ids_to_tokens = ids_to_tokens 42 | else: 43 | self.ids_to_tokens = collections.OrderedDict([(ids + self.cluster_num, tok) 44 | for ids, tok in enumerate(self.all_special_tokens)]) 45 | 46 | self.tokens_to_ids = {v: k for k, v in self.ids_to_tokens.items()} 47 | logger.info('initialized PacketTokenizer') 48 | 49 | def add_class_tokens(self, class_names: list): 50 | classes_to_add = set(class_names) - set(self.tokens_to_ids.keys()) 51 | 52 | ids_to_classes = collections.OrderedDict([(ids + len(self), tok) for ids, tok in enumerate(classes_to_add)]) 53 | classes_to_ids = {v: k for k, v in ids_to_classes.items()} 54 | 55 | self.ids_to_tokens.update(ids_to_classes) 56 | self.tokens_to_ids.update(classes_to_ids) 57 | 58 | @classmethod 59 | def from_pretrained(cls, pretrained_model_name_or_path, flow_size=None): 60 | path_dir = pathlib.Path(pretrained_model_name_or_path) 61 | flow_size = cls.max_model_input_sizes if flow_size is None else flow_size 62 | 63 | token_map_file = path_dir / 'ids_to_tokens.json' 64 | if token_map_file.is_file(): 65 | with open(token_map_file, 'r') as jf: 66 | ids_to_tokens = json.load(jf) 67 | ids_to_tokens = {int(k): v for k, v in ids_to_tokens.items()} 68 | logger.info('loaded special tokens map from "ids_to_tokens.json"') 69 | else: 70 | ids_to_tokens = {} 71 | 
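            # Added note (an assumption inferred from __init__ above, not part of the original file):
            # when the map has to be recreated, special-token ids are laid out right after the
            # packet-cluster ids, so a freshly written "ids_to_tokens.json" would look roughly like
            #   {"<n_clusters>": "[UNK]", "<n_clusters + 1>": "[BOF]", "<n_clusters + 2>": "[EOF]", "<n_clusters + 3>": "[PAD]"}
            # with any class tokens introduced later via add_class_tokens() appended after these.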
logger.warning('special tokens map "ids_to_tokens.json" was not found, will attempt to recreate one') 72 | 73 | quantizer = PacketQuantizer.from_checkpoint(path_dir, flow_size=flow_size) 74 | return cls( 75 | packet_quantizer=quantizer, 76 | ids_to_tokens=ids_to_tokens, 77 | ) 78 | 79 | def save_pretrained(self, save_directory): 80 | save_directory = pathlib.Path(save_directory) 81 | with open(save_directory / 'ids_to_tokens.json', 'w') as jf: 82 | json.dump(self.ids_to_tokens, jf) 83 | 84 | self.packet_quantizer.save_checkpoint(save_directory) 85 | 86 | def convert_ids_to_tokens(self, index): 87 | if isinstance(index, int): 88 | # exception indicates the bug 89 | return self.ids_to_tokens[index] 90 | else: 91 | raise NotImplementedError 92 | 93 | def convert_tokens_to_ids(self, tokens): 94 | if isinstance(tokens, str): 95 | return self.tokens_to_ids[tokens] 96 | else: 97 | raise NotImplementedError 98 | 99 | def _pad_flow(self, flow: np.ndarray) -> np.ndarray: 100 | non_packets_mask = flow == self.packet_quantizer.non_packet_value 101 | flow[non_packets_mask] = self.pad_token_id 102 | return flow 103 | 104 | def _expand_with_special_tokens(self, flow: np.ndarray, first_token) -> np.ndarray: 105 | # truncate to account for the tokens 106 | flow = flow[:self.max_model_input_sizes - 2] 107 | flow = np.insert(flow, 0, first_token) 108 | non_packets_mask = flow == self.packet_quantizer.non_packet_value 109 | flow[non_packets_mask] = self.pad_token_id 110 | # we either pick index of the first True value or append 111 | end_of_flow = non_packets_mask.argmax() if non_packets_mask.any() else len(flow) 112 | flow = np.insert(flow, end_of_flow, self.eos_token_id) 113 | return flow 114 | 115 | def batch_encode_packets( 116 | self, 117 | flows: Union[pd.DataFrame, np.ndarray], 118 | target_class: Optional[str] = None, 119 | add_special_tokens: bool = True, 120 | return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH, 121 | return_attention_mask: Optional[bool] = True, 122 | ) -> BatchEncoding: 123 | 124 | if isinstance(flows, pd.DataFrame): 125 | flows = flows.values 126 | 127 | if flows.shape[1] // 2 != self.max_model_input_sizes: 128 | logger.debug(f'input number of features ({flows.shape[1] // 2}) does not match ' 129 | f'max_model_input_sizes ({self.max_model_input_sizes})') 130 | clusters = self.packet_quantizer.transform(flows) 131 | 132 | if add_special_tokens: 133 | first_token = self.convert_tokens_to_ids(target_class) if target_class is not None else self.bos_token_id 134 | expander = partial(self._expand_with_special_tokens, first_token=first_token) 135 | clusters = np.apply_along_axis(expander, axis=1, arr=clusters) 136 | else: 137 | clusters = np.apply_along_axis(self._pad_flow, axis=1, arr=clusters) 138 | 139 | result = {'input_ids': clusters.astype(np.int64)} 140 | 141 | if return_attention_mask: 142 | token_mask = (clusters != self.pad_token_id).astype(np.int64) 143 | result.update({'attention_mask': token_mask}) 144 | 145 | return BatchEncoding(result, tensor_type=TensorType(return_tensors), prepend_batch_axis=False) 146 | 147 | def _remove_special_tokens(self, flow): 148 | # rm first token 149 | flow = flow[1:] 150 | try: 151 | flow_end_idx = np.where(flow == self.eos_token_id)[0][0] 152 | except IndexError: 153 | flow_end_idx = flow.shape[0] - 1 154 | logger.warning('could not find EOS token, removing the last one') 155 | 156 | if flow_end_idx == flow.shape[0] - 1: 157 | flow = flow[:-1] 158 | else: 159 | flow = flow[:-1] 160 | # replace pad token with quantizer's non 
packet value for consistency 161 | flow[flow_end_idx:] = self.packet_quantizer.non_packet_value 162 | return flow 163 | 164 | def batch_decode_packets(self, tokenized_flows) -> np.ndarray: 165 | if isinstance(tokenized_flows, torch.Tensor): 166 | tokenized_flows = tokenized_flows.numpy() 167 | clusters_only = np.apply_along_axis(self._remove_special_tokens, axis=1, arr=tokenized_flows) 168 | packet_features = self.packet_quantizer.inverse_transform(clusters_only) 169 | return packet_features 170 | 171 | def __len__(self): 172 | return self.cluster_num + len(self.tokens_to_ids) 173 | 174 | @property 175 | def max_len(self): 176 | return self.max_model_input_sizes 177 | -------------------------------------------------------------------------------- /sklearn_classifiers/knn_cosine.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import Counter 3 | 4 | import ngtpy 5 | import numpy as np 6 | import pandas as pd 7 | import puffinn 8 | from scipy.spatial.distance import cdist 9 | from sklearn.base import BaseEstimator 10 | from sklearn.preprocessing import normalize 11 | 12 | from sklearn_classifiers.utils import iterate_batch_indexes 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def cos_dist(query, keys): 18 | # if got vector 19 | if len(query.shape) == 1: 20 | query = query.reshape(1, -1) 21 | return cdist(keys, query, 'cosine').T 22 | 23 | 24 | def top_k_cosine_similar(query, keys, k=1): 25 | distances = cos_dist(query, keys) 26 | top_k = np.argpartition(distances, k)[:, :k] 27 | return top_k 28 | 29 | 30 | def voter(obj_votes): 31 | top_count = Counter(obj_votes).most_common(1) 32 | # return the top key 33 | return top_count[0][0] 34 | 35 | 36 | def batch_voter(class_votes): 37 | """ 38 | returns vector with the most occurring values within a `class_votes` row, if tie -- selects the first one 39 | :param class_votes: is a (objects, votes) matrix 40 | :return: 41 | """ 42 | top_or_first = np.apply_along_axis(voter, axis=1, arr=class_votes) 43 | return top_or_first 44 | 45 | 46 | class KNeighborsCosineClassifier(BaseEstimator): 47 | """ 48 | custom K-nn based on cosine similarity 49 | 50 | time: 2h18m 51 | perf: 52 | accuracy 0.981014 0.981014 0.981014 0.981014 53 | macro avg 0.862088 0.861645 0.859317 96705.000000 54 | weighted avg 0.981303 0.981014 0.981095 96705.000000 55 | 56 | """ 57 | 58 | def __init__(self, n_neighbors=3): 59 | self.n_neighbors = n_neighbors 60 | self.target_keys: np.ndarray = np.nan 61 | self.target_classes: np.ndarray = np.nan 62 | 63 | def fit(self, X, y): 64 | X_train = X.values if isinstance(X, pd.DataFrame) else X 65 | y_train = y.values if isinstance(y, pd.Series) else y 66 | assert X_train.shape[0] == y_train.shape[0], 'X and y length must match!' 
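        # Illustrative usage sketch (added comment, not in the original source; X_train/X_test and
        # y_train follow the conventions of sklearn_classifiers/run_training.py):
        #   clf = KNeighborsCosineClassifier(n_neighbors=3).fit(X_train, y_train)
        #   y_pred = clf.predict(X_test, batch_size=1024)
        # predict() walks the queries in batches via iterate_batch_indexes(), ranks training rows
        # with top_k_cosine_similar() and takes a majority vote over their labels with batch_voter().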
67 | # assure the values are of np.ndarray type after all 68 | self.target_keys = np.array(X_train) 69 | self.target_classes = np.array(y_train) 70 | logger.info('fit KNeighborsCosineClassifier') 71 | return self 72 | 73 | def predict(self, X, batch_size=1024): 74 | X = X.values if isinstance(X, pd.DataFrame) else X 75 | X = np.array(X) 76 | predictions = np.empty(X.shape[0], dtype=np.int) 77 | for start_idx, end_idx in iterate_batch_indexes(X, batch_size): 78 | top_indexes = top_k_cosine_similar(query=X[start_idx:end_idx], keys=self.target_keys, k=self.n_neighbors) 79 | predictions[start_idx:end_idx] = batch_voter(self.target_classes[top_indexes]) 80 | return predictions 81 | 82 | 83 | class KNeighborsLshClassifier(BaseEstimator): 84 | 85 | def __init__(self, n_neighbors=1): 86 | self.target_classes: np.ndarray = np.nan 87 | self.n_neighbors = n_neighbors 88 | self.lsh_table = None 89 | 90 | def _construct_table(self, dataset: np.ndarray): 91 | raise NotImplementedError 92 | 93 | def _check_set_features(self, X): 94 | X = X.values if isinstance(X, pd.DataFrame) else np.array(X) 95 | X = X.astype(np.float32) 96 | normalize(X, copy=False) 97 | return X 98 | 99 | def fit(self, X, y): 100 | X_train = self._check_set_features(X) 101 | self.target_classes = y.values if isinstance(y, pd.Series) else np.array(y) 102 | self._construct_table(X_train) 103 | logger.info(f'fit {self.__class__.__name__}') 104 | return self 105 | 106 | def _predict(self, X): 107 | raise NotImplementedError 108 | 109 | def predict(self, X): 110 | X = self._check_set_features(X) 111 | return self._predict(X) 112 | 113 | 114 | class KNeighborsPuffinnClassifier(KNeighborsLshClassifier): 115 | """ 116 | PUFFINN - Parameterless and Universal Fast Finding of Nearest Neighbors 117 | https://arxiv.org/pdf/1906.12211.pdf 118 | 119 | time: 12m 120 | perf: 121 | accuracy 0.981759 0.981759 0.981759 0.981759 122 | macro avg 0.865334 0.861639 0.860683 96705.000000 123 | weighted avg 0.981953 0.981759 0.981810 96705.000000 124 | 125 | it is really close to the perf of grid-search K-nn approach but much faster 126 | """ 127 | 128 | def __init__(self, n_neighbors=1, search_recall=0.995, memory_limit=1 * 1024 ** 3): 129 | super().__init__(n_neighbors) 130 | self.memory_limit = memory_limit 131 | self.search_recall = search_recall 132 | self.lsh_table: puffinn.Index 133 | 134 | def _construct_table(self, dataset: np.ndarray): 135 | self.lsh_table = puffinn.Index('angular', dataset.shape[1], self.memory_limit) 136 | for v in dataset: 137 | self.lsh_table.insert(v.tolist()) 138 | logger.info('building index table...') 139 | self.lsh_table.rebuild() 140 | 141 | def _predict(self, X): 142 | def query_predictor(query): 143 | top_indexes = self.lsh_table.search(query.tolist(), self.n_neighbors, self.search_recall) 144 | return voter(self.target_classes[top_indexes]) 145 | 146 | predictions = np.apply_along_axis(query_predictor, axis=1, arr=X) 147 | return predictions 148 | 149 | 150 | class KNeighborsNGTClassifier(KNeighborsLshClassifier): 151 | """ 152 | ONNG-NGT (https://github.com/yahoojapan/NGT/wiki) 153 | 154 | better keep optimize_* args as defaults, it doesn't work as expected 155 | """ 156 | 157 | def __init__( 158 | self, 159 | n_neighbors=1, 160 | search_epsilon=0.1, 161 | optimize_n_edges=False, 162 | optimize_search_params=False, 163 | index_path='/tmp/knn_ngt_index' 164 | ): 165 | super().__init__(n_neighbors) 166 | self.index_path = index_path 167 | self.optimize_n_edges = optimize_n_edges 168 | self.optimize_search_params = 
optimize_search_params 169 | self.search_epsilon = search_epsilon 170 | 171 | def _construct_table(self, dataset: np.ndarray): 172 | # when data is normalized row-wise, the L2 distance metric is similar to the cosine 173 | ngtpy.create(self.index_path, dataset.shape[1], distance_type='L2') 174 | index = ngtpy.Index(self.index_path) # open the index 175 | index.batch_insert(dataset) 176 | if self.optimize_n_edges: 177 | logger.info('optimizing number of edges...') 178 | index.save() 179 | optimizer = ngtpy.Optimizer(log_disabled=True) 180 | try: 181 | optimizer.optimize_number_of_edges_for_anng(self.index_path) 182 | except RuntimeError as e: 183 | logger.error(f'skipping optimization due to: {e}') 184 | if self.optimize_search_params: 185 | optimizer = ngtpy.Optimizer(log_disabled=True) 186 | optimizer.set_processing_modes( 187 | search_parameter_optimization=True, 188 | prefetch_parameter_optimization=True, 189 | accuracy_table_generation=True) 190 | optimizer.optimize_search_parameters(self.index_path) 191 | logger.info('building index table...') 192 | index.build_index() # build index 193 | index.save() 194 | self.lsh_table = index 195 | 196 | def _predict(self, X): 197 | def query_predictor(query): 198 | if self.optimize_search_params: 199 | top_indexes = self.lsh_table.search(query, size=self.n_neighbors, expected_accuracy=0.99) 200 | else: 201 | top_indexes = self.lsh_table.search(query, size=self.n_neighbors, epsilon=self.search_epsilon) 202 | 203 | top_indexes = [i[0] for i in top_indexes] 204 | return voter(self.target_classes[top_indexes]) 205 | 206 | predictions = np.apply_along_axis(query_predictor, axis=1, arr=X) 207 | return predictions 208 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. 
A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 | -------------------------------------------------------------------------------- /gpt_model/generator/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from dataclasses import dataclass 4 | from functools import lru_cache, partial 5 | from typing import Dict, List, Tuple 6 | 7 | import logging 8 | import numpy as np 9 | import pandas as pd 10 | import sh 11 | import torch 12 | from torch.utils.data.dataset import IterableDataset, Dataset 13 | from transformers import BatchEncoding 14 | 15 | from flow_parsing import check_filename_in_patterns 16 | from settings import TARGET_CLASS_COLUMN 17 | from gpt_model.tokenizer import PacketTokenizer 18 | from gpt_model.data_preparation.format_parsed_as_classification_dataset import prepare_classification_data 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class PretrainIterDataset(IterableDataset): 24 | 25 | def __init__(self, tokenizer: PacketTokenizer, folder_path: str, train_mode=True): 26 | assert os.path.isdir(folder_path) 27 | # TODO feature caching, multiple workers?, filter out one-packet flows 28 | 29 | self.source_files = list(pathlib.Path(folder_path).glob('*.csv')) 30 | logger.info("initializing dataset from %s with %s files", folder_path, len(self.source_files)) 31 | 32 | self.tokenizer = tokenizer 33 | self.train_mode = train_mode 34 | 35 | def __iter__(self) -> BatchEncoding: 36 | assert torch.utils.data.get_worker_info() is None 37 | for csv in self.source_files: 38 | # not really the best way, reading is gonna be slow 39 | logger.info(f'FlowDataset: reading {csv}') 40 | reader = pd.read_csv(csv, chunksize=1, 41 | usecols=self.tokenizer.packet_quantizer.raw_columns, 42 | dtype=float) 43 | for raw_flow in reader: 44 | # skip 1-packet and empty flows 45 | if self.train_mode and pd.isna(raw_flow.iloc[:, 3]).any(): 46 | continue 47 | 48 | encoded = self.tokenizer.batch_encode_packets(raw_flow, 49 | add_special_tokens=True, 50 | return_attention_mask=True) 51 | yield encoded 52 | 53 | @lru_cache(maxsize=2) 54 | def __len__(self): 55 | """ the files are too large to count their size via Python """ 56 | line_counter = sh.Command('sed') 57 | total = 0 58 | for filename in self.source_files: 59 | found_lines = line_counter("-n", "$=", filename) 60 | # do not count .csv header 61 | total += int(found_lines) - 1 62 | return total 63 | 64 | 65 | class PretrainDataset(Dataset): 66 | def __init__(self, tokenizer: PacketTokenizer, folder_path: str, filename_patterns_to_exclude: tuple = ()): 67 | assert os.path.isdir(folder_path) 68 | # TODO feature caching, multiple workers?, filter out one-packet flows 69 | 70 | source_files = list(pathlib.Path(folder_path).glob('*.csv')) 71 | file_matcher = partial(check_filename_in_patterns, patterns=filename_patterns_to_exclude) 72 | source_files = list(file for file in source_files if not file_matcher(file)) 73 | print(source_files) 74 | self.source_files = source_files 75 | logger.info("initializing dataset from %s with %s files", folder_path, len(self.source_files)) 76 | 77 | self.tokenizer = tokenizer 78 | # load as 32-bit to save RAM 79 | raw_flows = pd.concat((pd.read_csv(csv, usecols=self.tokenizer.packet_quantizer.raw_columns, dtype=np.float32) 80 | for csv in self.source_files), ignore_index=True) 81 | 82 | raw_flows = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].sample(frac=1, random_state=1) 83 | 84 | logger.info('concatenated dataframes within the folder') 85 | # skip 1-packet and empty 
flows 86 | raw_flows.dropna(axis=0, subset=['raw_packet0', 'raw_packet1'], inplace=True, how='any') 87 | self.raw_flows = raw_flows.values 88 | logger.info('initialized dataset') 89 | 90 | def __len__(self): 91 | return len(self.raw_flows) 92 | 93 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 94 | return self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 95 | add_special_tokens=True, 96 | return_attention_mask=True).data 97 | 98 | 99 | def load_modeling_data_with_classes( 100 | folder_path, 101 | shuffle=True, 102 | filename_patterns_to_exclude=None 103 | ) -> Tuple[pd.DataFrame, pd.Series]: 104 | assert os.path.isdir(folder_path) 105 | logger.info(f"initializing dataset from {folder_path}, excluding {filename_patterns_to_exclude}") 106 | folder_path = pathlib.Path(folder_path) 107 | 108 | raw_flows = prepare_classification_data(folder_path, 109 | remove_garbage=False, 110 | filename_patterns_to_exclude=filename_patterns_to_exclude) 111 | # skip 1-packet and empty flows 112 | raw_flows.dropna(axis=0, subset=['raw_packet0', 'raw_packet1'], inplace=True, how='any') 113 | if shuffle: 114 | raw_flows = raw_flows.sample(frac=1, random_state=1) 115 | 116 | return raw_flows.filter(regex='raw_'), raw_flows[TARGET_CLASS_COLUMN] 117 | 118 | 119 | class PretrainDatasetWithClasses(Dataset): 120 | def __init__(self, tokenizer: PacketTokenizer, folder_path: str, filename_patterns_to_exclude: tuple = ()): 121 | self.tokenizer = tokenizer 122 | 123 | raw_flows, targets = load_modeling_data_with_classes(folder_path, 124 | filename_patterns_to_exclude=filename_patterns_to_exclude) 125 | 126 | self.raw_flows: np.ndarray = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].values 127 | self.targets: np.ndarray = targets.values 128 | logger.info('initialized dataset') 129 | tokenizer.add_class_tokens(self.target_classes) 130 | logger.info('added special tokens representing classes') 131 | 132 | @property 133 | def target_classes(self) -> list: 134 | return np.unique(self.targets).tolist() 135 | 136 | def __len__(self): 137 | return len(self.raw_flows) 138 | 139 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 140 | return self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 141 | target_class=self.targets[i], 142 | add_special_tokens=True, 143 | return_attention_mask=True).data 144 | 145 | 146 | @dataclass 147 | class PretrainCollator: 148 | """ 149 | Data collator used for traffic flow modeling. 150 | - collates batches of equal-length encoded flows into single input and attention-mask tensors 151 | - builds labels for causal language modeling by masking the tokenizer's pad_token_id with -100 152 | """ 153 | 154 | tokenizer: PacketTokenizer 155 | 156 | def __call__(self, examples: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]: 157 | """ 158 | Data collator used for packet modeling.
159 | - collates batches of tensors 160 | """ 161 | 162 | length_of_first = examples[0]['input_ids'].size(0) 163 | are_tensors_same_length = all(x['input_ids'].size(0) == length_of_first for x in examples) 164 | assert are_tensors_same_length 165 | 166 | input_ids = torch.cat([item['input_ids'] for item in examples], dim=0) 167 | attention_masks = torch.cat([item['attention_mask'] for item in examples], dim=0) 168 | labels = input_ids.clone().detach() 169 | labels[labels == self.tokenizer.pad_token_id] = -100 170 | return { 171 | "input_ids": input_ids, 172 | "attention_mask": attention_masks, 173 | "labels": labels, 174 | } 175 | 176 | 177 | class FinetuningDataset(Dataset): 178 | def __init__(self, tokenizer: PacketTokenizer, dataset_path: str, target_class: str, target_column: str = None): 179 | assert os.path.isfile(dataset_path) 180 | 181 | dataset_path = pathlib.Path(dataset_path) 182 | self.source_file = dataset_path 183 | logger.info("initializing dataset from %s with '%s' target class", dataset_path, target_class) 184 | 185 | self.tokenizer = tokenizer 186 | 187 | self.target_column = TARGET_CLASS_COLUMN if target_column is None else target_column 188 | 189 | raw_flows = pd.read_csv(self.source_file, 190 | usecols=self.tokenizer.packet_quantizer.raw_columns + [self.target_column]) 191 | raw_flows = raw_flows[raw_flows.loc[:, self.target_column] == target_class] 192 | 193 | self.raw_flows = raw_flows.loc[:, tokenizer.packet_quantizer.raw_columns].values 194 | logger.info('initialized dataset') 195 | 196 | def __len__(self): 197 | return len(self.raw_flows) 198 | 199 | def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: 200 | return self.tokenizer.batch_encode_packets(self.raw_flows[i].reshape(1, -1).astype(np.float64), 201 | add_special_tokens=True, 202 | return_attention_mask=True).data 203 | -------------------------------------------------------------------------------- /gpt_model/quantizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import pathlib 4 | from typing import Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.cluster import KMeans 9 | 10 | from flow_parsing.features import generate_raw_feature_names 11 | 12 | try: 13 | from libKMCUDA import kmeans_cuda 14 | except ImportError: 15 | print('libKMCUDA was not found: calling fit() for PacketQuantizer is not possible, kmcuda must be installed') 16 | 17 | from settings import logger, BASE_DIR 18 | 19 | 20 | def get_kmeans_mae(original, restored): 21 | s = np.abs(original - restored).sum() 22 | mae = np.abs(original - restored).mean() 23 | logger.info(f'MAE: {mae}, cumulative error: {s}') 24 | return mae 25 | 26 | 27 | def drop_nan_packets(packet_features): 28 | return packet_features[~np.isnan(packet_features) & ~np.isinf(packet_features)].reshape(-1, 2) 29 | 30 | 31 | def init_sklearn_kmeans_from_checkpoint(checkpoint_path): 32 | checkpoint_path = pathlib.Path(checkpoint_path) 33 | with open(checkpoint_path / 'clusters.json', 'rb') as jf: 34 | clusters = np.array(json.load(jf)) 35 | 36 | clusters = drop_nan_packets(clusters) 37 | # make KMeans think it was fitted 38 | quantizer = KMeans(n_clusters=clusters.shape[0]) 39 | quantizer._n_threads = 1 40 | quantizer.cluster_centers_ = clusters 41 | logger.info(f'init sklearn KMeans from checkpoint: {checkpoint_path}') 42 | return quantizer 43 | 44 | 45 | class PacketScaler: 46 | def __init__(self, max_packet_len=1500): 47 | self.max_packet_len = max_packet_len 48 | 
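    # Worked example (illustrative values): with the default max_packet_len of 1500,
    # transform() maps the pair (750, 0.01) to (0.5, -2.0): the packet length is scaled
    # into [0, 1] and the non-zero IAT is replaced by its log10, while zero IATs stay 0.
    # inverse_transform() maps (0.5, -2.0) back to (750.0, 0.01).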
49 | def transform(self, packet_pairs): 50 | """ 51 | :param packet_pairs: (N, 2), 0 -- packet_len, 1 -- IAT 52 | :return: transformed_packets (N, 2) 53 | """ 54 | packet_pairs[:, 0] = packet_pairs[:, 0] / self.max_packet_len 55 | # avoids warning and -inf values. the scale here is in microseconds (?) 56 | zero_iats = np.isclose(packet_pairs[:, 1], 0.) 57 | packet_pairs[:, 1][zero_iats] = 0 58 | packet_pairs[:, 1][~zero_iats] = np.log10(packet_pairs[:, 1][~zero_iats]) 59 | return packet_pairs 60 | 61 | def inverse_transform(self, packet_pairs): 62 | packet_pairs[:, 0] = packet_pairs[:, 0] * self.max_packet_len 63 | # to correctly rescale, we need to know which were initially zeros 64 | zero_iats = np.isclose(packet_pairs[:, 1], 0., atol=1e-3) 65 | packet_pairs[:, 1][zero_iats] = 0 66 | packet_pairs[:, 1][~zero_iats] = 10 ** packet_pairs[:, 1][~zero_iats] 67 | return packet_pairs 68 | 69 | 70 | class PacketQuantizer: 71 | """ 72 | You can init PacketQuantizer for transform() and inverse_transform() only after loading from checkpoint 73 | """ 74 | 75 | def __init__(self, 76 | n_clusters=16384, 77 | flow_size=128, 78 | packet_scaler=PacketScaler, 79 | kmeans_clusterizer: Optional[KMeans] = None): 80 | self.n_clusters = n_clusters 81 | # hard-coded to the expected dataframe format (as in features.py) 82 | self.iat_columns = generate_raw_feature_names(flow_size, base_features=('iat',)) 83 | self.packet_columns = generate_raw_feature_names(flow_size, base_features=('packet',)) 84 | self.raw_columns = generate_raw_feature_names(flow_size) 85 | self.scaler = packet_scaler() 86 | self._cluster_centers = None 87 | self.kmeans = kmeans_clusterizer 88 | self.non_packet_value = -1 89 | 90 | def fit(self, raw_batch): 91 | """ 92 | https://github.com/src-d/kmcuda#python-api 93 | due to performance reasons, uses kmcuda instead of sklearn's KMeans. 
94 | :param raw_batch: 95 | :return: 96 | """ 97 | # do not consider single-packet flows 98 | raw_batch = raw_batch[raw_batch.raw_packet1 != 0] 99 | # form matrix (n_packet x (packet_size, IAT)) 100 | packet_features = raw_batch[self.raw_columns].values.reshape(-1, 2) 101 | # omit non_packet values 102 | packet_features = drop_nan_packets(packet_features) 103 | init_clusters = "k-means++" if self._cluster_centers is None else self._cluster_centers 104 | logger.info('fitting on {} packets, init clusters from data: {}'.format(packet_features.shape[0], 105 | isinstance(init_clusters, str))) 106 | packet_features = self.scaler.transform(packet_features) 107 | 108 | cluster_centers_, assignments = kmeans_cuda( 109 | samples=packet_features, 110 | clusters=self.n_clusters, 111 | tolerance=0.01, 112 | init=init_clusters, 113 | yinyang_t=0, 114 | metric="L2", 115 | average_distance=False, 116 | seed=1, device=0, verbosity=1 117 | ) 118 | self._cluster_centers = cluster_centers_ 119 | self._evaluate(packet_features, cluster_centers_[assignments]) 120 | 121 | def _evaluate(self, packet_features, restored): 122 | n_unique_clusters = len(self._cluster_centers[~np.isnan(self._cluster_centers)]) / 2 123 | logger.info(f'found {n_unique_clusters} unique clusters') 124 | get_kmeans_mae(packet_features, restored) 125 | 126 | def save_checkpoint(self, save_directory): 127 | save_directory = pathlib.Path(save_directory) 128 | save_directory.mkdir(exist_ok=True) 129 | quantizer_path = save_directory / 'clusters.json' 130 | with open(quantizer_path, 'w') as qf: 131 | try: 132 | json.dump(self._cluster_centers.tolist(), qf) 133 | except AttributeError: 134 | # account for the case when saving not during training 135 | json.dump(self.kmeans.cluster_centers_.tolist(), qf) 136 | logger.info(f'saving checkpoint to {quantizer_path}') 137 | return quantizer_path.as_posix() 138 | 139 | @classmethod 140 | def from_checkpoint(cls, checkpoint_path, *args, **kwargs): 141 | kmeans = init_sklearn_kmeans_from_checkpoint(checkpoint_path) 142 | return cls(n_clusters=kmeans.n_clusters, kmeans_clusterizer=kmeans, *args, **kwargs) 143 | 144 | def transform(self, raw_packet_batch): 145 | """ transforms raw packet matrix of size (n_flows, packets*2) 146 | (where 2 is due to features - PS, IAT) to packet clusters matrix 147 | of size (n_flows, packets). Non-packet values in the source matrix 148 | MUST BE NaN, and in the cluster matrix they correspond to -1. 
149 | """ 150 | if self.kmeans is None: 151 | raise Exception('the class must be init with an sklearn KMeans instance first!') 152 | 153 | if isinstance(raw_packet_batch, pd.DataFrame): 154 | # assert correct order 155 | raw_packet_batch = raw_packet_batch[self.raw_columns].values 156 | 157 | batch_size = raw_packet_batch.shape[0] 158 | # reshape to form (n_samples, n_features) for PacketScaler and KMeans 159 | raw_packet_batch = raw_packet_batch.reshape(-1, 2) 160 | non_packet_mask = np.isnan(raw_packet_batch) | np.isinf(raw_packet_batch) 161 | # temp fill to allow for predicting 162 | raw_packet_batch[non_packet_mask] = 0 163 | clusters = self.kmeans.predict(self.scaler.transform(raw_packet_batch)) 164 | # set non_packet clusters to NaN 165 | non_packet_cluster_mask = non_packet_mask.sum(axis=1).astype(bool) 166 | clusters[non_packet_cluster_mask] = self.non_packet_value 167 | # reshape back to batch form 168 | clusters = clusters.reshape(batch_size, -1) 169 | return clusters 170 | 171 | def inverse_transform(self, cluster_batch): 172 | flat_clusters = cluster_batch.flatten() 173 | non_packet_cluster_mask = flat_clusters == self.non_packet_value 174 | # assign temp cluster value 175 | flat_clusters[non_packet_cluster_mask] = 0 176 | outbound_cluster_mask = flat_clusters >= self.n_clusters 177 | n_outbound_clusters = outbound_cluster_mask.sum() 178 | if n_outbound_clusters > 0: 179 | logger.warning(f'found {n_outbound_clusters} outbounding cluster values') 180 | flat_clusters[outbound_cluster_mask] = 0 181 | reverted_packets = self.scaler.inverse_transform(self.kmeans.cluster_centers_[flat_clusters]) 182 | # make NaN non-packets 183 | reverted_packets[non_packet_cluster_mask] = np.nan 184 | reverted_packets = reverted_packets.reshape(-1, cluster_batch.shape[1] * 2) 185 | return reverted_packets 186 | 187 | 188 | def main(): 189 | parser = argparse.ArgumentParser() 190 | parser.add_argument( 191 | "-s", "--source", 192 | help="source folder with .csv files. the recommended way is to create a folder with all the training data, that" 193 | "was merged and shuffled beforehand (e.g. via pandas)" 194 | ) 195 | args = parser.parse_args() 196 | 197 | quantizer = PacketQuantizer(n_clusters=16384, flow_size=128) 198 | raw_csv_dir = pathlib.Path(args.source) 199 | 200 | flow_limit = 1_000_000 201 | for file_idx, csv in enumerate(raw_csv_dir.glob('*.csv')): 202 | logger.info(f'processing {csv}') 203 | reader = pd.read_csv(csv, chunksize=flow_limit, usecols=quantizer.raw_columns, dtype=np.float32) 204 | for batch, raw_packets in enumerate(reader): 205 | quantizer.fit(raw_packets) 206 | if batch % 10 == 0: 207 | quantizer.save_checkpoint( 208 | BASE_DIR / f'gpt_model/trained_quantizers/quantizer_2^14_{csv.stem}_{batch}') 209 | 210 | quantizer.save_checkpoint(BASE_DIR / f'gpt_model/trained_quantizers/quantizer_2^14_{csv.stem}_final') 211 | 212 | 213 | if __name__ == '__main__': 214 | main() 215 | -------------------------------------------------------------------------------- /gpt_model/generator/train_generator.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 20 | """ 21 | 22 | 23 | import logging 24 | import math 25 | import os 26 | from dataclasses import dataclass, field 27 | from typing import Optional 28 | 29 | from transformers import ( 30 | GPT2Config, 31 | GPT2LMHeadModel, 32 | HfArgumentParser, 33 | TrainingArguments, 34 | set_seed, AutoModelForCausalLM, Trainer, 35 | ) 36 | 37 | from gpt_model.generator.dataset import PretrainCollator, PretrainDataset, FinetuningDataset, PretrainDatasetWithClasses 38 | from gpt_model.tokenizer import PacketTokenizer 39 | from settings import FilePatterns 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | 44 | @dataclass 45 | class ModelArguments: 46 | """ 47 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 48 | """ 49 | 50 | model_name_or_path: Optional[str] = field( 51 | default=None, 52 | metadata={ 53 | "help": "The model checkpoint for weights initialization. " 54 | "Leave None if you want to train a model from scratch." 55 | }, 56 | ) 57 | quantizer_path: Optional[str] = field( 58 | default=None, 59 | metadata={ 60 | "help": "The quantizer checkpoint for weights initialization. Must be provided when the model" 61 | "is trained from scratch. Not used, when the model is initialized from checkpoint" 62 | }, 63 | ) 64 | model_type: Optional[str] = field( 65 | default=None, 66 | ) 67 | config_name: Optional[str] = field( 68 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 69 | ) 70 | tokenizer_name: Optional[str] = field( 71 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 72 | ) 73 | cache_dir: Optional[str] = field( 74 | default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"} 75 | ) 76 | 77 | 78 | @dataclass 79 | class DataTrainingArguments: 80 | """ 81 | Arguments pertaining to what data we are going to input our model for training and eval. 82 | """ 83 | 84 | train_data_file: Optional[str] = field( 85 | default=None, metadata={"help": "The input training data file (a text file)."} 86 | ) 87 | eval_data_file: Optional[str] = field( 88 | default=None, 89 | metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."}, 90 | ) 91 | block_size: int = field( 92 | default=-1, 93 | metadata={ 94 | "help": "Optional input sequence length after tokenization." 95 | "The training dataset will be truncated in block of this size for training." 96 | "Default to the model max input length for single sentence inputs (take into account special tokens)." 
97 | }, 98 | ) 99 | overwrite_cache: bool = field( 100 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 101 | ) 102 | finetune_on_class: str = field( 103 | default=None, 104 | metadata={"help": "specifies flow subset within the DF's target column to fine-tune the packet model on"} 105 | ) 106 | train_with_targets: bool = field( 107 | default=False, 108 | metadata={"help": "specifies whether to include flow label as the first special token or to use a generic BOS"} 109 | ) 110 | file_patterns_to_exclude: str = field( 111 | default='mawi', 112 | metadata={"help": "specifies which file patterns from the data folder to exclude, defaults to empty," 113 | " see settings.py::FilePatterns for used combinations"} 114 | ) 115 | 116 | 117 | def get_dataset(args: DataTrainingArguments, tokenizer: PacketTokenizer, evaluate=False): 118 | file_path = args.eval_data_file if evaluate else args.train_data_file 119 | logger.info(f'block_size is {args.block_size} and likely unused') 120 | file_patterns = getattr(FilePatterns, args.file_patterns_to_exclude) 121 | if args.finetune_on_class: 122 | return FinetuningDataset(tokenizer=tokenizer, 123 | dataset_path=file_path, 124 | target_class=args.finetune_on_class) 125 | 126 | if args.train_with_targets: 127 | return PretrainDatasetWithClasses(tokenizer=tokenizer, 128 | folder_path=file_path, 129 | filename_patterns_to_exclude=file_patterns) 130 | 131 | return PretrainDataset(tokenizer=tokenizer, folder_path=file_path, filename_patterns_to_exclude=file_patterns) 132 | 133 | 134 | def main(): 135 | # See all possible arguments in src/transformers/training_args.py 136 | # or by passing the --help flag to this script. 137 | # We now keep distinct sets of args, for a cleaner separation of concerns. 138 | 139 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) 140 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 141 | 142 | if data_args.eval_data_file is None and training_args.do_eval: 143 | raise ValueError( 144 | "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " 145 | "or remove the --do_eval argument." 146 | ) 147 | 148 | if ( 149 | os.path.exists(training_args.output_dir) 150 | and os.listdir(training_args.output_dir) 151 | and training_args.do_train 152 | and not training_args.overwrite_output_dir 153 | ): 154 | raise ValueError( 155 | f"Output directory ({training_args.output_dir}) already exists and is not empty. " 156 | f"Use --overwrite_output_dir to overcome." 
157 | ) 158 | 159 | if data_args.finetune_on_class and data_args.train_with_targets: 160 | raise ValueError("Pretraining with flow labels and fine-tuning on the class simultaneously not supported.") 161 | 162 | if not model_args.model_name_or_path and not model_args.quantizer_path: 163 | raise ValueError("Either model or quantizer checkpoint path must be specified") 164 | 165 | # Setup logging 166 | logging.basicConfig( 167 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 168 | datefmt="%m/%d/%Y %H:%M:%S", 169 | level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, 170 | ) 171 | logger.warning( 172 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 173 | training_args.local_rank, 174 | training_args.device, 175 | training_args.n_gpu, 176 | bool(training_args.local_rank != -1), 177 | training_args.fp16, 178 | ) 179 | logger.info("Training/evaluation parameters %s", training_args) 180 | 181 | # Set seed 182 | set_seed(training_args.seed) 183 | 184 | # Load pretrained model and tokenizer 185 | # 186 | # Distributed training: 187 | # The .from_pretrained methods guarantee that only one local process can concurrently 188 | # download model & vocab. 189 | 190 | if model_args.model_name_or_path: 191 | config = GPT2Config.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) 192 | else: 193 | config = GPT2Config.from_json_file(model_args.config_name) 194 | logger.warning("You are instantiating a new config instance from scratch.") 195 | 196 | if model_args.model_name_or_path: 197 | tokenizer = PacketTokenizer.from_pretrained(model_args.model_name_or_path) 198 | else: 199 | tokenizer = PacketTokenizer.from_pretrained(model_args.quantizer_path) 200 | 201 | if model_args.model_name_or_path: 202 | model = GPT2LMHeadModel.from_pretrained( 203 | model_args.model_name_or_path, 204 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 205 | config=config, 206 | cache_dir=model_args.cache_dir, 207 | ) 208 | else: 209 | logger.info("Training new model from scratch") 210 | model = AutoModelForCausalLM.from_config(config) 211 | 212 | if data_args.block_size <= 0: 213 | data_args.block_size = tokenizer.max_len 214 | # Our input block size will be the max possible for the model 215 | else: 216 | data_args.block_size = min(data_args.block_size, tokenizer.max_len) 217 | 218 | # Get datasets 219 | train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None 220 | eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None 221 | model.resize_token_embeddings(len(tokenizer)) 222 | print(model) 223 | 224 | # Initialize our Trainer 225 | trainer = Trainer( 226 | model=model, 227 | args=training_args, 228 | data_collator=PretrainCollator(tokenizer), 229 | train_dataset=train_dataset, 230 | eval_dataset=eval_dataset, 231 | prediction_loss_only=True, 232 | ) 233 | 234 | # Training 235 | if training_args.do_train: 236 | model_path = ( 237 | model_args.model_name_or_path 238 | if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) 239 | else None 240 | ) 241 | trainer.train(model_path=model_path) 242 | trainer.save_model() 243 | # For convenience, we also re-save the tokenizer to the same directory, 244 | # so that you can share your model easily on huggingface.co/models =) 245 | if trainer.is_world_master(): 246 | tokenizer.save_pretrained(training_args.output_dir) 247 | 248 | # Evaluation 249 
| results = {} 250 | if training_args.do_eval: 251 | logger.info("*** Evaluate ***") 252 | 253 | eval_output = trainer.evaluate() 254 | 255 | perplexity = math.exp(eval_output["eval_loss"]) 256 | result = {"perplexity": perplexity} 257 | 258 | output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") 259 | if trainer.is_world_master(): 260 | with open(output_eval_file, "w") as writer: 261 | logger.info("***** Eval results *****") 262 | for key in sorted(result.keys()): 263 | logger.info(" %s = %s", key, str(result[key])) 264 | writer.write("%s = %s\n" % (key, str(result[key]))) 265 | 266 | results.update(result) 267 | 268 | return results 269 | 270 | 271 | def _mp_fn(index): 272 | # For xla_spawn (TPUs) 273 | main() 274 | 275 | 276 | if __name__ == "__main__": 277 | main() 278 | -------------------------------------------------------------------------------- /sklearn_classifiers/featurizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pathlib 3 | from typing import Tuple, List 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from pandarallel import pandarallel 9 | from sklearn.compose import ColumnTransformer 10 | from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder 11 | from transformers import GPT2Model 12 | 13 | from evaluation_utils.modeling import flows_to_packets 14 | from flow_parsing.features import ( 15 | FEATURE_PREFIX, 16 | FEATURE_FUNCTIONS, 17 | CONTINUOUS_NAMES, 18 | generate_raw_feature_names, 19 | calc_parameter_stats 20 | ) 21 | from flow_parsing.utils import get_df_hash, save_dataset, read_dataset 22 | from gpt_model.tokenizer import PacketTokenizer 23 | from settings import TARGET_CLASS_COLUMN, DEFAULT_PACKET_LIMIT_PER_FLOW, CACHE_DIR 24 | from .utils import iterate_batch_indexes 25 | 26 | logger = logging.getLogger(__name__) 27 | pandarallel.initialize() 28 | 29 | 30 | class BaseFeaturizer: 31 | def __init__(self, packet_num, consider_iat_features=True, target_column=TARGET_CLASS_COLUMN): 32 | self.target_encoder = LabelEncoder() 33 | self.target_column = target_column 34 | 35 | self.raw_features: List[str] = generate_raw_feature_names( 36 | packet_num, 37 | base_features=('packet', 'iat') if consider_iat_features else ('packet',) 38 | ) 39 | 40 | def encode_targets(self, data: pd.DataFrame) -> np.ndarray: 41 | return self.target_encoder.transform(data[self.target_column]) 42 | 43 | def fit_target_encoder(self, data: pd.DataFrame) -> np.ndarray: 44 | return self.target_encoder.fit_transform(data[self.target_column]) 45 | 46 | def fit_transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 47 | raise NotImplementedError 48 | 49 | def transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 50 | raise NotImplementedError 51 | 52 | 53 | class TransformerFeatureExtractor(BaseFeaturizer): 54 | def __init__( 55 | self, 56 | transformer_pretrained_path, 57 | packet_num, 58 | mask_first_token=False, 59 | reinitialize=False, 60 | device=None 61 | ): 62 | super().__init__(packet_num, consider_iat_features=True) 63 | assert packet_num > 0, 'raw packet sequence length must be > 0' 64 | self._pretrained_path = pathlib.Path(transformer_pretrained_path) 65 | self.tokenizer = PacketTokenizer.from_pretrained(transformer_pretrained_path, 66 | flow_size=packet_num) 67 | # fall back to automatic device selection only when no device is given explicitly 68 | self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu') 69 | 70 | feature_extractor =
GPT2Model.from_pretrained(transformer_pretrained_path).to(self.device) 71 | self.reinitialize = reinitialize 72 | if self.reinitialize: 73 | logger.info('resetting model weights') 74 | feature_extractor.init_weights() 75 | self.feature_extractor = feature_extractor.eval() 76 | self.mask_first_token = mask_first_token 77 | 78 | def _get_transformer_features(self, df, batch_size=1024): 79 | filename = (get_df_hash(df) + 80 | self._pretrained_path.stem + 81 | ('_mask_first' if self.mask_first_token else '') + 82 | ('_reinitialize' if self.reinitialize else '')) 83 | tmp_path = CACHE_DIR / filename 84 | if tmp_path.is_file(): 85 | logger.info(f'found cached transformer features, loading {tmp_path}...') 86 | return read_dataset(tmp_path, True) 87 | 88 | logger.info(f'did not find cached transformer features at {tmp_path}, processing...') 89 | merged_tensor = np.empty((len(df), self.feature_extractor.config.hidden_size)) 90 | for start_idx, end_idx in iterate_batch_indexes(df, batch_size): 91 | raw_subset = df[self.raw_features].iloc[start_idx:end_idx] 92 | encoded_flows = self.tokenizer.batch_encode_packets(raw_subset).to(self.device) 93 | if self.mask_first_token: 94 | encoded_flows['attention_mask'][:, 0] = 0 95 | with torch.no_grad(): 96 | output = self.feature_extractor(**encoded_flows) 97 | output = output[0].to('cpu') # last hidden state (batch_size, sequence_length, hidden_size) 98 | # average over temporal dimension 99 | output = output.mean(dim=1).numpy() 100 | merged_tensor[start_idx:end_idx, :] = output 101 | 102 | save_dataset(pd.DataFrame(merged_tensor), tmp_path) 103 | return merged_tensor 104 | 105 | def fit_transform_encode(self, data): 106 | X_feat = self._get_transformer_features(data) 107 | y = self.fit_target_encoder(data) 108 | return X_feat, y 109 | 110 | def transform_encode(self, data): 111 | X_feat = self._get_transformer_features(data) 112 | y = self.encode_targets(data) 113 | return X_feat, y 114 | 115 | 116 | class Featurizer(BaseFeaturizer): 117 | """ 118 | Featurizer processes features from a pandas object by merging results from scalers, one-hot encoders 119 | and encodes target labels 120 | """ 121 | 122 | def __init__(self, 123 | packet_num, 124 | cont_features=None, 125 | categorical_features=None, 126 | consider_tcp_flags=True, 127 | consider_j3a=True, 128 | consider_raw_features=True, 129 | consider_iat_features=False, 130 | target_column=TARGET_CLASS_COLUMN): 131 | super().__init__(packet_num, consider_iat_features, target_column) 132 | 133 | self.column_converter = None 134 | if not consider_raw_features: 135 | self.raw_features = [] 136 | self.consider_iat_features = consider_iat_features 137 | self.consider_tcp_flags = consider_tcp_flags 138 | self.consider_j3a = consider_j3a 139 | 140 | self.categorical_features = ['ip_proto'] if categorical_features is None else categorical_features 141 | self.cont_features = self._get_cont_features() if cont_features is None else cont_features 142 | self.try_extract_derivative_features = cont_features is None 143 | 144 | if self.consider_j3a: 145 | self.categorical_features.extend(['ndpi_j3ac', 'ndpi_j3as']) 146 | 147 | if self.consider_tcp_flags: 148 | self.categorical_features.extend([f'{FEATURE_PREFIX.client}found_tcp_flags', 149 | f'{FEATURE_PREFIX.server}found_tcp_flags']) 150 | 151 | def _get_cont_features(self): 152 | # here we expect features to be consistent with flow_parser's 153 | base_features = ['bulk', 'packet'] 154 | if self.consider_iat_features: 155 | base_features.append('iat') 156 | 157 | 
cont_features = [] 158 | for prefix in [FEATURE_PREFIX.client, FEATURE_PREFIX.server]: 159 | for derivative in list(FEATURE_FUNCTIONS.keys()): 160 | for base in base_features: 161 | cont_features.append(prefix + base + derivative) 162 | return cont_features 163 | 164 | def _filter_non_existing_features(self, data: pd.DataFrame): 165 | data_features = set(data.columns) 166 | 167 | if set(self.raw_features) - data_features: 168 | found_features = list(set(self.raw_features) & data_features) 169 | logger.warning(f'skipping the following raw features: {set(self.raw_features) - data_features}') 170 | self.raw_features = found_features 171 | 172 | if set(self.cont_features) - data_features: 173 | found_features = list(set(self.cont_features) & data_features) 174 | logger.warning(f'skipping the following continuous features: {set(self.cont_features) - data_features}') 175 | self.cont_features = found_features 176 | 177 | if set(self.categorical_features) - data_features: 178 | found_features = list(set(self.categorical_features) & data_features) 179 | logger.warning(f'skipping the following categorical features: ' 180 | f'{set(self.categorical_features) - data_features}') 181 | self.categorical_features = found_features 182 | 183 | def _parse_derivatives_if_needed(self, data): 184 | if self.try_extract_derivative_features: 185 | data = self.calc_packets_stats_from_raw(data) 186 | return data 187 | 188 | def _fit_transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 189 | """ init transformers upon actual fitting to check for non-existing columns """ 190 | data = self._parse_derivatives_if_needed(data) 191 | 192 | self._filter_non_existing_features(data) 193 | feature_set = [] 194 | 195 | if self.cont_features: 196 | feature_set.append(("scaler", StandardScaler(), 197 | self.cont_features)) 198 | 199 | if self.categorical_features: 200 | feature_set.append(("one_hot", OneHotEncoder(handle_unknown='ignore', sparse=False), 201 | self.categorical_features)), 202 | 203 | if self.raw_features: 204 | # TODO replace with PacketScaler 205 | feature_set.append(('raw_features', StandardScaler(), self.raw_features)) 206 | 207 | self.column_converter = ColumnTransformer(feature_set) 208 | 209 | X_train = self.column_converter.fit_transform(data) 210 | y_train = self.fit_target_encoder(data) 211 | logger.info(f'{X_train.shape[0]} train samples with {self.n_classes} classes') 212 | return X_train, y_train 213 | 214 | def fit_transform(self, data: pd.DataFrame) -> np.ndarray: 215 | return self._fit_transform_encode(data)[0] 216 | 217 | def fit_transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 218 | return self._fit_transform_encode(data) 219 | 220 | def transform(self, data: pd.DataFrame) -> np.ndarray: 221 | data = self._parse_derivatives_if_needed(data) 222 | X_test = self.column_converter.transform(data) 223 | return X_test 224 | 225 | def transform_encode(self, data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]: 226 | data = self._parse_derivatives_if_needed(data) 227 | X_test = self.column_converter.transform(data) 228 | y_test = self.encode_targets(data) 229 | return X_test, y_test 230 | 231 | @property 232 | def n_classes(self): 233 | return len(self.target_encoder.classes_) 234 | 235 | def calc_packets_stats_from_raw(self, data: pd.DataFrame): 236 | def calc_flow_packet_stats(flow: np.ndarray): 237 | subflow = flow[:2 * DEFAULT_PACKET_LIMIT_PER_FLOW] 238 | packets = flows_to_packets(subflow) 239 | from_idx = packets[:, 0] > 0 240 | to_idx = packets[:, 
0] < 0 241 | 242 | stats = {} 243 | for direction, packet_idx in zip( 244 | (FEATURE_PREFIX.server, FEATURE_PREFIX.client), 245 | (from_idx, to_idx) 246 | ): 247 | try: 248 | ps_derivatives = calc_parameter_stats(np.abs(packets[packet_idx, 0]), direction, 'packet') 249 | stats.update(ps_derivatives) 250 | except ValueError: 251 | continue 252 | 253 | if self.consider_iat_features: 254 | try: 255 | iat_derivatives = calc_parameter_stats(packets[packet_idx, 1], direction, 'iat') 256 | stats.update(iat_derivatives) 257 | except ValueError: 258 | continue 259 | 260 | return stats 261 | 262 | if any(FEATURE_PREFIX.server + feature in data.columns for feature in CONTINUOUS_NAMES): 263 | logger.warning('packet stats has been found in dataframe, skipping calculation') 264 | return data 265 | 266 | tmp_path = CACHE_DIR / (get_df_hash(data) + '_iat_' + str(self.consider_iat_features)) 267 | if tmp_path.is_file(): 268 | logger.info('found cached dataset version, loading...') 269 | return read_dataset(tmp_path, True) 270 | 271 | raw = data.filter(regex='raw_') 272 | packet_stats = raw.parallel_apply(calc_flow_packet_stats, axis=1, raw=True, result_type='expand').tolist() 273 | packet_stats = pd.DataFrame(packet_stats).fillna(0) 274 | logger.info('calculated the derivatives from raw features') 275 | data = data.join(packet_stats) 276 | save_dataset(data, save_to=tmp_path) 277 | return data 278 | --------------------------------------------------------------------------------
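A minimal usage sketch for the Featurizer defined above. This is illustrative only: the CSV file names, the packet_num value and the LogisticRegression classifier are placeholder assumptions, and the input is assumed to be a parsed flow dataframe that contains the raw_* packet columns and the target class column expected by settings.TARGET_CLASS_COLUMN.

import pandas as pd
from sklearn.linear_model import LogisticRegression

from sklearn_classifiers.featurizer import Featurizer

# hypothetical, already parsed flow datasets with raw_* columns and a target column
train_df = pd.read_csv('train_flows.csv')
test_df = pd.read_csv('test_flows.csv')

# with no explicit continuous features given, derivative packet statistics are
# computed from the raw_* columns (and cached) before scaling / one-hot encoding
featurizer = Featurizer(packet_num=20)
X_train, y_train = featurizer.fit_transform_encode(train_df)
X_test, y_test = featurizer.transform_encode(test_df)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))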