├── .coveragerc ├── networkml ├── algorithms │ ├── __init__.py │ └── host_footprint.py ├── featurizers │ ├── __init__.py │ ├── funcs │ │ ├── __init__.py │ │ ├── packet.py │ │ ├── generic.py │ │ ├── flow.py │ │ └── host.py │ ├── features.py │ ├── README.md │ ├── main.py │ └── csv_to_features.py ├── helpers │ ├── __init__.py │ ├── gzipio.py │ ├── pandas_csv_importer.py │ └── results_output.py ├── parsers │ ├── __init__.py │ └── pcap_to_csv.py ├── __init__.py ├── trained_models │ ├── host_footprint_scaler.mod │ ├── host_footprint_le.json │ ├── label_assignments.json │ └── README.md ├── __main__.py └── NetworkML.py ├── netml-dev.yml ├── MAINTAINERS ├── renovate.json ├── tests ├── test_networkml.py ├── test_data │ ├── trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap │ ├── trace_ab34_2001-01-01_02_03-client-ip-1-2-3-4.pcap │ ├── trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap │ ├── bad_data_too_few_columns.csv │ ├── list_test.json │ ├── combined_two_roles.csv │ ├── combined_three_roles.csv │ └── combined.csv ├── test_funcs_packet.py ├── test_featurizers_features.py ├── test_e2e.sh ├── test_pcap_to_csv.py ├── test_featurizers_main.py ├── test_results_output.py ├── test_csv_to_features.py ├── test_funcs_host.py └── test_algorithms_host_footprint.py ├── .sastscanrc ├── .dockerignore ├── .github ├── workflows │ ├── make.yml │ ├── pypi.yaml │ ├── stale.yml │ ├── sl-scan.yaml │ ├── secrets-scan.yml │ ├── test.yml │ ├── docker-amd64.yml │ ├── docker.yml │ └── semgrep.yml └── ISSUE_TEMPLATE │ └── bug_report.md ├── Dockerfile.standalone ├── .pre-commit-config.yaml ├── Dockerfile.test ├── Dockerfile ├── Makefile ├── AUTHORS ├── pyproject.toml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.md ├── CONTRIBUTING.md ├── LICENSE └── CHANGELOG.md /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | -------------------------------------------------------------------------------- /networkml/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/featurizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | __version__ = version('networkml') 4 | -------------------------------------------------------------------------------- /netml-dev.yml: -------------------------------------------------------------------------------- 1 | name: netml-dev 2 | channels: 3 | - defaults 4 | dependencies: 5 | - pip=20.0 6 | - python=3.7.6 7 | -------------------------------------------------------------------------------- /MAINTAINERS: 
-------------------------------------------------------------------------------- 1 | Charlie Lewis 2 | Josh Bailey 3 | Ryan Ashley 4 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | "docker:enableMajor" 5 | ], 6 | "ignorePaths": [] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_networkml.py: -------------------------------------------------------------------------------- 1 | from networkml.NetworkML import NetworkML 2 | 3 | 4 | def test_smoke(): 5 | instance = NetworkML() 6 | -------------------------------------------------------------------------------- /networkml/trained_models/host_footprint_scaler.mod: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/networkml/trained_models/host_footprint_scaler.mod -------------------------------------------------------------------------------- /tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap -------------------------------------------------------------------------------- /tests/test_data/trace_ab34_2001-01-01_02_03-client-ip-1-2-3-4.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/tests/test_data/trace_ab34_2001-01-01_02_03-client-ip-1-2-3-4.pcap -------------------------------------------------------------------------------- /networkml/featurizers/funcs/packet.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | 3 | 4 | class Packet(Features): 5 | 6 | def all(self, rows): 7 | return rows 8 | -------------------------------------------------------------------------------- /tests/test_data/trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/tests/test_data/trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap -------------------------------------------------------------------------------- /.sastscanrc: -------------------------------------------------------------------------------- 1 | { 2 | "type": ["credscan", "python", "dockerfile", "yaml"] 3 | "SCAN_ANNOTATE_PR": true 4 | "PR_COMMENT_TEMPLATE": "## Scan Summary\n%(summary)s\n## Recommendation\n%(recommendation)s\n" 5 | } -------------------------------------------------------------------------------- /tests/test_funcs_packet.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.funcs.packet import Packet 2 | 3 | 4 | def test_packet_all(): 5 | instance = Packet() 6 | result = instance.all('foo') 7 | assert result == 'foo' 8 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | Dockerfile.base 3 | Dockerfile.test 4 | .gitignore 5 | AUTHORS 6 | CHANGELOG.md 7 | CODE_OF_CONDUCT.md 8 | CONTRIBUTING.md 9 | LICENSE 10 | MAINTAINERS.md 11 
| Makefile 12 | README.md 13 | -------------------------------------------------------------------------------- /networkml/trained_models/host_footprint_le.json: -------------------------------------------------------------------------------- 1 | {"classes": ["ActiveDirectoryController", "AdminServer", "AdminWorkstation", "BusinessWorkstation", "ConfluenceServer", "DevWorkstation", "DistrFileShare", "ExchangeServer", "GPULaptop", "GithubRepo", "PKIServer", "Printer"]} -------------------------------------------------------------------------------- /.github/workflows/make.yml: -------------------------------------------------------------------------------- 1 | name: make 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | make: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Test make 13 | run: | 14 | make test 15 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/generic.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | from networkml.helpers.pandas_csv_importer import WS_FIELDS 3 | 4 | 5 | class Generic(Features): 6 | 7 | def all(self, rows_f): 8 | return [{field: row.get(field, '') for field in WS_FIELDS} for row in rows_f()] 9 | -------------------------------------------------------------------------------- /Dockerfile.standalone: -------------------------------------------------------------------------------- 1 | FROM networkml 2 | LABEL maintainer="Ryan Ashley " 3 | 4 | ENTRYPOINT ["networkml"] 5 | CMD ["--trained_model=/trained_models/host_footprint.json", "--label_encoder=/trained_models/host_footprint_le.json", "--scaler=/trained_models/host_footprint_scaler.mod", "--operation", "predict", "/pcaps"] -------------------------------------------------------------------------------- /networkml/helpers/gzipio.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import io 3 | 4 | 5 | def gzip_reader(gzip_file): 6 | return io.TextIOWrapper(gzip.open(gzip_file, 'r'), newline='') # pytype: disable=wrong-arg-types 7 | 8 | 9 | def gzip_writer(gzip_file): 10 | return io.TextIOWrapper(gzip.open(gzip_file, 'w'), newline='', write_through=True) # pytype: disable=wrong-arg-types 11 | -------------------------------------------------------------------------------- /networkml/__main__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import humanize 3 | import logging 4 | import time 5 | 6 | 7 | def main(): 8 | from networkml.NetworkML import NetworkML 9 | start = time.time() 10 | NetworkML() 11 | end = time.time() 12 | elapsed = end - start 13 | human_elapsed = humanize.naturaldelta(datetime.timedelta(seconds=elapsed)) 14 | logging.info(f'Elapsed Time: {elapsed} seconds ({human_elapsed})') 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git@github.com:pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: autopep8-wrapper 8 | - id: check-case-conflict 9 | - id: check-json 10 | - id: pretty-format-json 11 | args: ['--autofix'] 12 | - id: double-quote-string-fixer 13 | - id: check-yaml 14 | - repo: git@github.com:asottile/reorder_python_imports 15 | rev: v2.6.0 
16 | hooks: 17 | - id: reorder-python-imports 18 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/flow.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | 3 | 4 | class Flow(Features): 5 | 6 | def default_tcp_5tuple(self, rows): 7 | fields = ['ip.src_host', 'ip.dst_host', 8 | 'tcp.dstport', 'tcp.srcport', 'frame.protocols'] 9 | return self.get_columns(fields, rows) 10 | 11 | def default_udp_5tuple(self, rows): 12 | fields = ['ip.src_host', 'ip.dst_host', 13 | 'udp.dstport', 'udp.srcport', 'frame.protocols'] 14 | return self.get_columns(fields, rows) 15 | -------------------------------------------------------------------------------- /networkml/trained_models/label_assignments.json: -------------------------------------------------------------------------------- 1 | { 2 | "ActiveDirectoryController": "Active Directory controller", 3 | "AdminServer": "Administrator server", 4 | "AdminWorkstation": "Administrator workstation", 5 | "BusinessWorkstation": "Business workstation", 6 | "ConfluenceServer": "Confluence server", 7 | "DevWorkstation": "Developer workstation", 8 | "DistrFileShare": "Distributed file share", 9 | "ExchangeServer": "Exchange server", 10 | "GPULaptop": "GPU laptop", 11 | "GithubRepo": "GitHub server", 12 | "PKIServer": "PKI server", 13 | "Printer": "Printer" 14 | } 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Tell us what is broken 4 | 5 | --- 6 | 7 | ### Description 8 | Describe your issue here. 9 | 10 | ### Environment 11 | - Git commit hash, and specific ML model (if any) 12 | - Docker version, if using 13 | - Operating system 14 | 15 | ### Steps to reproduce 16 | - step 1 17 | - step 2 18 | - ... 19 | 20 | ### Expected result 21 | What did you expect to happen? 22 | 23 | ### Actual result 24 | Describe the error message or unexpected result. 25 | 26 | Don't forget to upload any relevant data if this is required to trigger the issue. 27 | -------------------------------------------------------------------------------- /Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM networkml 2 | LABEL maintainer="Charlie Lewis " 3 | COPY . /networkml 4 | WORKDIR /networkml 5 | ENV PATH="${PATH}:/root/.local/bin" 6 | RUN apt-get update && apt-get install -y gcc rabbitmq-server && \ 7 | rm -rf /var/cache && \ 8 | poetry install -E test 9 | RUN jupyter nbconvert --ExecutePreprocessor.timeout=300 --to notebook --execute notebooks/networkml_exploration.ipynb --Application.log_level=DEBUG 10 | ENTRYPOINT ["pytest"] 11 | CMD ["-l", "-s", "-v", "-nauto", "--cov=tests/", "--cov=networkml/", "--cov-report", "term-missing", "-c", ".coveragerc", "--rabbitmq-port=5672"] 12 | -------------------------------------------------------------------------------- /tests/test_featurizers_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from networkml.featurizers.features import Features 3 | 4 | 5 | def test_quantile_nullable_int(): 6 | # TODO: https://github.com/pandas-dev/pandas/issues/42626 7 | # TODO: migrate all tests to unittest class/assert method style. 
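    # Guard for the pandas issue referenced above: quantile() on a nullable
    # Int64 column should still return a usable (truthy) value rather than NA.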
8 | df = pd.DataFrame([{'x': 1}, {'x': 0}], dtype=pd.Int64Dtype()) 9 | assert df['x'].quantile(0.75) # nosec 10 | 11 | 12 | def test_no_func(): 13 | instance = Features() 14 | instance.run_func('none') 15 | 16 | 17 | def test_get_columns(): 18 | instance = Features() 19 | assert instance.get_columns( 20 | ['foo', 'bar'], [{'foo': 1, 'baz': 3}]) == [{'foo': 1}] 21 | -------------------------------------------------------------------------------- /tests/test_e2e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | TMPDIR=$(mktemp -d) 6 | networkml ./tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap -o $TMPDIR --first_stage parser --final_stage algorithm --operation predict 7 | cat $TMPDIR/predict.json 8 | LABEL=$(jq < $TMPDIR/predict.json '.[0].data.mac_addresses["00:04:00:81:81:d0"]["classification"]["labels"][0]') 9 | if [[ "$LABEL" == "" ]] ; then 10 | echo FAIL: no result from prediction 11 | fi 12 | TD=$(pwd) 13 | docker build -f Dockerfile . -t iqtlabs/networkml:latest 14 | docker run -i -e RESULT_PATH=/tmp/predict.json -v $TD/tests/test_data:/pcaps iqtlabs/networkml:latest /pcaps/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap -o/tmp 15 | echo PASS 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM iqtlabs/rbqwrapper:v0.11.33 2 | LABEL maintainer="Charlie Lewis " 3 | 4 | ENV DEBIAN_FRONTEND "noninteractive" 5 | ENV PYTHONUNBUFFERED 1 6 | ENV PATH="${PATH}:/root/.local/bin" 7 | COPY pyproject.toml pyproject.toml 8 | 9 | # hadolint ignore=DL3008 10 | RUN apt-get update && apt-get install -y git python3-numpy python3-scipy gfortran libblas-dev liblapack-dev libxslt-dev libxml2-dev flex bison zlib1g-dev tshark curl && \ 11 | apt-get remove -y libblas-dev liblapack-dev libxslt-dev libxml2-dev gfortran flex bison zlib1g-dev && \ 12 | apt-get autoremove -y && \ 13 | rm -rf /var/cache/* && \ 14 | rm -rf /root/.cache/* && \ 15 | curl -sSL https://install.python-poetry.org | python3 - --version 1.1.15 && \ 16 | poetry config virtualenvs.create false && \ 17 | pip install -U pip 18 | 19 | COPY . 
/networkml 20 | WORKDIR /networkml 21 | RUN poetry install 22 | ENTRYPOINT ["/rbqwrapper.py", "networkml"] 23 | -------------------------------------------------------------------------------- /tests/test_data/bad_data_too_few_columns.csv: -------------------------------------------------------------------------------- 1 | min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 42,54,54,ARGH----BADDATA----ARGH!!!!!,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,.pcap.csv.gz 3 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | branches: main 6 | tags: 'v*' 7 | 8 | jobs: 9 | release: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Build package 16 | id: build_package 17 | run: | 18 | sudo apt-get update && \ 19 | sudo apt-get install -yq --no-install-recommends python3-pip python3.7 python3-dev python3-setuptools && \ 20 | pip3 install wheel && \ 21 | python3 setup.py sdist bdist_wheel --universal 22 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 23 | - name: Publish package 24 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 25 | uses: pypa/gh-action-pypi-publish@master 26 | with: 27 | user: ${{ secrets.PYPI_USERNAME }} 28 | password: ${{ secrets.PYPI_TOKEN }} 29 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * *' 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v6 11 | with: 12 | stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 13 | stale-pr-message: 'This PR is stale because it has been open 45 days with no activity. Remove stale label or comment or this will be closed in 10 days.' 14 | close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.' 15 | close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.' 
16 | days-before-issue-stale: 30 17 | days-before-pr-stale: 45 18 | days-before-issue-close: 5 19 | days-before-pr-close: 10 20 | exempt-issue-labels: 'dependencies' 21 | -------------------------------------------------------------------------------- /.github/workflows/sl-scan.yaml: -------------------------------------------------------------------------------- 1 | name: sl-scan 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | scan: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Cache vdb 10 | uses: actions/cache@v3 11 | with: 12 | path: | 13 | ${{ github.workspace }}/vdb 14 | key: ${{ runner.os }} 15 | - name: Scan 16 | uses: ShiftLeftSecurity/scan-action@master 17 | env: 18 | VDB_HOME: ${{ github.workspace }}/vdb 19 | WORKSPACE: https://github.com/${{ github.repository }}/blob/${{ github.sha }} 20 | SCAN_AUTO_BUILD: true 21 | SCAN_ANNOTATE_PR: true 22 | PR_COMMENT_TEMPLATE: "## Scan Summary\n%(summary)s\n## Recommendation\n%(recommendation)s\n" 23 | PR_COMMENT_BASIC_TEMPLATE: "## Scan Summary\n%(summary)s\n## Recommendation\n%(recommendation)s\n" 24 | with: 25 | output: reports 26 | type: "credscan, python, dockerfile, yaml" 27 | - name: Upload scan reports 28 | uses: actions/upload-artifact@v3.1.1 29 | with: 30 | name: shiftleft-scan-reports 31 | path: reports -------------------------------------------------------------------------------- /networkml/featurizers/features.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import ipaddress 3 | 4 | import numpy as np 5 | 6 | 7 | class Features(): 8 | 9 | def __init__(self): 10 | self.nonempty_generators = set() 11 | 12 | def run_func(self, func_name, *args): 13 | """ 14 | Helper function that will run the with for this func 15 | :param func_name: name of the function to run 16 | :param args: list of arguments to run with the function 17 | """ 18 | func = getattr(self, func_name, None) 19 | if not func: 20 | print("Error: Not a function name that's been defined") 21 | return False 22 | 23 | results = func(*args) 24 | return results 25 | 26 | @staticmethod 27 | def get_columns(fields, rows): 28 | # Terse but efficient. 
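        # e.g. get_columns(['foo', 'bar'], [{'foo': 1, 'baz': 3}]) -> [{'foo': 1}]
        # (requested fields that are missing from a row, or falsy, are dropped).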
29 | new_rows = [{field: row[field] 30 | for field in fields if row.get(field, None)} for row in rows] 31 | return new_rows 32 | 33 | @staticmethod 34 | def _pyshark_row_layers(rows_f): 35 | return filter(lambda row: 'layers' in row, rows_f()) 36 | -------------------------------------------------------------------------------- /tests/test_pcap_to_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import tempfile 5 | 6 | from networkml.parsers.pcap_to_csv import PCAPToCSV 7 | 8 | 9 | def test_PCAPToCSV_pyshark_packet(): 10 | with tempfile.TemporaryDirectory() as tmpdir: 11 | testdata = os.path.join(tmpdir, 'test_data') 12 | shutil.copytree('./tests/test_data', testdata) 13 | sys.argv = ['pcap_to_csv.py', '-c', '-e', 'pyshark', '-t', '2', '-v', 'DEBUG', '-o', os.path.join( 14 | tmpdir, 'networkml_test.pcap.csv.gz'), os.path.join(testdata, 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap')] 15 | instance = PCAPToCSV() 16 | instance.main() 17 | 18 | 19 | def test_ispcap(): 20 | a = 'foo.bad' 21 | answer = PCAPToCSV.ispcap(a) 22 | assert answer == False 23 | a = 'fooo.pcap' 24 | answer = PCAPToCSV.ispcap(a) 25 | assert answer == True 26 | a = 'fooo.pcapng' 27 | answer = PCAPToCSV.ispcap(a) 28 | assert answer == True 29 | a = 'fooo.dump' 30 | answer = PCAPToCSV.ispcap(a) 31 | assert answer == True 32 | a = 'fooo.capture' 33 | answer = PCAPToCSV.ispcap(a) 34 | assert answer == True 35 | -------------------------------------------------------------------------------- /tests/test_featurizers_main.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | from networkml.featurizers.main import Featurizer 3 | 4 | 5 | def test_no_path(): 6 | instance = Featurizer() 7 | result = instance.import_class('foo', None) 8 | assert result == None 9 | 10 | 11 | def test_run_all_funcs(): 12 | instance = Featurizer() 13 | 14 | class TestClass(Features): 15 | 16 | @staticmethod 17 | def test_feature1(rows, _srcmacid): 18 | for row in rows: 19 | return [{'test1': row['test1']}] 20 | 21 | @staticmethod 22 | def test_feature2(rows, _srcmacid): 23 | for row in rows: 24 | return [{'test2': row['test2']}] 25 | 26 | tc = TestClass() 27 | 28 | results = instance.run_all_funcs( 29 | [('test_feature1', 'test_feature1'), 30 | ('test_feature2', 'test_feature2')], [], 31 | [(tc, 'test_feature1'), (tc, 'test_feature2')], 32 | [{'test1': 99, 'test2': 123}], 33 | True) 34 | assert results == [[{'test1': 99}], [{'test2': 123}]] 35 | 36 | results = instance.run_all_funcs( 37 | [], [], [], [{'test1': 99, 'test2': 123}], True) 38 | assert results == [] 39 | -------------------------------------------------------------------------------- /.github/workflows/secrets-scan.yml: -------------------------------------------------------------------------------- 1 | name: secrets 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | scan: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: scan 11 | run: | 12 | export DEBIAN_FRONTEND=noninteractive && \ 13 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 14 | sudo apt-get update && \ 15 | python3 -m pip install --upgrade pip && \ 16 | pip3 install whispers && \ 17 | mkdir /home/runner/reports/ && \ 18 | whispers --severity BLOCKER,CRITICAL -o /home/runner/reports/whispers.json ${GITHUB_WORKSPACE} && \ 19 | echo "::set-output name=found-count::$(wc 
-l /home/runner/reports/whispers.json | cut -d' ' -f1)" 20 | - name: Fail if found 21 | if: steps.scan.outputs.found-count != 0 22 | uses: actions/github-script@v6 23 | with: 24 | script: | 25 | echo {{steps.scan.outputs.found-count}} && \ 26 | core.setFailed('Secrets found. Please check the uploaded report') 27 | - name: Upload scan reports 28 | uses: actions/upload-artifact@v3.1.1 29 | if: failure() 30 | with: 31 | name: whispers-report 32 | path: /home/runner/reports/whispers.json -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL:=/bin/bash 2 | PIP=$(shell which pip3 || echo "pip3") 3 | 4 | 5 | # CONDA_EXE must be set before running `make dev` or `rmdev` 6 | # export CONDA_EXE=$_CONDA_EXE 7 | CONDA_DEV=netml-dev 8 | CONDAROOT=$(shell ${CONDA_EXE} info --base)/bin 9 | CONDA_ENV=$(shell ${CONDA_EXE} info --base)/envs/$(CONDA_DEV)/bin 10 | 11 | run: predict 12 | predict: build predict_nobuild 13 | predict_nobuild: 14 | @echo 15 | @echo "Running Predict on PCAP files $(PCAP)" 16 | @docker run -it --rm -v "$(PCAP):/pcaps$(PCAP)" networkml /pcaps 17 | @echo 18 | train: build train_nobuild 19 | train_nobuild: 20 | @echo 21 | @echo "Running Train on PCAP files $(PCAP)" 22 | @docker run -it --rm -v "$(PCAP):/pcaps$(PCAP)" -v "$(PWD)/networkml/trained_models:/usr/local/lib/python3.8/site-packages/networkml/trained_models/" networkml -O train /pcaps 23 | @echo 24 | test: build 25 | @docker build -t networkml-test -f Dockerfile.test . 26 | @docker run --rm networkml-test 27 | build: 28 | @docker build -t networkml . 29 | install: 30 | poetry install 31 | 32 | dev: 33 | ${CONDA_EXE} env create --force -f $(CONDA_DEV).yml python=3.9 34 | source $(CONDAROOT)/activate $(CONDA_DEV) ; \ 35 | $(CONDA_ENV)/python3 -m pip install --upgrade pip ; \ 36 | $(CONDA_ENV)/pip3 install . 
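# Illustrative conda workflow (not enforced by this Makefile): set CONDA_EXE as
# noted above, then run `make dev` to create the netml-dev environment and
# `make rmdev` to delete it again.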
37 | 38 | rmdev: 39 | ${CONDA_EXE} env remove -y -n $(CONDA_DEV) 40 | -------------------------------------------------------------------------------- /tests/test_data/list_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": "mlp", 3 | "coefs_": [ 4 | [ 5 | [ 6 | 0.05 7 | ], 8 | [ 9 | 0.05 10 | ], 11 | [ 12 | 0.05 13 | ] 14 | ] 15 | ], 16 | "loss_": 0.00844733599407846, 17 | "intercepts_": [ 18 | [ 19 | 0.05 20 | ] 21 | ], 22 | "n_iter_": 61, 23 | "n_layers_": 5, 24 | "n_outputs_": 12, 25 | "out_activation_": "softmax", 26 | "_label_binarizer": { 27 | "neg_label": 0, 28 | "pos_label": 1, 29 | "sparse_output": false, 30 | "y_type_": "multiclass", 31 | "sparse_input_": false, 32 | "classes_": [ 33 | 0 34 | ] 35 | }, 36 | "params": { 37 | "activation": "relu", 38 | "alpha": 0.0001, 39 | "batch_size": "auto", 40 | "beta_1": 0.9, 41 | "beta_2": 0.999, 42 | "early_stopping": false, 43 | "epsilon": 1e-08, 44 | "hidden_layer_sizes": [ 45 | 64, 46 | 32, 47 | 32 48 | ], 49 | "learning_rate": "constant", 50 | "learning_rate_init": 0.001, 51 | "max_fun": 15000, 52 | "max_iter": 200, 53 | "momentum": 0.9, 54 | "n_iter_no_change": 10, 55 | "nesterovs_momentum": true, 56 | "power_t": 0.5, 57 | "random_state": null, 58 | "shuffle": true, 59 | "solver": "adam", 60 | "tol": 0.0001, 61 | "validation_fraction": 0.1, 62 | "verbose": false, 63 | "warm_start": false 64 | }, 65 | "features": [ 66 | "foo", 67 | "bar", 68 | "baz" 69 | ], 70 | "classes_": [ 71 | 0 72 | ] 73 | } -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Ahmad Asmar 2 | Alice 3 | Alice Chang 4 | Alice Chang 5 | CStephenson970 6 | Charlie Lewis 7 | Cory Stephenson 8 | Greg <22061293+gregs5@users.noreply.github.com> 9 | Jackson 10 | Jason Separovic 11 | Jason Separovic 12 | John Meyers 13 | John Meyers 14 | John Speed Meyers <54914994+jspeed-meyers@users.noreply.github.com> 15 | Josh Bailey 16 | Josh Bailey 17 | Kieran Baker 18 | Lewis, Charlie 19 | Mohammed Al-Shaboti 20 | Mohammed Alshaboti 21 | Renovate Bot 22 | Ryan Ashley 23 | Stephenson, Cory 24 | Todd Stavish 25 | achang 26 | alshaboti 27 | cglewis 28 | jaiken06 29 | jspeed-meyers 30 | karllab41 31 | lilchurro 32 | lostminty (lostminty@users.noreply.github.com) 33 | pgamble-admin 34 | pyup-bot 35 | pyup.io bot 36 | sneakyoctopus12 <56274120+sneakyoctopus12@users.noreply.github.com> 37 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ubuntu-20.04 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: test 13 | run: | 14 | export DEBIAN_FRONTEND=noninteractive && \ 15 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 16 | sudo apt-get update && \ 17 | sudo apt-get install -yq --no-install-recommends python3-wheel python3.9 python3.9-dev python3-setuptools dialog apt-utils tshark jq curl && \ 18 | sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ 19 | sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 && \ 20 | python3 -m pip install --upgrade pip && \ 21 | curl -sSL https://install.python-poetry.org | python3 - --version 1.1.15 && \ 22 | export 
PATH=/home/runner/.local/bin:$PATH && \ 23 | poetry config virtualenvs.create false && \ 24 | poetry install -E test && \ 25 | PYTHONPATH=. pytype . && \ 26 | PYTHONPATH=. pytest -l -s -n auto -v --cov=tests/ --cov=networkml/ --cov-report term-missing -c .coveragerc --rabbitmq-port=5672 && coverage report && coverage xml && \ 27 | jupyter nbconvert --ExecutePreprocessor.timeout=300 --to notebook --execute notebooks/networkml_exploration.ipynb && \ 28 | ./tests/test_e2e.sh 29 | - name: Upload coverage to Codecov 30 | uses: codecov/codecov-action@v3.1.1 31 | if: github.repository == 'iqtlabs/networkml' 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "networkml" 3 | version = "0.7.0" 4 | description = "Machine Learning model for use in determining device classes based on packet headers." 5 | authors = ["Ryan "] 6 | license = "Apache 2.0" 7 | include = [ 8 | "networkml/trained_models/*.*", 9 | "networkml/*.py", 10 | ] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.8 <3.11" 14 | cython = "0.29.32" 15 | humanize = "4.4.0" 16 | joblib = "1.2.0" 17 | netaddr = "0.8.0" 18 | numpy = "1.23.5" 19 | pandas = "1.5.2" 20 | pyshark = "0.5.3" 21 | scikit-learn = "1.2.0" 22 | scipy = "1.9.3" 23 | 24 | #optional dependencies 25 | pygments = { version = "2.13.0", optional = true } 26 | pytest = { version = "7.2.0", optional = true } 27 | pytest-cov = { version = "4.0.0", optional = true } 28 | pytest-xdist = { version = "3.1.0", optional = true } 29 | pytest-rabbitmq = { version = "2.2.1", optional = true } 30 | nest_asyncio = { version = "1.5.6", optional = true } 31 | MarkupSafe = { version = "2.1.1", optional = true } 32 | notebook = { version = "6.5.2", optional = true } 33 | pytype = { version = "2022.11.29", optional = true } 34 | 35 | [tool.poetry.extras] 36 | test = [ 37 | "pygments", 38 | "pytest", 39 | "pytest-cov", 40 | "pytest-xdist", 41 | "pytest-rabbitmq", 42 | "nest_asyncio", 43 | "MarkupSafe", 44 | "notebook", 45 | "pytype", 46 | ] 47 | 48 | [tool.poetry.dev-dependencies] 49 | 50 | [build-system] 51 | requires = ["setuptools", "poetry-core>=1.0.0"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | [tool.poetry.scripts] 55 | networkml = 'networkml.__main__:main' 56 | -------------------------------------------------------------------------------- /.github/workflows/docker-amd64.yml: -------------------------------------------------------------------------------- 1 | name: no-arm 2 | 3 | on: 4 | push: 5 | branches: main 6 | tags: 'v*' 7 | 8 | jobs: 9 | buildx: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Get the version 16 | id: get_version 17 | run: echo ::set-output name=VERSION::$(echo $GITHUB_REF | cut -d / -f 3) 18 | - name: Change for main 19 | id: change_version 20 | run: if [ "${{ steps.get_version.outputs.VERSION }}" == "main" ]; then echo ::set-output name=VERSION::latest; else echo ::set-output name=VERSION::${{ steps.get_version.outputs.VERSION }}; fi 21 | - name: Set up qemu 22 | uses: docker/setup-qemu-action@v2 23 | with: 24 | platforms: all 25 | - name: Set up Docker Buildx 26 | id: buildx 27 | uses: docker/setup-buildx-action@v2 28 | with: 29 | version: latest 30 | - name: Docker Login 31 | env: 32 | DOCKER_PASSWORD: ${{ secrets.DOCKER_TOKEN }} 33 | run: | 34 | echo "${DOCKER_PASSWORD}" | docker login --username "${{ secrets.DOCKER_USERNAME }}" 
--password-stdin 35 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 36 | - name: Build and push platforms for final image 37 | env: 38 | DOCKER_CLI_EXPERIMENTAL: enabled 39 | run: | 40 | docker buildx build \ 41 | --platform linux/amd64 \ 42 | --push \ 43 | -t iqtlabs/networkml:${{ steps.change_version.outputs.VERSION }} . 44 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 45 | -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: buildx 2 | 3 | on: 4 | push: 5 | branches: main 6 | tags: 'v*' 7 | 8 | jobs: 9 | buildx: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Get the version 16 | id: get_version 17 | run: echo ::set-output name=VERSION::$(echo $GITHUB_REF | cut -d / -f 3) 18 | - name: Change for main 19 | id: change_version 20 | run: if [ "${{ steps.get_version.outputs.VERSION }}" == "main" ]; then echo ::set-output name=VERSION::latest; else echo ::set-output name=VERSION::${{ steps.get_version.outputs.VERSION }}; fi 21 | - name: Set up qemu 22 | uses: docker/setup-qemu-action@v2 23 | with: 24 | platforms: all 25 | - name: Set up Docker Buildx 26 | id: buildx 27 | uses: docker/setup-buildx-action@v2 28 | with: 29 | version: latest 30 | - name: Docker Login 31 | env: 32 | DOCKER_PASSWORD: ${{ secrets.DOCKER_TOKEN }} 33 | run: | 34 | echo "${DOCKER_PASSWORD}" | docker login --username "${{ secrets.DOCKER_USERNAME }}" --password-stdin 35 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 36 | - name: Build and push platforms for final image 37 | env: 38 | DOCKER_CLI_EXPERIMENTAL: enabled 39 | run: | 40 | docker buildx build \ 41 | --platform linux/amd64,linux/arm64 \ 42 | --push \ 43 | -t iqtlabs/networkml:${{ steps.change_version.outputs.VERSION }} . 44 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | pytype_output/ 4 | .pytype 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Editor files 12 | *~ 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # vim temp files 103 | *.swp 104 | *.swo 105 | 106 | .mypy_cache 107 | 108 | # IntelliJ IDE files 109 | .idea/ 110 | faucet.iml 111 | *.bak 112 | 113 | *gz 114 | erl_crash.dump 115 | 116 | # converted notebooks 117 | *.nbconvert.ipynb 118 | 119 | #scan stuff 120 | reports/ 121 | 122 | semgrep.out -------------------------------------------------------------------------------- /tests/test_data/combined_two_roles.csv: -------------------------------------------------------------------------------- 1 | host_key,min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 0e:00:00:00:00:01,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 3 | 0e:00:00:00:00:02,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 4 | 0e:00:00:00:00:01,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 5 | 
0e:00:00:00:00:02,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 6 | -------------------------------------------------------------------------------- /tests/test_results_output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | import os 5 | 6 | from networkml.helpers.results_output import ResultsOutput 7 | 8 | 9 | def test_parse_pcap_name(): 10 | logger = logging.getLogger(__name__) 11 | instance = ResultsOutput(logger, 'uid', '/path') 12 | assert instance.parse_pcap_name('notaposeidontracefile.pcap') == ( 13 | 'notaposeidontracefile', None) 14 | assert instance.parse_pcap_name('trace_but_invalid') == ( 15 | None, None) 16 | assert instance.parse_pcap_name('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap') == ( 17 | 'ab12', 'ip-1-2-3-4') 18 | assert instance.parse_pcap_name('trace_8adfcc152604e75d37a1a2ac62124ae859105239_2020-01-21_21_31_44-client-ip-17-253-66-125-17-253-66-125-192-168-3-2-udp-frame-ntp-wsshort-ip-eth-port-123.pcap') == ( 19 | '8adfcc152604e75d37a1a2ac62124ae859105239', 'ip-17-253-66-125-17-253-66-125-192-168-3-2-udp-frame-ntp-wsshort-ip-eth-port-123') 20 | assert instance.parse_pcap_name('trace_8198b3326dcb032a2bfbb8030339ff2159b9993d_2020-02-19_03_16_21.pcap') == ( 21 | '8198b3326dcb032a2bfbb8030339ff2159b9993d', None) 22 | assert instance.parse_pcap_name('trace_ab12_2001-01-01_02_03-miscellaneous-stuff.pcap') == ( 23 | None, None) 24 | 25 | def test_output_from_result_json(): 26 | logger = logging.getLogger(__name__) 27 | instance = ResultsOutput(logger, 'testver', 'path/') 28 | result_json = { 29 | '/dir/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap': [{ 30 | 'top_role': 'foo', 31 | 'source_ip': '1.2.3.4', 32 | 'source_mac': '01:02:03:04:05:06', 33 | 'timestamp': 999, 34 | 'role_list': [('bsomething', 0.7), ('asomething', 0.6), ('csomething', 0.5)]}], 35 | } 36 | reformatted_result_json_file = os.devnull 37 | reformatted_json = instance.output_from_result_json(json.dumps(result_json), reformatted_result_json_file) 38 | version = reformatted_json[0]['version'] 39 | assert reformatted_json == [{'file_path': 'path/', 'id': '', 'results': {'tool': 'networkml', 'version': version}, 'type': 'metadata', 'version': version, 'tool': 'networkml', 'data': {'mac_addresses': {'01:02:03:04:05:06': {'uid': 'testver', 'file_path': 'path/', 'pcap': '', 'pcap_key': '', 'pcap_labels': None, 'timestamp': 999, 'source_ip': '1.2.3.4', 'decisions': {'investigate': False}, 'classification': {'labels': ['bsomething', 'asomething', 'csomething'], 'confidences': (0.7, 0.6, 0.5)}}}}}, {'data': '', 'file_path': 'path/', 'id': '', 'results': {'tool': 'networkml', 'version': version}, 'tool': 'networkml', 'type': 'metadata'}] # nosec - fine in a test. 40 | -------------------------------------------------------------------------------- /networkml/trained_models/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Models in NetworkML 2 | 3 | ## Overview 4 | 5 | NetworkML performs role identification via supervised machine learning. 
Although our 6 | internal analysis compared decision trees, random forests, and neural networks, the public 7 | networkML codebase only includes a neural network (or "deep learning") model. 8 | 9 | ### Neural Network 10 | Neural networks can be used for supervised machine learning to match patterns in network 11 | traffic with the functional role of a device. For further information on neural networks, 12 | see Francois Chollet's "Deep Learning with Python" published by Manning 13 | Publications in 2018, especially pages 3-116. For more general information on machine 14 | learning and information security or cybersecurity, see Clarence 15 | Chio and David Freeman, "Machine Learning & Security," published by O'Reilly 16 | in 2018. The neural network model in networkML uses the Python package scikit-learn. Using 17 | TensorFlow or a similar neural network-specific machine learning package was not necessary 18 | to achieve high levels of performance in our in-house testing. 19 | 20 | ### host_footprint model 21 | 22 | Currently one model is defined, depending upon the Host() featurizer. 23 | 24 | Note: The stored model was trained using all available host-level features. For a full description of the statistical features used to build a model in NetworkML, see the README in the featurizers directory. 25 | 26 | #### Training and predicting 27 | 28 | The model can be regenerated by: 29 | 30 | ~~~~ 31 | networkml --kfolds=5 --first_stage=algorithm --trained_model=networkml/trained_models/host_footprint.json --label_encoder=networkml/trained_models/host_footprint_le.json --scaler=networkml/trained_models/host_footprint_scaler.mod --operation train [--eval_data=/tmp/test_host.csv] /tmp/train_host.csv 32 | ~~~~ 33 | 34 | Where train_host.csv is the combined CSV output of the featurizer, used to train the model. 35 | 36 | Optionally, if a --eval_data CSV is provided, this will be tested against the trained model. 37 | This test CSV would typically be the result of a 20/80 split of the original featurizer data 38 | (20% as test_host.csv, 80% as train_host.csv). 39 | 40 | 41 | You can also evaluate an existing trained model without retraining: 42 | 43 | ~~~~ 44 | networkml --first_stage=algorithm --final_stage=algorithm --operation eval ~/tmp/test_host.csv 45 | ~~~~ 46 | 47 | 48 | A pcap prediction against an existing model in the default location can be done by: 49 | 50 | ~~~~ 51 | networkml --kfolds=5 --first_stage=parser -o /tmp/out --operation predict /tmp/test.pcap 52 | ~~~~ 53 | 54 | The output directory (e.g. /tmp/out) must already exist and be empty. 55 | 56 | You can also do a prediction against featurizer output: 57 | 58 | ~~~~ 59 | networkml --kfolds=5 --first_stage=algorithm -o /tmp/out --operation predict /tmp/combined.csv 60 | ~~~~ 61 | 62 | #### Using your own model 63 | 64 | You can also use your own model. Specify --trained_model, --label_encoder, and --scaler for 65 | training and predicting.
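The two JSON files shipped alongside the model can be inspected directly. The minimal sketch below (not part of the networkml codebase; it assumes it is run from the repository root) reads host_footprint_le.json, which stores the label encoder's class names, and label_assignments.json, which maps each class name to a human-readable role description:

~~~~
import json

# Class names known to the trained label encoder.
with open('networkml/trained_models/host_footprint_le.json') as f:
    classes = json.load(f)['classes']

# Human-readable description for each class name.
with open('networkml/trained_models/label_assignments.json') as f:
    assignments = json.load(f)

for class_name in classes:
    print(class_name, '->', assignments.get(class_name, class_name))
# e.g. "GithubRepo -> GitHub server", "ActiveDirectoryController -> Active Directory controller"
~~~~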
66 | ======= 67 | -------------------------------------------------------------------------------- /tests/test_data/combined_three_roles.csv: -------------------------------------------------------------------------------- 1 | host_key,min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 0e:00:00:00:00:01,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 3 | 0e:00:00:00:00:02,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 4 | 0e:00:00:00:00:03,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 5 | 0e:00:00:00:00:04,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 6 | 0e:00:00:00:00:05,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 7 | 0e:00:00:00:00:06,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 8 | -------------------------------------------------------------------------------- /tests/test_csv_to_features.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import tempfile 5 | 6 | from networkml.featurizers.csv_to_features import CSVToFeatures 7 | from networkml.parsers.pcap_to_csv import PCAPToCSV 8 | 9 | COMMON_ARGS = ['-t', '2', '-v', 'DEBUG'] 10 | 11 | 12 | def run_pcap_to_csv(in_path, out_path, engine='tshark'): 13 | sys.argv = ['pcap_to_csv.py'] + COMMON_ARGS + ['-e', engine, '-o', out_path, in_path] 14 | instance = PCAPToCSV() 15 | instance.main() 16 | 17 | 18 | def run_csv_to_features(in_path, out_path=None, featurizer='host_tshark', otherflag=None): 19 | args = ['csv_to_features.py'] + COMMON_ARGS + ['-g', featurizer] 20 | if otherflag: 21 | args.append(otherflag) 22 | if out_path: 23 | args.extend(['-o', out_path]) 24 | args.append(in_path) 25 | sys.argv = args 26 | instance = CSVToFeatures() 27 | instance.main() 28 | 29 | 30 | def run_pcap_to_features(pcap=None, outdir=False): 31 | with tempfile.TemporaryDirectory() as tmpdir: 32 | testdata = os.path.join(tmpdir, 'test_data') 33 | shutil.copytree('./tests/test_data', testdata) 34 | if pcap: 35 | pcap_path = os.path.join(testdata, pcap) 36 | pcap_csv_path = os.path.join(tmpdir, pcap + '.csv.gz') 37 | else: 38 | testsdir = os.path.join(tmpdir, 'tests') 39 | shutil.copytree('tests', testsdir) 40 | pcap_path = testsdir 41 | pcap_csv_path = os.path.join(tmpdir, 'pcap.csv.gz') 42 | run_pcap_to_csv(pcap_path, pcap_csv_path) 43 | if outdir: 44 | outpath = tmpdir 45 | else: 46 | outpath = os.path.join(tmpdir, 'combined.csv.gz') 47 | run_csv_to_features(pcap_csv_path, outpath) 48 | 49 | 50 | def test_CSVToFeatures(): 51 | run_pcap_to_features(pcap='trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap', outdir=False) 52 | 53 | 54 | def test_CSVToFeatures_no_output(): 55 | run_pcap_to_features(pcap='trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap', outdir=False) 56 | 57 | 58 | def test_CSVToFeatures_input_dir_output_file(): 59 | run_pcap_to_features(pcap=None, outdir=False) 60 | 61 | 62 | def test_CSVToFeatures_input_dir_output_dir(): 63 | run_pcap_to_features(pcap=None, outdir=True) 64 | 65 | 66 | def test_CSVToFeatures_no_group_or_func(): 67 | with tempfile.TemporaryDirectory() as tmpdir: 68 | testdata = os.path.join(tmpdir, 'test_data') 69 | shutil.copytree('./tests/test_data', testdata) 70 | trace = os.path.join(testdata, 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz') 71 | run_csv_to_features(trace, featurizer='') 72 | 73 | 74 | def test_CSVToFeatures_host(): 75 | with tempfile.TemporaryDirectory() as tmpdir: 76 | testdata = os.path.join(tmpdir, 'test_data') 77 | shutil.copytree('./tests/test_data', testdata) 78 | trace = os.path.join(testdata, 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz') 79 | for srcidflag in ('--srcmacid', '--no-srcmacid'): 80 | for featurizer in ('sessionhost_tshark', 'host_tshark'): 81 | run_csv_to_features(trace, featurizer=featurizer, otherflag=srcidflag) 82 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of 
experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at clewis@iqt.org. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /networkml/featurizers/README.md: -------------------------------------------------------------------------------- 1 | # Host-Related Features in NetworkML Models 2 | 3 | Machine learning models require inputs (or "features" in the language of machine learning practitioners.) NetworkML, a machine learning model that operates on network traffic, is no different. 
This readme describes the features that networkML can create and that can therefore be included in NetworkML's models. All features can be created either for each host present, producing a numeric representation of all the traffic flowing into and out of that host, or for each session, describing a particular data exchange between two hosts defined by the 5-tuple of source and destination IPs, source and destination ports, and protocol. This readme focuses on the host-related features. These features were devised with enterprise networks as the likely site of NetworkML deployment. Additionally, these features cover both IP and non-IP traffic at layer 2, which distinguishes networkML from many other network traffic analysis models. 4 | 5 | A key at the bottom of this page explains the symbols associated with each feature set. 6 | 7 | ## IP Protocol-Specific Features 8 | 9 | IPv4 (b) 10 | 11 | IPv6 (b) 12 | 13 | Well-known Ethernet protocols (b) [Note: Each flag is assigned an individual boolean vector. See list of ethernet protocols at the bottom of the page.] 14 | 15 | TCP ports (i/o) \(P\) (P/NP) (b) 16 | 17 | UDP ports (i/o) (P/NP) (b) 18 | 19 | TCP flags (i/o) (b) [Note: Each flag is assigned an individual boolean vector.] 20 | 21 | IP flags (i/o) (b) 22 | 23 | IPX (b) 24 | 25 | Both private IP (b) 26 | 27 | IPv4 multicast (b) 28 | 29 | IP differentiated services (i/o) (b) 30 | 31 | Well-known IP protocols (b) [Note: Each protocol is assigned an individual boolean vector. See list of IP protocols at the bottom of the page.] 32 | 33 | ## Non-IP Features 34 | 35 | VLAN ID (b) 36 | 37 | Non-IP protocol (b) 38 | 39 | ## Packet Timing-related Features 40 | Interarrival time (D) (S) \(r\) 41 | 42 | ## Packet Size-related Features 43 | Frame length (D) (S) \(r\) 44 | 45 | ## Feature Key 46 | **Directionality** 47 | Indicates that there are versions of a feature for different traffic directions 48 | 49 | (i) = incoming packets 50 | 51 | (o) = outgoing packets 52 | 53 | (bi) = bidirectional flow 54 | 55 | (D) = i + o + bi 56 | 57 | **Statistics** 58 | Indicates that there are versions of a feature for each statistic 59 | 60 | (S) Statistics = (min, 25th percentile, median, 75th percentile, max, mean, variance, count, sum) 61 | 62 | **Well Known Ports** 63 | Indicates that features are port-specific 64 | 65 | \(P\) Private ports = (22, 23, 25, 53, 67, 68, 69, 80, 88, 110, 123, 137, 138, 139, 143, 161, 443, 631, other) 66 | 67 | (NP) Non-private ports = (1900, 2375, 5222, 5349, 5353, 5354, 5349, 5357, 6653, other) 68 | 69 | **Type of values** 70 | Indicates the acceptable values for a feature 71 | 72 | (b) = binary feature (0, 1) 73 | 74 | \(r\) = real number feature (-inf, +inf) 75 | 76 | Example: Frame length (D) (S) \(r\) indicates that there are versions of this feature for incoming packets, outgoing packets, and bidirectional flows, and also sub-versions for each different statistic. In total, there are 27 features. 77 | 78 | Ethernet protocols: Well-known ethernet protocols include ethernet, IPv6, IP, TCP, ARP, ICMP, GRE, ESP. 79 | IP protocols: Well-known IP protocols include TCP, UDP, ICMP, ICMPv6, ARP, and an other category.
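
To make the feature key concrete, here is a small illustrative sketch (not part of networkML itself) that expands `Frame length (D) (S) \(r\)` into its 27 column names, following the naming convention visible in `tests/test_data/combined.csv` (statistic prefix, base field, and an `_in`/`_out` suffix, with no suffix meaning bidirectional):

```python
# Illustrative only: expand "Frame length (D) (S) (r)" into 27 column names.
STATS = ('min', '25q', 'median', '75q', 'max', 'average', 'variance', 'count', 'total')
DIRECTIONS = ('_in', '_out', '')  # incoming, outgoing, '' = bidirectional

frame_len_features = [f'{stat}_frame_len{direction}'
                      for direction in DIRECTIONS
                      for stat in STATS]
assert len(frame_len_features) == 27  # 9 statistics x 3 directions
```

The interarrival time feature follows the same pattern (base field `time_delta` in the test data), so each (D) (S) \(r\) entry above expands into 27 columns of the generated feature CSV.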
80 | -------------------------------------------------------------------------------- /tests/test_data/combined.csv: -------------------------------------------------------------------------------- 1 | min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 3 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 4 | 42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 5 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 6 | 42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 7 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 8 | 
42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,workstation-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 9 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,workstation-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 10 | -------------------------------------------------------------------------------- /networkml/featurizers/main.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import inspect 3 | import os 4 | import sys 5 | import time 6 | 7 | from networkml.featurizers.features import Features 8 | 9 | # TODO move print statements to logging 10 | 11 | 12 | class Featurizer(): 13 | 14 | def import_class(self, path, classes): 15 | """ 16 | Imports classs from an external directory at runtime. Imported functions will be added 17 | to classes 18 | :param path: path where the modules reside 19 | :param classes: existing class instances 20 | :return list of newly add class instances 21 | """ 22 | # make sure path exists 23 | if os.path.isdir(path) is False: 24 | print('Error: path {} does not exist'.format(path)) 25 | return classes 26 | 27 | # add the path to the PYTHONPATH 28 | sys.path.append(path) 29 | 30 | # acquire list of files in the path 31 | mod_list = os.listdir(path) 32 | 33 | for f in mod_list: 34 | 35 | # continue if it is not a python file 36 | if f[-3:] != '.py': 37 | continue 38 | 39 | # get module name by removing extension 40 | mod_name = os.path.basename(f)[:-3] 41 | 42 | # import the module 43 | module = __import__(mod_name, locals(), globals()) 44 | for name, cls in inspect.getmembers(module): 45 | if inspect.isclass(cls) and name != 'Features': 46 | instance = cls() 47 | if isinstance(instance, Features): 48 | # append an instance of the class to classes 49 | classes.append((instance, name)) 50 | print(f'Importing class: {name}') 51 | 52 | return classes 53 | 54 | def run_all_funcs(self, functions_orig, groups_orig, classes_orig, rows_f, parsed_args): 55 | functions = copy.deepcopy(functions_orig) 56 | groups = copy.deepcopy(groups_orig) 57 | classes = copy.deepcopy(classes_orig) 58 | feature_rows = [] 59 | run_methods = [] 60 | 61 | def verify_feature_row(method, feature_row): 62 | assert isinstance(feature_row, list), 'method %s returned non list: %s' % ( 63 | method, feature_row) 64 | non_dicts = {x for x in feature_row if not isinstance(x, dict)} 65 | assert not non_dicts, 'method %s returned something not a dict: %s' % ( 66 | method, non_dicts) 67 | 68 | def run_func(method, func, descr): 69 | print(f'running {descr}...', end='') 70 | start_time = time.time() 71 | feature_row = func() 72 | elapsed_time = int(time.time() - start_time) 73 | print(f'{elapsed_time}s') 74 | verify_feature_row(method, feature_row) 75 | return feature_row 76 | 77 | # attempt to group methods together based on same field name for more cache hits. 
78 | def method_key(method): 79 | return ''.join(reversed(method.strip('_in').strip('_out'))) 80 | 81 | for f in classes: 82 | if groups: 83 | methods = filter( 84 | lambda funcname: funcname.startswith(groups), dir(f[0])) 85 | for method in sorted(methods, key=method_key): 86 | feature_rows.append(run_func(method, lambda: f[0].run_func( 87 | method, rows_f, parsed_args), f'{f[1]}/{method}')) 88 | run_methods.append((f[1], method)) 89 | 90 | # run remaining extras 91 | for function in functions: 92 | if function not in run_methods: 93 | for f in classes: 94 | if f[1] == function[0]: 95 | method = function[1] 96 | feature_rows.append(run_func(method, lambda: f[0].run_func( 97 | method, rows_f, parsed_args), f'{f[1]}/{function[1]}')) 98 | return feature_rows 99 | 100 | def main(self, feature_choices, rows, features_path, parsed_args): 101 | functions = [] 102 | groups = ('default') 103 | classes = [] 104 | classes = self.import_class(features_path, classes) 105 | 106 | if 'functions' in feature_choices: 107 | functions = feature_choices['functions'] 108 | if 'groups' in feature_choices: 109 | groups = feature_choices['groups'] 110 | 111 | return self.run_all_funcs(functions, groups, classes, rows, parsed_args) 112 | -------------------------------------------------------------------------------- /networkml/helpers/pandas_csv_importer.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import ipaddress 3 | import warnings 4 | 5 | import netaddr 6 | import numpy 7 | import pandas as pd 8 | from pandas.errors import DtypeWarning 9 | # We are using converters to fix types, so mixed type warning from read_csv() is spurious. 10 | warnings.simplefilter(action='ignore', category=DtypeWarning) 11 | 12 | 13 | @functools.lru_cache() 14 | def _ipaddress_packed(val): 15 | if len(val) > 0: 16 | return int(ipaddress.ip_address(val)) 17 | return None 18 | 19 | 20 | @functools.lru_cache() 21 | def _netaddr_packed(val): 22 | if len(val) > 0: 23 | return int(netaddr.EUI(val)) 24 | return None 25 | 26 | 27 | def _hex_str(val): 28 | if len(val) > 0: 29 | assert val.startswith('0x'), val 30 | return int(val, 16) 31 | return None 32 | 33 | 34 | def _safe_int(val): 35 | if len(val) > 0: 36 | return int(val) 37 | return None 38 | 39 | 40 | def _eth_protos(val): 41 | return ':'.join([i for i in val.split(':') if i != 'ethertype']) 42 | 43 | 44 | WS_FIELDS = { 45 | 'arp.opcode': (_safe_int, 8), 46 | 'eth.src': (_netaddr_packed, None), 47 | 'eth.dst': (_netaddr_packed, None), 48 | 'eth.type': (_hex_str, 16), 49 | 'frame.len': (_safe_int, 32), 50 | 'frame.time_epoch': (float, None), 51 | 'frame.time_delta_displayed': (float, None), 52 | 'frame.protocols': (_eth_protos, None), 53 | 'icmp.code': (_safe_int, 8), 54 | 'gre.proto': (_hex_str, 8), 55 | 'ip.src': (_ipaddress_packed, None), 56 | 'ip.src_host': (_ipaddress_packed, None), 57 | 'ip.dst': (_ipaddress_packed, None), 58 | 'ip.dst_host': (_ipaddress_packed, None), 59 | 'ip.dsfield': (_hex_str, 8), 60 | 'ip.flags': (_hex_str, 16), 61 | 'ip.proto': (_safe_int, 8), 62 | 'ip.version': (_safe_int, 8), 63 | 'icmpv6.code': (_safe_int, 8), 64 | 'ipv6.src': (_ipaddress_packed, None), 65 | 'ipv6.src_host': (_ipaddress_packed, None), 66 | 'ipv6.dst': (_ipaddress_packed, None), 67 | 'ipv6.dst_host': (_ipaddress_packed, None), 68 | 'tcp.flags': (_hex_str, 16), 69 | 'tcp.srcport': (_safe_int, 16), 70 | 'tcp.dstport': (_safe_int, 16), 71 | 'udp.srcport': (_safe_int, 16), 72 | 'udp.dstport': (_safe_int, 16), 73 | 'vlan.etype': 
(_hex_str, 16), 74 | 'vlan.id': (_safe_int, 16), 75 | } 76 | _WS_FIELDS_CONVERTERS = { 77 | field: field_info[0] for field, field_info in WS_FIELDS.items()} 78 | _WS_FIELDS_NULLABLE_INT = { 79 | field: 'UInt%s' % field_info[1] for field, field_info in WS_FIELDS.items() 80 | if isinstance(field_info[1], int)} 81 | _WS_NON_INT_CONVERTERS = { 82 | field: converter for field, converter in _WS_FIELDS_CONVERTERS.items() 83 | if field not in _WS_FIELDS_NULLABLE_INT} 84 | _REQUIRED_WS_FIELDS = { 85 | 'eth.src', 'eth.dst', 'frame.len', 86 | 'frame.time_epoch', 'frame.time_delta_displayed'} 87 | 88 | 89 | def recast_df(df): 90 | # TODO: when pandas allows read_csv to infer nullable ints, we can use less memory on import. 91 | # https://github.com/pandas-dev/pandas/issues/2631 92 | # For now convert to nullable int after import. 93 | for col, typestr in _WS_FIELDS_NULLABLE_INT.items(): 94 | try: 95 | df[col] = df[col].astype(typestr) 96 | except TypeError: 97 | raise TypeError('cannot cast %s to %s: %u' % 98 | (col, typestr, df[col].max())) 99 | return df 100 | 101 | 102 | def import_csv(in_file): 103 | # We need converters, so we can't use dtypes parameter, and that results in an un-suppressable warning. 104 | sample_df = pd.read_csv(in_file, index_col=0, nrows=100) 105 | csv_fields = set(sample_df.columns.tolist()) 106 | usecols = csv_fields.intersection(WS_FIELDS.keys()) 107 | missingcols = set(WS_FIELDS.keys()) - csv_fields 108 | 109 | # Any hex-int fields, detected as strings? 110 | obj_int_fields = { 111 | field for field, fieldinfo in WS_FIELDS.items() 112 | if fieldinfo[0] == _hex_str and sample_df.dtypes.get(field, None) == numpy.dtype('O')} 113 | converters = _WS_NON_INT_CONVERTERS 114 | # If yes, this is an old style PCAP CSV which needs conversion. 115 | if obj_int_fields: 116 | converters = _WS_FIELDS_CONVERTERS 117 | 118 | df = pd.read_csv(in_file, usecols=usecols, converters=converters) 119 | 120 | for col in missingcols: 121 | df[col] = None 122 | for col in _REQUIRED_WS_FIELDS: 123 | assert df[col].count( 124 | ) > 0, 'required col %s is all null (not a PCAP CSV?)' % col 125 | df = recast_df(df) 126 | return df 127 | -------------------------------------------------------------------------------- /networkml/helpers/results_output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | 5 | import networkml 6 | 7 | from networkml import __version__ 8 | 9 | 10 | class ResultsOutput: 11 | 12 | def __init__(self, logger, uid, file_path): 13 | self.logger = logger 14 | self.uid = uid 15 | self.file_path = file_path 16 | 17 | @staticmethod 18 | def assign_labels(labels): 19 | netml_path = list(networkml.__path__) 20 | la = os.path.join(netml_path[0], 21 | 'trained_models/label_assignments.json') 22 | assignment_map = {} 23 | with open(la) as f: 24 | assignment_map = json.load(f) 25 | labels = [assignment_map[label] if label in assignment_map else label for label in labels] 26 | return labels 27 | 28 | @staticmethod 29 | def parse_pcap_name(base_pcap): 30 | # The parsing operation below assumes a specific file naming 31 | # convention trace_DeviceName-deviceID-time-duration-flags.pcap 32 | # Explanation: All files coming from Poseidon have trace_ at their 33 | # beginning. The device name and deviceID colums are self explanatory. 34 | # Time refers to the day of the week and time of day. Duration refers 35 | # to the length of the network traffic capture. The flags aspect 36 | # refers to an unknown characteristic. 
37 | # TODO: tolerate tshark labels in the trace name, but do not parse them for now. 38 | pcap_key = None 39 | pcap_labels = None 40 | if base_pcap.startswith('trace_'): 41 | for pcap_re, key_pos, label_pos in ( 42 | (re.compile( 43 | r'^trace_([\da-f]+)_([0-9\_\-]+)-(client|server)-(.+).pcap$'), 1, 4), 44 | (re.compile(r'^trace_([\da-f]+)_([0-9\_\-]+).pcap$'), 1, None)): 45 | pcap_match = pcap_re.match(base_pcap) 46 | if pcap_match: 47 | pcap_key = pcap_match.group(key_pos) 48 | if label_pos: 49 | pcap_labels = pcap_match.group(label_pos) 50 | break 51 | else: 52 | # Not a Poseidon trace file, return basename as key. 53 | pcap_key = base_pcap.split('.')[0] 54 | return (pcap_key, pcap_labels) 55 | 56 | @staticmethod 57 | def valid_template(uid, file_path, timestamp, source_ip, investigate, labels, confidences, 58 | pcap_labels, base_pcap, pcap_key): 59 | return { 60 | 'uid': uid, 61 | 'file_path': file_path, 62 | 'pcap': base_pcap, 63 | 'pcap_key': pcap_key, 64 | 'pcap_labels': pcap_labels, 65 | 'timestamp': timestamp, 66 | 'source_ip': source_ip, 67 | 'decisions': { 68 | 'investigate': investigate, 69 | }, 70 | 'classification': { 71 | 'labels': labels, 72 | 'confidences': confidences, 73 | }, 74 | } 75 | 76 | def output_from_result_json(self, result_json_str, reformatted_result_json_file_name): 77 | base_pcap = os.path.basename(self.file_path) 78 | pcap_key, pcap_labels = self.parse_pcap_name(base_pcap) 79 | result_json = json.loads(result_json_str) 80 | 81 | mac_metadata = {} 82 | for filename, host_results in result_json.items(): 83 | filename = filename.split('.csv.gz')[0] 84 | for host_result in host_results: 85 | top_role = host_result.get('top_role', None) 86 | if top_role is not None: 87 | investigate = top_role == 'Unknown' 88 | source_ip = host_result.get('source_ip', None) 89 | source_mac = host_result.get('source_mac', None) 90 | timestamp = host_result.get('timestamp', None) 91 | labels, confidences = zip(*host_result['role_list']) 92 | labels = self.assign_labels(labels) 93 | mac_metadata[source_mac] = self.valid_template( 94 | self.uid, self.file_path, timestamp, source_ip, 95 | investigate, labels, confidences, 96 | pcap_labels, base_pcap, pcap_key) 97 | reformatted_json = [{ 98 | 'tool': 'networkml', 99 | 'version': __version__, 100 | 'id': os.environ.get('id', ''), 101 | 'type': 'metadata', 102 | 'file_path': self.file_path, 103 | 'results': {'tool': 'networkml', 'version': __version__}, 104 | 'data': { 105 | 'mac_addresses': mac_metadata, 106 | } 107 | }, 108 | { 109 | 'tool': 'networkml', 110 | 'id': os.environ.get('id', ''), 111 | 'type': 'metadata', 112 | 'file_path': self.file_path, 113 | 'data': '', 114 | 'results': {'tool': 'networkml', 'version': __version__} 115 | }] 116 | with open(reformatted_result_json_file_name, 'w') as reformatted_result: 117 | reformatted_result.write(json.dumps(reformatted_json)) 118 | return reformatted_json 119 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | name: semgrep 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - synchronize 7 | - reopened 8 | permissions: 9 | pull-requests: write 10 | jobs: 11 | docker_scan: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | with: 16 | repository: ${{ github.event.pull_request.head.repo.full_name }} 17 | ref: ${{ github.event.pull_request.head.ref }} 18 | - name: scan 19 | id: d_scan 20 | run: | 21 | 
export DEBIAN_FRONTEND=noninteractive && \ 22 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 23 | sudo apt-get update && \ 24 | sudo apt install jq && \ 25 | python3 -m pip install --upgrade pip && \ 26 | python3 -m pip install semgrep && \ 27 | python3 -m pip install --upgrade urllib3 && \ 28 | mkdir /home/runner/reports/ && \ 29 | cd ${GITHUB_WORKSPACE}/ && \ 30 | semgrep --config=.github/workflows/config/semgrep-docker.yml --json -o /home/runner/reports/semgrep.out \ 31 | --severity ERROR ./ &&\ 32 | echo "## Validation Issues Found (Docker) :whale: " >> /home/runner/reports/docker-msg && \ 33 | cat /home/runner/reports/semgrep.out | jq -r --arg ws "$GITHUB_WORKSPACE" --arg url "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/blob/$GITHUB_SHA" '.results[] | "**File:** [\(.path | sub($ws; "."; "g"))](\(.path | sub($ws; $url; "g"))#L\(.start.line)) \n**Line Number:** \(.start.line) \n**Statement(s):** \n``` \n\(.extra.lines) \n``` \n**Rule:** \n\(.extra.message)\n\n"' >> /home/runner/reports/docker-msg && \ 34 | echo "::set-output name=found-count::$(cat /home/runner/reports/semgrep.out | jq '.results | length')" 35 | - name: Fail if found 36 | if: steps.d_scan.outputs.found-count != 0 37 | uses: actions/github-script@v6 38 | with: 39 | script: | 40 | const fs = require('fs') 41 | 42 | var msg = fs.readFileSync('/home/runner/reports/docker-msg', 'utf8'); 43 | console.log('${{steps.d_scan.outputs.found-count}} errors found in docker/docker-compose files'); 44 | github.rest.issues.createComment({ 45 | issue_number: context.issue.number, 46 | owner: context.repo.owner, 47 | repo: context.repo.repo, 48 | body: msg 49 | }); 50 | 51 | core.setFailed('Semgrep found errors in Dockerfiles or docker-compose files. Please check the uploaded report'); 52 | - name: Upload scan reports 53 | uses: actions/upload-artifact@v3.1.1 54 | if: failure() 55 | with: 56 | name: semgrep-docker-report 57 | path: /home/runner/reports/semgrep.out 58 | python_scan: 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v3 62 | with: 63 | repository: ${{ github.event.pull_request.head.repo.full_name }} 64 | ref: ${{ github.event.pull_request.head.ref }} 65 | - name: scan 66 | id: py_scan 67 | run: | 68 | export DEBIAN_FRONTEND=noninteractive && \ 69 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 70 | sudo apt-get update && \ 71 | sudo apt install jq && \ 72 | python3 -m pip install --upgrade pip && \ 73 | python3 -m pip install semgrep && \ 74 | python3 -m pip install --upgrade urllib3 && \ 75 | mkdir -p /home/runner/reports/ && \ 76 | cd ${GITHUB_WORKSPACE}/ && \ 77 | semgrep --config=.github/workflows/config/semgrep-python.yml --json -o /home/runner/reports/semgrep.out \ 78 | --severity ERROR ./ && \ 79 | echo "## Validation Issues Found (Python) :snake: " >> /home/runner/reports/python-msg && \ 80 | cat /home/runner/reports/semgrep.out | jq -r --arg ws "$GITHUB_WORKSPACE" --arg url "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/blob/$GITHUB_SHA" '.results[] | "**File:** [\(.path | sub($ws; "."; "g"))](\(.path | sub($ws; $url; "g"))#L\(.start.line)) \n**Line Number:** \(.start.line) \n**Statement(s):** \n``` \n\(.extra.lines) \n``` \n**Rule:** \n\(.extra.message)\n\n"' >> /home/runner/reports/python-msg && \ 81 | echo "::set-output name=python-found-count::$(cat /home/runner/reports/semgrep.out | jq '.results | length')" 82 | - name: Fail if found 83 | if: steps.py_scan.outputs.python-found-count > 0 84 | uses: 
actions/github-script@v6 85 | with: 86 | github-token: ${{secrets.GITHUB_TOKEN}} 87 | script: | 88 | const fs = require('fs') 89 | 90 | var msg = fs.readFileSync('/home/runner/reports/python-msg', 'utf8'); 91 | console.log('${{steps.py_scan.outputs.python-found-count}} errors found in python files'); 92 | github.rest.issues.createComment({ 93 | issue_number: context.issue.number, 94 | owner: context.repo.owner, 95 | repo: context.repo.repo, 96 | body: msg 97 | }); 98 | 99 | core.setFailed('Semgrep found errors in Python files. Please check the uploaded report'); 100 | - name: Upload scan reports 101 | uses: actions/upload-artifact@v3.1.1 102 | if: failure() 103 | with: 104 | name: semgrep-python-report 105 | path: /home/runner/reports/semgrep.out 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Device Functional Role ID via Machine Learning and Network Traffic Analysis 2 | 3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | ![Build Status](https://github.com/iqtlabs/networkml/workflows/test/badge.svg) 5 | [![PyPI version](https://badge.fury.io/py/networkml.svg)](https://badge.fury.io/py/networkml) 6 | [![codecov](https://codecov.io/gh/IQTLabs/NetworkML/branch/main/graph/badge.svg)](https://codecov.io/gh/IQTLabs/IQTLabs) 7 | [![Docker Hub Downloads](https://img.shields.io/docker/pulls/iqtlabs/networkml.svg)](https://hub.docker.com/r/iqtlabs/networkml/) 8 | 9 | ## Overview 10 | NetworkML is the machine learning portion of our [Poseidon](https://github.com/IQTLabs/poseidon) project. The model in networkML classifies each device into a functional role via machine learning models trained on features derived from network traffic. "Functional role" refers to the authorized administrative purpose of the device on the network and includes roles such as printer, mail server, and others typically found in an IT environment. Our internal analysis suggests networkML can achieve accuracy, precision, recall, and F1 scores in the high 90's when trained on devices from your own network. Whether this performance can transfer from IT environment to IT environment is an active area of our research. 11 | 12 | NetworkML can be used in a "standalone" mode from the command line interface. For more background and context on the macro project, please check out [the Poseidon project](https://www.cyberreboot.org/projects/poseidon/) page on our website. This repository specifically covers the output, inputs, data processing, and machine learning models we deploy in networkML. 13 | 14 | While this repository and resulting docker container can be used completely independently, the code was written to support the IQT Labs Poseidon project. See: 15 | 16 | - [Poseidon](https://github.com/IQTLabs/poseidon) SDN project. 17 | 18 | This repository contains the components necessary to build a docker container that can be used for training a number of ML models using network packet captures (PCAPs). The repository includes scripts necessary to do training, testing, and evaluation. These can be run from a shell once `networkml` is installed as a package or run in a Docker container using the `networkml` script. 19 | 20 | Feel free to use, discuss, and contribute! 21 | 22 | ## Model Output 23 | NetworkML predicts the functional role of network-connected device via network traffic analysis and machine learning. 
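
For orientation, the sketch below shows roughly what a single per-device record in the `predict.json` output looks like, based on the template assembled in `networkml/helpers/results_output.py`; every value shown is invented for illustration, and in the actual output these records are keyed by source MAC address under `data` → `mac_addresses`. The roles themselves are described below.

```python
# Hypothetical per-device record, shaped like ResultsOutput.valid_template()
# in networkml/helpers/results_output.py; all values below are made up.
example_record = {
    'uid': 'None',
    'file_path': '/pcaps/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap',
    'pcap': 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap',
    'pcap_key': 'ab12',
    'pcap_labels': 'ip-1-2-3-4',
    'timestamp': 978314580.0,
    'source_ip': '1.2.3.4',
    'decisions': {'investigate': False},       # True when the top role is 'Unknown'
    'classification': {
        'labels': ['printer', 'workstation'],  # most to least likely role
        'confidences': [0.9, 0.1],
    },
}
```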
24 | 25 | Admittedly subjective, the term "role" refers to the authorized administrative purpose of the device on the network. NetworkML in its default configuration has twelve roles: active directory controller, administrator server, administrator workstation, confluence server, developer workstation, distributed file share, exchange server, graphics processing unit (GPU) laptop, github server, public key infrastructure (PKI) server, and printer. This typology reflects the network-connected devices in the data we used to train the model. Other networks will lack some of these roles and might include others. Consequently, organizations that wish to use networkML might have to adapt the model outputs for their specific organization. 26 | 27 | ## Model Inputs 28 | NetworkML's key input is the network traffic for a single device. By network traffic for a single device, we mean all packets sent and received by that device over a given time period. For reliable results, we recommend at least fifteen minutes of network traffic. Poseidon, the larger project of which networkML is only a part, performs the necessary packet pre-processing to produce pcap's containing all network traffic to and from a single device. If you are using networkML in a standalone manner, the pcap files must all follow a strict naming convention: DeviceName-deviceID-time-duration-flags.pcap. For example, ActiveDirectoryController-labs-Fri0036-n00.pcap refers to a pcap from an active directory controller taken from a user named labs on a Friday at 00:36. The flag field does not currently have any significance. 29 | 30 | It is worth noting that networkML uses only packet header data in its models. NetworkML does not use data from the packet payload. Relying only on packet header data enables networkML to avoid some privacy-related issues associated with using payload data and to create (hopefully) more generalizable and more performant models. 31 | 32 | ## Data Processing 33 | 34 | ## Algorithms 35 | 36 | NetworkML uses a feedforward neural network from the scikit-learn package. The model is trained using 5-fold cross validation in combination with a simple grid-search of the hyper-parameter space. 37 | 38 | 39 | # Installation/Run 40 | 41 | Our models can be executed via Docker and in a standalone manner on a Linux host. We recommend deployment via Poseidon if you are running an SDN (software-defined network). Otherwise, we recommend using Docker. 42 | 43 | See the [README](https://github.com/IQTLabs/NetworkML/tree/main/networkml/trained_models) file included in the `networkml/trained_models` folder for specific instructions on deployment. 44 | 45 | # Develop/Standalone Installation 46 | 47 | Note: This project uses absolute paths for imports, meaning you'll either need to modify your `PYTHONPATH` to something like this from the project directory: 48 | ``` 49 | export PYTHONPATH=$PWD/networkml:$PYTHONPATH 50 | ``` 51 | Alternatively, simply running `pip3 install .` from the project directory after making changes will update the package to test or debug against. 52 | 53 | This package is set up for anaconda/miniconda to be used for package and environment 54 | management if desired. Assuming you have the latest install (as of this writing, we have been using 55 | conda 4.5.12), set up the environment by performing the following: 56 | 1. Ensure that the CONDA_EXE environment variable has been set. If `echo $CONDA_EXE` 57 | returns empty, resolve this by `export CONDA_EXE=$_CONDA_EXE` in your bash shell. 58 | 2. 
Run `make dev` to set up the environment 59 | 3. Run `conda activate posml-dev` to begin. 60 | 61 | You can remove the dev environment via standard conda commands: 62 | 1. Run `conda deactivate` 63 | 2. Run `conda env remove -y -n posml-dev` 64 | 65 | For more information about using conda, please refer to their 66 | [user documentation](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html). 67 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to NetworkML 2 | 3 | Want to hack on NetworkML? Awesome! Here are instructions to get you started. 4 | If you have any questions or find the instructions to be incomplete, please do 5 | open an issue to let us know about it. 6 | 7 | ## Contribution guidelines 8 | 9 | ### Pull requests are always welcome 10 | 11 | We are always thrilled to receive pull requests and do our best to 12 | process them as fast as possible. Not sure if that typo is worth a pull 13 | request? Do it! We will appreciate it. 14 | 15 | If your pull request is not accepted on the first try, don't be 16 | discouraged! If there's a problem with the implementation, hopefully you 17 | received feedback on what to improve. 18 | 19 | We're trying very hard to keep NetworkML lean and focused. We don't want it 20 | to do everything for everybody. This means that we might decide against 21 | incorporating a new feature. However, there might be a way to implement 22 | that feature *on top of* NetworkML. 23 | 24 | ### Create issues... 25 | 26 | Any significant improvement should be documented as [a github 27 | issue](https://github.com/IQTLabs/NetworkML/issues) before anybody 28 | starts working on it. 29 | 30 | ### ...but check for existing issues first! 31 | 32 | Please take a moment to check that an issue doesn't already exist 33 | documenting your bug report or improvement proposal. If it does, it 34 | never hurts to add a quick "+1" or "I have this problem too". This will 35 | help prioritize the most common problems and requests. 36 | 37 | ### Conventions 38 | 39 | #### Project structure 40 | 41 | The NetworkML project is currently structured to be a collection of 42 | models processing pcap traffic. Each model is contained within its own 43 | folder under the root directory. Code under root's `utils/` folder contains 44 | generic feature extraction and processing from raw pcap files, and can be 45 | reused by any of the models within the collection. 46 | 47 | Take the `DeviceClassifier` as an archetype example of one such model. 48 | Our [Poseidon project](https://github.com/IQTLabs/Poseidon) uses this 49 | to identify device roles on the network based on their behavior on the 50 | network. In fact, this classifier contains two different models that can 51 | be used depending on the amount of data available for training -- `OneLayer` 52 | neural network model, and the `RandomForest` model. Each of these models 53 | are contained in their own subdirectories, and a `README` file describes 54 | the usage and requirements of both. Within each model's directory, you'll 55 | find a Dockerfile and the scripts to train, test, and evaluate the models. 56 | Any configurations or options specific to these models are located in the 57 | `opts/` subfolder, and the optional trained models (in the form of 58 | serialized pkl files) are made available in the `models/` subfolder. 
59 | 60 | Our hope is that by following this structure as much as possible, newer 61 | users can get up to speed more quickly, and models will be easier to 62 | maintain in the long run. However, if you find this too stifling for 63 | your specific model, we will leave it to you to explain the usage, 64 | requirements and structure in your model's `README` file. 65 | 66 | 67 | #### Submitting a pull request 68 | 69 | Fork the repo and make changes on your fork in a feature branch. 70 | 71 | Make sure you include relevant updates or additions to documentation and 72 | tests when creating or modifying features. 73 | 74 | Pull requests descriptions should be as clear as possible and include a 75 | reference to all the issues that they address. 76 | 77 | Code review comments may be added to your pull request. Discuss, then make the 78 | suggested modifications and push additional commits to your feature branch. Be 79 | sure to post a comment after pushing. The new commits will show up in the pull 80 | request automatically, but the reviewers will not be notified unless you 81 | comment. 82 | 83 | Before the pull request is merged, make sure that you squash your commits into 84 | logical units of work using `git rebase -i` and `git push -f`. After every 85 | commit the test suite should be passing. Include documentation changes in the 86 | same commit so that a revert would remove all traces of the feature or fix. 87 | 88 | Commits that fix or close an issue should include a reference like `Closes #XXX` 89 | or `Fixes #XXX`, which will automatically close the issue when merged. 90 | 91 | Add your name to the AUTHORS file, but make sure that the list is sorted and that 92 | your name and email address match the ones you used to make your commits. The 93 | AUTHORS file is regenerated occasionally from the commit history, so a mismatch 94 | may result in your changes being overwritten. 95 | 96 | ## Decision process 97 | 98 | ### How are decisions made? 99 | 100 | Short answer: with pull requests to the NetworkML repository. 101 | 102 | All decisions affecting NetworkML, big and small, follow the same 3 steps: 103 | 104 | * Step 1: Open a pull request. Anyone can do this. 105 | 106 | * Step 2: Discuss the pull request. Anyone can do this. 107 | 108 | * Step 3: Accept or refuse a pull request. A maintainer does this. 109 | 110 | 111 | ### How can I become a maintainer? 112 | 113 | * Step 1: learn the code inside out 114 | * Step 2: make yourself useful by contributing code, bugfixes, support etc. 115 | 116 | Don't forget: being a maintainer is a time investment. Make sure you will have time to make yourself available. 117 | You don't have to be a maintainer to make a difference on the project! 118 | 119 | ### What are a maintainer's responsibility? 120 | 121 | It is every maintainer's responsibility to: 122 | 123 | * 1) Deliver prompt feedback and decisions on pull requests. 124 | * 2) Be available to anyone with questions, bug reports, criticism etc. on NetworkML. 125 | 126 | ### How is this process changed? 127 | 128 | Just like everything else: by making a pull request :) 129 | 130 | *Derivative work from [Docker](https://github.com/moby/moby/blob/master/CONTRIBUTING.md).* 131 | 132 | ### Any questions? 133 | 134 | As stated above, if you have any questions or encounter any problems, we recommend checking the 135 | pre-existing issues on the project page. If nothing relates or the discussion turns out to not relate 136 | any longer, feel free to start a new issue. 
We do our best to respond in a timely fashion and to 137 | keep all discussions open and transparent. 138 | -------------------------------------------------------------------------------- /tests/test_funcs_host.py: -------------------------------------------------------------------------------- 1 | import ipaddress 2 | 3 | import netaddr 4 | import pandas as pd 5 | 6 | from networkml.featurizers.funcs.host import Host 7 | from networkml.featurizers.funcs.host import HostBase 8 | from networkml.featurizers.funcs.host import SessionHost 9 | from networkml.helpers.pandas_csv_importer import recast_df 10 | from networkml.helpers.pandas_csv_importer import WS_FIELDS 11 | 12 | 13 | def nan_row_dict(defaults): 14 | row = {field: None for field in WS_FIELDS} 15 | row.update(defaults) 16 | return pd.Series(row) 17 | 18 | 19 | def test_get_ips(): 20 | instance = HostBase() 21 | for ipv, ipb, srcip, dstip, ip_flags in ( 22 | (4, 'ip', ipaddress.ip_address('192.168.0.1'), 23 | ipaddress.ip_address('192.168.0.2'), (1, 0)), 24 | (6, 'ipv6', ipaddress.ip_address('fc01::1'), 25 | ipaddress.ip_address('fc01::2'), (1, 0)), 26 | (4, 'ip', ipaddress.ip_address('192.168.0.1'), 27 | ipaddress.ip_address('8.8.8.8'), (0, 0)), 28 | (6, 'ipv6', ipaddress.ip_address('fc01::1'), 29 | ipaddress.ip_address('2001:4860:4860::8888'), (0, 0)), 30 | (4, 'ip', ipaddress.ip_address('192.168.0.1'), ipaddress.ip_address('224.0.0.1'), (0, 1))): 31 | row = nan_row_dict({'ip.version': ipv, '%s.src' % ipb: str( 32 | int(srcip)), '%s.dst' % ipb: str(int(dstip))}) 33 | assert instance._get_src_ip(row) == srcip 34 | assert instance._get_dst_ip(row) == dstip 35 | assert instance._df_ip_flags(srcip, dstip) == ip_flags 36 | 37 | 38 | def test_macs(): 39 | instance = HostBase() 40 | assert instance._is_unicast(int(netaddr.EUI('0e:00:00:00:00:01'))) == True 41 | assert instance._is_unicast(int(netaddr.EUI('ff:ff:ff:ff:ff:ff'))) == False 42 | 43 | 44 | def test_flags(): 45 | instance = HostBase() 46 | mac_df = pd.DataFrame.from_dict({'test_col': [1, 2, 4]}) 47 | assert instance._get_flags(mac_df, 'test_col', {0: 'foo', 1: 'baz', 2: 'blah'}, suffix=None, field_name=None) == { 48 | 'tshark_test_col_foo': 1, 'tshark_test_col_baz': 1, 'tshark_test_col_blah': 1} 49 | mac_df = pd.DataFrame.from_dict({'test_col': [1, 0, 4]}) 50 | assert instance._get_flags(mac_df, 'test_col', {0: 'foo', 1: 'baz', 2: 'blah'}, suffix=None, field_name=None) == { 51 | 'tshark_test_col_foo': 1, 'tshark_test_col_baz': 0, 'tshark_test_col_blah': 1} 52 | 53 | 54 | def test_lowest_ip_proto_port(): 55 | instance = HostBase() 56 | test_data = {field: None for field in WS_FIELDS} 57 | test_data.update({ 58 | 'tcp.srcport': 99, 59 | 'tcp.dstport': 100, 60 | }) 61 | mac_df = recast_df(pd.DataFrame([test_data])) 62 | assert instance._lowest_ip_proto_port(mac_df, 'tcp') == {99} 63 | 64 | 65 | def test_no_ip_tshark_ports(): 66 | instance = HostBase() 67 | mac_df = pd.DataFrame([{'ip.proto': 99}]) 68 | assert instance._tshark_ports('in', mac_df) 69 | assert instance._tshark_ratio_ports(mac_df) 70 | 71 | 72 | def test_tshark_ports(): 73 | instance = HostBase() 74 | for test_rows, test_output, ratio_output in ( 75 | ([{'tcp.srcport': 22, 'tcp.dstport': 1025, 'ip.proto': 6}, {'tcp.srcport': 1025, 'tcp.dstport': 22, 'ip.proto': 6}, {'tcp.srcport': 22, 'tcp.dstport': 1025, 76 | 'ip.proto': 6}], {'tshark_tcp_priv_port_22_in'}, {'tshark_tcp_priv_packet_ratio_io_port_22': 2.0, 'tshark_tcp_nonpriv_packet_ratio_io_port_other': 0.5}), 77 | ([{'tcp.srcport': 1025, 'tcp.dstport': 1025, 
'ip.proto': 6}], {'tshark_tcp_nonpriv_port_other_in'}, {'tshark_tcp_nonpriv_packet_ratio_io_port_other': 1.0})): 78 | 79 | test_data = [] 80 | for test_ports in test_rows: 81 | row = {field: None for field in WS_FIELDS} 82 | row.update(test_ports) 83 | test_data.append(row) 84 | mac_df = recast_df(pd.DataFrame(test_data)) 85 | ports = {col for col, val in instance._tshark_ports( 86 | 'in', mac_df).items() if val == 1} 87 | assert test_output == ports 88 | ratios = {col: val for col, 89 | val in instance._tshark_ratio_ports(mac_df).items() if val} 90 | assert ratio_output == ratios, test_rows 91 | 92 | 93 | def test_ip_versions(): 94 | instance = HostBase() 95 | test_data = {field: None for field in WS_FIELDS} 96 | test_data.update({'ip.version': 4}) 97 | mac_df = recast_df(pd.DataFrame([test_data])) 98 | assert instance._tshark_ipversions( 99 | mac_df) == {'tshark_ipv4': 1, 'tshark_ipv6': 0} 100 | 101 | 102 | def test_non_ip(): 103 | instance = HostBase() 104 | for eth_type, test_output in ( 105 | (1, {'tshark_ipx': 0, 'tshark_nonip': 1}), 106 | (0x8137, {'tshark_ipx': 1, 'tshark_nonip': 1}), 107 | (0x800, {'tshark_ipx': 0, 'tshark_nonip': 0})): 108 | test_data = {field: None for field in WS_FIELDS} 109 | test_data.update({'eth.type': eth_type}) 110 | mac_df = recast_df(pd.DataFrame([test_data])) 111 | assert instance._tshark_non_ip(mac_df) == test_output 112 | 113 | 114 | def test_vlan_id(): 115 | instance = HostBase() 116 | test_data = {field: None for field in WS_FIELDS} 117 | mac_df = recast_df(pd.DataFrame([test_data])) 118 | assert instance._tshark_vlan_id(mac_df) == {'tshark_tagged_vlan': 0} 119 | test_data.update({'vlan.id': 99}) 120 | mac_df = recast_df(pd.DataFrame([test_data])) 121 | assert instance._tshark_vlan_id(mac_df) == {'tshark_tagged_vlan': 1} 122 | 123 | 124 | def test_smoke_calc_cols(): 125 | instance = HostBase() 126 | test_data = {field: None for field in WS_FIELDS} 127 | eth_src = '0e:00:00:00:00:01' 128 | eth_src_int = int(netaddr.EUI(eth_src)) 129 | test_data.update({ 130 | 'ip.version': 4, 131 | 'eth.src': eth_src_int, 132 | 'eth.dst': eth_src_int, 133 | '_srcip': '192.168.0.1', 134 | '_dstip': '192.168.0.2', 135 | }) 136 | mac_df = recast_df(pd.DataFrame([test_data])) 137 | assert instance._calc_cols(eth_src_int, mac_df) 138 | 139 | 140 | def test_host_keys(): 141 | test_data = {field: None for field in WS_FIELDS} 142 | eth_src = '0e:00:00:00:00:01' 143 | eth_src_int = int(netaddr.EUI(eth_src)) 144 | src_ip = ipaddress.ip_address('192.168.0.1') 145 | dst_ip = ipaddress.ip_address('192.168.0.2') 146 | test_data.update({ 147 | 'ip.version': 4, 148 | 'eth.src': eth_src_int, 149 | 'eth.dst': eth_src_int, 150 | 'ip.src': str(int(src_ip)), 151 | 'ip.dst': str(int(dst_ip)), 152 | 'tcp.srcport': 999, 153 | 'tcp.dstport': 1001, 154 | 'frame.protocols': 'eth:ip', 155 | }) 156 | row = nan_row_dict(test_data) 157 | instance = Host() 158 | assert instance._host_key(row)[1:] == (str(src_ip), str(dst_ip), 1, 0, 1) 159 | instance = SessionHost() 160 | assert instance._host_key(row)[1:] == (str(src_ip), str(dst_ip), 1, 0, 1) 161 | -------------------------------------------------------------------------------- /tests/test_algorithms_host_footprint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import sys 5 | import tempfile 6 | 7 | import numpy as np 8 | import pytest 9 | from sklearn import preprocessing 10 | from sklearn.preprocessing import StandardScaler 11 | from 
sklearn.model_selection import GridSearchCV 12 | from sklearn.neural_network import MLPClassifier 13 | from sklearn.preprocessing import LabelBinarizer 14 | 15 | from networkml.algorithms.host_footprint import HostFootprint 16 | 17 | 18 | def test_serialize_scaler(): 19 | instance = HostFootprint() 20 | scaler = StandardScaler() 21 | test_data = [[i, i] for i in range(99)] 22 | scaler.fit(test_data) 23 | with tempfile.TemporaryDirectory() as tmpdir: 24 | scaler_file = os.path.join(tmpdir, 'scaler.mod') 25 | instance.serialize_scaler(scaler, scaler_file) 26 | new_scaler = instance.deserialize_scaler(scaler_file) 27 | assert len(scaler.mean_) == 2 28 | assert scaler.mean_.tolist() == new_scaler.mean_.tolist() 29 | 30 | 31 | def test_serialize_label_encoder(): 32 | instance = HostFootprint() 33 | le_classes = ['printer', 'workstation', 'server'] 34 | le = preprocessing.LabelEncoder() 35 | le.fit(le_classes) 36 | with tempfile.TemporaryDirectory() as tmpdir: 37 | le_file = os.path.join(tmpdir, 'le.json') 38 | instance.serialize_label_encoder(le, le_file) 39 | new_le = instance.deserialize_label_encoder(le_file) 40 | assert le.classes_.tolist() == new_le.classes_.tolist() 41 | assert new_le.inverse_transform(le.transform(le_classes)).tolist() == le_classes 42 | 43 | 44 | def test_serialize_model(): 45 | instance = HostFootprint() 46 | model = MLPClassifier() 47 | label_binarizer = LabelBinarizer() 48 | label_binarizer.neg_label = 0 49 | label_binarizer.pos_label = 1 50 | label_binarizer.sparse_output = False 51 | label_binarizer.y_type_ = "binary" 52 | label_binarizer.sparse_input_ = False 53 | label_binarizer.classes_ = np.array([0]) 54 | 55 | parameters = {'hidden_layer_sizes': [(64, 32)]} 56 | GridSearchCV(model, parameters, 57 | cv=5, n_jobs=-1, 58 | scoring='f1_weighted') 59 | 60 | model.coefs_ = np.array([[1],[2]]) 61 | model.loss_ = 42 62 | model.intercepts_ = np.array([[3],[4]]) 63 | model.classes_ = np.array([[5],[6]]) 64 | model.n_iter_ = 42 65 | model.n_layers_ = 2 66 | model.n_outputs_ = 1 67 | model.out_activation_ = "logistic" 68 | model._label_binarizer =label_binarizer 69 | model.features = ['test_1', 'test_2', 'test_3'] 70 | 71 | 72 | with tempfile.TemporaryDirectory() as tmpdir: 73 | model_file = os.path.join(tmpdir, 'host_footprint.json') 74 | instance.serialize_model(model, model_file) 75 | new_model = instance.deserialize_model(model_file) 76 | assert model.features == new_model.features 77 | print(f"model params: {model.get_params()}") 78 | print(f"new_model params: {new_model.get_params()}") 79 | assert len(model.get_params()['hidden_layer_sizes']) == len(new_model.get_params()['hidden_layer_sizes']) 80 | assert model._label_binarizer.y_type_ == new_model._label_binarizer.y_type_ 81 | assert len(model.coefs_) == len(new_model.coefs_) 82 | assert len(model.intercepts_) == len(new_model.intercepts_) 83 | 84 | 85 | def test_list_model(): 86 | expected = [ 87 | "foo", 88 | "bar", 89 | "baz", 90 | ] 91 | instance = HostFootprint() 92 | instance.model_path = './tests/test_data/list_test.json' 93 | instance.list = 'features' 94 | actual = instance.list_model() 95 | assert actual == expected 96 | 97 | def test_get_individual_predictions(): 98 | le_classes = ['asomething', 'bsomething'] 99 | le = preprocessing.LabelEncoder() 100 | le.fit(le_classes) 101 | filename = ['firstfile'] 102 | host_key = np.array(['mac1']) 103 | tshark_srcips = np.array(["['1.1.1.1']"]) 104 | frame_epoch = None 105 | instance = HostFootprint() 106 | assert instance.get_individual_predictions([[0.6, 0.7]], 
le, filename, host_key, tshark_srcips, frame_epoch) == { 107 | 'firstfile': [{'top_role': 'bsomething', 'role_list': [('bsomething', 0.7), ('asomething', 0.6)], 'source_ip': '1.1.1.1', 'source_mac': 'mac1'}]} 108 | assert instance.get_individual_predictions([[0.2, 0.1]], le, filename, host_key, tshark_srcips, frame_epoch) == { 109 | 'firstfile': [{'top_role': 'Unknown', 'role_list': [('asomething', 0.2), ('bsomething', 0.1)], 'source_ip': '1.1.1.1', 'source_mac': 'mac1'}]} 110 | 111 | 112 | def hf_args(tmpdir, operation, input_file): 113 | output_json = os.path.join(tmpdir, 'out.json') 114 | output_le_json = os.path.join(tmpdir, 'out_le.json') 115 | scaler_mod = os.path.join(tmpdir, 'scaler.mod') 116 | return ['host_footprint.py', '--label_encoder', output_le_json, 117 | '--trained_model', output_json, '--scaler', scaler_mod, 118 | '--operation', operation, '--kfolds', '2', input_file] 119 | 120 | 121 | def test_train(): 122 | """Test training function of HostFootprint class""" 123 | with tempfile.TemporaryDirectory() as tmpdir: 124 | testdata = os.path.join(tmpdir, 'test_data') 125 | shutil.copytree('./tests/test_data', testdata) 126 | input_file = os.path.join(testdata, 'combined.csv') 127 | operation = 'train' 128 | sys.argv = hf_args(tmpdir, operation, input_file) 129 | instance = HostFootprint() 130 | instance.main() 131 | 132 | 133 | def test_predict(): 134 | """Test predict function of HostFootprint class""" 135 | with tempfile.TemporaryDirectory() as tmpdir: 136 | testdata = os.path.join(tmpdir, 'test_data') 137 | shutil.copytree('./tests/test_data', testdata) 138 | input_file = os.path.join(testdata, 'combined.csv') 139 | operation = 'train' 140 | sys.argv = hf_args(tmpdir, operation, input_file) 141 | instance = HostFootprint() 142 | instance.main() 143 | operation = 'predict' 144 | sys.argv = hf_args(tmpdir, operation, input_file) 145 | instance = HostFootprint() 146 | json.loads(instance.main()) 147 | 148 | 149 | def test_predict_num_roles(): 150 | """ 151 | Test predict function of HostFootprint class with 152 | varying number of distinct roles present 153 | """ 154 | with tempfile.TemporaryDirectory() as tmpdir: 155 | testdata = os.path.join(tmpdir, 'test_data') 156 | shutil.copytree('./tests/test_data', testdata) 157 | for file in ['combined_three_roles.csv', 'combined_two_roles.csv']: 158 | input_file = os.path.join(testdata, file) 159 | operation = 'train' 160 | sys.argv = hf_args(tmpdir, operation, input_file) 161 | instance = HostFootprint() 162 | instance.main() 163 | operation = 'predict' 164 | sys.argv = hf_args(tmpdir, operation, input_file) 165 | instance = HostFootprint() 166 | instance.main() 167 | 168 | predictions = json.loads(instance.predict()) 169 | assert isinstance(predictions, dict) 170 | # Check if number of predictions is correct 171 | if file == 'combined_three_roles.csv': 172 | assert len(predictions) == 6 173 | else: 174 | assert len(predictions) == 4 175 | 176 | 177 | def test_train_bad_data_too_few_columns(): 178 | """ 179 | This test tries to train a model on a mal-formed csv with too few fields 180 | """ 181 | with tempfile.TemporaryDirectory() as tmpdir: 182 | testdata = os.path.join(tmpdir, 'test_data') 183 | shutil.copytree('./tests/test_data', testdata) 184 | input_file = os.path.join(testdata, 'bad_data_too_few_columns.csv') 185 | operation = 'train' 186 | sys.argv = hf_args(tmpdir, operation, input_file) 187 | instance = HostFootprint() 188 | with pytest.raises(Exception): 189 | instance.main() 190 | 
-------------------------------------------------------------------------------- /networkml/NetworkML.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from networkml import __version__ 6 | from networkml.algorithms.host_footprint import HostFootprint 7 | from networkml.featurizers.csv_to_features import CSVToFeatures 8 | from networkml.helpers.results_output import ResultsOutput 9 | from networkml.parsers.pcap_to_csv import PCAPToCSV 10 | 11 | 12 | class NetworkML: 13 | 14 | def __init__(self, raw_args=None): 15 | self.logger = logging.getLogger(__name__) 16 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 17 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 18 | 19 | # TODO: migrate stage-specific flags here. 20 | self.stage_args = { 21 | 'parser': {}, 22 | 'featurizer': { 23 | 'srcmacid': {'help': 'attempt to detect canonical source MAC and featurize only that MAC', 'action': 'store_true'}, 24 | 'no-srcmacid': {'help': 'featurize all MACs', 'action': 'store_true'}, 25 | }, 26 | 'algorithm': { 27 | 'trained_model': {'help': 'specify a path to load or save trained model'}, 28 | 'label_encoder': {'help': 'specify a path to load or save label encoder'}, 29 | 'scaler': {'help': 'specify a path to load or save scaler'}, 30 | 'kfolds': {'help': 'specify number of folds for k-fold cross validation'}, 31 | 'eval_data': {'help': 'path to eval CSV file, if training'}, 32 | 'train_unknown': {'help': 'Train on unknown roles'}, 33 | 'list':{'choices':['features'], 34 | 'default':None, 35 | 'help':'list information contained within model defined by --trained_model' 36 | } 37 | }, 38 | } 39 | parsed_args = self.parse_args(raw_args=raw_args) 40 | self.in_path = parsed_args.path 41 | self.algorithm = parsed_args.algorithm 42 | self.engine = parsed_args.engine 43 | self.first_stage = parsed_args.first_stage 44 | self.final_stage = parsed_args.final_stage 45 | self.groups = parsed_args.groups 46 | self.gzip_opt = parsed_args.gzip 47 | self.level = parsed_args.level 48 | self.operation = parsed_args.operation 49 | self.output = parsed_args.output 50 | self.threads = parsed_args.threads 51 | self.list = parsed_args.list 52 | self.log_level = parsed_args.verbose 53 | for args in self.stage_args.values(): 54 | for arg in args: 55 | val = getattr(parsed_args, arg, None) 56 | if val is not None: 57 | setattr(self, arg, val) 58 | logging.basicConfig(level=log_levels[self.log_level]) 59 | self.main() 60 | 61 | def parse_args(self, raw_args=None): 62 | parser = argparse.ArgumentParser(description='networkml %s' % __version__) 63 | parser.add_argument('path', help='path to a single pcap file, or a directory of pcaps to parse', default='/pcaps') 64 | parser.add_argument('--algorithm', '-a', choices=[ 65 | 'host_footprint'], default='host_footprint', help='choose which algorithm to use (default=host_footprint)') 66 | parser.add_argument('--engine', '-e', choices=['pyshark', 'tshark', 'host'], 67 | default='tshark', help='engine to use to process the PCAP file (default=tshark)') 68 | parser.add_argument('--first_stage', '-f', choices=['parser', 'featurizer', 'algorithm'], default='parser', 69 | help='choose which stage to start at, `path` arg is relative to stage (default=parser)') 70 | parser.add_argument('--final_stage', choices=['parser', 'featurizer', 'algorithm'], 71 | default='algorithm', help='choose which stage to finish at (default=algorithm)') 72 | parser.add_argument('--groups', '-g', 
default='host', 73 | help='groups of comma separated features to use (default=host)') 74 | parser.add_argument('--gzip', '-z', choices=['input', 'output', 'both'], default='both', 75 | help='use gzip between stages, useful when not using all 3 stages (default=both)') 76 | parser.add_argument('--level', '-l', choices=['packet', 'flow', 'host'], 77 | default='packet', help='level to make the output records (default=packet)') 78 | parser.add_argument('--operation', '-O', choices=['train', 'predict', 'eval'], default='predict', 79 | help='choose which operation task to perform, train or predict (default=predict)') 80 | parser.add_argument('--output', '-o', default=None, 81 | help='directory to write out any results files to') 82 | parser.add_argument('--threads', '-t', default=1, type=int, 83 | help='number of async threads to use (default=1)') 84 | parser.add_argument('--verbose', '-v', choices=[ 85 | 'DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', help='logging level (default=INFO)') 86 | for stage, args in self.stage_args.items(): 87 | for arg, arg_parms in args.items(): 88 | arg_help = '%s (%s)' % (arg_parms['help'], stage) 89 | arg_choices = arg_parms['choices'] if 'choices' in arg_parms else None 90 | arg_default = arg_parms['default'] if 'default' in arg_parms else None 91 | action = arg_parms.get('action', 'store') 92 | if not arg_choices: 93 | parser.add_argument('--' + arg, action=action, help=arg_help, default=arg_default, dest=arg) 94 | else: 95 | parser.add_argument('--' + arg, help=arg_help, choices=arg_choices, default=arg_default, dest=arg, action=action) 96 | parsed_args = parser.parse_args(raw_args) 97 | return parsed_args 98 | 99 | def add_opt_args(self, opt_args): 100 | raw_args = [] 101 | for arg, arg_parms in opt_args.items(): 102 | val = getattr(self, arg, None) 103 | if val is not None: 104 | raw_args.append('--' + arg) 105 | if arg_parms.get('action', None) != 'store_true': 106 | raw_args.append(str(val)) 107 | return raw_args 108 | 109 | def run_parser_stage(self, in_path): 110 | raw_args = self.add_opt_args(self.stage_args['parser']) 111 | raw_args.extend(['-e', self.engine, '-l', self.level, 112 | '-o', self.output, '-t', str(self.threads), '-v', self.log_level, in_path]) 113 | instance = PCAPToCSV(raw_args=raw_args) 114 | return instance.main() 115 | 116 | def run_featurizer_stage(self, in_path): 117 | raw_args = self.add_opt_args(self.stage_args['featurizer']) 118 | raw_args.extend(['-c', '-g', self.groups, '-z', self.gzip_opt, 119 | '-o', self.output, '-t', str(self.threads), '-v', self.log_level, in_path]) 120 | instance = CSVToFeatures(raw_args=raw_args) 121 | return instance.main() 122 | 123 | def run_algorithm_stage(self, in_path): 124 | raw_args = self.add_opt_args(self.stage_args['algorithm']) 125 | raw_args.extend(['-O', self.operation, '-v', self.log_level, in_path]) 126 | instance = HostFootprint(raw_args=raw_args) 127 | return instance.main() 128 | 129 | def output_results(self, result_json_str, run_complete): 130 | if run_complete: 131 | if self.list: 132 | print(f'{result_json_str}') 133 | if self.final_stage == 'algorithm' and self.operation == 'predict': 134 | if self.output and os.path.isdir(self.output): 135 | uid = os.getenv('id', 'None') 136 | file_path = os.getenv('file_path', self.in_path) 137 | results_outputter = ResultsOutput(self.logger, uid, file_path) 138 | result_json_file_name = os.path.join(self.output, 'predict.json') 139 | results_outputter.output_from_result_json(result_json_str, result_json_file_name) 140 | 141 | def 
run_stages(self): 142 | stages = ('parser', 'featurizer', 'algorithm') 143 | stage_runners = { 144 | 'parser': self.run_parser_stage, 145 | 'featurizer': self.run_featurizer_stage, 146 | 'algorithm': self.run_algorithm_stage} 147 | 148 | try: 149 | first_stage_index = stages.index(self.first_stage) 150 | final_stage_index = stages.index(self.final_stage) 151 | except ValueError: 152 | self.logger.error('Unknown first/final stage name') 153 | return 154 | 155 | if first_stage_index > final_stage_index: 156 | self.logger.error('Invalid first and final stage combination') 157 | return 158 | 159 | run_schedule = stages[first_stage_index:(final_stage_index+1)] 160 | result = self.in_path 161 | self.logger.info(f'running stages: {run_schedule}') 162 | 163 | run_complete = False 164 | try: 165 | for stage in run_schedule: 166 | runner = stage_runners[stage] 167 | result = runner(result) 168 | run_complete = True 169 | except Exception as err: 170 | self.logger.error(f'Could not run stage: {err}') 171 | 172 | self.output_results(result, run_complete) 173 | 174 | def main(self): 175 | self.run_stages() 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (c) 2017-2022 IQT Labs LLC, All Rights Reserved. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /networkml/featurizers/csv_to_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | import csv 4 | import logging 5 | import os 6 | import pathlib 7 | from collections import Counter 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | 12 | import networkml 13 | from networkml.featurizers.main import Featurizer 14 | from networkml.helpers.gzipio import gzip_reader 15 | from networkml.helpers.gzipio import gzip_writer 16 | from networkml.helpers.pandas_csv_importer import import_csv 17 | 18 | 19 | class CSVToFeatures(): 20 | 21 | def __init__(self, raw_args=None): 22 | self.logger = logging.getLogger(__name__) 23 | self.raw_args = raw_args 24 | 25 | @staticmethod 26 | def get_reader(in_file, use_gzip): 27 | if use_gzip: 28 | return gzip_reader(in_file) 29 | return open(in_file, 'r') 30 | 31 | @staticmethod 32 | def get_writer(out_file, use_gzip): 33 | if use_gzip: 34 | return gzip_writer(out_file) 35 | return open(out_file, 'w') 36 | 37 | @staticmethod 38 | def iscsv(pathfile): 39 | for ext in ('csv', 'gz'): 40 | if pathfile.endswith(''.join(('.', ext))): 41 | return True 42 | return False 43 | 44 | @staticmethod 45 | def write_features_to_csv(header, rows, out_file, gzip_opt): 46 | use_gzip = gzip_opt in ['output', 'both'] 47 | with CSVToFeatures.get_writer(out_file, use_gzip) as f_out: 48 | writer = csv.DictWriter(f_out, fieldnames=header) 49 | writer.writeheader() 50 | writer.writerows(rows) 51 | 52 | @staticmethod 53 | def combine_csvs(out_paths, combined_path, gzip_opt): 54 | # First determine the field names from the top line of each input file 55 | fieldnames = {'filename'} 56 | use_gzip = gzip_opt in ['output', 'both'] 57 | for filename in out_paths: 58 | with CSVToFeatures.get_reader(filename, use_gzip) as f_in: 59 | reader = csv.reader(f_in) 60 | fieldnames.update({header for header in next(reader)}) 61 | 62 | # Then copy the data 63 | with CSVToFeatures.get_writer(combined_path, use_gzip) as f_out: 64 | writer = csv.DictWriter(f_out, fieldnames=list(fieldnames)) 65 | writer.writeheader() 66 | for filename in out_paths: 67 | with CSVToFeatures.get_reader(filename, use_gzip) as f_in: 68 | reader = csv.DictReader(f_in) 69 | for line in reader: 70 | if use_gzip: 71 | line['filename'] = filename.split( 72 | '/')[-1].split('.features.gz')[0] 73 | else: 74 | line['filename'] = filename.split( 75 | '/')[-1].split('.features')[0] 76 | writer.writerow(line) 77 | CSVToFeatures.cleanup_files([filename]) 78 | 79 | @staticmethod 80 | def cleanup_files(paths): 81 | for fi in paths: 82 | if os.path.exists(fi): 83 | os.remove(fi) 84 | 85 | @staticmethod 86 | def parse_args(raw_args=None): 87 | netml_path = list(networkml.__path__) 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument( 90 | 'path', help='path to a single gzipped csv file, or a directory of gzipped csvs to parse') 91 | parser.add_argument('--combined', '-c', action='store_true', 92 | help='write out all records from all csvs into a single gzipped csv file') 93 | parser.add_argument('--features_path', '-p', default=os.path.join( 94 | netml_path[0], 'featurizers/funcs'), help='path to featurizer functions') 95 | parser.add_argument('--functions', '-f', default='', 96 | help='comma separated list of : to featurize (default=None)') 97 | parser.add_argument('--groups', '-g', default='host', 98 | help='comma separated list of 
groups of functions to featurize (default=host)') 99 | parser.add_argument('--gzip', '-z', choices=['input', 'output', 'both', 'neither'], 100 | default='both', help='gzip the input/output file, both or neither (default=both)') 101 | parser.add_argument('--output', '-o', default=None, 102 | help='path to write out gzipped csv file or directory for gzipped csv files') 103 | parser.add_argument('--threads', '-t', default=1, type=int, 104 | help='number of async threads to use (default=1)') 105 | parser.add_argument('--verbose', '-v', choices=[ 106 | 'DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', help='logging level (default=INFO)') 107 | srcmacid_parser = parser.add_mutually_exclusive_group(required=False) 108 | srcmacid_parser.add_argument('--srcmacid', dest='srcmacid', action='store_true', help='attempt to detect canonical source MAC and featurize only that MAC') 109 | srcmacid_parser.add_argument('--no-srcmacid', dest='srcmacid', action='store_false', help='featurize all MACs') 110 | parser.set_defaults(srcmacid=True) 111 | parsed_args = parser.parse_args(raw_args) 112 | return parsed_args 113 | 114 | def exec_features(self, features, in_file, out_file, features_path, gzip_opt, parsed_args): 115 | in_file_size = os.path.getsize(in_file) 116 | self.logger.info(f'Importing {in_file} size {in_file_size}') 117 | df = import_csv(in_file) 118 | featurizer = Featurizer() 119 | self.logger.info(f'Featurizing {in_file}') 120 | rows = featurizer.main(features, df, features_path, parsed_args) 121 | 122 | rowcounts = Counter() 123 | for row in rows: 124 | for r in row: 125 | for header_key in r: 126 | rowcounts[header_key] += 1 127 | rowcompare = defaultdict(set) 128 | for header_key, header_count in rowcounts.items(): 129 | if header_key != 'host_key': 130 | rowcompare[header_count].add(header_key) 131 | assert not len(rowcompare) == 0, 'featurizer returned no results' 132 | assert len( 133 | rowcompare) == 1, 'inconsistent featurizer row counts (headers not consistently present in all rows): %s' % rowcompare 134 | header = list(rowcounts.keys()) 135 | 136 | columns = [np.array(row) for row in rows] 137 | np_array = np.vstack(columns) 138 | 139 | rows = None 140 | for method in np_array: 141 | if rows is None: 142 | rows = method 143 | else: 144 | for i, row in enumerate(method): 145 | rows[i].update(row) 146 | 147 | if header and rows is not None: 148 | rows = rows.tolist() 149 | CSVToFeatures.write_features_to_csv( 150 | header, rows, out_file, gzip_opt) 151 | else: 152 | self.logger.warning( 153 | f'No results based on {features} for {in_file}') 154 | 155 | def process_files(self, threads, features, features_path, in_paths, out_paths, gzip_opt, parsed_args): 156 | num_files = len(in_paths) 157 | failed_paths = [] 158 | finished_files = 0 159 | # corner case so it works in jupyterlab 160 | if threads < 2: 161 | for i in range(len(in_paths)): 162 | try: 163 | finished_files += 1 164 | self.exec_features( 165 | features, in_paths[i], out_paths[i], features_path, gzip_opt, parsed_args) 166 | self.logger.info( 167 | f'Finished {in_paths[i]}. 
{finished_files}/{num_files} CSVs done.') 168 | except Exception as e: # pragma: no cover 169 | self.logger.error( 170 | f'{in_paths[i]} generated an exception: {e}') 171 | failed_paths.append(out_paths[i]) 172 | else: 173 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 174 | future_to_parse = {executor.submit( 175 | self.exec_features, features, in_paths[i], out_paths[i], features_path, gzip_opt, parsed_args): i for i in range(len((in_paths)))} 176 | for future in concurrent.futures.as_completed(future_to_parse): 177 | path = future_to_parse[future] 178 | try: 179 | finished_files += 1 180 | future.result() 181 | except Exception as e: # pragma: no cover 182 | self.logger.error( 183 | f'{in_paths[path]} generated an exception: {e}') 184 | failed_paths.append(out_paths[path]) 185 | else: 186 | self.logger.info( 187 | f'Finished {in_paths[path]}. {finished_files}/{num_files} CSVs done.') 188 | return failed_paths 189 | 190 | def main(self): 191 | parsed_args = CSVToFeatures.parse_args(raw_args=self.raw_args) 192 | in_path = parsed_args.path 193 | out_path = parsed_args.output 194 | combined = parsed_args.combined 195 | features_path = parsed_args.features_path 196 | threads = parsed_args.threads 197 | log_level = parsed_args.verbose 198 | functions = parsed_args.functions 199 | groups = parsed_args.groups 200 | gzip_opt = parsed_args.gzip 201 | 202 | if not groups and not functions: 203 | self.logger.warning( 204 | 'No groups or functions were selected, quitting') 205 | return 206 | 207 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 208 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 209 | logging.basicConfig(level=log_levels[log_level]) 210 | 211 | in_paths = [] 212 | out_paths = [] 213 | 214 | # parse out features dict 215 | groups = tuple(groups.split(',')) 216 | funcs = functions.split(',') 217 | functions = [] 218 | for function in funcs: 219 | functions.append(tuple(function.split(':'))) 220 | features = {'groups': groups, 'functions': functions} 221 | 222 | # check if it's a directory or a file 223 | if os.path.isdir(in_path): 224 | if out_path: 225 | pathlib.Path(out_path).mkdir(parents=True, exist_ok=True) 226 | for root, _, files in os.walk(in_path): 227 | for pathfile in files: 228 | if CSVToFeatures.iscsv(pathfile): 229 | in_paths.append(os.path.join(root, pathfile)) 230 | if out_path: 231 | if gzip_opt in ['neither', 'input']: 232 | out_paths.append(os.path.join( 233 | out_path, pathfile) + '.features') 234 | else: 235 | out_paths.append(os.path.join( 236 | out_path, pathfile) + '.features.gz') 237 | else: 238 | if gzip_opt in ['neither', 'input']: 239 | out_paths.append(os.path.join( 240 | root, pathfile) + '.features') 241 | else: 242 | out_paths.append(os.path.join( 243 | root, pathfile) + '.features.gz') 244 | else: 245 | in_paths.append(in_path) 246 | default_out_path = in_path + '.features.gz' 247 | if gzip_opt in ['neither', 'input']: 248 | default_out_path = in_path + '.features' 249 | if out_path: 250 | if os.path.isdir(out_path): 251 | out_paths.append(os.path.join(out_path, os.path.basename(default_out_path))) 252 | else: 253 | out_paths.append(out_path) 254 | else: 255 | out_paths.append(default_out_path) 256 | 257 | failed_paths = self.process_files( 258 | threads, features, features_path, in_paths, out_paths, gzip_opt, parsed_args) 259 | 260 | for failed_path in failed_paths: # pragma: no cover 261 | if failed_path in out_paths: 262 | out_paths.remove(failed_path) 263 | 264 | if combined and out_paths: 265 | 
combined_path = os.path.join( 266 | os.path.dirname(out_paths[0]), 'combined.csv.gz') 267 | if gzip_opt in ['input', 'neither']: 268 | combined_path = combined_path[:-3] 269 | self.logger.info( 270 | f'Combining CSVs into a single file: {combined_path}') 271 | CSVToFeatures.combine_csvs(out_paths, combined_path, gzip_opt) 272 | return combined_path 273 | if out_paths: 274 | self.logger.info(f'GZipped CSV file(s) written out to: {out_paths}') 275 | return os.path.dirname(out_paths[0]) 276 | else: 277 | self.logger.error(f'No CSV file(s) written out because the following paths failed: {failed_paths}') 278 | return 279 | 280 | 281 | if __name__ == '__main__': # pragma: no cover 282 | features = CSVToFeatures() 283 | features.main() 284 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v0.6.19 (2022-03-23) 2 | 3 | - Update dependency pyshark to v0.4.5 4 | - Update dependency cython to v0.29.28 5 | - Update dependency humanize to v4.0.0 6 | - Update dependency pytest to v7.1.1 7 | - Update dependency pbr to v5.8.1 8 | - Update dependency numpy to v1.22.3 9 | - Update dependency pandas to v1.4.1 10 | - Update dependency pytest-rabbitmq to v2.2.1 11 | - Update iqtlabs/rbqwrapper Docker tag to v0.11.32 12 | 13 | # v0.6.18 (2022-01-11) 14 | 15 | - Update dependency pbr to v5.8.0 16 | - Update iqtlabs/rbqwrapper Docker tag to v0.11.31 17 | - Update dependency humanize to v3.13.1 18 | - Update dependency nest_asyncio to v1.5.4 19 | - Update dependency cython to v0.29.26 20 | - Update dependency pytest-xdist to v2.5.0 21 | - Update dependency pygments to v2.11.2 22 | - Update dependency numpy to v1.22.0 23 | - Update dependency pandas to v1.3.5 24 | - Update dependency scikit-learn to v1.0.2 25 | 26 | # v0.6.17 (2021-08-19) 27 | 28 | - Update dependency numpy to v1.21.3 29 | - Update dependency pandas to v1.3.4 30 | - Update dependency scikit-learn to v1.0.1 31 | - Update dependency joblib to v1.1.0 32 | - Update dependency humanize to v3.12.0 33 | - Update dependency pytest-cov to v3 34 | - Update dependency pytest-xdist to v2.4.0 35 | - Update codecov/codecov-action action to v2.1.0 36 | - Allow pcap to features to read a pcap CSV with pre-cast int types (for future drop in replacement for tshark/pyshark parsers). 
If hex int fields, are detected as strings, fall back to current behavior (use python conversion) 37 | - Update iqtlabs/rbqwrapper Docker tag to v0.11.29 38 | - pytype observes that csv.DictWriter fields, should be an indexable Sequence 39 | 40 | # v0.6.16 (2021-08-19) 41 | 42 | - Updated humanize, numpy, pandas, and pygments 43 | - Updated rbqwrapper base image 44 | - Fixed an issue where tshark could exit and not write out buffer 45 | - Improved SAAST scanning 46 | 47 | # v0.6.15 (2021-07-29) 48 | 49 | - Updated NumPy, codecov-action, reorder_python_imports, upload-artifact 50 | - Pinned Pandas to v1.2.5 due to #871 51 | - Added Shift-Left SAAST Scan on push and PR 52 | 53 | # v0.6.14 (2021-07-14) 54 | 55 | - Updated humanize, pytest-xdist, NumPy, Pandas, python, pip, cython 56 | 57 | # v0.6.13 (2021-06-15) 58 | 59 | - Updated humanize, pytest-cov, pytest-rabbitmq, codecov-action 60 | 61 | # v0.6.12 (2021-05-20) 62 | 63 | - Updated pre-commit, rbqwrapper, and pytest-cov 64 | - Fixed output for Packet Cafe consumption 65 | 66 | # v0.6.11 (2021-05-10) 67 | 68 | - Updated codecov, reorder_python_imports, cython, humanize, numpy, pandas, pbr, scikit-learn, pygments, and pytest 69 | 70 | # v0.6.10 (2021-03-04) 71 | 72 | - Updated rbqwrapper, cython, pandas, and pygments 73 | 74 | # v0.6.9 (2021-02-11) 75 | 76 | - Updated reorder_python_imports, rbqwrapper, joblib, numpy, pandas, pyshark, and pytest-xdist 77 | 78 | # v0.6.8 (2021-01-26) 79 | 80 | - Updated pytest, pytest-cov, scikit-learn, pandas, nest_asyncio 81 | 82 | # v0.6.7 (2021-01-13) 83 | 84 | - Updated codecov, pygments, pytest, pytest-xdist, pytest-rabbitmq 85 | - Moved base image to rbqwrapper, abstracting away RabbitMQ 86 | - Added a new feature for listing out features in the model 87 | - Rewrote the model serializer removing the need for sklearn_json 88 | - Add end-to-end tests 89 | 90 | # v0.6.6 (2020-12-01) 91 | 92 | - Move to PBR 93 | - fix test for 'behavior' 94 | 95 | # v0.6.5 (2020-11-24) 96 | 97 | - Rollback numpy as it doesn't properly handle confidence values on ARM64 98 | - Updated pre-commit versions 99 | - Cleaned up formatting/style 100 | 101 | # v0.6.4 (2020-11-19) 102 | 103 | - Updated numpy, pandas, and nest_asyncio 104 | 105 | # v0.6.3 (2020-10-29) 106 | 107 | - Updated numpy, pygments, pytest, and nest_asyncio 108 | 109 | # v0.6.2 (2020-10-20) 110 | 111 | - Updated buildx, codecov, humanize, joblib, pandas, pygments, pytest, and nest_asyncio 112 | 113 | # v0.6.1 (2020-08-26) 114 | 115 | - Updated humanize, pytest-cov, pytest-xdist, and pandas 116 | 117 | # v0.6.0 (2020-08-05) 118 | 119 | - Retrained models for updated version of scikit-learn 120 | 121 | # v0.5.9 (2020-08-05) 122 | 123 | - Udpated scikit-learn and pytest 124 | - Moved from CyberReboot to new IQTLabs brand 125 | 126 | # v0.5.8 (2020-07-29) 127 | 128 | - Updated cython, humanize, netaddr, numpy, pandas, pytest, pytest-xdist, and nest_asyncio 129 | 130 | # v0.5.7 (2020-07-01) 131 | 132 | - Updated joblib, pandas, numpy, netaddr, and humanize 133 | - Moved Docker base image to python:3.8-slim (debian based instead of alpine) 134 | 135 | # v0.5.6 (2020-06-18) 136 | 137 | - Updated pandas, pytest-cov 138 | - Broke up Docker into two images for build times across architectures 139 | 140 | # v0.5.5 (2020-06-03) 141 | 142 | - Updated joblib, pandas, pytest, pytest-cov, and pyshark 143 | - Updated documentation for developers 144 | 145 | # v0.5.4 (2020-05-06) 146 | 147 | - Updated pytest-xdist, nest-asyncio, and numpy 148 | - Added flag --no-srcmacid to 
make predictions on all MACs found 149 | 150 | # v0.5.3 (2020-04-27) 151 | 152 | - Actually fix manifest to properly include models for PyPi Package. 153 | 154 | # v0.5.2 (2020-04-27) 155 | 156 | - Added missing files to manifest for PyPi package. 157 | 158 | # v0.5.1 (2020-04-27) 159 | 160 | - Rolling back to latest published version of pyshark - for issues see commented version in requirements.txt, which is unfortunately not supported for dependency install from PyPi with pip. 161 | 162 | # v0.5.0 (2020-04-23) 163 | 164 | - Rewrote Networkml entirely 165 | - Now only does classification, no longer behavior 166 | - Flexible stages for processing PCAPs into CSVs of features 167 | - No longer uses tensorflow 168 | - Now supports running on ARM 169 | 170 | # v0.4.8 (2020-02-20) 171 | 172 | - Fixed local dev python version to be 3.7 173 | - Fixed missing threshold_time configuration option 174 | - Fixed filename checks for client/server 175 | - Warn instead of debug log when files are ignored 176 | - Fixed running concurrent.futures when on python3.6 177 | 178 | # v0.4.7 (2020-02-12) 179 | 180 | - Added caching for parsed sessions 181 | - Added IPv6 capability to networkML 182 | - Updated pytest to v5.3.4 183 | - Updated models for scikit v0.22.1 184 | - Updated redis to v3.4.1 185 | 186 | # v0.4.6 (2020-01-15) 187 | 188 | - Updated tensorflow 189 | - Updated pyshark 190 | - Made sessionizer parallel 191 | - Added 120 minute timeout for a pcap 192 | - Added sessionizer test 193 | - Updated license 194 | 195 | # v0.4.5 (2020-01-02) 196 | 197 | - Updated numpy to 1.18.0 198 | - updated pytest-xdist to 1.31.0 199 | - Updated test_extract_macs() test 200 | - Removed vent template 201 | - Added check for empty F1 score list 202 | 203 | # v0.4.4 (2019-12-18) 204 | 205 | - Add pyshark wiring to get highest-level protocol 206 | - Make tests run in parallel 207 | - Update models for scikit-learn v0.22 208 | - Improve parsing speed 209 | - Add tests for pcap reader 210 | 211 | # v0.4.3 (2019-12-4) 212 | 213 | - update pytest to 5.3.1 214 | - update scikit-learn to 0.22 215 | 216 | # v0.4.2 (2019-11-22) 217 | 218 | - Updated numpy to 1.17.4 219 | - Updated pytest to 5.3 220 | - Edited .gitignore 221 | - Added tests for label extraction 222 | - Added test for avx check 223 | - Added pcap labels to decision 224 | 225 | # v0.4.1 (2019-11-07) 226 | 227 | - updated numpy to 1.17.3 228 | - updated pytest to 5.2.2 229 | - Added documentation 230 | - Added support for additional labels and filenames 231 | 232 | # v0.4.0 (2019-10-24) 233 | 234 | - Updated pytest-cov 235 | - Updated pytest 236 | - Updated redis 237 | - Added more documentation and tests 238 | - Updated the python image for the Dockerfile 239 | 240 | # v0.3.9 (2019-10-02) 241 | 242 | - Updated pytest to 5.2.0 243 | - Updated tensorflow to 2.0.0 244 | - Fixed up old code using tensorflow1 to work with tensorflow2 245 | 246 | # v0.3.8 (2019-09-12) 247 | 248 | - Updated pytest to 5.1.2 249 | - Updated numpy to 1.17.2 250 | - Fixed make help 251 | 252 | # v0.3.7 (2019-08-30) 253 | 254 | - Updated redis to 3.3.8 255 | - Updated pytest to 5.1.1 256 | 257 | # v0.3.6 (2019-08-15) 258 | 259 | - Updated redis to 3.3.7 260 | - Redis is now optional 261 | - RabbitMQ is now configurable, and has a cleaned up message format 262 | - Retrained models against numpy 1.17.0 and scikit-learn 0.21.3 263 | 264 | # v0.3.5 (2019-08-02) 265 | 266 | - Updated pika to 1.1.0 267 | - Got rid of outdated linux headers 268 | - Updated redis to 3.3.4 269 | 270 | # v0.3.4 
(2019-07-11) 271 | 272 | - Updated to python3.7 273 | - Updated models 274 | - Updated tensorflow to 1.14.0 275 | - Updated pytest to 5.0.1 276 | 277 | # v0.3.3 (2019-06-13) 278 | 279 | - Updated models and included printers 280 | - Renamed PoseidonML to NetworkML 281 | - Updated pytest to 4.6.3 282 | 283 | # v0.3.2 (2019-05-31) 284 | 285 | - Updated numpy to 1.16.3 286 | - Updated pytest-cov to 2.7.1 287 | - Updated pytest to 4.5.0 288 | - Reduce places that Tensorflow is imported 289 | - Made it possible to run classifications on CPUs that don't support AVX 290 | 291 | # v0.3.1 (2019-04-18) 292 | 293 | - Updated Tensorflow imports for new deprecations 294 | - Updated pika to 1.0.1 295 | - Removed a bunch of duplicated code to keep the code base cleaner 296 | - Added a bunch of tests to get coverage up to 90% 297 | - Updated pytest to 4.4.1 298 | - Removed the use of md5 and replaced it with sha224 299 | 300 | # v0.3.0 (2019-04-04) 301 | 302 | - Major rewrite and restructuring of the code base, but same functionality 303 | 304 | # v0.2.10 (2019-03-22) 305 | 306 | - Changed the default for Rabbit to not be used 307 | - Changed the environment variable for Rabbit from SKIP_RABBIT to RABBIT 308 | - Improved logging output for summarizing evaluation results of multiple PCAPs 309 | - Updated versions of pika, pytest, redis, and scikit-learn 310 | - Fixed a bug that was preventing training the SoSModel 311 | - Added some more test coverage 312 | - Updated the trained models and labels 313 | 314 | # v0.2.9 (2019-03-08) 315 | 316 | - Updated tensorflow from 1.12.0 to 1.13.1. 317 | - Updated numpy from 1.16.1 to 1.16.2. 318 | - Miscellaneous error checking and spacing corrections. 319 | 320 | # v0.2.8 (2019-02-22) 321 | 322 | - Updated pytest to 4.3.0 from 4.2.0. 323 | - Cleaned up some code issues as pointed out by Codacy. 324 | - Minor miscellaneous bugfixes to support running training natively. 325 | 326 | # v0.2.7 (2019-02-09) 327 | 328 | - Provided a way to run DeviceClassifier training and testing scripts from command line. 329 | - Cleaned up some unused code and consolidated common operations into utils and model class. 330 | - Fixed issue where Makefile built the OneLayer training container when building the test one. 331 | - Updated redis to 3.1.0 332 | - Updated numpy to 1.16.1 333 | 334 | # v0.2.6 (2019-01-25) 335 | 336 | - Updated numpy to 1.16.0 337 | - Updated pika to 0.13.0 338 | - Included a conda yml file for a standalone/dev environment, and new Makefile options to build it. 
339 | 340 | # v0.2.5 (2019-01-11) 341 | 342 | - models have been retrained to fix a warning about invalid results when evaluating a pcap 343 | - some unused code and module has been removed 344 | - upgraded pytest to 4.1.0 and pytest-cov to 2.6.1 345 | 346 | # v0.2.4 (2018-12-21) 347 | 348 | - upgraded scikit-learn to 0.20.2 349 | - removed scipy 350 | - cleaned up requirements.txt and setup.py 351 | - fixed issue where redis was throwing error when saving decisions 352 | - fixed error in eval_onelayer that was using nonexistent key 353 | - Make train/eval/test process consistent for all models 354 | - Fixed path error specific to python 3.5 that occurred when processing PCAP files 355 | - PCAP directories can now be used when running model evals 356 | 357 | # v0.2.3 (2018-12-14) 358 | 359 | - upgraded pytest to 4.0.2 360 | - upgraded scikit-learn to 0.20.1 361 | - improved README documentation 362 | - upgraded redis to 3.0.1 363 | - added pcap directory support 364 | - re-enabled the behavior model 365 | - includes the trained behavior model 366 | - fixed hardcoded onelayer pickle file in randomforest 367 | - fixed missing labels 368 | - simplified rabbit connection 369 | - replaced deprecated randomized logistic regression with random forest 370 | 371 | # v0.2.2 (2018-10-22) 372 | 373 | - upgraded pytest to 3.9.1 374 | - fixed a NoneType error when multiplying 375 | - fixed an issue where the config file wasn't being read properly 376 | - abstracted away the code to read the config file into one place 377 | 378 | # v0.2.1 (2018-10-02) 379 | 380 | - lots of cleanup of duplicated code 381 | - upgraded tensorflow to 1.11.0 382 | - upgraded scikit-learn to 0.20.0 383 | - updated the model 384 | 385 | # v0.2.0 (2018-09-22) 386 | 387 | - moved a bunch of duplicated code into common utils 388 | 389 | # v0.1.9 (2018-09-21) 390 | 391 | - fixed issue where results were not getting sent to rabbitmq or stored in redis 392 | - cleaned up cruft in OneLayer Eval 393 | - moved OneLayer Eval code into a class to reduce duplication 394 | 395 | # v0.1.8 (2018-09-10) 396 | 397 | - upgraded pytest to 3.8.0 398 | - upgraded pytest-cov to 2.6.0 399 | - upgraded tensorflow to 1.10.1 400 | - made all print statements logger statements 401 | - sends messages to rabbitmq now even if not enough sessions 402 | - stores normal/abnormal results in redis now 403 | - fixed performance issue where evaluation would take a long time 404 | - updated the model 405 | 406 | # v0.1.7 (2018-08-24) 407 | 408 | - upgraded pytest to 3.7.2 409 | - upgraded numpy to 1.15.1 410 | 411 | # v0.1.6 (2018-08-10) 412 | 413 | - updated model 414 | - upgraded pytest to 3.7.1 415 | - upgraded scikit-learn to 0.19.2 416 | - linting 417 | 418 | # v0.1.5 (2018-07-27) 419 | 420 | - fixes pairs issue when checking private addresses 421 | - fixes the models path for running in a container 422 | - improve dockerfile builds 423 | - upgraded pika to 0.12.0 424 | - upgraded scipy to 1.1.0 425 | - upgraded numpy to 1.14.5 426 | - upgraded tensorflow to 1.9.0 427 | - fixed vent template 428 | - added some initial tests 429 | - re-trained the onelayer model with improved accuracy 430 | - reduced the number of labels for onelayer to 6 431 | - improvements for developing on poseidonml 432 | 433 | # v0.1.4 (2018-07-13) 434 | 435 | - initial utility release 436 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/host.py: -------------------------------------------------------------------------------- 
1 | import ipaddress 2 | 3 | import netaddr 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from networkml.featurizers.features import Features 8 | 9 | 10 | MAC_BCAST = netaddr.EUI('FF-FF-FF-FF-FF-FF') 11 | ETH_TYPE_ARP = 0x806 12 | ETH_TYPE_IP = 0x800 13 | ETH_TYPE_IPV6 = 0x86DD 14 | ETH_TYPE_IPX = 0x8137 15 | ETH_IP_TYPES = frozenset((ETH_TYPE_ARP, ETH_TYPE_IP, ETH_TYPE_IPV6)) 16 | WK_IP_PROTOS = ('tcp', 'udp', 'icmp', 'arp', 'icmpv6', 'gre', 'esp', 'ah') 17 | WK_IP_PROTOS_INDEX = {WK_IP_PROTOS.index(i): i for i in WK_IP_PROTOS} 18 | TCP_UDP_PROTOS = { 19 | 6: 'tcp', 20 | 17: 'udp', 21 | } 22 | 23 | 24 | class HostBase: 25 | 26 | CALC_COL_NAMES = ( 27 | ('frame.len', 'frame_len'), 28 | ('frame.time_delta_displayed', 'time_delta')) 29 | CALC_COL_FUNCS = ( 30 | ('max', lambda x: x.max()), 31 | ('min', lambda x: x.min()), 32 | ('count', lambda x: x.count()), 33 | ('total', lambda x: x.sum()), 34 | ('average', lambda x: x.mean()), 35 | ('median', lambda x: x.median()), 36 | ('variance', lambda x: x.var()), 37 | ('25q', lambda x: x.quantile(0.25)), 38 | ('75q', lambda x: x.quantile(0.75))) 39 | # https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xml 40 | # TODO: enumerate most common ports from survey (complete indicator matrix too expensive) 41 | WK_PRIV_TCPUDP_PORTS = frozenset( 42 | [22, 23, 25, 53, 67, 68, 69, 80, 88, 110, 123, 137, 138, 139, 143, 161, 443, 631]) 43 | WK_NONPRIV_TCPUDP_PORTS = frozenset( 44 | [1900, 2375, 2376, 5222, 5349, 5353, 5354, 5349, 5357, 6653]) 45 | DROP_PROTOS = frozenset( 46 | ['frame', 'data', 'eth', 'ip', 'ipv6']) 47 | 48 | def _mac(self, mac): 49 | return netaddr.EUI(int(mac), dialect=netaddr.mac_unix_expanded) 50 | 51 | def _is_unicast(self, mac): 52 | mac_val = self._mac(mac) 53 | if mac_val == MAC_BCAST or mac_val.packed[0] & 1: 54 | return False 55 | return True 56 | 57 | def _numericintset(self, nums): 58 | if nums is not None: 59 | return frozenset(int(x) for x in nums if x is not None and pd.notna(x)) 60 | return frozenset() 61 | 62 | def _get_ip(self, row, cols): 63 | ipv = row['ip.version'] 64 | if not pd.isnull(ipv): 65 | ipv = int(ipv) 66 | if ipv == 4: 67 | prefix = 'ip' 68 | else: 69 | prefix = 'ipv6' 70 | for col in cols: 71 | val = row['.'.join((prefix, col))] 72 | if not pd.isnull(val): 73 | return ipaddress.ip_address(int(val)) 74 | return None 75 | 76 | def _get_src_ip(self, row): 77 | return self._get_ip(row, ('src', 'src_host')) 78 | 79 | def _get_dst_ip(self, row): 80 | return self._get_ip(row, ('dst', 'dst_host')) 81 | 82 | def _get_flags(self, mac_df, col_name, decode_map, suffix=None, field_name=None): 83 | try: 84 | col = mac_df[col_name] 85 | unique_flags = self._numericintset(col.unique()) 86 | except KeyError: 87 | unique_flags = [0] 88 | decoded_flags = set() 89 | for bit, decoded_flag in decode_map.items(): 90 | bitval = 2**bit 91 | for flags in sorted(filter(lambda x: x >= bitval, unique_flags)): 92 | if flags & bitval: 93 | decoded_flags.add(decoded_flag) 94 | if field_name is None: 95 | field_name = col_name.replace('.', '_') 96 | if suffix is not None: 97 | return {'tshark_%s_%s_%s' % ( 98 | field_name, decoded_flag, suffix): int(decoded_flag in decoded_flags) 99 | for decoded_flag in decode_map.values()} 100 | return {'tshark_%s_%s' % ( 101 | field_name, decoded_flag): int(decoded_flag in decoded_flags) 102 | for decoded_flag in decode_map.values()} 103 | 104 | def _tshark_flags(self, suffix, mac_df): 105 | mac_row_flags = {} 106 | for func in ( 107 | lambda x, y: 
self._get_flags(x, 'ip.dsfield', { 108 | 0: 'ecn0', 1: 'ecn1', 2: 'dscp0', 3: 'dscp1', 4: 'dscp2', 5: 'dscp3', 6: 'dscp4', 7: 'dscp5'}, suffix=y), 109 | lambda x, y: self._get_flags(x, 'ip.flags', { 110 | 0: 'fin', 1: 'syn', 2: 'rst', 3: 'psh', 4: 'ack', 5: 'urg', 6: 'ece', 7: 'cwr', 8: 'ns'}, suffix=y), 111 | lambda x, y: self._get_flags(x, 'tcp.flags', { 112 | 0: 'fin', 1: 'syn', 2: 'rst', 3: 'psh', 4: 'ack', 5: 'urg', 6: 'ece', 7: 'cwr', 8: 'ns'}, suffix=y), 113 | ): 114 | mac_row_flags.update(func(mac_df, suffix)) 115 | return mac_row_flags 116 | 117 | def _lowest_ip_proto_port(self, mac_df, ip_proto): 118 | if not mac_df.empty: 119 | src = mac_df['%s.srcport' % ip_proto] 120 | dst = mac_df['%s.dstport' % ip_proto] 121 | if src.count() and dst.count(): 122 | return self._numericintset(np.minimum(src, dst).unique()) # pylint: disable=no-member 123 | return frozenset() 124 | 125 | def _tshark_ports(self, suffix, mac_df): 126 | mac_row_ports = {} 127 | 128 | def port_priv(port): 129 | return port < 1024 130 | 131 | for ip_proto_num, ip_proto in TCP_UDP_PROTOS.items(): 132 | proto_df = mac_df[mac_df['ip.proto']==ip_proto_num] 133 | lowest_ports = self._lowest_ip_proto_port(proto_df, ip_proto) 134 | for field_name, ports, wk_ports in ( 135 | ('priv', {port for port in lowest_ports if port_priv( 136 | port)}, self.WK_PRIV_TCPUDP_PORTS), 137 | ('nonpriv', {port for port in lowest_ports if not port_priv( 138 | port)}, self.WK_NONPRIV_TCPUDP_PORTS), 139 | ): 140 | port_flags = {port: int(port in ports) for port in wk_ports} 141 | port_flags.update( 142 | {'other': int(bool(lowest_ports) and not ports.issubset(wk_ports))}) 143 | mac_row_ports.update({ 144 | 'tshark_%s_%s_port_%s_%s' % (ip_proto, field_name, port, suffix): present for port, present in port_flags.items()}) 145 | return mac_row_ports 146 | 147 | def _tshark_ratio_ports(self, mac_df): 148 | mac_row_ports = {} 149 | 150 | def calc_ratio(src_count, dst_count): 151 | packet_ratio = 0 152 | if src_count is not None and dst_count is not None: 153 | if dst_count > 0: 154 | packet_ratio = src_count / dst_count 155 | elif src_count > 0: 156 | packet_ratio = 1 157 | return packet_ratio 158 | 159 | 160 | for ip_proto_num, ip_proto in TCP_UDP_PROTOS.items(): 161 | proto_df = mac_df[mac_df['ip.proto']==ip_proto_num] 162 | src = pd.DataFrame(columns=['%s.srcport' % ip_proto]) 163 | dst = pd.DataFrame(columns=['%s.dstport' % ip_proto]) 164 | if not proto_df.empty: 165 | try: 166 | src = proto_df['%s.srcport' % ip_proto] 167 | dst = proto_df['%s.dstport' % ip_proto] 168 | except KeyError: 169 | pass 170 | for field_name, wk_ports, port_src, port_dst in ( 171 | ('priv', self.WK_PRIV_TCPUDP_PORTS, 172 | src[src <= 1023], dst[dst <= 1023]), 173 | ('nonpriv', self.WK_NONPRIV_TCPUDP_PORTS, 174 | src[src > 1023], dst[dst > 1023])): 175 | src_values = port_src[src.isin(wk_ports)] 176 | dst_values = port_dst[dst.isin(wk_ports)] 177 | src_counts = {} 178 | if not src_values.empty: 179 | src_counts = src_values.value_counts() 180 | dst_counts = {} 181 | if not dst_values.empty: 182 | dst_counts = dst_values.value_counts() 183 | for port in wk_ports: 184 | src_count = src_counts.get(port, None) 185 | dst_count = dst_counts.get(port, None) 186 | mac_row_ports.update({ 187 | 'tshark_%s_%s_packet_ratio_io_port_%s' % (ip_proto, field_name, port): calc_ratio(src_count, dst_count)}) 188 | src_values = port_src[~port_src.isin(wk_ports)] 189 | src_count = 0 190 | if not src_values.empty: 191 | src_count = src_values.value_counts().sum() 192 | dst_values = 
port_dst[~port_dst.isin(wk_ports)] 193 | dst_count = 0 194 | if not dst_values.empty: 195 | dst_count = dst_values.value_counts().sum() 196 | mac_row_ports.update({ 197 | 'tshark_%s_%s_packet_ratio_io_port_%s' % (ip_proto, field_name, 'other'): calc_ratio(src_count, dst_count)}) 198 | return mac_row_ports 199 | 200 | def _tshark_ipversions(self, mac_df): 201 | try: 202 | ip_versions = self._numericintset(mac_df['ip.version'].unique()) 203 | except AttributeError: 204 | ip_versions = frozenset() 205 | return {'tshark_ipv%u' % v: int(v in ip_versions) for v in (4, 6)} 206 | 207 | def _tshark_non_ip(self, mac_df): 208 | try: 209 | eth_types = self._numericintset(mac_df['eth.type'].unique()) 210 | except AttributeError: 211 | eth_types = frozenset() 212 | return { 213 | 'tshark_ipx': int(ETH_TYPE_IPX in eth_types), 214 | 'tshark_nonip': int(bool(eth_types - ETH_IP_TYPES)), 215 | } 216 | 217 | def _tshark_both_private_ip(self, mac_df): 218 | try: 219 | both_private_ip = int(mac_df['_both_private_ip'].max() == 1) 220 | except KeyError: 221 | both_private_ip = 0 222 | return { 223 | 'tshark_both_private_ip': both_private_ip, 224 | } 225 | 226 | def _tshark_ipv4_multicast(self, mac_df): 227 | try: 228 | ipv4_multicast = int(mac_df['_ipv4_multicast'].max() == 1) 229 | except KeyError: 230 | ipv4_multicast = 0 231 | return { 232 | 'tshark_ipv4_multicast': ipv4_multicast, 233 | } 234 | 235 | def _tshark_wk_ip_protocol(self, mac_df): 236 | return self._get_flags(mac_df, '_protos_int', WK_IP_PROTOS_INDEX, suffix=None, field_name='wk_ip_protocol') 237 | 238 | def _tshark_vlan_id(self, mac_df): 239 | return { 240 | 'tshark_tagged_vlan': int(pd.notna(mac_df['vlan.id'].max())) 241 | } 242 | 243 | def _tshark_frame_epoch(self, mac_df): 244 | return { 245 | 'tshark_frame_epoch': float(mac_df['frame.time_epoch'].max()) 246 | } 247 | 248 | def _tshark_unique_ips(self, mac, mac_df): 249 | srcips = mac_df[mac_df['eth.src'] == mac]['_srcip'] 250 | dstips = mac_df[mac_df['eth.src'] == mac]['_dstip'] 251 | return { 252 | 'tshark_srcips': list(set(srcips.unique().tolist()) - {'None'}), 253 | 'tshark_unique_srcips': srcips.nunique(), 254 | 'tshark_unique_dstips': dstips.nunique(), 255 | } 256 | 257 | def _calc_cols(self, mac, mac_df): 258 | mac_row = {} 259 | for suffix, suffix_func in ( 260 | ('out', lambda x: mac_df[mac_df['eth.src'] == x]), 261 | ('in', lambda x: mac_df[mac_df['eth.src'] != x])): 262 | try: 263 | suffix_df = suffix_func(mac) 264 | except KeyError: 265 | continue 266 | for col_name, field_name in self.CALC_COL_NAMES: 267 | col = suffix_df[col_name] 268 | for calc_name, calc_func in self.CALC_COL_FUNCS: 269 | calc_col = 'tshark_%s_%s_%s' % ( 270 | calc_name, field_name, suffix) 271 | val = calc_func(col) 272 | if pd.isnull(val): 273 | val = 0 274 | mac_row.update({calc_col: val}) 275 | for func in ( 276 | self._tshark_flags, 277 | self._tshark_ports): 278 | mac_row.update(func(suffix, suffix_df)) 279 | for func in ( 280 | self._tshark_ipversions, 281 | self._tshark_non_ip, 282 | self._tshark_both_private_ip, 283 | self._tshark_ipv4_multicast, 284 | self._tshark_wk_ip_protocol, 285 | self._tshark_vlan_id, 286 | self._tshark_frame_epoch, 287 | self._tshark_ratio_ports): 288 | mac_row.update(func(mac_df)) 289 | mac_row.update(self._tshark_unique_ips(mac, mac_df)) 290 | return mac_row 291 | 292 | def _calc_mac_row(self, mac, mac_df): 293 | mac_row = {'host_key': str(self._mac(mac))} 294 | mac_row.update(self._calc_cols(mac, mac_df)) 295 | return mac_row 296 | 297 | def _host_key(self, row): 298 | raise 
NotImplementedError 299 | 300 | def _df_ip_flags(self, ip_src, ip_dst): 301 | both_private_ip = 0 302 | ipv4_multicast = 0 303 | if not pd.isnull(ip_src) and not pd.isnull(ip_dst): 304 | both_private_ip = int(ip_src.is_private and ip_dst.is_private) 305 | ipv4_multicast = int(ip_dst.version == 4 and ip_dst.is_multicast) 306 | return (both_private_ip, ipv4_multicast) 307 | 308 | def _encode_df_proto_flags(self, short_row_keys, frame_protocols): 309 | if frame_protocols: 310 | short_frame_protocols = frozenset(frame_protocols.split(':')) 311 | else: 312 | short_frame_protocols = {} 313 | all_protos = short_row_keys.union( 314 | short_frame_protocols) - self.DROP_PROTOS 315 | all_protos_int = 0 316 | for proto in all_protos.intersection(WK_IP_PROTOS): 317 | index = WK_IP_PROTOS.index(proto) 318 | all_protos_int += 2**index 319 | return all_protos_int 320 | 321 | def _df_proto_flags(self, row): 322 | short_row_keys = frozenset(x.split('.')[0] for x, y in row.items( 323 | ) if not pd.isnull(y) and not x.startswith('_')) 324 | return self._encode_df_proto_flags(short_row_keys, row['frame.protocols']) 325 | 326 | def _tshark_all(self, df, srcmacid): 327 | print('calculating intermediates', end='', flush=True) 328 | df['_host_key'], df['_srcip'], df['_dstip'], df['_both_private_ip'], df['_ipv4_multicast'], df['_protos_int'] = zip( 329 | *df.apply(self._host_key, axis=1)) 330 | eth_srcs = frozenset(df['eth.src'].unique()) 331 | eth_dsts = frozenset(df['eth.dst'].unique()) 332 | all_unicast_macs = frozenset( 333 | mac for mac in eth_srcs.union(eth_dsts) if self._is_unicast(mac)) 334 | host_keys = df['_host_key'].unique() 335 | host_keys_count = len(host_keys) 336 | print('.%u MACs, %u sessions' % 337 | (len(all_unicast_macs), host_keys_count), end='', flush=True) 338 | if srcmacid: 339 | minsrcipmac = df.groupby(['eth.src'])[ 340 | '_srcip'].nunique().idxmin(axis=0) 341 | assert minsrcipmac in all_unicast_macs 342 | print('.MAC %s has minimum number of source IPs, selected as canonical source' % 343 | self._mac(minsrcipmac), end='', flush=True) 344 | all_unicast_macs = {minsrcipmac} 345 | mac_rows = [] 346 | for i, mac in enumerate(all_unicast_macs, start=1): 347 | mac_df = df[(df['eth.src'] == mac) | (df['eth.dst'] == mac)] 348 | # If just one MAC, don't need groupby on host key. 349 | if len(all_unicast_macs) == 1: 350 | mac_rows.append(self._calc_mac_row(mac, mac_df)) 351 | else: 352 | s = 0 353 | for _, key_df in mac_df.groupby('_host_key'): 354 | s += 1 355 | if s % 100 == 0: 356 | print('.MAC %u/%u %.1f%%' % (i, len(all_unicast_macs), 357 | s / len(host_keys) * 100), end='', flush=True) 358 | mac_rows.append(self._calc_mac_row(mac, key_df)) 359 | print('.MAC %u/%u 100%%.' 
% 360 | (i, len(all_unicast_macs)), end='', flush=True) 361 | return mac_rows 362 | 363 | 364 | class Host(HostBase, Features): 365 | 366 | def _host_key(self, row): 367 | ip_src = self._get_src_ip(row) 368 | ip_dst = self._get_dst_ip(row) 369 | both_private_ip, ipv4_multicast = self._df_ip_flags(ip_src, ip_dst) 370 | protos_int = self._df_proto_flags(row) 371 | return (0, str(ip_src), str(ip_dst), both_private_ip, ipv4_multicast, protos_int) 372 | 373 | def host_tshark_all(self, df, parsed_args): 374 | return self._tshark_all(df, parsed_args.srcmacid) 375 | 376 | 377 | class SessionHost(HostBase, Features): 378 | 379 | def _host_key(self, row): 380 | eth_src = row['eth.src'] 381 | eth_dst = row['eth.dst'] 382 | ip_src = self._get_src_ip(row) 383 | ip_dst = self._get_dst_ip(row) 384 | both_private_ip, ipv4_multicast = self._df_ip_flags(ip_src, ip_dst) 385 | protos_int = self._df_proto_flags(row) 386 | if not pd.isnull(ip_src) and not pd.isnull(ip_dst): 387 | ip_proto = TCP_UDP_PROTOS.get(row['ip.version'], None) 388 | if ip_proto: 389 | src_port = row['%s.srcport' % ip_proto] 390 | dst_port = row['%s.dstport' % ip_proto] 391 | if ip_src > ip_dst: 392 | key = (ip_proto, eth_src, ip_src, 393 | src_port, eth_dst, ip_dst, dst_port) 394 | else: 395 | key = (ip_proto, eth_dst, ip_dst, 396 | dst_port, eth_src, ip_src, src_port) 397 | else: 398 | key = sorted([(eth_src, ip_src), (eth_dst, ip_dst)]) 399 | else: 400 | key = (row['eth.type'],) + tuple(sorted((eth_src, eth_dst))) 401 | return (hash('-'.join([str(x) for x in key])), str(ip_src), str(ip_dst), both_private_ip, ipv4_multicast, protos_int) 402 | 403 | def sessionhost_tshark_all(self, df, parsed_args): 404 | return self._tshark_all(df, parsed_args.srcmacid) 405 | -------------------------------------------------------------------------------- /networkml/parsers/pcap_to_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | import csv 4 | import functools 5 | import json 6 | import logging 7 | import ntpath 8 | import os 9 | import pathlib 10 | import shlex 11 | import subprocess 12 | import tempfile 13 | from copy import deepcopy 14 | 15 | import pyshark 16 | 17 | from networkml.helpers.gzipio import gzip_reader 18 | from networkml.helpers.gzipio import gzip_writer 19 | 20 | 21 | class PCAPToCSV(): 22 | 23 | def __init__(self, raw_args=None): 24 | self.logger = logging.getLogger(__name__) 25 | self.PROTOCOLS = ['<IP Layer>', 26 | '<ETH Layer>', 27 | '<TCP Layer>', 28 | '<UDP Layer>', 29 | '<ICMP Layer>', 30 | '<ICMPV6 Layer>', 31 | '<DNS Layer>', 32 | '<DHCP Layer>', 33 | '<DHCPV6 Layer>', 34 | '<ARP Layer>', 35 | '<IPV6 Layer>', 36 | '<TLS Layer>'] 37 | self.raw_args = raw_args 38 | 39 | @staticmethod 40 | def ispcap(pathfile): 41 | for ext in ('pcap', 'pcapng', 'dump', 'capture'): 42 | if pathfile.endswith(''.join(('.', ext))): 43 | return True 44 | return False 45 | 46 | @staticmethod 47 | def parse_args(raw_args=None): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | 'path', help='path to a single pcap file, or a directory of pcaps to parse') 51 | parser.add_argument('--combined', '-c', action='store_true', 52 | help='write out all records from all pcaps into a single gzipped csv file') 53 | parser.add_argument('--engine', '-e', choices=['pyshark', 'tshark', 'host'], 54 | default='tshark', help='engine to use to process the PCAP file (default=tshark)') 55 | parser.add_argument('--level', '-l', choices=['packet', 'flow', 'host'], 56 | default='packet', help='level to make the output records (default=packet)') 57 | parser.add_argument('--output', '-o', default=None, 
58 | help='path to write out gzipped csv file or directory for gzipped csv files') 59 | parser.add_argument('--threads', '-t', default=1, type=int, 60 | help='number of async threads to use (default=1)') 61 | parser.add_argument('--verbose', '-v', choices=[ 62 | 'DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', help='logging level (default=INFO)') 63 | parsed_args = parser.parse_args(raw_args) 64 | return parsed_args 65 | 66 | @staticmethod 67 | def get_csv_header(dict_fp): 68 | header_all = set() 69 | with gzip_reader(dict_fp) as f_in: 70 | for line in f_in: 71 | header_all.update(json.loads(line.strip()).keys()) 72 | header = [] 73 | for key in header_all: 74 | if key[0].isalpha() or key[0] == '_': 75 | header.append(key) 76 | return header 77 | 78 | @staticmethod 79 | def combine_csvs(out_paths, combined_path): 80 | # First determine the field names from the top line of each input file 81 | fieldnames = {'filename'} 82 | for filename in out_paths: 83 | with gzip_reader(filename) as f_in: 84 | reader = csv.reader(f_in) 85 | fieldnames.update({header for header in next(reader)}) 86 | 87 | # Then copy the data 88 | with gzip_writer(combined_path) as f_out: 89 | writer = csv.DictWriter(f_out, fieldnames=list(fieldnames)) 90 | writer.writeheader() 91 | for filename in out_paths: 92 | with gzip_reader(filename) as f_in: 93 | reader = csv.DictReader(f_in) 94 | for line in reader: 95 | line['filename'] = filename.split( 96 | '/')[-1].split('csv.gz')[0] 97 | writer.writerow(line) 98 | PCAPToCSV.cleanup_files([filename]) 99 | 100 | @staticmethod 101 | def cleanup_files(paths): 102 | for fi in paths: 103 | if os.path.exists(fi): 104 | os.remove(fi) 105 | 106 | def get_pyshark_packet_data(self, pcap_file, dict_fp): 107 | all_protocols = set() 108 | 109 | pcap_file_short = ntpath.basename(pcap_file) 110 | with gzip_writer(dict_fp) as f_out: 111 | with pyshark.FileCapture(pcap_file, 112 | use_json=True, 113 | include_raw=True, 114 | keep_packets=False, 115 | custom_parameters=['-o', 'tcp.desegment_tcp_streams:false', '-n']) as cap: 116 | for packet in cap: 117 | packet_dict = {} 118 | packet_dict['filename'] = pcap_file_short 119 | frame_info = packet.frame_info._all_fields 120 | for key in frame_info: 121 | packet_dict[key] = frame_info[key] 122 | # can overflow the field size for csv 123 | #packet_dict['raw_packet'] = packet.get_raw_packet() 124 | layers = str(packet.layers) 125 | packet_dict['layers'] = layers 126 | str_layers = layers[1:-1].split(', ') 127 | for str_layer in str_layers: 128 | # ignore raw layers 129 | if 'RAW' not in str_layer: 130 | all_protocols.add(str_layer) 131 | # only include specified protocols due to unknown parsing for some layers 132 | if str_layer in self.PROTOCOLS: 133 | layer_info = getattr(packet, str_layer.split()[ 134 | 0][1:].lower())._all_fields 135 | # check for nested dicts, one level deep 136 | for key in layer_info: 137 | # DNS doesn't parse well 138 | if isinstance(layer_info[key], dict) and str_layer != '<DNS Layer>': 139 | for inner_key in layer_info[key]: 140 | packet_dict[inner_key] = layer_info[key][inner_key] 141 | else: 142 | packet_dict[key] = layer_info[key] 143 | # clean up records 144 | packet_dict_copy = deepcopy(packet_dict) 145 | keys = packet_dict_copy.keys() 146 | for key in keys: 147 | if not key[0].isalpha() or key == 'tcp.payload_raw' or key == 'tcp.payload': 148 | del packet_dict[key] 149 | f_out.write(json.dumps(packet_dict) + '\n') 150 | 151 | for protocol in self.PROTOCOLS: 152 | if protocol in all_protocols: 
all_protocols.remove(protocol) 154 | if all_protocols: 155 | self.logger.warning( 156 | f'Found the following other layers in {pcap_file_short} that were not added to the CSV: {all_protocols}') 157 | 158 | def get_tshark_conv_data(self, pcap_file, dict_fp): 159 | # TODO (add a summary of other packets with protocols?) 160 | output = '' 161 | try: 162 | # TODO perhaps more than just tcp/udp in the future 163 | options = '-n -q -z conv,tcp -z conv,udp' 164 | output = subprocess.check_output(shlex.split( 165 | ' '.join(['tshark', '-r', pcap_file, options]))) 166 | output = output.decode('utf-8') 167 | except Exception as e: # pragma: no cover 168 | self.logger.error(f'{e}') 169 | 170 | in_block = False 171 | name = None 172 | results = {} 173 | for line in output.split('\n'): 174 | if line.startswith('==='): 175 | if in_block: 176 | in_block = False 177 | name = None 178 | continue 179 | else: 180 | in_block = True 181 | continue 182 | if in_block: 183 | if not name: 184 | name = ''.join(line.split(':')).strip() 185 | results[name] = '' 186 | continue 187 | elif not line.startswith('Filter:') and line != '': 188 | results[name] += line + '\n' 189 | 190 | with gzip_writer(dict_fp) as f_out: 191 | for result in results.keys(): 192 | if 'Conversations' in result: 193 | transport_proto = result.split()[0] 194 | # handle conversation parsing 195 | for line in results[result].split('\n'): 196 | if line == '' or line.startswith(' '): 197 | # header or padding, dicard 198 | continue 199 | else: 200 | # TODO perhaps additional features can be extracted for flows from tshark 201 | src, _, dst, frames_l, bytes_l, frames_r, bytes_r, frames_total, bytes_total, rel_start, duration = line.split() 202 | conv = {'Source': src.rsplit(':', 1)[0], 203 | 'Source Port': src.rsplit(':', 1)[1], 204 | 'Destination': dst.rsplit(':', 1)[0], 205 | 'Destination Port': dst.rsplit(':', 1)[1], 206 | 'Transport Protocol': transport_proto, 207 | 'Frames to Source': frames_l, 208 | 'Bytes to Source': bytes_l, 209 | 'Frames to Destination': frames_r, 210 | 'Bytes to Destination': bytes_r, 211 | 'Total Frames': frames_total, 212 | 'Total Bytes': bytes_total, 213 | 'Relative Start': rel_start, 214 | 'Duration': duration} 215 | f_out.write(json.dumps(conv) + '\n') 216 | 217 | @staticmethod 218 | @functools.lru_cache() 219 | def good_json_key(key): 220 | return (key[0].isalpha() or key[0] == '_') and ';' not in key and '(' not in key and '\\' not in key and '{' not in key and '<' not in key and '+' not in key 221 | 222 | def flatten_json(self, item): 223 | flattened_dict = {} 224 | 225 | def flatten(key, value): 226 | if isinstance(value, list): 227 | for i, sub_item in enumerate(value): 228 | flatten(str(i), sub_item) 229 | elif isinstance(value, dict): 230 | sub_keys = value.keys() 231 | for sub_key in sub_keys: 232 | flatten(sub_key, value[sub_key]) 233 | else: 234 | # remove junk 235 | if self.good_json_key(key): 236 | # limit field size for csv 237 | if (value and len(value) < 131072) or not value: 238 | flattened_dict[key] = value 239 | 240 | flatten('', item) 241 | return flattened_dict 242 | 243 | def json_packet_records(self, process): 244 | json_buffer = [] 245 | 246 | def _recordize(): 247 | return json.loads('\n'.join(json_buffer)) 248 | 249 | depth = 0 250 | while True: 251 | json_line = process.stdout.readline().decode(encoding='utf-8', errors='ignore') 252 | if json_line == '' and process.poll() is not None: 253 | break 254 | if not json_line.startswith(' '): 255 | continue 256 | json_line = json_line.strip() 257 | 
bracket_line = json_line.rstrip(',') 258 | if bracket_line.endswith('}'): 259 | depth -= 1 260 | elif bracket_line.endswith('{'): 261 | depth += 1 262 | if depth == 0: 263 | if bracket_line: 264 | json_buffer.append(bracket_line) 265 | if json_buffer: 266 | yield _recordize() 267 | json_buffer = [] 268 | else: 269 | if json_line: 270 | json_buffer.append(json_line) 271 | 272 | def get_tshark_packet_data(self, pcap_file, dict_fp): 273 | options = '-n -V -Tjson' 274 | try: 275 | with subprocess.Popen(shlex.split( 276 | ' '.join(['tshark', '-r', pcap_file, options])), stdout=subprocess.PIPE) as process: 277 | with gzip_writer(dict_fp) as f_out: 278 | for item in self.json_packet_records(process): 279 | f_out.write(json.dumps(self.flatten_json(item)) + '\n') 280 | except Exception as e: # pragma: no cover 281 | self.logger.error(f'{e}') 282 | 283 | def get_tshark_host_data(self, pcap_file, dict_fp): 284 | # TODO 285 | raise NotImplementedError('To be implemented') 286 | 287 | def write_dict_to_csv(self, dict_fp, out_file): 288 | header = PCAPToCSV.get_csv_header(dict_fp) 289 | with gzip_writer(out_file) as f_out: 290 | writer = csv.DictWriter(f_out, fieldnames=header) 291 | writer.writeheader() 292 | try: 293 | with gzip_reader(dict_fp) as f_in: 294 | for line in f_in: 295 | writer.writerow(json.loads(line.strip())) 296 | except Exception as e: # pragma: no cover 297 | self.logger.error(f'Failed to write to CSV because: {e}') 298 | 299 | def parse_file(self, level, in_file, out_file, engine): 300 | self.logger.info(f'Processing {in_file}') 301 | with tempfile.TemporaryDirectory() as tmpdir: 302 | dict_fp = os.path.join(tmpdir, os.path.basename(in_file)) 303 | if level == 'packet': 304 | if engine == 'tshark': 305 | # option for tshark as it's much faster 306 | self.get_tshark_packet_data(in_file, dict_fp) 307 | elif engine == 'pyshark': 308 | # using pyshark to get everything possible 309 | self.get_pyshark_packet_data(in_file, dict_fp) 310 | elif level == 'flow': 311 | # using tshark conv,tcp and conv,udp filters 312 | self.get_tshark_conv_data(in_file, dict_fp) 313 | elif level == 'host': 314 | # TODO unknown what should be in this, just the overarching stats? 315 | raise NotImplementedError('To be implemented') 316 | self.write_dict_to_csv(dict_fp, out_file) 317 | PCAPToCSV.cleanup_files([dict_fp]) 318 | 319 | def process_files(self, threads, level, in_paths, out_paths, engine): 320 | num_files = len(in_paths) 321 | failed_paths = [] 322 | finished_files = 0 323 | # corner case so it works in jupyterlab 324 | if threads < 2: 325 | for i in range(len(in_paths)): 326 | try: 327 | finished_files += 1 328 | self.parse_file(level, in_paths[i], out_paths[i], engine) 329 | self.logger.info( 330 | f'Finished {in_paths[i]}. 
{finished_files}/{num_files} PCAPs done.') 331 | except Exception as e: # pragma: no cover 332 | self.logger.error( 333 | f'{in_paths[i]} generated an exception: {e}') 334 | failed_paths.append(out_paths[i]) 335 | else: 336 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 337 | future_to_parse = {executor.submit( 338 | self.parse_file, level, in_paths[i], out_paths[i], engine): i for i in range(len(in_paths))} 339 | for future in concurrent.futures.as_completed(future_to_parse): 340 | path = future_to_parse[future] 341 | try: 342 | finished_files += 1 343 | future.result() 344 | except Exception as e: # pragma: no cover 345 | self.logger.error( 346 | f'{in_paths[path]} generated an exception: {e}') 347 | failed_paths.append(out_paths[path]) 348 | else: 349 | self.logger.info( 350 | f'Finished {in_paths[path]}. {finished_files}/{num_files} PCAPs done.') 351 | return failed_paths 352 | 353 | def main(self): 354 | parsed_args = PCAPToCSV.parse_args(raw_args=self.raw_args) 355 | in_path = parsed_args.path 356 | out_path = parsed_args.output 357 | combined = parsed_args.combined 358 | engine = parsed_args.engine 359 | threads = parsed_args.threads 360 | log_level = parsed_args.verbose 361 | level = parsed_args.level 362 | 363 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 364 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 365 | logging.basicConfig(level=log_levels[log_level]) 366 | 367 | in_paths = [] 368 | out_paths = [] 369 | 370 | # check if it's a directory or a file 371 | if os.path.isdir(in_path): 372 | if out_path: 373 | pathlib.Path(out_path).mkdir(parents=True, exist_ok=True) 374 | for root, _, files in os.walk(in_path): 375 | for pathfile in files: 376 | if PCAPToCSV.ispcap(pathfile): 377 | in_paths.append(os.path.join(root, pathfile)) 378 | if out_path: 379 | out_paths.append(os.path.join( 380 | out_path, pathfile) + '.csv.gz') 381 | else: 382 | out_paths.append(os.path.join( 383 | root, pathfile) + '.csv.gz') 384 | else: 385 | in_paths.append(in_path) 386 | default_out_path = in_path + '.csv.gz' 387 | 388 | if out_path: 389 | if os.path.isdir(out_path): 390 | out_paths.append(os.path.join(out_path, os.path.basename(default_out_path))) 391 | else: 392 | out_paths.append(out_path) 393 | else: 394 | out_paths.append(default_out_path) 395 | 396 | if level == 'packet' and engine == 'pyshark': 397 | self.logger.info( 398 | f'Including the following layers in CSV (if they exist): {self.PROTOCOLS}') 399 | 400 | failed_paths = self.process_files( 401 | threads, level, in_paths, out_paths, engine) 402 | 403 | for failed_path in failed_paths: # pragma: no cover 404 | if failed_path in out_paths: 405 | out_paths.remove(failed_path) 406 | 407 | if combined: 408 | if out_paths: 409 | combined_path = os.path.join( 410 | os.path.dirname(out_paths[0]), 'combined.csv.gz') 411 | else: 412 | combined_path = 'combined.csv.gz' 413 | self.logger.info( 414 | f'Combining CSVs into a single file: {combined_path}') 415 | PCAPToCSV.combine_csvs(out_paths, combined_path) 416 | return combined_path 417 | else: 418 | self.logger.info( 419 | f'GZipped CSV file(s) written out to: {out_paths}') 420 | if len(out_paths) > 1: 421 | return os.path.dirname(out_paths[0]) 422 | return out_paths[0] 423 | 424 | 425 | if __name__ == '__main__': # pragma: no cover 426 | instance = PCAPToCSV() 427 | instance.main() 428 | -------------------------------------------------------------------------------- /networkml/algorithms/host_footprint.py: 
--------------------------------------------------------------------------------
 1 | """
 2 | A class to perform machine learning operations on computer network traffic
 3 | """
 4 | import argparse
 5 | import ast
 6 | import json
 7 | import logging
 8 | import os
 9 | from collections import defaultdict
10 | 
11 | import joblib
12 | import numpy as np
13 | import pandas as pd
14 | from sklearn import preprocessing
15 | from sklearn.metrics import accuracy_score
16 | from sklearn.metrics import confusion_matrix
17 | from sklearn.metrics import f1_score
18 | from sklearn.metrics import precision_score
19 | from sklearn.metrics import recall_score
20 | from sklearn.model_selection import GridSearchCV
21 | from sklearn.neural_network import MLPClassifier
22 | from sklearn.preprocessing import LabelBinarizer
23 | 
24 | import networkml
25 | 
26 | 
27 | class HostFootprint():
28 |     """
29 |     Perform machine learning operations on a host's network traffic
30 | 
31 |     A class to perform machine learning operations on network traffic
32 |     represented at the host footprint level. "Host footprint" refers to
33 |     a representation of network traffic in which there are statistical
34 |     features that characterize all packets with a particular host as
35 |     the origin or source.
36 |     """
37 | 
38 |     def __init__(self, raw_args=None):
39 |         self.logger = logging.getLogger(__name__)
40 |         self.raw_args = raw_args
41 |         self.list = None
42 |         self.model_path = None
43 | 
44 |     @staticmethod
45 |     def regularize_df(df):
46 |         # need host_key, tshark_srcips, and frame_epoch to send
47 |         # source_ip/source_mac to Poseidon.
48 |         cols = [col for col in ('host_key', 'tshark_srcips', 'tshark_frame_epoch', 'role') if col in df.columns]
49 |         # TODO: remove ratio features for now for model compatibility.
50 |         cols.extend([col for col in df.columns if 'ratio' in col])
51 |         host_key = df.get('host_key', None)
52 |         tshark_srcips = df.get('tshark_srcips', None)
53 |         frame_epoch = df.get('tshark_frame_epoch', None)
54 |         df = df.drop(columns=cols)
55 |         # Dataframe column order must be the same for train/predict!
56 |         df = df.reindex(columns=sorted(df.columns))
57 |         return df, host_key, tshark_srcips, frame_epoch
58 | 
59 |     @staticmethod
60 |     def serialize_label_encoder(le, path):
61 |         """Serialize label encoder to enable persistence
62 |         without pickling the file. .pkl files are a security
63 |         risk and should be avoided.
64 |         Model is saved as a JSON object.
65 |         INPUT:
66 |         --le: the label encoder object (from sklearn) to be saved
67 |         --path: filepath for saving the object
68 |         OUTPUT:
69 |         --Does not return anything
70 |         """
71 |         serialized_le = {
72 |             'classes': le.classes_.tolist(),
73 |         }
74 |         with open(path, 'w') as model_json:
75 |             json.dump(serialized_le, model_json)
76 | 
77 |     @staticmethod
78 |     def deserialize_label_encoder(path):
79 |         """Deserialize JSON object storing label encoder.
80 |         Label encoder (from sklearn) is re-instantiated
81 |         with proper values.
82 |         INPUT:
83 |         --path: filepath for loading the JSON object
84 |         OUTPUT:
85 |         --le: Returns label encoder (sklearn) object
86 |         """
87 |         with open(path, 'r') as model_json:
88 |             model_dict = json.load(model_json)
89 |         # Instantiate and assign class label
90 |         le = preprocessing.LabelEncoder()
91 |         le.classes_ = np.array(model_dict['classes'])
92 |         return le
93 | 
94 |     @staticmethod
95 |     def serialize_model(model, path):
96 |         """Serialize model to enable persistence
97 |         without pickling the file.
.pkl files are a security 98 | risk and should be avoided 99 | Model is saved as a JSON object. 100 | INPUT: 101 | --model: the model object (an MLPClassifier from sklearn) to be saved 102 | --path: filepath for saving the object 103 | OUTPUT: 104 | --Does not return anything 105 | """ 106 | def serialize_label_binarizer(label_binarizer): 107 | serialized_label_binarizer = { 108 | 'neg_label': label_binarizer.neg_label, 109 | 'pos_label': label_binarizer.pos_label, 110 | 'sparse_output': label_binarizer.sparse_output, 111 | 'y_type_': label_binarizer.y_type_, 112 | 'sparse_input_': label_binarizer.sparse_input_, 113 | 'classes_': label_binarizer.classes_.tolist() 114 | } 115 | 116 | return serialized_label_binarizer 117 | 118 | serialized_model = { 119 | 'meta': 'mlp', 120 | 'coefs_': [array.tolist() for array in model.coefs_], 121 | 'loss_': model.loss_, 122 | 'intercepts_': [array.tolist() for array in model.intercepts_], 123 | 'n_iter_': model.n_iter_, 124 | 'n_layers_': model.n_layers_, 125 | 'n_outputs_': model.n_outputs_, 126 | 'out_activation_': model.out_activation_, 127 | '_label_binarizer': serialize_label_binarizer(model._label_binarizer), 128 | 'params': model.get_params(), 129 | 'features':model.features, 130 | } 131 | 132 | if isinstance(model.classes_, list): 133 | serialized_model['classes_'] = [array.tolist() for array in model.classes_] 134 | else: 135 | serialized_model['classes_'] = model.classes_.tolist() 136 | 137 | with open(path, 'w') as out_file: 138 | json.dump(serialized_model, out_file, indent=2) 139 | #skljson.to_json(model, path) 140 | 141 | @staticmethod 142 | def deserialize_model(path): 143 | """Deserialize JSON object storing the ml model. 144 | Model (an MLPClassifier from sklearn) is re-instantiated 145 | with proper values. 
146 | INPUT: 147 | --path: filepath for loading the JSON object 148 | OUTPUT: 149 | --model: Returns an MLPClassifier (sklearn) object 150 | """ 151 | def deserialize_label_binarizer(label_binarizer_dict): 152 | label_binarizer = LabelBinarizer() 153 | label_binarizer.neg_label = label_binarizer_dict['neg_label'] 154 | label_binarizer.pos_label = label_binarizer_dict['pos_label'] 155 | label_binarizer.sparse_output = label_binarizer_dict['sparse_output'] 156 | label_binarizer.y_type_ = label_binarizer_dict['y_type_'] 157 | label_binarizer.sparse_input_ = label_binarizer_dict['sparse_input_'] 158 | label_binarizer.classes_ = np.array(label_binarizer_dict['classes_']) 159 | 160 | return label_binarizer 161 | 162 | # Load (or deserialize) model from JSON 163 | model_dict = {} 164 | with open(path, 'r') as in_file: 165 | model_dict = json.load(in_file) 166 | 167 | model = MLPClassifier(**model_dict['params']) 168 | 169 | model.coefs_ = np.array(model_dict['coefs_'], dtype=object) 170 | model.loss_ = model_dict['loss_'] 171 | model.intercepts_ = np.array(model_dict['intercepts_'], dtype=object) 172 | model.n_iter_ = model_dict['n_iter_'] 173 | model.n_layers_ = model_dict['n_layers_'] 174 | model.n_outputs_ = model_dict['n_outputs_'] 175 | model.out_activation_ = model_dict['out_activation_'] 176 | model._label_binarizer = deserialize_label_binarizer(model_dict['_label_binarizer']) 177 | model.features = list(model_dict['features']) 178 | 179 | model.classes_ = np.array(model_dict['classes_']) 180 | # Convert coeficients to numpy arrays to enable JSON deserialization 181 | # This is a hack to compensate for a bug in sklearn_json 182 | for i, x in enumerate(model.coefs_): 183 | model.coefs_[i] = np.array(x) 184 | return model 185 | 186 | @staticmethod 187 | def serialize_scaler(scaler, path): 188 | return joblib.dump(scaler, path) 189 | 190 | @staticmethod 191 | def deserialize_scaler(path): 192 | return joblib.load(path) 193 | 194 | @staticmethod 195 | def parse_args(raw_args=None): 196 | """ 197 | Use python's argparse module to collect command line arguments 198 | for using this class 199 | """ 200 | netml_path = list(networkml.__path__) 201 | parser = argparse.ArgumentParser() 202 | parser.add_argument('path', help='path to a single csv file') 203 | parser.add_argument('--eval_data', 204 | help='path to eval CSV file, if training') 205 | parser.add_argument('--kfolds', '-k', 206 | default=5, 207 | help='specify number of folds for k-fold cross validation') 208 | parser.add_argument('--label_encoder', '-l', 209 | default=os.path.join(netml_path[0], 210 | 'trained_models/host_footprint_le.json'), 211 | help='specify a path to load or save label encoder') 212 | parser.add_argument('--scaler', 213 | default=os.path.join(netml_path[0], 214 | 'trained_models/host_footprint_scaler.mod'), 215 | help='specify a path to load or save scaler') 216 | parser.add_argument('--operation', '-O', choices=['train', 'predict', 'eval'], 217 | default='predict', 218 | help='choose which operation task to perform, \ 219 | train or predict (default=predict)') 220 | parser.add_argument('--trained_model', 221 | default=os.path.join(netml_path[0], 222 | 'trained_models/host_footprint.json'), 223 | help='specify a path to load or save trained model') 224 | parser.add_argument('--list', '-L', 225 | choices=['features'], 226 | default=None, 227 | help='list information contained within model defined by --trained_model') 228 | parser.add_argument('--verbose', '-v', 229 | choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 230 | 
default='INFO', 231 | help='logging level (default=INFO)') 232 | parser.add_argument('--train_unknown', default=False, action='store_true', 233 | help='Train on unknown roles') 234 | parsed_args = parser.parse_args(raw_args) 235 | return parsed_args 236 | 237 | def _get_test_train_csv(self, path, train_unknown): 238 | df, _, _, _ = self.regularize_df(pd.read_csv(path)) 239 | df = df.fillna(0) 240 | # Split dataframe into X (the input features or predictors) 241 | # and y (the target or outcome or dependent variable) 242 | df['role'] = df.filename.str.split('-').str[0] 243 | # Drop unknown roles. 244 | if not train_unknown: 245 | df = df[df['role'] != 'Unknown'] 246 | X = df.drop(['filename', 'role'], axis=1) 247 | y = df.role 248 | column_list = list(X.columns.values) 249 | X = self.string_feature_check(X) 250 | return (X, y, column_list) 251 | 252 | def summarize_eval_data(self, model, scaler, label_encoder, eval_data, train_unknown): 253 | X_test, y_true, _ = self._get_test_train_csv(eval_data, train_unknown) 254 | X_test = scaler.transform(X_test) 255 | y_true = label_encoder.transform(y_true) 256 | y_pred = model.predict(X_test) 257 | 258 | for metric, name in ( 259 | (accuracy_score, 'accuracy'), 260 | (precision_score, 'precision'), 261 | (recall_score, 'recall'), 262 | (f1_score, 'f1')): 263 | if metric == accuracy_score: 264 | val = metric(y_true, y_pred) 265 | else: 266 | val = metric(y_true, y_pred, average='weighted') 267 | val = np.round(val, 4) 268 | self.logger.info(f'{name}: {val}') 269 | 270 | conf_matrix = confusion_matrix(y_true, y_pred) 271 | self.logger.info(conf_matrix) 272 | self.logger.info(label_encoder.classes_.tolist()) 273 | 274 | def eval(self, path, scaler_path, le_path, model_path, train_unknown): 275 | """ 276 | Accept CSV and summarize based on already trained model. 277 | """ 278 | scaler = self.deserialize_scaler(scaler_path) 279 | le = self.deserialize_label_encoder(le_path) 280 | self.model = self.deserialize_model(model_path) 281 | self.summarize_eval_data(self.model, scaler, le, path, train_unknown) 282 | 283 | def train(self): 284 | """ 285 | This function takes a .csv file of host footprint features--i.e. each 286 | row is a feature vector for a given host and each column is a feature 287 | --and trains a model to do functional role classification. This function 288 | saves the trained model. Because the best model is still yet to be 289 | determined, this method uses only a simple neural network. A future 290 | version of this function will use a superior model once our research 291 | group has done experiments with different models and hyperparameter 292 | optimization. 
293 |         """
294 |         X, y, cols = self._get_test_train_csv(self.path, self.train_unknown)
295 | 
296 |         unique_roles = sorted(y.unique())
297 |         self.logger.info(f'inferring roles {unique_roles}')
298 | 
299 |         # Normalize X features before training
300 |         scaler = preprocessing.StandardScaler()
301 |         scaler.fit(X)
302 |         X = scaler.transform(X)
303 | 
304 |         # Convert y into categorical/numerical feature
305 |         le = preprocessing.LabelEncoder()
306 |         y = le.fit_transform(y)
307 | 
308 |         # Instantiate neural network model
309 |         # MLP = multi-layer perceptron
310 |         model = MLPClassifier()
311 | 
312 |         # Perform grid-search with hyperparameter optimization
313 |         # to find the best model
314 |         parameters = {'hidden_layer_sizes': [(64, 32), (32, 16),
315 |                                              (64, 32, 32),
316 |                                              (64, 32, 32, 16)]}
317 |         clf = GridSearchCV(model, parameters,
318 |                            cv=self.kfolds, n_jobs=-1,
319 |                            scoring='f1_weighted')
320 | 
321 |         self.logger.info('Beginning model training')
322 |         # Find best fitting model from the hyper-parameter
323 |         # optimization process
324 |         self.model = clf.fit(X, y).best_estimator_
325 |         self.model.features = cols
326 | 
327 |         # Save model to JSON
328 |         self.serialize_model(self.model, self.model_path)
329 |         self.serialize_scaler(scaler, self.scaler)
330 |         self.serialize_label_encoder(le, self.le_path)
331 | 
332 |         if self.eval_data:
333 |             self.summarize_eval_data(self.model, scaler, le, self.eval_data, self.train_unknown)  # pass the fitted objects, not their file paths
334 | 
335 |     def predict(self):
336 |         """
337 |         This function takes a csv of features at the host footprint level and
338 |         then makes a role prediction for each row. The output is the top three
339 |         roles.
340 | 
341 |         OUTPUTS:
342 |         --all_predictions: a dict with the filename for a key and a JSON'ified
343 |         dict for a value. See sorted_roles_to_dict() for a description of
344 |         the value's structure.
345 |         """
346 |         scaler = self.deserialize_scaler(self.scaler)
347 |         # Get label encoder
348 |         le = self.deserialize_label_encoder(self.le_path)
349 |         # Load (or deserialize) model from JSON
350 |         self.model = self.deserialize_model(self.model_path)
351 | 
352 |         # Load data from host footprint .csv
353 |         csv_df = pd.read_csv(self.path)
354 |         df, host_key, tshark_srcips, frame_epoch = self.regularize_df(csv_df)
355 |         # Split dataframe into X (the input features or predictors)
356 |         # and y (the target or outcome or dependent variable)
357 |         # This drop function should work even if there is no column
358 |         # named filename
359 |         X = df.drop('filename', axis=1)
360 | 
361 |         # Get filenames to match to predictions
362 |         filename = df.filename
363 | 
364 |         # Normalize X features before predicting
365 |         X = scaler.transform(X)
366 | 
367 |         self.logger.info('Executing model inference')
368 |         # Make model prediction - will return a vector of values
369 |         predictions_rows = self.model.predict_proba(X)
370 | 
371 |         # Dict to store top role and list of top roles
372 |         all_predictions = self.get_individual_predictions(
373 |             predictions_rows, le, filename, host_key, tshark_srcips, frame_epoch)
374 | 
375 |         return json.dumps(all_predictions)
376 | 
377 |     def get_individual_predictions(self, predictions_rows, label_encoder,
378 |                                    filename, host_key, tshark_srcips,
379 |                                    frame_epoch, top_n_roles=3):
380 |         """ Return role predictions for given device
381 | 
382 |         INPUTS:
383 |         --predictions_rows: each device is represented as a row
384 |         --label_encoder: a mapping of device role name to numerical category
385 |         --filename: the filename of the pcap for which a prediction is made
386 |         --host_key: canonical source MAC for this pcap.
387 |         --tshark_srcips: canonical source IPs for this pcap.
388 |         --frame_epoch: the timestamp of the packet.
389 | 
390 |         OUTPUTS:
391 |         --all_predictions: a dict with the filename for a key and a
392 |         JSON'ified dict for a value. See sorted_roles_to_dict() for a description
393 |         of the value's structure.
394 |         """
395 | 
396 |         # Dict to store JSON of top n roles and probabilities per device
397 |         all_predictions = defaultdict(list)
398 |         num_roles = len(label_encoder.classes_)
399 |         labels = label_encoder.inverse_transform([i for i in range(num_roles)])
400 | 
401 |         # Loop through different devices on which to make prediction
402 |         for i, predictions in enumerate(predictions_rows):
403 |             role_list = [(k, v) for k, v in zip(labels, predictions)]
404 |             # Sort role list by probabilities
405 |             role_list_sorted = sorted(role_list, key=lambda x: x[1], reverse=True)[:top_n_roles]
406 |             # Dump top role and roles-probability list
407 |             host_results = self.sorted_roles_to_dict(role_list_sorted)
408 |             if host_key is not None:
409 |                 host_results.update({'source_mac': host_key[i]})
410 |             if tshark_srcips is not None:
411 |                 source_ip = ast.literal_eval(tshark_srcips[i])
412 |                 if source_ip:
413 |                     source_ip = source_ip[0]
414 |                 else:
415 |                     source_ip = None
416 |                 host_results.update({'source_ip': source_ip})
417 |             if frame_epoch is not None:
418 |                 host_results.update({'timestamp': frame_epoch[i]})
419 |             all_predictions[filename[i]].append(host_results)
420 | 
421 |         return all_predictions
422 | 
423 | 
424 |     @staticmethod
425 |     def sorted_roles_to_dict(role_list_sorted, threshold=.5):
426 |         """ Convert sorted role-probability list into formatted dict
427 | 
428 |         This function ensures that the top role returned is Unknown
429 |         if the top role has a probability less than the threshold
430 |         specified in the default input parameter.
431 | 
432 |         INPUTS:
433 |         --role_list_sorted: a sorted list that associates roles
434 |         with their probabilities
435 |         --threshold: probability threshold below which the top role
436 |         should be designated as "Unknown"
437 | 
438 |         OUTPUTS:
439 |         --predictions: a dict with the top role and a sorted role list
440 |         """
441 | 
442 |         # Probability associated with the most likely role
443 |         top_role_prob = role_list_sorted[0][1]
444 | 
445 |         # Only use actual top role if probability is greater
446 |         # than designated threshold
447 |         if top_role_prob <= threshold:
448 |             top_role = 'Unknown'
449 |         else:
450 |             top_role = role_list_sorted[0][0]  # Most likely role
451 | 
452 |         # Create dict to store prediction results
453 |         role_predictions = {
454 |             'top_role': top_role,
455 |             'role_list': role_list_sorted,
456 |         }
457 | 
458 |         return role_predictions
459 | 
460 | 
461 |     def string_feature_check(self, X):
462 |         """
463 |         This function takes a pandas dataframe that contains the
464 |         features for a model and checks if any of the features are
465 |         strings (or "objects" in the pandas ontology). If any of the
466 |         features are strings, then that feature is expanded into dummy
467 |         features, i.e. a series of 0/1 features for each category within
468 |         that object feature. The function then removes the original feature.
469 | 
470 |         INPUTS:
471 |         --X: a pandas dataframe with only the training features
472 | 
473 |         OUTPUTS:
474 |         --X: a pandas dataframe expanded with dummy features
475 | 
476 |         """
477 | 
478 |         # loop through columns in X
479 |         for col in X.columns:
480 | 
481 |             # Check if the feature's data type is string
482 |             # Object is the datatype pandas uses for storing strings
483 |             if X[col].dtype == 'object':
484 | 
485 |                 # log a message if a string column is found
486 |                 self.logger.info(f'String object found in column {col}')
487 | 
488 |                 # Expand features into "dummy", i.e.
0/1 489 | # features 490 | new_features = pd.get_dummies(X[col]) 491 | 492 | # Add new features onto X dataframe 493 | X = pd.concat([X, new_features], axis=1) 494 | 495 | # Remove original non-expanded feature from X 496 | X = X.drop(col, axis=1) 497 | 498 | return X 499 | 500 | 501 | def list_model(self): 502 | model = self.deserialize_model(self.model_path) 503 | if self.list == 'features': 504 | return model.features 505 | 506 | 507 | def main(self): 508 | """ 509 | Collect and parse command line arguments for using this class 510 | """ 511 | 512 | # Collect command line arguments 513 | parsed_args = HostFootprint.parse_args(raw_args=self.raw_args) 514 | self.path = parsed_args.path 515 | self.eval_data = parsed_args.eval_data 516 | self.model_path = parsed_args.trained_model 517 | self.le_path = parsed_args.label_encoder 518 | self.scaler = parsed_args.scaler 519 | self.kfolds = int(parsed_args.kfolds) 520 | self.train_unknown = parsed_args.train_unknown 521 | self.list = parsed_args.list 522 | operation = parsed_args.operation 523 | log_level = parsed_args.verbose 524 | 525 | # Set logging output options 526 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 527 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 528 | logging.basicConfig(level=log_levels[log_level]) 529 | 530 | self.logger.debug(f'hostfootprint.main list: {self.list}') 531 | if self.list: 532 | model_list = self.list_model() 533 | if model_list and len(model_list) > 0: 534 | result = f'Listing {self.list} for model at {self.model_path}:\n{model_list}' 535 | return result 536 | else: 537 | return f'model found at {self.model_path} contains no {self.list}' 538 | 539 | # Basic execution logic 540 | if operation == 'train': 541 | if not self.train_unknown: 542 | self.logger.info(f'Role Unknown will be dropped from training data') 543 | self.train() 544 | self.logger.info(f'Saved model to: {self.model_path}') 545 | self.logger.info(f'Saved label encoder to: {self.le_path}') 546 | return self.model_path 547 | if operation == 'predict': 548 | role_prediction = self.predict() 549 | self.logger.info(f'{role_prediction}') 550 | return role_prediction 551 | if operation == 'eval': 552 | return self.eval(self.path, self.scaler, self.le_path, self.model_path, self.train_unknown) 553 | return None 554 | 555 | 556 | if __name__ == '__main__': 557 | host_footprint = HostFootprint() 558 | host_footprint.main() 559 | --------------------------------------------------------------------------------
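Programmatic usage (a minimal sketch, not part of the repository): both PCAPToCSV and HostFootprint accept raw_args and expose main(), so the command-line flows defined by their parse_args() methods above can also be driven from Python. The directory and file names below are hypothetical, the features CSV handed to HostFootprint is assumed to have already been produced by the featurizer stage, and a trained model is assumed to exist at the default --trained_model path.

# Hypothetical driver script; paths and file names are illustrative only.
from networkml.parsers.pcap_to_csv import PCAPToCSV
from networkml.algorithms.host_footprint import HostFootprint

# Turn a directory of pcaps into one combined, gzipped packet-level CSV.
packet_csv = PCAPToCSV(raw_args=['pcaps/', '--combined',
                                 '--engine', 'tshark',
                                 '--level', 'packet',
                                 '--output', 'out/']).main()
print(packet_csv)  # e.g. out/combined.csv.gz

# Predict roles from a host-footprint features CSV (assumed to exist already);
# predict() returns a JSON string of per-filename role predictions.
predictions_json = HostFootprint(raw_args=['features/combined.csv',
                                           '--operation', 'predict']).main()
print(predictions_json)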