├── .coveragerc ├── networkml ├── algorithms │ ├── __init__.py │ └── host_footprint.py ├── featurizers │ ├── __init__.py │ ├── funcs │ │ ├── __init__.py │ │ ├── packet.py │ │ ├── generic.py │ │ ├── flow.py │ │ └── host.py │ ├── features.py │ ├── README.md │ ├── main.py │ └── csv_to_features.py ├── helpers │ ├── __init__.py │ ├── gzipio.py │ ├── pandas_csv_importer.py │ └── results_output.py ├── parsers │ ├── __init__.py │ └── pcap_to_csv.py ├── __init__.py ├── trained_models │ ├── host_footprint_scaler.mod │ ├── host_footprint_le.json │ ├── label_assignments.json │ └── README.md ├── __main__.py └── NetworkML.py ├── netml-dev.yml ├── MAINTAINERS ├── renovate.json ├── tests ├── test_networkml.py ├── test_data │ ├── trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap │ ├── trace_ab34_2001-01-01_02_03-client-ip-1-2-3-4.pcap │ ├── trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap │ ├── bad_data_too_few_columns.csv │ ├── list_test.json │ ├── combined_two_roles.csv │ ├── combined_three_roles.csv │ └── combined.csv ├── test_funcs_packet.py ├── test_featurizers_features.py ├── test_e2e.sh ├── test_pcap_to_csv.py ├── test_featurizers_main.py ├── test_results_output.py ├── test_csv_to_features.py ├── test_funcs_host.py └── test_algorithms_host_footprint.py ├── .sastscanrc ├── .dockerignore ├── .github ├── workflows │ ├── make.yml │ ├── pypi.yaml │ ├── stale.yml │ ├── sl-scan.yaml │ ├── secrets-scan.yml │ ├── test.yml │ ├── docker-amd64.yml │ ├── docker.yml │ └── semgrep.yml └── ISSUE_TEMPLATE │ └── bug_report.md ├── Dockerfile.standalone ├── .pre-commit-config.yaml ├── Dockerfile.test ├── Dockerfile ├── Makefile ├── AUTHORS ├── pyproject.toml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── README.md ├── CONTRIBUTING.md ├── LICENSE └── CHANGELOG.md /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | -------------------------------------------------------------------------------- /networkml/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/featurizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /networkml/__init__.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | __version__ = version('networkml') 4 | -------------------------------------------------------------------------------- /netml-dev.yml: -------------------------------------------------------------------------------- 1 | name: netml-dev 2 | channels: 3 | - defaults 4 | dependencies: 5 | - pip=20.0 6 | - python=3.7.6 7 | -------------------------------------------------------------------------------- /MAINTAINERS: 
-------------------------------------------------------------------------------- 1 | Charlie Lewis 2 | Josh Bailey 3 | Ryan Ashley 4 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base", 4 | "docker:enableMajor" 5 | ], 6 | "ignorePaths": [] 7 | } 8 | -------------------------------------------------------------------------------- /tests/test_networkml.py: -------------------------------------------------------------------------------- 1 | from networkml.NetworkML import NetworkML 2 | 3 | 4 | def test_smoke(): 5 | instance = NetworkML() 6 | -------------------------------------------------------------------------------- /networkml/trained_models/host_footprint_scaler.mod: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/networkml/trained_models/host_footprint_scaler.mod -------------------------------------------------------------------------------- /tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap -------------------------------------------------------------------------------- /tests/test_data/trace_ab34_2001-01-01_02_03-client-ip-1-2-3-4.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/tests/test_data/trace_ab34_2001-01-01_02_03-client-ip-1-2-3-4.pcap -------------------------------------------------------------------------------- /networkml/featurizers/funcs/packet.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | 3 | 4 | class Packet(Features): 5 | 6 | def all(self, rows): 7 | return rows 8 | -------------------------------------------------------------------------------- /tests/test_data/trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/faucetsdn/NetworkML/HEAD/tests/test_data/trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap -------------------------------------------------------------------------------- /.sastscanrc: -------------------------------------------------------------------------------- 1 | { 2 | "type": ["credscan", "python", "dockerfile", "yaml"] 3 | "SCAN_ANNOTATE_PR": true 4 | "PR_COMMENT_TEMPLATE": "## Scan Summary\n%(summary)s\n## Recommendation\n%(recommendation)s\n" 5 | } -------------------------------------------------------------------------------- /tests/test_funcs_packet.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.funcs.packet import Packet 2 | 3 | 4 | def test_packet_all(): 5 | instance = Packet() 6 | result = instance.all('foo') 7 | assert result == 'foo' 8 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | Dockerfile.base 3 | Dockerfile.test 4 | .gitignore 5 | AUTHORS 6 | CHANGELOG.md 7 | CODE_OF_CONDUCT.md 8 | CONTRIBUTING.md 9 | LICENSE 10 | MAINTAINERS.md 11 
| Makefile 12 | README.md 13 | -------------------------------------------------------------------------------- /networkml/trained_models/host_footprint_le.json: -------------------------------------------------------------------------------- 1 | {"classes": ["ActiveDirectoryController", "AdminServer", "AdminWorkstation", "BusinessWorkstation", "ConfluenceServer", "DevWorkstation", "DistrFileShare", "ExchangeServer", "GPULaptop", "GithubRepo", "PKIServer", "Printer"]} -------------------------------------------------------------------------------- /.github/workflows/make.yml: -------------------------------------------------------------------------------- 1 | name: make 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | make: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Test make 13 | run: | 14 | make test 15 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/generic.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | from networkml.helpers.pandas_csv_importer import WS_FIELDS 3 | 4 | 5 | class Generic(Features): 6 | 7 | def all(self, rows_f): 8 | return [{field: row.get(field, '') for field in WS_FIELDS} for row in rows_f()] 9 | -------------------------------------------------------------------------------- /Dockerfile.standalone: -------------------------------------------------------------------------------- 1 | FROM networkml 2 | LABEL maintainer="Ryan Ashley " 3 | 4 | ENTRYPOINT ["networkml"] 5 | CMD ["--trained_model=/trained_models/host_footprint.json", "--label_encoder=/trained_models/host_footprint_le.json", "--scaler=/trained_models/host_footprint_scaler.mod", "--operation", "predict", "/pcaps"] -------------------------------------------------------------------------------- /networkml/helpers/gzipio.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import io 3 | 4 | 5 | def gzip_reader(gzip_file): 6 | return io.TextIOWrapper(gzip.open(gzip_file, 'r'), newline='') # pytype: disable=wrong-arg-types 7 | 8 | 9 | def gzip_writer(gzip_file): 10 | return io.TextIOWrapper(gzip.open(gzip_file, 'w'), newline='', write_through=True) # pytype: disable=wrong-arg-types 11 | -------------------------------------------------------------------------------- /networkml/__main__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import humanize 3 | import logging 4 | import time 5 | 6 | 7 | def main(): 8 | from networkml.NetworkML import NetworkML 9 | start = time.time() 10 | NetworkML() 11 | end = time.time() 12 | elapsed = end - start 13 | human_elapsed = humanize.naturaldelta(datetime.timedelta(seconds=elapsed)) 14 | logging.info(f'Elapsed Time: {elapsed} seconds ({human_elapsed})') 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git@github.com:pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: autopep8-wrapper 8 | - id: check-case-conflict 9 | - id: check-json 10 | - id: pretty-format-json 11 | args: ['--autofix'] 12 | - id: double-quote-string-fixer 13 | - id: check-yaml 14 | - repo: git@github.com:asottile/reorder_python_imports 15 | rev: v2.6.0 
16 | hooks: 17 | - id: reorder-python-imports 18 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/flow.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | 3 | 4 | class Flow(Features): 5 | 6 | def default_tcp_5tuple(self, rows): 7 | fields = ['ip.src_host', 'ip.dst_host', 8 | 'tcp.dstport', 'tcp.srcport', 'frame.protocols'] 9 | return self.get_columns(fields, rows) 10 | 11 | def default_udp_5tuple(self, rows): 12 | fields = ['ip.src_host', 'ip.dst_host', 13 | 'udp.dstport', 'udp.srcport', 'frame.protocols'] 14 | return self.get_columns(fields, rows) 15 | -------------------------------------------------------------------------------- /networkml/trained_models/label_assignments.json: -------------------------------------------------------------------------------- 1 | { 2 | "ActiveDirectoryController": "Active Directory controller", 3 | "AdminServer": "Administrator server", 4 | "AdminWorkstation": "Administrator workstation", 5 | "BusinessWorkstation": "Business workstation", 6 | "ConfluenceServer": "Confluence server", 7 | "DevWorkstation": "Developer workstation", 8 | "DistrFileShare": "Distributed file share", 9 | "ExchangeServer": "Exchange server", 10 | "GPULaptop": "GPU laptop", 11 | "GithubRepo": "GitHub server", 12 | "PKIServer": "PKI server", 13 | "Printer": "Printer" 14 | } 15 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Tell us what is broken 4 | 5 | --- 6 | 7 | ### Description 8 | Describe your issue here. 9 | 10 | ### Environment 11 | - Git commit hash, and specific ML model (if any) 12 | - Docker version, if using 13 | - Operating system 14 | 15 | ### Steps to reproduce 16 | - step 1 17 | - step 2 18 | - ... 19 | 20 | ### Expected result 21 | What did you expect to happen? 22 | 23 | ### Actual result 24 | Describe the error message or unexpected result. 25 | 26 | Don't forget to upload any relevant data if this is required to trigger the issue. 27 | -------------------------------------------------------------------------------- /Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM networkml 2 | LABEL maintainer="Charlie Lewis " 3 | COPY . /networkml 4 | WORKDIR /networkml 5 | ENV PATH="${PATH}:/root/.local/bin" 6 | RUN apt-get update && apt-get install -y gcc rabbitmq-server && \ 7 | rm -rf /var/cache && \ 8 | poetry install -E test 9 | RUN jupyter nbconvert --ExecutePreprocessor.timeout=300 --to notebook --execute notebooks/networkml_exploration.ipynb --Application.log_level=DEBUG 10 | ENTRYPOINT ["pytest"] 11 | CMD ["-l", "-s", "-v", "-nauto", "--cov=tests/", "--cov=networkml/", "--cov-report", "term-missing", "-c", ".coveragerc", "--rabbitmq-port=5672"] 12 | -------------------------------------------------------------------------------- /tests/test_featurizers_features.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from networkml.featurizers.features import Features 3 | 4 | 5 | def test_quantile_nullable_int(): 6 | # TODO: https://github.com/pandas-dev/pandas/issues/42626 7 | # TODO: migrate all tests to unittest class/assert method style. 
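    # Guard for the pandas issue referenced above: quantile() on a nullable
    # Int64 column should still return a usable (truthy) value rather than NA.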
8 | df = pd.DataFrame([{'x': 1}, {'x': 0}], dtype=pd.Int64Dtype()) 9 | assert df['x'].quantile(0.75) # nosec 10 | 11 | 12 | def test_no_func(): 13 | instance = Features() 14 | instance.run_func('none') 15 | 16 | 17 | def test_get_columns(): 18 | instance = Features() 19 | assert instance.get_columns( 20 | ['foo', 'bar'], [{'foo': 1, 'baz': 3}]) == [{'foo': 1}] 21 | -------------------------------------------------------------------------------- /tests/test_e2e.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | TMPDIR=$(mktemp -d) 6 | networkml ./tests/test_data/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap -o $TMPDIR --first_stage parser --final_stage algorithm --operation predict 7 | cat $TMPDIR/predict.json 8 | LABEL=$(jq < $TMPDIR/predict.json '.[0].data.mac_addresses["00:04:00:81:81:d0"]["classification"]["labels"][0]') 9 | if [[ "$LABEL" == "" ]] ; then 10 | echo FAIL: no result from prediction 11 | fi 12 | TD=$(pwd) 13 | docker build -f Dockerfile . -t iqtlabs/networkml:latest 14 | docker run -i -e RESULT_PATH=/tmp/predict.json -v $TD/tests/test_data:/pcaps iqtlabs/networkml:latest /pcaps/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap -o/tmp 15 | echo PASS 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM iqtlabs/rbqwrapper:v0.11.33 2 | LABEL maintainer="Charlie Lewis " 3 | 4 | ENV DEBIAN_FRONTEND "noninteractive" 5 | ENV PYTHONUNBUFFERED 1 6 | ENV PATH="${PATH}:/root/.local/bin" 7 | COPY pyproject.toml pyproject.toml 8 | 9 | # hadolint ignore=DL3008 10 | RUN apt-get update && apt-get install -y git python3-numpy python3-scipy gfortran libblas-dev liblapack-dev libxslt-dev libxml2-dev flex bison zlib1g-dev tshark curl && \ 11 | apt-get remove -y libblas-dev liblapack-dev libxslt-dev libxml2-dev gfortran flex bison zlib1g-dev && \ 12 | apt-get autoremove -y && \ 13 | rm -rf /var/cache/* && \ 14 | rm -rf /root/.cache/* && \ 15 | curl -sSL https://install.python-poetry.org | python3 - --version 1.1.15 && \ 16 | poetry config virtualenvs.create false && \ 17 | pip install -U pip 18 | 19 | COPY . 
/networkml 20 | WORKDIR /networkml 21 | RUN poetry install 22 | ENTRYPOINT ["/rbqwrapper.py", "networkml"] 23 | -------------------------------------------------------------------------------- /tests/test_data/bad_data_too_few_columns.csv: -------------------------------------------------------------------------------- 1 | min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 42,54,54,ARGH----BADDATA----ARGH!!!!!,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,.pcap.csv.gz 3 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | name: release 2 | 3 | on: 4 | push: 5 | branches: main 6 | tags: 'v*' 7 | 8 | jobs: 9 | release: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Build package 16 | id: build_package 17 | run: | 18 | sudo apt-get update && \ 19 | sudo apt-get install -yq --no-install-recommends python3-pip python3.7 python3-dev python3-setuptools && \ 20 | pip3 install wheel && \ 21 | python3 setup.py sdist bdist_wheel --universal 22 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 23 | - name: Publish package 24 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 25 | uses: pypa/gh-action-pypi-publish@master 26 | with: 27 | user: ${{ secrets.PYPI_USERNAME }} 28 | password: ${{ secrets.PYPI_TOKEN }} 29 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | name: 'Close stale issues and PRs' 2 | on: 3 | schedule: 4 | - cron: '30 1 * * *' 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v6 11 | with: 12 | stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' 13 | stale-pr-message: 'This PR is stale because it has been open 45 days with no activity. Remove stale label or comment or this will be closed in 10 days.' 14 | close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.' 15 | close-pr-message: 'This PR was closed because it has been stalled for 10 days with no activity.' 
16 | days-before-issue-stale: 30 17 | days-before-pr-stale: 45 18 | days-before-issue-close: 5 19 | days-before-pr-close: 10 20 | exempt-issue-labels: 'dependencies' 21 | -------------------------------------------------------------------------------- /.github/workflows/sl-scan.yaml: -------------------------------------------------------------------------------- 1 | name: sl-scan 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | scan: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Cache vdb 10 | uses: actions/cache@v3 11 | with: 12 | path: | 13 | ${{ github.workspace }}/vdb 14 | key: ${{ runner.os }} 15 | - name: Scan 16 | uses: ShiftLeftSecurity/scan-action@master 17 | env: 18 | VDB_HOME: ${{ github.workspace }}/vdb 19 | WORKSPACE: https://github.com/${{ github.repository }}/blob/${{ github.sha }} 20 | SCAN_AUTO_BUILD: true 21 | SCAN_ANNOTATE_PR: true 22 | PR_COMMENT_TEMPLATE: "## Scan Summary\n%(summary)s\n## Recommendation\n%(recommendation)s\n" 23 | PR_COMMENT_BASIC_TEMPLATE: "## Scan Summary\n%(summary)s\n## Recommendation\n%(recommendation)s\n" 24 | with: 25 | output: reports 26 | type: "credscan, python, dockerfile, yaml" 27 | - name: Upload scan reports 28 | uses: actions/upload-artifact@v3.1.1 29 | with: 30 | name: shiftleft-scan-reports 31 | path: reports -------------------------------------------------------------------------------- /networkml/featurizers/features.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import ipaddress 3 | 4 | import numpy as np 5 | 6 | 7 | class Features(): 8 | 9 | def __init__(self): 10 | self.nonempty_generators = set() 11 | 12 | def run_func(self, func_name, *args): 13 | """ 14 | Helper function that will run the with for this func 15 | :param func_name: name of the function to run 16 | :param args: list of arguments to run with the function 17 | """ 18 | func = getattr(self, func_name, None) 19 | if not func: 20 | print("Error: Not a function name that's been defined") 21 | return False 22 | 23 | results = func(*args) 24 | return results 25 | 26 | @staticmethod 27 | def get_columns(fields, rows): 28 | # Terse but efficient. 
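        # e.g. get_columns(['foo', 'bar'], [{'foo': 1, 'baz': 3}]) -> [{'foo': 1}]
        # (requested fields that are missing from a row, or falsy, are dropped).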
29 | new_rows = [{field: row[field] 30 | for field in fields if row.get(field, None)} for row in rows] 31 | return new_rows 32 | 33 | @staticmethod 34 | def _pyshark_row_layers(rows_f): 35 | return filter(lambda row: 'layers' in row, rows_f()) 36 | -------------------------------------------------------------------------------- /tests/test_pcap_to_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import tempfile 5 | 6 | from networkml.parsers.pcap_to_csv import PCAPToCSV 7 | 8 | 9 | def test_PCAPToCSV_pyshark_packet(): 10 | with tempfile.TemporaryDirectory() as tmpdir: 11 | testdata = os.path.join(tmpdir, 'test_data') 12 | shutil.copytree('./tests/test_data', testdata) 13 | sys.argv = ['pcap_to_csv.py', '-c', '-e', 'pyshark', '-t', '2', '-v', 'DEBUG', '-o', os.path.join( 14 | tmpdir, 'networkml_test.pcap.csv.gz'), os.path.join(testdata, 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap')] 15 | instance = PCAPToCSV() 16 | instance.main() 17 | 18 | 19 | def test_ispcap(): 20 | a = 'foo.bad' 21 | answer = PCAPToCSV.ispcap(a) 22 | assert answer == False 23 | a = 'fooo.pcap' 24 | answer = PCAPToCSV.ispcap(a) 25 | assert answer == True 26 | a = 'fooo.pcapng' 27 | answer = PCAPToCSV.ispcap(a) 28 | assert answer == True 29 | a = 'fooo.dump' 30 | answer = PCAPToCSV.ispcap(a) 31 | assert answer == True 32 | a = 'fooo.capture' 33 | answer = PCAPToCSV.ispcap(a) 34 | assert answer == True 35 | -------------------------------------------------------------------------------- /tests/test_featurizers_main.py: -------------------------------------------------------------------------------- 1 | from networkml.featurizers.features import Features 2 | from networkml.featurizers.main import Featurizer 3 | 4 | 5 | def test_no_path(): 6 | instance = Featurizer() 7 | result = instance.import_class('foo', None) 8 | assert result == None 9 | 10 | 11 | def test_run_all_funcs(): 12 | instance = Featurizer() 13 | 14 | class TestClass(Features): 15 | 16 | @staticmethod 17 | def test_feature1(rows, _srcmacid): 18 | for row in rows: 19 | return [{'test1': row['test1']}] 20 | 21 | @staticmethod 22 | def test_feature2(rows, _srcmacid): 23 | for row in rows: 24 | return [{'test2': row['test2']}] 25 | 26 | tc = TestClass() 27 | 28 | results = instance.run_all_funcs( 29 | [('test_feature1', 'test_feature1'), 30 | ('test_feature2', 'test_feature2')], [], 31 | [(tc, 'test_feature1'), (tc, 'test_feature2')], 32 | [{'test1': 99, 'test2': 123}], 33 | True) 34 | assert results == [[{'test1': 99}], [{'test2': 123}]] 35 | 36 | results = instance.run_all_funcs( 37 | [], [], [], [{'test1': 99, 'test2': 123}], True) 38 | assert results == [] 39 | -------------------------------------------------------------------------------- /.github/workflows/secrets-scan.yml: -------------------------------------------------------------------------------- 1 | name: secrets 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | scan: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: scan 11 | run: | 12 | export DEBIAN_FRONTEND=noninteractive && \ 13 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 14 | sudo apt-get update && \ 15 | python3 -m pip install --upgrade pip && \ 16 | pip3 install whispers && \ 17 | mkdir /home/runner/reports/ && \ 18 | whispers --severity BLOCKER,CRITICAL -o /home/runner/reports/whispers.json ${GITHUB_WORKSPACE} && \ 19 | echo "::set-output name=found-count::$(wc 
-l /home/runner/reports/whispers.json | cut -d' ' -f1)" 20 | - name: Fail if found 21 | if: steps.scan.outputs.found-count != 0 22 | uses: actions/github-script@v6 23 | with: 24 | script: | 25 | echo {{steps.scan.outputs.found-count}} && \ 26 | core.setFailed('Secrets found. Please check the uploaded report') 27 | - name: Upload scan reports 28 | uses: actions/upload-artifact@v3.1.1 29 | if: failure() 30 | with: 31 | name: whispers-report 32 | path: /home/runner/reports/whispers.json -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL:=/bin/bash 2 | PIP=$(shell which pip3 || echo "pip3") 3 | 4 | 5 | # CONDA_EXE must be set before running `make dev` or `rmdev` 6 | # export CONDA_EXE=$_CONDA_EXE 7 | CONDA_DEV=netml-dev 8 | CONDAROOT=$(shell ${CONDA_EXE} info --base)/bin 9 | CONDA_ENV=$(shell ${CONDA_EXE} info --base)/envs/$(CONDA_DEV)/bin 10 | 11 | run: predict 12 | predict: build predict_nobuild 13 | predict_nobuild: 14 | @echo 15 | @echo "Running Predict on PCAP files $(PCAP)" 16 | @docker run -it --rm -v "$(PCAP):/pcaps$(PCAP)" networkml /pcaps 17 | @echo 18 | train: build train_nobuild 19 | train_nobuild: 20 | @echo 21 | @echo "Running Train on PCAP files $(PCAP)" 22 | @docker run -it --rm -v "$(PCAP):/pcaps$(PCAP)" -v "$(PWD)/networkml/trained_models:/usr/local/lib/python3.8/site-packages/networkml/trained_models/" networkml -O train /pcaps 23 | @echo 24 | test: build 25 | @docker build -t networkml-test -f Dockerfile.test . 26 | @docker run --rm networkml-test 27 | build: 28 | @docker build -t networkml . 29 | install: 30 | poetry install 31 | 32 | dev: 33 | ${CONDA_EXE} env create --force -f $(CONDA_DEV).yml python=3.9 34 | source $(CONDAROOT)/activate $(CONDA_DEV) ; \ 35 | $(CONDA_ENV)/python3 -m pip install --upgrade pip ; \ 36 | $(CONDA_ENV)/pip3 install . 
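# Illustrative conda workflow (not enforced by this Makefile): set CONDA_EXE as
# noted above, then run `make dev` to create the netml-dev environment and
# `make rmdev` to delete it again.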
37 | 38 | rmdev: 39 | ${CONDA_EXE} env remove -y -n $(CONDA_DEV) 40 | -------------------------------------------------------------------------------- /tests/test_data/list_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "meta": "mlp", 3 | "coefs_": [ 4 | [ 5 | [ 6 | 0.05 7 | ], 8 | [ 9 | 0.05 10 | ], 11 | [ 12 | 0.05 13 | ] 14 | ] 15 | ], 16 | "loss_": 0.00844733599407846, 17 | "intercepts_": [ 18 | [ 19 | 0.05 20 | ] 21 | ], 22 | "n_iter_": 61, 23 | "n_layers_": 5, 24 | "n_outputs_": 12, 25 | "out_activation_": "softmax", 26 | "_label_binarizer": { 27 | "neg_label": 0, 28 | "pos_label": 1, 29 | "sparse_output": false, 30 | "y_type_": "multiclass", 31 | "sparse_input_": false, 32 | "classes_": [ 33 | 0 34 | ] 35 | }, 36 | "params": { 37 | "activation": "relu", 38 | "alpha": 0.0001, 39 | "batch_size": "auto", 40 | "beta_1": 0.9, 41 | "beta_2": 0.999, 42 | "early_stopping": false, 43 | "epsilon": 1e-08, 44 | "hidden_layer_sizes": [ 45 | 64, 46 | 32, 47 | 32 48 | ], 49 | "learning_rate": "constant", 50 | "learning_rate_init": 0.001, 51 | "max_fun": 15000, 52 | "max_iter": 200, 53 | "momentum": 0.9, 54 | "n_iter_no_change": 10, 55 | "nesterovs_momentum": true, 56 | "power_t": 0.5, 57 | "random_state": null, 58 | "shuffle": true, 59 | "solver": "adam", 60 | "tol": 0.0001, 61 | "validation_fraction": 0.1, 62 | "verbose": false, 63 | "warm_start": false 64 | }, 65 | "features": [ 66 | "foo", 67 | "bar", 68 | "baz" 69 | ], 70 | "classes_": [ 71 | 0 72 | ] 73 | } -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Ahmad Asmar 2 | Alice 3 | Alice Chang 4 | Alice Chang 5 | CStephenson970 6 | Charlie Lewis 7 | Cory Stephenson 8 | Greg <22061293+gregs5@users.noreply.github.com> 9 | Jackson 10 | Jason Separovic 11 | Jason Separovic 12 | John Meyers 13 | John Meyers 14 | John Speed Meyers <54914994+jspeed-meyers@users.noreply.github.com> 15 | Josh Bailey 16 | Josh Bailey 17 | Kieran Baker 18 | Lewis, Charlie 19 | Mohammed Al-Shaboti 20 | Mohammed Alshaboti 21 | Renovate Bot 22 | Ryan Ashley 23 | Stephenson, Cory 24 | Todd Stavish 25 | achang 26 | alshaboti 27 | cglewis 28 | jaiken06 29 | jspeed-meyers 30 | karllab41 31 | lilchurro 32 | lostminty (lostminty@users.noreply.github.com) 33 | pgamble-admin 34 | pyup-bot 35 | pyup.io bot 36 | sneakyoctopus12 <56274120+sneakyoctopus12@users.noreply.github.com> 37 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ubuntu-20.04 9 | 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: test 13 | run: | 14 | export DEBIAN_FRONTEND=noninteractive && \ 15 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 16 | sudo apt-get update && \ 17 | sudo apt-get install -yq --no-install-recommends python3-wheel python3.9 python3.9-dev python3-setuptools dialog apt-utils tshark jq curl && \ 18 | sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ 19 | sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 && \ 20 | python3 -m pip install --upgrade pip && \ 21 | curl -sSL https://install.python-poetry.org | python3 - --version 1.1.15 && \ 22 | export 
PATH=/home/runner/.local/bin:$PATH && \ 23 | poetry config virtualenvs.create false && \ 24 | poetry install -E test && \ 25 | PYTHONPATH=. pytype . && \ 26 | PYTHONPATH=. pytest -l -s -n auto -v --cov=tests/ --cov=networkml/ --cov-report term-missing -c .coveragerc --rabbitmq-port=5672 && coverage report && coverage xml && \ 27 | jupyter nbconvert --ExecutePreprocessor.timeout=300 --to notebook --execute notebooks/networkml_exploration.ipynb && \ 28 | ./tests/test_e2e.sh 29 | - name: Upload coverage to Codecov 30 | uses: codecov/codecov-action@v3.1.1 31 | if: github.repository == 'iqtlabs/networkml' 32 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "networkml" 3 | version = "0.7.0" 4 | description = "Machine Learning model for use in determining device classes based on packet headers." 5 | authors = ["Ryan "] 6 | license = "Apache 2.0" 7 | include = [ 8 | "networkml/trained_models/*.*", 9 | "networkml/*.py", 10 | ] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.8 <3.11" 14 | cython = "0.29.32" 15 | humanize = "4.4.0" 16 | joblib = "1.2.0" 17 | netaddr = "0.8.0" 18 | numpy = "1.23.5" 19 | pandas = "1.5.2" 20 | pyshark = "0.5.3" 21 | scikit-learn = "1.2.0" 22 | scipy = "1.9.3" 23 | 24 | #optional dependencies 25 | pygments = { version = "2.13.0", optional = true } 26 | pytest = { version = "7.2.0", optional = true } 27 | pytest-cov = { version = "4.0.0", optional = true } 28 | pytest-xdist = { version = "3.1.0", optional = true } 29 | pytest-rabbitmq = { version = "2.2.1", optional = true } 30 | nest_asyncio = { version = "1.5.6", optional = true } 31 | MarkupSafe = { version = "2.1.1", optional = true } 32 | notebook = { version = "6.5.2", optional = true } 33 | pytype = { version = "2022.11.29", optional = true } 34 | 35 | [tool.poetry.extras] 36 | test = [ 37 | "pygments", 38 | "pytest", 39 | "pytest-cov", 40 | "pytest-xdist", 41 | "pytest-rabbitmq", 42 | "nest_asyncio", 43 | "MarkupSafe", 44 | "notebook", 45 | "pytype", 46 | ] 47 | 48 | [tool.poetry.dev-dependencies] 49 | 50 | [build-system] 51 | requires = ["setuptools", "poetry-core>=1.0.0"] 52 | build-backend = "poetry.core.masonry.api" 53 | 54 | [tool.poetry.scripts] 55 | networkml = 'networkml.__main__:main' 56 | -------------------------------------------------------------------------------- /.github/workflows/docker-amd64.yml: -------------------------------------------------------------------------------- 1 | name: no-arm 2 | 3 | on: 4 | push: 5 | branches: main 6 | tags: 'v*' 7 | 8 | jobs: 9 | buildx: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Get the version 16 | id: get_version 17 | run: echo ::set-output name=VERSION::$(echo $GITHUB_REF | cut -d / -f 3) 18 | - name: Change for main 19 | id: change_version 20 | run: if [ "${{ steps.get_version.outputs.VERSION }}" == "main" ]; then echo ::set-output name=VERSION::latest; else echo ::set-output name=VERSION::${{ steps.get_version.outputs.VERSION }}; fi 21 | - name: Set up qemu 22 | uses: docker/setup-qemu-action@v2 23 | with: 24 | platforms: all 25 | - name: Set up Docker Buildx 26 | id: buildx 27 | uses: docker/setup-buildx-action@v2 28 | with: 29 | version: latest 30 | - name: Docker Login 31 | env: 32 | DOCKER_PASSWORD: ${{ secrets.DOCKER_TOKEN }} 33 | run: | 34 | echo "${DOCKER_PASSWORD}" | docker login --username "${{ secrets.DOCKER_USERNAME }}" 
--password-stdin 35 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 36 | - name: Build and push platforms for final image 37 | env: 38 | DOCKER_CLI_EXPERIMENTAL: enabled 39 | run: | 40 | docker buildx build \ 41 | --platform linux/amd64 \ 42 | --push \ 43 | -t iqtlabs/networkml:${{ steps.change_version.outputs.VERSION }} . 44 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 45 | -------------------------------------------------------------------------------- /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: buildx 2 | 3 | on: 4 | push: 5 | branches: main 6 | tags: 'v*' 7 | 8 | jobs: 9 | buildx: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | - name: Get the version 16 | id: get_version 17 | run: echo ::set-output name=VERSION::$(echo $GITHUB_REF | cut -d / -f 3) 18 | - name: Change for main 19 | id: change_version 20 | run: if [ "${{ steps.get_version.outputs.VERSION }}" == "main" ]; then echo ::set-output name=VERSION::latest; else echo ::set-output name=VERSION::${{ steps.get_version.outputs.VERSION }}; fi 21 | - name: Set up qemu 22 | uses: docker/setup-qemu-action@v2 23 | with: 24 | platforms: all 25 | - name: Set up Docker Buildx 26 | id: buildx 27 | uses: docker/setup-buildx-action@v2 28 | with: 29 | version: latest 30 | - name: Docker Login 31 | env: 32 | DOCKER_PASSWORD: ${{ secrets.DOCKER_TOKEN }} 33 | run: | 34 | echo "${DOCKER_PASSWORD}" | docker login --username "${{ secrets.DOCKER_USERNAME }}" --password-stdin 35 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 36 | - name: Build and push platforms for final image 37 | env: 38 | DOCKER_CLI_EXPERIMENTAL: enabled 39 | run: | 40 | docker buildx build \ 41 | --platform linux/amd64,linux/arm64 \ 42 | --push \ 43 | -t iqtlabs/networkml:${{ steps.change_version.outputs.VERSION }} . 44 | if: github.repository == 'iqtlabs/networkml' && github.event_name == 'push' 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | pytype_output/ 4 | .pytype 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Editor files 12 | *~ 13 | 14 | # Distribution / packaging 15 | .Python 16 | env/ 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # dotenv 89 | .env 90 | 91 | # virtualenv 92 | .venv 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # vim temp files 103 | *.swp 104 | *.swo 105 | 106 | .mypy_cache 107 | 108 | # IntelliJ IDE files 109 | .idea/ 110 | faucet.iml 111 | *.bak 112 | 113 | *gz 114 | erl_crash.dump 115 | 116 | # converted notebooks 117 | *.nbconvert.ipynb 118 | 119 | #scan stuff 120 | reports/ 121 | 122 | semgrep.out -------------------------------------------------------------------------------- /tests/test_data/combined_two_roles.csv: -------------------------------------------------------------------------------- 1 | host_key,min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 0e:00:00:00:00:01,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 3 | 0e:00:00:00:00:02,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 4 | 0e:00:00:00:00:01,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 5 | 
0e:00:00:00:00:02,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 6 | -------------------------------------------------------------------------------- /tests/test_results_output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import time 4 | import os 5 | 6 | from networkml.helpers.results_output import ResultsOutput 7 | 8 | 9 | def test_parse_pcap_name(): 10 | logger = logging.getLogger(__name__) 11 | instance = ResultsOutput(logger, 'uid', '/path') 12 | assert instance.parse_pcap_name('notaposeidontracefile.pcap') == ( 13 | 'notaposeidontracefile', None) 14 | assert instance.parse_pcap_name('trace_but_invalid') == ( 15 | None, None) 16 | assert instance.parse_pcap_name('trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap') == ( 17 | 'ab12', 'ip-1-2-3-4') 18 | assert instance.parse_pcap_name('trace_8adfcc152604e75d37a1a2ac62124ae859105239_2020-01-21_21_31_44-client-ip-17-253-66-125-17-253-66-125-192-168-3-2-udp-frame-ntp-wsshort-ip-eth-port-123.pcap') == ( 19 | '8adfcc152604e75d37a1a2ac62124ae859105239', 'ip-17-253-66-125-17-253-66-125-192-168-3-2-udp-frame-ntp-wsshort-ip-eth-port-123') 20 | assert instance.parse_pcap_name('trace_8198b3326dcb032a2bfbb8030339ff2159b9993d_2020-02-19_03_16_21.pcap') == ( 21 | '8198b3326dcb032a2bfbb8030339ff2159b9993d', None) 22 | assert instance.parse_pcap_name('trace_ab12_2001-01-01_02_03-miscellaneous-stuff.pcap') == ( 23 | None, None) 24 | 25 | def test_output_from_result_json(): 26 | logger = logging.getLogger(__name__) 27 | instance = ResultsOutput(logger, 'testver', 'path/') 28 | result_json = { 29 | '/dir/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap': [{ 30 | 'top_role': 'foo', 31 | 'source_ip': '1.2.3.4', 32 | 'source_mac': '01:02:03:04:05:06', 33 | 'timestamp': 999, 34 | 'role_list': [('bsomething', 0.7), ('asomething', 0.6), ('csomething', 0.5)]}], 35 | } 36 | reformatted_result_json_file = os.devnull 37 | reformatted_json = instance.output_from_result_json(json.dumps(result_json), reformatted_result_json_file) 38 | version = reformatted_json[0]['version'] 39 | assert reformatted_json == [{'file_path': 'path/', 'id': '', 'results': {'tool': 'networkml', 'version': version}, 'type': 'metadata', 'version': version, 'tool': 'networkml', 'data': {'mac_addresses': {'01:02:03:04:05:06': {'uid': 'testver', 'file_path': 'path/', 'pcap': '', 'pcap_key': '', 'pcap_labels': None, 'timestamp': 999, 'source_ip': '1.2.3.4', 'decisions': {'investigate': False}, 'classification': {'labels': ['bsomething', 'asomething', 'csomething'], 'confidences': (0.7, 0.6, 0.5)}}}}}, {'data': '', 'file_path': 'path/', 'id': '', 'results': {'tool': 'networkml', 'version': version}, 'tool': 'networkml', 'type': 'metadata'}] # nosec - fine in a test. 40 | -------------------------------------------------------------------------------- /networkml/trained_models/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Models in NetworkML 2 | 3 | ## Overview 4 | 5 | NetworkML performs role identification via supervised machine learning. 
Although our 6 | internal analysis compared decision trees, random forests, and neural networks, the public 7 | networkML codebase only includes a neural network (or "deep learning") model. 8 | 9 | ### Neural Network 10 | Neural networks can be used for supervised machine learning to match patterns in network 11 | traffic with the functional role of a device. For further information on neural networks, 12 | see Francois Chollet's "Deep Learning with Python" published by Manning 13 | Publications in 2018, especially pages 3-116. For more general information on machine 14 | learning and information security or cybersecurity, see Clarence 15 | Chio and David Freeman, "Machine Learning & Security," published by O'Reilly 16 | in 2018. The neural network model in networkML uses the Python package scikit-learn. Using 17 | TensorFlow or a similar neural network-specific machine learning package was not necessary 18 | to achieve high levels of performance in our in-house testing. 19 | 20 | ### host_footprint model 21 | 22 | Currently one model is defined, depending upon the Host() featurizer. 23 | 24 | Note: The stored model was trained using all available host-level features. For a full description of the statistical features used to build a model in NetworkML, see the README in the featurizers directory. 25 | 26 | #### Training and predicting 27 | 28 | The model can be regenerated by: 29 | 30 | ~~~~ 31 | networkml --kfolds=5 --first_stage=algorithm --trained_model=networkml/trained_models/host_footprint.json --label_encoder=networkml/trained_models/host_footprint_le.json --scaler=networkml/trained_models/host_footprint_scaler.mod --operation train [--eval_data=/tmp/test_host.csv] /tmp/train_host.csv 32 | ~~~~ 33 | 34 | Where train_host.csv is the combined CSV output of the featurizer, used to train the model. 35 | 36 | Optionally, if a --eval_data CSV is provided, this will be tested against the trained model. 37 | This test CSV would typically be the result of a 20/80 split of the original featurizer data 38 | (20% as test_host.csv, 80% as train_host.csv). 39 | 40 | 41 | You can also evaluate an existing trained model without retraining: 42 | 43 | ~~~~ 44 | networkml --first_stage=algorithm --final_stage=algorithm --operation eval ~/tmp/test_host.csv 45 | ~~~~ 46 | 47 | 48 | A pcap prediction against an existing model in the default location can be done by: 49 | 50 | ~~~~ 51 | networkml --kfolds=5 --first_stage=parser -o /tmp/out --operation predict /tmp/test.pcap 52 | ~~~~ 53 | 54 | The output directory (e.g. /tmp/out) must already exist and be empty. 55 | 56 | You can also do a prediction against featurizer output: 57 | 58 | ~~~~ 59 | networkml --kfolds=5 --first_stage=algorithm -o /tmp/out --operation predict /tmp/combined.csv 60 | ~~~~ 61 | 62 | #### Using your own model 63 | 64 | You can also use your own model. Specify --trained_model, --label_encoder, and --scaler for 65 | training and predicting.
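The two JSON files shipped alongside the model can be inspected directly. The minimal sketch below (not part of the networkml codebase; it assumes it is run from the repository root) reads host_footprint_le.json, which stores the label encoder's class names, and label_assignments.json, which maps each class name to a human-readable role description:

~~~~
import json

# Class names known to the trained label encoder.
with open('networkml/trained_models/host_footprint_le.json') as f:
    classes = json.load(f)['classes']

# Human-readable description for each class name.
with open('networkml/trained_models/label_assignments.json') as f:
    assignments = json.load(f)

for class_name in classes:
    print(class_name, '->', assignments.get(class_name, class_name))
# e.g. "GithubRepo -> GitHub server", "ActiveDirectoryController -> Active Directory controller"
~~~~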
66 | ======= 67 | -------------------------------------------------------------------------------- /tests/test_data/combined_three_roles.csv: -------------------------------------------------------------------------------- 1 | host_key,min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 0e:00:00:00:00:01,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 3 | 0e:00:00:00:00:02,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 4 | 0e:00:00:00:00:03,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 5 | 0e:00:00:00:00:04,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 6 | 0e:00:00:00:00:05,42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 7 | 0e:00:00:00:00:06,41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 8 | -------------------------------------------------------------------------------- /tests/test_csv_to_features.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import sys 4 | import tempfile 5 | 6 | from networkml.featurizers.csv_to_features import CSVToFeatures 7 | from networkml.parsers.pcap_to_csv import PCAPToCSV 8 | 9 | COMMON_ARGS = ['-t', '2', '-v', 'DEBUG'] 10 | 11 | 12 | def run_pcap_to_csv(in_path, out_path, engine='tshark'): 13 | sys.argv = ['pcap_to_csv.py'] + COMMON_ARGS + ['-e', engine, '-o', out_path, in_path] 14 | instance = PCAPToCSV() 15 | instance.main() 16 | 17 | 18 | def run_csv_to_features(in_path, out_path=None, featurizer='host_tshark', otherflag=None): 19 | args = ['csv_to_features.py'] + COMMON_ARGS + ['-g', featurizer] 20 | if otherflag: 21 | args.append(otherflag) 22 | if out_path: 23 | args.extend(['-o', out_path]) 24 | args.append(in_path) 25 | sys.argv = args 26 | instance = CSVToFeatures() 27 | instance.main() 28 | 29 | 30 | def run_pcap_to_features(pcap=None, outdir=False): 31 | with tempfile.TemporaryDirectory() as tmpdir: 32 | testdata = os.path.join(tmpdir, 'test_data') 33 | shutil.copytree('./tests/test_data', testdata) 34 | if pcap: 35 | pcap_path = os.path.join(testdata, pcap) 36 | pcap_csv_path = os.path.join(tmpdir, pcap + '.csv.gz') 37 | else: 38 | testsdir = os.path.join(tmpdir, 'tests') 39 | shutil.copytree('tests', testsdir) 40 | pcap_path = testsdir 41 | pcap_csv_path = os.path.join(tmpdir, 'pcap.csv.gz') 42 | run_pcap_to_csv(pcap_path, pcap_csv_path) 43 | if outdir: 44 | outpath = tmpdir 45 | else: 46 | outpath = os.path.join(tmpdir, 'combined.csv.gz') 47 | run_csv_to_features(pcap_csv_path, outpath) 48 | 49 | 50 | def test_CSVToFeatures(): 51 | run_pcap_to_features(pcap='trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap', outdir=False) 52 | 53 | 54 | def test_CSVToFeatures_no_output(): 55 | run_pcap_to_features(pcap='trace_ab12_2001-01-01_02_03-client-ip6-1-2-3-4.pcap', outdir=False) 56 | 57 | 58 | def test_CSVToFeatures_input_dir_output_file(): 59 | run_pcap_to_features(pcap=None, outdir=False) 60 | 61 | 62 | def test_CSVToFeatures_input_dir_output_dir(): 63 | run_pcap_to_features(pcap=None, outdir=True) 64 | 65 | 66 | def test_CSVToFeatures_no_group_or_func(): 67 | with tempfile.TemporaryDirectory() as tmpdir: 68 | testdata = os.path.join(tmpdir, 'test_data') 69 | shutil.copytree('./tests/test_data', testdata) 70 | trace = os.path.join(testdata, 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz') 71 | run_csv_to_features(trace, featurizer='') 72 | 73 | 74 | def test_CSVToFeatures_host(): 75 | with tempfile.TemporaryDirectory() as tmpdir: 76 | testdata = os.path.join(tmpdir, 'test_data') 77 | shutil.copytree('./tests/test_data', testdata) 78 | trace = os.path.join(testdata, 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz') 79 | for srcidflag in ('--srcmacid', '--no-srcmacid'): 80 | for featurizer in ('sessionhost_tshark', 'host_tshark'): 81 | run_csv_to_features(trace, featurizer=featurizer, otherflag=srcidflag) 82 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of 
experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at clewis@iqt.org. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /networkml/featurizers/README.md: -------------------------------------------------------------------------------- 1 | # Host-Related Features in NetworkML Models 2 | 3 | Machine learning models require inputs (or "features" in the language of machine learning practitioners.) NetworkML, a machine learning model that operates on network traffic, is no different. 
This readme describes the features that networkML can create and that can therefore be included in NetworkML's models. All features can be created either for each host present, producing a numeric representation of all the traffic flowing into and out of that host, or for each session, describing a particular data exchange between two hosts defined by the 5-tuple of source and destination IPs, source and destination ports, and protocol. This readme focuses on the host-related features. These features were devised with enterprise networks as the likely site of NetworkML deployment. Additionally, these features cover both IP and non-IP traffic at layer 2, which distinguishes networkML from many other network traffic analysis models. 4 | 5 | A key at the bottom of this page explains the symbols associated with each feature set. 6 | 7 | ## IP Protocol-Specific Features 8 | 9 | IPv4 (b) 10 | 11 | IPv6 (b) 12 | 13 | Well-known Ethernet protocols (b) [Note: Each flag is assigned an individual boolean vector. See list of ethernet protocols at the bottom of the page.] 14 | 15 | TCP ports (i/o) \(P\) (P/NP) (b) 16 | 17 | UDP ports (i/o) (P/NP) (b) 18 | 19 | TCP flags (i/o) (b) [Note: Each flag is assigned an individual boolean vector.] 20 | 21 | IP flags (i/o) (b) 22 | 23 | IPX (b) 24 | 25 | Both private IP (b) 26 | 27 | IPv4 multicast (b) 28 | 29 | IP differentiated services (i/o) (b) 30 | 31 | Well-known IP protocols (b) [Note: Each protocol is assigned an individual boolean vector. See list of IP protocols at the bottom of the page.] 32 | 33 | ## Non-IP Features 34 | 35 | VLAN ID (b) 36 | 37 | Non-IP protocol (b) 38 | 39 | ## Packet Timing-related Features 40 | Interarrival time (D) (S) \(r\) 41 | 42 | ## Packet Size-related Features 43 | Frame length (D) (S) \(r\) 44 | 45 | ## Feature Key 46 | **Directionality** 47 | Indicates that there are versions of a feature for different traffic directions 48 | 49 | (i) = incoming packets 50 | 51 | (o) = outgoing packets 52 | 53 | (bi) = bidirectional flow 54 | 55 | (D) = i + o + bi 56 | 57 | **Statistics** 58 | Indicates that there are versions of a feature for each statistic 59 | 60 | (S) Statistics = (min, 25th percentile, median, 75th percentile, max, mean, variance, count, sum) 61 | 62 | **Well Known Ports** 63 | Indicates that features are port-specific 64 | 65 | \(P\) Private ports = (22, 23, 25, 53, 67, 68, 69, 80, 88, 110, 123, 137, 138, 139, 143, 161, 443, 631, other) 66 | 67 | (NP) Non-private ports = (1900, 2375, 5222, 5349, 5353, 5354, 5349, 5357, 6653, other) 68 | 69 | **Type of values** 70 | Indicates the acceptable values for a feature 71 | 72 | (b) = binary feature (0, 1) 73 | 74 | \(r\) = real number feature (-inf, +inf) 75 | 76 | Example: Frame length (D) (S) \(r\) indicates that there are versions of this feature for incoming packets, outgoing packets, and bidirectional flows, and also sub-versions for each different statistic. In total, there are 27 features. 77 | 78 | Ethernet protocols: Well-known ethernet protocols include ethernet, IPv6, IP, TCP, ARP, ICMP, GRE, ESP. 79 | IP protocols: Well-known IP protocols include TCP, UDP, ICMP, ICMPv6, ARP, and an other category.
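
To make the feature key concrete, here is a small illustrative sketch (not part of networkML itself) that expands `Frame length (D) (S) \(r\)` into its 27 column names, following the naming convention visible in `tests/test_data/combined.csv` (statistic prefix, base field, and an `_in`/`_out` suffix, with no suffix meaning bidirectional):

```python
# Illustrative only: expand "Frame length (D) (S) (r)" into 27 column names.
STATS = ('min', '25q', 'median', '75q', 'max', 'average', 'variance', 'count', 'total')
DIRECTIONS = ('_in', '_out', '')  # incoming, outgoing, '' = bidirectional

frame_len_features = [f'{stat}_frame_len{direction}'
                      for direction in DIRECTIONS
                      for stat in STATS]
assert len(frame_len_features) == 27  # 9 statistics x 3 directions
```

The interarrival time feature follows the same pattern (base field `time_delta` in the test data), so each (D) (S) \(r\) entry above expands into 27 columns of the generated feature CSV.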
80 | -------------------------------------------------------------------------------- /tests/test_data/combined.csv: -------------------------------------------------------------------------------- 1 | min_frame_len_in,25q_frame_len,25q_frame_len_out,max_frame_len_out,75q_time_delta_out,variance_frame_len_out,count_frame_len_in,75q_frame_len_out,median_frame_len,min_time_delta,variance_time_delta_out,average_frame_len_in,IPv6,75q_frame_len,average_time_delta_in,min_time_delta_in,IPv4,25q_frame_len_in,total_frame_len_in,25q_time_delta_out,min_time_delta_out,variance_frame_len_in,variance_frame_len,75q_time_delta_in,average_frame_len_out,max_time_delta,min_frame_len_out,variance_time_delta_in,average_frame_len,median_frame_len_in,median_time_delta_in,average_time_delta,max_time_delta_in,count_frame_len_out,25q_time_delta_in,min_frame_len,max_frame_len,median_frame_len_out,median_time_delta_out,max_time_delta_out,75q_frame_len_in,average_time_delta_out,total_frame_len_out,max_frame_len_in,filename 2 | 42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 3 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,printer-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 4 | 42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 5 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,fileshare-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 6 | 42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 7 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,pkiserver-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 8 | 
42,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,workstation-ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 9 | 41,54,54,1514,0.0003925,173155.3216,9967,66,161,0,0.019982144,815.9771245,0,1514,0.01997715,0,1,60,8132844,2.70E-05,0,457031.152,438388.4943,0.0041335,252.3723801,3.00746,54,0.012088969,646.2752261,931,0.000772,0.020931586,2.516838,4294,3.70E-05,42,1514,54,4.90E-05,3.00746,1514,0.02314697,1083687,1514,workstation-ab122_2001-01-01_02_03-client-ip-1-2-3-4.pcap.csv.gz 10 | -------------------------------------------------------------------------------- /networkml/featurizers/main.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import inspect 3 | import os 4 | import sys 5 | import time 6 | 7 | from networkml.featurizers.features import Features 8 | 9 | # TODO move print statements to logging 10 | 11 | 12 | class Featurizer(): 13 | 14 | def import_class(self, path, classes): 15 | """ 16 | Imports classs from an external directory at runtime. Imported functions will be added 17 | to classes 18 | :param path: path where the modules reside 19 | :param classes: existing class instances 20 | :return list of newly add class instances 21 | """ 22 | # make sure path exists 23 | if os.path.isdir(path) is False: 24 | print('Error: path {} does not exist'.format(path)) 25 | return classes 26 | 27 | # add the path to the PYTHONPATH 28 | sys.path.append(path) 29 | 30 | # acquire list of files in the path 31 | mod_list = os.listdir(path) 32 | 33 | for f in mod_list: 34 | 35 | # continue if it is not a python file 36 | if f[-3:] != '.py': 37 | continue 38 | 39 | # get module name by removing extension 40 | mod_name = os.path.basename(f)[:-3] 41 | 42 | # import the module 43 | module = __import__(mod_name, locals(), globals()) 44 | for name, cls in inspect.getmembers(module): 45 | if inspect.isclass(cls) and name != 'Features': 46 | instance = cls() 47 | if isinstance(instance, Features): 48 | # append an instance of the class to classes 49 | classes.append((instance, name)) 50 | print(f'Importing class: {name}') 51 | 52 | return classes 53 | 54 | def run_all_funcs(self, functions_orig, groups_orig, classes_orig, rows_f, parsed_args): 55 | functions = copy.deepcopy(functions_orig) 56 | groups = copy.deepcopy(groups_orig) 57 | classes = copy.deepcopy(classes_orig) 58 | feature_rows = [] 59 | run_methods = [] 60 | 61 | def verify_feature_row(method, feature_row): 62 | assert isinstance(feature_row, list), 'method %s returned non list: %s' % ( 63 | method, feature_row) 64 | non_dicts = {x for x in feature_row if not isinstance(x, dict)} 65 | assert not non_dicts, 'method %s returned something not a dict: %s' % ( 66 | method, non_dicts) 67 | 68 | def run_func(method, func, descr): 69 | print(f'running {descr}...', end='') 70 | start_time = time.time() 71 | feature_row = func() 72 | elapsed_time = int(time.time() - start_time) 73 | print(f'{elapsed_time}s') 74 | verify_feature_row(method, feature_row) 75 | return feature_row 76 | 77 | # attempt to group methods together based on same field name for more cache hits. 
78 | def method_key(method): 79 | return ''.join(reversed(method.strip('_in').strip('_out'))) 80 | 81 | for f in classes: 82 | if groups: 83 | methods = filter( 84 | lambda funcname: funcname.startswith(groups), dir(f[0])) 85 | for method in sorted(methods, key=method_key): 86 | feature_rows.append(run_func(method, lambda: f[0].run_func( 87 | method, rows_f, parsed_args), f'{f[1]}/{method}')) 88 | run_methods.append((f[1], method)) 89 | 90 | # run remaining extras 91 | for function in functions: 92 | if function not in run_methods: 93 | for f in classes: 94 | if f[1] == function[0]: 95 | method = function[1] 96 | feature_rows.append(run_func(method, lambda: f[0].run_func( 97 | method, rows_f, parsed_args), f'{f[1]}/{function[1]}')) 98 | return feature_rows 99 | 100 | def main(self, feature_choices, rows, features_path, parsed_args): 101 | functions = [] 102 | groups = ('default') 103 | classes = [] 104 | classes = self.import_class(features_path, classes) 105 | 106 | if 'functions' in feature_choices: 107 | functions = feature_choices['functions'] 108 | if 'groups' in feature_choices: 109 | groups = feature_choices['groups'] 110 | 111 | return self.run_all_funcs(functions, groups, classes, rows, parsed_args) 112 | -------------------------------------------------------------------------------- /networkml/helpers/pandas_csv_importer.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import ipaddress 3 | import warnings 4 | 5 | import netaddr 6 | import numpy 7 | import pandas as pd 8 | from pandas.errors import DtypeWarning 9 | # We are using converters to fix types, so mixed type warning from read_csv() is spurious. 10 | warnings.simplefilter(action='ignore', category=DtypeWarning) 11 | 12 | 13 | @functools.lru_cache() 14 | def _ipaddress_packed(val): 15 | if len(val) > 0: 16 | return int(ipaddress.ip_address(val)) 17 | return None 18 | 19 | 20 | @functools.lru_cache() 21 | def _netaddr_packed(val): 22 | if len(val) > 0: 23 | return int(netaddr.EUI(val)) 24 | return None 25 | 26 | 27 | def _hex_str(val): 28 | if len(val) > 0: 29 | assert val.startswith('0x'), val 30 | return int(val, 16) 31 | return None 32 | 33 | 34 | def _safe_int(val): 35 | if len(val) > 0: 36 | return int(val) 37 | return None 38 | 39 | 40 | def _eth_protos(val): 41 | return ':'.join([i for i in val.split(':') if i != 'ethertype']) 42 | 43 | 44 | WS_FIELDS = { 45 | 'arp.opcode': (_safe_int, 8), 46 | 'eth.src': (_netaddr_packed, None), 47 | 'eth.dst': (_netaddr_packed, None), 48 | 'eth.type': (_hex_str, 16), 49 | 'frame.len': (_safe_int, 32), 50 | 'frame.time_epoch': (float, None), 51 | 'frame.time_delta_displayed': (float, None), 52 | 'frame.protocols': (_eth_protos, None), 53 | 'icmp.code': (_safe_int, 8), 54 | 'gre.proto': (_hex_str, 8), 55 | 'ip.src': (_ipaddress_packed, None), 56 | 'ip.src_host': (_ipaddress_packed, None), 57 | 'ip.dst': (_ipaddress_packed, None), 58 | 'ip.dst_host': (_ipaddress_packed, None), 59 | 'ip.dsfield': (_hex_str, 8), 60 | 'ip.flags': (_hex_str, 16), 61 | 'ip.proto': (_safe_int, 8), 62 | 'ip.version': (_safe_int, 8), 63 | 'icmpv6.code': (_safe_int, 8), 64 | 'ipv6.src': (_ipaddress_packed, None), 65 | 'ipv6.src_host': (_ipaddress_packed, None), 66 | 'ipv6.dst': (_ipaddress_packed, None), 67 | 'ipv6.dst_host': (_ipaddress_packed, None), 68 | 'tcp.flags': (_hex_str, 16), 69 | 'tcp.srcport': (_safe_int, 16), 70 | 'tcp.dstport': (_safe_int, 16), 71 | 'udp.srcport': (_safe_int, 16), 72 | 'udp.dstport': (_safe_int, 16), 73 | 'vlan.etype': 
(_hex_str, 16), 74 | 'vlan.id': (_safe_int, 16), 75 | } 76 | _WS_FIELDS_CONVERTERS = { 77 | field: field_info[0] for field, field_info in WS_FIELDS.items()} 78 | _WS_FIELDS_NULLABLE_INT = { 79 | field: 'UInt%s' % field_info[1] for field, field_info in WS_FIELDS.items() 80 | if isinstance(field_info[1], int)} 81 | _WS_NON_INT_CONVERTERS = { 82 | field: converter for field, converter in _WS_FIELDS_CONVERTERS.items() 83 | if field not in _WS_FIELDS_NULLABLE_INT} 84 | _REQUIRED_WS_FIELDS = { 85 | 'eth.src', 'eth.dst', 'frame.len', 86 | 'frame.time_epoch', 'frame.time_delta_displayed'} 87 | 88 | 89 | def recast_df(df): 90 | # TODO: when pandas allows read_csv to infer nullable ints, we can use less memory on import. 91 | # https://github.com/pandas-dev/pandas/issues/2631 92 | # For now convert to nullable int after import. 93 | for col, typestr in _WS_FIELDS_NULLABLE_INT.items(): 94 | try: 95 | df[col] = df[col].astype(typestr) 96 | except TypeError: 97 | raise TypeError('cannot cast %s to %s: %u' % 98 | (col, typestr, df[col].max())) 99 | return df 100 | 101 | 102 | def import_csv(in_file): 103 | # We need converters, so we can't use dtypes parameter, and that results in an un-suppressable warning. 104 | sample_df = pd.read_csv(in_file, index_col=0, nrows=100) 105 | csv_fields = set(sample_df.columns.tolist()) 106 | usecols = csv_fields.intersection(WS_FIELDS.keys()) 107 | missingcols = set(WS_FIELDS.keys()) - csv_fields 108 | 109 | # Any hex-int fields, detected as strings? 110 | obj_int_fields = { 111 | field for field, fieldinfo in WS_FIELDS.items() 112 | if fieldinfo[0] == _hex_str and sample_df.dtypes.get(field, None) == numpy.dtype('O')} 113 | converters = _WS_NON_INT_CONVERTERS 114 | # If yes, this is an old style PCAP CSV which needs conversion. 115 | if obj_int_fields: 116 | converters = _WS_FIELDS_CONVERTERS 117 | 118 | df = pd.read_csv(in_file, usecols=usecols, converters=converters) 119 | 120 | for col in missingcols: 121 | df[col] = None 122 | for col in _REQUIRED_WS_FIELDS: 123 | assert df[col].count( 124 | ) > 0, 'required col %s is all null (not a PCAP CSV?)' % col 125 | df = recast_df(df) 126 | return df 127 | -------------------------------------------------------------------------------- /networkml/helpers/results_output.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | 5 | import networkml 6 | 7 | from networkml import __version__ 8 | 9 | 10 | class ResultsOutput: 11 | 12 | def __init__(self, logger, uid, file_path): 13 | self.logger = logger 14 | self.uid = uid 15 | self.file_path = file_path 16 | 17 | @staticmethod 18 | def assign_labels(labels): 19 | netml_path = list(networkml.__path__) 20 | la = os.path.join(netml_path[0], 21 | 'trained_models/label_assignments.json') 22 | assignment_map = {} 23 | with open(la) as f: 24 | assignment_map = json.load(f) 25 | labels = [assignment_map[label] if label in assignment_map else label for label in labels] 26 | return labels 27 | 28 | @staticmethod 29 | def parse_pcap_name(base_pcap): 30 | # The parsing operation below assumes a specific file naming 31 | # convention trace_DeviceName-deviceID-time-duration-flags.pcap 32 | # Explanation: All files coming from Poseidon have trace_ at their 33 | # beginning. The device name and deviceID colums are self explanatory. 34 | # Time refers to the day of the week and time of day. Duration refers 35 | # to the length of the network traffic capture. The flags aspect 36 | # refers to an unknown characteristic. 
37 | # TODO: tolerate tshark labels in the trace name, but do not parse them for now. 38 | pcap_key = None 39 | pcap_labels = None 40 | if base_pcap.startswith('trace_'): 41 | for pcap_re, key_pos, label_pos in ( 42 | (re.compile( 43 | r'^trace_([\da-f]+)_([0-9\_\-]+)-(client|server)-(.+).pcap$'), 1, 4), 44 | (re.compile(r'^trace_([\da-f]+)_([0-9\_\-]+).pcap$'), 1, None)): 45 | pcap_match = pcap_re.match(base_pcap) 46 | if pcap_match: 47 | pcap_key = pcap_match.group(key_pos) 48 | if label_pos: 49 | pcap_labels = pcap_match.group(label_pos) 50 | break 51 | else: 52 | # Not a Poseidon trace file, return basename as key. 53 | pcap_key = base_pcap.split('.')[0] 54 | return (pcap_key, pcap_labels) 55 | 56 | @staticmethod 57 | def valid_template(uid, file_path, timestamp, source_ip, investigate, labels, confidences, 58 | pcap_labels, base_pcap, pcap_key): 59 | return { 60 | 'uid': uid, 61 | 'file_path': file_path, 62 | 'pcap': base_pcap, 63 | 'pcap_key': pcap_key, 64 | 'pcap_labels': pcap_labels, 65 | 'timestamp': timestamp, 66 | 'source_ip': source_ip, 67 | 'decisions': { 68 | 'investigate': investigate, 69 | }, 70 | 'classification': { 71 | 'labels': labels, 72 | 'confidences': confidences, 73 | }, 74 | } 75 | 76 | def output_from_result_json(self, result_json_str, reformatted_result_json_file_name): 77 | base_pcap = os.path.basename(self.file_path) 78 | pcap_key, pcap_labels = self.parse_pcap_name(base_pcap) 79 | result_json = json.loads(result_json_str) 80 | 81 | mac_metadata = {} 82 | for filename, host_results in result_json.items(): 83 | filename = filename.split('.csv.gz')[0] 84 | for host_result in host_results: 85 | top_role = host_result.get('top_role', None) 86 | if top_role is not None: 87 | investigate = top_role == 'Unknown' 88 | source_ip = host_result.get('source_ip', None) 89 | source_mac = host_result.get('source_mac', None) 90 | timestamp = host_result.get('timestamp', None) 91 | labels, confidences = zip(*host_result['role_list']) 92 | labels = self.assign_labels(labels) 93 | mac_metadata[source_mac] = self.valid_template( 94 | self.uid, self.file_path, timestamp, source_ip, 95 | investigate, labels, confidences, 96 | pcap_labels, base_pcap, pcap_key) 97 | reformatted_json = [{ 98 | 'tool': 'networkml', 99 | 'version': __version__, 100 | 'id': os.environ.get('id', ''), 101 | 'type': 'metadata', 102 | 'file_path': self.file_path, 103 | 'results': {'tool': 'networkml', 'version': __version__}, 104 | 'data': { 105 | 'mac_addresses': mac_metadata, 106 | } 107 | }, 108 | { 109 | 'tool': 'networkml', 110 | 'id': os.environ.get('id', ''), 111 | 'type': 'metadata', 112 | 'file_path': self.file_path, 113 | 'data': '', 114 | 'results': {'tool': 'networkml', 'version': __version__} 115 | }] 116 | with open(reformatted_result_json_file_name, 'w') as reformatted_result: 117 | reformatted_result.write(json.dumps(reformatted_json)) 118 | return reformatted_json 119 | -------------------------------------------------------------------------------- /.github/workflows/semgrep.yml: -------------------------------------------------------------------------------- 1 | name: semgrep 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - synchronize 7 | - reopened 8 | permissions: 9 | pull-requests: write 10 | jobs: 11 | docker_scan: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | with: 16 | repository: ${{ github.event.pull_request.head.repo.full_name }} 17 | ref: ${{ github.event.pull_request.head.ref }} 18 | - name: scan 19 | id: d_scan 20 | run: | 21 | 
export DEBIAN_FRONTEND=noninteractive && \ 22 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 23 | sudo apt-get update && \ 24 | sudo apt install jq && \ 25 | python3 -m pip install --upgrade pip && \ 26 | python3 -m pip install semgrep && \ 27 | python3 -m pip install --upgrade urllib3 && \ 28 | mkdir /home/runner/reports/ && \ 29 | cd ${GITHUB_WORKSPACE}/ && \ 30 | semgrep --config=.github/workflows/config/semgrep-docker.yml --json -o /home/runner/reports/semgrep.out \ 31 | --severity ERROR ./ &&\ 32 | echo "## Validation Issues Found (Docker) :whale: " >> /home/runner/reports/docker-msg && \ 33 | cat /home/runner/reports/semgrep.out | jq -r --arg ws "$GITHUB_WORKSPACE" --arg url "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/blob/$GITHUB_SHA" '.results[] | "**File:** [\(.path | sub($ws; "."; "g"))](\(.path | sub($ws; $url; "g"))#L\(.start.line)) \n**Line Number:** \(.start.line) \n**Statement(s):** \n``` \n\(.extra.lines) \n``` \n**Rule:** \n\(.extra.message)\n\n"' >> /home/runner/reports/docker-msg && \ 34 | echo "::set-output name=found-count::$(cat /home/runner/reports/semgrep.out | jq '.results | length')" 35 | - name: Fail if found 36 | if: steps.d_scan.outputs.found-count != 0 37 | uses: actions/github-script@v6 38 | with: 39 | script: | 40 | const fs = require('fs') 41 | 42 | var msg = fs.readFileSync('/home/runner/reports/docker-msg', 'utf8'); 43 | console.log('${{steps.d_scan.outputs.found-count}} errors found in docker/docker-compose files'); 44 | github.rest.issues.createComment({ 45 | issue_number: context.issue.number, 46 | owner: context.repo.owner, 47 | repo: context.repo.repo, 48 | body: msg 49 | }); 50 | 51 | core.setFailed('Semgrep found errors in Dockerfiles or docker-compose files. Please check the uploaded report'); 52 | - name: Upload scan reports 53 | uses: actions/upload-artifact@v3.1.1 54 | if: failure() 55 | with: 56 | name: semgrep-docker-report 57 | path: /home/runner/reports/semgrep.out 58 | python_scan: 59 | runs-on: ubuntu-latest 60 | steps: 61 | - uses: actions/checkout@v3 62 | with: 63 | repository: ${{ github.event.pull_request.head.repo.full_name }} 64 | ref: ${{ github.event.pull_request.head.ref }} 65 | - name: scan 66 | id: py_scan 67 | run: | 68 | export DEBIAN_FRONTEND=noninteractive && \ 69 | echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections && \ 70 | sudo apt-get update && \ 71 | sudo apt install jq && \ 72 | python3 -m pip install --upgrade pip && \ 73 | python3 -m pip install semgrep && \ 74 | python3 -m pip install --upgrade urllib3 && \ 75 | mkdir -p /home/runner/reports/ && \ 76 | cd ${GITHUB_WORKSPACE}/ && \ 77 | semgrep --config=.github/workflows/config/semgrep-python.yml --json -o /home/runner/reports/semgrep.out \ 78 | --severity ERROR ./ && \ 79 | echo "## Validation Issues Found (Python) :snake: " >> /home/runner/reports/python-msg && \ 80 | cat /home/runner/reports/semgrep.out | jq -r --arg ws "$GITHUB_WORKSPACE" --arg url "$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/blob/$GITHUB_SHA" '.results[] | "**File:** [\(.path | sub($ws; "."; "g"))](\(.path | sub($ws; $url; "g"))#L\(.start.line)) \n**Line Number:** \(.start.line) \n**Statement(s):** \n``` \n\(.extra.lines) \n``` \n**Rule:** \n\(.extra.message)\n\n"' >> /home/runner/reports/python-msg && \ 81 | echo "::set-output name=python-found-count::$(cat /home/runner/reports/semgrep.out | jq '.results | length')" 82 | - name: Fail if found 83 | if: steps.py_scan.outputs.python-found-count > 0 84 | uses: 
actions/github-script@v6 85 | with: 86 | github-token: ${{secrets.GITHUB_TOKEN}} 87 | script: | 88 | const fs = require('fs') 89 | 90 | var msg = fs.readFileSync('/home/runner/reports/python-msg', 'utf8'); 91 | console.log('${{steps.py_scan.outputs.python-found-count}} errors found in python files'); 92 | github.rest.issues.createComment({ 93 | issue_number: context.issue.number, 94 | owner: context.repo.owner, 95 | repo: context.repo.repo, 96 | body: msg 97 | }); 98 | 99 | core.setFailed('Semgrep found errors in Python files. Please check the uploaded report'); 100 | - name: Upload scan reports 101 | uses: actions/upload-artifact@v3.1.1 102 | if: failure() 103 | with: 104 | name: semgrep-python-report 105 | path: /home/runner/reports/semgrep.out 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Device Functional Role ID via Machine Learning and Network Traffic Analysis 2 | 3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | ![Build Status](https://github.com/iqtlabs/networkml/workflows/test/badge.svg) 5 | [![PyPI version](https://badge.fury.io/py/networkml.svg)](https://badge.fury.io/py/networkml) 6 | [![codecov](https://codecov.io/gh/IQTLabs/NetworkML/branch/main/graph/badge.svg)](https://codecov.io/gh/IQTLabs/IQTLabs) 7 | [![Docker Hub Downloads](https://img.shields.io/docker/pulls/iqtlabs/networkml.svg)](https://hub.docker.com/r/iqtlabs/networkml/) 8 | 9 | ## Overview 10 | NetworkML is the machine learning portion of our [Poseidon](https://github.com/IQTLabs/poseidon) project. The model in networkML classifies each device into a functional role via machine learning models trained on features derived from network traffic. "Functional role" refers to the authorized administrative purpose of the device on the network and includes roles such as printer, mail server, and others typically found in an IT environment. Our internal analysis suggests networkML can achieve accuracy, precision, recall, and F1 scores in the high 90's when trained on devices from your own network. Whether this performance can transfer from IT environment to IT environment is an active area of our research. 11 | 12 | NetworkML can be used in a "standalone" mode from the command line interface. For more background and context on the macro project, please check out [the Poseidon project](https://www.cyberreboot.org/projects/poseidon/) page on our website. This repository specifically covers the output, inputs, data processing, and machine learning models we deploy in networkML. 13 | 14 | While this repository and resulting docker container can be used completely independently, the code was written to support the IQT Labs Poseidon project. See: 15 | 16 | - [Poseidon](https://github.com/IQTLabs/poseidon) SDN project. 17 | 18 | This repository contains the components necessary to build a docker container that can be used for training a number of ML models using network packet captures (PCAPs). The repository includes scripts necessary to do training, testing, and evaluation. These can be run from a shell once `networkml` is installed as a package or run in a Docker container using the `networkml` script. 19 | 20 | Feel free to use, discuss, and contribute! 21 | 22 | ## Model Output 23 | NetworkML predicts the functional role of network-connected device via network traffic analysis and machine learning. 
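
For orientation, the sketch below shows roughly what a single per-device record in the `predict.json` output looks like, based on the template assembled in `networkml/helpers/results_output.py`; every value shown is invented for illustration, and in the actual output these records are keyed by source MAC address under `data` → `mac_addresses`. The roles themselves are described below.

```python
# Hypothetical per-device record, shaped like ResultsOutput.valid_template()
# in networkml/helpers/results_output.py; all values below are made up.
example_record = {
    'uid': 'None',
    'file_path': '/pcaps/trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap',
    'pcap': 'trace_ab12_2001-01-01_02_03-client-ip-1-2-3-4.pcap',
    'pcap_key': 'ab12',
    'pcap_labels': 'ip-1-2-3-4',
    'timestamp': 978314580.0,
    'source_ip': '1.2.3.4',
    'decisions': {'investigate': False},       # True when the top role is 'Unknown'
    'classification': {
        'labels': ['printer', 'workstation'],  # most to least likely role
        'confidences': [0.9, 0.1],
    },
}
```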
24 | 25 | Admittedly subjective, the term "role" refers to the authorized administrative purpose of the device on the network. NetworkML in its default configuration has twelve roles: active directory controller, administrator server, administrator workstation, confluence server, developer workstation, distributed file share, exchange server, graphics processing unit (GPU) laptop, github server, public key infrastructure (PKI) server, and printer. This typology reflects the network-connected devices in the data we used to train the model. Other networks will lack some of these roles and might include others. Consequently, organizations that wish to use networkML might have to adapt the model outputs for their specific organization. 26 | 27 | ## Model Inputs 28 | NetworkML's key input is the network traffic for a single device. By network traffic for a single device, we mean all packets sent and received by that device over a given time period. For reliable results, we recommend at least fifteen minutes of network traffic. Poseidon, the larger project of which networkML is only a part, performs the necessary packet pre-processing to produce pcap's containing all network traffic to and from a single device. If you are using networkML in a standalone manner, the pcap files must all follow a strict naming convention: DeviceName-deviceID-time-duration-flags.pcap. For example, ActiveDirectoryController-labs-Fri0036-n00.pcap refers to a pcap from an active directory controller taken from a user named labs on a Friday at 00:36. The flag field does not currently have any significance. 29 | 30 | It is worth noting that networkML uses only packet header data in its models. NetworkML does not use data from the packet payload. Relying only on packet header data enables networkML to avoid some privacy-related issues associated with using payload data and to create (hopefully) more generalizable and more performant models. 31 | 32 | ## Data Processing 33 | 34 | ## Algorithms 35 | 36 | NetworkML uses a feedforward neural network from the scikit-learn package. The model is trained using 5-fold cross validation in combination with a simple grid-search of the hyper-parameter space. 37 | 38 | 39 | # Installation/Run 40 | 41 | Our models can be executed via Docker and in a standalone manner on a Linux host. We recommend deployment via Poseidon if you are running an SDN (software-defined network). Otherwise, we recommend using Docker. 42 | 43 | See the [README](https://github.com/IQTLabs/NetworkML/tree/main/networkml/trained_models) file included in the `networkml/trained_models` folder for specific instructions on deployment. 44 | 45 | # Develop/Standalone Installation 46 | 47 | Note: This project uses absolute paths for imports, meaning you'll either need to modify your `PYTHONPATH` to something like this from the project directory: 48 | ``` 49 | export PYTHONPATH=$PWD/networkml:$PYTHONPATH 50 | ``` 51 | Alternatively, simply running `pip3 install .` from the project directory after making changes will update the package to test or debug against. 52 | 53 | This package is set up for anaconda/miniconda to be used for package and environment 54 | management if desired. Assuming you have the latest install (as of this writing, we have been using 55 | conda 4.5.12), set up the environment by performing the following: 56 | 1. Ensure that the CONDA_EXE environment variable has been set. If `echo $CONDA_EXE` 57 | returns empty, resolve this by `export CONDA_EXE=$_CONDA_EXE` in your bash shell. 58 | 2. 
Run `make dev` to set up the environment 59 | 3. Run `conda activate posml-dev` to begin. 60 | 61 | You can remove the dev environment via standard conda commands: 62 | 1. Run `conda deactivate` 63 | 2. Run `conda env remove -y -n posml-dev` 64 | 65 | For more information about using conda, please refer to their 66 | [user documentation](https://conda.io/projects/conda/en/latest/user-guide/getting-started.html). 67 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to NetworkML 2 | 3 | Want to hack on NetworkML? Awesome! Here are instructions to get you started. 4 | If you have any questions or find the instructions to be incomplete, please do 5 | open an issue to let us know about it. 6 | 7 | ## Contribution guidelines 8 | 9 | ### Pull requests are always welcome 10 | 11 | We are always thrilled to receive pull requests and do our best to 12 | process them as fast as possible. Not sure if that typo is worth a pull 13 | request? Do it! We will appreciate it. 14 | 15 | If your pull request is not accepted on the first try, don't be 16 | discouraged! If there's a problem with the implementation, hopefully you 17 | received feedback on what to improve. 18 | 19 | We're trying very hard to keep NetworkML lean and focused. We don't want it 20 | to do everything for everybody. This means that we might decide against 21 | incorporating a new feature. However, there might be a way to implement 22 | that feature *on top of* NetworkML. 23 | 24 | ### Create issues... 25 | 26 | Any significant improvement should be documented as [a github 27 | issue](https://github.com/IQTLabs/NetworkML/issues) before anybody 28 | starts working on it. 29 | 30 | ### ...but check for existing issues first! 31 | 32 | Please take a moment to check that an issue doesn't already exist 33 | documenting your bug report or improvement proposal. If it does, it 34 | never hurts to add a quick "+1" or "I have this problem too". This will 35 | help prioritize the most common problems and requests. 36 | 37 | ### Conventions 38 | 39 | #### Project structure 40 | 41 | The NetworkML project is currently structured to be a collection of 42 | models processing pcap traffic. Each model is contained within its own 43 | folder under the root directory. Code under root's `utils/` folder contains 44 | generic feature extraction and processing from raw pcap files, and can be 45 | reused by any of the models within the collection. 46 | 47 | Take the `DeviceClassifier` as an archetype example of one such model. 48 | Our [Poseidon project](https://github.com/IQTLabs/Poseidon) uses this 49 | to identify device roles on the network based on their behavior on the 50 | network. In fact, this classifier contains two different models that can 51 | be used depending on the amount of data available for training -- `OneLayer` 52 | neural network model, and the `RandomForest` model. Each of these models 53 | are contained in their own subdirectories, and a `README` file describes 54 | the usage and requirements of both. Within each model's directory, you'll 55 | find a Dockerfile and the scripts to train, test, and evaluate the models. 56 | Any configurations or options specific to these models are located in the 57 | `opts/` subfolder, and the optional trained models (in the form of 58 | serialized pkl files) are made available in the `models/` subfolder. 
59 | 60 | Our hope is that by following this structure as much as possible, newer 61 | users can get up to speed more quickly, and models will be easier to 62 | maintain in the long run. However, if you find this too stifling for 63 | your specific model, we will leave it to you to explain the usage, 64 | requirements and structure in your model's `README` file. 65 | 66 | 67 | #### Submitting a pull request 68 | 69 | Fork the repo and make changes on your fork in a feature branch. 70 | 71 | Make sure you include relevant updates or additions to documentation and 72 | tests when creating or modifying features. 73 | 74 | Pull requests descriptions should be as clear as possible and include a 75 | reference to all the issues that they address. 76 | 77 | Code review comments may be added to your pull request. Discuss, then make the 78 | suggested modifications and push additional commits to your feature branch. Be 79 | sure to post a comment after pushing. The new commits will show up in the pull 80 | request automatically, but the reviewers will not be notified unless you 81 | comment. 82 | 83 | Before the pull request is merged, make sure that you squash your commits into 84 | logical units of work using `git rebase -i` and `git push -f`. After every 85 | commit the test suite should be passing. Include documentation changes in the 86 | same commit so that a revert would remove all traces of the feature or fix. 87 | 88 | Commits that fix or close an issue should include a reference like `Closes #XXX` 89 | or `Fixes #XXX`, which will automatically close the issue when merged. 90 | 91 | Add your name to the AUTHORS file, but make sure that the list is sorted and that 92 | your name and email address match the ones you used to make your commits. The 93 | AUTHORS file is regenerated occasionally from the commit history, so a mismatch 94 | may result in your changes being overwritten. 95 | 96 | ## Decision process 97 | 98 | ### How are decisions made? 99 | 100 | Short answer: with pull requests to the NetworkML repository. 101 | 102 | All decisions affecting NetworkML, big and small, follow the same 3 steps: 103 | 104 | * Step 1: Open a pull request. Anyone can do this. 105 | 106 | * Step 2: Discuss the pull request. Anyone can do this. 107 | 108 | * Step 3: Accept or refuse a pull request. A maintainer does this. 109 | 110 | 111 | ### How can I become a maintainer? 112 | 113 | * Step 1: learn the code inside out 114 | * Step 2: make yourself useful by contributing code, bugfixes, support etc. 115 | 116 | Don't forget: being a maintainer is a time investment. Make sure you will have time to make yourself available. 117 | You don't have to be a maintainer to make a difference on the project! 118 | 119 | ### What are a maintainer's responsibility? 120 | 121 | It is every maintainer's responsibility to: 122 | 123 | * 1) Deliver prompt feedback and decisions on pull requests. 124 | * 2) Be available to anyone with questions, bug reports, criticism etc. on NetworkML. 125 | 126 | ### How is this process changed? 127 | 128 | Just like everything else: by making a pull request :) 129 | 130 | *Derivative work from [Docker](https://github.com/moby/moby/blob/master/CONTRIBUTING.md).* 131 | 132 | ### Any questions? 133 | 134 | As stated above, if you have any questions or encounter any problems, we recommend checking the 135 | pre-existing issues on the project page. If nothing relates or the discussion turns out to not relate 136 | any longer, feel free to start a new issue. 
We do our best to respond in a timely fashion and to 137 | keep all discussions open and transparent. 138 | -------------------------------------------------------------------------------- /tests/test_funcs_host.py: -------------------------------------------------------------------------------- 1 | import ipaddress 2 | 3 | import netaddr 4 | import pandas as pd 5 | 6 | from networkml.featurizers.funcs.host import Host 7 | from networkml.featurizers.funcs.host import HostBase 8 | from networkml.featurizers.funcs.host import SessionHost 9 | from networkml.helpers.pandas_csv_importer import recast_df 10 | from networkml.helpers.pandas_csv_importer import WS_FIELDS 11 | 12 | 13 | def nan_row_dict(defaults): 14 | row = {field: None for field in WS_FIELDS} 15 | row.update(defaults) 16 | return pd.Series(row) 17 | 18 | 19 | def test_get_ips(): 20 | instance = HostBase() 21 | for ipv, ipb, srcip, dstip, ip_flags in ( 22 | (4, 'ip', ipaddress.ip_address('192.168.0.1'), 23 | ipaddress.ip_address('192.168.0.2'), (1, 0)), 24 | (6, 'ipv6', ipaddress.ip_address('fc01::1'), 25 | ipaddress.ip_address('fc01::2'), (1, 0)), 26 | (4, 'ip', ipaddress.ip_address('192.168.0.1'), 27 | ipaddress.ip_address('8.8.8.8'), (0, 0)), 28 | (6, 'ipv6', ipaddress.ip_address('fc01::1'), 29 | ipaddress.ip_address('2001:4860:4860::8888'), (0, 0)), 30 | (4, 'ip', ipaddress.ip_address('192.168.0.1'), ipaddress.ip_address('224.0.0.1'), (0, 1))): 31 | row = nan_row_dict({'ip.version': ipv, '%s.src' % ipb: str( 32 | int(srcip)), '%s.dst' % ipb: str(int(dstip))}) 33 | assert instance._get_src_ip(row) == srcip 34 | assert instance._get_dst_ip(row) == dstip 35 | assert instance._df_ip_flags(srcip, dstip) == ip_flags 36 | 37 | 38 | def test_macs(): 39 | instance = HostBase() 40 | assert instance._is_unicast(int(netaddr.EUI('0e:00:00:00:00:01'))) == True 41 | assert instance._is_unicast(int(netaddr.EUI('ff:ff:ff:ff:ff:ff'))) == False 42 | 43 | 44 | def test_flags(): 45 | instance = HostBase() 46 | mac_df = pd.DataFrame.from_dict({'test_col': [1, 2, 4]}) 47 | assert instance._get_flags(mac_df, 'test_col', {0: 'foo', 1: 'baz', 2: 'blah'}, suffix=None, field_name=None) == { 48 | 'tshark_test_col_foo': 1, 'tshark_test_col_baz': 1, 'tshark_test_col_blah': 1} 49 | mac_df = pd.DataFrame.from_dict({'test_col': [1, 0, 4]}) 50 | assert instance._get_flags(mac_df, 'test_col', {0: 'foo', 1: 'baz', 2: 'blah'}, suffix=None, field_name=None) == { 51 | 'tshark_test_col_foo': 1, 'tshark_test_col_baz': 0, 'tshark_test_col_blah': 1} 52 | 53 | 54 | def test_lowest_ip_proto_port(): 55 | instance = HostBase() 56 | test_data = {field: None for field in WS_FIELDS} 57 | test_data.update({ 58 | 'tcp.srcport': 99, 59 | 'tcp.dstport': 100, 60 | }) 61 | mac_df = recast_df(pd.DataFrame([test_data])) 62 | assert instance._lowest_ip_proto_port(mac_df, 'tcp') == {99} 63 | 64 | 65 | def test_no_ip_tshark_ports(): 66 | instance = HostBase() 67 | mac_df = pd.DataFrame([{'ip.proto': 99}]) 68 | assert instance._tshark_ports('in', mac_df) 69 | assert instance._tshark_ratio_ports(mac_df) 70 | 71 | 72 | def test_tshark_ports(): 73 | instance = HostBase() 74 | for test_rows, test_output, ratio_output in ( 75 | ([{'tcp.srcport': 22, 'tcp.dstport': 1025, 'ip.proto': 6}, {'tcp.srcport': 1025, 'tcp.dstport': 22, 'ip.proto': 6}, {'tcp.srcport': 22, 'tcp.dstport': 1025, 76 | 'ip.proto': 6}], {'tshark_tcp_priv_port_22_in'}, {'tshark_tcp_priv_packet_ratio_io_port_22': 2.0, 'tshark_tcp_nonpriv_packet_ratio_io_port_other': 0.5}), 77 | ([{'tcp.srcport': 1025, 'tcp.dstport': 1025, 
'ip.proto': 6}], {'tshark_tcp_nonpriv_port_other_in'}, {'tshark_tcp_nonpriv_packet_ratio_io_port_other': 1.0})): 78 | 79 | test_data = [] 80 | for test_ports in test_rows: 81 | row = {field: None for field in WS_FIELDS} 82 | row.update(test_ports) 83 | test_data.append(row) 84 | mac_df = recast_df(pd.DataFrame(test_data)) 85 | ports = {col for col, val in instance._tshark_ports( 86 | 'in', mac_df).items() if val == 1} 87 | assert test_output == ports 88 | ratios = {col: val for col, 89 | val in instance._tshark_ratio_ports(mac_df).items() if val} 90 | assert ratio_output == ratios, test_rows 91 | 92 | 93 | def test_ip_versions(): 94 | instance = HostBase() 95 | test_data = {field: None for field in WS_FIELDS} 96 | test_data.update({'ip.version': 4}) 97 | mac_df = recast_df(pd.DataFrame([test_data])) 98 | assert instance._tshark_ipversions( 99 | mac_df) == {'tshark_ipv4': 1, 'tshark_ipv6': 0} 100 | 101 | 102 | def test_non_ip(): 103 | instance = HostBase() 104 | for eth_type, test_output in ( 105 | (1, {'tshark_ipx': 0, 'tshark_nonip': 1}), 106 | (0x8137, {'tshark_ipx': 1, 'tshark_nonip': 1}), 107 | (0x800, {'tshark_ipx': 0, 'tshark_nonip': 0})): 108 | test_data = {field: None for field in WS_FIELDS} 109 | test_data.update({'eth.type': eth_type}) 110 | mac_df = recast_df(pd.DataFrame([test_data])) 111 | assert instance._tshark_non_ip(mac_df) == test_output 112 | 113 | 114 | def test_vlan_id(): 115 | instance = HostBase() 116 | test_data = {field: None for field in WS_FIELDS} 117 | mac_df = recast_df(pd.DataFrame([test_data])) 118 | assert instance._tshark_vlan_id(mac_df) == {'tshark_tagged_vlan': 0} 119 | test_data.update({'vlan.id': 99}) 120 | mac_df = recast_df(pd.DataFrame([test_data])) 121 | assert instance._tshark_vlan_id(mac_df) == {'tshark_tagged_vlan': 1} 122 | 123 | 124 | def test_smoke_calc_cols(): 125 | instance = HostBase() 126 | test_data = {field: None for field in WS_FIELDS} 127 | eth_src = '0e:00:00:00:00:01' 128 | eth_src_int = int(netaddr.EUI(eth_src)) 129 | test_data.update({ 130 | 'ip.version': 4, 131 | 'eth.src': eth_src_int, 132 | 'eth.dst': eth_src_int, 133 | '_srcip': '192.168.0.1', 134 | '_dstip': '192.168.0.2', 135 | }) 136 | mac_df = recast_df(pd.DataFrame([test_data])) 137 | assert instance._calc_cols(eth_src_int, mac_df) 138 | 139 | 140 | def test_host_keys(): 141 | test_data = {field: None for field in WS_FIELDS} 142 | eth_src = '0e:00:00:00:00:01' 143 | eth_src_int = int(netaddr.EUI(eth_src)) 144 | src_ip = ipaddress.ip_address('192.168.0.1') 145 | dst_ip = ipaddress.ip_address('192.168.0.2') 146 | test_data.update({ 147 | 'ip.version': 4, 148 | 'eth.src': eth_src_int, 149 | 'eth.dst': eth_src_int, 150 | 'ip.src': str(int(src_ip)), 151 | 'ip.dst': str(int(dst_ip)), 152 | 'tcp.srcport': 999, 153 | 'tcp.dstport': 1001, 154 | 'frame.protocols': 'eth:ip', 155 | }) 156 | row = nan_row_dict(test_data) 157 | instance = Host() 158 | assert instance._host_key(row)[1:] == (str(src_ip), str(dst_ip), 1, 0, 1) 159 | instance = SessionHost() 160 | assert instance._host_key(row)[1:] == (str(src_ip), str(dst_ip), 1, 0, 1) 161 | -------------------------------------------------------------------------------- /tests/test_algorithms_host_footprint.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import sys 5 | import tempfile 6 | 7 | import numpy as np 8 | import pytest 9 | from sklearn import preprocessing 10 | from sklearn.preprocessing import StandardScaler 11 | from 
sklearn.model_selection import GridSearchCV 12 | from sklearn.neural_network import MLPClassifier 13 | from sklearn.preprocessing import LabelBinarizer 14 | 15 | from networkml.algorithms.host_footprint import HostFootprint 16 | 17 | 18 | def test_serialize_scaler(): 19 | instance = HostFootprint() 20 | scaler = StandardScaler() 21 | test_data = [[i, i] for i in range(99)] 22 | scaler.fit(test_data) 23 | with tempfile.TemporaryDirectory() as tmpdir: 24 | scaler_file = os.path.join(tmpdir, 'scaler.mod') 25 | instance.serialize_scaler(scaler, scaler_file) 26 | new_scaler = instance.deserialize_scaler(scaler_file) 27 | assert len(scaler.mean_) == 2 28 | assert scaler.mean_.tolist() == new_scaler.mean_.tolist() 29 | 30 | 31 | def test_serialize_label_encoder(): 32 | instance = HostFootprint() 33 | le_classes = ['printer', 'workstation', 'server'] 34 | le = preprocessing.LabelEncoder() 35 | le.fit(le_classes) 36 | with tempfile.TemporaryDirectory() as tmpdir: 37 | le_file = os.path.join(tmpdir, 'le.json') 38 | instance.serialize_label_encoder(le, le_file) 39 | new_le = instance.deserialize_label_encoder(le_file) 40 | assert le.classes_.tolist() == new_le.classes_.tolist() 41 | assert new_le.inverse_transform(le.transform(le_classes)).tolist() == le_classes 42 | 43 | 44 | def test_serialize_model(): 45 | instance = HostFootprint() 46 | model = MLPClassifier() 47 | label_binarizer = LabelBinarizer() 48 | label_binarizer.neg_label = 0 49 | label_binarizer.pos_label = 1 50 | label_binarizer.sparse_output = False 51 | label_binarizer.y_type_ = "binary" 52 | label_binarizer.sparse_input_ = False 53 | label_binarizer.classes_ = np.array([0]) 54 | 55 | parameters = {'hidden_layer_sizes': [(64, 32)]} 56 | GridSearchCV(model, parameters, 57 | cv=5, n_jobs=-1, 58 | scoring='f1_weighted') 59 | 60 | model.coefs_ = np.array([[1],[2]]) 61 | model.loss_ = 42 62 | model.intercepts_ = np.array([[3],[4]]) 63 | model.classes_ = np.array([[5],[6]]) 64 | model.n_iter_ = 42 65 | model.n_layers_ = 2 66 | model.n_outputs_ = 1 67 | model.out_activation_ = "logistic" 68 | model._label_binarizer =label_binarizer 69 | model.features = ['test_1', 'test_2', 'test_3'] 70 | 71 | 72 | with tempfile.TemporaryDirectory() as tmpdir: 73 | model_file = os.path.join(tmpdir, 'host_footprint.json') 74 | instance.serialize_model(model, model_file) 75 | new_model = instance.deserialize_model(model_file) 76 | assert model.features == new_model.features 77 | print(f"model params: {model.get_params()}") 78 | print(f"new_model params: {new_model.get_params()}") 79 | assert len(model.get_params()['hidden_layer_sizes']) == len(new_model.get_params()['hidden_layer_sizes']) 80 | assert model._label_binarizer.y_type_ == new_model._label_binarizer.y_type_ 81 | assert len(model.coefs_) == len(new_model.coefs_) 82 | assert len(model.intercepts_) == len(new_model.intercepts_) 83 | 84 | 85 | def test_list_model(): 86 | expected = [ 87 | "foo", 88 | "bar", 89 | "baz", 90 | ] 91 | instance = HostFootprint() 92 | instance.model_path = './tests/test_data/list_test.json' 93 | instance.list = 'features' 94 | actual = instance.list_model() 95 | assert actual == expected 96 | 97 | def test_get_individual_predictions(): 98 | le_classes = ['asomething', 'bsomething'] 99 | le = preprocessing.LabelEncoder() 100 | le.fit(le_classes) 101 | filename = ['firstfile'] 102 | host_key = np.array(['mac1']) 103 | tshark_srcips = np.array(["['1.1.1.1']"]) 104 | frame_epoch = None 105 | instance = HostFootprint() 106 | assert instance.get_individual_predictions([[0.6, 0.7]], 
le, filename, host_key, tshark_srcips, frame_epoch) == { 107 | 'firstfile': [{'top_role': 'bsomething', 'role_list': [('bsomething', 0.7), ('asomething', 0.6)], 'source_ip': '1.1.1.1', 'source_mac': 'mac1'}]} 108 | assert instance.get_individual_predictions([[0.2, 0.1]], le, filename, host_key, tshark_srcips, frame_epoch) == { 109 | 'firstfile': [{'top_role': 'Unknown', 'role_list': [('asomething', 0.2), ('bsomething', 0.1)], 'source_ip': '1.1.1.1', 'source_mac': 'mac1'}]} 110 | 111 | 112 | def hf_args(tmpdir, operation, input_file): 113 | output_json = os.path.join(tmpdir, 'out.json') 114 | output_le_json = os.path.join(tmpdir, 'out_le.json') 115 | scaler_mod = os.path.join(tmpdir, 'scaler.mod') 116 | return ['host_footprint.py', '--label_encoder', output_le_json, 117 | '--trained_model', output_json, '--scaler', scaler_mod, 118 | '--operation', operation, '--kfolds', '2', input_file] 119 | 120 | 121 | def test_train(): 122 | """Test training function of HostFootprint class""" 123 | with tempfile.TemporaryDirectory() as tmpdir: 124 | testdata = os.path.join(tmpdir, 'test_data') 125 | shutil.copytree('./tests/test_data', testdata) 126 | input_file = os.path.join(testdata, 'combined.csv') 127 | operation = 'train' 128 | sys.argv = hf_args(tmpdir, operation, input_file) 129 | instance = HostFootprint() 130 | instance.main() 131 | 132 | 133 | def test_predict(): 134 | """Test predict function of HostFootprint class""" 135 | with tempfile.TemporaryDirectory() as tmpdir: 136 | testdata = os.path.join(tmpdir, 'test_data') 137 | shutil.copytree('./tests/test_data', testdata) 138 | input_file = os.path.join(testdata, 'combined.csv') 139 | operation = 'train' 140 | sys.argv = hf_args(tmpdir, operation, input_file) 141 | instance = HostFootprint() 142 | instance.main() 143 | operation = 'predict' 144 | sys.argv = hf_args(tmpdir, operation, input_file) 145 | instance = HostFootprint() 146 | json.loads(instance.main()) 147 | 148 | 149 | def test_predict_num_roles(): 150 | """ 151 | Test predict function of HostFootprint class with 152 | varying number of distinct roles present 153 | """ 154 | with tempfile.TemporaryDirectory() as tmpdir: 155 | testdata = os.path.join(tmpdir, 'test_data') 156 | shutil.copytree('./tests/test_data', testdata) 157 | for file in ['combined_three_roles.csv', 'combined_two_roles.csv']: 158 | input_file = os.path.join(testdata, file) 159 | operation = 'train' 160 | sys.argv = hf_args(tmpdir, operation, input_file) 161 | instance = HostFootprint() 162 | instance.main() 163 | operation = 'predict' 164 | sys.argv = hf_args(tmpdir, operation, input_file) 165 | instance = HostFootprint() 166 | instance.main() 167 | 168 | predictions = json.loads(instance.predict()) 169 | assert isinstance(predictions, dict) 170 | # Check if number of predictions is correct 171 | if file == 'combined_three_roles.csv': 172 | assert len(predictions) == 6 173 | else: 174 | assert len(predictions) == 4 175 | 176 | 177 | def test_train_bad_data_too_few_columns(): 178 | """ 179 | This test tries to train a model on a mal-formed csv with too few fields 180 | """ 181 | with tempfile.TemporaryDirectory() as tmpdir: 182 | testdata = os.path.join(tmpdir, 'test_data') 183 | shutil.copytree('./tests/test_data', testdata) 184 | input_file = os.path.join(testdata, 'bad_data_too_few_columns.csv') 185 | operation = 'train' 186 | sys.argv = hf_args(tmpdir, operation, input_file) 187 | instance = HostFootprint() 188 | with pytest.raises(Exception): 189 | instance.main() 190 | 
-------------------------------------------------------------------------------- /networkml/NetworkML.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | from networkml import __version__ 6 | from networkml.algorithms.host_footprint import HostFootprint 7 | from networkml.featurizers.csv_to_features import CSVToFeatures 8 | from networkml.helpers.results_output import ResultsOutput 9 | from networkml.parsers.pcap_to_csv import PCAPToCSV 10 | 11 | 12 | class NetworkML: 13 | 14 | def __init__(self, raw_args=None): 15 | self.logger = logging.getLogger(__name__) 16 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 17 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 18 | 19 | # TODO: migrate stage-specific flags here. 20 | self.stage_args = { 21 | 'parser': {}, 22 | 'featurizer': { 23 | 'srcmacid': {'help': 'attempt to detect canonical source MAC and featurize only that MAC', 'action': 'store_true'}, 24 | 'no-srcmacid': {'help': 'featurize all MACs', 'action': 'store_true'}, 25 | }, 26 | 'algorithm': { 27 | 'trained_model': {'help': 'specify a path to load or save trained model'}, 28 | 'label_encoder': {'help': 'specify a path to load or save label encoder'}, 29 | 'scaler': {'help': 'specify a path to load or save scaler'}, 30 | 'kfolds': {'help': 'specify number of folds for k-fold cross validation'}, 31 | 'eval_data': {'help': 'path to eval CSV file, if training'}, 32 | 'train_unknown': {'help': 'Train on unknown roles'}, 33 | 'list':{'choices':['features'], 34 | 'default':None, 35 | 'help':'list information contained within model defined by --trained_model' 36 | } 37 | }, 38 | } 39 | parsed_args = self.parse_args(raw_args=raw_args) 40 | self.in_path = parsed_args.path 41 | self.algorithm = parsed_args.algorithm 42 | self.engine = parsed_args.engine 43 | self.first_stage = parsed_args.first_stage 44 | self.final_stage = parsed_args.final_stage 45 | self.groups = parsed_args.groups 46 | self.gzip_opt = parsed_args.gzip 47 | self.level = parsed_args.level 48 | self.operation = parsed_args.operation 49 | self.output = parsed_args.output 50 | self.threads = parsed_args.threads 51 | self.list = parsed_args.list 52 | self.log_level = parsed_args.verbose 53 | for args in self.stage_args.values(): 54 | for arg in args: 55 | val = getattr(parsed_args, arg, None) 56 | if val is not None: 57 | setattr(self, arg, val) 58 | logging.basicConfig(level=log_levels[self.log_level]) 59 | self.main() 60 | 61 | def parse_args(self, raw_args=None): 62 | parser = argparse.ArgumentParser(description='networkml %s' % __version__) 63 | parser.add_argument('path', help='path to a single pcap file, or a directory of pcaps to parse', default='/pcaps') 64 | parser.add_argument('--algorithm', '-a', choices=[ 65 | 'host_footprint'], default='host_footprint', help='choose which algorithm to use (default=host_footprint)') 66 | parser.add_argument('--engine', '-e', choices=['pyshark', 'tshark', 'host'], 67 | default='tshark', help='engine to use to process the PCAP file (default=tshark)') 68 | parser.add_argument('--first_stage', '-f', choices=['parser', 'featurizer', 'algorithm'], default='parser', 69 | help='choose which stage to start at, `path` arg is relative to stage (default=parser)') 70 | parser.add_argument('--final_stage', choices=['parser', 'featurizer', 'algorithm'], 71 | default='algorithm', help='choose which stage to finish at (default=algorithm)') 72 | parser.add_argument('--groups', '-g', 
default='host', 73 | help='groups of comma separated features to use (default=host)') 74 | parser.add_argument('--gzip', '-z', choices=['input', 'output', 'both'], default='both', 75 | help='use gzip between stages, useful when not using all 3 stages (default=both)') 76 | parser.add_argument('--level', '-l', choices=['packet', 'flow', 'host'], 77 | default='packet', help='level to make the output records (default=packet)') 78 | parser.add_argument('--operation', '-O', choices=['train', 'predict', 'eval'], default='predict', 79 | help='choose which operation task to perform, train or predict (default=predict)') 80 | parser.add_argument('--output', '-o', default=None, 81 | help='directory to write out any results files to') 82 | parser.add_argument('--threads', '-t', default=1, type=int, 83 | help='number of async threads to use (default=1)') 84 | parser.add_argument('--verbose', '-v', choices=[ 85 | 'DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', help='logging level (default=INFO)') 86 | for stage, args in self.stage_args.items(): 87 | for arg, arg_parms in args.items(): 88 | arg_help = '%s (%s)' % (arg_parms['help'], stage) 89 | arg_choices = arg_parms['choices'] if 'choices' in arg_parms else None 90 | arg_default = arg_parms['default'] if 'default' in arg_parms else None 91 | action = arg_parms.get('action', 'store') 92 | if not arg_choices: 93 | parser.add_argument('--' + arg, action=action, help=arg_help, default=arg_default, dest=arg) 94 | else: 95 | parser.add_argument('--' + arg, help=arg_help, choices=arg_choices, default=arg_default, dest=arg, action=action) 96 | parsed_args = parser.parse_args(raw_args) 97 | return parsed_args 98 | 99 | def add_opt_args(self, opt_args): 100 | raw_args = [] 101 | for arg, arg_parms in opt_args.items(): 102 | val = getattr(self, arg, None) 103 | if val is not None: 104 | raw_args.append('--' + arg) 105 | if arg_parms.get('action', None) != 'store_true': 106 | raw_args.append(str(val)) 107 | return raw_args 108 | 109 | def run_parser_stage(self, in_path): 110 | raw_args = self.add_opt_args(self.stage_args['parser']) 111 | raw_args.extend(['-e', self.engine, '-l', self.level, 112 | '-o', self.output, '-t', str(self.threads), '-v', self.log_level, in_path]) 113 | instance = PCAPToCSV(raw_args=raw_args) 114 | return instance.main() 115 | 116 | def run_featurizer_stage(self, in_path): 117 | raw_args = self.add_opt_args(self.stage_args['featurizer']) 118 | raw_args.extend(['-c', '-g', self.groups, '-z', self.gzip_opt, 119 | '-o', self.output, '-t', str(self.threads), '-v', self.log_level, in_path]) 120 | instance = CSVToFeatures(raw_args=raw_args) 121 | return instance.main() 122 | 123 | def run_algorithm_stage(self, in_path): 124 | raw_args = self.add_opt_args(self.stage_args['algorithm']) 125 | raw_args.extend(['-O', self.operation, '-v', self.log_level, in_path]) 126 | instance = HostFootprint(raw_args=raw_args) 127 | return instance.main() 128 | 129 | def output_results(self, result_json_str, run_complete): 130 | if run_complete: 131 | if self.list: 132 | print(f'{result_json_str}') 133 | if self.final_stage == 'algorithm' and self.operation == 'predict': 134 | if self.output and os.path.isdir(self.output): 135 | uid = os.getenv('id', 'None') 136 | file_path = os.getenv('file_path', self.in_path) 137 | results_outputter = ResultsOutput(self.logger, uid, file_path) 138 | result_json_file_name = os.path.join(self.output, 'predict.json') 139 | results_outputter.output_from_result_json(result_json_str, result_json_file_name) 140 | 141 | def 
run_stages(self): 142 | stages = ('parser', 'featurizer', 'algorithm') 143 | stage_runners = { 144 | 'parser': self.run_parser_stage, 145 | 'featurizer': self.run_featurizer_stage, 146 | 'algorithm': self.run_algorithm_stage} 147 | 148 | try: 149 | first_stage_index = stages.index(self.first_stage) 150 | final_stage_index = stages.index(self.final_stage) 151 | except ValueError: 152 | self.logger.error('Unknown first/final stage name') 153 | return 154 | 155 | if first_stage_index > final_stage_index: 156 | self.logger.error('Invalid first and final stage combination') 157 | return 158 | 159 | run_schedule = stages[first_stage_index:(final_stage_index+1)] 160 | result = self.in_path 161 | self.logger.info(f'running stages: {run_schedule}') 162 | 163 | run_complete = False 164 | try: 165 | for stage in run_schedule: 166 | runner = stage_runners[stage] 167 | result = runner(result) 168 | run_complete = True 169 | except Exception as err: 170 | self.logger.error(f'Could not run stage: {err}') 171 | 172 | self.output_results(result, run_complete) 173 | 174 | def main(self): 175 | self.run_stages() 176 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright (c) 2017-2022 IQT Labs LLC, All Rights Reserved. 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /networkml/featurizers/csv_to_features.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | import csv 4 | import logging 5 | import os 6 | import pathlib 7 | from collections import Counter 8 | from collections import defaultdict 9 | 10 | import numpy as np 11 | 12 | import networkml 13 | from networkml.featurizers.main import Featurizer 14 | from networkml.helpers.gzipio import gzip_reader 15 | from networkml.helpers.gzipio import gzip_writer 16 | from networkml.helpers.pandas_csv_importer import import_csv 17 | 18 | 19 | class CSVToFeatures(): 20 | 21 | def __init__(self, raw_args=None): 22 | self.logger = logging.getLogger(__name__) 23 | self.raw_args = raw_args 24 | 25 | @staticmethod 26 | def get_reader(in_file, use_gzip): 27 | if use_gzip: 28 | return gzip_reader(in_file) 29 | return open(in_file, 'r') 30 | 31 | @staticmethod 32 | def get_writer(out_file, use_gzip): 33 | if use_gzip: 34 | return gzip_writer(out_file) 35 | return open(out_file, 'w') 36 | 37 | @staticmethod 38 | def iscsv(pathfile): 39 | for ext in ('csv', 'gz'): 40 | if pathfile.endswith(''.join(('.', ext))): 41 | return True 42 | return False 43 | 44 | @staticmethod 45 | def write_features_to_csv(header, rows, out_file, gzip_opt): 46 | use_gzip = gzip_opt in ['output', 'both'] 47 | with CSVToFeatures.get_writer(out_file, use_gzip) as f_out: 48 | writer = csv.DictWriter(f_out, fieldnames=header) 49 | writer.writeheader() 50 | writer.writerows(rows) 51 | 52 | @staticmethod 53 | def combine_csvs(out_paths, combined_path, gzip_opt): 54 | # First determine the field names from the top line of each input file 55 | fieldnames = {'filename'} 56 | use_gzip = gzip_opt in ['output', 'both'] 57 | for filename in out_paths: 58 | with CSVToFeatures.get_reader(filename, use_gzip) as f_in: 59 | reader = csv.reader(f_in) 60 | fieldnames.update({header for header in next(reader)}) 61 | 62 | # Then copy the data 63 | with CSVToFeatures.get_writer(combined_path, use_gzip) as f_out: 64 | writer = csv.DictWriter(f_out, fieldnames=list(fieldnames)) 65 | writer.writeheader() 66 | for filename in out_paths: 67 | with CSVToFeatures.get_reader(filename, use_gzip) as f_in: 68 | reader = csv.DictReader(f_in) 69 | for line in reader: 70 | if use_gzip: 71 | line['filename'] = filename.split( 72 | '/')[-1].split('.features.gz')[0] 73 | else: 74 | line['filename'] = filename.split( 75 | '/')[-1].split('.features')[0] 76 | writer.writerow(line) 77 | CSVToFeatures.cleanup_files([filename]) 78 | 79 | @staticmethod 80 | def cleanup_files(paths): 81 | for fi in paths: 82 | if os.path.exists(fi): 83 | os.remove(fi) 84 | 85 | @staticmethod 86 | def parse_args(raw_args=None): 87 | netml_path = list(networkml.__path__) 88 | parser = argparse.ArgumentParser() 89 | parser.add_argument( 90 | 'path', help='path to a single gzipped csv file, or a directory of gzipped csvs to parse') 91 | parser.add_argument('--combined', '-c', action='store_true', 92 | help='write out all records from all csvs into a single gzipped csv file') 93 | parser.add_argument('--features_path', '-p', default=os.path.join( 94 | netml_path[0], 'featurizers/funcs'), help='path to featurizer functions') 95 | parser.add_argument('--functions', '-f', default='', 96 | help='comma separated list of : to featurize (default=None)') 97 | parser.add_argument('--groups', '-g', default='host', 98 | help='comma separated list of 
groups of functions to featurize (default=host)') 99 | parser.add_argument('--gzip', '-z', choices=['input', 'output', 'both', 'neither'], 100 | default='both', help='gzip the input/output file, both or neither (default=both)') 101 | parser.add_argument('--output', '-o', default=None, 102 | help='path to write out gzipped csv file or directory for gzipped csv files') 103 | parser.add_argument('--threads', '-t', default=1, type=int, 104 | help='number of async threads to use (default=1)') 105 | parser.add_argument('--verbose', '-v', choices=[ 106 | 'DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', help='logging level (default=INFO)') 107 | srcmacid_parser = parser.add_mutually_exclusive_group(required=False) 108 | srcmacid_parser.add_argument('--srcmacid', dest='srcmacid', action='store_true', help='attempt to detect canonical source MAC and featurize only that MAC') 109 | srcmacid_parser.add_argument('--no-srcmacid', dest='srcmacid', action='store_false', help='featurize all MACs') 110 | parser.set_defaults(srcmacid=True) 111 | parsed_args = parser.parse_args(raw_args) 112 | return parsed_args 113 | 114 | def exec_features(self, features, in_file, out_file, features_path, gzip_opt, parsed_args): 115 | in_file_size = os.path.getsize(in_file) 116 | self.logger.info(f'Importing {in_file} size {in_file_size}') 117 | df = import_csv(in_file) 118 | featurizer = Featurizer() 119 | self.logger.info(f'Featurizing {in_file}') 120 | rows = featurizer.main(features, df, features_path, parsed_args) 121 | 122 | rowcounts = Counter() 123 | for row in rows: 124 | for r in row: 125 | for header_key in r: 126 | rowcounts[header_key] += 1 127 | rowcompare = defaultdict(set) 128 | for header_key, header_count in rowcounts.items(): 129 | if header_key != 'host_key': 130 | rowcompare[header_count].add(header_key) 131 | assert not len(rowcompare) == 0, 'featurizer returned no results' 132 | assert len( 133 | rowcompare) == 1, 'inconsistent featurizer row counts (headers not consistently present in all rows): %s' % rowcompare 134 | header = list(rowcounts.keys()) 135 | 136 | columns = [np.array(row) for row in rows] 137 | np_array = np.vstack(columns) 138 | 139 | rows = None 140 | for method in np_array: 141 | if rows is None: 142 | rows = method 143 | else: 144 | for i, row in enumerate(method): 145 | rows[i].update(row) 146 | 147 | if header and rows is not None: 148 | rows = rows.tolist() 149 | CSVToFeatures.write_features_to_csv( 150 | header, rows, out_file, gzip_opt) 151 | else: 152 | self.logger.warning( 153 | f'No results based on {features} for {in_file}') 154 | 155 | def process_files(self, threads, features, features_path, in_paths, out_paths, gzip_opt, parsed_args): 156 | num_files = len(in_paths) 157 | failed_paths = [] 158 | finished_files = 0 159 | # corner case so it works in jupyterlab 160 | if threads < 2: 161 | for i in range(len(in_paths)): 162 | try: 163 | finished_files += 1 164 | self.exec_features( 165 | features, in_paths[i], out_paths[i], features_path, gzip_opt, parsed_args) 166 | self.logger.info( 167 | f'Finished {in_paths[i]}. 
{finished_files}/{num_files} CSVs done.') 168 | except Exception as e: # pragma: no cover 169 | self.logger.error( 170 | f'{in_paths[i]} generated an exception: {e}') 171 | failed_paths.append(out_paths[i]) 172 | else: 173 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 174 | future_to_parse = {executor.submit( 175 | self.exec_features, features, in_paths[i], out_paths[i], features_path, gzip_opt, parsed_args): i for i in range(len((in_paths)))} 176 | for future in concurrent.futures.as_completed(future_to_parse): 177 | path = future_to_parse[future] 178 | try: 179 | finished_files += 1 180 | future.result() 181 | except Exception as e: # pragma: no cover 182 | self.logger.error( 183 | f'{in_paths[path]} generated an exception: {e}') 184 | failed_paths.append(out_paths[path]) 185 | else: 186 | self.logger.info( 187 | f'Finished {in_paths[path]}. {finished_files}/{num_files} CSVs done.') 188 | return failed_paths 189 | 190 | def main(self): 191 | parsed_args = CSVToFeatures.parse_args(raw_args=self.raw_args) 192 | in_path = parsed_args.path 193 | out_path = parsed_args.output 194 | combined = parsed_args.combined 195 | features_path = parsed_args.features_path 196 | threads = parsed_args.threads 197 | log_level = parsed_args.verbose 198 | functions = parsed_args.functions 199 | groups = parsed_args.groups 200 | gzip_opt = parsed_args.gzip 201 | 202 | if not groups and not functions: 203 | self.logger.warning( 204 | 'No groups or functions were selected, quitting') 205 | return 206 | 207 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 208 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 209 | logging.basicConfig(level=log_levels[log_level]) 210 | 211 | in_paths = [] 212 | out_paths = [] 213 | 214 | # parse out features dict 215 | groups = tuple(groups.split(',')) 216 | funcs = functions.split(',') 217 | functions = [] 218 | for function in funcs: 219 | functions.append(tuple(function.split(':'))) 220 | features = {'groups': groups, 'functions': functions} 221 | 222 | # check if it's a directory or a file 223 | if os.path.isdir(in_path): 224 | if out_path: 225 | pathlib.Path(out_path).mkdir(parents=True, exist_ok=True) 226 | for root, _, files in os.walk(in_path): 227 | for pathfile in files: 228 | if CSVToFeatures.iscsv(pathfile): 229 | in_paths.append(os.path.join(root, pathfile)) 230 | if out_path: 231 | if gzip_opt in ['neither', 'input']: 232 | out_paths.append(os.path.join( 233 | out_path, pathfile) + '.features') 234 | else: 235 | out_paths.append(os.path.join( 236 | out_path, pathfile) + '.features.gz') 237 | else: 238 | if gzip_opt in ['neither', 'input']: 239 | out_paths.append(os.path.join( 240 | root, pathfile) + '.features') 241 | else: 242 | out_paths.append(os.path.join( 243 | root, pathfile) + '.features.gz') 244 | else: 245 | in_paths.append(in_path) 246 | default_out_path = in_path + '.features.gz' 247 | if gzip_opt in ['neither', 'input']: 248 | default_out_path = in_path + '.features' 249 | if out_path: 250 | if os.path.isdir(out_path): 251 | out_paths.append(os.path.join(out_path, os.path.basename(default_out_path))) 252 | else: 253 | out_paths.append(out_path) 254 | else: 255 | out_paths.append(default_out_path) 256 | 257 | failed_paths = self.process_files( 258 | threads, features, features_path, in_paths, out_paths, gzip_opt, parsed_args) 259 | 260 | for failed_path in failed_paths: # pragma: no cover 261 | if failed_path in out_paths: 262 | out_paths.remove(failed_path) 263 | 264 | if combined and out_paths: 265 | 
combined_path = os.path.join( 266 | os.path.dirname(out_paths[0]), 'combined.csv.gz') 267 | if gzip_opt in ['input', 'neither']: 268 | combined_path = combined_path[:-3] 269 | self.logger.info( 270 | f'Combining CSVs into a single file: {combined_path}') 271 | CSVToFeatures.combine_csvs(out_paths, combined_path, gzip_opt) 272 | return combined_path 273 | if out_paths: 274 | self.logger.info(f'GZipped CSV file(s) written out to: {out_paths}') 275 | return os.path.dirname(out_paths[0]) 276 | else: 277 | self.logger.error(f'No CSV file(s) written out because the following paths failed: {failed_paths}') 278 | return 279 | 280 | 281 | if __name__ == '__main__': # pragma: no cover 282 | features = CSVToFeatures() 283 | features.main() 284 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v0.6.19 (2022-03-23) 2 | 3 | - Update dependency pyshark to v0.4.5 4 | - Update dependency cython to v0.29.28 5 | - Update dependency humanize to v4.0.0 6 | - Update dependency pytest to v7.1.1 7 | - Update dependency pbr to v5.8.1 8 | - Update dependency numpy to v1.22.3 9 | - Update dependency pandas to v1.4.1 10 | - Update dependency pytest-rabbitmq to v2.2.1 11 | - Update iqtlabs/rbqwrapper Docker tag to v0.11.32 12 | 13 | # v0.6.18 (2022-01-11) 14 | 15 | - Update dependency pbr to v5.8.0 16 | - Update iqtlabs/rbqwrapper Docker tag to v0.11.31 17 | - Update dependency humanize to v3.13.1 18 | - Update dependency nest_asyncio to v1.5.4 19 | - Update dependency cython to v0.29.26 20 | - Update dependency pytest-xdist to v2.5.0 21 | - Update dependency pygments to v2.11.2 22 | - Update dependency numpy to v1.22.0 23 | - Update dependency pandas to v1.3.5 24 | - Update dependency scikit-learn to v1.0.2 25 | 26 | # v0.6.17 (2021-08-19) 27 | 28 | - Update dependency numpy to v1.21.3 29 | - Update dependency pandas to v1.3.4 30 | - Update dependency scikit-learn to v1.0.1 31 | - Update dependency joblib to v1.1.0 32 | - Update dependency humanize to v3.12.0 33 | - Update dependency pytest-cov to v3 34 | - Update dependency pytest-xdist to v2.4.0 35 | - Update codecov/codecov-action action to v2.1.0 36 | - Allow pcap to features to read a pcap CSV with pre-cast int types (for future drop in replacement for tshark/pyshark parsers). 
If hex int fields, are detected as strings, fall back to current behavior (use python conversion) 37 | - Update iqtlabs/rbqwrapper Docker tag to v0.11.29 38 | - pytype observes that csv.DictWriter fields, should be an indexable Sequence 39 | 40 | # v0.6.16 (2021-08-19) 41 | 42 | - Updated humanize, numpy, pandas, and pygments 43 | - Updated rbqwrapper base image 44 | - Fixed an issue where tshark could exit and not write out buffer 45 | - Improved SAAST scanning 46 | 47 | # v0.6.15 (2021-07-29) 48 | 49 | - Updated NumPy, codecov-action, reorder_python_imports, upload-artifact 50 | - Pinned Pandas to v1.2.5 due to #871 51 | - Added Shift-Left SAAST Scan on push and PR 52 | 53 | # v0.6.14 (2021-07-14) 54 | 55 | - Updated humanize, pytest-xdist, NumPy, Pandas, python, pip, cython 56 | 57 | # v0.6.13 (2021-06-15) 58 | 59 | - Updated humanize, pytest-cov, pytest-rabbitmq, codecov-action 60 | 61 | # v0.6.12 (2021-05-20) 62 | 63 | - Updated pre-commit, rbqwrapper, and pytest-cov 64 | - Fixed output for Packet Cafe consumption 65 | 66 | # v0.6.11 (2021-05-10) 67 | 68 | - Updated codecov, reorder_python_imports, cython, humanize, numpy, pandas, pbr, scikit-learn, pygments, and pytest 69 | 70 | # v0.6.10 (2021-03-04) 71 | 72 | - Updated rbqwrapper, cython, pandas, and pygments 73 | 74 | # v0.6.9 (2021-02-11) 75 | 76 | - Updated reorder_python_imports, rbqwrapper, joblib, numpy, pandas, pyshark, and pytest-xdist 77 | 78 | # v0.6.8 (2021-01-26) 79 | 80 | - Updated pytest, pytest-cov, scikit-learn, pandas, nest_asyncio 81 | 82 | # v0.6.7 (2021-01-13) 83 | 84 | - Updated codecov, pygments, pytest, pytest-xdist, pytest-rabbitmq 85 | - Moved base image to rbqwrapper, abstracting away RabbitMQ 86 | - Added a new feature for listing out features in the model 87 | - Rewrote the model serializer removing the need for sklearn_json 88 | - Add end-to-end tests 89 | 90 | # v0.6.6 (2020-12-01) 91 | 92 | - Move to PBR 93 | - fix test for 'behavior' 94 | 95 | # v0.6.5 (2020-11-24) 96 | 97 | - Rollback numpy as it doesn't properly handle confidence values on ARM64 98 | - Updated pre-commit versions 99 | - Cleaned up formatting/style 100 | 101 | # v0.6.4 (2020-11-19) 102 | 103 | - Updated numpy, pandas, and nest_asyncio 104 | 105 | # v0.6.3 (2020-10-29) 106 | 107 | - Updated numpy, pygments, pytest, and nest_asyncio 108 | 109 | # v0.6.2 (2020-10-20) 110 | 111 | - Updated buildx, codecov, humanize, joblib, pandas, pygments, pytest, and nest_asyncio 112 | 113 | # v0.6.1 (2020-08-26) 114 | 115 | - Updated humanize, pytest-cov, pytest-xdist, and pandas 116 | 117 | # v0.6.0 (2020-08-05) 118 | 119 | - Retrained models for updated version of scikit-learn 120 | 121 | # v0.5.9 (2020-08-05) 122 | 123 | - Udpated scikit-learn and pytest 124 | - Moved from CyberReboot to new IQTLabs brand 125 | 126 | # v0.5.8 (2020-07-29) 127 | 128 | - Updated cython, humanize, netaddr, numpy, pandas, pytest, pytest-xdist, and nest_asyncio 129 | 130 | # v0.5.7 (2020-07-01) 131 | 132 | - Updated joblib, pandas, numpy, netaddr, and humanize 133 | - Moved Docker base image to python:3.8-slim (debian based instead of alpine) 134 | 135 | # v0.5.6 (2020-06-18) 136 | 137 | - Updated pandas, pytest-cov 138 | - Broke up Docker into two images for build times across architectures 139 | 140 | # v0.5.5 (2020-06-03) 141 | 142 | - Updated joblib, pandas, pytest, pytest-cov, and pyshark 143 | - Updated documentation for developers 144 | 145 | # v0.5.4 (2020-05-06) 146 | 147 | - Updated pytest-xdist, nest-asyncio, and numpy 148 | - Added flag --no-srcmacid to 
make predictions on all MACs found 149 | 150 | # v0.5.3 (2020-04-27) 151 | 152 | - Actually fix manifest to properly include models for PyPi Package. 153 | 154 | # v0.5.2 (2020-04-27) 155 | 156 | - Added missing files to manifest for PyPi package. 157 | 158 | # v0.5.1 (2020-04-27) 159 | 160 | - Rolling back to latest published version of pyshark - for issues see commented version in requirements.txt, which is unfortunately not supported for dependency install from PyPi with pip. 161 | 162 | # v0.5.0 (2020-04-23) 163 | 164 | - Rewrote Networkml entirely 165 | - Now only does classification, no longer behavior 166 | - Flexible stages for processing PCAPs into CSVs of features 167 | - No longer uses tensorflow 168 | - Now supports running on ARM 169 | 170 | # v0.4.8 (2020-02-20) 171 | 172 | - Fixed local dev python version to be 3.7 173 | - Fixed missing threshold_time configuration option 174 | - Fixed filename checks for client/server 175 | - Warn instead of debug log when files are ignored 176 | - Fixed running concurrent.futures when on python3.6 177 | 178 | # v0.4.7 (2020-02-12) 179 | 180 | - Added caching for parsed sessions 181 | - Added IPv6 capability to networkML 182 | - Updated pytest to v5.3.4 183 | - Updated models for scikit v0.22.1 184 | - Updated redis to v3.4.1 185 | 186 | # v0.4.6 (2020-01-15) 187 | 188 | - Updated tensorflow 189 | - Updated pyshark 190 | - Made sessionizer parallel 191 | - Added 120 minute timeout for a pcap 192 | - Added sessionizer test 193 | - Updated license 194 | 195 | # v0.4.5 (2020-01-02) 196 | 197 | - Updated numpy to 1.18.0 198 | - updated pytest-xdist to 1.31.0 199 | - Updated test_extract_macs() test 200 | - Removed vent template 201 | - Added check for empty F1 score list 202 | 203 | # v0.4.4 (2019-12-18) 204 | 205 | - Add pyshark wiring to get highest-level protocol 206 | - Make tests run in parallel 207 | - Update models for scikit-learn v0.22 208 | - Improve parsing speed 209 | - Add tests for pcap reader 210 | 211 | # v0.4.3 (2019-12-4) 212 | 213 | - update pytest to 5.3.1 214 | - update scikit-learn to 0.22 215 | 216 | # v0.4.2 (2019-11-22) 217 | 218 | - Updated numpy to 1.17.4 219 | - Updated pytest to 5.3 220 | - Edited .gitignore 221 | - Added tests for label extraction 222 | - Added test for avx check 223 | - Added pcap labels to decision 224 | 225 | # v0.4.1 (2019-11-07) 226 | 227 | - updated numpy to 1.17.3 228 | - updated pytest to 5.2.2 229 | - Added documentation 230 | - Added support for additional labels and filenames 231 | 232 | # v0.4.0 (2019-10-24) 233 | 234 | - Updated pytest-cov 235 | - Updated pytest 236 | - Updated redis 237 | - Added more documentation and tests 238 | - Updated the python image for the Dockerfile 239 | 240 | # v0.3.9 (2019-10-02) 241 | 242 | - Updated pytest to 5.2.0 243 | - Updated tensorflow to 2.0.0 244 | - Fixed up old code using tensorflow1 to work with tensorflow2 245 | 246 | # v0.3.8 (2019-09-12) 247 | 248 | - Updated pytest to 5.1.2 249 | - Updated numpy to 1.17.2 250 | - Fixed make help 251 | 252 | # v0.3.7 (2019-08-30) 253 | 254 | - Updated redis to 3.3.8 255 | - Updated pytest to 5.1.1 256 | 257 | # v0.3.6 (2019-08-15) 258 | 259 | - Updated redis to 3.3.7 260 | - Redis is now optional 261 | - RabbitMQ is now configurable, and has a cleaned up message format 262 | - Retrained models against numpy 1.17.0 and scikit-learn 0.21.3 263 | 264 | # v0.3.5 (2019-08-02) 265 | 266 | - Updated pika to 1.1.0 267 | - Got rid of outdated linux headers 268 | - Updated redis to 3.3.4 269 | 270 | # v0.3.4 
(2019-07-11) 271 | 272 | - Updated to python3.7 273 | - Updated models 274 | - Updated tensorflow to 1.14.0 275 | - Updated pytest to 5.0.1 276 | 277 | # v0.3.3 (2019-06-13) 278 | 279 | - Updated models and included printers 280 | - Renamed PoseidonML to NetworkML 281 | - Updated pytest to 4.6.3 282 | 283 | # v0.3.2 (2019-05-31) 284 | 285 | - Updated numpy to 1.16.3 286 | - Updated pytest-cov to 2.7.1 287 | - Updated pytest to 4.5.0 288 | - Reduce places that Tensorflow is imported 289 | - Made it possible to run classifications on CPUs that don't support AVX 290 | 291 | # v0.3.1 (2019-04-18) 292 | 293 | - Updated Tensorflow imports for new deprecations 294 | - Updated pika to 1.0.1 295 | - Removed a bunch of duplicated code to keep the code base cleaner 296 | - Added a bunch of tests to get coverage up to 90% 297 | - Updated pytest to 4.4.1 298 | - Removed the use of md5 and replaced it with sha224 299 | 300 | # v0.3.0 (2019-04-04) 301 | 302 | - Major rewrite and restructuring of the code base, but same functionality 303 | 304 | # v0.2.10 (2019-03-22) 305 | 306 | - Changed the default for Rabbit to not be used 307 | - Changed the environment variable for Rabbit from SKIP_RABBIT to RABBIT 308 | - Improved logging output for summarizing evaluation results of multiple PCAPs 309 | - Updated versions of pika, pytest, redis, and scikit-learn 310 | - Fixed a bug that was preventing training the SoSModel 311 | - Added some more test coverage 312 | - Updated the trained models and labels 313 | 314 | # v0.2.9 (2019-03-08) 315 | 316 | - Updated tensorflow from 1.12.0 to 1.13.1. 317 | - Updated numpy from 1.16.1 to 1.16.2. 318 | - Miscellaneous error checking and spacing corrections. 319 | 320 | # v0.2.8 (2019-02-22) 321 | 322 | - Updated pytest to 4.3.0 from 4.2.0. 323 | - Cleaned up some code issues as pointed out by Codacy. 324 | - Minor miscellaneous bugfixes to support running training natively. 325 | 326 | # v0.2.7 (2019-02-09) 327 | 328 | - Provided a way to run DeviceClassifier training and testing scripts from command line. 329 | - Cleaned up some unused code and consolidated common operations into utils and model class. 330 | - Fixed issue where Makefile built the OneLayer training container when building the test one. 331 | - Updated redis to 3.1.0 332 | - Updated numpy to 1.16.1 333 | 334 | # v0.2.6 (2019-01-25) 335 | 336 | - Updated numpy to 1.16.0 337 | - Updated pika to 0.13.0 338 | - Included a conda yml file for a standalone/dev environment, and new Makefile options to build it. 
339 | 340 | # v0.2.5 (2019-01-11) 341 | 342 | - models have been retrained to fix a warning about invalid results when evaluating a pcap 343 | - some unused code and module has been removed 344 | - upgraded pytest to 4.1.0 and pytest-cov to 2.6.1 345 | 346 | # v0.2.4 (2018-12-21) 347 | 348 | - upgraded scikit-learn to 0.20.2 349 | - removed scipy 350 | - cleaned up requirements.txt and setup.py 351 | - fixed issue where redis was throwing error when saving decisions 352 | - fixed error in eval_onelayer that was using nonexistent key 353 | - Make train/eval/test process consistent for all models 354 | - Fixed path error specific to python 3.5 that occurred when processing PCAP files 355 | - PCAP directories can now be used when running model evals 356 | 357 | # v0.2.3 (2018-12-14) 358 | 359 | - upgraded pytest to 4.0.2 360 | - upgraded scikit-learn to 0.20.1 361 | - improved README documentation 362 | - upgraded redis to 3.0.1 363 | - added pcap directory support 364 | - re-enabled the behavior model 365 | - includes the trained behavior model 366 | - fixed hardcoded onelayer pickle file in randomforest 367 | - fixed missing labels 368 | - simplified rabbit connection 369 | - replaced deprecated randomized logistic regression with random forest 370 | 371 | # v0.2.2 (2018-10-22) 372 | 373 | - upgraded pytest to 3.9.1 374 | - fixed a NoneType error when multiplying 375 | - fixed an issue where the config file wasn't being read properly 376 | - abstracted away the code to read the config file into one place 377 | 378 | # v0.2.1 (2018-10-02) 379 | 380 | - lots of cleanup of duplicated code 381 | - upgraded tensorflow to 1.11.0 382 | - upgraded scikit-learn to 0.20.0 383 | - updated the model 384 | 385 | # v0.2.0 (2018-09-22) 386 | 387 | - moved a bunch of duplicated code into common utils 388 | 389 | # v0.1.9 (2018-09-21) 390 | 391 | - fixed issue where results were not getting sent to rabbitmq or stored in redis 392 | - cleaned up cruft in OneLayer Eval 393 | - moved OneLayer Eval code into a class to reduce duplication 394 | 395 | # v0.1.8 (2018-09-10) 396 | 397 | - upgraded pytest to 3.8.0 398 | - upgraded pytest-cov to 2.6.0 399 | - upgraded tensorflow to 1.10.1 400 | - made all print statements logger statements 401 | - sends messages to rabbitmq now even if not enough sessions 402 | - stores normal/abnormal results in redis now 403 | - fixed performance issue where evaluation would take a long time 404 | - updated the model 405 | 406 | # v0.1.7 (2018-08-24) 407 | 408 | - upgraded pytest to 3.7.2 409 | - upgraded numpy to 1.15.1 410 | 411 | # v0.1.6 (2018-08-10) 412 | 413 | - updated model 414 | - upgraded pytest to 3.7.1 415 | - upgraded scikit-learn to 0.19.2 416 | - linting 417 | 418 | # v0.1.5 (2018-07-27) 419 | 420 | - fixes pairs issue when checking private addresses 421 | - fixes the models path for running in a container 422 | - improve dockerfile builds 423 | - upgraded pika to 0.12.0 424 | - upgraded scipy to 1.1.0 425 | - upgraded numpy to 1.14.5 426 | - upgraded tensorflow to 1.9.0 427 | - fixed vent template 428 | - added some initial tests 429 | - re-trained the onelayer model with improved accuracy 430 | - reduced the number of labels for onelayer to 6 431 | - improvements for developing on poseidonml 432 | 433 | # v0.1.4 (2018-07-13) 434 | 435 | - initial utility release 436 | -------------------------------------------------------------------------------- /networkml/featurizers/funcs/host.py: -------------------------------------------------------------------------------- 
1 | import ipaddress 2 | 3 | import netaddr 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from networkml.featurizers.features import Features 8 | 9 | 10 | MAC_BCAST = netaddr.EUI('FF-FF-FF-FF-FF-FF') 11 | ETH_TYPE_ARP = 0x806 12 | ETH_TYPE_IP = 0x800 13 | ETH_TYPE_IPV6 = 0x86DD 14 | ETH_TYPE_IPX = 0x8137 15 | ETH_IP_TYPES = frozenset((ETH_TYPE_ARP, ETH_TYPE_IP, ETH_TYPE_IPV6)) 16 | WK_IP_PROTOS = ('tcp', 'udp', 'icmp', 'arp', 'icmpv6', 'gre', 'esp', 'ah') 17 | WK_IP_PROTOS_INDEX = {WK_IP_PROTOS.index(i): i for i in WK_IP_PROTOS} 18 | TCP_UDP_PROTOS = { 19 | 6: 'tcp', 20 | 17: 'udp', 21 | } 22 | 23 | 24 | class HostBase: 25 | 26 | CALC_COL_NAMES = ( 27 | ('frame.len', 'frame_len'), 28 | ('frame.time_delta_displayed', 'time_delta')) 29 | CALC_COL_FUNCS = ( 30 | ('max', lambda x: x.max()), 31 | ('min', lambda x: x.min()), 32 | ('count', lambda x: x.count()), 33 | ('total', lambda x: x.sum()), 34 | ('average', lambda x: x.mean()), 35 | ('median', lambda x: x.median()), 36 | ('variance', lambda x: x.var()), 37 | ('25q', lambda x: x.quantile(0.25)), 38 | ('75q', lambda x: x.quantile(0.75))) 39 | # https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xml 40 | # TODO: enumerate most common ports from survey (complete indicator matrix too expensive) 41 | WK_PRIV_TCPUDP_PORTS = frozenset( 42 | [22, 23, 25, 53, 67, 68, 69, 80, 88, 110, 123, 137, 138, 139, 143, 161, 443, 631]) 43 | WK_NONPRIV_TCPUDP_PORTS = frozenset( 44 | [1900, 2375, 2376, 5222, 5349, 5353, 5354, 5349, 5357, 6653]) 45 | DROP_PROTOS = frozenset( 46 | ['frame', 'data', 'eth', 'ip', 'ipv6']) 47 | 48 | def _mac(self, mac): 49 | return netaddr.EUI(int(mac), dialect=netaddr.mac_unix_expanded) 50 | 51 | def _is_unicast(self, mac): 52 | mac_val = self._mac(mac) 53 | if mac_val == MAC_BCAST or mac_val.packed[0] & 1: 54 | return False 55 | return True 56 | 57 | def _numericintset(self, nums): 58 | if nums is not None: 59 | return frozenset(int(x) for x in nums if x is not None and pd.notna(x)) 60 | return frozenset() 61 | 62 | def _get_ip(self, row, cols): 63 | ipv = row['ip.version'] 64 | if not pd.isnull(ipv): 65 | ipv = int(ipv) 66 | if ipv == 4: 67 | prefix = 'ip' 68 | else: 69 | prefix = 'ipv6' 70 | for col in cols: 71 | val = row['.'.join((prefix, col))] 72 | if not pd.isnull(val): 73 | return ipaddress.ip_address(int(val)) 74 | return None 75 | 76 | def _get_src_ip(self, row): 77 | return self._get_ip(row, ('src', 'src_host')) 78 | 79 | def _get_dst_ip(self, row): 80 | return self._get_ip(row, ('dst', 'dst_host')) 81 | 82 | def _get_flags(self, mac_df, col_name, decode_map, suffix=None, field_name=None): 83 | try: 84 | col = mac_df[col_name] 85 | unique_flags = self._numericintset(col.unique()) 86 | except KeyError: 87 | unique_flags = [0] 88 | decoded_flags = set() 89 | for bit, decoded_flag in decode_map.items(): 90 | bitval = 2**bit 91 | for flags in sorted(filter(lambda x: x >= bitval, unique_flags)): 92 | if flags & bitval: 93 | decoded_flags.add(decoded_flag) 94 | if field_name is None: 95 | field_name = col_name.replace('.', '_') 96 | if suffix is not None: 97 | return {'tshark_%s_%s_%s' % ( 98 | field_name, decoded_flag, suffix): int(decoded_flag in decoded_flags) 99 | for decoded_flag in decode_map.values()} 100 | return {'tshark_%s_%s' % ( 101 | field_name, decoded_flag): int(decoded_flag in decoded_flags) 102 | for decoded_flag in decode_map.values()} 103 | 104 | def _tshark_flags(self, suffix, mac_df): 105 | mac_row_flags = {} 106 | for func in ( 107 | lambda x, y: 
self._get_flags(x, 'ip.dsfield', { 108 | 0: 'ecn0', 1: 'ecn1', 2: 'dscp0', 3: 'dscp1', 4: 'dscp2', 5: 'dscp3', 6: 'dscp4', 7: 'dscp5'}, suffix=y), 109 | lambda x, y: self._get_flags(x, 'ip.flags', { 110 | 0: 'fin', 1: 'syn', 2: 'rst', 3: 'psh', 4: 'ack', 5: 'urg', 6: 'ece', 7: 'cwr', 8: 'ns'}, suffix=y), 111 | lambda x, y: self._get_flags(x, 'tcp.flags', { 112 | 0: 'fin', 1: 'syn', 2: 'rst', 3: 'psh', 4: 'ack', 5: 'urg', 6: 'ece', 7: 'cwr', 8: 'ns'}, suffix=y), 113 | ): 114 | mac_row_flags.update(func(mac_df, suffix)) 115 | return mac_row_flags 116 | 117 | def _lowest_ip_proto_port(self, mac_df, ip_proto): 118 | if not mac_df.empty: 119 | src = mac_df['%s.srcport' % ip_proto] 120 | dst = mac_df['%s.dstport' % ip_proto] 121 | if src.count() and dst.count(): 122 | return self._numericintset(np.minimum(src, dst).unique()) # pylint: disable=no-member 123 | return frozenset() 124 | 125 | def _tshark_ports(self, suffix, mac_df): 126 | mac_row_ports = {} 127 | 128 | def port_priv(port): 129 | return port < 1024 130 | 131 | for ip_proto_num, ip_proto in TCP_UDP_PROTOS.items(): 132 | proto_df = mac_df[mac_df['ip.proto']==ip_proto_num] 133 | lowest_ports = self._lowest_ip_proto_port(proto_df, ip_proto) 134 | for field_name, ports, wk_ports in ( 135 | ('priv', {port for port in lowest_ports if port_priv( 136 | port)}, self.WK_PRIV_TCPUDP_PORTS), 137 | ('nonpriv', {port for port in lowest_ports if not port_priv( 138 | port)}, self.WK_NONPRIV_TCPUDP_PORTS), 139 | ): 140 | port_flags = {port: int(port in ports) for port in wk_ports} 141 | port_flags.update( 142 | {'other': int(bool(lowest_ports) and not ports.issubset(wk_ports))}) 143 | mac_row_ports.update({ 144 | 'tshark_%s_%s_port_%s_%s' % (ip_proto, field_name, port, suffix): present for port, present in port_flags.items()}) 145 | return mac_row_ports 146 | 147 | def _tshark_ratio_ports(self, mac_df): 148 | mac_row_ports = {} 149 | 150 | def calc_ratio(src_count, dst_count): 151 | packet_ratio = 0 152 | if src_count is not None and dst_count is not None: 153 | if dst_count > 0: 154 | packet_ratio = src_count / dst_count 155 | elif src_count > 0: 156 | packet_ratio = 1 157 | return packet_ratio 158 | 159 | 160 | for ip_proto_num, ip_proto in TCP_UDP_PROTOS.items(): 161 | proto_df = mac_df[mac_df['ip.proto']==ip_proto_num] 162 | src = pd.DataFrame(columns=['%s.srcport' % ip_proto]) 163 | dst = pd.DataFrame(columns=['%s.dstport' % ip_proto]) 164 | if not proto_df.empty: 165 | try: 166 | src = proto_df['%s.srcport' % ip_proto] 167 | dst = proto_df['%s.dstport' % ip_proto] 168 | except KeyError: 169 | pass 170 | for field_name, wk_ports, port_src, port_dst in ( 171 | ('priv', self.WK_PRIV_TCPUDP_PORTS, 172 | src[src <= 1023], dst[dst <= 1023]), 173 | ('nonpriv', self.WK_NONPRIV_TCPUDP_PORTS, 174 | src[src > 1023], dst[dst > 1023])): 175 | src_values = port_src[src.isin(wk_ports)] 176 | dst_values = port_dst[dst.isin(wk_ports)] 177 | src_counts = {} 178 | if not src_values.empty: 179 | src_counts = src_values.value_counts() 180 | dst_counts = {} 181 | if not dst_values.empty: 182 | dst_counts = dst_values.value_counts() 183 | for port in wk_ports: 184 | src_count = src_counts.get(port, None) 185 | dst_count = dst_counts.get(port, None) 186 | mac_row_ports.update({ 187 | 'tshark_%s_%s_packet_ratio_io_port_%s' % (ip_proto, field_name, port): calc_ratio(src_count, dst_count)}) 188 | src_values = port_src[~port_src.isin(wk_ports)] 189 | src_count = 0 190 | if not src_values.empty: 191 | src_count = src_values.value_counts().sum() 192 | dst_values = 
port_dst[~port_dst.isin(wk_ports)] 193 | dst_count = 0 194 | if not dst_values.empty: 195 | dst_count = dst_values.value_counts().sum() 196 | mac_row_ports.update({ 197 | 'tshark_%s_%s_packet_ratio_io_port_%s' % (ip_proto, field_name, 'other'): calc_ratio(src_count, dst_count)}) 198 | return mac_row_ports 199 | 200 | def _tshark_ipversions(self, mac_df): 201 | try: 202 | ip_versions = self._numericintset(mac_df['ip.version'].unique()) 203 | except AttributeError: 204 | ip_versions = frozenset() 205 | return {'tshark_ipv%u' % v: int(v in ip_versions) for v in (4, 6)} 206 | 207 | def _tshark_non_ip(self, mac_df): 208 | try: 209 | eth_types = self._numericintset(mac_df['eth.type'].unique()) 210 | except AttributeError: 211 | eth_types = frozenset() 212 | return { 213 | 'tshark_ipx': int(ETH_TYPE_IPX in eth_types), 214 | 'tshark_nonip': int(bool(eth_types - ETH_IP_TYPES)), 215 | } 216 | 217 | def _tshark_both_private_ip(self, mac_df): 218 | try: 219 | both_private_ip = int(mac_df['_both_private_ip'].max() == 1) 220 | except KeyError: 221 | both_private_ip = 0 222 | return { 223 | 'tshark_both_private_ip': both_private_ip, 224 | } 225 | 226 | def _tshark_ipv4_multicast(self, mac_df): 227 | try: 228 | ipv4_multicast = int(mac_df['_ipv4_multicast'].max() == 1) 229 | except KeyError: 230 | ipv4_multicast = 0 231 | return { 232 | 'tshark_ipv4_multicast': ipv4_multicast, 233 | } 234 | 235 | def _tshark_wk_ip_protocol(self, mac_df): 236 | return self._get_flags(mac_df, '_protos_int', WK_IP_PROTOS_INDEX, suffix=None, field_name='wk_ip_protocol') 237 | 238 | def _tshark_vlan_id(self, mac_df): 239 | return { 240 | 'tshark_tagged_vlan': int(pd.notna(mac_df['vlan.id'].max())) 241 | } 242 | 243 | def _tshark_frame_epoch(self, mac_df): 244 | return { 245 | 'tshark_frame_epoch': float(mac_df['frame.time_epoch'].max()) 246 | } 247 | 248 | def _tshark_unique_ips(self, mac, mac_df): 249 | srcips = mac_df[mac_df['eth.src'] == mac]['_srcip'] 250 | dstips = mac_df[mac_df['eth.src'] == mac]['_dstip'] 251 | return { 252 | 'tshark_srcips': list(set(srcips.unique().tolist()) - {'None'}), 253 | 'tshark_unique_srcips': srcips.nunique(), 254 | 'tshark_unique_dstips': dstips.nunique(), 255 | } 256 | 257 | def _calc_cols(self, mac, mac_df): 258 | mac_row = {} 259 | for suffix, suffix_func in ( 260 | ('out', lambda x: mac_df[mac_df['eth.src'] == x]), 261 | ('in', lambda x: mac_df[mac_df['eth.src'] != x])): 262 | try: 263 | suffix_df = suffix_func(mac) 264 | except KeyError: 265 | continue 266 | for col_name, field_name in self.CALC_COL_NAMES: 267 | col = suffix_df[col_name] 268 | for calc_name, calc_func in self.CALC_COL_FUNCS: 269 | calc_col = 'tshark_%s_%s_%s' % ( 270 | calc_name, field_name, suffix) 271 | val = calc_func(col) 272 | if pd.isnull(val): 273 | val = 0 274 | mac_row.update({calc_col: val}) 275 | for func in ( 276 | self._tshark_flags, 277 | self._tshark_ports): 278 | mac_row.update(func(suffix, suffix_df)) 279 | for func in ( 280 | self._tshark_ipversions, 281 | self._tshark_non_ip, 282 | self._tshark_both_private_ip, 283 | self._tshark_ipv4_multicast, 284 | self._tshark_wk_ip_protocol, 285 | self._tshark_vlan_id, 286 | self._tshark_frame_epoch, 287 | self._tshark_ratio_ports): 288 | mac_row.update(func(mac_df)) 289 | mac_row.update(self._tshark_unique_ips(mac, mac_df)) 290 | return mac_row 291 | 292 | def _calc_mac_row(self, mac, mac_df): 293 | mac_row = {'host_key': str(self._mac(mac))} 294 | mac_row.update(self._calc_cols(mac, mac_df)) 295 | return mac_row 296 | 297 | def _host_key(self, row): 298 | raise 
NotImplementedError 299 | 300 | def _df_ip_flags(self, ip_src, ip_dst): 301 | both_private_ip = 0 302 | ipv4_multicast = 0 303 | if not pd.isnull(ip_src) and not pd.isnull(ip_dst): 304 | both_private_ip = int(ip_src.is_private and ip_dst.is_private) 305 | ipv4_multicast = int(ip_dst.version == 4 and ip_dst.is_multicast) 306 | return (both_private_ip, ipv4_multicast) 307 | 308 | def _encode_df_proto_flags(self, short_row_keys, frame_protocols): 309 | if frame_protocols: 310 | short_frame_protocols = frozenset(frame_protocols.split(':')) 311 | else: 312 | short_frame_protocols = {} 313 | all_protos = short_row_keys.union( 314 | short_frame_protocols) - self.DROP_PROTOS 315 | all_protos_int = 0 316 | for proto in all_protos.intersection(WK_IP_PROTOS): 317 | index = WK_IP_PROTOS.index(proto) 318 | all_protos_int += 2**index 319 | return all_protos_int 320 | 321 | def _df_proto_flags(self, row): 322 | short_row_keys = frozenset(x.split('.')[0] for x, y in row.items( 323 | ) if not pd.isnull(y) and not x.startswith('_')) 324 | return self._encode_df_proto_flags(short_row_keys, row['frame.protocols']) 325 | 326 | def _tshark_all(self, df, srcmacid): 327 | print('calculating intermediates', end='', flush=True) 328 | df['_host_key'], df['_srcip'], df['_dstip'], df['_both_private_ip'], df['_ipv4_multicast'], df['_protos_int'] = zip( 329 | *df.apply(self._host_key, axis=1)) 330 | eth_srcs = frozenset(df['eth.src'].unique()) 331 | eth_dsts = frozenset(df['eth.dst'].unique()) 332 | all_unicast_macs = frozenset( 333 | mac for mac in eth_srcs.union(eth_dsts) if self._is_unicast(mac)) 334 | host_keys = df['_host_key'].unique() 335 | host_keys_count = len(host_keys) 336 | print('.%u MACs, %u sessions' % 337 | (len(all_unicast_macs), host_keys_count), end='', flush=True) 338 | if srcmacid: 339 | minsrcipmac = df.groupby(['eth.src'])[ 340 | '_srcip'].nunique().idxmin(axis=0) 341 | assert minsrcipmac in all_unicast_macs 342 | print('.MAC %s has minimum number of source IPs, selected as canonical source' % 343 | self._mac(minsrcipmac), end='', flush=True) 344 | all_unicast_macs = {minsrcipmac} 345 | mac_rows = [] 346 | for i, mac in enumerate(all_unicast_macs, start=1): 347 | mac_df = df[(df['eth.src'] == mac) | (df['eth.dst'] == mac)] 348 | # If just one MAC, don't need groupby on host key. 349 | if len(all_unicast_macs) == 1: 350 | mac_rows.append(self._calc_mac_row(mac, mac_df)) 351 | else: 352 | s = 0 353 | for _, key_df in mac_df.groupby('_host_key'): 354 | s += 1 355 | if s % 100 == 0: 356 | print('.MAC %u/%u %.1f%%' % (i, len(all_unicast_macs), 357 | s / len(host_keys) * 100), end='', flush=True) 358 | mac_rows.append(self._calc_mac_row(mac, key_df)) 359 | print('.MAC %u/%u 100%%.' 
% 360 | (i, len(all_unicast_macs)), end='', flush=True) 361 | return mac_rows 362 | 363 | 364 | class Host(HostBase, Features): 365 | 366 | def _host_key(self, row): 367 | ip_src = self._get_src_ip(row) 368 | ip_dst = self._get_dst_ip(row) 369 | both_private_ip, ipv4_multicast = self._df_ip_flags(ip_src, ip_dst) 370 | protos_int = self._df_proto_flags(row) 371 | return (0, str(ip_src), str(ip_dst), both_private_ip, ipv4_multicast, protos_int) 372 | 373 | def host_tshark_all(self, df, parsed_args): 374 | return self._tshark_all(df, parsed_args.srcmacid) 375 | 376 | 377 | class SessionHost(HostBase, Features): 378 | 379 | def _host_key(self, row): 380 | eth_src = row['eth.src'] 381 | eth_dst = row['eth.dst'] 382 | ip_src = self._get_src_ip(row) 383 | ip_dst = self._get_dst_ip(row) 384 | both_private_ip, ipv4_multicast = self._df_ip_flags(ip_src, ip_dst) 385 | protos_int = self._df_proto_flags(row) 386 | if not pd.isnull(ip_src) and not pd.isnull(ip_dst): 387 | ip_proto = TCP_UDP_PROTOS.get(row['ip.version'], None) 388 | if ip_proto: 389 | src_port = row['%s.srcport' % ip_proto] 390 | dst_port = row['%s.dstport' % ip_proto] 391 | if ip_src > ip_dst: 392 | key = (ip_proto, eth_src, ip_src, 393 | src_port, eth_dst, ip_dst, dst_port) 394 | else: 395 | key = (ip_proto, eth_dst, ip_dst, 396 | dst_port, eth_src, ip_src, src_port) 397 | else: 398 | key = sorted([(eth_src, ip_src), (eth_dst, ip_dst)]) 399 | else: 400 | key = (row['eth.type'],) + tuple(sorted((eth_src, eth_dst))) 401 | return (hash('-'.join([str(x) for x in key])), str(ip_src), str(ip_dst), both_private_ip, ipv4_multicast, protos_int) 402 | 403 | def sessionhost_tshark_all(self, df, parsed_args): 404 | return self._tshark_all(df, parsed_args.srcmacid) 405 | -------------------------------------------------------------------------------- /networkml/parsers/pcap_to_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import concurrent.futures 3 | import csv 4 | import functools 5 | import json 6 | import logging 7 | import ntpath 8 | import os 9 | import pathlib 10 | import shlex 11 | import subprocess 12 | import tempfile 13 | from copy import deepcopy 14 | 15 | import pyshark 16 | 17 | from networkml.helpers.gzipio import gzip_reader 18 | from networkml.helpers.gzipio import gzip_writer 19 | 20 | 21 | class PCAPToCSV(): 22 | 23 | def __init__(self, raw_args=None): 24 | self.logger = logging.getLogger(__name__) 25 | self.PROTOCOLS = ['<IP Layer>', 26 | '<ETH Layer>', 27 | '<TCP Layer>', 28 | '<UDP Layer>', 29 | '<ICMP Layer>', 30 | '<ICMPV6 Layer>', 31 | '<DNS Layer>', 32 | '<DHCP Layer>', 33 | '<DHCPV6 Layer>', 34 | '<ARP Layer>', 35 | '<IPV6 Layer>', 36 | '<TLS Layer>'] 37 | self.raw_args = raw_args 38 | 39 | @staticmethod 40 | def ispcap(pathfile): 41 | for ext in ('pcap', 'pcapng', 'dump', 'capture'): 42 | if pathfile.endswith(''.join(('.', ext))): 43 | return True 44 | return False 45 | 46 | @staticmethod 47 | def parse_args(raw_args=None): 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument( 50 | 'path', help='path to a single pcap file, or a directory of pcaps to parse') 51 | parser.add_argument('--combined', '-c', action='store_true', 52 | help='write out all records from all pcaps into a single gzipped csv file') 53 | parser.add_argument('--engine', '-e', choices=['pyshark', 'tshark', 'host'], 54 | default='tshark', help='engine to use to process the PCAP file (default=tshark)') 55 | parser.add_argument('--level', '-l', choices=['packet', 'flow', 'host'], 56 | default='packet', help='level to make the output records (default=packet)') 57 | parser.add_argument('--output', '-o', default=None, 
58 | help='path to write out gzipped csv file or directory for gzipped csv files') 59 | parser.add_argument('--threads', '-t', default=1, type=int, 60 | help='number of async threads to use (default=1)') 61 | parser.add_argument('--verbose', '-v', choices=[ 62 | 'DEBUG', 'INFO', 'WARNING', 'ERROR'], default='INFO', help='logging level (default=INFO)') 63 | parsed_args = parser.parse_args(raw_args) 64 | return parsed_args 65 | 66 | @staticmethod 67 | def get_csv_header(dict_fp): 68 | header_all = set() 69 | with gzip_reader(dict_fp) as f_in: 70 | for line in f_in: 71 | header_all.update(json.loads(line.strip()).keys()) 72 | header = [] 73 | for key in header_all: 74 | if key[0].isalpha() or key[0] == '_': 75 | header.append(key) 76 | return header 77 | 78 | @staticmethod 79 | def combine_csvs(out_paths, combined_path): 80 | # First determine the field names from the top line of each input file 81 | fieldnames = {'filename'} 82 | for filename in out_paths: 83 | with gzip_reader(filename) as f_in: 84 | reader = csv.reader(f_in) 85 | fieldnames.update({header for header in next(reader)}) 86 | 87 | # Then copy the data 88 | with gzip_writer(combined_path) as f_out: 89 | writer = csv.DictWriter(f_out, fieldnames=list(fieldnames)) 90 | writer.writeheader() 91 | for filename in out_paths: 92 | with gzip_reader(filename) as f_in: 93 | reader = csv.DictReader(f_in) 94 | for line in reader: 95 | line['filename'] = filename.split( 96 | '/')[-1].split('csv.gz')[0] 97 | writer.writerow(line) 98 | PCAPToCSV.cleanup_files([filename]) 99 | 100 | @staticmethod 101 | def cleanup_files(paths): 102 | for fi in paths: 103 | if os.path.exists(fi): 104 | os.remove(fi) 105 | 106 | def get_pyshark_packet_data(self, pcap_file, dict_fp): 107 | all_protocols = set() 108 | 109 | pcap_file_short = ntpath.basename(pcap_file) 110 | with gzip_writer(dict_fp) as f_out: 111 | with pyshark.FileCapture(pcap_file, 112 | use_json=True, 113 | include_raw=True, 114 | keep_packets=False, 115 | custom_parameters=['-o', 'tcp.desegment_tcp_streams:false', '-n']) as cap: 116 | for packet in cap: 117 | packet_dict = {} 118 | packet_dict['filename'] = pcap_file_short 119 | frame_info = packet.frame_info._all_fields 120 | for key in frame_info: 121 | packet_dict[key] = frame_info[key] 122 | # can overflow the field size for csv 123 | #packet_dict['raw_packet'] = packet.get_raw_packet() 124 | layers = str(packet.layers) 125 | packet_dict['layers'] = layers 126 | str_layers = layers[1:-1].split(', ') 127 | for str_layer in str_layers: 128 | # ignore raw layers 129 | if 'RAW' not in str_layer: 130 | all_protocols.add(str_layer) 131 | # only include specified protocols due to unknown parsing for some layers 132 | if str_layer in self.PROTOCOLS: 133 | layer_info = getattr(packet, str_layer.split()[ 134 | 0][1:].lower())._all_fields 135 | # check for nested dicts, one level deep 136 | for key in layer_info: 137 | # DNS doesn't parse well 138 | if isinstance(layer_info[key], dict) and str_layer != '<DNS Layer>': 139 | for inner_key in layer_info[key]: 140 | packet_dict[inner_key] = layer_info[key][inner_key] 141 | else: 142 | packet_dict[key] = layer_info[key] 143 | # clean up records 144 | packet_dict_copy = deepcopy(packet_dict) 145 | keys = packet_dict_copy.keys() 146 | for key in keys: 147 | if not key[0].isalpha() or key == 'tcp.payload_raw' or key == 'tcp.payload': 148 | del packet_dict[key] 149 | f_out.write(json.dumps(packet_dict) + '\n') 150 | 151 | for protocol in self.PROTOCOLS: 152 | if protocol in all_protocols: 
all_protocols.remove(protocol) 154 | if all_protocols: 155 | self.logger.warning( 156 | f'Found the following other layers in {pcap_file_short} that were not added to the CSV: {all_protocols}') 157 | 158 | def get_tshark_conv_data(self, pcap_file, dict_fp): 159 | # TODO (add a summary of other packets with protocols?) 160 | output = '' 161 | try: 162 | # TODO perhaps more than just tcp/udp in the future 163 | options = '-n -q -z conv,tcp -z conv,udp' 164 | output = subprocess.check_output(shlex.split( 165 | ' '.join(['tshark', '-r', pcap_file, options]))) 166 | output = output.decode('utf-8') 167 | except Exception as e: # pragma: no cover 168 | self.logger.error(f'{e}') 169 | 170 | in_block = False 171 | name = None 172 | results = {} 173 | for line in output.split('\n'): 174 | if line.startswith('==='): 175 | if in_block: 176 | in_block = False 177 | name = None 178 | continue 179 | else: 180 | in_block = True 181 | continue 182 | if in_block: 183 | if not name: 184 | name = ''.join(line.split(':')).strip() 185 | results[name] = '' 186 | continue 187 | elif not line.startswith('Filter:') and line != '': 188 | results[name] += line + '\n' 189 | 190 | with gzip_writer(dict_fp) as f_out: 191 | for result in results.keys(): 192 | if 'Conversations' in result: 193 | transport_proto = result.split()[0] 194 | # handle conversation parsing 195 | for line in results[result].split('\n'): 196 | if line == '' or line.startswith(' '): 197 | # header or padding, dicard 198 | continue 199 | else: 200 | # TODO perhaps additional features can be extracted for flows from tshark 201 | src, _, dst, frames_l, bytes_l, frames_r, bytes_r, frames_total, bytes_total, rel_start, duration = line.split() 202 | conv = {'Source': src.rsplit(':', 1)[0], 203 | 'Source Port': src.rsplit(':', 1)[1], 204 | 'Destination': dst.rsplit(':', 1)[0], 205 | 'Destination Port': dst.rsplit(':', 1)[1], 206 | 'Transport Protocol': transport_proto, 207 | 'Frames to Source': frames_l, 208 | 'Bytes to Source': bytes_l, 209 | 'Frames to Destination': frames_r, 210 | 'Bytes to Destination': bytes_r, 211 | 'Total Frames': frames_total, 212 | 'Total Bytes': bytes_total, 213 | 'Relative Start': rel_start, 214 | 'Duration': duration} 215 | f_out.write(json.dumps(conv) + '\n') 216 | 217 | @staticmethod 218 | @functools.lru_cache() 219 | def good_json_key(key): 220 | return (key[0].isalpha() or key[0] == '_') and ';' not in key and '(' not in key and '\\' not in key and '{' not in key and '<' not in key and '+' not in key 221 | 222 | def flatten_json(self, item): 223 | flattened_dict = {} 224 | 225 | def flatten(key, value): 226 | if isinstance(value, list): 227 | for i, sub_item in enumerate(value): 228 | flatten(str(i), sub_item) 229 | elif isinstance(value, dict): 230 | sub_keys = value.keys() 231 | for sub_key in sub_keys: 232 | flatten(sub_key, value[sub_key]) 233 | else: 234 | # remove junk 235 | if self.good_json_key(key): 236 | # limit field size for csv 237 | if (value and len(value) < 131072) or not value: 238 | flattened_dict[key] = value 239 | 240 | flatten('', item) 241 | return flattened_dict 242 | 243 | def json_packet_records(self, process): 244 | json_buffer = [] 245 | 246 | def _recordize(): 247 | return json.loads('\n'.join(json_buffer)) 248 | 249 | depth = 0 250 | while True: 251 | json_line = process.stdout.readline().decode(encoding='utf-8', errors='ignore') 252 | if json_line == '' and process.poll() is not None: 253 | break 254 | if not json_line.startswith(' '): 255 | continue 256 | json_line = json_line.strip() 257 | 
bracket_line = json_line.rstrip(',') 258 | if bracket_line.endswith('}'): 259 | depth -= 1 260 | elif bracket_line.endswith('{'): 261 | depth += 1 262 | if depth == 0: 263 | if bracket_line: 264 | json_buffer.append(bracket_line) 265 | if json_buffer: 266 | yield _recordize() 267 | json_buffer = [] 268 | else: 269 | if json_line: 270 | json_buffer.append(json_line) 271 | 272 | def get_tshark_packet_data(self, pcap_file, dict_fp): 273 | options = '-n -V -Tjson' 274 | try: 275 | with subprocess.Popen(shlex.split( 276 | ' '.join(['tshark', '-r', pcap_file, options])), stdout=subprocess.PIPE) as process: 277 | with gzip_writer(dict_fp) as f_out: 278 | for item in self.json_packet_records(process): 279 | f_out.write(json.dumps(self.flatten_json(item)) + '\n') 280 | except Exception as e: # pragma: no cover 281 | self.logger.error(f'{e}') 282 | 283 | def get_tshark_host_data(self, pcap_file, dict_fp): 284 | # TODO 285 | raise NotImplementedError('To be implemented') 286 | 287 | def write_dict_to_csv(self, dict_fp, out_file): 288 | header = PCAPToCSV.get_csv_header(dict_fp) 289 | with gzip_writer(out_file) as f_out: 290 | writer = csv.DictWriter(f_out, fieldnames=header) 291 | writer.writeheader() 292 | try: 293 | with gzip_reader(dict_fp) as f_in: 294 | for line in f_in: 295 | writer.writerow(json.loads(line.strip())) 296 | except Exception as e: # pragma: no cover 297 | self.logger.error(f'Failed to write to CSV because: {e}') 298 | 299 | def parse_file(self, level, in_file, out_file, engine): 300 | self.logger.info(f'Processing {in_file}') 301 | with tempfile.TemporaryDirectory() as tmpdir: 302 | dict_fp = os.path.join(tmpdir, os.path.basename(in_file)) 303 | if level == 'packet': 304 | if engine == 'tshark': 305 | # option for tshark as it's much faster 306 | self.get_tshark_packet_data(in_file, dict_fp) 307 | elif engine == 'pyshark': 308 | # using pyshark to get everything possible 309 | self.get_pyshark_packet_data(in_file, dict_fp) 310 | elif level == 'flow': 311 | # using tshark conv,tcp and conv,udp filters 312 | self.get_tshark_conv_data(in_file, dict_fp) 313 | elif level == 'host': 314 | # TODO unknown what should be in this, just the overarching stats? 315 | raise NotImplementedError('To be implemented') 316 | self.write_dict_to_csv(dict_fp, out_file) 317 | PCAPToCSV.cleanup_files([dict_fp]) 318 | 319 | def process_files(self, threads, level, in_paths, out_paths, engine): 320 | num_files = len(in_paths) 321 | failed_paths = [] 322 | finished_files = 0 323 | # corner case so it works in jupyterlab 324 | if threads < 2: 325 | for i in range(len(in_paths)): 326 | try: 327 | finished_files += 1 328 | self.parse_file(level, in_paths[i], out_paths[i], engine) 329 | self.logger.info( 330 | f'Finished {in_paths[i]}. 
{finished_files}/{num_files} PCAPs done.') 331 | except Exception as e: # pragma: no cover 332 | self.logger.error( 333 | f'{in_paths[i]} generated an exception: {e}') 334 | failed_paths.append(out_paths[i]) 335 | else: 336 | with concurrent.futures.ProcessPoolExecutor(max_workers=threads) as executor: 337 | future_to_parse = {executor.submit( 338 | self.parse_file, level, in_paths[i], out_paths[i], engine): i for i in range(len(in_paths))} 339 | for future in concurrent.futures.as_completed(future_to_parse): 340 | path = future_to_parse[future] 341 | try: 342 | finished_files += 1 343 | future.result() 344 | except Exception as e: # pragma: no cover 345 | self.logger.error( 346 | f'{in_paths[path]} generated an exception: {e}') 347 | failed_paths.append(out_paths[path]) 348 | else: 349 | self.logger.info( 350 | f'Finished {in_paths[path]}. {finished_files}/{num_files} PCAPs done.') 351 | return failed_paths 352 | 353 | def main(self): 354 | parsed_args = PCAPToCSV.parse_args(raw_args=self.raw_args) 355 | in_path = parsed_args.path 356 | out_path = parsed_args.output 357 | combined = parsed_args.combined 358 | engine = parsed_args.engine 359 | threads = parsed_args.threads 360 | log_level = parsed_args.verbose 361 | level = parsed_args.level 362 | 363 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 364 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 365 | logging.basicConfig(level=log_levels[log_level]) 366 | 367 | in_paths = [] 368 | out_paths = [] 369 | 370 | # check if it's a directory or a file 371 | if os.path.isdir(in_path): 372 | if out_path: 373 | pathlib.Path(out_path).mkdir(parents=True, exist_ok=True) 374 | for root, _, files in os.walk(in_path): 375 | for pathfile in files: 376 | if PCAPToCSV.ispcap(pathfile): 377 | in_paths.append(os.path.join(root, pathfile)) 378 | if out_path: 379 | out_paths.append(os.path.join( 380 | out_path, pathfile) + '.csv.gz') 381 | else: 382 | out_paths.append(os.path.join( 383 | root, pathfile) + '.csv.gz') 384 | else: 385 | in_paths.append(in_path) 386 | default_out_path = in_path + '.csv.gz' 387 | 388 | if out_path: 389 | if os.path.isdir(out_path): 390 | out_paths.append(os.path.join(out_path, os.path.basename(default_out_path))) 391 | else: 392 | out_paths.append(out_path) 393 | else: 394 | out_paths.append(default_out_path) 395 | 396 | if level == 'packet' and engine == 'pyshark': 397 | self.logger.info( 398 | f'Including the following layers in CSV (if they exist): {self.PROTOCOLS}') 399 | 400 | failed_paths = self.process_files( 401 | threads, level, in_paths, out_paths, engine) 402 | 403 | for failed_path in failed_paths: # pragma: no cover 404 | if failed_path in out_paths: 405 | out_paths.remove(failed_path) 406 | 407 | if combined: 408 | if out_paths: 409 | combined_path = os.path.join( 410 | os.path.dirname(out_paths[0]), 'combined.csv.gz') 411 | else: 412 | combined_path = 'combined.csv.gz' 413 | self.logger.info( 414 | f'Combining CSVs into a single file: {combined_path}') 415 | PCAPToCSV.combine_csvs(out_paths, combined_path) 416 | return combined_path 417 | else: 418 | self.logger.info( 419 | f'GZipped CSV file(s) written out to: {out_paths}') 420 | if len(out_paths) > 1: 421 | return os.path.dirname(out_paths[0]) 422 | return out_paths[0] 423 | 424 | 425 | if __name__ == '__main__': # pragma: no cover 426 | instance = PCAPToCSV() 427 | instance.main() 428 | -------------------------------------------------------------------------------- /networkml/algorithms/host_footprint.py: 
--------------------------------------------------------------------------------
 1 | """
 2 | A class to perform machine learning operations on computer network traffic
 3 | """
 4 | import argparse
 5 | import ast
 6 | import json
 7 | import logging
 8 | import os
 9 | from collections import defaultdict
10 | 
11 | import joblib
12 | import numpy as np
13 | import pandas as pd
14 | from sklearn import preprocessing
15 | from sklearn.metrics import accuracy_score
16 | from sklearn.metrics import confusion_matrix
17 | from sklearn.metrics import f1_score
18 | from sklearn.metrics import precision_score
19 | from sklearn.metrics import recall_score
20 | from sklearn.model_selection import GridSearchCV
21 | from sklearn.neural_network import MLPClassifier
22 | from sklearn.preprocessing import LabelBinarizer
23 | 
24 | import networkml
25 | 
26 | 
27 | class HostFootprint():
28 |     """
29 |     Perform machine learning operations on a host's network traffic
30 | 
31 |     A class to perform machine learning operations on network traffic
32 |     represented at the host footprint level. "Host footprint" refers to
33 |     a representation of network traffic in which there are statistical
34 |     features that characterize all packets with a particular host as
35 |     the origin or source.
36 |     """
37 | 
38 |     def __init__(self, raw_args=None):
39 |         self.logger = logging.getLogger(__name__)
40 |         self.raw_args = raw_args
41 |         self.list = None
42 |         self.model_path = None
43 | 
44 |     @staticmethod
45 |     def regularize_df(df):
46 |         # need host_key, tshark_srcips, and frame_epoch to send
47 |         # source_ip/source_mac to Poseidon.
48 |         cols = [col for col in ('host_key', 'tshark_srcips', 'tshark_frame_epoch', 'role') if col in df.columns]
49 |         # TODO: remove ratio features for now for model compatibility.
50 |         cols.extend([col for col in df.columns if 'ratio' in col])
51 |         host_key = df.get('host_key', None)
52 |         tshark_srcips = df.get('tshark_srcips', None)
53 |         frame_epoch = df.get('tshark_frame_epoch', None)
54 |         df = df.drop(columns=cols)
55 |         # Dataframe column order must be the same for train/predict!
56 |         df = df.reindex(columns=sorted(df.columns))
57 |         return df, host_key, tshark_srcips, frame_epoch
58 | 
59 |     @staticmethod
60 |     def serialize_label_encoder(le, path):
61 |         """Serialize label encoder to enable persistence
62 |         without pickling the file. .pkl files are a security
63 |         risk and should be avoided.
64 |         Model is saved as a JSON object.
65 |         INPUT:
66 |         --le: the label encoder object (from sklearn) to be saved
67 |         --path: filepath for saving the object
68 |         OUTPUT:
69 |         --Does not return anything
70 |         """
71 |         serialized_le = {
72 |             'classes': le.classes_.tolist(),
73 |         }
74 |         with open(path, 'w') as model_json:
75 |             json.dump(serialized_le, model_json)
76 | 
77 |     @staticmethod
78 |     def deserialize_label_encoder(path):
79 |         """Deserialize JSON object storing label encoder.
80 |         Label encoder (from sklearn) is re-instantiated
81 |         with proper values.
82 |         INPUT:
83 |         --path: filepath for loading the JSON object
84 |         OUTPUT:
85 |         --le: Returns label encoder (sklearn) object
86 |         """
87 |         with open(path, 'r') as model_json:
88 |             model_dict = json.load(model_json)
89 |         # Instantiate and assign class label
90 |         le = preprocessing.LabelEncoder()
91 |         le.classes_ = np.array(model_dict['classes'])
92 |         return le
93 | 
94 |     @staticmethod
95 |     def serialize_model(model, path):
96 |         """Serialize model to enable persistence
97 |         without pickling the file.
.pkl files are a security 98 | risk and should be avoided 99 | Model is saved as a JSON object. 100 | INPUT: 101 | --model: the model object (an MLPClassifier from sklearn) to be saved 102 | --path: filepath for saving the object 103 | OUTPUT: 104 | --Does not return anything 105 | """ 106 | def serialize_label_binarizer(label_binarizer): 107 | serialized_label_binarizer = { 108 | 'neg_label': label_binarizer.neg_label, 109 | 'pos_label': label_binarizer.pos_label, 110 | 'sparse_output': label_binarizer.sparse_output, 111 | 'y_type_': label_binarizer.y_type_, 112 | 'sparse_input_': label_binarizer.sparse_input_, 113 | 'classes_': label_binarizer.classes_.tolist() 114 | } 115 | 116 | return serialized_label_binarizer 117 | 118 | serialized_model = { 119 | 'meta': 'mlp', 120 | 'coefs_': [array.tolist() for array in model.coefs_], 121 | 'loss_': model.loss_, 122 | 'intercepts_': [array.tolist() for array in model.intercepts_], 123 | 'n_iter_': model.n_iter_, 124 | 'n_layers_': model.n_layers_, 125 | 'n_outputs_': model.n_outputs_, 126 | 'out_activation_': model.out_activation_, 127 | '_label_binarizer': serialize_label_binarizer(model._label_binarizer), 128 | 'params': model.get_params(), 129 | 'features':model.features, 130 | } 131 | 132 | if isinstance(model.classes_, list): 133 | serialized_model['classes_'] = [array.tolist() for array in model.classes_] 134 | else: 135 | serialized_model['classes_'] = model.classes_.tolist() 136 | 137 | with open(path, 'w') as out_file: 138 | json.dump(serialized_model, out_file, indent=2) 139 | #skljson.to_json(model, path) 140 | 141 | @staticmethod 142 | def deserialize_model(path): 143 | """Deserialize JSON object storing the ml model. 144 | Model (an MLPClassifier from sklearn) is re-instantiated 145 | with proper values. 
146 | INPUT: 147 | --path: filepath for loading the JSON object 148 | OUTPUT: 149 | --model: Returns an MLPClassifier (sklearn) object 150 | """ 151 | def deserialize_label_binarizer(label_binarizer_dict): 152 | label_binarizer = LabelBinarizer() 153 | label_binarizer.neg_label = label_binarizer_dict['neg_label'] 154 | label_binarizer.pos_label = label_binarizer_dict['pos_label'] 155 | label_binarizer.sparse_output = label_binarizer_dict['sparse_output'] 156 | label_binarizer.y_type_ = label_binarizer_dict['y_type_'] 157 | label_binarizer.sparse_input_ = label_binarizer_dict['sparse_input_'] 158 | label_binarizer.classes_ = np.array(label_binarizer_dict['classes_']) 159 | 160 | return label_binarizer 161 | 162 | # Load (or deserialize) model from JSON 163 | model_dict = {} 164 | with open(path, 'r') as in_file: 165 | model_dict = json.load(in_file) 166 | 167 | model = MLPClassifier(**model_dict['params']) 168 | 169 | model.coefs_ = np.array(model_dict['coefs_'], dtype=object) 170 | model.loss_ = model_dict['loss_'] 171 | model.intercepts_ = np.array(model_dict['intercepts_'], dtype=object) 172 | model.n_iter_ = model_dict['n_iter_'] 173 | model.n_layers_ = model_dict['n_layers_'] 174 | model.n_outputs_ = model_dict['n_outputs_'] 175 | model.out_activation_ = model_dict['out_activation_'] 176 | model._label_binarizer = deserialize_label_binarizer(model_dict['_label_binarizer']) 177 | model.features = list(model_dict['features']) 178 | 179 | model.classes_ = np.array(model_dict['classes_']) 180 | # Convert coeficients to numpy arrays to enable JSON deserialization 181 | # This is a hack to compensate for a bug in sklearn_json 182 | for i, x in enumerate(model.coefs_): 183 | model.coefs_[i] = np.array(x) 184 | return model 185 | 186 | @staticmethod 187 | def serialize_scaler(scaler, path): 188 | return joblib.dump(scaler, path) 189 | 190 | @staticmethod 191 | def deserialize_scaler(path): 192 | return joblib.load(path) 193 | 194 | @staticmethod 195 | def parse_args(raw_args=None): 196 | """ 197 | Use python's argparse module to collect command line arguments 198 | for using this class 199 | """ 200 | netml_path = list(networkml.__path__) 201 | parser = argparse.ArgumentParser() 202 | parser.add_argument('path', help='path to a single csv file') 203 | parser.add_argument('--eval_data', 204 | help='path to eval CSV file, if training') 205 | parser.add_argument('--kfolds', '-k', 206 | default=5, 207 | help='specify number of folds for k-fold cross validation') 208 | parser.add_argument('--label_encoder', '-l', 209 | default=os.path.join(netml_path[0], 210 | 'trained_models/host_footprint_le.json'), 211 | help='specify a path to load or save label encoder') 212 | parser.add_argument('--scaler', 213 | default=os.path.join(netml_path[0], 214 | 'trained_models/host_footprint_scaler.mod'), 215 | help='specify a path to load or save scaler') 216 | parser.add_argument('--operation', '-O', choices=['train', 'predict', 'eval'], 217 | default='predict', 218 | help='choose which operation task to perform, \ 219 | train or predict (default=predict)') 220 | parser.add_argument('--trained_model', 221 | default=os.path.join(netml_path[0], 222 | 'trained_models/host_footprint.json'), 223 | help='specify a path to load or save trained model') 224 | parser.add_argument('--list', '-L', 225 | choices=['features'], 226 | default=None, 227 | help='list information contained within model defined by --trained_model') 228 | parser.add_argument('--verbose', '-v', 229 | choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 230 | 
default='INFO', 231 | help='logging level (default=INFO)') 232 | parser.add_argument('--train_unknown', default=False, action='store_true', 233 | help='Train on unknown roles') 234 | parsed_args = parser.parse_args(raw_args) 235 | return parsed_args 236 | 237 | def _get_test_train_csv(self, path, train_unknown): 238 | df, _, _, _ = self.regularize_df(pd.read_csv(path)) 239 | df = df.fillna(0) 240 | # Split dataframe into X (the input features or predictors) 241 | # and y (the target or outcome or dependent variable) 242 | df['role'] = df.filename.str.split('-').str[0] 243 | # Drop unknown roles. 244 | if not train_unknown: 245 | df = df[df['role'] != 'Unknown'] 246 | X = df.drop(['filename', 'role'], axis=1) 247 | y = df.role 248 | column_list = list(X.columns.values) 249 | X = self.string_feature_check(X) 250 | return (X, y, column_list) 251 | 252 | def summarize_eval_data(self, model, scaler, label_encoder, eval_data, train_unknown): 253 | X_test, y_true, _ = self._get_test_train_csv(eval_data, train_unknown) 254 | X_test = scaler.transform(X_test) 255 | y_true = label_encoder.transform(y_true) 256 | y_pred = model.predict(X_test) 257 | 258 | for metric, name in ( 259 | (accuracy_score, 'accuracy'), 260 | (precision_score, 'precision'), 261 | (recall_score, 'recall'), 262 | (f1_score, 'f1')): 263 | if metric == accuracy_score: 264 | val = metric(y_true, y_pred) 265 | else: 266 | val = metric(y_true, y_pred, average='weighted') 267 | val = np.round(val, 4) 268 | self.logger.info(f'{name}: {val}') 269 | 270 | conf_matrix = confusion_matrix(y_true, y_pred) 271 | self.logger.info(conf_matrix) 272 | self.logger.info(label_encoder.classes_.tolist()) 273 | 274 | def eval(self, path, scaler_path, le_path, model_path, train_unknown): 275 | """ 276 | Accept CSV and summarize based on already trained model. 277 | """ 278 | scaler = self.deserialize_scaler(scaler_path) 279 | le = self.deserialize_label_encoder(le_path) 280 | self.model = self.deserialize_model(model_path) 281 | self.summarize_eval_data(self.model, scaler, le, path, train_unknown) 282 | 283 | def train(self): 284 | """ 285 | This function takes a .csv file of host footprint features--i.e. each 286 | row is a feature vector for a given host and each column is a feature 287 | --and trains a model to do functional role classification. This function 288 | saves the trained model. Because the best model is still yet to be 289 | determined, this method uses only a simple neural network. A future 290 | version of this function will use a superior model once our research 291 | group has done experiments with different models and hyperparameter 292 | optimization. 
293 |         """
294 |         X, y, cols = self._get_test_train_csv(self.path, self.train_unknown)
295 | 
296 |         unique_roles = sorted(y.unique())
297 |         self.logger.info(f'inferring roles {unique_roles}')
298 | 
299 |         # Normalize X features before training
300 |         scaler = preprocessing.StandardScaler()
301 |         scaler.fit(X)
302 |         X = scaler.transform(X)
303 | 
304 |         # Convert y into categorical/numerical feature
305 |         le = preprocessing.LabelEncoder()
306 |         y = le.fit_transform(y)
307 | 
308 |         # Instantiate neural network model
309 |         # MLP = multi-layer perceptron
310 |         model = MLPClassifier()
311 | 
312 |         # Perform grid-search with hyperparameter optimization
313 |         # to find the best model
314 |         parameters = {'hidden_layer_sizes': [(64, 32), (32, 16),
315 |                                              (64, 32, 32),
316 |                                              (64, 32, 32, 16)]}
317 |         clf = GridSearchCV(model, parameters,
318 |                            cv=self.kfolds, n_jobs=-1,
319 |                            scoring='f1_weighted')
320 | 
321 |         self.logger.info('Beginning model training')
322 |         # Find best fitting model from the hyper-parameter
323 |         # optimization process
324 |         self.model = clf.fit(X, y).best_estimator_
325 |         self.model.features = cols
326 | 
327 |         # Save model to JSON
328 |         self.serialize_model(self.model, self.model_path)
329 |         self.serialize_scaler(scaler, self.scaler)
330 |         self.serialize_label_encoder(le, self.le_path)
331 | 
332 |         if self.eval_data:
333 |             self.summarize_eval_data(self.model, scaler, le, self.eval_data, self.train_unknown)  # pass the fitted objects, not their file paths
334 | 
335 |     def predict(self):
336 |         """
337 |         This function takes a csv of features at the host footprint level and
338 |         then makes a role prediction for each row. The output is the top three
339 |         roles.
340 | 
341 |         OUTPUTS:
342 |         --all_predictions: a dict with the filename for a key and a JSON'ified
343 |         dict for a value. See sorted_roles_to_dict() for a description of
344 |         the value's structure.
345 |         """
346 |         scaler = self.deserialize_scaler(self.scaler)
347 |         # Get label encoder
348 |         le = self.deserialize_label_encoder(self.le_path)
349 |         # Load (or deserialize) model from JSON
350 |         self.model = self.deserialize_model(self.model_path)
351 | 
352 |         # Load data from host footprint .csv
353 |         csv_df = pd.read_csv(self.path)
354 |         df, host_key, tshark_srcips, frame_epoch = self.regularize_df(csv_df)
355 |         # Split dataframe into X (the input features or predictors)
356 |         # and y (the target or outcome or dependent variable)
357 |         # This drop function should work even if there is no column
358 |         # named filename
359 |         X = df.drop('filename', axis=1)
360 | 
361 |         # Get filenames to match to predictions
362 |         filename = df.filename
363 | 
364 |         # Normalize X features before predicting
365 |         X = scaler.transform(X)
366 | 
367 |         self.logger.info('Executing model inference')
368 |         # Make model prediction - will return a vector of values
369 |         predictions_rows = self.model.predict_proba(X)
370 | 
371 |         # Dict to store top role and list of top roles
372 |         all_predictions = self.get_individual_predictions(
373 |             predictions_rows, le, filename, host_key, tshark_srcips, frame_epoch)
374 | 
375 |         return json.dumps(all_predictions)
376 | 
377 |     def get_individual_predictions(self, predictions_rows, label_encoder,
378 |                                    filename, host_key, tshark_srcips,
379 |                                    frame_epoch, top_n_roles=3):
380 |         """ Return role predictions for given device
381 | 
382 |         INPUTS:
383 |         --predictions_rows: each device is represented as a row
384 |         --label_encoder: a mapping of device role name to numerical category
385 |         --filename: the filename of the pcap for which a prediction is made
386 |         --host_key: canonical source MAC for this pcap.
387 |         --tshark_srcips: canonical source IPs for this pcap.
388 |         --frame_epoch: the timestamp of the packet.
389 | 
390 |         OUTPUTS:
391 |         --all_predictions: a dict with the filename for a key and a
392 |         JSON'ified dict for a value. See sorted_roles_to_dict() for a description
393 |         of the value's structure.
394 |         """
395 | 
396 |         # Dict to store JSON of top n roles and probabilities per device
397 |         all_predictions = defaultdict(list)
398 |         num_roles = len(label_encoder.classes_)
399 |         labels = label_encoder.inverse_transform([i for i in range(num_roles)])
400 | 
401 |         # Loop through different devices on which to make prediction
402 |         for i, predictions in enumerate(predictions_rows):
403 |             role_list = [(k, v) for k, v in zip(labels, predictions)]
404 |             # Sort role list by probabilities
405 |             role_list_sorted = sorted(role_list, key=lambda x: x[1], reverse=True)[:top_n_roles]
406 |             # Dump top role and roles-probability list
407 |             host_results = self.sorted_roles_to_dict(role_list_sorted)
408 |             if host_key is not None:
409 |                 host_results.update({'source_mac': host_key[i]})
410 |             if tshark_srcips is not None:
411 |                 source_ip = ast.literal_eval(tshark_srcips[i])
412 |                 if source_ip:
413 |                     source_ip = source_ip[0]
414 |                 else:
415 |                     source_ip = None
416 |                 host_results.update({'source_ip': source_ip})
417 |             if frame_epoch is not None:
418 |                 host_results.update({'timestamp': frame_epoch[i]})
419 |             all_predictions[filename[i]].append(host_results)
420 | 
421 |         return all_predictions
422 | 
423 | 
424 |     @staticmethod
425 |     def sorted_roles_to_dict(role_list_sorted, threshold=.5):
426 |         """ Convert sorted role-probability list into formatted dict
427 | 
428 |         This function ensures that the top role returned is Unknown
429 |         if the top role has a probability less than the threshold
430 |         specified in the default input parameter.
431 | 
432 |         INPUTS:
433 |         --role_list_sorted: a sorted list that associates roles
434 |         with their probabilities
435 |         --threshold: probability threshold below which the top role
436 |         should be designated as "Unknown"
437 | 
438 |         OUTPUTS:
439 |         --predictions: a dict with the top role and a sorted role list
440 |         """
441 | 
442 |         # Probability associated with the most likely role
443 |         top_role_prob = role_list_sorted[0][1]
444 | 
445 |         # Only use actual top role if probability is greater
446 |         # than designated threshold
447 |         if top_role_prob <= threshold:
448 |             top_role = 'Unknown'
449 |         else:
450 |             top_role = role_list_sorted[0][0]  # Most likely role
451 | 
452 |         # Create dict to store prediction results
453 |         role_predictions = {
454 |             'top_role': top_role,
455 |             'role_list': role_list_sorted,
456 |         }
457 | 
458 |         return role_predictions
459 | 
460 | 
461 |     def string_feature_check(self, X):
462 |         """
463 |         This function takes a pandas dataframe that contains the
464 |         features for a model and checks if any of the features are
465 |         strings (or "objects" in the pandas ontology). If any of the
466 |         features are strings, then that feature is expanded into dummy
467 |         features, i.e. a series of 0/1 features for each category within
468 |         that object feature. The function then removes the original feature.
469 | 
470 |         INPUTS:
471 |         --X: a pandas dataframe with only the training features
472 | 
473 |         OUTPUTS:
474 |         --X: a pandas dataframe expanded with dummy features
475 | 
476 |         """
477 | 
478 |         # loop through columns in X
479 |         for col in X.columns:
480 | 
481 |             # Check if the feature's data type is string
482 |             # Object is the datatype pandas uses for storing strings
483 |             if X[col].dtype == 'object':
484 | 
485 |                 # log a message if a string column is found
486 |                 self.logger.info(f'String object found in column {col}')
487 | 
488 |                 # Expand features into "dummy", i.e.
0/1 489 | # features 490 | new_features = pd.get_dummies(X[col]) 491 | 492 | # Add new features onto X dataframe 493 | X = pd.concat([X, new_features], axis=1) 494 | 495 | # Remove original non-expanded feature from X 496 | X = X.drop(col, axis=1) 497 | 498 | return X 499 | 500 | 501 | def list_model(self): 502 | model = self.deserialize_model(self.model_path) 503 | if self.list == 'features': 504 | return model.features 505 | 506 | 507 | def main(self): 508 | """ 509 | Collect and parse command line arguments for using this class 510 | """ 511 | 512 | # Collect command line arguments 513 | parsed_args = HostFootprint.parse_args(raw_args=self.raw_args) 514 | self.path = parsed_args.path 515 | self.eval_data = parsed_args.eval_data 516 | self.model_path = parsed_args.trained_model 517 | self.le_path = parsed_args.label_encoder 518 | self.scaler = parsed_args.scaler 519 | self.kfolds = int(parsed_args.kfolds) 520 | self.train_unknown = parsed_args.train_unknown 521 | self.list = parsed_args.list 522 | operation = parsed_args.operation 523 | log_level = parsed_args.verbose 524 | 525 | # Set logging output options 526 | log_levels = {'INFO': logging.INFO, 'DEBUG': logging.DEBUG, 527 | 'WARNING': logging.WARNING, 'ERROR': logging.ERROR} 528 | logging.basicConfig(level=log_levels[log_level]) 529 | 530 | self.logger.debug(f'hostfootprint.main list: {self.list}') 531 | if self.list: 532 | model_list = self.list_model() 533 | if model_list and len(model_list) > 0: 534 | result = f'Listing {self.list} for model at {self.model_path}:\n{model_list}' 535 | return result 536 | else: 537 | return f'model found at {self.model_path} contains no {self.list}' 538 | 539 | # Basic execution logic 540 | if operation == 'train': 541 | if not self.train_unknown: 542 | self.logger.info(f'Role Unknown will be dropped from training data') 543 | self.train() 544 | self.logger.info(f'Saved model to: {self.model_path}') 545 | self.logger.info(f'Saved label encoder to: {self.le_path}') 546 | return self.model_path 547 | if operation == 'predict': 548 | role_prediction = self.predict() 549 | self.logger.info(f'{role_prediction}') 550 | return role_prediction 551 | if operation == 'eval': 552 | return self.eval(self.path, self.scaler, self.le_path, self.model_path, self.train_unknown) 553 | return None 554 | 555 | 556 | if __name__ == '__main__': 557 | host_footprint = HostFootprint() 558 | host_footprint.main() 559 | --------------------------------------------------------------------------------
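Programmatic usage (a minimal sketch, not part of the repository): both PCAPToCSV and HostFootprint accept raw_args and expose main(), so the command-line flows defined by their parse_args() methods above can also be driven from Python. The directory and file names below are hypothetical, the features CSV handed to HostFootprint is assumed to have already been produced by the featurizer stage, and a trained model is assumed to exist at the default --trained_model path.

# Hypothetical driver script; paths and file names are illustrative only.
from networkml.parsers.pcap_to_csv import PCAPToCSV
from networkml.algorithms.host_footprint import HostFootprint

# Turn a directory of pcaps into one combined, gzipped packet-level CSV.
packet_csv = PCAPToCSV(raw_args=['pcaps/', '--combined',
                                 '--engine', 'tshark',
                                 '--level', 'packet',
                                 '--output', 'out/']).main()
print(packet_csv)  # e.g. out/combined.csv.gz

# Predict roles from a host-footprint features CSV (assumed to exist already);
# predict() returns a JSON string of per-filename role predictions.
predictions_json = HostFootprint(raw_args=['features/combined.csv',
                                           '--operation', 'predict']).main()
print(predictions_json)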