├── docs
│   ├── source
│   │   ├── _static
│   │   │   └── EMPTY
│   │   ├── index.rst
│   │   └── api.rst
│   ├── requirement.txt
│   ├── README.md
│   ├── Makefile
│   └── make.bat
├── python
│   ├── clx
│   │   ├── dns
│   │   │   └── __init__.py
│   │   ├── io
│   │   │   ├── __init__.py
│   │   │   ├── factory
│   │   │   │   ├── __init__.py
│   │   │   │   ├── abstract_factory.py
│   │   │   │   ├── dask_fs_factory.py
│   │   │   │   ├── fs_factory.py
│   │   │   │   ├── factory.py
│   │   │   │   └── kafka_factory.py
│   │   │   ├── reader
│   │   │   │   ├── __init__.py
│   │   │   │   ├── reader.py
│   │   │   │   ├── file_reader.py
│   │   │   │   ├── fs_reader.py
│   │   │   │   ├── dask_fs_reader.py
│   │   │   │   └── kafka_reader.py
│   │   │   └── writer
│   │   │       ├── __init__.py
│   │   │       ├── writer.py
│   │   │       ├── file_writer.py
│   │   │       ├── fs_writer.py
│   │   │       └── kafka_writer.py
│   │   ├── osi
│   │   │   ├── __init__.py
│   │   │   └── whois.py
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   └── data
│   │   │       ├── __init__.py
│   │   │       ├── dataset.py
│   │   │       ├── dataloader.py
│   │   │       └── utils.py
│   │   ├── analytics
│   │   │   ├── __init__.py
│   │   │   ├── model
│   │   │   │   ├── __init__.py
│   │   │   │   ├── tabular_model.py
│   │   │   │   └── rnn_classifier.py
│   │   │   ├── dga_dataset.py
│   │   │   ├── anomaly_detection.py
│   │   │   ├── periodicity_detection.py
│   │   │   ├── stats.py
│   │   │   └── detector.py
│   │   ├── heuristics
│   │   │   └── __init__.py
│   │   ├── parsers
│   │   │   ├── __init__.py
│   │   │   ├── resources
│   │   │   │   └── splunk_notable_regex.yaml
│   │   │   └── zeek.py
│   │   ├── workflow
│   │   │   ├── __init__.py
│   │   │   └── netflow_workflow.py
│   │   ├── eda
│   │   │   ├── __init__.py
│   │   │   ├── analysis.py
│   │   │   └── summary_stats.py
│   │   ├── __init__.py
│   │   └── tests
│   │       ├── test_anomaly_detection.py
│   │       ├── test_utils.py
│   │       ├── test_kafka_writer.py
│   │       ├── test_dga_dataset.py
│   │       ├── test_features.py
│   │       ├── test_dataloader.py
│   │       ├── test_whois.py
│   │       ├── test_event_parser.py
│   │       ├── test_stats.py
│   │       ├── test_loda.py
│   │       ├── test_netflow_workflow.py
│   │       ├── test_dask_fs_reader.py
│   │       ├── test_eda.py
│   │       ├── test_kafka_reader.py
│   │       ├── test_fs_writer.py
│   │       ├── test_port_heuristic.py
│   │       ├── test_fs_reader.py
│   │       ├── test_binary_sequence_classifier.py
│   │       └── test_multiclass_sequence_classifier.py
│   ├── .gitattributes
│   ├── MANIFEST.in
│   ├── pytest.ini
│   ├── .flake8
│   ├── setup.py
│   └── setup.cfg
├── examples
│   ├── streamz
│   │   ├── python
│   │   │   ├── clx_streamz_tools
│   │   │   │   └── __init__.py
│   │   │   ├── setup.py
│   │   │   ├── dga_detection.py
│   │   │   ├── phishing_detection.py
│   │   │   └── cybert.py
│   │   ├── resources
│   │   │   ├── cybert.yaml
│   │   │   ├── dga_detection.yaml
│   │   │   └── phishing_detection.yaml
│   │   └── scripts
│   │       └── entrypoint.sh
│   └── run_dga_training.py
├── siem_integrations
│   ├── clx_query_service
│   │   ├── clxquery
│   │   │   ├── __init__.py
│   │   │   ├── migrations
│   │   │   │   └── __init__.py
│   │   │   ├── models.py
│   │   │   ├── admin.py
│   │   │   ├── tests.py
│   │   │   ├── apps.py
│   │   │   ├── urls.py
│   │   │   ├── utils.py
│   │   │   ├── logging.conf
│   │   │   ├── blazingsql_helper.py
│   │   │   └── views.py
│   │   ├── clx_query_service
│   │   │   ├── __init__.py
│   │   │   ├── urls.py
│   │   │   ├── wsgi.py
│   │   │   └── settings.py
│   │   ├── conf
│   │   │   ├── clx_blz_reader_conf.yaml
│   │   │   └── clx_query_service.conf
│   │   └── manage.py
│   ├── splunk2kafka
│   │   ├── export2kafka
│   │   │   ├── bin
│   │   │   │   ├── scripts
│   │   │   │   │   └── nothing.sh
│   │   │   │   └── export2kafka.py
│   │   │   ├── default
│   │   │   │   ├── commands.conf
│   │   │   │   ├── data
│   │   │   │   │   └── ui
│   │   │   │   │       └── nav
│   │   │   │   │           └── default.xml
│   │   │   │   └── app.conf
│   │   │   ├── metadata
│   │   │   │   └── default.meta
│   │   │   └── README.md
│   │   └── splunk_wrapper
│   │       ├── wrapper-install.sh
│   │       └── README.md
│   ├── clx_query
│   │   ├── clx_query.png
│   │   ├── default
│   │   │   ├── clx_query_setup.conf
│   │   │   ├── commands.conf
│   │   │   ├── app.conf
│   │   │   ├── logging.conf
│   │   │   └── data
│   │   │       └── ui
│   │   │           └── nav
│   │   │               └── default.xml
│   │   ├── metadata
│   │   │   └── default.meta
│   │   └── bin
│   │       ├── clx_query_conf.py
│   │       └── clx_query.py
│   └── Dockerfile
├── conda
│   ├── recipes
│   │   └── clx
│   │       ├── build.sh
│   │       ├── conda_build_config.yaml
│   │       └── meta.yaml
│   └── environments
│       └── clx_dev_cuda11.5.yml
├── notebooks
│   ├── alert_analysis
│   │   └── workflow_implementation
│   │       ├── input.csv
│   │       ├── input2.csv
│   │       ├── image1.png
│   │       ├── image3.png
│   │       ├── image4.png
│   │       ├── image5.png
│   │       ├── image7.png
│   │       └── image8.png
│   └── ids_detection
│       └── util.py
├── img
│   └── rapids_logo.png
├── ci
│   ├── cpu
│   │   ├── prebuild.sh
│   │   ├── upload.sh
│   │   └── build.sh
│   ├── integration_tests
│   │   ├── Dockerfile.test
│   │   ├── docker-compose.test.yml
│   │   └── README.md
│   ├── checks
│   │   ├── style.sh
│   │   └── changelog.sh
│   ├── utils
│   │   └── nbtest.sh
│   ├── docs
│   │   └── build.sh
│   ├── release
│   │   └── update-version.sh
│   ├── gpu
│   │   ├── test-notebooks.sh
│   │   └── build.sh
│   └── local
│       └── README.md
├── docker
│   ├── .run_in_rapids.sh
│   ├── start_jupyter.sh
│   ├── stop_jupyter.sh
│   └── .start_jupyter_run_in_rapids.sh
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── submit_question.md
│   │   ├── feature_request.md
│   │   ├── documentation-request.md
│   │   └── bug_report.md
│   ├── workflows
│   │   └── labeler.yml
│   ├── ops-bot.yaml
│   ├── CODEOWNERS
│   └── labeler.yml
├── .pre-commit-config.yaml
├── .gitignore
├── docker-compose.yml
├── Dockerfile
└── CODE_OF_CONDUCT.md
/docs/source/_static/EMPTY:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/dns/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/io/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/osi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/analytics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/heuristics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/io/factory/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/io/reader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/io/writer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/parsers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/utils/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/workflow/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/clx/analytics/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/examples/streamz/python/clx_streamz_tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/.gitattributes:
--------------------------------------------------------------------------------
1 | clx/_version.py export-subst
2 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clx_query_service/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/migrations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/python/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include versioneer.py
2 | include clx/_version.py
3 |
--------------------------------------------------------------------------------
/python/clx/eda/__init__.py:
--------------------------------------------------------------------------------
1 | from clx.eda.eda import EDA # noqa: F401
2 |
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/bin/scripts/nothing.sh:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/conda/recipes/clx/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ./build.sh clx
4 |
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/input.csv:
--------------------------------------------------------------------------------
1 | raw
2 | hello gtcdc
--------------------------------------------------------------------------------
/docs/requirement.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | sphinx_rtd_theme
3 | numpydoc
4 | ipython
5 | nbsphinx
--------------------------------------------------------------------------------
/img/rapids_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/img/rapids_logo.png
--------------------------------------------------------------------------------
/ci/cpu/prebuild.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export BUILD_CLX=1
4 | export UPLOAD_CLX=1
5 |
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/input2.csv:
--------------------------------------------------------------------------------
1 | raw
2 | username=gtcdc host=1.2.3.4
3 |
--------------------------------------------------------------------------------
/docker/.run_in_rapids.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | . /opt/conda/etc/profile.d/conda.sh
3 | conda activate rapids
4 | exec "$@"
5 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/models.py:
--------------------------------------------------------------------------------
1 | from django.db import models
2 |
3 | # Create your models here.
4 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 |
3 | # Register your models here.
4 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query/clx_query.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/siem_integrations/clx_query/clx_query.png
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/default/commands.conf:
--------------------------------------------------------------------------------
1 | [export2kafka]
2 | filename = export2kafka.py
3 | chunked = true
4 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query/default/clx_query_setup.conf:
--------------------------------------------------------------------------------
1 | [setupentity]
2 | clx_hostname = localhost
3 | clx_port = 8998
4 | clx_query_limit = 10000
5 |
--------------------------------------------------------------------------------
/conda/recipes/clx/conda_build_config.yaml:
--------------------------------------------------------------------------------
1 | c_compiler_version:
2 | - 9
3 |
4 | cxx_compiler_version:
5 | - 9
6 |
7 | sysroot_version:
8 | - "2.17"
9 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clx_query_service/urls.py:
--------------------------------------------------------------------------------
1 | from django.urls import path, include
2 |
3 | urlpatterns = [path("", include("clxquery.urls"))]
4 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
4 | class ClxQueryConfig(AppConfig):
5 | name = "clxquery"
6 |
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/image1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image1.png
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/image3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image3.png
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/image4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image4.png
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/image5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image5.png
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/image7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image7.png
--------------------------------------------------------------------------------
/notebooks/alert_analysis/workflow_implementation/image8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image8.png
--------------------------------------------------------------------------------
/python/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | log_cli = 1
3 | log_cli_level = INFO
4 | log_cli_format = %(asctime)s [%(levelname)s] %(message)s (%(filename)s:%(lineno)s)
5 | log_cli_date_format = %Y-%m-%d %H:%M:%S
6 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/urls.py:
--------------------------------------------------------------------------------
1 | from django.conf.urls import re_path
2 | from clxquery import views
3 |
4 | urlpatterns = [re_path("clxquery/$", views.ExecuteClxQuery.as_view())]
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/default/data/ui/nav/default.xml:
--------------------------------------------------------------------------------
1 |
5 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/submit_question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Submit question
3 | about: Ask a general question about CLX
4 | title: "[QST]"
5 | labels: "? - Needs Triage, question"
6 | assignees: ''
7 |
8 | ---
9 |
10 | **What is your question?**
--------------------------------------------------------------------------------
/.github/workflows/labeler.yml:
--------------------------------------------------------------------------------
1 | name: "Pull Request Labeler"
2 | on:
3 | - pull_request_target
4 |
5 | jobs:
6 | triage:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/labeler@main
10 | with:
11 | repo-token: "${{ secrets.GITHUB_TOKEN }}"
12 |
--------------------------------------------------------------------------------
/docker/start_jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 &
3 | echo -e "\n"
4 | echo "nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='rapids' > /dev/null 2>&1 &"
5 | echo -e "\n"
6 |
--------------------------------------------------------------------------------
/.github/ops-bot.yaml:
--------------------------------------------------------------------------------
1 | # This file controls which features from the `ops-bot` repository below are enabled.
2 | # - https://github.com/rapidsai/ops-bot
3 |
4 | auto_merger: true
5 | branch_checker: true
6 | label_checker: true
7 | release_drafter: true
8 | external_contributors: false
9 | copy_prs: true
10 |
--------------------------------------------------------------------------------
/docker/stop_jupyter.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ps aux | grep jupyter | \
3 | grep --extended-regexp "$USER[\ ]{1,10}[0-9]{1,10}" | \
4 | grep --only-matching --extended-regexp "$USER[\ ]{1,10}[0-9]{1,10}" | \
5 | grep --only-matching --extended-regexp "[\ ]{1,10}[0-9]{1,10}" | \
6 | xargs kill -9
7 | sleep 2
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/default/app.conf:
--------------------------------------------------------------------------------
1 | # Version 7.0.0
2 | # Splunk app configuration file
3 |
4 | [install]
5 | is_configured = 0
6 |
7 | [ui]
8 | is_visible = 1
9 | label = CyberWorks
10 |
11 | [launcher]
12 | author = ASE Team
13 | description =
14 | version = 1.0
15 |
16 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query/metadata/default.meta:
--------------------------------------------------------------------------------
1 | []
2 | access = read : [ * ], write : [ * ]
3 |
4 | [props/sendmail/]
5 | export = system
6 | owner = nobody
7 |
8 | ### VIEWSTATES: even normal users should be able to create shared viewstates
9 |
10 | [viewstates]
11 | access = read : [ * ], write : [ * ]
12 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/conf/clx_blz_reader_conf.yaml:
--------------------------------------------------------------------------------
1 | input_format: csv
2 | tables:
3 | -
4 | input_path: "/rapids/my_data/movies"
5 | table_name: movies
6 | header: 0
7 | -
8 | input_path: "/rapids/my_data/ratings"
9 | table_name: ratings
10 | header: 0
11 | type: blazingsql
12 |
13 |
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/metadata/default.meta:
--------------------------------------------------------------------------------
1 | []
2 | access = read : [ * ], write : [ * ]
3 |
4 | [props/sendmail/]
5 | export = system
6 | owner = nobody
7 |
8 | ### VIEWSTATES: even normal users should be able to create shared viewstates
9 |
10 | [viewstates]
11 | access = read : [ * ], write : [ * ]
12 |
--------------------------------------------------------------------------------
/python/.flake8:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 |
3 | [flake8]
4 | exclude = factory.py,perfect_hash.py
5 | ignore =
6 | # line break before binary operator
7 | W503
8 | # whitespace before :
9 | E203
10 | # line too long (82 > 79 characters)
11 | E501
12 | # invalid escape sequence ‘x’
13 | W605
14 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | #python code owners
2 | clx/ @rapidsai/clx-python-codeowners
3 |
4 | #build/ops code owners
5 | .github/ @rapidsai/ops-codeowners
6 | ci/ @rapidsai/ops-codeowners
7 | conda/ @rapidsai/ops-codeowners
8 | **/Dockerfile @rapidsai/ops-codeowners
9 | **/.dockerignore @rapidsai/ops-codeowners
10 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Building Documentation
2 |
3 | A basic Python environment with the packages listed in `./requirement.txt` is
4 | enough to build the docs.
5 |
6 | ## Get additional dependencies
7 |
8 | ```bash
9 | pip install -r requirement.txt
10 | ```
11 |
12 | ## Run the makefile
13 |
14 | ```bash
15 | make html
16 | ```
17 |
18 | Output is written to `build/html/index.html`.
--------------------------------------------------------------------------------
/siem_integrations/clx_query/default/commands.conf:
--------------------------------------------------------------------------------
1 | # [commands.conf]($SPLUNK_HOME/etc/system/README/commands.conf.spec)
2 | [defaults]
3 |
4 | [clx]
5 | filename = clx_query.py
6 | enableheader = true
7 | outputheader = true
8 | requires_srinfo = true
9 | supports_getinfo = true
10 | supports_multivalues = true
11 | supports_rawargs = true
12 | stderr_dest = message
13 |
--------------------------------------------------------------------------------
/.github/labeler.yml:
--------------------------------------------------------------------------------
1 | # https://github.com/actions/labeler#common-examples
2 | # Adapted from https://github.com/rapidsai/clx/blob/main/.github/CODEOWNERS
3 | # Labels culled from https://github.com/rapidsai/clx/labels
4 |
5 | Python:
6 | - 'python/**'
7 | - 'notebooks/**'
8 |
9 | integration:
10 | - 'siem_integrations'
11 |
12 | gpuCI:
13 | - 'ci/**'
14 |
15 | conda:
16 | - 'conda/**'
17 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query/default/app.conf:
--------------------------------------------------------------------------------
1 | # Splunk app configuration file
2 |
3 | [ui]
4 | label = Clx Query
5 | is_visible = 1
6 |
7 | [launcher]
8 | description = This app can run a custom query against the clx python module, which internally triggers a workflow to retrieve data.
9 | author = CLX
10 | version = 0.13
11 |
12 | [package]
13 | id = clx_query
14 |
15 | [install]
16 | is_configured = 0
17 |
--------------------------------------------------------------------------------
/docker/.start_jupyter_run_in_rapids.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | . /opt/conda/etc/profile.d/conda.sh
3 | conda activate rapids
4 | /rapids/utils/start_jupyter.sh > /dev/null
5 | echo "Notebook server successfully started!"
6 | echo "To access visit http://localhost:8888 on your host machine."
7 | echo 'Ensure the following arguments to "docker run" are added to expose the server ports to your host machine:
8 | -p 8888:8888 -p 8787:8787 -p 8786:8786'
9 | exec "$@"
10 |
--------------------------------------------------------------------------------
/ci/integration_tests/Dockerfile.test:
--------------------------------------------------------------------------------
1 | ARG repository=rapidsai-dev-nightly
2 | ARG version=0.11-cuda10.0-devel-ubuntu18.04-py3.7
3 |
4 | FROM rapidsai/${repository}:${version}
5 |
6 | ADD . /clx/
7 |
8 | ADD ./ci/integration_tests/run_integration_test.py /clx/run_integration_test.py
9 |
10 | SHELL ["/bin/bash", "-c"]
11 | RUN source activate rapids \
12 | && cd /clx \
13 | && python setup.py install
14 |
15 | WORKDIR /clx
16 | CMD source activate rapids && python run_integration_test.py
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/timothycrosley/isort
3 | rev: 5.0.7
4 | hooks:
5 | - id: isort
6 | - repo: https://github.com/ambv/black
7 | rev: 19.10b0
8 | hooks:
9 | - id: black
10 | - repo: https://gitlab.com/pycqa/flake8
11 | rev: 3.8.3
12 | hooks:
13 | - id: flake8
14 | alias: flake8
15 | name: flake8
16 | args: ["--config=python/.flake8"]
17 | types: [python]
18 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clx_query_service/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for clx_query_service project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clx_query_service.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Common
2 | *.pyc
3 | *.a
4 | *.o
5 | *.so
6 | *.dylib
7 | .cache
8 | .coverage
9 | .vscode
10 | *.swp
11 | .DS_Store
12 |
13 | # Python
14 | __pycache__/
15 | .pytest_cache/
16 | build/
17 | dist/
18 | clx.egg-info/
19 | python/clx/analytics/*.cpp
20 |
21 | ## C++ build directories & artifacts
22 | CMakeFiles/
23 | Debug
24 | build/
25 | bin/
26 |
27 | # Dask
28 | dask-worker-space/
29 |
30 | # IDE
31 | .idea/
32 | *.iml
33 |
34 | # Test output
35 | clx/tests/output
36 | clx/tests/.config
37 | rnn_classifier_2020-06-08_20_48_03.pth
38 |
39 | # Jupyter
40 | .ipynb_checkpoints/
--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | import versioneer
4 |
5 |
6 | setup(
7 | name="clx",
8 | version=versioneer.get_version(),
9 | description="CLX",
10 | author="NVIDIA Corporation",
11 | packages=find_packages(include=["clx", "clx.*"]),
12 | package_data={
13 | "clx.analytics": ["resources/*.txt"],
14 | "clx.parsers": ["resources/*.yaml"],
15 | "clx.dns": ["resources/*.txt"],
16 | "clx.heuristics": ["resources/*.csv"]
17 | },
18 | license="Apache",
19 | cmdclass=versioneer.get_cmdclass()
20 | )
21 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/conf/clx_query_service.conf:
--------------------------------------------------------------------------------
1 | [supervisord]
2 | logfile = /tmp/supervisord.log
3 | logfile_maxbytes = 50MB
4 | logfile_backups=5
5 |
6 |
7 | [inet_http_server]
8 | port=127.0.0.1:9001
9 |
10 | [rpcinterface:supervisor]
11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
12 |
13 | [supervisorctl]
14 | serverurl=http://127.0.0.1:9001
15 |
16 |
17 | [program:clx_query_service]
18 | directory=/rapids/clx/siem_integrations/clx_query_service/bin
19 | command=bash start_service.sh -p 8998 -w 2 -t 60
20 | autostart=false
21 | autorestart=true
--------------------------------------------------------------------------------
/siem_integrations/clx_query/default/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys = root, ClxQuery
3 |
4 | [logger_root]
5 | level = INFO ; Default: WARNING
6 | handlers = stderr ; Default: stderr
7 |
8 | [logger_ClxQuery]
9 | qualname = ClxQuery
10 | level = INFO ; Default: WARNING
11 | handlers = stderr ; Default: stderr
12 |
13 | [handlers]
14 | keys=stderr
15 |
16 | [handler_stderr]
17 | class = logging.StreamHandler
18 | level = NOTSET
19 | args = (sys.stderr,)
20 | formatter = search_command
21 |
22 | [formatters]
23 | keys = search_command
24 |
25 | [formatter_search_command]
26 | format=%(levelname)s:%(module)s: %(message)s
27 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = clx
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/siem_integrations/clx_query/default/data/ui/nav/default.xml:
--------------------------------------------------------------------------------
1 |
2 |
20 |
--------------------------------------------------------------------------------
/ci/integration_tests/docker-compose.test.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | zookeeper:
4 | image: confluentinc/cp-zookeeper:latest
5 | environment:
6 | ZOOKEEPER_CLIENT_PORT: 2181
7 | ZOOKEEPER_TICK_TIME: 2000
8 | kafka:
9 | image: confluentinc/cp-kafka:latest
10 | depends_on:
11 | - zookeeper
12 | environment:
13 | KAFKA_BROKER_ID: 1
14 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
15 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
16 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
17 | clx:
18 | build:
19 | context: ../../
20 | dockerfile: ci/integration_tests/Dockerfile.test
21 | depends_on:
22 | - kafka
23 | ports:
24 | - "8888:8888"
--------------------------------------------------------------------------------
/ci/integration_tests/README.md:
--------------------------------------------------------------------------------
1 | # CLX Integration Testing
2 |
3 | CLX integrates with [Kafka](https://kafka.apache.org/) to read data from and write data to a Kafka queue. An integration test environment has been created to simulate and test this interaction.
4 |
5 | ## Running the Integration Test
6 |
7 | To run the integration test, run the following commands; this executes `run_integration_test.py`.
8 |
9 | ```
10 | cd ci/integration_tests
11 | docker-compose -f docker-compose.test.yml up
12 | ```
13 |
14 | Before re-running the integration tests, first tear down the existing Docker containers:
15 |
16 | ```
17 | cd ci/integration_tests
18 | docker-compose down
19 | ```
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Django's command-line utility for administrative tasks."""
3 | import os
4 | import sys
5 |
6 |
7 | def main():
8 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clx_query_service.settings")
9 | try:
10 | from django.core.management import execute_from_command_line
11 | except ImportError as exc:
12 | raise ImportError(
13 | "Couldn't import Django. Are you sure it's installed and "
14 | "available on your PYTHONPATH environment variable? Did you "
15 | "forget to activate a virtual environment?"
16 | ) from exc
17 | execute_from_command_line(sys.argv)
18 |
19 |
20 | if __name__ == "__main__":
21 | main()
22 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for CLX
4 | title: "[FEA]"
5 | labels: "? - Needs Triage, feature request"
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I wish I could use CLX to do [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context, code examples, or references to existing implementations about the feature request here.
--------------------------------------------------------------------------------
/python/clx/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | # Versioneer
15 | from ._version import get_versions
16 | __version__ = get_versions()['version']
17 |
18 | del get_versions
19 |
--------------------------------------------------------------------------------
/conda/environments/clx_dev_cuda11.5.yml:
--------------------------------------------------------------------------------
1 | name: clx_dev
2 | channels:
3 | - rapidsai
4 | - rapidsai-nightly
5 | - conda-forge
6 | dependencies:
7 | - cudatoolkit=11.5
8 | - python>=3.6,<3.9
9 | - cugraph=23.04.*
10 | - cuml=23.04.*
11 | - cuxfilter=23.04.*
12 | - scikit-learn=0.23.1
13 | - s3fs
14 | - ipywidgets
15 | - python-confluent-kafka
16 | - transformers=4.*
17 | - seqeval
18 | - python-whois
19 | - seaborn
20 | - requests
21 | - matplotlib
22 | - pip
23 | - pytest
24 | - faker
25 | - jupyterlab
26 | - sphinx
27 | - sphinx_rtd_theme
28 | - numpydoc
29 | - ipython
30 | - nbsphinx
31 | - pip:
32 | - "git+https://github.com/rapidsai/cudatashader.git"
33 | - "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src"
34 | - wget
35 | - mockito
36 | - torch==1.11.0
37 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | services:
3 | #JupyterLab will be available at port 9888
4 | clx:
5 | build: .
6 | ports:
7 | - "9888:8888"
8 | - "8787:8787"
9 | - "8686:8686"
10 | stdin_open: true
11 | tty: true
12 | runtime: nvidia
13 | #Zookeeper will be available at `zookeeper:2181`
14 | zookeeper:
15 | image: confluentinc/cp-zookeeper:latest
16 | environment:
17 | ZOOKEEPER_CLIENT_PORT: 2181
18 | ZOOKEEPER_TICK_TIME: 2000
19 | #Kafka will be available at `kafka:9092`
20 | kafka:
21 | image: confluentinc/cp-kafka:latest
22 | depends_on:
23 | - zookeeper
24 | environment:
25 | KAFKA_BROKER_ID: 1
26 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
29 |
--------------------------------------------------------------------------------
/python/clx/io/writer/writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 |
17 |
18 | class Writer(ABC):
19 | @abstractmethod
20 | def close(self):
21 | pass
22 |
23 | @abstractmethod
24 | def write_data(self):
25 | pass
26 |
--------------------------------------------------------------------------------
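The `Writer` ABC above only fixes the interface; the concrete implementations it anticipates (`fs_writer.py`, `kafka_writer.py`) exist in the tree but are not included in this excerpt. A minimal sketch of a conforming subclass, with `SimpleCsvWriter` invented here purely for illustration and assuming a cuDF or pandas dataframe:

```python
# Illustration only -- not part of the repository. Shows the minimum needed to
# satisfy the Writer ABC from clx/io/writer/writer.py.
from clx.io.writer.writer import Writer


class SimpleCsvWriter(Writer):
    def __init__(self, output_path):
        self._output_path = output_path

    def write_data(self, df):
        # df is assumed to be a cuDF or pandas DataFrame; both expose to_csv
        df.to_csv(self._output_path, index=False)

    def close(self):
        # nothing to release for a simple path-based writer
        pass
```

Because both abstract methods (`write_data` and `close`) are implemented, the class can be instantiated; omitting either would raise `TypeError` at construction time.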
/examples/streamz/python/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from setuptools import setup
16 |
17 | setup(
18 | name="clx_streamz_tools",
19 | version="0.1",
20 | author="NVIDIA Corporation",
21 | packages=["clx_streamz_tools"],
22 | include_package_data=True,
23 | )
24 |
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/splunk_wrapper/wrapper-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sudo -i
3 |
4 | SPLUNKHOME=/opt/splunk
5 |
6 | cd $SPLUNKHOME/bin
7 | mv splunk splunk.splunk
8 | cat << EOF > splunk.wrapper
9 | #!/bin/bash
10 |
11 | RETVAL=0
12 |
13 | switch_python_splunk() {
14 | echo Switching Python to Splunk distro...
15 | rm -f $SPLUNKHOME/bin/python2.7
16 | cp -a $SPLUNKHOME/bin/python2.7.splunk $SPLUNKHOME/bin/python2.7
17 | }
18 | switch_python_conda() {
19 | echo Switching Python to Miniconda distro...
20 | rm -f $SPLUNKHOME/bin/python2.7
21 | cp -a $SPLUNKHOME/bin/python2.7.conda $SPLUNKHOME/bin/python2.7
22 | }
23 |
24 | switch_python_splunk
25 | sleep 1
26 | $SPLUNKHOME/bin/splunk.splunk \$@
27 | RETVAL=\$?
28 | sleep 5
29 | switch_python_conda
30 |
31 | exit \$RETVAL
32 | EOF
33 | chmod 755 splunk.wrapper
34 | chown splunk:splunk splunk.wrapper
35 | ln -s splunk.wrapper splunk
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. clx documentation main file, created by
2 | sphinx-quickstart on Thu Oct 3 16:57:19 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to clx's documentation!
7 | ===============================
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 | :caption: Contents:
12 |
13 | api.rst
14 | 10min-clx.ipynb
15 | intro-clx-workflow.ipynb
16 | intro-clx-dga.ipynb
17 | intro-clx-streamz.ipynb
18 | intro-clx-asset-classification.ipynb
19 | intro-clx-cybert.ipynb
20 | intro-clx-loda-anomaly-detection.ipynb
21 | intro-clx-periodicity-detection.ipynb
22 | intro-clx-phishing-detection.ipynb
23 | intro-clx-predictive-maintenance.ipynb
24 |
25 | Indices and tables
26 | ==================
27 |
28 | * :ref:`genindex`
29 | * :ref:`modindex`
30 | * :ref:`search`
31 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import yaml
16 |
17 | """
18 | Utility script
19 | """
20 |
21 |
22 | def load_yaml(yaml_file):
23 | with open(yaml_file) as yaml_file:
24 | config = yaml.safe_load(yaml_file)
25 | return config
26 |
--------------------------------------------------------------------------------
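A small usage sketch for `load_yaml`, reading the `clx_blz_reader_conf.yaml` shown elsewhere in this dump. It assumes the working directory is `siem_integrations/clx_query_service` so that both the `clxquery` package and the relative `conf/` path resolve:

```python
# Illustration only -- not part of the repository.
from clxquery.utils import load_yaml

config = load_yaml("conf/clx_blz_reader_conf.yaml")
print(config["type"])                                # "blazingsql"
print([t["table_name"] for t in config["tables"]])   # ["movies", "ratings"]
```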
/siem_integrations/clx_query_service/clxquery/logging.conf:
--------------------------------------------------------------------------------
1 | [loggers]
2 | keys=root,applog
3 | [handlers]
4 | keys=rotateFileHandler,rotateConsoleHandler
5 |
6 | [formatters]
7 | keys=applog_format,console_format
8 |
9 | [formatter_applog_format]
10 | format=%(asctime)s-[%(levelname)-8s]:%(message)s
11 |
12 | [formatter_console_format]
13 | format=%(asctime)s-%(filename)s%(lineno)d[%(levelname)s]:%(message)s
14 |
15 | [logger_root]
16 | level=WARNING
17 | handlers=rotateFileHandler,rotateConsoleHandler
18 |
19 | [logger_applog]
20 | level=WARNING
21 | handlers=rotateFileHandler
22 | qualname=simple_example
23 |
24 | [handler_rotateFileHandler]
25 | class=handlers.RotatingFileHandler
26 | level=WARNING
27 | formatter=applog_format
28 | args=('applog.log', 'a', 10000, 9)
29 |
30 | [handler_rotateConsoleHandler]
31 | class=StreamHandler
32 | level=WARNING
33 | formatter=console_format
34 | args=(sys.stdout,)
35 |
--------------------------------------------------------------------------------
/ci/checks/style.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2019, NVIDIA CORPORATION.
3 | ################################################################################
4 | # clx Style Tester
5 | ################################################################################
6 |
7 | # Ignore errors and set path
8 | set +e
9 | PATH=/conda/bin:$PATH
10 |
11 | # Activate common conda env
12 | . /opt/conda/etc/profile.d/conda.sh
13 | conda activate rapids
14 |
15 | # Run flake8 and get results/return code
16 | FLAKE=`flake8 --ignore=E501,W605 --exclude="factory.py,perfect_hash.py" python`
17 | RETVAL=$?
18 |
19 | # Output results if failure otherwise show pass
20 | if [ "$FLAKE" != "" ]; then
21 | echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n"
22 | echo -e "$FLAKE"
23 | echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n"
24 | else
25 | echo -e "\n\n>>>> PASSED: flake8 style check\n\n"
26 | fi
27 |
28 | exit $RETVAL
29 |
--------------------------------------------------------------------------------
/python/clx/io/writer/file_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod
16 | from clx.io.writer.writer import Writer
17 |
18 |
19 | class FileWriter(Writer):
20 |
21 | @property
22 | def config(self):
23 | return self._config
24 |
25 | @abstractmethod
26 | def write_data(self):
27 | pass
28 |
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/splunk_wrapper/README.md:
--------------------------------------------------------------------------------
1 | # splunk_wrapper
2 |
3 | ## Overview
4 |
5 | This is a wrapper script that handles switching Python versions so that the start, stop, and restart commands work as expected from `init.d` and the Splunk Web UI.
6 |
7 | ## Pre-reqs
8 |
9 | 1. Install [Miniconda2](https://repo.continuum.io/miniconda/) in $SPLUNKHOME as the splunk user:
10 | ```
11 | sudo -i -u splunk bash
12 | # add the Miniconda path to ~/.bashrc
13 | ```
14 |
15 | 2. Backup the splunk python executable in `/opt/splunk/bin`:
16 | ```
17 | mv /opt/splunk/bin/python2.7 $SPLUNKHOME/bin/python2.7.splunk
18 | ```
19 |
20 | 3. Create symlink to Miniconda Python in `/opt/splunk/bin`:
21 | ```
22 | ln -s /opt/splunk/miniconda2/bin/python2.7 /opt/splunk/bin/python2.7.conda
23 | ```
24 |
25 | ## Install
26 |
27 | **NOTE:** Do not run this script twice, as it will remove Splunk's Python. This is an active area of development.
28 |
29 | Run `sudo bash wrapper-install.sh` to install.
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=clx
13 |
14 | if "%1" == "" goto help
15 |
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | echo.
19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | echo.installed, then set the SPHINXBUILD environment variable to point
21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | echo.may add the Sphinx directory to PATH.
23 | echo.
24 | echo.If you don't have Sphinx installed, grab it from
25 | echo.http://sphinx-doc.org/
26 | exit /b 1
27 | )
28 |
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 |
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 |
35 | :end
36 | popd
37 |
--------------------------------------------------------------------------------
/python/setup.cfg:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2018-2019, NVIDIA CORPORATION.
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | # See the docstring in versioneer.py for instructions. Note that you must
15 | # re-run 'versioneer.py setup' after changing this section, and commit the
16 | # resulting files.
17 |
18 | [versioneer]
19 | VCS = git
20 | style = pep440
21 | versionfile_source = clx/_version.py
22 | versionfile_build = clx/_version.py
23 | tag_prefix = v
24 | parentdir_prefix = clx-
--------------------------------------------------------------------------------
/python/clx/io/factory/abstract_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 |
17 |
18 | class AbstractFactory(ABC):
19 | @property
20 | def config(self):
21 | return self._config
22 |
23 | @config.setter
24 | def config(self, val):
25 | self._config = val
26 |
27 | @abstractmethod
28 | def get_reader(self):
29 | pass
30 |
31 | @abstractmethod
32 | def get_writer(self):
33 | pass
34 |
--------------------------------------------------------------------------------
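A minimal sketch of how `AbstractFactory` can be subclassed; it is not from the repository (the real implementations are in `fs_factory.py`, `dask_fs_factory.py`, and `kafka_factory.py`, whose contents are not shown here), and `InMemoryReader`/`InMemoryWriter` are hypothetical stand-ins used only to make the example self-contained:

```python
# Illustration only -- not part of the repository.
from clx.io.factory.abstract_factory import AbstractFactory


class InMemoryReader:                     # hypothetical reader stub
    def __init__(self, config):
        self.config = config

    def fetch_data(self):
        return self.config.get("data")

    def close(self):
        pass


class InMemoryWriter:                     # hypothetical writer stub
    def __init__(self, config):
        self.config = config

    def write_data(self, df):
        self.config.setdefault("written", []).append(df)

    def close(self):
        pass


class InMemoryFactory(AbstractFactory):
    def __init__(self, config):
        self.config = config              # uses the config setter defined on AbstractFactory

    def get_reader(self):
        return InMemoryReader(self.config)

    def get_writer(self):
        return InMemoryWriter(self.config)
```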
/python/clx/workflow/netflow_workflow.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from clx.workflow.workflow import Workflow
17 |
18 | log = logging.getLogger(__name__)
19 |
20 |
21 | class NetflowWorkflow(Workflow):
22 | def workflow(self, dataframe):
23 | """TODO: Implement netflow dataframe enrichment"""
24 | log.debug("Processing netflow workflow data...")
25 | dataframe["netflow_enriched"] = "netflow_enriched"
26 | return dataframe
27 |
--------------------------------------------------------------------------------
/siem_integrations/Dockerfile:
--------------------------------------------------------------------------------
1 | # An integration test & dev container based on rapids-dev-nightly with CLX installed from current branch
2 | ARG RAPIDS_VERSION=0.13
3 | ARG CUDA_VERSION=10.1
4 | ARG LINUX_VERSION=ubuntu18.04
5 | ARG PYTHON_VERSION=3.7
6 |
7 | FROM rapidsai/rapidsai-dev-nightly:${RAPIDS_VERSION}-cuda${CUDA_VERSION}-devel-${LINUX_VERSION}-py${PYTHON_VERSION}
8 |
9 | ADD . /rapids/clx/
10 |
11 | SHELL ["/bin/bash", "-c"]
12 |
13 | RUN apt update -y --fix-missing && \
14 | apt upgrade -y && \
15 | apt install -y vim
16 |
17 | RUN source activate rapids \
18 | && conda install -c blazingsql-nightly/label/cuda${CUDA_VERSION} -c blazingsql-nightly -c rapidsai-nightly -c conda-forge blazingsql
19 |
20 | RUN source activate rapids \
21 | && conda install -y -c pytorch pytorch==1.3.1 torchvision=0.4.2 datashader>=0.10.* panel=0.6.* geopandas>=0.6.* pyppeteer s3fs gunicorn djangorestframework django supervisor nginx \
22 | && pip install "git+https://github.com/rapidsai/cudatashader.git" \
23 | && cd /rapids/clx \
24 | && pip install -e .
25 |
26 | WORKDIR /rapids
27 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Documentation request
3 | about: Report incorrect or needed documentation
4 | title: "[DOC]"
5 | labels: "? - Needs Triage, doc"
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Report incorrect documentation
11 |
12 | **Location of incorrect documentation**
13 | Provide links and line numbers if applicable.
14 |
15 | **Describe the problems or issues found in the documentation**
16 | A clear and concise description of what you found to be incorrect.
17 |
18 | **Steps taken to verify documentation is incorrect**
19 | List any steps you have taken:
20 |
21 | **Suggested fix for documentation**
22 | Detail proposed changes to fix the documentation if you have any.
23 |
24 | ---
25 |
26 | ## Report needed documentation
27 |
28 | **Report needed documentation**
29 | A clear and concise description of what documentation you believe is needed and why.
30 |
31 | **Describe the documentation you'd like**
32 | A clear and concise description of what you want to happen.
33 |
34 | **Steps taken to search for needed documentation**
35 | List any steps you have taken:
--------------------------------------------------------------------------------
/python/clx/utils/data/dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | class Dataset(object):
17 | def __init__(self, df):
18 | self._df = df.reset_index(drop=True)
19 | self._dataset_len = self._df.shape[0]
20 |
21 | @property
22 | def length(self):
23 | """
24 | Returns dataframe length
25 | """
26 | return self._dataset_len
27 |
28 | @property
29 | def data(self):
30 | """
31 | Returns dataframe
32 | """
33 | return self._df
34 |
--------------------------------------------------------------------------------
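A short usage sketch for the `Dataset` wrapper above; it is illustrative only and assumes cuDF (the dataframe library the rest of CLX targets), though a pandas DataFrame behaves the same way here:

```python
# Illustration only -- not part of the repository.
import cudf

from clx.utils.data.dataset import Dataset

df = cudf.DataFrame({"domain": ["nvidia.com", "rapids.ai"], "label": [0, 0]})
ds = Dataset(df)

print(ds.length)        # 2 -- number of rows
print(ds.data.head())   # the wrapped dataframe, with its index reset
```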
/conda/recipes/clx/meta.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 |
3 | # Usage:
4 | # conda build -c conda-forge .
5 | {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
6 | {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
7 | {% set py_version=environ.get('CONDA_PY', 36) %}
8 | {% set cuda_version=environ.get('CUDA_REL', '0') %}
9 |
10 | package:
11 | name: clx
12 | version: {{ version }}
13 |
14 | source:
15 | git_url: ../../..
16 |
17 | build:
18 | number: {{ GIT_DESCRIBE_NUMBER }}
19 | string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
20 | script_env:
21 | - VERSION_SUFFIX
22 |
23 | requirements:
24 | build:
25 | - {{ compiler('c') }}
26 | - sysroot_{{ target_platform }} {{ sysroot_version }}
27 | host:
28 | - python
29 | run:
30 | - python
31 | - mkl
32 | - cugraph {{ minor_version }}.*
33 | - cuml {{ minor_version }}.*
34 |
35 | about:
36 | home: http://rapids.ai/
37 | license: Apache-2.0
38 | license_family: Apache
39 | license_file: LICENSE
40 | summary: clx library
41 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a bug report to help us improve CLX
4 | title: "[BUG]"
5 | labels: "? - Needs Triage, bug"
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **Steps/Code to reproduce bug**
14 | Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve it more quickly.
15 |
16 | **Expected behavior**
17 | A clear and concise description of what you expected to happen.
18 |
19 | **Environment overview (please complete the following information)**
20 | - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
21 | - Method of CLX install: [conda, Docker, or from source]
22 | - If method of install is [Docker], provide `docker pull` & `docker run` commands used
23 |
24 | **Environment details**
25 | Please run and paste the output of the `/rapids/cudf/print_env.sh` script here, to gather any other relevant environment details. The script is located in the docker container.
26 |
27 | **Additional context**
28 | Add any other context about the problem here.
--------------------------------------------------------------------------------
/python/clx/io/reader/reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABC, abstractmethod
16 |
17 |
18 | class Reader(ABC):
19 | @property
20 | def has_data(self):
21 | return self._has_data
22 |
23 | @has_data.setter
24 | def has_data(self, val):
25 | self._has_data = val
26 |
27 | @property
28 | def config(self):
29 | return self._config
30 |
31 | @config.setter
32 | def config(self, val):
33 | self._config = val
34 |
35 | @abstractmethod
36 | def close(self):
37 | pass
38 |
39 | @abstractmethod
40 | def fetch_data(self):
41 | pass
42 |
--------------------------------------------------------------------------------
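`Reader` only defines the `has_data`/`config` properties plus the abstract `close` and `fetch_data` methods, so a concrete implementation just fills those in. Below is a minimal, hypothetical in-memory reader for illustration; it is not part of CLX:

```python
import cudf
from clx.io.reader.reader import Reader


class InMemoryReader(Reader):
    """Toy reader that serves a single cuDF dataframe (illustrative only)."""

    def __init__(self, config, df):
        self._config = config
        self._df = df
        self._has_data = True

    def fetch_data(self):
        # Hand back the dataframe once, then report no more data
        self._has_data = False
        return self._df

    def close(self):
        pass


reader = InMemoryReader({"type": "in_memory"}, cudf.DataFrame({"a": [1, 2]}))
while reader.has_data:
    print(reader.fetch_data())
```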
/python/clx/io/reader/file_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod
16 | from clx.io.reader.reader import Reader
17 |
18 |
19 | class FileReader(Reader):
20 | @property
21 | def has_data(self):
22 | return self._has_data
23 |
24 | @has_data.setter
25 | def has_data(self, val):
26 | self._has_data = val
27 |
28 | @property
29 | def config(self):
30 | return self._config
31 |
32 | @config.setter
33 | def config(self, val):
34 | self._config = val
35 |
36 | @abstractmethod
37 | def close(self):
38 | pass
39 |
40 | @abstractmethod
41 | def fetch_data(self):
42 | pass
43 |
--------------------------------------------------------------------------------
/python/clx/io/factory/dask_fs_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from clx.io.factory.abstract_factory import AbstractFactory
16 | from clx.io.reader.dask_fs_reader import DaskFileSystemReader
17 |
18 |
19 | class DaskFileSystemFactory(AbstractFactory):
20 | def __init__(self, config):
21 | """
22 | Constructor method
23 |
24 | :param config: dictionary object of config values for **type**, **input_format**, **input_path**, and dask reader optional keyword args
25 | """
26 | self._config = config
27 |
28 | def get_reader(self):
29 | """
30 | Get instance of DaskFileSystemReader
31 | """
32 |
33 | return DaskFileSystemReader(self.config)
34 |
35 | def get_writer(self):
36 | raise NotImplementedError
37 |
--------------------------------------------------------------------------------
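A sketch of the kind of config dictionary the factory above expects, based on the keys named in its docstring; the `type` label and the extra reader keyword args are assumptions for illustration:

```python
from clx.io.factory.dask_fs_factory import DaskFileSystemFactory

config = {
    "type": "dask_fs",                    # assumed type label
    "input_format": "csv",
    "input_path": "/path/to/input/*.csv",
    # remaining keys are passed through to the underlying dask reader
    "names": ["domain", "type"],
}

reader = DaskFileSystemFactory(config).get_reader()  # DaskFileSystemReader
df = reader.fetch_data()
```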
/python/clx/analytics/dga_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | from clx.utils.data.dataset import Dataset
17 | from clx.utils.data import utils
18 |
19 | log = logging.getLogger(__name__)
20 |
21 |
22 | class DGADataset(Dataset):
23 | """Constructor to create DGADataset instance.
24 |
25 | :param df: Input dataframe.
26 | :type df: cudf.DataFrame
27 |     :param truncate: Truncate each domain string to at most this many characters.
28 | :type truncate: int
29 | """
30 |
31 | def __init__(self, df, truncate):
32 | df = self.__preprocess(df, truncate)
33 | super().__init__(df)
34 |
35 | def __preprocess(self, df, truncate):
36 | df['domain'] = df['domain'].str.slice_replace(truncate, repl='')
37 | df = utils.str2ascii(df, 'domain')
38 | return df
39 |
--------------------------------------------------------------------------------
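A short usage sketch for `DGADataset`, mirroring the shape of the data used by the tests elsewhere in this repo; assumes `clx` and `cudf` are installed:

```python
import cudf
from clx.analytics.dga_dataset import DGADataset

input_df = cudf.DataFrame(
    {"domain": ["studytour.com.tw", "cnn.com"], "type": [1, 1]}
)

# Truncate domains to at most 100 characters, then convert to ASCII codes
dataset = DGADataset(input_df, truncate=100)
print(dataset.length)  # 2
print(dataset.data)    # one int32 column per character position, plus "len"
```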
/python/clx/io/factory/fs_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from clx.io.factory.abstract_factory import AbstractFactory
16 | from clx.io.reader.fs_reader import FileSystemReader
17 | from clx.io.writer.fs_writer import FileSystemWriter
18 |
19 |
20 | class FileSystemFactory(AbstractFactory):
21 | def __init__(self, config):
22 | """
23 | Constructor method
24 |
25 |         :param config: dictionary object of config values for **type**, **input_format**, **input_path** (or **output_path**), and optional reader/writer keyword args
26 | """
27 | self._config = config
28 |
29 | def get_reader(self):
30 | """
31 | Get instance of FileSystemReader
32 | """
33 | return FileSystemReader(self.config)
34 |
35 | def get_writer(self):
36 | return FileSystemWriter(self.config)
37 |
--------------------------------------------------------------------------------
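An analogous sketch for `FileSystemFactory`, which also exposes a writer; the key values are illustrative and the `type` label is an assumption:

```python
from clx.io.factory.fs_factory import FileSystemFactory

config = {
    "type": "fs",                          # assumed type label
    "input_format": "parquet",
    "input_path": "/path/to/input.parquet",
}

factory = FileSystemFactory(config)
reader = factory.get_reader()              # FileSystemReader
writer = factory.get_writer()              # FileSystemWriter
df = reader.fetch_data()
```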
/python/clx/tests/test_anomaly_detection.py:
--------------------------------------------------------------------------------
1 | import cudf
2 |
3 | import clx.analytics.anomaly_detection
4 | import clx.features
5 |
6 |
7 | def test_anomaly_detection():
8 | df = cudf.DataFrame(
9 | {
10 | "time": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
11 | "user": [
12 | "u1",
13 | "u5",
14 | "u4",
15 | "u2",
16 | "u3",
17 | "u1",
18 | "u1",
19 | "u1",
20 | "u1",
21 | "u1",
22 | "u1",
23 | "u1",
24 | "u1",
25 | "u1",
26 | ],
27 | "computer": [
28 | "c1",
29 | "c1",
30 | "c5",
31 | "c1",
32 | "c1",
33 | "c3",
34 | "c1",
35 | "c1",
36 | "c2",
37 | "c3",
38 | "c1",
39 | "c1",
40 | "c4",
41 | "c5",
42 | ],
43 | }
44 | )
45 | fdf = clx.features.frequency(df, "user", "computer") # Create feature data
46 | actual = clx.analytics.anomaly_detection.dbscan(fdf, min_samples=2, eps=0.5)
47 | expected = cudf.Series([-1, -1], dtype="int32", index=None)
48 | expected.index = cudf.Series(["u1", "u4"])
49 | assert actual.equals(expected)
50 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # An integration test & dev container which builds and installs CLX from default branch
2 | ARG RAPIDS_VERSION=22.06
3 | ARG CUDA_VERSION=11.5
4 | ARG CUDA_SHORT_VERSION=${CUDA_VERSION}
5 | ARG LINUX_VERSION=ubuntu18.04
6 | ARG PYTHON_VERSION=3.8
7 | FROM rapidsai/rapidsai-dev-nightly:${RAPIDS_VERSION}-cuda${CUDA_VERSION}-devel-${LINUX_VERSION}-py${PYTHON_VERSION}
8 |
9 | # Add everything from the local build context
10 | ADD . /rapids/clx/
11 | RUN chmod -R ugo+w /rapids/clx/
12 |
13 | RUN source activate rapids && \
14 | gpuci_mamba_retry install -y -n rapids \
15 | "cudf_kafka=${RAPIDS_VER}" \
16 | "custreamz=${RAPIDS_VER}" \
17 |       "scikit-learn>=0.21" \
18 |       "nodejs>=12" \
19 | ipywidgets \
20 | python-confluent-kafka \
21 | seqeval \
22 | python-whois \
23 | seaborn \
24 | requests \
25 | matplotlib \
26 | pytest \
27 | jupyterlab=3.0 \
28 | faker && \
29 | pip install -U torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html && \
30 | pip install "git+https://github.com/rapidsai/cudatashader.git" && \
31 | pip install mockito && \
32 | pip install wget && \
33 | pip install "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src"
34 |
35 | # clx build/install
36 | RUN source activate rapids && \
37 | cd /rapids/clx/python && \
38 | python setup.py install
39 |
40 | WORKDIR /rapids/clx
41 |
--------------------------------------------------------------------------------
/ci/checks/changelog.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2018, NVIDIA CORPORATION.
3 | #########################
4 | # clx CHANGELOG Tester #
5 | #########################
6 |
7 | # Checkout main for comparison
8 | git checkout --force --quiet main
9 |
10 | # Switch back to tip of PR branch
11 | git checkout --force --quiet current-pr-branch
12 |
13 | # Ignore errors during searching
14 | set +e
15 |
16 | # Get list of modified files between main and PR branch
17 | CHANGELOG=`git diff --name-only main...current-pr-branch | grep CHANGELOG.md`
18 | # Check if CHANGELOG has PR ID
19 | PRNUM=`cat CHANGELOG.md | grep "$PR_ID"`
20 | RETVAL=0
21 |
22 | # Return status of check result
23 | if [ "$CHANGELOG" != "" -a "$PRNUM" != "" ] ; then
24 | echo -e "\n\n>>>> PASSED: CHANGELOG.md has been updated with current PR information.\n\nPlease ensure the update meets the following criteria.\n"
25 | else
26 | echo -e "\n\n>>>> FAILED: CHANGELOG.md has not been updated!\n\nPlease add a line describing this PR to CHANGELOG.md in the repository root directory. The line should meet the following criteria.\n"
27 | RETVAL=1
28 | fi
29 |
30 | cat << EOF
31 | It should be placed under the section for the appropriate release.
32 | It should be placed under "New Features", "Improvements", or "Bug Fixes" as appropriate.
33 | It should be formatted as '- PR #<number> <description>'
34 | Example format for #491 '- PR #491 Add CI test script to check for updates to CHANGELOG.md in PRs'
35 | EOF
36 |
37 | exit $RETVAL
38 |
--------------------------------------------------------------------------------
/examples/streamz/resources/cybert.yaml:
--------------------------------------------------------------------------------
1 | # cudf_engine currently supports only flattened json input data
2 | cudf_engine: false
3 | kafka_conf:
4 | input_topic: cybert_input
5 | output_topic: cybert_output
6 | # consumer topic partitions
7 | n_partitions: 1
8 | producer_conf:
9 | bootstrap.servers: localhost:9092
10 | session.timeout.ms: '10000'
11 | #queue.buffering.max.messages: '250000'
12 | #linger.ms: '100'
13 | #security.protocol: SASL_SSL
14 | #sasl.mechanism: PLAIN
15 | #ssl.ca.location:
16 | #sasl.username:
17 | #sasl.password:
18 | consumer_conf:
19 | bootstrap.servers: localhost:9092
20 | group.id: streamz
21 | session.timeout.ms: '60000'
22 | enable.partition.eof: 'true'
23 | auto.offset.reset: earliest
24 | #security.protocol: SASL_SSL
25 | #sasl.mechanism: PLAIN
26 | #ssl.ca.location:
27 | #sasl.username:
28 | #sasl.password:
29 | elasticsearch_conf:
30 | url: localhost #https://{}:{}@test.nvidia.com:{}/
31 | port: 9200
32 | # below properties are required if elasticsearch cluster is SSL enabled
33 | #cafile:
34 | #username:
35 | #password:
36 | index: cybert
37 | # other available sinks are "elasticsearch", "filesystem"
38 | sink: kafka
39 | # below properties are used when sink is set to filesystem
40 | col_delimiter: ','
41 | file_extension: '.csv'
42 | output_dir: '/your/output/newdir/path'
--------------------------------------------------------------------------------
/examples/streamz/resources/dga_detection.yaml:
--------------------------------------------------------------------------------
1 | # cudf_engine currently supports only flattened json input data
2 | cudf_engine: true
3 | kafka_conf:
4 | input_topic: dga_detection_input
5 | output_topic: dga_detection_output
6 | # consumer topic partitions
7 | n_partitions: 1
8 | producer_conf:
9 | bootstrap.servers: localhost:9092
10 | session.timeout.ms: '10000'
11 | #queue.buffering.max.messages: '250000'
12 | #linger.ms: '100'
13 | #security.protocol: SASL_SSL
14 | #sasl.mechanism: PLAIN
15 | #ssl.ca.location:
16 | #sasl.username:
17 | #sasl.password:
18 | consumer_conf:
19 | bootstrap.servers: localhost:9092
20 | group.id: streamz
21 | session.timeout.ms: '60000'
22 | enable.partition.eof: 'true'
23 | auto.offset.reset: earliest
24 | #security.protocol: SASL_SSL
25 | #sasl.mechanism: PLAIN
26 | #ssl.ca.location:
27 | #sasl.username:
28 | #sasl.password:
29 | elasticsearch_conf:
30 | url: localhost #https://{}:{}@test.nvidia.com:{}/
31 | port: 9200
32 | # below properties are required if elasticsearch cluster is SSL enabled
33 | #cafile:
34 | #username:
35 | #password:
36 | index: dga
37 | # other available sinks are "elasticsearch", "filesystem"
38 | sink: kafka
39 | # below properties are used when sink is set to filesystem
40 | col_delimiter: ','
41 | file_extension: '.csv'
42 | output_dir: '/your/output/newdir/path'
--------------------------------------------------------------------------------
/examples/streamz/resources/phishing_detection.yaml:
--------------------------------------------------------------------------------
1 | # cudf_engine currently supports only flattened json input data
2 | cudf_engine: false
3 | kafka_conf:
4 | input_topic: phising_detection_input
5 | output_topic: phising_detection_output
6 | # consumer topic partitions
7 | n_partitions: 1
8 | producer_conf:
9 | bootstrap.servers: localhost:9092
10 | session.timeout.ms: '10000'
11 | #queue.buffering.max.messages: '250000'
12 | #linger.ms: '100'
13 | #security.protocol: SASL_SSL
14 | #sasl.mechanism: PLAIN
15 | #ssl.ca.location:
16 | #sasl.username:
17 | #sasl.password:
18 | consumer_conf:
19 | bootstrap.servers: localhost:9092
20 | group.id: streamz
21 | session.timeout.ms: '60000'
22 | enable.partition.eof: 'true'
23 | auto.offset.reset: earliest
24 | #security.protocol: SASL_SSL
25 | #sasl.mechanism: PLAIN
26 | #ssl.ca.location:
27 | #sasl.username:
28 | #sasl.password:
29 | elasticsearch_conf:
30 | url: localhost #https://{}:{}@test.nvidia.com:{}/
31 | port: 9200
32 | # below properties are required if elasticsearch cluster is SSL enabled
33 | #cafile:
34 | #username:
35 | #password:
36 | index: phising_detection
37 | # other available sinks are "elasticsearch", "filesystem"
38 | sink: kafka
39 | # below properties are used when sink is set to filesystem
40 | col_delimiter: ','
41 | file_extension: '.csv'
42 | output_dir: '/your/output/newdir/path'
--------------------------------------------------------------------------------
/ci/cpu/upload.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh
4 |
5 | set -e
6 |
7 | # Setup 'gpuci_retry' for upload retries (results in 4 total attempts)
8 | export GPUCI_RETRY_MAX=3
9 | export GPUCI_RETRY_SLEEP=30
10 |
11 | # Set default label options if they are not defined elsewhere
12 | export LABEL_OPTION=${LABEL_OPTION:-"--label main"}
13 |
14 | # Skip uploads unless BUILD_MODE == "branch"
15 | if [ "${BUILD_MODE}" != "branch" ]; then
16 | echo "Skipping upload"
17 | return 0
18 | fi
19 |
20 | # Skip uploads if there is no upload key
21 | if [ -z "$MY_UPLOAD_KEY" ]; then
22 | echo "No upload key"
23 | return 0
24 | fi
25 |
26 | ################################################################################
27 | # SETUP - Get conda file output locations
28 | ################################################################################
29 |
30 | gpuci_logger "Get conda file output locations"
31 |
32 | export CLX_FILE=`conda build conda/recipes/clx --python=$PYTHON --output`
33 |
34 | ################################################################################
35 | # UPLOAD - Conda packages
36 | ################################################################################
37 |
38 | gpuci_logger "Starting conda uploads"
39 |
40 | if [[ "$BUILD_CLX" == "1" && "$UPLOAD_CLX" == "1" ]]; then
41 | test -e ${CLX_FILE}
42 | echo "Upload clx"
43 | echo ${CLX_FILE}
44 | gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CLX_FILE} --no-progress
45 | fi
46 |
47 |
--------------------------------------------------------------------------------
/python/clx/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | from clx.utils.data import utils
17 |
18 | test_domains_len = 2
19 | test_input_df = cudf.DataFrame(
20 | {"domain": ["studytour.com.tw", "cnn.com"], "type": [1, 1]}
21 | )
22 | expected_output_df = cudf.DataFrame(
23 | {
24 | 0: [115, 99],
25 | 1: [116, 110],
26 | 2: [117, 110],
27 | 3: [100, 46],
28 | 4: [121, 99],
29 | 5: [116, 111],
30 | 6: [111, 109],
31 | 7: [117, 0],
32 | 8: [114, 0],
33 | 9: [46, 0],
34 | 10: [99, 0],
35 | 11: [111, 0],
36 | 12: [109, 0],
37 | 13: [46, 0],
38 | 14: [116, 0],
39 | 15: [119, 0],
40 | "len": [16, 7]
41 | },
42 | dtype="int32"
43 | )
44 | expected_output_df["type"] = [1, 1]
45 | expected_output_df["domain"] = ["studytour.com.tw", "cnn.com"]
46 |
47 |
48 | def test_str2ascii():
49 | actual_output_df = utils.str2ascii(test_input_df, 'domain')
50 | assert actual_output_df.equals(expected_output_df)
51 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query/bin/clx_query_conf.py:
--------------------------------------------------------------------------------
1 | import splunk.admin as admin
2 | import splunk.entity as en
3 |
4 | """
5 | Copyright (C) 2005 - 2010 Splunk Inc. All Rights Reserved.
6 | Description: This skeleton python script handles the parameters in the configuration page.
7 |
8 | handleList method: lists configurable parameters in the configuration page
9 | corresponds to handleractions = list in restmap.conf
10 |
11 | handleEdit method: controls the parameters and saves the values
12 | corresponds to handleractions = edit in restmap.conf
13 |
14 | """
15 |
16 |
17 | class ConfigApp(admin.MConfigHandler):
18 | """
19 | Set up supported arguments
20 | """
21 |
22 | def setup(self):
23 | if self.requestedAction == admin.ACTION_EDIT:
24 | for arg in ["clx_hostname", "clx_port", "clx_query_limit"]:
25 | self.supportedArgs.addOptArg(arg)
26 |
27 | """
28 | Reads configuration from the custom file clx/default/clx_query_setup.conf.
29 | """
30 |
31 | def handleList(self, confInfo):
32 | confDict = self.readConf("clx_query_setup")
33 |         if confDict is not None:
34 | for stanza, settings in confDict.items():
35 | for key, val in settings.items():
36 | confInfo[stanza].append(key, val)
37 |
38 | def handleEdit(self, confInfo):
39 | name = self.callerArgs.id
40 | args = self.callerArgs
41 |
42 | self.writeConf("clx_query_setup", "setupentity", self.callerArgs.data)
43 |
44 |
45 | # initialize the handler
46 | admin.init(ConfigApp, admin.CONTEXT_NONE)
47 |
--------------------------------------------------------------------------------
/python/clx/tests/test_kafka_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import pytest
17 | from mockito import when, mock, verify
18 | from clx.io.writer.kafka_writer import KafkaWriter
19 |
20 | input_df = cudf.DataFrame(
21 | {
22 | "firstname": ["Emma", "Ava", "Sophia"],
23 | "lastname": ["Olivia", "Isabella", "Charlotte"],
24 | "gender": ["F", "F", "F"],
25 | }
26 | )
27 | kafka_topic = "publisher_topic_t1"
28 | batch_size = 100
29 | delimiter = ","
30 | producer = mock()
31 |
32 |
33 | @pytest.mark.parametrize("kafka_topic", [kafka_topic])
34 | @pytest.mark.parametrize("batch_size", [batch_size])
35 | @pytest.mark.parametrize("delimiter", [delimiter])
36 | @pytest.mark.parametrize("producer", [producer])
37 | @pytest.mark.parametrize("input_df", [input_df])
38 | def test_write_data(kafka_topic, batch_size, delimiter, producer, input_df):
39 | writer = KafkaWriter(kafka_topic, batch_size, delimiter, producer)
40 | when(writer.producer).__len__().thenReturn(1)
41 | writer.write_data(input_df)
42 | verify(writer.producer, times=3).produce(...)
43 |
--------------------------------------------------------------------------------
/python/clx/parsers/resources/splunk_notable_regex.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | time: '(^[0-9]+\.?[0-9]*),'
16 | search_name: 'search_name=\"([0-9A-Za-z\s\-\(\)]+)'
17 | orig_time: 'orig_time=\"([0-9]+\.[0-9]+)'
18 | urgency: 'urgency=\"([A-Za-z]+)'
19 | user: 'user=\"([A-Za-z0-9]+)'
20 | owner: 'owner=\"([\w@\.]+)'
21 | security_domain: 'security_domain=\"([A-Za-z]+)'
22 | severity: 'severity=\"([A-Za-z]+)'
23 | src_ip: 'src_ip=\"([\w\.\-]+)'
24 | src_ip2: 'src=\"([\w\.\-]+)'
25 | src_mac: 'smac=([\w\:]+)'
26 | src_port: 'src_port=\"(\d+)'
27 | dest_ip: 'dest_ip=\"([\w\.\-]+)'
28 | dest_ip2: 'dest=\"([\w\.\-]+)'
29 | dest_mac: 'dmac=([\w\:]+)'
30 | dest_port: 'dest_port=\"(\d+)'
31 | dest_priority: 'dest_priority="([A-Za-z]+)'
32 | device_name: 'Device Name:\s([0-9A-Za-z\_\-]+)'
33 | event_name: 'Event Name:\s([A-Za-z\_]+)'
34 | event_type: 'Event Type:\s([A-Za-z]+)'
35 | ip_address: 'IP Address:\s\(([0-9\.]+)'
36 | message_ip: 'message.ip="([\w\.]+)'
37 | message_hostname: 'message.hostname="([\w\.]+)'
38 | message_username: 'message.user_name="([\w\.\@]+)'
39 | message_description: 'message.description="([\w\.\s]+)'
--------------------------------------------------------------------------------
/python/clx/tests/test_dga_dataset.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | from clx.analytics.dga_dataset import DGADataset
17 |
18 | test_domains_len = 2
19 | test_batchsize = 1
20 | test_input_df = cudf.DataFrame(
21 | {"domain": ["studytour.com.tw", "cnn.com"], "type": [1, 1]}
22 | )
23 |
24 | expected_output_df = cudf.DataFrame(
25 | {
26 | 0: [115, 99],
27 | 1: [116, 110],
28 | 2: [117, 110],
29 | 3: [100, 46],
30 | 4: [121, 99],
31 | 5: [116, 111],
32 | 6: [111, 109],
33 | 7: [117, 0],
34 | 8: [114, 0],
35 | 9: [46, 0],
36 | 10: [99, 0],
37 | 11: [111, 0],
38 | 12: [109, 0],
39 | 13: [46, 0],
40 | 14: [116, 0],
41 | 15: [119, 0],
42 | "len": [16, 7],
43 | },
44 | dtype="int32"
45 | )
46 | expected_output_df["type"] = [1, 1]
47 | expected_output_df["domain"] = ["studytour.com.tw", "cnn.com"]
48 |
49 |
50 | def test_detector_dataset():
51 | dataset = DGADataset(test_input_df, 100)
52 | assert dataset.length == 2
53 | assert dataset.data.equals(expected_output_df)
54 |
--------------------------------------------------------------------------------
/python/clx/analytics/anomaly_detection.py:
--------------------------------------------------------------------------------
1 | import cudf
2 | import cuml
3 |
4 |
5 | def dbscan(feature_dataframe, min_samples=3, eps=0.3):
6 | """
7 |     Pass a feature dataframe to this function to detect anomalies. This function uses ``cuML`` DBSCAN for clustering
8 |     and returns only the entries labeled as anomalies (label -1).
9 |
10 | Parameters
11 | ----------
12 | :param feature_dataframe: Feature dataframe to be used for clustering
13 | :type feature_dataframe: cudf.DataFrame
14 | :param min_samples: Minimum samples to use for dbscan
15 | :type min_samples: int
16 | :param eps: Max distance to use for dbscan
17 | :type eps: float
18 |
19 | Examples
20 | --------
21 | >>> import cudf
22 | >>> import clx.features
23 | >>> import clx.analytics.anomaly_detection
24 | >>> df = cudf.DataFrame(
25 | >>> {
26 | >>> "time": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
27 | >>> "user": ["u1","u1","u1","u1","u1","u1","u1","u1","u1","u1","u5","u4","u2","u3"],
28 | >>> "computer": ["c1","c2","c3","c1","c2","c3","c1","c1","c2","c3","c1","c1","c5","c6"],
29 | >>> }
30 | >>> )
31 | >>> feature_df = clx.features.frequency(df, entity_id="user", feature_id="computer")
32 | >>> labels = clx.analytics.anomaly_detection.dbscan(feature_df, min_samples=2, eps=0.5)
33 | >>> labels
34 | 0 -1
35 | 1 -1
36 | 2 -1
37 | dtype: int32
38 | """
39 | dbscan = cuml.cluster.DBSCAN(eps=eps, min_samples=min_samples)
40 | dbscan.fit(feature_dataframe)
41 | # return anomalies only
42 | labels = cudf.Series(dbscan.labels_)
43 | anomalies = labels[labels == -1]
44 | return anomalies
45 |
--------------------------------------------------------------------------------
/python/clx/analytics/periodicity_detection.py:
--------------------------------------------------------------------------------
1 | import cupy as cp
2 |
3 |
4 | def to_periodogram(signal):
5 | """
6 | Returns periodogram of signal for finding frequencies that have high energy.
7 |
8 | :param signal: signal (time domain)
9 | :type signal: cudf.Series
10 | :return: CuPy array representing periodogram
11 | :rtype: cupy.ndarray
12 | """
13 |
14 | # convert cudf series to cupy array
15 | signal_cp = cp.fromDlpack(signal.to_dlpack())
16 |
17 | # standardize the signal
18 | signal_cp_std = (signal_cp - cp.mean(signal_cp)) / cp.std(signal_cp)
19 |
20 | # take fourier transform of signal
21 | FFT_data = cp.fft.fft(signal_cp_std)
22 |
23 | # create periodogram
24 | prdg = (1 / len(signal)) * ((cp.absolute(FFT_data)) ** 2)
25 |
26 | return prdg
27 |
28 |
29 | def filter_periodogram(prdg, p_value):
30 | """
31 | Select important frequencies by filtering periodogram by p-value. Filtered out frequencies are set to zero.
32 |
33 | :param prdg: periodogram to be filtered
34 |     :type prdg: cupy.ndarray
35 | :param p_value: p-value to filter by
36 |     :type p_value: float
37 | :return: CuPy array representing periodogram
38 | :rtype: cupy.ndarray
39 | """
40 |
41 | filtered_prdg = cp.copy(prdg)
42 | filtered_prdg[filtered_prdg < (cp.mean(filtered_prdg) * (-1) * (cp.log(p_value)))] = 0
43 |
44 | return filtered_prdg
45 |
46 |
47 | def to_time_domain(prdg):
48 | """
49 | Convert the signal back to time domain.
50 |
51 | :param prdg: periodogram (frequency domain)
52 | :type prdg: cupy.ndarray
53 | :return: CuPy array representing reconstructed signal
54 | :rtype: cupy.ndarray
55 | """
56 |
57 | acf = cp.abs(cp.fft.ifft(prdg))
58 |
59 | return acf
60 |
--------------------------------------------------------------------------------
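A minimal end-to-end sketch chaining the three functions above: periodogram, p-value filtering, and reconstruction. The synthetic signal is illustrative only:

```python
import cudf
import clx.analytics.periodicity_detection as pdd

# Synthetic event-count signal that repeats every 8 samples
signal = cudf.Series([10, 2, 3, 1, 2, 3, 2, 1] * 12, dtype="float64")

prdg = pdd.to_periodogram(signal)                       # energy per frequency
filtered = pdd.filter_periodogram(prdg, p_value=0.05)   # keep significant frequencies
reconstructed = pdd.to_time_domain(filtered)            # back to the time domain
```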
/ci/utils/nbtest.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | MAGIC_OVERRIDE_CODE="
4 | def my_run_line_magic(*args, **kwargs):
5 | g=globals()
6 | l={}
7 | for a in args:
8 | try:
9 | exec(str(a),g,l)
10 | except Exception as e:
11 | print('WARNING: %s\n While executing this magic function code:\n%s\n continuing...\n' % (e, a))
12 | else:
13 | g.update(l)
14 |
15 | def my_run_cell_magic(*args, **kwargs):
16 | my_run_line_magic(*args, **kwargs)
17 |
18 | get_ipython().run_line_magic=my_run_line_magic
19 | get_ipython().run_cell_magic=my_run_cell_magic
20 |
21 | "
22 |
23 | NO_COLORS=--colors=NoColor
24 | EXITCODE=0
25 | NBTMPDIR="$WORKSPACE/tmp"
26 | mkdir -p ${NBTMPDIR}
27 |
28 | for nb in $*; do
29 | NBFILENAME=$1
30 | NBNAME=${NBFILENAME%.*}
31 | NBNAME=${NBNAME##*/}
32 | NBTESTSCRIPT=${NBTMPDIR}/${NBNAME}-test.py
33 | shift
34 |
35 | echo --------------------------------------------------------------------------------
36 | echo STARTING: ${NBNAME}
37 | echo --------------------------------------------------------------------------------
38 | jupyter nbconvert --to script ${NBFILENAME} --output ${NBTMPDIR}/${NBNAME}-test
39 | echo "${MAGIC_OVERRIDE_CODE}" > ${NBTMPDIR}/tmpfile
40 | cat ${NBTESTSCRIPT} >> ${NBTMPDIR}/tmpfile
41 | mv ${NBTMPDIR}/tmpfile ${NBTESTSCRIPT}
42 |
43 | echo "Running \"ipython ${NO_COLORS} ${NBTESTSCRIPT}\" on $(date)"
44 | echo
45 | time bash -c "ipython ${NO_COLORS} ${NBTESTSCRIPT}; EC=\$?; echo -------------------------------------------------------------------------------- ; echo DONE: ${NBNAME}; exit \$EC"
46 | NBEXITCODE=$?
47 | echo EXIT CODE: ${NBEXITCODE}
48 | echo
49 | EXITCODE=$((EXITCODE | ${NBEXITCODE}))
50 | done
51 |
52 | exit ${EXITCODE}
53 |
--------------------------------------------------------------------------------
/python/clx/tests/test_features.py:
--------------------------------------------------------------------------------
1 | import cudf
2 | import pytest
3 |
4 | import clx.features
5 |
6 | df = cudf.DataFrame(
7 | {
8 | "time": [1, 2, 3, 4, 5, 6, 7],
9 | "user": ["u1", "u2", "u3", "u1", "u1", "u2", "u1"],
10 | "computer": ["c1", "c2", "c3", "c1", "c2", "c3", "c1"],
11 | }
12 | )
13 |
14 |
15 | def test_binary_features():
16 | actual = clx.features.binary(df, "user", "computer")
17 | expected = cudf.DataFrame(
18 | {"user": ["u1", "u2", "u3"], "c1": [1, 0, 0], "c2": [1, 1, 0], "c3": [0, 1, 1]}
19 | )
20 | expected = expected.set_index("user")
21 | expected["c1"] = expected["c1"].astype("int32")
22 | expected["c2"] = expected["c2"].astype("int32")
23 | expected["c3"] = expected["c3"].astype("int32")
24 | expected.columns = cudf.MultiIndex(
25 | names=[None, "computer"],
26 | codes=[[0, 0, 0], [0, 1, 2]],
27 | levels=[["time"], ["c1", "c2", "c3"]],
28 | )
29 | assert expected.equals(actual)
30 |
31 |
32 | def test_binary_exception():
33 | with pytest.raises(Exception):
34 | clx.features.binary(df, "user", "a")
35 |
36 |
37 | def test_frequency_features():
38 | actual = clx.features.frequency(df, "user", "computer")
39 | expected = cudf.DataFrame(
40 | {
41 | "user": ["u1", "u2", "u3"],
42 | "c1": [0.75, 0.00, 0.00],
43 | "c2": [0.25, 0.50, 0.0],
44 | "c3": [0.0, 0.5, 1.0],
45 | }
46 | )
47 | expected = expected.set_index("user")
48 | expected.columns = cudf.MultiIndex(
49 | names=[None, "computer"],
50 | codes=[[0, 0, 0], [0, 1, 2]],
51 | levels=[["time"], ["c1", "c2", "c3"]],
52 | )
53 | assert expected.equals(actual)
54 |
55 |
56 | def test_frequency_exception():
57 | with pytest.raises(Exception):
58 | clx.features.frequency(df, "a", "computer")
59 |
--------------------------------------------------------------------------------
/python/clx/eda/analysis.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 | from abc import ABC, abstractmethod
17 |
18 |
19 | class Analysis(ABC):
20 | def __init__(self, dataframe):
21 | self._analysis = self._generate_analysis(dataframe)
22 | self._charts = self._generate_charts(dataframe)
23 |
24 | @property
25 | def analysis(self):
26 | return self._analysis
27 |
28 | @property
29 | def charts(self):
30 | return self._charts
31 |
32 | @abstractmethod
33 | def _generate_analysis(self, dataframe):
34 | """Abstract function intended to create a dictionary summarizing analysis results of the dataframe"""
35 | pass
36 |
37 | @abstractmethod
38 | def _generate_charts(self, dataframe):
39 |         """Abstract function intended to create a list of cuxfilter charts"""
40 | pass
41 |
42 | def to_json(self):
43 | """Get json version of analysis results"""
44 | return json.dumps(self.analysis, indent=2)
45 |
46 | def save_analysis(self, output_filepath):
47 | """Save analysis to a json file
48 | TODO: Expand to other output types"""
49 | formatted_output = self.to_json()
50 | with open(output_filepath + ".json", "w") as file:
51 | file.write(formatted_output)
52 |
--------------------------------------------------------------------------------
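Since `Analysis` is abstract, a subclass only needs `_generate_analysis` and `_generate_charts`. A toy, hypothetical subclass (not part of CLX) for illustration:

```python
import cudf
from clx.eda.analysis import Analysis


class ColumnCountAnalysis(Analysis):
    """Illustrative analysis that records basic shape information."""

    def _generate_analysis(self, dataframe):
        # Plain dictionary so that to_json() works out of the box
        return {"columns": len(dataframe.columns), "rows": len(dataframe)}

    def _generate_charts(self, dataframe):
        # No charts for this toy example
        return []


summary = ColumnCountAnalysis(cudf.DataFrame({"a": [1, 2, 3]}))
print(summary.to_json())
summary.save_analysis("/tmp/shape_summary")  # writes /tmp/shape_summary.json
```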
/python/clx/tests/test_dataloader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import cudf
15 | from clx.utils.data.dataset import Dataset
16 | from clx.utils.data.dataloader import DataLoader
17 |
18 | test_batchsize = 2
19 | test_df = cudf.DataFrame(
20 | {
21 | "domain": [
22 | "studytour.com.tw",
23 | "cnn.com",
24 | "bakercityherald.com",
25 | "bankmobile.com",
26 | ],
27 | "type": [1, 1, 0, 1],
28 | }
29 | )
30 | expected_part_df1 = cudf.DataFrame(
31 | {
32 | "domain": [
33 | "studytour.com.tw",
34 | "cnn.com",
35 | ],
36 | "type": [1, 1],
37 | }
38 | )
39 |
40 | expected_part_df2 = cudf.DataFrame(
41 | {
42 | "domain": [
43 | "bakercityherald.com",
44 | "bankmobile.com",
45 | ],
46 | "type": [0, 1],
47 | }
48 | )
49 | dataset = Dataset(test_df)
50 | dataloader = DataLoader(dataset, batchsize=test_batchsize)
51 |
52 |
53 | def test_get_chunks():
54 | df_parts = []
55 | for df_part in dataloader.get_chunks():
56 | df_parts.append(df_part)
57 | assert len(df_parts) == 2
58 | assert df_parts[0].reset_index(drop=True).equals(expected_part_df1)
59 | assert df_parts[1].reset_index(drop=True).equals(expected_part_df2)
60 |
--------------------------------------------------------------------------------
/python/clx/tests/test_whois.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 | import datetime
17 | import whois
18 | from clx.osi.whois import WhoIsLookupClient
19 | from mockito import when
20 |
21 |
22 | domains = ["nvidia.com"]
23 | datetime_1 = datetime.datetime(2020, 5, 17)
24 | datetime_2 = datetime.datetime(2020, 5, 18)
25 | client = WhoIsLookupClient()
26 |
27 | response = {
28 | "domain_name": "NVIDIA.COM",
29 | "registrar": "Safenames Ltd",
30 | "emails": [
31 | "abuse@safenames.net",
32 | "wadmpfvzi5ei@idp.email",
33 | "hostmaster@safenames.net",
34 | ],
35 | "updated_date": [datetime_1, datetime_2],
36 | }
37 |
38 |
39 | @pytest.mark.parametrize("client", [client])
40 | @pytest.mark.parametrize("domains", [domains])
41 | def test_whois(client, domains):
42 | expected_output = [{
43 | "domain_name": "NVIDIA.COM",
44 | "registrar": "Safenames Ltd",
45 | "emails": "abuse@safenames.net,wadmpfvzi5ei@idp.email,hostmaster@safenames.net",
46 | "updated_date": "05-17-2020 00:00:00,05-18-2020 00:00:00",
47 | }]
48 | when(whois).whois(...).thenReturn(response)
49 | actual_output = client.whois(domains)
50 | assert actual_output[0]["domain_name"] == "NVIDIA.COM"
51 | assert len(actual_output) == len(domains)
52 | assert actual_output == expected_output
53 |
--------------------------------------------------------------------------------
/python/clx/tests/test_event_parser.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | from clx.parsers.event_parser import EventParser
17 |
18 |
19 | class TestEventParserImpl(EventParser):
20 | def parse(self, dataframe, raw_column):
21 | return None
22 |
23 |
24 | class TestEventParser(object):
25 | def setup(self):
26 | # Create Test Event Parser Implementation
27 | event_name = "eventName"
28 | columns = ["eventTypeId", "username"]
29 | self.event_regex = {
30 | "eventTypeId": r"eventTypeId: ([0-9$]+)",
31 | "username": r"username: ([a-z\.\-0-9$]+)",
32 | }
33 | self.event_parser = TestEventParserImpl(columns, event_name)
34 |
35 | def test_parse_raw_event(self):
36 | test_dataframe = cudf.DataFrame(
37 | {
38 | "Raw": [
39 | "eventTypeId: 1 \\nusername: foo",
40 | "eventTypeId: 1 \\nusername: bar",
41 | ]
42 | }
43 | )
44 | parsed_dataframe = self.event_parser.parse_raw_event(
45 | test_dataframe, "Raw", self.event_regex
46 | )
47 | expected_parsed_dataframe = cudf.DataFrame(
48 | {"eventTypeId": ["1", "1"], "username": ["foo", "bar"]}
49 | )
50 |
51 | assert parsed_dataframe.equals(expected_parsed_dataframe)
52 |
--------------------------------------------------------------------------------
/python/clx/utils/data/dataloader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | log = logging.getLogger(__name__)
18 |
19 |
20 | class DataLoader(object):
21 | """
22 |     Wrapper class used to return dataframe partitions based on batchsize.
23 | """
24 |
25 | def __init__(self, dataset, batchsize=1000):
26 | """Constructor to create dataframe partitions.
27 |
28 |         :param dataset: Input dataset.
29 |         :type dataset: Dataset
30 |         :param batchsize: Number of records per partition.
31 |         :type batchsize: int
32 | """
33 | self.__dataset = dataset
34 | self.__batchsize = batchsize
35 |
36 | @property
37 | def dataset_len(self):
38 | return self.__dataset.length
39 |
40 | @property
41 | def dataset(self):
42 | return self.__dataset
43 |
44 | def get_chunks(self):
45 |         """A generator function that yields chunks of the original input dataframe based on batchsize.
46 | :return: Partitioned dataframe.
47 | :rtype: cudf.DataFrame
48 | """
49 | prev_chunk_offset = 0
50 | while prev_chunk_offset < self.__dataset.length:
51 | curr_chunk_offset = prev_chunk_offset + self.__batchsize
52 | chunk = self.__dataset.data[prev_chunk_offset:curr_chunk_offset:1]
53 | prev_chunk_offset = curr_chunk_offset
54 | yield chunk
55 |
--------------------------------------------------------------------------------
/python/clx/analytics/model/tabular_model.py:
--------------------------------------------------------------------------------
1 | # Original code at https://github.com/spro/practical-pytorch
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class TabularModel(nn.Module):
7 | "Basic model for tabular data"
8 |
9 | def __init__(self, emb_szs, n_cont, out_sz, layers, drops,
10 | emb_drop, use_bn, is_reg, is_multi):
11 | super().__init__()
12 |
13 | self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
14 | self.emb_drop = nn.Dropout(emb_drop)
15 | self.bn_cont = nn.BatchNorm1d(n_cont)
16 | n_emb = sum(e.embedding_dim for e in self.embeds)
17 | self.n_emb, self.n_cont = n_emb, n_cont
18 | sizes = [n_emb + n_cont] + layers + [out_sz]
19 | actns = [nn.ReLU(inplace=True)] * (len(sizes) - 2) + [None]
20 | layers = []
21 | for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] + drops, actns)):
22 | layers += self._bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act)
23 | self.layers = nn.Sequential(*layers)
24 |
25 | def forward(self, x_cat, x_cont):
26 | if self.n_emb != 0:
27 | x = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
28 | x = torch.cat(x, 1)
29 | x = self.emb_drop(x)
30 | if self.n_cont != 0:
31 | if self.n_cont == 1:
32 | x_cont = x_cont.unsqueeze(1)
33 | x_cont = self.bn_cont(x_cont)
34 | x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont
35 | x = self.layers(x)
36 | return x.squeeze()
37 |
38 | def _bn_drop_lin(self, n_in, n_out, bn, p, actn):
39 | "Sequence of batchnorm (if `bn`), dropout (with `p`) and linear (`n_in`,`n_out`) layers followed by `actn`."
40 | layers = [nn.BatchNorm1d(n_in)] if bn else []
41 | if p != 0:
42 | layers.append(nn.Dropout(p))
43 | layers.append(nn.Linear(n_in, n_out))
44 | if actn is not None:
45 | layers.append(actn)
46 | return layers
47 |
--------------------------------------------------------------------------------
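A minimal instantiation sketch for `TabularModel`; all sizes below are illustrative and assume `clx` is installed alongside PyTorch:

```python
import torch
from clx.analytics.model.tabular_model import TabularModel

# Two categorical columns (vocab sizes 10 and 8), two continuous columns,
# two hidden layers, and a two-class output
model = TabularModel(
    emb_szs=[(10, 4), (8, 3)], n_cont=2, out_sz=2,
    layers=[64, 32], drops=[0.1, 0.1], emb_drop=0.05,
    use_bn=True, is_reg=False, is_multi=False,
)

x_cat = torch.randint(0, 8, (16, 2))   # integer category codes, one column per embedding
x_cont = torch.randn(16, 2)            # continuous features
logits = model(x_cat, x_cont)          # shape: (16, 2)
```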
/python/clx/utils/data/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import logging
17 |
18 | log = logging.getLogger(__name__)
19 |
20 |
21 | def str2ascii(df, col_name):
22 | """
23 |     This function sorts domain name entries in descending order by length and converts the domain names to ASCII character codes.
24 |
25 | :param df: Domains which requires conversion.
26 | :type df: cudf.DataFrame
27 | :param col_name: Name of the column that needs to be transformed.
28 | :type col_name: str
29 | :return: Ascii character converted information.
30 | :rtype: cudf.DataFrame
31 | """
32 | df["len"] = df[col_name].str.len()
33 | df = df.sort_values("len", ascending=False)
34 | split_ser = df[col_name].str.findall("[\w\W\d\D\s\S]")
35 | split_df = split_ser.to_frame()
36 | split_df = cudf.DataFrame(split_df[col_name].to_arrow().to_pylist())
37 | columns_cnt = len(split_df.columns)
38 |
39 | # Replace null's with ^.
40 | split_df = split_df.fillna("^")
41 | temp_df = cudf.DataFrame()
42 | for col in range(0, columns_cnt):
43 | temp_df[col] = split_df[col].str.code_points()
44 | del split_df
45 |
46 | # Replace ^ ascii value 94 with 0.
47 | temp_df = temp_df.replace(94, 0)
48 | temp_df.index = df.index
49 | temp_df["len"] = df["len"]
50 | if "type" in df.columns:
51 | temp_df["type"] = df["type"]
52 | temp_df[col_name] = df[col_name]
53 | return temp_df
54 |
--------------------------------------------------------------------------------
/python/clx/parsers/zeek.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 |
17 | type_dict = {
18 | "bool": "bool",
19 | "count": "int64",
20 | "int": "int64",
21 | "double": "float64",
22 | "time": "float64",
23 | "interval": "float64",
24 | "string": "str",
25 | "pattern": "str",
26 | "port": "int64",
27 | "addr": "str",
28 | "subnet": "str",
29 | "enum": "str",
30 | "function": "str",
31 | "event": "str",
32 | "hook": "str",
33 | "file": "str",
34 | "opaque": "str",
35 | "any": "str",
36 | }
37 |
38 |
39 | def parse_log_file(filepath):
40 | """Parse Zeek log file and return cuDF dataframe. Uses header comments to get column names/types and configure parser.
41 |
42 | :param filepath: filepath for Zeek log file
43 | :type filepath: string
44 | :return: Zeek log dataframe
45 | :rtype: cudf.DataFrame
46 | """
47 | header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8)
48 | lines_gdf = header_gdf["line"].str.split()
49 |
50 | column_names = lines_gdf.to_pandas().iloc[6][1:].tolist()
51 | column_types = lines_gdf.to_pandas().iloc[7][1:].tolist()
52 | column_dtypes = list(map(lambda x: type_dict.get(x, "str"), column_types))
53 |
54 | log_gdf = cudf.read_csv(
55 | filepath,
56 | delimiter="\t",
57 | dtype=column_dtypes,
58 | names=column_names,
59 | skiprows=8,
60 | skipfooter=1,
61 | )
62 | return log_gdf
63 |
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/README.md:
--------------------------------------------------------------------------------
1 | # export2kafka
2 |
3 | ## Overview
4 |
5 | This is a Splunk App that installs `export2kafka` that enables data export from Splunk to a running Kafka instance.
6 |
7 | ## Prerequisites
8 |
9 | 1. Install Kafka libraries:
10 | ```
11 | sudo -i -u splunk bash
12 | source activate root
13 | conda install -c conda-forge python-confluent-kafka
14 | conda remove python-confluent-kafka
15 | conda install -c conda-forge librdkafka=0.11.0
16 | conda install -f -c conda-forge python-confluent-kafka
17 | ```
18 | 2. Setup `/etc/hosts` for the Kafka brokers
19 |
20 | ## Install
21 |
22 | 1. Git clone this repo into `$SPLUNKHOME/etc/apps`
23 | 2. Copy `splunklib` from [splunk-sdk-python](https://github.com/splunk/splunk-sdk-python) to `$SPLUNKHOME/etc/apps`. Use the tag version that matches your Splunk installation. *Note: the application was tested with Splunk 1.6.x*.
24 | 3. Go to `http://$SPLUNKURL/en-us/debug/refresh`
25 | 4. Click the "Refresh" button to load the app into the Web UI
26 |
27 | ## Usage
28 | ### Config Options
29 | **broker**
30 | Usage - set a Kafka broker to use for bootstrap
31 | Required? - YES
32 | Format - host:port
33 | Example - broker=10.0.0.0:9092
34 |
35 | **topic**
36 | Usage - set the Kafka topic to publish to
37 | Required? - YES
38 | Format - topic name
39 | Example - topic=data_raw
40 |
41 | **batch**
42 | Usage - set the batch size before calling poll on producer
43 | Required? - NO
44 | Format - integer
45 | Default - 2000 records
46 | Example - batch=2000
47 |
48 | **timeout**
49 | Usage - set the timeout of the export in minutes
50 | Required? - NO
51 | Format - integer in minutes
52 | Default - 60 mins
53 | Example - timeout=60
54 |
55 | **pool**
56 | Usage - set the number of producers used, useful when exporting large data sets
57 | Required? - NO
58 | Format - integer
59 | Default - 2 producers
60 | Example - pool=2
61 |
62 | ### Query Example
63 |
64 | ```
65 | index="my-index" | export2kafka topic=my-topic broker=10.0.0.0:9092
66 | ```
67 |
--------------------------------------------------------------------------------
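For larger exports, the optional parameters documented in the README above can be combined in a single search; the values here are illustrative:

```
index="my-index" | export2kafka topic=my-topic broker=10.0.0.0:9092 batch=5000 timeout=120 pool=4
```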
/ci/docs/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright (c) 2020, NVIDIA CORPORATION.
3 | #################################
4 | # CLX Docs build script for CI #
5 | #################################
6 |
7 | if [ -z "$PROJECT_WORKSPACE" ]; then
8 | echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment"
9 | echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally"
10 | exit 1
11 | fi
12 |
13 | export PATH=/conda/bin:/usr/local/cuda/bin:$PATH
14 | export HOME="$WORKSPACE"
15 | export DOCS_WORKSPACE="$WORKSPACE/docs"
16 | export CUDA_REL=${CUDA_VERSION%.*}
17 | export CUDA_SHORT=${CUDA_REL//./}
18 | export PROJECTS=(clx)
19 |
20 | # Switch to project root; also root of repo checkout
21 | cd "$PROJECT_WORKSPACE"
22 | export GIT_DESCRIBE_TAG=`git describe --tags`
23 | export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
24 |
25 | gpuci_logger "Check environment"
26 | env
27 |
28 | gpuci_logger "Check GPU usage"
29 | nvidia-smi
30 |
31 | logger "Activate conda env..."
32 | source activate rapids
33 | conda install --freeze-installed -c rapidsai-nightly -c rapidsai -c nvidia -c pytorch -c conda-forge \
34 | "pytorch>=1.7" torchvision "transformers=3.5.*" requests yaml python-confluent-kafka python-whois markdown beautifulsoup4 jq
35 |
36 | pip install mockito
37 | pip install "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src"
38 | pip install cupy-cuda${CUDA_SHORT}
39 |
40 | gpuci_logger "Check versions"
41 | python --version
42 | $CC --version
43 | $CXX --version
44 |
45 | gpuci_logger "Show conda info"
46 | conda info
47 | conda config --show-sources
48 | conda list --show-channel-urls
49 |
50 | #clx source build
51 | "$PROJECT_WORKSPACE/build.sh" clx
52 |
53 | #clx Sphinx Build
54 | gpuci_logger "Build clx docs"
55 | cd "$PROJECT_WORKSPACE/docs"
56 | make html
57 |
58 | cd $DOCS_WORKSPACE
59 |
60 | if [ ! -d "api/clx/$BRANCH_VERSION" ]; then
61 | mkdir -p api/clx/$BRANCH_VERSION
62 | fi
63 |
64 | rm -rf api/clx/$BRANCH_VERSION/*
65 | mv "$PROJECT_WORKSPACE/docs/build/html/"* $DOCS_WORKSPACE/api/clx/$BRANCH_VERSION
66 |
67 |
68 |
--------------------------------------------------------------------------------
/examples/streamz/python/dga_detection.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | import dask
17 | from clx_streamz_tools import utils
18 | from clx_streamz_tools import streamz_workflow
19 |
20 |
21 | class DGADetectionWorkflow(streamz_workflow.StreamzWorkflow):
22 | def inference(self, messages_df):
23 | # Messages will be received and run through DGA inferencing
24 | worker = dask.distributed.get_worker()
25 | batch_start_time = int(round(time.time()))
26 | result_size = messages_df.shape[0]
27 | print("Processing batch size: " + str(result_size))
28 | dd = worker.data["dga_detector"]
29 | preds = dd.predict(messages_df["domain"])
30 | messages_df["preds"] = preds
31 | return (messages_df, batch_start_time, result_size)
32 |
33 | def worker_init(self):
34 | # Initialization for each dask worker
35 | from clx.analytics.dga_detector import DGADetector
36 |
37 | worker = dask.distributed.get_worker()
38 | dd = DGADetector()
39 | print(
40 | "Initializing Dask worker: "
41 | + str(worker)
42 | + " with dga model. Model File: "
43 | + str(self.args.model)
44 | )
45 | dd.load_model(self.args.model)
46 | # this dict can be used for adding more objects to distributed dask worker
47 | obj_dict = {"dga_detector": dd}
48 | worker = utils.init_dask_workers(worker, self.config, obj_dict)
49 |
50 |
51 | if __name__ == "__main__":
52 | dga_detection = DGADetectionWorkflow()
53 | dga_detection.start()
54 |
--------------------------------------------------------------------------------
/python/clx/tests/test_stats.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import clx.analytics.stats
16 | import cudf
17 | import cupy as cp
18 |
19 |
20 | def test_rzscore():
21 | sequence = [
22 | 3,
23 | 4,
24 | 5,
25 | 6,
26 | 1,
27 | 10,
28 | 34,
29 | 2,
30 | 1,
31 | 11,
32 | 45,
33 | 34,
34 | 2,
35 | 9,
36 | 19,
37 | 43,
38 | 24,
39 | 13,
40 | 23,
41 | 10,
42 | 98,
43 | 84,
44 | 10,
45 | ]
46 | series = cudf.Series(sequence)
47 | zscores_df = cudf.DataFrame()
48 | zscores_df["zscore"] = clx.analytics.stats.rzscore(series, 7)
49 | expected_zscores_arr = [
50 | float(0),
51 | float(0),
52 | float(0),
53 | float(0),
54 | float(0),
55 | float(0),
56 | 2.374423424,
57 | -0.645941275,
58 | -0.683973734,
59 | 0.158832461,
60 | 1.847751909,
61 | 0.880026019,
62 | -0.950835449,
63 | -0.360593742,
64 | 0.111407599,
65 | 1.228914145,
66 | -0.074966331,
67 | -0.570321249,
68 | 0.327849973,
69 | -0.934372308,
70 | 2.296828498,
71 | 1.282966989,
72 | -0.795223674,
73 | ]
74 | expected_zscores_df = cudf.DataFrame()
75 | expected_zscores_df["zscore"] = expected_zscores_arr
76 |
77 | # Check that columns are equal
78 | zscores_df["zscore"] = zscores_df["zscore"].fillna(0)
79 | assert cp.allclose(expected_zscores_df["zscore"], zscores_df["zscore"])
80 |
--------------------------------------------------------------------------------
/ci/release/update-version.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ########################
3 | # clx Version Updater #
4 | ########################
5 |
6 | ## Usage
7 | # bash update-version.sh
8 |
9 |
10 | # Format is YY.MM.PP - no leading 'v' or trailing 'a'
11 | NEXT_FULL_TAG=$1
12 |
13 | # Get current version
14 | CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v')
15 | CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}')
16 | CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
17 | CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
18 | CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
19 |
20 | # Get major.minor for next version
21 | NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
22 | NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
23 | NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
24 |
25 | echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
26 |
27 | # Inplace sed replace; workaround for Linux and Mac
28 | function sed_runner() {
29 | sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak
30 | }
31 |
32 | # Dockerfile update
33 | sed_runner 's/RAPIDS_VERSION=0.*/RAPIDS_VERSION='"${NEXT_SHORT_TAG}"'/g' Dockerfile
34 | # Streamz Dockerfile update
35 | sed_runner 's/RAPIDS_VERSION=0.*/RAPIDS_VERSION='"${NEXT_SHORT_TAG}"'/g' examples/streamz/Dockerfile
36 |
37 | # Sphinx Update
38 | sed_runner 's/version = *.*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py
39 | sed_runner 's/release = *.*.*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py
40 |
41 | # conda environment
42 | for FILE in conda/environments/*.yml; do
43 | sed_runner "s/cugraph=${CURRENT_SHORT_TAG}/cugraph=${NEXT_SHORT_TAG}/g" ${FILE};
44 | sed_runner "s/cuml=${CURRENT_SHORT_TAG}/cuml=${NEXT_SHORT_TAG}/g" ${FILE};
45 | sed_runner "s/cuxfilter=${CURRENT_SHORT_TAG}/cuxfilter=${NEXT_SHORT_TAG}/g" ${FILE};
46 | sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE};
47 | done
48 |
49 | # README.md update
50 | sed_runner "s/rapidsai-clx:${CURRENT_SHORT_TAG}/rapidsai-clx:${NEXT_SHORT_TAG}/g" README.md
51 | sed_runner "s/rapidsai-dev:${CURRENT_SHORT_TAG}/rapidsai-dev:${NEXT_SHORT_TAG}/g" README.md
--------------------------------------------------------------------------------
/notebooks/ids_detection/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from cuml.metrics import precision_recall_curve, roc_auc_score
3 | from sklearn.metrics import roc_curve
4 | import cupy as cp
5 | import matplotlib.pylab as plt
 6 | import pandas as pd  # needed by missing_values_table below
7 | def average_precision_score(y_true, y_score):
8 | """
9 | Compute average precision score using precision and recall computed from cuml.
10 | """
11 | precision, recall, _ = precision_recall_curve(y_true, y_score)
12 | # return step function integral
13 | return -cp.sum(cp.diff(recall) * cp.array(precision)[:-1])
14 |
15 | def metrics(y_true, y_score):
16 | auc = roc_auc_score(y_true=y_true, y_score=y_score)
17 | ap = average_precision_score(y_true, y_score)
18 | return [auc, ap]
19 |
20 | def plot_roc(label, y_scores):
21 | fpr, tpr, _ = roc_curve(y_true=label.values.tolist(), y_score=y_scores.tolist())
22 | auc = metrics(label, y_scores)[0]
23 | plt.plot(fpr, tpr, label="ROC = " + str(np.round(auc,2)))
24 | plt.plot(np.arange(0,1.1,0.1), np.arange(0,1.1,0.1), 'r-')
25 | plt.ylabel('tpr')
26 | plt.xlabel('fpr')
27 | plt.legend(loc='best')
28 |     plt.title('Area under ROC curve')
29 |
30 | def plot_pr(label, y_scores):
31 | ap = metrics(label, y_scores)[1]
32 | precision, recall, _ = precision_recall_curve( label, y_scores)
33 | plt.plot(recall, precision, label='AP = ' + str(np.round(ap,2)))
34 | plt.ylabel('Precision')
35 | plt.xlabel('Recall')
36 | plt.legend(loc='best')
37 | plt.title('Area under PR curve')
38 |
39 | def missing_values_table(df):
40 | mis_val = df.isnull().sum()
41 | mis_val_percent = 100 * df.isnull().sum() / len(df)
42 | mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
43 | mis_val_table_ren_columns = mis_val_table.rename(
44 | columns = {0 : 'Missing Values', 1 : '% of Total Values'})
45 | mis_val_table_ren_columns = mis_val_table_ren_columns[
46 | mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
47 | '% of Total Values', ascending=False).round(1)
48 | print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
49 | "There are " + str(mis_val_table_ren_columns.shape[0]) +
50 | " columns that have missing values.")
51 | return mis_val_table_ren_columns
52 |
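
A minimal, hedged smoke test for `metrics()` above, using synthetic CuPy arrays rather than data from the IDS notebook:

```
# Synthetic labels/scores; values are made up for illustration only.
y_true = cp.array([0, 1, 1, 0, 1, 0])
y_score = cp.array([0.10, 0.90, 0.72, 0.35, 0.80, 0.20])
auc, ap = metrics(y_true, y_score)   # ROC AUC and average precision
print(float(auc), float(ap))
```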
--------------------------------------------------------------------------------
/python/clx/io/reader/fs_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import logging
17 | from clx.io.reader.file_reader import FileReader
18 |
19 | log = logging.getLogger(__name__)
20 |
21 |
22 | class FileSystemReader(FileReader):
23 | """
24 | Uses cudf to read from file system based on config object.
25 |
26 |     :param config: dictionary object of config values for **type**, **input_format**, **input_path**, and cudf reader optional keyword args
27 | """
28 | def __init__(self, config):
29 | self._config = config
30 | self._has_data = True
31 |
32 | def fetch_data(self):
33 | """
34 | Fetch data using cudf based on provided config object
35 | """
36 | df = None
37 | input_format = self.config["input_format"].lower()
38 | filepath = self.config["input_path"]
39 | kwargs = self.config.copy()
40 | del kwargs["type"]
41 | del kwargs["input_format"]
42 | del kwargs["input_path"]
43 |
44 | if "csv" == input_format:
45 | df = cudf.read_csv(filepath, **kwargs)
46 | elif "parquet" == input_format:
47 | df = cudf.read_parquet(filepath, **kwargs)
48 | elif "orc" == input_format:
49 | df = cudf.read_orc(filepath, engine="cudf")
50 | elif "json" == input_format:
51 | df = cudf.read_json(filepath, **kwargs)
52 | else:
53 | raise NotImplementedError("%s is not a supported input_format" % (input_format))
54 |
55 | self.has_data = False
56 | return df
57 |
58 | def close(self):
59 | """
60 | Close cudf reader
61 | """
62 | log.info("Closed fs reader")
63 |
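
A hedged usage sketch of the config contract described in the class docstring; the path and column names are hypothetical, and any keys beyond `type`, `input_format`, and `input_path` are forwarded to the cudf reader:

```
from clx.io.reader.fs_reader import FileSystemReader

config = {
    "type": "fs",
    "input_format": "csv",
    "input_path": "/tmp/person.csv",   # hypothetical file
    "names": ["firstname", "lastname", "gender"],
    "header": 0,                       # extra keys are passed to cudf.read_csv
}
reader = FileSystemReader(config)
df = reader.fetch_data()
reader.close()
```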
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/blazingsql_helper.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from dask_cuda import LocalCUDACluster
16 | from dask.distributed import Client
17 | from blazingsql import BlazingContext
18 | import logging
19 |
20 | log = logging.getLogger(__name__)
21 | """
22 | This class provides functionality to run blazingSQL queries and drop tables.
23 | """
24 |
25 |
26 | class BlazingSQLHelper:
27 | def __init__(self):
28 | cluster = LocalCUDACluster()
29 | client = Client(cluster)
30 | self._bc = BlazingContext(dask_client = client, network_interface = 'lo')
31 |
32 | """This function runs blazingSQL query.
33 |
34 | :param config: Query related tables configuration.
35 | :type config: dict
36 | :return: Query results.
37 | :rtype: cudf.DataFrame
38 | """
39 |
40 | def run_query(self, config):
41 | for table in config["tables"]:
42 | table_name = table["table_name"]
43 | file_path = table["input_path"]
44 | kwargs = table.copy()
45 | del kwargs["table_name"]
46 | del kwargs["input_path"]
47 | self._bc.create_table(table_name, file_path, **kwargs)
48 | sql = config["sql"]
49 | log.debug("Executing query: %s" % (sql))
50 | result = self._bc.sql(sql)
51 | result = result.compute()
52 | return result
53 |
54 | """This function drops blazingSQL tables.
55 | :param table_names: List of table names to drop.
56 | :type table_names: List
57 | """
58 |
59 | def drop_table(self, table_names):
60 | for table_name in table_names:
61 | log.debug("Drop table: %s" % (table_name))
62 | self._bc.drop_table(table_name)
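
For reference, a hedged sketch of the query configuration `run_query` expects; the table name, path, and SQL are illustrative only:

```
config = {
    "tables": [
        {
            "table_name": "netflow",                 # registered as a BlazingSQL table
            "input_path": "/data/netflow.parquet",   # hypothetical path; remaining keys go to create_table
        }
    ],
    "sql": "SELECT * FROM main.netflow LIMIT 10",
}

helper = BlazingSQLHelper()
result_df = helper.run_query(config)   # cudf.DataFrame
helper.drop_table(["netflow"])
```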
--------------------------------------------------------------------------------
/python/clx/analytics/model/rnn_classifier.py:
--------------------------------------------------------------------------------
1 | # Original code at https://github.com/spro/practical-pytorch
2 | import torch
3 | import torch.nn as nn
4 | from torch.nn.utils.rnn import pack_padded_sequence
5 |
6 | DROPOUT = 0.0
7 |
8 |
9 | class RNNClassifier(nn.Module):
10 | def __init__(
11 | self, input_size, hidden_size, output_size, n_layers, bidirectional=True
12 | ):
13 | super(RNNClassifier, self).__init__()
14 | self.input_size = input_size
15 | self.hidden_size = hidden_size
16 | self.output_size = output_size
17 | self.n_layers = n_layers
18 | self.n_directions = int(bidirectional) + 1
19 | self.embedding = nn.Embedding(input_size, hidden_size)
20 | self.gru = nn.GRU(
21 | hidden_size,
22 | hidden_size,
23 | n_layers,
24 | dropout=DROPOUT,
25 | bidirectional=bidirectional,
26 | )
27 | self.fc = nn.Linear(hidden_size, output_size)
28 |
29 | def forward(self, input, seq_lengths):
30 | # Note: we run this all at once (over the whole input sequence)
31 | # input shape: B x S (input size)
32 | # transpose to make S(sequence) x B (batch)
33 | input = input.t()
34 | batch_size = input.size(1)
35 |
36 | # Make a hidden
37 | hidden = self._init_hidden(batch_size)
38 |
39 | # Embedding S x B -> S x B x I (embedding size)
40 | embedded = self.embedding(input)
41 |
42 | # Pack them up nicely
43 | gru_input = pack_padded_sequence(embedded, seq_lengths.data.cpu().numpy())
44 |
45 | # To compact weights again call flatten_parameters().
46 | self.gru.flatten_parameters()
47 | output, hidden = self.gru(gru_input, hidden)
48 | # output = self.dropout(output)
49 |
50 | # Use the last layer output as FC's input
51 | # No need to unpack, since we are going to use hidden
52 | fc_output = self.fc(hidden[-1])
53 | return fc_output
54 |
55 | def _init_hidden(self, batch_size):
56 | hidden = torch.zeros(
57 | self.n_layers * self.n_directions, batch_size, self.hidden_size
58 | )
59 | # creating variable
60 | if torch.cuda.is_available():
61 | return hidden.cuda()
62 | else:
63 | return hidden
64 |
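
A hedged smoke-test sketch of the shape flow described in the comments above; the sizes are arbitrary, and sequence lengths must be sorted in decreasing order for `pack_padded_sequence`:

```
import torch

model = RNNClassifier(input_size=128, hidden_size=64, output_size=2, n_layers=2)
batch = torch.randint(1, 128, (3, 10))   # B x S token ids
seq_lengths = torch.tensor([10, 8, 5])   # descending lengths
if torch.cuda.is_available():
    model, batch = model.cuda(), batch.cuda()   # _init_hidden allocates on GPU when available
logits = model(batch, seq_lengths)       # shape: (3, output_size)
```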
--------------------------------------------------------------------------------
/python/clx/io/reader/dask_fs_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import dask_cudf
16 | import logging
17 | from clx.io.reader.file_reader import FileReader
18 |
19 | log = logging.getLogger(__name__)
20 |
21 |
22 | class DaskFileSystemReader(FileReader):
23 | """
24 | Uses Dask to read from file system based on config object.
25 |
26 | :param config: dictionary object of config values for **type**, **input_format**, **input_path**, and dask reader optional keyword args
27 | """
28 | def __init__(self, config):
29 | self._config = config
30 | self._has_data = True
31 |
32 | def fetch_data(self):
33 | """
34 | Fetch data using dask based on provided config object
35 | """
36 | df = None
37 | input_format = self.config["input_format"].lower()
38 | filepath = self.config["input_path"]
39 | kwargs = self.config.copy()
40 | del kwargs["type"]
41 | del kwargs["input_format"]
42 | del kwargs["input_path"]
43 |
44 | if "csv" == input_format:
45 | df = dask_cudf.read_csv(filepath, **kwargs)
46 | elif "parquet" == input_format:
47 | df = dask_cudf.read_parquet(filepath, **kwargs)
48 | elif "orc" == input_format:
49 | df = dask_cudf.read_orc(filepath, engine="cudf")
50 | elif "json" == input_format:
51 | df = dask_cudf.read_json(filepath, **kwargs)
52 | else:
53 | raise NotImplementedError("%s is not a supported input_format" % (input_format))
54 |
55 | self.has_data = False
56 | return df
57 |
58 | def close(self):
59 | """
60 | Close dask reader
61 | """
62 | log.info("Closed dask_fs reader")
63 |
--------------------------------------------------------------------------------
/ci/gpu/test-notebooks.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | #RAPIDS_DIR=/rapids
4 | NOTEBOOKS_DIR="$WORKSPACE/notebooks"
5 | NBTEST="$WORKSPACE/ci/utils/nbtest.sh"
6 | LIBCUDF_KERNEL_CACHE_PATH="$WORKSPACE/.jitcache"
7 |
8 | cd ${NOTEBOOKS_DIR}
 9 | TOPLEVEL_NB_FOLDERS=$(find . -name "*.ipynb" | cut -d'/' -f2 | sort -u)
10 |
11 | # Add notebooks that should be skipped here
12 | # (space-separated list of filenames without paths)
13 | SKIPNBS="FLAIR_DNS_Log_Parsing.ipynb CLX_Workflow_Notebook2.ipynb CLX_Workflow_Notebook3.ipynb Supervised_Asset_Classification.ipynb CLX_Supervised_Asset_Classification.ipynb DGA_Detection.ipynb Predictive_Maintenance_Sequence_Classifier.ipynb IDS_using_LODA.ipynb anomalous_behavior_profiling_supervised.ipynb custream_n_graph.ipynb"
14 |
15 | ## Check env
16 | env
17 |
18 | EXITCODE=0
19 |
20 | # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure
21 | # if any run fails
22 | for folder in ${TOPLEVEL_NB_FOLDERS}; do
23 | echo "========================================"
24 | echo "FOLDER: ${folder}"
25 | echo "========================================"
26 | cd ${NOTEBOOKS_DIR}/${folder}
27 | for nb in $(find . -name "*.ipynb"); do
28 | nbBasename=$(basename ${nb})
29 | # Skip all NBs that use dask (in the code or even in their name)
30 | if ((echo ${nb}|grep -qi dask) || \
31 | (grep -q dask ${nb})); then
32 | echo "--------------------------------------------------------------------------------"
33 | echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)"
34 | echo "--------------------------------------------------------------------------------"
35 | elif (echo " ${SKIPNBS} " | grep -q " ${nbBasename} "); then
36 | echo "--------------------------------------------------------------------------------"
37 | echo "SKIPPING: ${nb} (listed in skip list)"
38 | echo "--------------------------------------------------------------------------------"
39 | else
40 | cd $(dirname ${nb})
41 | nvidia-smi
42 | ${NBTEST} ${nbBasename}
43 | EXITCODE=$((EXITCODE | $?))
44 | rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/*
45 | cd ${NOTEBOOKS_DIR}/${folder}
46 | fi
47 | done
48 | done
49 |
50 | nvidia-smi
51 |
52 | exit ${EXITCODE}
53 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query/bin/clx_query.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | import re
17 | import sys, requests, json
18 | from splunklib.searchcommands import (
19 | dispatch,
20 | GeneratingCommand,
21 | Configuration,
22 | Option,
23 | validators,
24 | )
25 | import splunklib.client as client
26 |
27 | log = logging.getLogger(__name__)
28 |
29 | REGEX_PATTERN = r"(?i)\bLIMIT\s+[0-9]+$"  # matches a trailing "LIMIT <n>" clause, case-insensitive
30 |
31 | @Configuration()
32 | class ClxQuery(GeneratingCommand):
33 | query = Option(require=True)
34 |
35 | def generate(self):
36 | configs = client.Configurations(self.service)
37 | for config in configs:
38 | if config.name == "clx_query_setup":
39 |                 clx_config = next(config.iter()).content
40 |
41 | url = self.construct_url(clx_config)
42 | has_query_limit = re.findall(REGEX_PATTERN, self.query)
43 |
44 | payload = {'query': self.query}
45 | if not has_query_limit and clx_config["clx_query_limit"]:
46 | self.query = "%s LIMIT %s" %(self.query, clx_config["clx_query_limit"])
47 | payload = {'query': self.query}
48 | response = requests.post(url, data=payload)
49 |
50 | if response.status_code != 200:
51 | yield {"ERROR": response.content}
52 | else:
53 | results = json.loads(json.loads(response.content))
54 | for result in results:
55 | yield result
56 |
57 | def construct_url(self, config):
58 | url = "http://%s:%s/%s/" % (
59 | config["clx_hostname"],
60 | config["clx_port"],
61 | 'clxquery'
62 | )
63 | return url
64 |
65 |
66 | dispatch(ClxQuery, sys.argv, sys.stdin, sys.stdout, __name__)
--------------------------------------------------------------------------------
/python/clx/io/factory/factory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | from clx.io.factory.kafka_factory import KafkaFactory
18 | from clx.io.factory.fs_factory import FileSystemFactory
19 | from clx.io.factory.dask_fs_factory import DaskFileSystemFactory
20 |
21 | log = logging.getLogger(__name__)
22 |
23 |
24 | class Factory:
25 |
26 | __cls_dict = {
27 | "kafka": "KafkaFactory",
28 | "fs": "FileSystemFactory",
29 | "dask_fs": "DaskFileSystemFactory",
30 | }
31 |
32 | @staticmethod
33 | def cls_dict():
34 | return Factory.__cls_dict
35 |
36 | class InstanceGenerator(object):
37 | def __init__(self, func):
38 | self.func = func
39 |
40 | def __call__(self, *args, **kwargs):
41 | class_name, config = self.func(*args, **kwargs)
42 | try:
43 | target_cls = globals()[class_name](config)
44 | return target_cls
45 | except KeyError as error:
46 | log.error(error)
47 | log.exception(error)
48 | raise
49 |
50 | @InstanceGenerator
51 | def get_instance(io_comp, config):
52 | io_comp = io_comp.lower()
53 | if io_comp and io_comp in Factory.cls_dict():
54 | return Factory.cls_dict()[io_comp], config
55 | else:
56 | raise KeyError(
57 | "Dictionary doesn't have { %s } corresponding component class."
58 | % (io_comp)
59 | )
60 |
61 | @staticmethod
62 | def get_reader(io_comp, config):
63 | return Factory.get_instance(io_comp, config).get_reader()
64 |
65 | @staticmethod
66 | def get_writer(io_comp, config):
67 | return Factory.get_instance(io_comp, config).get_writer()
68 |
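
A hedged sketch of the `Factory` entry points; the config mirrors the FileSystemReader example above and the path is hypothetical:

```
from clx.io.factory.factory import Factory

reader_config = {
    "type": "fs",
    "input_format": "csv",
    "input_path": "/tmp/person.csv",   # hypothetical path
    "header": 0,
}
reader = Factory.get_reader("fs", reader_config)
df = reader.fetch_data()
```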
--------------------------------------------------------------------------------
/ci/cpu/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright (c) 2020-2022, NVIDIA CORPORATION.
3 | ################################################################################
4 | # CLX cpu build
5 | ################################################################################
6 | set -e
7 |
8 | # Set path and build parallel level
9 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
10 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
11 |
12 | # Set home to the job's workspace
13 | export HOME="$WORKSPACE"
14 |
15 | # Switch to project root; also root of repo checkout
16 | cd "$WORKSPACE"
17 |
18 | # If nightly build, append current YYMMDD to version
19 | if [[ "$BUILD_MODE" = "branch" && "$SOURCE_BRANCH" = branch-* ]] ; then
20 | export VERSION_SUFFIX=`date +%y%m%d`
21 | fi
22 |
23 | # Setup 'gpuci_conda_retry' for build retries (results in 2 total attempts)
24 | export GPUCI_CONDA_RETRY_MAX=1
25 | export GPUCI_CONDA_RETRY_SLEEP=30
26 |
27 | ################################################################################
28 | # SETUP - Check environment
29 | ################################################################################
30 |
31 | gpuci_logger "Get env"
32 | env
33 |
34 | gpuci_logger "Activate conda env"
35 | . /opt/conda/etc/profile.d/conda.sh
36 | conda activate rapids
37 |
38 | # Remove rapidsai-nightly channel if we are building main branch
39 | if [ "$SOURCE_BRANCH" = "main" ]; then
40 | conda config --system --remove channels rapidsai-nightly
41 | fi
42 |
43 | gpuci_logger "Check versions"
44 | python --version
45 | $CC --version
46 | $CXX --version
47 |
48 | gpuci_logger "Check conda environment"
49 | conda info
50 | conda config --show-sources
51 | conda list --show-channel-urls
52 |
53 | # FIX Added to deal with Anaconda SSL verification issues during conda builds
54 | conda config --set ssl_verify False
55 |
56 | # FIXME: Remove
57 | gpuci_mamba_retry install -c conda-forge boa
58 |
59 | ###############################################################################
60 | # BUILD - Conda package build
61 | ################################################################################
62 |
63 | gpuci_logger "Build conda pkg for clx"
64 | gpuci_conda_retry mambabuild conda/recipes/clx
65 |
66 | ################################################################################
67 | # UPLOAD - Conda package
68 | ################################################################################
69 |
70 | gpuci_logger "Upload packages"
71 | source ci/cpu/upload.sh
72 |
--------------------------------------------------------------------------------
/python/clx/tests/test_loda.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import cupy
15 | from clx.analytics.loda import Loda
16 | from os import path
17 |
18 |
19 | def test_fit():
20 | ld = Loda(n_random_cuts=10, n_bins=None)
21 | x = cupy.random.randint(0, 100, size=(200, 10))
22 | ld.fit(x)
23 | assert ld._histograms is not None
24 | assert isinstance(
25 | ld._histograms,
26 | cupy.ndarray
27 | )
28 | assert cupy.all(ld._histograms > 0)
29 |
30 |
31 | def test_score():
32 | ld = Loda(n_random_cuts=10, n_bins=None)
33 | x = cupy.random.randint(0, 100, size=(200, 10))
34 | ld.fit(x)
35 | scores = ld.score(x)
36 | assert scores is not None
37 | assert isinstance(
38 | scores,
39 | cupy.ndarray
40 | )
41 | assert cupy.all(scores > 0)
42 |
43 |
44 | def test_explain():
45 | ld = Loda(n_random_cuts=10, n_bins=None)
46 | x = cupy.random.randint(0, 100, size=(200, 10))
47 | ld.fit(x)
48 | explanation = ld.explain(x[0])
49 | assert explanation is not None
50 | assert isinstance(
51 | explanation,
52 | cupy.ndarray
53 | )
54 |
55 |
56 | def test_save_model(tmpdir):
57 | ld = Loda(n_random_cuts=10, n_bins=None)
58 | x = cupy.random.randint(0, 100, size=(200, 10))
59 | ld.fit(x)
60 | ipath = path.join(tmpdir, "clx_loda")
61 | opath = path.join(tmpdir, "clx_loda.npz")
62 | ld.save_model(ipath)
63 | assert path.exists(opath)
64 |
65 |
66 | def test_load_model(tmpdir):
67 | ld = Loda(n_random_cuts=10, n_bins=None)
68 | x = cupy.random.randint(0, 100, size=(200, 10))
69 | ld.fit(x)
70 | ipath = path.join(tmpdir, "clx_loda")
71 | opath = path.join(tmpdir, "clx_loda.npz")
72 | ld.save_model(ipath)
73 | assert path.exists(opath)
74 |
75 | # load model
76 | ld = Loda.load_model(opath)
77 | assert isinstance(ld, Loda)
78 |
--------------------------------------------------------------------------------
/examples/streamz/scripts/entrypoint.sh:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #!/bin/bash
16 | set +e
17 |
18 | #*****************************
19 | # This function prints log messages.
20 | #*****************************
21 | log(){
22 | if [[ $# = 2 ]]; then
23 | echo "$(date) [$1] : $2"
24 | fi
25 | }
26 |
27 | source activate rapids
28 |
29 | # kafka broker
30 | BROKER="localhost:9092"
31 |
32 | #**********************************
33 | # Configure Kafka
34 | #**********************************
35 | sed -i "/listeners=PLAINTEXT:\/\//c\listeners=PLAINTEXT:\/\/$BROKER" $KAFKA_HOME/config/server.properties
36 | sed -i "/advertised.listeners=PLAINTEXT:\/\//c\advertised.listeners=PLAINTEXT:\/\/$BROKER" $KAFKA_HOME/config/server.properties
37 | log "INFO" "Kafka configuration updated"
38 |
39 | #**********************************
40 | # Run Kafka and Zookeeper
41 | #**********************************
42 | $KAFKA_HOME/bin/zookeeper-server-start.sh -daemon $KAFKA_HOME/config/zookeeper.properties
43 | sleep 3
44 | $KAFKA_HOME/bin/kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties
45 | sleep 3
46 |
47 | log "INFO" "Kafka and zookeeper running"
48 | log "INFO" "Kafka broker is running on $BROKER"
49 | log "INFO" "Zookeeper is running on localhost:2181"
50 |
51 | #**********************************
52 | # Create topics and publish data
53 | #**********************************
54 | log "INFO" "Loading cybert input data to 'cybert_input' topic"
55 | . $CLX_STREAMZ_HOME/scripts/kafka_topic_setup.sh \
56 | -i cybert_input \
57 | -o cybert_output \
58 | -d $CLX_STREAMZ_HOME/data/apache_raw_sample_1k.txt
59 |
60 | log "INFO" "Loading dga detection input data to 'dga_detection_input' topic"
61 | . $CLX_STREAMZ_HOME/scripts/kafka_topic_setup.sh \
62 | -i dga_detection_input \
63 | -o dga_detection_output \
64 | -d $CLX_STREAMZ_HOME/data/dga_detection_input.jsonlines
65 |
66 | exec "$@";
67 |
--------------------------------------------------------------------------------
/python/clx/analytics/stats.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 | import math
17 |
18 | log = logging.getLogger(__name__)
19 |
20 |
21 | def rzscore(series, window):
22 | """
23 | Calculates rolling z-score
24 |
25 | Parameters
26 | ----------
27 | series : cudf.Series
28 | Series for which to calculate rolling z-score
29 | window : int
30 | Window size
31 |
32 | Returns
33 | -------
34 | cudf.Series
35 | Series with rolling z-score values
36 |
37 | Examples
38 | --------
39 | >>> import clx.analytics.stats
40 | >>> import cudf
41 | >>> sequence = [3,4,5,6,1,10,34,2,1,11,45,34,2,9,19,43,24,13,23,10,98,84,10]
42 | >>> series = cudf.Series(sequence)
43 | >>> zscores_df = cudf.DataFrame()
44 | >>> zscores_df['zscore'] = clx.analytics.stats.rzscore(series, 7)
45 | >>> zscores_df
46 | zscore
47 | 0 null
48 | 1 null
49 | 2 null
50 | 3 null
51 | 4 null
52 | 5 null
53 | 6 2.374423424
54 | 7 -0.645941275
55 | 8 -0.683973734
56 | 9 0.158832461
57 | 10 1.847751909
58 | 11 0.880026019
59 | 12 -0.950835449
60 | 13 -0.360593742
61 | 14 0.111407599
62 | 15 1.228914145
63 | 16 -0.074966331
64 | 17 -0.570321249
65 | 18 0.327849973
66 | 19 -0.934372308
67 | 20 2.296828498
68 | 21 1.282966989
69 | 22 -0.795223674
70 | """
71 | rolling = series.rolling(window=window)
72 | mean = rolling.mean()
73 | std = rolling.apply(__std_func)
74 |
75 | zscore = (series - mean) / std
76 | return zscore
77 |
78 |
79 | def __std_func(A):
80 | """
81 | Current implementation assumes ddof = 0
82 | """
83 | sum_of_elem = 0
84 | sum_of_square_elem = 0
85 |
86 | for a in A:
87 | sum_of_elem += a
88 | sum_of_square_elem += a * a
89 |
90 | s = (sum_of_square_elem - ((sum_of_elem * sum_of_elem) / len(A))) / len(A)
91 | return math.sqrt(s)
92 |
--------------------------------------------------------------------------------
/examples/streamz/python/phishing_detection.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | import dask
17 | import cudf
18 | from clx_streamz_tools import utils
19 | from clx_streamz_tools import streamz_workflow
20 |
21 |
22 | class PhishingDetectionWorkflow(streamz_workflow.StreamzWorkflow):
23 |     def inference(self, messages):
24 | # Messages will be received and run through sequence classifier inferencing
25 | worker = dask.distributed.get_worker()
26 | batch_start_time = int(round(time.time()))
27 | df = cudf.DataFrame()
28 | if type(messages) == str:
29 | df["stream"] = [messages.decode("utf-8")]
30 | elif type(messages) == list and len(messages) > 0:
31 | df["stream"] = [msg.decode("utf-8") for msg in messages]
32 | else:
33 | print("ERROR: Unknown type encountered in inference")
34 |
35 | result_size = df.shape[0]
36 | print("Processing batch size: " + str(result_size))
37 | pred, prob = worker.data["seq_classifier"].predict(df["stream"])
38 | results_gdf = cudf.DataFrame({"pred": pred, "prob": prob})
39 | return (results_gdf, batch_start_time, result_size)
40 |
41 |     def worker_init(self):
42 | # Initialization for each dask worker
43 | from clx.analytics.sequence_classifier import SequenceClassifier
44 |
45 | worker = dask.distributed.get_worker()
46 | seq_classifier = SequenceClassifier()
47 | print(
48 | "Initializing Dask worker: "
49 | + str(worker)
50 | + " with sequence classifier model. Model File: "
51 | + str(self.args.model)
52 | )
53 | seq_classifier.init_model(self.args.model)
54 | # this dict can be used for adding more objects to distributed dask worker
55 | obj_dict = {"seq_classifier": seq_classifier}
56 | worker = utils.init_dask_workers(worker, self.config, obj_dict)
57 |
58 |
59 | if __name__ == "__main__":
60 |     phishing_detection = PhishingDetectionWorkflow()
61 | phishing_detection.start()
62 |
--------------------------------------------------------------------------------
/examples/streamz/python/cybert.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | import dask
17 | import cudf
18 | import pandas as pd
19 | from clx_streamz_tools import utils
20 | from clx_streamz_tools import streamz_workflow
21 |
22 |
23 | class CybertWorkflow(streamz_workflow.StreamzWorkflow):
24 | def inference(self, messages):
25 | # Messages will be received and run through cyBERT inferencing
26 | worker = dask.distributed.get_worker()
27 | batch_start_time = int(round(time.time()))
28 | df = cudf.DataFrame()
29 | if type(messages) == str:
30 | df["stream"] = [messages.decode("utf-8")]
31 | elif type(messages) == list and len(messages) > 0:
32 | df["stream"] = [msg.decode("utf-8") for msg in messages]
33 | else:
34 | print("ERROR: Unknown type encountered in inference")
35 |
36 | result_size = df.shape[0]
37 | print("Processing batch size: " + str(result_size))
38 | parsed_df, confidence_df = worker.data["cybert"].inference(df["stream"])
39 | confidence_df = confidence_df.add_suffix("_confidence")
40 | parsed_df = pd.concat([parsed_df, confidence_df], axis=1)
41 | return (parsed_df, batch_start_time, result_size)
42 |
43 | def worker_init(self):
44 | # Initialization for each dask worker
45 | from clx.analytics.cybert import Cybert
46 |
47 | worker = dask.distributed.get_worker()
48 | cy = Cybert()
49 | print(
50 | "Initializing Dask worker: "
51 | + str(worker)
52 | + " with cybert model. Model File: "
53 | + str(self.args.model)
54 | + " Label Map: "
55 | + str(self.args.label_map)
56 | )
57 | cy.load_model(self.args.model, self.args.label_map)
58 | # this dict can be used for adding more objects to distributed dask worker
59 | obj_dict = {"cybert": cy}
60 | worker = utils.init_dask_workers(worker, self.config, obj_dict)
61 |
62 |
63 | if __name__ == "__main__":
64 | cybert = CybertWorkflow()
65 | cybert.start()
66 |
--------------------------------------------------------------------------------
/siem_integrations/splunk2kafka/export2kafka/bin/export2kafka.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | import json
4 | #import pprint
5 | from splunklib.searchcommands import dispatch, StreamingCommand, Configuration, Option
6 | from confluent_kafka import Producer
7 | import confluent_kafka
8 | import time
9 |
10 | def eprint(*args, **kwargs):
11 | print(*args, file=sys.stderr, **kwargs)
12 |
13 | @Configuration(local=True)
14 | class FileSinkCommand(StreamingCommand):
15 | broker = Option(require=True)
16 | topic = Option(require=True)
17 | batch = Option(require=False, default=2000)
18 | timeout = Option(require=False, default=60)
19 | pool = Option(require=False, default=2)
20 | start_time = int(time.time())
21 |
22 | def create_producers(self, pool, broker):
23 | producers = []
24 | for i in range(pool):
25 | producers.append(Producer({'bootstrap.servers': broker, 'session.timeout.ms': 10000}))
26 | eprint("exprot2kafka - producer"+str(i)+" created: "+broker)
27 | return producers
28 |
29 | def stream(self, records):
30 | topic = str(self.topic)
31 | broker = str(self.broker)
32 | batch = int(self.batch)
33 | timeout = int(self.timeout)
34 | pool = int(self.pool)
35 | eprint("export2kafka - starting... broker("+broker+") topic("+topic+") batch(" \
36 | +str(batch)+") timeout("+str(timeout)+" mins) pool("+str(pool)+")")
37 | eprint("export2kafka - stream starting")
38 | producers = self.create_producers(pool, broker)
39 | cnt = 0
40 |
41 | for record in records:
42 |             trimmed = {k: v for k, v in record.items()}
43 | #eprint(json.dumps(trimmed))
44 | producers[cnt % pool].produce(topic, json.dumps(trimmed))
45 | cnt += 1
46 |
47 | if cnt % batch == 0:
48 | # batch level reached poll to get producer to move messages out
49 | eprint("export2kafka - batch reached, calling poll... processed records: "+str(cnt))
50 | for p in producers:
51 | p.poll(0)
52 |
53 | if cnt % 10 == 0 and int(time.time()) > (60 * timeout) + self.start_time:
54 | # quit after timeout has been reached, only check every 10 records
55 | eprint("export2kafka - timeout reached, stopping search...")
56 | break
57 |
58 | # return record for display in Splunk
59 | yield record
60 |
61 | eprint("export2kafka - all records processed for stream... processed records: "+str(cnt))
62 | eprint("export2kafka - calling flush...")
63 | for p in producers:
64 | p.flush()
65 | eprint("export2kafka - flush finished...")
66 | eprint("export2kafka - stream finished")
67 |
68 | if __name__ == "__main__":
69 | dispatch(FileSinkCommand, sys.argv, sys.stdin, sys.stdout, __name__)
70 |
--------------------------------------------------------------------------------
/python/clx/io/writer/fs_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import logging
17 | import os
18 |
19 | from clx.io.writer.file_writer import FileWriter
20 |
21 | log = logging.getLogger(__name__)
22 |
23 |
24 | class FileSystemWriter(FileWriter):
25 | """
26 | Uses cudf to write to file system based on config object.
27 |
28 |     :param config: dictionary object of config values for **type**, **output_format**, **output_path**, and cudf writer optional keyword args
29 | """
30 |
31 | def __init__(self, config):
32 | self._config = config
33 |
34 | def write_data(self, df):
35 | """
36 | Write data to file system using cudf based on provided config object
37 | """
38 | output_format = self.config["output_format"].lower()
39 | filepath = self.config["output_path"]
40 | kwargs = self.config.copy()
41 | del kwargs["type"]
42 | del kwargs["output_format"]
43 | del kwargs["output_path"]
44 |
45 | dir = os.path.dirname(filepath)
46 | if not os.path.isdir(dir):
47 | log.info("output directory { %s } not exist" % (dir))
48 | log.info("creating output directory { %s }..." % (dir))
49 | os.makedirs(dir)
50 | log.info("created output directory { %s }..." % (dir))
51 | if os.path.exists(filepath):
52 |             raise IOError("output path { %s } already exists" % (filepath))
53 |
54 | log.info("writing data to location {%s}" % (filepath))
55 |
56 | if "csv" == output_format:
57 | df.to_csv(filepath, **kwargs)
58 | elif "parquet" == output_format:
59 | cudf.io.parquet.to_parquet(df, filepath, **kwargs)
60 | elif "orc" == output_format:
61 | cudf.io.orc.to_orc(df, filepath, **kwargs)
62 | elif "json" == output_format:
63 | cudf.io.json.to_json(df, filepath, **kwargs)
64 | else:
65 | raise NotImplementedError("%s is not a supported output_format" % (output_format))
66 |
67 | def close(self):
68 | """
69 | Close cudf writer
70 | """
71 | log.info("Closed writer")
72 |
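
A hedged usage sketch of FileSystemWriter; the output path is hypothetical and any extra keys are forwarded to the cudf writer:

```
import cudf
from clx.io.writer.fs_writer import FileSystemWriter

df = cudf.DataFrame({"domain": ["nvidia.com"], "score": [0.01]})
writer_config = {
    "type": "fs",
    "output_format": "csv",
    "output_path": "/tmp/clx_out/preds.csv",   # hypothetical; parent dir is created if missing
    "index": False,                            # extra keys are passed to df.to_csv
}
FileSystemWriter(writer_config).write_data(df)
```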
--------------------------------------------------------------------------------
/ci/local/README.md:
--------------------------------------------------------------------------------
1 | ## Purpose
2 |
3 | This script is designed for developer and contributor use. This tool mimics the actions of gpuCI on your local machine. This allows you to test and even debug your code inside a gpuCI base container before pushing your code as a GitHub commit.
4 | The script can be helpful in locally triaging and debugging RAPIDS continuous integration failures.
5 |
6 | ## Requirements
7 |
8 | ```
9 | nvidia-docker
10 | ```
11 |
12 | ## Usage
13 |
14 | ```
15 | bash build.sh [-h] [-H] [-s] [-r ] [-i ]
16 | Build and test your local repository using a base gpuCI Docker image
17 |
18 | where:
19 | -H Show this help text
20 | -r Path to repository (defaults to working directory)
21 | -i Use Docker image (default is gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6)
22 | -s Skip building and testing and start an interactive shell in a container of the Docker image
23 | ```
24 |
25 | Example Usage:
26 | `bash build.sh -r ~/rapids/clx -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6`
27 |
28 | For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page.
29 |
30 | Style Check:
31 | ```bash
32 | $ bash ci/local/build.sh -r ~/rapids/clx -s
33 | $ . /opt/conda/etc/profile.d/conda.sh
34 | $ conda activate rapids
35 | $ cd rapids
36 | $ flake8 python
37 | ```
38 |
39 | ## Information
40 |
41 | There are some caveats to be aware of when using this script, especially if you plan on developing from within the container itself.
42 |
43 |
44 | ### Docker Image Build Repository
45 |
46 | The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/clx/build_rapidsai-base_cuda10.1-ubuntu16.04-gcc5-py3.6/`. Feel free to remove this directory after the script is finished.
47 |
48 | *Note*: The script *will not* overwrite your local build repository. Your local environment stays intact.
49 |
50 |
51 | ### Where The User is Dumped
52 |
53 | The script will build your repository and run all tests. If any tests fail, it dumps the user into the docker container itself to allow you to debug from within the container. If all the tests pass as expected the container exits and is automatically removed. Remember to exit the container if tests fail and you do not wish to debug within the container itself.
54 |
55 |
56 | ### Container File Structure
57 |
58 | Your repository will be located in the `/rapids/` folder of the container. This folder is volume mounted from the local machine. Any changes to the code in this repository are replicated onto the local machine. The `cpp/build` and `python/build` directories within your repository are on a separate mount to avoid conflicting with your local build artifacts.
59 |
--------------------------------------------------------------------------------
/python/clx/tests/test_netflow_workflow.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | from clx.workflow.netflow_workflow import NetflowWorkflow
17 |
18 |
19 | def test_netflow_workflow():
20 | """Tests the netflow dataframe enrichment"""
21 | netflow_workflow = NetflowWorkflow("netflow-workflow")
22 | input_df = cudf.DataFrame(
23 | {
24 | "ts time": ["12345678900.12345"],
25 | "uid string": ["123ABC"],
26 | "id.orig_h": ["123.456.789"],
27 | "id.orig_p": ["1000"],
28 | "id.resp_h": ["987.654.321"],
29 | "id.resp_p": ["80"],
30 | "proto": ["tcp"],
31 | "service": ["-"],
32 | "duration": ["2.015"],
33 | "orig_bytes": ["0"],
34 | "resp_bytes": ["0"],
35 | "conn_state": ["SH"],
36 | "local_orig": ["-"],
37 | "local_resp": ["-"],
38 | "missed_bytes": ["0"],
39 | "history": ["F"],
40 | "orig_pkts count": ["2"],
41 | "orig_ip_bytes": ["80"],
42 | "resp_pkts": ["0"],
43 | "resp_ip_bytes": ["0"],
44 | "tunnel_parents": ["-"],
45 | }
46 | )
47 | actual_df = netflow_workflow.workflow(input_df)
48 | expected_df = cudf.DataFrame(
49 | {
50 | "ts time": ["12345678900.12345"],
51 | "uid string": ["123ABC"],
52 | "id.orig_h": ["123.456.789"],
53 | "id.orig_p": ["1000"],
54 | "id.resp_h": ["987.654.321"],
55 | "id.resp_p": ["80"],
56 | "proto": ["tcp"],
57 | "service": ["-"],
58 | "duration": ["2.015"],
59 | "orig_bytes": ["0"],
60 | "resp_bytes": ["0"],
61 | "conn_state": ["SH"],
62 | "local_orig": ["-"],
63 | "local_resp": ["-"],
64 | "missed_bytes": ["0"],
65 | "history": ["F"],
66 | "orig_pkts count": ["2"],
67 | "orig_ip_bytes": ["80"],
68 | "resp_pkts": ["0"],
69 | "resp_ip_bytes": ["0"],
70 | "tunnel_parents": ["-"],
71 | "netflow_enriched": ["netflow_enriched"],
72 | }
73 | )
74 |
75 | assert actual_df.equals(expected_df)
76 |
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clxquery/views.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import logging
4 |
5 | from clxquery import utils
6 | from clxquery.blazingsql_helper import BlazingSQLHelper
7 | from django.http import HttpResponse, JsonResponse
8 | from rest_framework.generics import CreateAPIView
9 |
10 | log = logging.getLogger(__name__)
11 |
12 | class ExecuteClxQuery(CreateAPIView):
13 |
14 | file_path = os.environ.get("BLZ_READER_CONF")
15 | # Load tables configuration
16 | config = utils.load_yaml(file_path)
17 | configured_tables = set([table["table_name"] for table in config["tables"]])
18 |
19 | regex_pattern = r"main.([\w]+)"
20 | blz_helper = BlazingSQLHelper()
21 |
22 | def post(self, request, *args, **kwargs):
23 | query = str(request.data['query'])
24 | # Check for the list of tables used in the query to prevent loading other tables into gpu memory
25 | query_tables = set(re.findall(self.regex_pattern, query))
26 | # Verify list of tables used in the query to make sure they are included in the configuration file
27 |
28 | if query_tables.issubset(self.configured_tables):
29 | try:
30 | query_config = {}
31 | query_config["tables"] = []
32 | for table in self.config["tables"]:
33 | if table["table_name"] in query_tables:
34 | query_config["tables"].append(table)
35 | query_config["sql"] = query
36 | # Run query and get the results
37 | df = self.blz_helper.run_query(query_config)
38 | # Drop tables to free up memory
39 | self.blz_helper.drop_table(query_tables)
40 | # Convert cudf to pandas dataframe
41 | df = df.to_pandas()
42 | # Convert results to json format.
43 | results = df.to_json(orient="records")
44 | response = JsonResponse(results, safe=False)
45 | except Exception as e:
46 | stacktrace = str(e)
47 | log.error("Error executing query: %s" % (stacktrace))
48 | response = JsonResponse(
49 | {"status": "false", "message": stacktrace}, status=500, safe=False
50 | )
51 | else:
52 | message = (
53 | "One or more tables used in the query are not available in the server configuration. Please select from this list %s or add new tables to your clx-blazingsql configuration."
54 |             % (self.configured_tables)
55 | )
56 | response = JsonResponse(
57 | {"status": "false", "message": message}, status=404, safe=False
58 | )
59 | return response
--------------------------------------------------------------------------------
/python/clx/tests/test_dask_fs_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import pytest
17 | from clx.io.reader.dask_fs_reader import DaskFileSystemReader
18 |
19 | expected_df = cudf.DataFrame(
20 | {
21 | "firstname": ["Emma", "Ava", "Sophia"],
22 | "lastname": ["Olivia", "Isabella", "Charlotte"],
23 | "gender": ["F", "F", "F"],
24 | }
25 | )
26 |
27 |
28 | @pytest.mark.parametrize("expected_df", [expected_df])
29 | def test_fetch_data_csv(tmpdir, expected_df):
30 | fname = tmpdir.mkdir("tmp_test_fs_reader").join("person.csv")
31 | expected_df.to_csv(fname, index=False)
32 | config = {
33 | "type": "dask_fs",
34 | "input_path": fname,
35 | "names": ["firstname", "lastname", "gender"],
36 | "delimiter": ",",
37 | "usecols": ["firstname", "lastname", "gender"],
38 | "dtype": ["str", "str", "str"],
39 | "header": 0,
40 | "input_format": "csv",
41 | }
42 | reader = DaskFileSystemReader(config)
43 | fetched_df = reader.fetch_data().compute()
44 |
45 | assert fetched_df.equals(expected_df)
46 |
47 |
48 | @pytest.mark.parametrize("expected_df", [expected_df])
49 | def test_fetch_data_parquet(tmpdir, expected_df):
50 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.parquet"))
51 | cudf.io.parquet.to_parquet(expected_df, fname)
52 | config = {
53 | "type": "dask_fs",
54 | "input_path": fname,
55 | "columns": ["firstname", "lastname", "gender"],
56 | "input_format": "parquet",
57 | "gather_statistics": False,
58 | "split_row_groups": False
59 | }
60 |
61 | reader = DaskFileSystemReader(config)
62 | fetched_df = reader.fetch_data().compute()
63 |
64 | assert fetched_df.equals(expected_df)
65 |
66 |
67 | @pytest.mark.parametrize("expected_df", [expected_df])
68 | def test_fetch_data_orc(tmpdir, expected_df):
69 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.orc"))
70 | cudf.io.orc.to_orc(expected_df, fname)
71 | config = {
72 | "type": "dask_fs",
73 | "input_path": fname,
74 | "input_format": "orc"
75 | }
76 |
77 | reader = DaskFileSystemReader(config)
78 | fetched_df = reader.fetch_data().compute()
79 |
80 | assert fetched_df.equals(expected_df)
81 |
--------------------------------------------------------------------------------
/python/clx/tests/test_eda.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 |
17 | import cudf
18 | import cuxfilter
19 | import pandas as pd
20 | import pytest
21 |
22 | from clx.eda import EDA
23 |
24 |
25 | @pytest.fixture
26 | def test_dataframe():
27 | df = cudf.DataFrame()
28 | df["a"] = [1, 2, 3, 4]
29 | df["b"] = ["a", "b", "c", "c"]
30 | df["c"] = [True, False, True, True]
31 | df["d"] = cudf.Series(pd.date_range("2000-01-01", periods=3, freq="m"))
32 | return df
33 |
34 |
35 | def test_eda_summary_stats(test_dataframe):
36 | """Test EDA Summary statistics"""
37 | expected_output = {
38 | "SummaryStatistics": {
39 | "a": {"dtype": "int64", "summary": {"unique": "4", "total": "4"}},
40 | "b": {"dtype": "object", "summary": {"unique": "3", "total": "4"}},
41 | "c": {"dtype": "bool", "summary": {"true_percent": "0.75"}},
42 | "d": {
43 | "dtype": "datetime64[ns]",
44 | "summary": {"timespan": "60 days, 2880 hours, 0 minutes, 0 seconds"},
45 | },
46 | }
47 | }
48 | eda = EDA(test_dataframe)
49 | actual_output = eda.analysis
50 | assert expected_output == actual_output
51 |
52 |
53 | def test_eda_save_analysis(tmpdir, test_dataframe):
54 | """Test saving the analysis to a json file"""
55 | fdir = str(tmpdir.mkdir("tmp_test_eda"))
56 | fname = fdir + "/SummaryStatistics.json"
57 | eda = EDA(test_dataframe)
58 | eda.save_analysis(fdir)
59 | expected_output = {
60 | "a": {"dtype": "int64", "summary": {"unique": "4", "total": "4"}},
61 | "b": {"dtype": "object", "summary": {"unique": "3", "total": "4"}},
62 | "c": {"dtype": "bool", "summary": {"true_percent": "0.75"}},
63 | "d": {
64 | "dtype": "datetime64[ns]",
65 | "summary": {"timespan": "60 days, 2880 hours, 0 minutes, 0 seconds"},
66 | },
67 | }
68 | with open(fname) as f:
69 | actual_output = json.load(f)
70 | assert expected_output == actual_output
71 |
72 |
73 | def test_cuxfilter_dashboard(test_dataframe):
74 | """Test generating the dashboard"""
75 | eda = EDA(test_dataframe)
76 | dash = eda.cuxfilter_dashboard()
77 | assert isinstance(dash, cuxfilter.dashboard.DashBoard)
78 | assert len(dash.charts) == 2
79 | assert dash.title == "Exploratory Data Analysis"
80 |
--------------------------------------------------------------------------------
/python/clx/io/writer/kafka_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | log = logging.getLogger(__name__)
18 |
19 |
20 | class KafkaWriter:
21 | """
22 | Publish to Kafka topic based on config object.
23 |
24 | :param kafka_topic: Kafka topic
25 | :param batch_size: batch size
26 | :param delimiter: delimiter
27 | :param producer: producer
28 | """
29 |
30 | # Column name of formatted output messages sent to kafka
31 | output_colname = "delimited_output"
32 |
33 | def __init__(self, kafka_topic, batch_size, delimiter, producer):
34 | self._kafka_topic = kafka_topic
35 | self._batch_size = batch_size
36 | self._delimiter = delimiter
37 | self._producer = producer
38 |
39 | @property
40 | def producer(self):
41 | return self._producer
42 |
43 | @property
44 | def delimiter(self):
45 | return self._delimiter
46 |
47 | def write_data(self, df):
48 | """
49 |         Publish messages to a Kafka topic.
50 |
51 | :param df: dataframe to publish
52 | """
53 | out_df = self._generate_delimited_ouput_col(df)
54 | for rec in out_df.to_records():
55 | self.producer.produce(self._kafka_topic, rec[self.output_colname])
56 | if len(self.producer) > self._batch_size:
57 | log.debug(
58 | "batch reached, calling poll... producer unsent: %s",
59 | len(self.producer),
60 | )
61 | self.producer.poll(0)
62 |
63 | def _generate_delimited_ouput_col(self, gdf):
64 | first_col = gdf.columns[0]
65 | gdf[first_col] = gdf[first_col].astype("str").fillna("")
66 | gdf[self.output_colname] = gdf[first_col].astype("str").str.rstrip()
67 | for col in gdf.columns[1:-1]:
68 | gdf[col] = gdf[col].astype("str").fillna("")
69 | gdf[col] = gdf[col].astype("str").str.rstrip()
70 | gdf[self.output_colname] = gdf[self.output_colname].str.cat(
71 | gdf[col], sep=self.delimiter
72 | )
73 | return gdf
74 |
75 | def close(self):
76 | """
77 | Close Kafka writer
78 | """
79 | log.info("Closing kafka writer...")
80 | if self._producer is not None:
81 | self._producer.flush()
82 | log.info("Closed kafka writer.")
83 |
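84 | # Illustrative usage sketch (not part of the original module): the broker address,
85 | # topic name, and dataframe contents below are placeholder values for the example.
86 | #
87 | #   import cudf
88 | #   from confluent_kafka import Producer
89 | #
90 | #   producer = Producer({"bootstrap.servers": "localhost:9092"})
91 | #   writer = KafkaWriter("output-topic", batch_size=100, delimiter=",", producer=producer)
92 | #   writer.write_data(cudf.DataFrame({"user": ["alice"], "action": ["login"]}))
93 | #   writer.close()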
--------------------------------------------------------------------------------
/python/clx/tests/test_kafka_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from confluent_kafka import Consumer
18 | from confluent_kafka import Message, KafkaError
19 | from mockito import when, mock, verify
20 | from clx.io.reader.kafka_reader import KafkaReader
21 |
22 | batch_size = 100
23 | message = mock(Message)
24 | kafka_error = mock(KafkaError)
25 | when(kafka_error).code().thenReturn("test")
26 | when(message).value().thenReturn("test message".encode("utf-8"))
27 |
28 |
29 | @pytest.mark.parametrize("batch_size", [batch_size])
30 | def test_read_data(batch_size):
31 | consumer = mock(Consumer)
32 | reader = KafkaReader(batch_size, consumer)
33 | # Return msg = None 1 time, then return a valid message moving forward
34 | when(reader.consumer).poll(timeout=1.0).thenReturn(None).thenReturn(message)
35 |     # Always return no error for the message
36 | when(message).error().thenReturn(None)
37 | df = reader.fetch_data()
38 | assert df.shape == (100, 1)
39 | assert df.columns == ["Raw"]
40 | assert df["Raw"][0] == "test message"
41 |     # poll() is called 101 times: 100 (valid messages) + 1 (None message)
42 | verify(reader.consumer, times=101).poll(...)
43 |
44 |
45 | @pytest.mark.parametrize("batch_size", [batch_size])
46 | def test_read_data_message_error(batch_size):
47 | consumer = mock(Consumer)
48 | reader = KafkaReader(batch_size, consumer)
49 | # Return valid message data
50 | when(reader.consumer).poll(timeout=1.0).thenReturn(message)
51 |     # Return no error for the first message, then an error for subsequent messages
52 | when(message).error().thenReturn(None).thenReturn(kafka_error)
53 | df = reader.fetch_data()
54 |
55 | # Validate consumer polls
56 | # 1 (Valid message) + 1 (Error Message) = 2 Consumer polls
57 | verify(reader.consumer, times=2).poll(...)
58 |
59 | # Validate dataframe output
60 | assert df.shape == (1, 1)
61 | assert df.columns == ["Raw"]
62 | assert df["Raw"].to_arrow().to_pylist() == ["test message"]
63 |
64 |
65 | @pytest.mark.parametrize("batch_size", [5])
66 | def test_read_data_no_messages(batch_size):
67 | consumer = mock(Consumer)
68 | reader = KafkaReader(batch_size, consumer, time_window=5)
69 | # Return no messages
70 | when(reader.consumer).poll(timeout=1.0).thenReturn(None)
71 | df = reader.fetch_data()
72 |
73 | # Validate dataframe output
74 | assert df.empty
75 |
--------------------------------------------------------------------------------
/python/clx/tests/test_fs_writer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import pytest
17 |
18 | from clx.io.writer.fs_writer import FileSystemWriter
19 |
20 |
21 | expected_df = cudf.DataFrame(
22 | {
23 | "firstname": ["Emma", "Ava", "Sophia"],
24 | "lastname": ["Olivia", "Isabella", "Charlotte"],
25 | "gender": ["F", "F", "F"],
26 | }
27 | )
28 |
29 |
30 | @pytest.mark.parametrize("expected_df", [expected_df])
31 | def test_write_data_csv(tmpdir, expected_df):
32 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.csv"))
33 | config = {
34 | "type": "fs",
35 | "output_path": fname,
36 | "output_format": "csv",
37 | "index": False
38 | }
39 | writer = FileSystemWriter(config)
40 | writer.write_data(expected_df)
41 |
42 | result_df = cudf.read_csv(fname)
43 | assert result_df.equals(expected_df)
44 |
45 |
46 | @pytest.mark.parametrize("expected_df", [expected_df])
47 | def test_write_data_parquet(tmpdir, expected_df):
48 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.parquet"))
49 | config = {
50 | "type": "fs",
51 | "output_path": fname,
52 | "output_format": "parquet"
53 | }
54 | writer = FileSystemWriter(config)
55 | writer.write_data(expected_df)
56 |
57 | result_df = cudf.read_parquet(fname)
58 | assert result_df.equals(expected_df)
59 |
60 |
61 | @pytest.mark.parametrize("expected_df", [expected_df])
62 | def test_write_data_orc(tmpdir, expected_df):
63 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.orc"))
64 | config = {
65 | "type": "fs",
66 | "output_path": fname,
67 | "output_format": "orc",
68 | }
69 | writer = FileSystemWriter(config)
70 | writer.write_data(expected_df)
71 |
72 | result_df = cudf.read_orc(fname)
73 | assert result_df.equals(expected_df)
74 |
75 |
76 | @pytest.mark.parametrize("expected_df", [expected_df])
77 | def test_write_data_json(tmpdir, expected_df):
78 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.json"))
79 | config = {
80 | "type": "fs",
81 | "output_path": fname,
82 | "output_format": "json",
83 | "orient": "records"
84 | }
85 | writer = FileSystemWriter(config)
86 | writer.write_data(expected_df)
87 |
88 | result_df = cudf.read_json(fname, orient="records")
89 | assert result_df.equals(expected_df)
90 |
--------------------------------------------------------------------------------
/python/clx/tests/test_port_heuristic.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | from clx.heuristics import ports
17 |
18 |
19 | def test_major_ports():
20 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.1", "10.0.75.1", "10.0.75.255", "10.110.104.107"])
21 | input_port_col = cudf.Series([137, 137, 7680, 137, 7680])
22 |
23 | expected = cudf.DataFrame()
24 | expected["addr"] = ["10.0.75.1", "10.0.75.255", "10.110.104.107"]
25 | expected["port"] = [137, 137, 7680]
26 | expected["service"] = ["netbios-ns", "netbios-ns", "pando-pub"]
27 | expected["conns"] = [2, 1, 1]
28 |
29 | actual = ports.major_ports(input_addr_col, input_port_col)
30 |
31 | assert actual.equals(expected)
32 |
33 |
34 | def test_major_ports_ephemeral():
35 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.2", "10.0.75.3", "10.0.75.4"])
36 | input_port_col = cudf.Series([50000, 60000, 20000, 80])
37 |
38 | expected = cudf.DataFrame()
39 | expected["addr"] = ["10.0.75.1", "10.0.75.2", "10.0.75.3", "10.0.75.4"]
40 | expected["port"] = [50000, 60000, 20000, 80]
41 | expected["service"] = ["ephemeral", "ephemeral", "dnp", "http"]
42 | expected["conns"] = [1, 1, 1, 1]
43 |
44 | actual = ports.major_ports(input_addr_col, input_port_col, eph_min=50000)
45 |
46 | assert actual.equals(expected)
47 |
48 |
49 | def test_major_ports_min_conns():
50 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.1", "10.0.75.1", "10.0.75.255", "10.110.104.107"])
51 | input_port_col = cudf.Series([137, 137, 7680, 137, 7680])
52 |
53 | expected = cudf.DataFrame()
54 | expected["addr"] = ["10.0.75.1"]
55 | expected["port"] = [137]
56 | expected["service"] = ["netbios-ns"]
57 | expected["conns"] = [2]
58 |
59 | actual = ports.major_ports(input_addr_col, input_port_col, min_conns=2)
60 |
61 | assert actual.equals(expected)
62 |
63 |
64 | def test_major_ports_all_params():
65 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.1", "10.0.75.1", "10.0.75.255", "10.110.104.107", "10.110.104.107"])
66 | input_port_col = cudf.Series([137, 137, 7680, 137, 7680, 7680])
67 |
68 | expected = cudf.DataFrame()
69 | expected["addr"] = ["10.0.75.1", "10.110.104.107"]
70 | expected["port"] = [137, 7680]
71 | expected["service"] = ["netbios-ns", "ephemeral"]
72 | expected["conns"] = [2, 2]
73 |
74 | actual = ports.major_ports(input_addr_col, input_port_col, min_conns=2, eph_min=7000)
75 |
76 | assert actual.equals(expected)
77 |
--------------------------------------------------------------------------------
/ci/gpu/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright (c) 2018-2022, NVIDIA CORPORATION.
3 | ##########################################
4 | # CLX GPU build & testscript for CI #
5 | ##########################################
6 |
7 | set -e
8 | NUMARGS=$#
9 | ARGS=$*
10 |
11 | # Arg parsing function
12 | function hasArg {
13 | (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ")
14 | }
15 |
16 | # Set path and build parallel level
17 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
18 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
19 | export CUDA_REL=${CUDA_VERSION%.*}
20 | export CUDA_SHORT=${CUDA_REL//./}
21 |
22 | # Set home to the job's workspace
23 | export HOME="$WORKSPACE"
24 |
25 | # Switch to project root; also root of repo checkout
26 | cd "$WORKSPACE"
27 | export GIT_DESCRIBE_TAG=`git describe --tags`
28 | export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
29 | unset GIT_DESCRIBE_TAG
30 |
31 | ################################################################################
32 | # SETUP - Check environment
33 | ################################################################################
34 |
35 | gpuci_logger "Get env"
36 | env
37 |
38 | gpuci_logger "Activate conda env"
39 | . /opt/conda/etc/profile.d/conda.sh
40 | conda activate rapids
41 |
42 | gpuci_logger "Install conda dependencies"
43 | gpuci_mamba_retry install -y \
44 | "cuxfilter=${MINOR_VERSION}" \
45 | "faker" \
46 | "python-whois" \
47 | "seqeval=1.2.2"
48 |
49 | pip install -U torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html
50 | pip install "git+https://github.com/rapidsai/cudatashader.git"
51 | pip install "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src"
52 | pip install mockito
53 | pip install wget
54 |
55 | gpuci_logger "Check versions"
56 | python --version
57 | $CC --version
58 | $CXX --version
59 |
60 | gpuci_logger "Show conda info"
61 | conda info
62 | conda config --show-sources
63 | conda list --show-channel-urls
64 |
65 | ################################################################################
66 | # BUILD - Build clx
67 | ################################################################################
68 |
69 | #TODO: Move boa installation to gpuci/rapidsai
70 | gpuci_mamba_retry install boa
71 |
72 | gpuci_logger "Build and install clx..."
73 | cd "${WORKSPACE}"
74 | CONDA_BLD_DIR="${WORKSPACE}/.conda-bld"
75 | gpuci_conda_retry mambabuild --croot "${CONDA_BLD_DIR}" conda/recipes/clx
76 | gpuci_mamba_retry install -c "${CONDA_BLD_DIR}" clx
77 |
78 | ################################################################################
79 | # TEST - Test python package
80 | ################################################################################
81 | set +e -Eo pipefail
82 | EXITCODE=0
83 | trap "EXITCODE=1" ERR
84 |
85 | if hasArg --skip-tests; then
86 | gpuci_logger "Skipping Tests"
87 | else
88 | cd "$WORKSPACE/python"
89 | py.test --ignore=ci --cache-clear --junitxml="$WORKSPACE/junit-clx.xml" -v
90 | "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log
91 | python "$WORKSPACE/ci/utils/nbtestlog2junitxml.py" nbtest.log
92 | fi
93 |
94 | return "${EXITCODE}"
95 |
--------------------------------------------------------------------------------
/python/clx/analytics/detector.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import torch
3 | import torch.nn as nn
4 | from abc import ABC, abstractmethod
5 |
6 | log = logging.getLogger(__name__)
7 |
8 | GPU_COUNT = torch.cuda.device_count()
9 |
10 |
11 | class Detector(ABC):
12 | def __init__(self, lr=0.001):
13 | self.lr = lr
14 | self._model = None
15 | self._optimizer = None
16 | self._criterion = nn.CrossEntropyLoss()
17 |
18 | @property
19 | def model(self):
20 | return self._model
21 |
22 | @property
23 | def optimizer(self):
24 | return self._optimizer
25 |
26 | @property
27 | def criterion(self):
28 | return self._criterion
29 |
30 | @abstractmethod
31 | def init_model(self, char_vocab, hidden_size, n_domain_type, n_layers):
32 | pass
33 |
34 | @abstractmethod
35 | def train_model(self, training_data, labels, batch_size=1000, epochs=1, train_size=0.7):
36 | pass
37 |
38 | @abstractmethod
39 | def predict(self, epoch, train_dataset):
40 | pass
41 |
42 | def load_model(self, file_path):
43 |         """ This function loads an already saved model and sets CUDA parameters.
44 |
45 | :param file_path: File path of a model to be loaded.
46 | :type file_path: string
47 | """
48 |
49 | model = torch.load(file_path)
50 | model.eval()
51 | self._model = model
52 | self._set_model2cuda()
53 | self._set_optimizer()
54 |
55 | def save_model(self, file_path):
56 | """ This function saves model to a given location.
57 |
58 | :param file_path: File path of a model to be saved.
59 | :type file_path: string
60 | """
61 |
62 | torch.save(self.model, file_path)
63 |
64 | def _save_checkpoint(self, checkpoint, file_path):
65 | torch.save(checkpoint, file_path)
66 | log.info("Pretrained model checkpoint saved to location: '{}'".format(file_path))
67 |
68 | def _set_parallelism(self):
69 | if GPU_COUNT > 1:
70 | log.info("CUDA device count: {}".format(GPU_COUNT))
71 | self._model = nn.DataParallel(self.model)
72 | self._set_model2cuda()
73 | else:
74 | self._set_model2cuda()
75 |
76 | def _set_optimizer(self):
77 | self._optimizer = torch.optim.RMSprop(
78 | self.model.parameters(), self.lr, weight_decay=0.0
79 | )
80 |
81 | def _set_model2cuda(self):
82 | if torch.cuda.is_available():
83 |             log.info("Found GPU(s); setting up CUDA for the model")
84 | self.model.cuda()
85 |
86 | def leverage_model(self, model):
87 |         """This function leverages a given model by setting parallelism parameters.
88 |
89 | :param model: Model instance.
90 | :type model: RNNClassifier
91 | """
92 | model.eval()
93 | self._model = model
94 | self._set_parallelism()
95 | self._set_optimizer()
96 |
97 | def _get_unwrapped_model(self):
98 | if GPU_COUNT > 1:
99 | model = self.model.module
100 | else:
101 | model = self.model
102 | return model
103 |
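104 | # Illustrative usage sketch (not part of the original module): concrete subclasses such
105 | # as clx.analytics.dga_detector.DGADetector are driven as in examples/run_dga_training.py:
106 | #
107 | #   dd = DGADetector(lr=0.001)
108 | #   dd.init_model(n_layers=4, char_vocab=128, hidden_size=100, n_domain_type=2)
109 | #   dd.train_model(train_data, labels, batch_size=10000, epochs=2, train_size=0.7)
110 | #   dd.save_model("trained_models/rnn_classifier.bin")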
--------------------------------------------------------------------------------
/python/clx/tests/test_fs_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import pytest
17 | from clx.io.reader.fs_reader import FileSystemReader
18 |
19 | expected_df = cudf.DataFrame(
20 | {
21 | "firstname": ["Emma", "Ava", "Sophia"],
22 | "lastname": ["Olivia", "Isabella", "Charlotte"],
23 | "gender": ["F", "F", "F"],
24 | }
25 | )
26 |
27 |
28 | @pytest.mark.parametrize("expected_df", [expected_df])
29 | def test_fetch_data_csv(tmpdir, expected_df):
30 | fname = tmpdir.mkdir("tmp_test_fs_reader").join("person.csv")
31 | expected_df.to_csv(fname, index=False)
32 |
33 | config = {
34 | "type": "fs",
35 | "input_path": fname,
36 | "names": ["firstname", "lastname", "gender"],
37 | "delimiter": ",",
38 | "usecols": ["firstname", "lastname", "gender"],
39 | "dtype": ["str", "str", "str"],
40 | "header": 0,
41 | "input_format": "csv"
42 | }
43 | reader = FileSystemReader(config)
44 | fetched_df = reader.fetch_data()
45 |
46 | assert fetched_df.equals(expected_df)
47 |
48 |
49 | @pytest.mark.parametrize("expected_df", [expected_df])
50 | def test_fetch_data_parquet(tmpdir, expected_df):
51 | fname = tmpdir.mkdir("tmp_test_fs_reader").join("person.parquet")
52 | cudf.io.parquet.to_parquet(expected_df, fname)
53 |
54 | config = {
55 | "type": "fs",
56 | "input_path": fname,
57 | "input_format": "parquet"
58 | }
59 |
60 | reader = FileSystemReader(config)
61 | fetched_df = reader.fetch_data()
62 |
63 | assert fetched_df.equals(expected_df)
64 |
65 |
66 | @pytest.mark.parametrize("expected_df", [expected_df])
67 | def test_fetch_data_orc(tmpdir, expected_df):
68 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.orc"))
69 | cudf.io.orc.to_orc(expected_df, fname)
70 | config = {
71 | "type": "fs",
72 | "input_path": fname,
73 | "input_format": "orc"
74 | }
75 |
76 | reader = FileSystemReader(config)
77 | fetched_df = reader.fetch_data()
78 |
79 | assert fetched_df.equals(expected_df)
80 |
81 |
82 | @pytest.mark.parametrize("expected_df", [expected_df])
83 | def test_fetch_data_json(tmpdir, expected_df):
84 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.json"))
85 | cudf.io.json.to_json(expected_df, fname, orient="records")
86 | config = {
87 | "type": "fs",
88 | "input_path": fname,
89 | "orient": "records",
90 | "input_format": "json"
91 | }
92 |
93 | reader = FileSystemReader(config)
94 | fetched_df = reader.fetch_data()
95 |
96 | assert fetched_df.equals(expected_df)
97 |
--------------------------------------------------------------------------------
/examples/run_dga_training.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | """
16 | Example Usage: python run_dga_training.py \
17 | --training-data benign_and_dga_domains.csv \
18 | --output-dir trained_models \
19 | --batch-size 10000 \
20 | --epochs 2
21 | """
22 | import os
23 | import cudf
24 | import torch
25 | import argparse
26 | from datetime import datetime
27 | from clx.analytics.dga_detector import DGADetector
28 |
29 | LR = 0.001
30 | N_LAYERS = 4
31 | CHAR_VOCAB = 128
32 | HIDDEN_SIZE = 100
33 | N_DOMAIN_TYPE = 2
34 |
35 | def main():
36 | epochs = int(args["epochs"])
37 | input_filepath = args["training_data"]
38 | batch_size = int(args["batch_size"])
39 | output_dir = args["output_dir"]
40 | # load input data to gpu memory
41 | input_df = cudf.read_csv(input_filepath)
42 | train_data = input_df['domain']
43 | labels = input_df['type']
44 | del input_df
45 | dd = DGADetector(lr=LR)
46 | dd.init_model(
47 | n_layers=N_LAYERS,
48 | char_vocab=CHAR_VOCAB,
49 | hidden_size=HIDDEN_SIZE,
50 | n_domain_type=N_DOMAIN_TYPE,
51 | )
52 | dd.train_model(train_data, labels, batch_size=batch_size, epochs=epochs, train_size=0.7)
53 |
54 | if not os.path.exists(output_dir):
55 | print("Creating directory '{}'".format(output_dir))
56 | os.makedirs(output_dir)
57 | now = datetime.now()
58 | model_filename = "rnn_classifier_{}.bin".format(now.strftime("%Y-%m-%d_%H_%M_%S"))
59 | model_filepath = os.path.join(output_dir, model_filename)
60 | print("Saving trained model to location '{}'".format(model_filepath))
61 | dd.save_model(model_filepath)
62 |
63 | def parse_cmd_args():
64 | # construct the argument parse and parse the arguments
65 | ap = argparse.ArgumentParser(description="DGA detection model training script")
66 | ap.add_argument(
67 | "--training-data", required=True, help="CSV with domain and type fields"
68 | )
69 | ap.add_argument(
70 | "--output-dir", required=True, help="output directory to save new model files"
71 | )
72 | ap.add_argument(
73 | "--batch-size",
74 | required=True,
75 |         help="Number of samples per training batch",
76 | )
77 | ap.add_argument(
78 | "--epochs",
79 | required=True,
80 |         help="Number of training epochs; one epoch is a single full pass of the dataset through the network",
81 | )
82 | args = vars(ap.parse_args())
83 | return args
84 |
85 |
86 | # execution starts here
87 | if __name__ == "__main__":
88 | args = parse_cmd_args()
89 | main()
90 |
--------------------------------------------------------------------------------
/python/clx/osi/whois.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # use `pip install python-whois`
16 | import whois
17 | import logging
18 |
19 | log = logging.getLogger(__name__)
20 |
21 |
22 | class WhoIsLookupClient(object):
23 |     """
24 |     Wrapper class to query the WhoIs API.
25 | 
26 |     :param sep: Delimiter to concat nested list values from the WhoIs response.
27 |     :param datetime_format: Format to convert WhoIs response datetime objects.
28 |     """
29 | 
30 |     str_arr_keys = ["domain_name", "name_servers", "status", "emails", "dnssec"]
31 |     datetime_arr_keys = ["creation_date", "updated_date", "expiration_date"]
32 | 
33 | def __init__(self, sep=",", datetime_format="%m-%d-%Y %H:%M:%S"):
34 | self.sep = sep
35 | self.datetime_format = datetime_format
36 |
37 | def whois(self, domains, arr2str=True):
38 | """
39 | Function to access parsed WhoIs data for a given domain.
40 |
41 | :param domains: Domains to perform whois lookup.
42 | :type domains: list
43 | :param arr2str: Convert WhoIs lookup response object to list of strings.
44 | :type arr2str: boolean
45 | :return: WhoIs information with respect to given domains.
46 | :rtype: list/obj
47 |
48 | Examples
49 | --------
50 | >>> from clx.osi.whois import WhoIsLookupClient
51 | >>> domains = ["nvidia.com"]
52 | >>> client = WhoIsLookupClient()
53 | >>> client.whois(domains)
54 | [{'domain_name': 'NVIDIA.COM', 'registrar': 'Safenames Ltd', 'whois_server': 'whois.safenames.net'...}]
55 | """
56 | result = []
57 | for domain in domains:
58 | resp = whois.whois(domain)
59 | if arr2str:
60 | resp_keys = resp.keys()
61 | resp = self.__flatten_str_array(resp, resp_keys)
62 | resp = self.__flatten_datetime_array(resp, resp_keys)
63 | result.append(resp)
64 | return result
65 |
66 | def __flatten_str_array(self, resp, resp_keys):
67 | for key in self.str_arr_keys:
68 | if key in resp_keys and isinstance(resp[key], list):
69 | resp[key] = self.sep.join(resp[key])
70 | return resp
71 |
72 | def __flatten_datetime_array(self, resp, resp_keys):
73 | for key in self.datetime_arr_keys:
74 | values = []
75 | if key in resp_keys:
76 | if isinstance(resp[key], list):
77 | for ts in resp[key]:
78 | values.append(ts.strftime(self.datetime_format))
79 | resp[key] = self.sep.join(values)
80 | else:
81 | resp[key] = resp[key].strftime(self.datetime_format)
82 | return resp
83 |
--------------------------------------------------------------------------------
/python/clx/io/factory/kafka_factory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | from confluent_kafka import Consumer
18 | from confluent_kafka import Producer
19 |
20 | from clx.io.factory.abstract_factory import AbstractFactory
21 | from clx.io.reader.kafka_reader import KafkaReader
22 | from clx.io.writer.kafka_writer import KafkaWriter
23 |
24 | log = logging.getLogger(__name__)
25 |
26 |
27 | class KafkaFactory(AbstractFactory):
28 | def __init__(self, config):
29 | """
30 | Constructor method
31 |
32 | :param config: dictionary object of config values for **batch_size**, **time_window**, **publisher_kafka_topic**, **output_delimiter**, **kafka_brokers**, and **group_id**.
33 | """
34 | self._config = config
35 |
36 | def get_reader(self):
37 | """
38 | Get instance of KafkaReader
39 | """
40 | consumer = self._create_consumer()
41 | if "time_window" in self.config:
42 | reader = KafkaReader(
43 | self.config["batch_size"],
44 | consumer,
45 | time_window=self.config["time_window"],
46 | )
47 | else:
48 | reader = KafkaReader(self.config["batch_size"], consumer)
49 | return reader
50 |
51 | def get_writer(self):
52 | """
53 | Get instance of KafkaWriter
54 | """
55 | producer = self._create_producer()
56 | writer = KafkaWriter(
57 | self.config["publisher_kafka_topic"],
58 | self.config["batch_size"],
59 | self.config["output_delimiter"],
60 | producer,
61 | )
62 | return writer
63 |
64 | def _create_consumer(self):
65 | log.info("creating kafka consumer instance")
66 | consumer_conf = {
67 | "bootstrap.servers": self.config["kafka_brokers"],
68 | "group.id": self.config["group_id"],
69 | "session.timeout.ms": 10000,
70 | "default.topic.config": {"auto.offset.reset": "largest"},
71 | }
72 |
73 | c = Consumer(consumer_conf)
74 | c.subscribe(
75 | self.config["consumer_kafka_topics"], on_assign=self.print_assignment
76 | )
77 | log.info("created kafka consumer instance")
78 | return c
79 |
80 | def _create_producer(self):
81 | log.info("creating kafka producer instance")
82 | producer_conf = {
83 | "bootstrap.servers": self.config["kafka_brokers"],
84 | "session.timeout.ms": 10000,
85 | }
86 | producer = Producer(producer_conf)
87 | log.info("created producer instance")
88 | return producer
89 |
90 | def print_assignment(self, consumer, partitions):
91 | print("Assignment:", partitions)
92 |
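93 | # Illustrative usage sketch (not part of the original module): the broker address, group
94 | # id, and topic names below are placeholder values; the config keys mirror those read by
95 | # get_reader/get_writer and _create_consumer/_create_producer above.
96 | #
97 | #   config = {
98 | #       "kafka_brokers": "localhost:9092",
99 | #       "group_id": "clx-example",
100 | #       "consumer_kafka_topics": ["input-topic"],
101 | #       "publisher_kafka_topic": "output-topic",
102 | #       "batch_size": 100,
103 | #       "output_delimiter": ",",
104 | #       "time_window": 30,
105 | #   }
106 | #   factory = KafkaFactory(config)
107 | #   reader = factory.get_reader()
108 | #   writer = factory.get_writer()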
--------------------------------------------------------------------------------
/python/clx/tests/test_binary_sequence_classifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import random
15 | from os import path
16 |
17 | import cudf
18 | import torch
19 | import transformers
20 | from cuml.model_selection import train_test_split
21 | from faker import Faker
22 |
23 | from clx.analytics.binary_sequence_classifier import BinarySequenceClassifier
24 |
25 | sc = BinarySequenceClassifier()
26 | if torch.cuda.is_available():
27 | sc.init_model("bert-base-uncased")
28 |
29 |
30 | def test_train_model():
31 | if torch.cuda.is_available():
32 | fake = Faker()
33 | email_col = [fake.text() for _ in range(200)]
34 | label_col = [random.randint(0, 1) for _ in range(200)]
35 | emails_gdf = cudf.DataFrame(list(zip(email_col, label_col)), columns=["email", "label"])
36 | X_train, X_test, y_train, y_test = train_test_split(
37 | emails_gdf, "label", train_size=0.8, random_state=10
38 | )
39 | sc.train_model(
40 | X_train["email"],
41 | y_train,
42 | learning_rate=3e-5,
43 | max_seq_len=128,
44 | batch_size=6,
45 | epochs=1,
46 | )
47 | assert isinstance(
48 | sc._model.module,
49 | transformers.models.bert.modeling_bert.BertForSequenceClassification,
50 | )
51 |
52 |
53 | def test_evaluate_model():
54 | if torch.cuda.is_available():
55 | X_test = cudf.Series(["email 1", "email 2"])
56 | y_test = cudf.Series([0, 0])
57 | accuracy = sc.evaluate_model(
58 | X_test, y_test, max_seq_len=128, batch_size=32
59 | )
60 | assert accuracy >= 0.0 and accuracy <= 1.0
61 |
62 |
63 | def test_predict():
64 | if torch.cuda.is_available():
65 | X_test = cudf.Series(["email 1", "email 2"])
66 | preds = sc.predict(X_test, max_seq_len=128)
67 | assert preds[0].isin([False, True]).equals(cudf.Series([True, True]))
68 |
69 |
70 | def test_save_model(tmpdir):
71 | if torch.cuda.is_available():
72 | sc.save_model(tmpdir)
73 | assert path.exists(str(tmpdir.join("config.json")))
74 | assert path.exists(str(tmpdir.join("pytorch_model.bin")))
75 |
76 |
77 | def test_save_checkpoint(tmpdir):
78 | if torch.cuda.is_available():
79 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar"))
80 | sc.save_checkpoint(fname)
81 | assert path.exists(fname)
82 |
83 |
84 | def test_load_checkpoint(tmpdir):
85 | if torch.cuda.is_available():
86 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar"))
87 | sc.save_checkpoint(fname)
88 | assert path.exists(fname)
89 | sc.load_checkpoint(fname)
90 | assert isinstance(
91 | sc._model.module,
92 | transformers.models.bert.modeling_bert.BertForSequenceClassification,
93 | )
94 |
--------------------------------------------------------------------------------
/python/clx/tests/test_multiclass_sequence_classifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import random
15 | from os import path
16 |
17 | import cudf
18 | import torch
19 | import transformers
20 | from cuml.model_selection import train_test_split
21 | from faker import Faker
22 |
23 | from clx.analytics.multiclass_sequence_classifier import MulticlassSequenceClassifier
24 |
25 | sc = MulticlassSequenceClassifier()
26 | if torch.cuda.is_available():
27 | sc.init_model("bert-base-uncased", num_labels=3)
28 |
29 |
30 | def test_train_model():
31 | if torch.cuda.is_available():
32 | fake = Faker()
33 | email_col = [fake.text() for _ in range(200)]
34 | label_col = [random.randint(0, 2) for _ in range(200)]
35 | emails_gdf = cudf.DataFrame(list(zip(email_col, label_col)), columns=["email", "label"])
36 | X_train, X_test, y_train, y_test = train_test_split(
37 | emails_gdf, "label", train_size=0.8, random_state=10
38 | )
39 | sc.train_model(
40 | X_train["email"],
41 | y_train,
42 | learning_rate=3e-5,
43 | max_seq_len=128,
44 | batch_size=6,
45 | epochs=1,
46 | )
47 | assert isinstance(
48 | sc._model.module,
49 | transformers.models.bert.modeling_bert.BertForSequenceClassification,
50 | )
51 |
52 |
53 | def test_evaluate_model():
54 | if torch.cuda.is_available():
55 | X_test = cudf.Series(["email 1", "email 2"])
56 | y_test = cudf.Series([0, 0])
57 | accuracy = sc.evaluate_model(
58 | X_test, y_test, max_seq_len=128, batch_size=32
59 | )
60 | assert accuracy >= 0.0 and accuracy <= 1.0
61 |
62 |
63 | def test_predict():
64 | if torch.cuda.is_available():
65 | X_test = cudf.Series(["email 1", "email 2"])
66 | preds = sc.predict(X_test, max_seq_len=128)
67 | assert preds.isin([0, 1, 2]).equals(cudf.Series([True, True]))
68 |
69 |
70 | def test_save_model(tmpdir):
71 | if torch.cuda.is_available():
72 | sc.save_model(tmpdir)
73 | assert path.exists(str(tmpdir.join("config.json")))
74 | assert path.exists(str(tmpdir.join("pytorch_model.bin")))
75 |
76 |
77 | def test_save_checkpoint(tmpdir):
78 | if torch.cuda.is_available():
79 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar"))
80 | sc.save_checkpoint(fname)
81 | assert path.exists(fname)
82 |
83 |
84 | def test_load_checkpoint(tmpdir):
85 | if torch.cuda.is_available():
86 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar"))
87 | sc.save_checkpoint(fname)
88 | assert path.exists(fname)
89 | sc.load_checkpoint(fname)
90 | assert isinstance(
91 | sc._model.module,
92 | transformers.models.bert.modeling_bert.BertForSequenceClassification,
93 | )
94 |
--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =============
3 |
4 |
5 | IP
6 | --
7 | .. automodule:: clx.ip
8 | :members:
9 |
10 | Features
11 | --------
12 | .. automodule:: clx.features
13 | :members:
14 |
15 | Analytics
16 | ---------
17 | .. autoclass:: clx.analytics.asset_classification.AssetClassification
18 | :members:
19 |
20 | .. autoclass:: clx.analytics.binary_sequence_classifier.BinarySequenceClassifier
21 | :members:
22 | :inherited-members:
23 |
24 | .. autoclass:: clx.analytics.cybert.Cybert
25 | :members:
26 |
27 | .. autoclass:: clx.analytics.detector.Detector
28 | :members:
29 |
30 | .. autoclass:: clx.analytics.dga_dataset.DGADataset
31 | :members:
32 |
33 | .. autoclass:: clx.analytics.dga_detector.DGADetector
34 | :members:
35 |
36 | .. autoclass:: clx.analytics.loda.Loda
37 | :members:
38 |
39 | .. autoclass:: clx.analytics.model.rnn_classifier.RNNClassifier
40 | :members:
41 |
42 | .. autoclass:: clx.analytics.model.tabular_model.TabularModel
43 | :members:
44 |
45 | .. autoclass:: clx.analytics.multiclass_sequence_classifier.MulticlassSequenceClassifier
46 | :members:
47 | :inherited-members:
48 |
49 | .. automodule:: clx.analytics.anomaly_detection
50 | :members:
51 |
52 | .. automodule:: clx.analytics.perfect_hash
53 | :members:
54 |
55 | .. automodule:: clx.analytics.periodicity_detection
56 | :members:
57 |
58 | .. automodule:: clx.analytics.stats
59 | :members:
60 |
61 | DNS Extractor
62 | -------------
63 | .. automodule:: clx.dns.dns_extractor
64 | :members:
65 |
66 | Exploratory Data Analysis
67 | -------------------------
68 | .. autoclass:: clx.eda.EDA
69 | :members:
70 |
71 | Heuristics
72 | ----------
73 | .. automodule:: clx.heuristics.ports
74 | :members:
75 |
76 | OSI (Open Source Integration)
77 | -----------------------------
78 | .. autoclass:: clx.osi.farsight.FarsightLookupClient
79 | :members:
80 |
81 | .. autoclass:: clx.osi.virus_total.VirusTotalClient
82 | :members:
83 |
84 | .. autoclass:: clx.osi.whois.WhoIsLookupClient
85 | :members:
86 |
87 | .. autoclass:: clx.osi.slashnext.SlashNextClient
88 | :members:
89 |
90 | Parsers
91 | -------
92 |
93 | .. autoclass:: clx.parsers.event_parser.EventParser
94 | :members:
95 |
96 | .. autoclass:: clx.parsers.splunk_notable_parser.SplunkNotableParser
97 | :members:
98 |
99 | .. autoclass:: clx.parsers.windows_event_parser.WindowsEventParser
100 | :members:
101 |
102 | .. automodule:: clx.parsers.zeek
103 | :members:
104 |
105 | Utils
106 | -----
107 |
108 | .. autoclass:: clx.utils.data.dataloader.DataLoader
109 | :members:
110 |
111 | .. autoclass:: clx.utils.data.dataset.Dataset
112 | :members:
113 |
114 | .. autoclass:: clx.utils.data.utils
115 | :members:
116 |
117 | Workflow
118 | --------
119 |
120 | .. autoclass:: clx.workflow.workflow.Workflow
121 | :members:
122 |
123 | .. autoclass:: clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow
124 | :members:
125 |
126 | I/O
127 | --------
128 |
129 | .. autoclass:: clx.io.reader.kafka_reader.KafkaReader
130 | :members:
131 |
132 | .. autoclass:: clx.io.reader.dask_fs_reader.DaskFileSystemReader
133 | :members:
134 |
135 | .. autoclass:: clx.io.reader.fs_reader.FileSystemReader
136 | :members:
137 |
138 | .. autoclass:: clx.io.writer.kafka_writer.KafkaWriter
139 | :members:
140 |
141 | .. autoclass:: clx.io.writer.fs_writer.FileSystemWriter
142 | :members:
143 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # CLX Code of Conduct
2 |
3 | CLX has adopted the [Contributor Covenant Code of Conduct](https://docs.rapids.ai/resources/conduct):
4 |
5 | ## Our Pledge
6 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
7 |
8 | ## Our Standards
9 | ### Examples of behavior that contributes to creating a positive environment include:
10 |
11 | - Using welcoming and inclusive language,
12 | - Being respectful of differing viewpoints and experiences,
13 | - Gracefully accepting constructive criticism,
14 | - Focusing on what is best for the community, and
15 | - Showing empathy towards other community members.
16 |
17 | ### Examples of unacceptable behavior by participants include:
18 |
19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances,
20 | - Trolling, insulting/derogatory comments, and personal or political attacks,
21 | - Public or private harassment,
22 | - Publishing others’ private information, such as a physical or electronic address, without explicit permission, and
23 | - Other conduct which could reasonably be considered inappropriate in a professional setting.
24 |
25 | ## Our Responsibilities
26 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
27 |
28 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
29 |
30 | ## Scope
31 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
32 |
33 | ## Enforcement
34 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [conduct@rapids.ai](mailto:conduct@rapids.ai). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
35 |
36 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership.
37 |
38 | ## Attribution
39 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html).
40 |
41 | For answers to common questions about this code of conduct, see [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq).
--------------------------------------------------------------------------------
/python/clx/eda/summary_stats.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2020, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cuxfilter
16 |
17 | from clx.eda.analysis import Analysis
18 |
19 |
20 | class SummaryStatistics(Analysis):
21 | def __init__(self, dataframe):
22 | super().__init__(dataframe)
23 |
24 | def __summary_obj(self, series):
25 | summary = {}
26 | uniq_count = len(series.unique())
27 | total = series.notna().sum()
28 | summary["unique"] = str(uniq_count)
29 | summary["total"] = str(total)
30 | return summary
31 |
32 | def __summary_bool(self, series):
33 | summary = {}
34 | true_per = (series == True).sum() # noqa: E712
35 | summary["true_percent"] = str(true_per / len(series))
36 | return summary
37 |
38 | def __summary_num(self, series):
39 | summary = {}
40 | uniq_count = len(series.unique())
41 | total = series.notna().sum()
42 | summary["unique"] = str(uniq_count)
43 | summary["total"] = str(total)
44 | return summary
45 |
46 | def __summary_time(self, series):
47 | summary = {}
48 | duration = series.max() - series.min()
49 | days = duration.astype("timedelta64[D]").astype(int)
50 | seconds = duration.astype("timedelta64[s]").astype(int)
51 | hours = days * 24 + seconds // 3600
52 | minutes = (seconds % 3600) // 60
53 | seconds = seconds % 60
54 | msg = "{0} days, {1} hours, {2} minutes, {3} seconds".format(
55 | days, hours, minutes, seconds
56 | )
57 | summary["timespan"] = msg
58 | return summary
59 |
60 | def _generate_analysis(self, dataframe):
61 |         # Receives a dataframe and returns a dictionary of summary statistics
62 | summary_dict = {}
63 | for col in dataframe.columns:
64 | summary_dict[col] = {}
65 | summary_dict[col]["dtype"] = str(dataframe[col].dtype)
66 | if dataframe[col].dtype == "object":
67 | summary_dict[col]["summary"] = self.__summary_obj(dataframe[col])
68 | elif dataframe[col].dtype == "bool":
69 | summary_dict[col]["summary"] = self.__summary_bool(dataframe[col])
70 | elif dataframe[col].dtype in ["int64", "float64", "int8"]:
71 | summary_dict[col]["summary"] = self.__summary_num(dataframe[col])
72 | elif dataframe[col].dtype == "datetime64[ns]":
73 | summary_dict[col]["summary"] = self.__summary_time(dataframe[col])
74 | else:
75 | msg = "\t column type (" + str(dataframe[col].dtype) + ") not supported"
76 | summary_dict[col]["error"] = msg
77 | return summary_dict
78 |
79 | def _generate_charts(self, dataframe):
80 | """Get barcharts for the summary analysis"""
81 | charts = []
82 | for col in dataframe.columns:
83 | if dataframe[col].dtype == "object":
84 | bars = len(dataframe[col].unique())
85 | if bars < 30:
86 | if bars > 1:
87 | charts.append(cuxfilter.charts.bar(col))
88 | return charts
89 |
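90 | # Illustrative usage sketch (not part of the original module): SummaryStatistics is
91 | # normally reached through clx.eda.EDA, as exercised in python/clx/tests/test_eda.py:
92 | #
93 | #   import cudf
94 | #   from clx.eda import EDA
95 | #
96 | #   df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "y"]})
97 | #   eda = EDA(df)
98 | #   print(eda.analysis["SummaryStatistics"])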
--------------------------------------------------------------------------------
/siem_integrations/clx_query_service/clx_query_service/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for clx_query_service project.
3 |
4 | Generated by 'django-admin startproject' using Django 2.2.6.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/2.2/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/2.2/ref/settings/
11 | """
12 |
13 | import os
14 |
15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/2.2/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = "i6nr8@pzj5$@^(y903w5tc)8%v!(lk!3npl$1z7(%##2zxv"
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = False
27 |
28 | ALLOWED_HOSTS = ["localhost"]
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = [
34 | "django.contrib.admin",
35 | "django.contrib.auth",
36 | "django.contrib.contenttypes",
37 | "django.contrib.sessions",
38 | "django.contrib.messages",
39 | "django.contrib.staticfiles",
40 | "rest_framework",
41 | "clxquery.apps.ClxQueryConfig",
42 | ]
43 |
44 | MIDDLEWARE = [
45 | "django.middleware.security.SecurityMiddleware",
46 | "django.contrib.sessions.middleware.SessionMiddleware",
47 | "django.middleware.common.CommonMiddleware",
48 | "django.middleware.csrf.CsrfViewMiddleware",
49 | "django.contrib.auth.middleware.AuthenticationMiddleware",
50 | "django.contrib.messages.middleware.MessageMiddleware",
51 | "django.middleware.clickjacking.XFrameOptionsMiddleware",
52 | ]
53 |
54 | ROOT_URLCONF = "clx_query_service.urls"
55 |
56 | TEMPLATES = [
57 | {
58 | "BACKEND": "django.template.backends.django.DjangoTemplates",
59 | "DIRS": [],
60 | "APP_DIRS": True,
61 | "OPTIONS": {
62 | "context_processors": [
63 | "django.template.context_processors.debug",
64 | "django.template.context_processors.request",
65 | "django.contrib.auth.context_processors.auth",
66 | "django.contrib.messages.context_processors.messages",
67 | ]
68 | },
69 | }
70 | ]
71 |
72 | WSGI_APPLICATION = "clx_query_service.wsgi.application"
73 |
74 |
75 | # Database
76 | # https://docs.djangoproject.com/en/2.2/ref/settings/#databases
77 |
78 | DATABASES = {
79 | "default": {
80 | "ENGINE": "django.db.backends.sqlite3",
81 | "NAME": os.path.join(BASE_DIR, "db.sqlite3"),
82 | }
83 | }
84 |
85 |
86 | # Password validation
87 | # https://docs.djangoproject.com/en/2.2/ref/settings/#auth-password-validators
88 |
89 | AUTH_PASSWORD_VALIDATORS = [
90 | {
91 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"
92 | },
93 | {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"},
94 | {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"},
95 | {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"},
96 | ]
97 |
98 |
99 | # Internationalization
100 | # https://docs.djangoproject.com/en/2.2/topics/i18n/
101 |
102 | LANGUAGE_CODE = "en-us"
103 |
104 | TIME_ZONE = "UTC"
105 |
106 | USE_I18N = True
107 |
108 | USE_L10N = True
109 |
110 | USE_TZ = True
111 |
112 |
113 | # Static files (CSS, JavaScript, Images)
114 | # https://docs.djangoproject.com/en/2.2/howto/static-files/
115 |
116 | STATIC_URL = "/static/"
117 |
--------------------------------------------------------------------------------
/python/clx/io/reader/kafka_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2019, NVIDIA CORPORATION.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import cudf
16 | import logging
17 | import time
18 | from confluent_kafka import KafkaError
19 | from clx.io.reader.reader import Reader
20 |
21 | log = logging.getLogger(__name__)
22 |
23 |
24 | class KafkaReader(Reader):
25 | """
26 | Reads from Kafka based on config object.
27 |
28 | :param batch_size: batch size
29 | :param consumer: Kafka consumer
30 | :param time_window: Max window of time that queued events will wait to be pushed to workflow
31 | """
32 | def __init__(self, batch_size, consumer, time_window=30):
33 | self._batch_size = batch_size
34 | self._consumer = consumer
35 | self._has_data = True
36 | self._time_window = time_window
37 |
38 | @property
39 | def consumer(self):
40 | return self._consumer
41 |
42 | @property
43 | def has_data(self):
44 | return self._has_data
45 |
46 | @property
47 | def time_window(self):
48 | return self._time_window
49 |
50 | def fetch_data(self):
51 | """
52 | Fetch data from Kafka based on provided config object
53 | """
54 | events = []
55 | rec_cnt = 0
56 | running = True
57 | current_time = time.time()
58 | try:
59 | while running:
60 | # First check if batch size or time window has been exceeded
61 | if (
62 | rec_cnt >= self._batch_size or (time.time() - current_time) >= self.time_window
63 | ):
64 | log.debug(
65 | "Exceeded record count (" + str(rec_cnt) + ") or time window (" + str(time.time() - current_time) + ")"
66 | )
67 | running = False
68 | # Else poll next message in kafka queue
69 | else:
70 | msg = self.consumer.poll(timeout=1.0)
71 | if msg is None:
72 | log.debug("No message received.")
73 | continue
74 | elif not msg.error():
75 | data = msg.value().decode("utf-8")
76 | log.debug("Message received.")
77 | events.append(data)
78 | rec_cnt += 1
79 | elif msg.error().code() != KafkaError._PARTITION_EOF:
80 | log.error(msg.error())
81 | running = False
82 | else:
83 | running = False
84 | df = cudf.DataFrame()
85 | if len(events) > 0:
86 | df["Raw"] = events
87 | log.debug("Kafka reader batch aggregation complete. Dataframe size = " + str(df.shape))
88 | return df
89 | except Exception:
90 | log.error("Error fetching data from kafka")
91 | raise
92 |
93 | def close(self):
94 | """
95 | Close Kafka reader
96 | """
97 | log.info("Closing kafka reader...")
98 | if self.consumer is not None:
99 | self.consumer.close()
100 | log.info("Closed kafka reader.")
101 |
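102 | # Illustrative usage sketch (not part of the original module): the broker address, group
103 | # id, and topic below are placeholder values; the consumer settings mirror those built by
104 | # clx.io.factory.kafka_factory.KafkaFactory.
105 | #
106 | #   from confluent_kafka import Consumer
107 | #
108 | #   consumer = Consumer({
109 | #       "bootstrap.servers": "localhost:9092",
110 | #       "group.id": "clx-example",
111 | #       "default.topic.config": {"auto.offset.reset": "largest"},
112 | #   })
113 | #   consumer.subscribe(["input-topic"])
114 | #   reader = KafkaReader(batch_size=100, consumer=consumer, time_window=30)
115 | #   df = reader.fetch_data()
116 | #   reader.close()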
--------------------------------------------------------------------------------