├── docs ├── source │ ├── _static │ │ └── EMPTY │ ├── index.rst │ └── api.rst ├── requirement.txt ├── README.md ├── Makefile └── make.bat ├── python ├── clx │ ├── dns │ │ └── __init__.py │ ├── io │ │ ├── __init__.py │ │ ├── factory │ │ │ ├── __init__.py │ │ │ ├── abstract_factory.py │ │ │ ├── dask_fs_factory.py │ │ │ ├── fs_factory.py │ │ │ ├── factory.py │ │ │ └── kafka_factory.py │ │ ├── reader │ │ │ ├── __init__.py │ │ │ ├── reader.py │ │ │ ├── file_reader.py │ │ │ ├── fs_reader.py │ │ │ ├── dask_fs_reader.py │ │ │ └── kafka_reader.py │ │ └── writer │ │ │ ├── __init__.py │ │ │ ├── writer.py │ │ │ ├── file_writer.py │ │ │ ├── fs_writer.py │ │ │ └── kafka_writer.py │ ├── osi │ │ ├── __init__.py │ │ └── whois.py │ ├── utils │ │ ├── __init__.py │ │ └── data │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ ├── dataloader.py │ │ │ └── utils.py │ ├── analytics │ │ ├── __init__.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── tabular_model.py │ │ │ └── rnn_classifier.py │ │ ├── dga_dataset.py │ │ ├── anomaly_detection.py │ │ ├── periodicity_detection.py │ │ ├── stats.py │ │ └── detector.py │ ├── heuristics │ │ └── __init__.py │ ├── parsers │ │ ├── __init__.py │ │ ├── resources │ │ │ └── splunk_notable_regex.yaml │ │ └── zeek.py │ ├── workflow │ │ ├── __init__.py │ │ └── netflow_workflow.py │ ├── eda │ │ ├── __init__.py │ │ ├── analysis.py │ │ └── summary_stats.py │ ├── __init__.py │ └── tests │ │ ├── test_anomaly_detection.py │ │ ├── test_utils.py │ │ ├── test_kafka_writer.py │ │ ├── test_dga_dataset.py │ │ ├── test_features.py │ │ ├── test_dataloader.py │ │ ├── test_whois.py │ │ ├── test_event_parser.py │ │ ├── test_stats.py │ │ ├── test_loda.py │ │ ├── test_netflow_workflow.py │ │ ├── test_dask_fs_reader.py │ │ ├── test_eda.py │ │ ├── test_kafka_reader.py │ │ ├── test_fs_writer.py │ │ ├── test_port_heuristic.py │ │ ├── test_fs_reader.py │ │ ├── test_binary_sequence_classifier.py │ │ └── test_multiclass_sequence_classifier.py ├── .gitattributes ├── MANIFEST.in ├── pytest.ini ├── .flake8 ├── setup.py └── setup.cfg ├── examples ├── streamz │ ├── python │ │ ├── clx_streamz_tools │ │ │ └── __init__.py │ │ ├── setup.py │ │ ├── dga_detection.py │ │ ├── phishing_detection.py │ │ └── cybert.py │ ├── resources │ │ ├── cybert.yaml │ │ ├── dga_detection.yaml │ │ └── phishing_detection.yaml │ └── scripts │ │ └── entrypoint.sh └── run_dga_training.py ├── siem_integrations ├── clx_query_service │ ├── clxquery │ │ ├── __init__.py │ │ ├── migrations │ │ │ └── __init__.py │ │ ├── models.py │ │ ├── admin.py │ │ ├── tests.py │ │ ├── apps.py │ │ ├── urls.py │ │ ├── utils.py │ │ ├── logging.conf │ │ ├── blazingsql_helper.py │ │ └── views.py │ ├── clx_query_service │ │ ├── __init__.py │ │ ├── urls.py │ │ ├── wsgi.py │ │ └── settings.py │ ├── conf │ │ ├── clx_blz_reader_conf.yaml │ │ └── clx_query_service.conf │ └── manage.py ├── splunk2kafka │ ├── export2kafka │ │ ├── bin │ │ │ ├── scripts │ │ │ │ └── nothing.sh │ │ │ └── export2kafka.py │ │ ├── default │ │ │ ├── commands.conf │ │ │ ├── data │ │ │ │ └── ui │ │ │ │ │ └── nav │ │ │ │ │ └── default.xml │ │ │ └── app.conf │ │ ├── metadata │ │ │ └── default.meta │ │ └── README.md │ └── splunk_wrapper │ │ ├── wrapper-install.sh │ │ └── README.md ├── clx_query │ ├── clx_query.png │ ├── default │ │ ├── clx_query_setup.conf │ │ ├── commands.conf │ │ ├── app.conf │ │ ├── logging.conf │ │ └── data │ │ │ └── ui │ │ │ └── nav │ │ │ └── default.xml │ ├── metadata │ │ └── default.meta │ └── bin │ │ ├── clx_query_conf.py │ │ └── clx_query.py └── Dockerfile ├── conda ├── recipes │ 
└── clx │ │ ├── build.sh │ │ ├── conda_build_config.yaml │ │ └── meta.yaml └── environments │ └── clx_dev_cuda11.5.yml ├── notebooks ├── alert_analysis │ └── workflow_implementation │ │ ├── input.csv │ │ ├── input2.csv │ │ ├── image1.png │ │ ├── image3.png │ │ ├── image4.png │ │ ├── image5.png │ │ ├── image7.png │ │ └── image8.png └── ids_detection │ └── util.py ├── img └── rapids_logo.png ├── ci ├── cpu │ ├── prebuild.sh │ ├── upload.sh │ └── build.sh ├── integration_tests │ ├── Dockerfile.test │ ├── docker-compose.test.yml │ └── README.md ├── checks │ ├── style.sh │ └── changelog.sh ├── utils │ └── nbtest.sh ├── docs │ └── build.sh ├── release │ └── update-version.sh ├── gpu │ ├── test-notebooks.sh │ └── build.sh └── local │ └── README.md ├── docker ├── .run_in_rapids.sh ├── start_jupyter.sh ├── stop_jupyter.sh └── .start_jupyter_run_in_rapids.sh ├── .github ├── ISSUE_TEMPLATE │ ├── submit_question.md │ ├── feature_request.md │ ├── documentation-request.md │ └── bug_report.md ├── workflows │ └── labeler.yml ├── ops-bot.yaml ├── CODEOWNERS └── labeler.yml ├── .pre-commit-config.yaml ├── .gitignore ├── docker-compose.yml ├── Dockerfile └── CODE_OF_CONDUCT.md /docs/source/_static/EMPTY: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/dns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/osi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/analytics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/heuristics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/io/factory/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/io/reader/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/io/writer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/utils/data/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /python/clx/workflow/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/clx/analytics/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/streamz/python/clx_streamz_tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/.gitattributes: -------------------------------------------------------------------------------- 1 | clx/_version.py export-subst 2 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clx_query_service/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include versioneer.py 2 | include clx/_version.py 3 | -------------------------------------------------------------------------------- /python/clx/eda/__init__.py: -------------------------------------------------------------------------------- 1 | from clx.eda.eda import EDA # noqa: F401 2 | -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/bin/scripts/nothing.sh: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /conda/recipes/clx/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./build.sh clx 4 | -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/input.csv: -------------------------------------------------------------------------------- 1 | raw 2 | hello gtcdc -------------------------------------------------------------------------------- /docs/requirement.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx_rtd_theme 3 | numpydoc 4 | ipython 5 | nbsphinx -------------------------------------------------------------------------------- /img/rapids_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/img/rapids_logo.png -------------------------------------------------------------------------------- /ci/cpu/prebuild.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export BUILD_CLX=1 4 | export UPLOAD_CLX=1 5 | 
-------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/input2.csv: -------------------------------------------------------------------------------- 1 | raw 2 | username=gtcdc host=1.2.3.4 3 | -------------------------------------------------------------------------------- /docker/.run_in_rapids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/conda/etc/profile.d/conda.sh 3 | conda activate rapids 4 | exec "$@" 5 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /siem_integrations/clx_query/clx_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/siem_integrations/clx_query/clx_query.png -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/default/commands.conf: -------------------------------------------------------------------------------- 1 | [export2kafka] 2 | filename = export2kafka.py 3 | chunked = true 4 | -------------------------------------------------------------------------------- /siem_integrations/clx_query/default/clx_query_setup.conf: -------------------------------------------------------------------------------- 1 | [setupentity] 2 | clx_hostname = localhost 3 | clx_port = 8998 4 | clx_query_limit = 10000 5 | -------------------------------------------------------------------------------- /conda/recipes/clx/conda_build_config.yaml: -------------------------------------------------------------------------------- 1 | c_compiler_version: 2 | - 9 3 | 4 | cxx_compiler_version: 5 | - 9 6 | 7 | sysroot_version: 8 | - "2.17" 9 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clx_query_service/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path, include 2 | 3 | urlpatterns = [path("", include("clxquery.urls"))] 4 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class ClxQueryConfig(AppConfig): 5 | name = "clxquery" 6 | -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/image1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image1.png -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image3.png -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/image4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image4.png -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image5.png -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/image7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image7.png -------------------------------------------------------------------------------- /notebooks/alert_analysis/workflow_implementation/image8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai/clx/HEAD/notebooks/alert_analysis/workflow_implementation/image8.png -------------------------------------------------------------------------------- /python/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | log_cli = 1 3 | log_cli_level = INFO 4 | log_cli_format = %(asctime)s [%(levelname)s] %(message)s (%(filename)s:%(lineno)s) 5 | log_cli_date_format="%Y-%m-%d %H:%M:%S" 6 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import re_path 2 | from clxquery import views 3 | 4 | urlpatterns = [re_path("clxquery/$", views.ExecuteClxQuery.as_view())] -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/default/data/ui/nav/default.xml: -------------------------------------------------------------------------------- 1 | 5 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/submit_question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Submit question 3 | about: Ask a general question about CLX 4 | title: "[QST]" 5 | labels: "? 
- Needs Triage, question" 6 | assignees: '' 7 | 8 | --- 9 | 10 | **What is your question?** -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: "Pull Request Labeler" 2 | on: 3 | - pull_request_target 4 | 5 | jobs: 6 | triage: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/labeler@main 10 | with: 11 | repo-token: "${{ secrets.GITHUB_TOKEN }}" 12 | -------------------------------------------------------------------------------- /docker/start_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='' > /dev/null 2>&1 & 3 | echo -e "\n" 4 | echo "nohup jupyter-lab --allow-root --ip=0.0.0.0 --no-browser --NotebookApp.token='rapids' > /dev/null 2>&1 &" 5 | echo -e "\n" 6 | -------------------------------------------------------------------------------- /.github/ops-bot.yaml: -------------------------------------------------------------------------------- 1 | # This file controls which features from the `ops-bot` repository below are enabled. 2 | # - https://github.com/rapidsai/ops-bot 3 | 4 | auto_merger: true 5 | branch_checker: true 6 | label_checker: true 7 | release_drafter: true 8 | external_contributors: false 9 | copy_prs: true 10 | -------------------------------------------------------------------------------- /docker/stop_jupyter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ps aux | grep jupyter | \ 3 | grep --extended-regexp "$USER[\ ]{1,10}[0-9]{1,10}" | \ 4 | grep --only-matching --extended-regexp "$USER[\ ]{1,10}[0-9]{1,10}" | \ 5 | grep --only-matching --extended-regexp "[\ ]{1,10}[0-9]{1,10}" | \ 6 | xargs kill -9 7 | sleep 2 -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/default/app.conf: -------------------------------------------------------------------------------- 1 | # Version 7.0.0 2 | # Splunk app configuration file 3 | 4 | [install] 5 | is_configured = 0 6 | 7 | [ui] 8 | is_visible = 1 9 | label = CyberWorks 10 | 11 | [launcher] 12 | author = ASE Team 13 | description = 14 | version = 1.0 15 | 16 | -------------------------------------------------------------------------------- /siem_integrations/clx_query/metadata/default.meta: -------------------------------------------------------------------------------- 1 | [] 2 | access = read : [ * ], write : [ * ] 3 | 4 | [props/sendmail/] 5 | export = system 6 | owner = nobody 7 | 8 | ### VIEWSTATES: even normal users should be able to create shared viewstates 9 | 10 | [viewstates] 11 | access = read : [ * ], write : [ * ] 12 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/conf/clx_blz_reader_conf.yaml: -------------------------------------------------------------------------------- 1 | input_format: csv 2 | tables: 3 | - 4 | input_path: "/rapids/my_data/movies" 5 | table_name: movies 6 | header: 0 7 | - 8 | input_path: "/rapids/my_data/ratings" 9 | table_name: ratings 10 | header: 0 11 | type: blazingsql 12 | 13 | -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/metadata/default.meta: 
[]
access = read : [ * ], write : [ * ]

[props/sendmail/]
export = system
owner = nobody

### VIEWSTATES: even normal users should be able to create shared viewstates

[viewstates]
access = read : [ * ], write : [ * ]

/python/.flake8:

# Copyright (c) 2020, NVIDIA CORPORATION.

[flake8]
exclude = factory.py,perfect_hash.py
ignore =
    # line break before binary operator
    W503
    # whitespace before :
    E203
    # line too long (82 > 79 characters)
    E501
    # invalid escape sequence '\x'
    W605

/.github/CODEOWNERS:

#python code owners
clx/ @rapidsai/clx-python-codeowners

#build/ops code owners
.github/ @rapidsai/ops-codeowners
ci/ @rapidsai/ops-codeowners
conda/ @rapidsai/ops-codeowners
**/Dockerfile @rapidsai/ops-codeowners
**/.dockerignore @rapidsai/ops-codeowners

/docs/README.md:

# Building Documentation

A basic Python environment with the packages listed in `./requirement.txt` is
enough to build the docs.

## Get additional dependencies

```bash
pip install -r requirement.txt
```

## Run the makefile

```bash
make html
```

Outputs to `build/html/index.html`

/siem_integrations/clx_query/default/commands.conf:

# [commands.conf]($SPLUNK_HOME/etc/system/README/commands.conf.spec)
[defaults]

[clx]
filename = clx_query.py
enableheader = true
outputheader = true
requires_srinfo = true
supports_getinfo = true
supports_multivalues = true
supports_rawargs = true
stderr_dest = message

/.github/labeler.yml:

# https://github.com/actions/labeler#common-examples
# Adapted from https://github.com/rapidsai/clx/blob/main/.github/CODEOWNERS
# Labels culled from https://github.com/rapidsai/clx/labels

Python:
  - 'python/**'
  - 'notebooks/**'

integration:
  - 'siem_integrations'

gpuCI:
  - 'ci/**'

conda:
  - 'conda/**'

/siem_integrations/clx_query/default/app.conf:

# Splunk app configuration file

[ui]
label = Clx Query
is_visible = 1

[launcher]
description = This app has the ability to run custom queries against the CLX Python module, which internally triggers a workflow to retrieve data.
9 | author = CLX 10 | version = 0.13 11 | 12 | [package] 13 | id = clx_query 14 | 15 | [install] 16 | is_configured = 0 17 | -------------------------------------------------------------------------------- /docker/.start_jupyter_run_in_rapids.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | . /opt/conda/etc/profile.d/conda.sh 3 | conda activate rapids 4 | /rapids/utils/start_jupyter.sh > /dev/null 5 | echo "Notebook server successfully started!" 6 | echo "To access visit http://localhost:8888 on your host machine." 7 | echo 'Ensure the following arguments to "docker run" are added to expose the server ports to your host machine: 8 | -p 8888:8888 -p 8787:8787 -p 8786:8786' 9 | exec "$@" 10 | -------------------------------------------------------------------------------- /ci/integration_tests/Dockerfile.test: -------------------------------------------------------------------------------- 1 | ARG repository=rapidsai-dev-nightly 2 | ARG version=0.11-cuda10.0-devel-ubuntu18.04-py3.7 3 | 4 | FROM rapidsai/${repository}:${version} 5 | 6 | ADD . /clx/ 7 | 8 | ADD ./ci/integration_tests/run_integration_test.py /clx/run_integration_test.py 9 | 10 | SHELL ["/bin/bash", "-c"] 11 | RUN source activate rapids \ 12 | && cd /clx \ 13 | && python setup.py install 14 | 15 | WORKDIR /clx 16 | CMD source activate rapids && python run_integration_test.py -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/timothycrosley/isort 3 | rev: 5.0.7 4 | hooks: 5 | - id: isort 6 | - repo: https://github.com/ambv/black 7 | rev: 19.10b0 8 | hooks: 9 | - id: black 10 | - repo: https://gitlab.com/pycqa/flake8 11 | rev: 3.8.3 12 | hooks: 13 | - id: flake8 14 | alias: flake8 15 | name: flake8 16 | args: ["--config=python/.flake8"] 17 | types: [python] 18 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clx_query_service/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for clx_query_service project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.2/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clx_query_service.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Common 2 | *.pyc 3 | *.a 4 | *.o 5 | *.so 6 | *.dylib 7 | .cache 8 | .coverage 9 | .vscode 10 | *.swp 11 | .DS_Store 12 | 13 | # Python 14 | __pycache__/ 15 | .pytest_cache/ 16 | build/ 17 | dist/ 18 | clx.egg-info/ 19 | python/clx/analytics/*.cpp 20 | 21 | ## C++ build directories & artifacts 22 | CMakeFiles/ 23 | Debug 24 | build/ 25 | bin/ 26 | 27 | # Dask 28 | dask-worker-space/ 29 | 30 | # IDE 31 | .idea/ 32 | *.iml 33 | 34 | # Test output 35 | clx/tests/output 36 | clx/tests/.config 37 | rnn_classifier_2020-06-08_20_48_03.pth 38 | 39 | # Jupyter 40 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | import versioneer 4 | 5 | 6 | setup( 7 | name="clx", 8 | version=versioneer.get_version(), 9 | description="CLX", 10 | author="NVIDIA Corporation", 11 | packages=find_packages(include=["clx", "clx.*"]), 12 | package_data={ 13 | "clx.analytics": ["resources/*.txt"], 14 | "clx.parsers": ["resources/*.yaml"], 15 | "clx.dns": ["resources/*.txt"], 16 | "clx.heuristics": ["resources/*.csv"] 17 | }, 18 | license="Apache", 19 | cmdclass=versioneer.get_cmdclass() 20 | ) 21 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/conf/clx_query_service.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | logfile = /tmp/supervisord.log 3 | logfile_maxbytes = 50MB 4 | logfile_backups=5 5 | 6 | 7 | [inet_http_server] 8 | port=127.0.0.1:9001 9 | 10 | [rpcinterface:supervisor] 11 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 12 | 13 | [supervisorctl] 14 | serverurl=http://127.0.0.1:9001 15 | 16 | 17 | [program:clx_query_service] 18 | directory=/rapids/clx/siem_integrations/clx_query_service/bin 19 | command=bash start_service.sh -p 8998 -w 2 -t 60 20 | autostart=false 21 | autorestart=true -------------------------------------------------------------------------------- /siem_integrations/clx_query/default/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys = root, ClxQuery 3 | 4 | [logger_root] 5 | level = INFO ; Default: WARNING 6 | handlers = stderr ; Default: stderr 7 | 8 | [logger_ClxQuery] 9 | qualname = ClxQuery 10 | level = INFO ; Default: WARNING 11 | handlers = stderr ; Default: stderr 12 | 13 | [handlers] 14 | keys=stderr 15 | 16 | [handler_stderr] 17 | class = logging.StreamHandler 18 | level = NOTSET 19 | args = (sys.stderr,) 20 | formatter = search_command 21 | 22 | [formatters] 23 | keys = search_command 24 | 25 | [formatter_search_command] 26 | format=%(levelname)s:%(module)s: %(message)s 27 | -------------------------------------------------------------------------------- /docs/Makefile: 
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = clx
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

/siem_integrations/clx_query/default/data/ui/nav/default.xml:

(XML navigation definition; markup not preserved in this capture)

/ci/integration_tests/docker-compose.test.yml:

version: '3'
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:latest
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
  kafka:
    image: confluentinc/cp-kafka:latest
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
  clx:
    build:
      context: ../../
      dockerfile: ci/integration_tests/Dockerfile.test
    depends_on:
      - kafka
    ports:
      - "8888:8888"

/ci/integration_tests/README.md:

# CLX Integration Testing

CLX integrates with [Kafka](https://kafka.apache.org/) to read data from and write data to a Kafka queue. An integration test environment has been created to simulate and test this interaction.

## Running the Integration Test

To run the integration test (`run_integration_test.py`), run the following:

```
cd ci/integration_tests
docker-compose -f docker-compose.test.yml up
```

Before re-running the integration tests, remember to first tear down the current Docker images and containers:

```
cd ci/integration_tests
docker-compose down
```

/siem_integrations/clx_query_service/manage.py:

#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys


def main():
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "clx_query_service.settings")
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
16 | ) from exc 17 | execute_from_command_line(sys.argv) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for CLX 4 | title: "[FEA]" 5 | labels: "? - Needs Triage, feature request" 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I wish I could use CLX to do [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context, code examples, or references to existing implementations about the feature request here. -------------------------------------------------------------------------------- /python/clx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | # Versioneer 15 | from ._version import get_versions 16 | __version__ = get_versions()['version'] 17 | 18 | del get_versions 19 | -------------------------------------------------------------------------------- /conda/environments/clx_dev_cuda11.5.yml: -------------------------------------------------------------------------------- 1 | name: clx_dev 2 | channels: 3 | - rapidsai 4 | - rapidsai-nightly 5 | - conda-forge 6 | dependencies: 7 | - cudatoolkit=11.5 8 | - python>=3.6,<3.9 9 | - cugraph=23.04.* 10 | - cuml=23.04.* 11 | - cuxfilter=23.04.* 12 | - scikit-learn=0.23.1 13 | - s3fs 14 | - ipywidgets 15 | - python-confluent-kafka 16 | - transformers=4.* 17 | - seqeval 18 | - python-whois 19 | - seaborn 20 | - requests 21 | - matplotlib 22 | - pip 23 | - pytest 24 | - faker 25 | - jupyterlab 26 | - sphinx 27 | - sphinx_rtd_theme 28 | - numpydoc 29 | - ipython 30 | - nbsphinx 31 | - pip: 32 | - "git+https://github.com/rapidsai/cudatashader.git" 33 | - "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src" 34 | - wget 35 | - mockito 36 | - torch==1.11.0 37 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | #JupyterLab will be available at port 9888 4 | clx: 5 | build: . 
6 | ports: 7 | - "9888:8888" 8 | - "8787:8787" 9 | - "8686:8686" 10 | stdin_open: true 11 | tty: true 12 | runtime: nvidia 13 | #Zookeeper will be available at `zookeeper:2181` 14 | zookeeper: 15 | image: confluentinc/cp-zookeeper:latest 16 | environment: 17 | ZOOKEEPER_CLIENT_PORT: 2181 18 | ZOOKEEPER_TICK_TIME: 2000 19 | #Kafka will be available at `kafka:9092` 20 | kafka: 21 | image: confluentinc/cp-kafka:latest 22 | depends_on: 23 | - zookeeper 24 | environment: 25 | KAFKA_BROKER_ID: 1 26 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 29 | -------------------------------------------------------------------------------- /python/clx/io/writer/writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | 17 | 18 | class Writer(ABC): 19 | @abstractmethod 20 | def close(self): 21 | pass 22 | 23 | @abstractmethod 24 | def write_data(self): 25 | pass 26 | -------------------------------------------------------------------------------- /examples/streamz/python/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from setuptools import setup 16 | 17 | setup( 18 | name="clx_streamz_tools", 19 | version="0.1", 20 | author="NVIDIA Corporation", 21 | packages=["clx_streamz_tools"], 22 | include_package_data=True, 23 | ) 24 | -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/splunk_wrapper/wrapper-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sudo -i 3 | 4 | SPLUNKHOME=/opt/splunk 5 | 6 | cd $SPLUNKHOME/bin 7 | mv splunk splunk.splunk 8 | cat < splunk.wrapper 9 | #!/bin/bash 10 | 11 | RETVAL=0 12 | 13 | switch_python_splunk() { 14 | echo Switching Python to Splunk distro... 15 | rm -f $SPLUNKHOME/bin/python2.7 16 | cp -a $SPLUNKHOME/bin/python2.7.splunk $SPLUNKHOME/bin/python2.7 17 | } 18 | switch_python_conda() { 19 | echo Switching Python to Miniconda distro... 
20 | rm -f $SPLUNKHOME/bin/python2.7 21 | cp -a $SPLUNKHOME/bin/python2.7.conda $SPLUNKHOME/bin/python2.7 22 | } 23 | 24 | switch_python_splunk 25 | sleep 1 26 | $SPLUNKHOME/bin/splunk.splunk \$@ 27 | RETVAL=\$? 28 | sleep 5 29 | switch_python_conda 30 | 31 | exit \$RETVAL 32 | EOF 33 | chmod 755 splunk.wrapper 34 | chown splunk:splunk splunk.wrapper 35 | ln -s splunk.wrapper splunk -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. clx documentation main file, created by 2 | sphinx-quickstart on Thu Oct 3 16:57:19 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to clx's documentation! 7 | =============================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | api.rst 14 | 10min-clx.ipynb 15 | intro-clx-workflow.ipynb 16 | intro-clx-dga.ipynb 17 | intro-clx-streamz.ipynb 18 | intro-clx-asset-classification.ipynb 19 | intro-clx-cybert.ipynb 20 | intro-clx-loda-anomaly-detection.ipynb 21 | intro-clx-periodicity-detection.ipynb 22 | intro-clx-phishing-detection.ipynb 23 | intro-clx-predictive-maintenance.ipynb 24 | 25 | Indices and tables 26 | ================== 27 | 28 | * :ref:`genindex` 29 | * :ref:`modindex` 30 | * :ref:`search` 31 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import yaml 16 | 17 | """ 18 | Utility script 19 | """ 20 | 21 | 22 | def load_yaml(yaml_file): 23 | with open(yaml_file) as yaml_file: 24 | config = yaml.safe_load(yaml_file) 25 | return config 26 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,applog 3 | [handlers] 4 | keys=rotateFileHandler,rotateConsoleHandler 5 | 6 | [formatters] 7 | keys=applog_format,console_format 8 | 9 | [formatter_applog_format] 10 | format=%(asctime)s-[%(levelname)-8s]:%(message)s 11 | 12 | [formatter_console_format] 13 | format=%(asctime)s-%(filename)s%(lineno)d[%(levelname)s]:%(message)s 14 | 15 | [logger_root] 16 | level=WARNING 17 | handlers=rotateFileHandler,rotateConsoleHandler 18 | 19 | [logger_applog] 20 | level=WARNING 21 | handlers=rotateFileHandler 22 | qualname=simple_example 23 | 24 | [handler_rotateFileHandler] 25 | class=handlers.RotatingFileHandler 26 | level=WARNING 27 | formatter=applog_format 28 | args=('applog.log', 'a', 10000, 9) 29 | 30 | [handler_rotateConsoleHandler] 31 | class=StreamHandler 32 | level=WARNING 33 | formatter=console_format 34 | args=(sys.stdout,) 35 | -------------------------------------------------------------------------------- /ci/checks/style.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2019, NVIDIA CORPORATION. 3 | ################################################################################ 4 | # clx Style Tester 5 | ################################################################################ 6 | 7 | # Ignore errors and set path 8 | set +e 9 | PATH=/conda/bin:$PATH 10 | 11 | # Activate common conda env 12 | . /opt/conda/etc/profile.d/conda.sh 13 | conda activate rapids 14 | 15 | # Run flake8 and get results/return code 16 | FLAKE=`flake8 --ignore=E501,W605 --exclude="factory.py,perfect_hash.py" python` 17 | RETVAL=$? 18 | 19 | # Output results if failure otherwise show pass 20 | if [ "$FLAKE" != "" ]; then 21 | echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n" 22 | echo -e "$FLAKE" 23 | echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n" 24 | else 25 | echo -e "\n\n>>>> PASSED: flake8 style check\n\n" 26 | fi 27 | 28 | exit $RETVAL 29 | -------------------------------------------------------------------------------- /python/clx/io/writer/file_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 

from abc import abstractmethod
from clx.io.writer.writer import Writer


class FileWriter(Writer):

    @property
    def config(self):
        return self._config

    @abstractmethod
    def write_data(self):
        pass

/siem_integrations/splunk2kafka/splunk_wrapper/README.md:

# splunk_wrapper

## Overview

This is a wrapper script that handles switching the Python version so that start, stop, and restart commands work as expected from `init.d` and the Splunk Web UI.

## Pre-reqs

1. Install [Miniconda2](https://repo.continuum.io/miniconda/) in $SPLUNKHOME as the splunk user:
```
sudo -i -u splunk bash
Add path to ~/.bashrc
```

2. Back up the Splunk Python executable in `/opt/splunk/bin`:
```
mv /opt/splunk/bin/python2.7 $SPLUNKHOME/bin/python2.7.splunk
```

3. Create a symlink to the Miniconda Python in `/opt/splunk/bin`:
```
ln -s /opt/splunk/miniconda2/bin/python2.7 /opt/splunk/python2.7.conda
```

## Install

**NOTE:** Do not run this script twice, as it will remove Splunk's Python. This is an active area of development.

Run `sudo bash wrapper-install.sh` to install.

/docs/make.bat:

@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXPROJ=clx

if "%1" == "" goto help

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd

/python/setup.cfg:

# Copyright (c) 2018-2019, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# See the docstring in versioneer.py for instructions. Note that you must
# re-run 'versioneer.py setup' after changing this section, and commit the
# resulting files.

[versioneer]
VCS = git
style = pep440
versionfile_source = clx/_version.py
versionfile_build = clx/_version.py
tag_prefix = v
parentdir_prefix = clx-

/python/clx/io/factory/abstract_factory.py:

# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod


class AbstractFactory(ABC):
    @property
    def config(self):
        return self._config

    @config.setter
    def config(self, val):
        self._config = val

    @abstractmethod
    def get_reader(self):
        pass

    @abstractmethod
    def get_writer(self):
        pass

/python/clx/workflow/netflow_workflow.py:

# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from clx.workflow.workflow import Workflow

log = logging.getLogger(__name__)


class NetflowWorkflow(Workflow):
    def workflow(self, dataframe):
        """TODO: Implement netflow dataframe enrichment"""
        log.debug("Processing netflow workflow data...")
        dataframe["netflow_enriched"] = "netflow_enriched"
        return dataframe
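NetflowWorkflow above illustrates the pattern for extending the CLX `Workflow` base class: implement `workflow(self, dataframe)` and return the enriched frame. Below is a minimal sketch of another subclass following that same pattern; the base-class import mirrors the file above, while the class name and the added column are purely illustrative, not part of the repository.

```python
import logging

from clx.workflow.workflow import Workflow

log = logging.getLogger(__name__)


class TaggingWorkflow(Workflow):
    """Hypothetical workflow that tags every record with a static label."""

    def workflow(self, dataframe):
        # Receives a dataframe, adds a column, and returns the result,
        # mirroring NetflowWorkflow.workflow above.
        log.debug("Tagging incoming records...")
        dataframe["tag"] = "example_tag"
        return dataframe
```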
/siem_integrations/Dockerfile:

# An integration test & dev container based on rapids-dev-nightly with CLX installed from current branch
ARG RAPIDS_VERSION=0.13
ARG CUDA_VERSION=10.1
ARG LINUX_VERSION=ubuntu18.04
ARG PYTHON_VERSION=3.7

FROM rapidsai/rapidsai-dev-nightly:${RAPIDS_VERSION}-cuda${CUDA_VERSION}-devel-${LINUX_VERSION}-py${PYTHON_VERSION}

ADD . /rapids/clx/

SHELL ["/bin/bash", "-c"]

RUN apt update -y --fix-missing && \
    apt upgrade -y && \
    apt install -y vim

RUN source activate rapids \
    && conda install -c blazingsql-nightly/label/cuda${CUDA_VERSION} -c blazingsql-nightly -c rapidsai-nightly -c conda-forge blazingsql

RUN source activate rapids \
    && conda install -y -c pytorch pytorch==1.3.1 torchvision=0.4.2 datashader>=0.10.* panel=0.6.* geopandas>=0.6.* pyppeteer s3fs gunicorn djangorestframework django supervisor nginx \
    && pip install "git+https://github.com/rapidsai/cudatashader.git" \
    && cd /rapids/clx \
    && pip install -e .

WORKDIR /rapids

/.github/ISSUE_TEMPLATE/documentation-request.md:

---
name: Documentation request
about: Report incorrect or needed documentation
title: "[DOC]"
labels: "? - Needs Triage, doc"
assignees: ''

---

## Report incorrect documentation

**Location of incorrect documentation**
Provide links and line numbers if applicable.

**Describe the problems or issues found in the documentation**
A clear and concise description of what you found to be incorrect.

**Steps taken to verify documentation is incorrect**
List any steps you have taken:

**Suggested fix for documentation**
Detail proposed changes to fix the documentation if you have any.

---

## Report needed documentation

**Report needed documentation**
A clear and concise description of what documentation you believe is needed and why.

**Describe the documentation you'd like**
A clear and concise description of what you want to happen.

**Steps taken to search for needed documentation**
List any steps you have taken:

/python/clx/utils/data/dataset.py:

# Copyright (c) 2020, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class Dataset(object):
    def __init__(self, df):
        self._df = df.reset_index(drop=True)
        self._dataset_len = self._df.shape[0]

    @property
    def length(self):
        """
        Returns dataframe length
        """
        return self._dataset_len

    @property
    def data(self):
        """
        Returns dataframe
        """
        return self._df
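A short sketch of how the `Dataset` wrapper above might be used. It assumes a cuDF DataFrame (cuDF is a RAPIDS dependency of CLX); the column name and values are illustrative only.

```python
import cudf

from clx.utils.data.dataset import Dataset

# Wrap a small frame; Dataset resets the index and records the row count.
df = cudf.DataFrame({"domain": ["nvidia.com", "example.org", "rapids.ai"]})
dataset = Dataset(df)

print(dataset.length)  # number of rows in the wrapped DataFrame
print(dataset.data)    # the underlying DataFrame, with its index reset
```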
/conda/recipes/clx/meta.yaml:

# Copyright (c) 2019, NVIDIA CORPORATION.

# Usage:
#   conda build -c conda-forge .
{% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %}
{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
{% set py_version=environ.get('CONDA_PY', 36) %}
{% set cuda_version=environ.get('CUDA_REL', '0') %}

package:
  name: clx
  version: {{ version }}

source:
  git_url: ../../..

build:
  number: {{ GIT_DESCRIBE_NUMBER }}
  string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
  script_env:
    - VERSION_SUFFIX

requirements:
  build:
    - {{ compiler('c') }}
    - sysroot_{{ target_platform }} {{ sysroot_version }}
  host:
    - python
  run:
    - python
    - mkl
    - cugraph {{ minor_version }}.*
    - cuml {{ minor_version }}.*

about:
  home: http://rapids.ai/
  license: Apache-2.0
  license_family: Apache
  license_file: LICENSE
  summary: clx library

/.github/ISSUE_TEMPLATE/bug_report.md:

---
name: Bug report
about: Create a bug report to help us improve CLX
title: "[BUG]"
labels: "? - Needs Triage, bug"
assignees: ''

---

**Describe the bug**
A clear and concise description of what the bug is.

**Steps/Code to reproduce bug**
Follow this guide http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports to craft a minimal bug report. This helps us reproduce the issue you're having and resolve the issue more quickly.

**Expected behavior**
A clear and concise description of what you expected to happen.

**Environment overview (please complete the following information)**
- Environment location: [Bare-metal, Docker, Cloud(specify cloud provider)]
- Method of CLX install: [conda, Docker, or from source]
- If method of install is [Docker], provide `docker pull` & `docker run` commands used

**Environment details**
Please run and paste the output of the `/rapids/cudf/print_env.sh` script here, to gather any other relevant environment details. The script is located in the docker container.

**Additional context**
Add any other context about the problem here.

/python/clx/io/reader/reader.py:

# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod


class Reader(ABC):
    @property
    def has_data(self):
        return self._has_data

    @has_data.setter
    def has_data(self, val):
        self._has_data = val

    @property
    def config(self):
        return self._config

    @config.setter
    def config(self, val):
        self._config = val

    @abstractmethod
    def close(self):
        pass

    @abstractmethod
    def fetch_data(self):
        pass
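The abstract `Reader` above only requires `fetch_data` and `close`, plus the `has_data` and `config` properties it already defines. The concrete readers in this repository (`fs_reader.py`, `dask_fs_reader.py`, `kafka_reader.py`) are the real implementations; the following is just a toy in-memory subclass, sketched here to show the contract, with all names and data illustrative.

```python
import cudf

from clx.io.reader.reader import Reader


class InMemoryReader(Reader):
    """Toy reader that serves a single DataFrame exactly once."""

    def __init__(self, df):
        self._df = df
        self._has_data = True
        self._config = {}

    def fetch_data(self):
        # Hand back the frame once, then report that the source is drained.
        self.has_data = False
        return self._df

    def close(self):
        pass


reader = InMemoryReader(cudf.DataFrame({"raw": ["hello gtcdc"]}))
while reader.has_data:
    print(reader.fetch_data())
reader.close()
```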
14 | 15 | from abc import ABC, abstractmethod 16 | 17 | 18 | class Reader(ABC): 19 | @property 20 | def has_data(self): 21 | return self._has_data 22 | 23 | @has_data.setter 24 | def has_data(self, val): 25 | self._has_data = val 26 | 27 | @property 28 | def config(self): 29 | return self._config 30 | 31 | @config.setter 32 | def config(self, val): 33 | self._config = val 34 | 35 | @abstractmethod 36 | def close(self): 37 | pass 38 | 39 | @abstractmethod 40 | def fetch_data(self): 41 | pass 42 | -------------------------------------------------------------------------------- /python/clx/io/reader/file_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod 16 | from clx.io.reader.reader import Reader 17 | 18 | 19 | class FileReader(Reader): 20 | @property 21 | def has_data(self): 22 | return self._has_data 23 | 24 | @has_data.setter 25 | def has_data(self, val): 26 | self._has_data = val 27 | 28 | @property 29 | def config(self): 30 | return self._config 31 | 32 | @config.setter 33 | def config(self, val): 34 | self._config = val 35 | 36 | @abstractmethod 37 | def close(self): 38 | pass 39 | 40 | @abstractmethod 41 | def fetch_data(self): 42 | pass 43 | -------------------------------------------------------------------------------- /python/clx/io/factory/dask_fs_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from clx.io.factory.abstract_factory import AbstractFactory 16 | from clx.io.reader.dask_fs_reader import DaskFileSystemReader 17 | 18 | 19 | class DaskFileSystemFactory(AbstractFactory): 20 | def __init__(self, config): 21 | """ 22 | Constructor method 23 | 24 | :param config: dictionary object of config values for **type**, **input_format**, **input_path**, and dask reader optional keyword args 25 | """ 26 | self._config = config 27 | 28 | def get_reader(self): 29 | """ 30 | Get instance of DaskFileSystemReader 31 | """ 32 | 33 | return DaskFileSystemReader(self.config) 34 | 35 | def get_writer(self): 36 | raise NotImplementedError 37 | -------------------------------------------------------------------------------- /python/clx/analytics/dga_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | from clx.utils.data.dataset import Dataset 17 | from clx.utils.data import utils 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class DGADataset(Dataset): 23 | """Constructor to create DGADataset instance. 24 | 25 | :param df: Input dataframe. 26 | :type df: cudf.DataFrame 27 | :param truncate: Truncate string to n number of characters. 28 | :type truncate: int 29 | """ 30 | 31 | def __init__(self, df, truncate): 32 | df = self.__preprocess(df, truncate) 33 | super().__init__(df) 34 | 35 | def __preprocess(self, df, truncate): 36 | df['domain'] = df['domain'].str.slice_replace(truncate, repl='') 37 | df = utils.str2ascii(df, 'domain') 38 | return df 39 | -------------------------------------------------------------------------------- /python/clx/io/factory/fs_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from clx.io.factory.abstract_factory import AbstractFactory 16 | from clx.io.reader.fs_reader import FileSystemReader 17 | from clx.io.writer.fs_writer import FileSystemWriter 18 | 19 | 20 | class FileSystemFactory(AbstractFactory): 21 | def __init__(self, config): 22 | """ 23 | Constructor method 24 | 25 | :param config: dictionary object of config values for **type**, **input_format**, **input_path** (or **output_path**), and dask reader/writer optional keyword args 26 | """ 27 | self._config = config 28 | 29 | def get_reader(self): 30 | """ 31 | Get instance of FileSystemReader 32 | """ 33 | return FileSystemReader(self.config) 34 | 35 | def get_writer(self): 36 | return FileSystemWriter(self.config) 37 | -------------------------------------------------------------------------------- /python/clx/tests/test_anomaly_detection.py: -------------------------------------------------------------------------------- 1 | import cudf 2 | 3 | import clx.analytics.anomaly_detection 4 | import clx.features 5 | 6 | 7 | def test_anomaly_detection(): 8 | df = cudf.DataFrame( 9 | { 10 | "time": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 11 | "user": [ 12 | "u1", 13 | "u5", 14 | "u4", 15 | "u2", 16 | "u3", 17 | "u1", 18 | "u1", 19 | "u1", 20 | "u1", 21 | "u1", 22 | "u1", 23 | "u1", 24 | "u1", 25 | "u1", 26 | ], 27 | "computer": [ 28 | "c1", 29 | "c1", 30 | "c5", 31 | "c1", 32 | "c1", 33 | "c3", 34 | "c1", 35 | "c1", 36 | "c2", 37 | "c3", 38 | "c1", 39 | "c1", 40 | "c4", 41 | "c5", 42 | ], 43 | } 44 | ) 45 | fdf = clx.features.frequency(df, "user", "computer") # Create feature data 46 | actual = clx.analytics.anomaly_detection.dbscan(fdf, min_samples=2, eps=0.5) 47 | expected = cudf.Series([-1, -1], dtype="int32", index=None) 48 | expected.index = cudf.Series(["u1", "u4"]) 49 | assert actual.equals(expected) 50 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # An integration test & dev container which builds and installs CLX from default branch 2 | ARG RAPIDS_VERSION=22.06 3 | ARG CUDA_VERSION=11.5 4 | ARG CUDA_SHORT_VERSION=${CUDA_VERSION} 5 | ARG LINUX_VERSION=ubuntu18.04 6 | ARG PYTHON_VERSION=3.8 7 | FROM rapidsai/rapidsai-dev-nightly:${RAPIDS_VERSION}-cuda${CUDA_VERSION}-devel-${LINUX_VERSION}-py${PYTHON_VERSION} 8 | 9 | # Add everything from the local build context 10 | ADD . 
/rapids/clx/ 11 | RUN chmod -R ugo+w /rapids/clx/ 12 | 13 | RUN source activate rapids && \ 14 | gpuci_mamba_retry install -y -n rapids \ 15 | "cudf_kafka=${RAPIDS_VER}" \ 16 | "custreamz=${RAPIDS_VER}" \ 17 | scikit-learn>=0.21 \ 18 | nodejs>=12 \ 19 | ipywidgets \ 20 | python-confluent-kafka \ 21 | seqeval \ 22 | python-whois \ 23 | seaborn \ 24 | requests \ 25 | matplotlib \ 26 | pytest \ 27 | jupyterlab=3.0 \ 28 | faker && \ 29 | pip install -U torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html && \ 30 | pip install "git+https://github.com/rapidsai/cudatashader.git" && \ 31 | pip install mockito && \ 32 | pip install wget && \ 33 | pip install "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src" 34 | 35 | # clx build/install 36 | RUN source activate rapids && \ 37 | cd /rapids/clx/python && \ 38 | python setup.py install 39 | 40 | WORKDIR /rapids/clx 41 | -------------------------------------------------------------------------------- /ci/checks/changelog.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2018, NVIDIA CORPORATION. 3 | ######################### 4 | # clx CHANGELOG Tester # 5 | ######################### 6 | 7 | # Checkout main for comparison 8 | git checkout --force --quiet main 9 | 10 | # Switch back to tip of PR branch 11 | git checkout --force --quiet current-pr-branch 12 | 13 | # Ignore errors during searching 14 | set +e 15 | 16 | # Get list of modified files between matster and PR branch 17 | CHANGELOG=`git diff --name-only main...current-pr-branch | grep CHANGELOG.md` 18 | # Check if CHANGELOG has PR ID 19 | PRNUM=`cat CHANGELOG.md | grep "$PR_ID"` 20 | RETVAL=0 21 | 22 | # Return status of check result 23 | if [ "$CHANGELOG" != "" -a "$PRNUM" != "" ] ; then 24 | echo -e "\n\n>>>> PASSED: CHANGELOG.md has been updated with current PR information.\n\nPlease ensure the update meets the following criteria.\n" 25 | else 26 | echo -e "\n\n>>>> FAILED: CHANGELOG.md has not been updated!\n\nPlease add a line describing this PR to CHANGELOG.md in the repository root directory. The line should meet the following criteria.\n" 27 | RETVAL=1 28 | fi 29 | 30 | cat << EOF 31 | It should be placed under the section for the appropriate release. 32 | It should be placed under "New Features", "Improvements", or "Bug Fixes" as appropriate. 
33 | It should be formatted as '- PR #<PR number> <description>' 34 | Example format for #491 '- PR #491 Add CI test script to check for updates to CHANGELOG.md in PRs' 35 | EOF 36 | 37 | exit $RETVAL 38 | -------------------------------------------------------------------------------- /examples/streamz/resources/cybert.yaml: -------------------------------------------------------------------------------- 1 | # cudf_engine currently supports only flattened json input data 2 | cudf_engine: false 3 | kafka_conf: 4 | input_topic: cybert_input 5 | output_topic: cybert_output 6 | # consumer topic partitions 7 | n_partitions: 1 8 | producer_conf: 9 | bootstrap.servers: localhost:9092 10 | session.timeout.ms: '10000' 11 | #queue.buffering.max.messages: '250000' 12 | #linger.ms: '100' 13 | #security.protocol: SASL_SSL 14 | #sasl.mechanism: PLAIN 15 | #ssl.ca.location: 16 | #sasl.username: 17 | #sasl.password: 18 | consumer_conf: 19 | bootstrap.servers: localhost:9092 20 | group.id: streamz 21 | session.timeout.ms: '60000' 22 | enable.partition.eof: 'true' 23 | auto.offset.reset: earliest 24 | #security.protocol: SASL_SSL 25 | #sasl.mechanism: PLAIN 26 | #ssl.ca.location: 27 | #sasl.username: 28 | #sasl.password: 29 | elasticsearch_conf: 30 | url: localhost #https://{}:{}@test.nvidia.com:{}/ 31 | port: 9200 32 | # below properties are required if elasticsearch cluster is SSL enabled 33 | #cafile: 34 | #username: 35 | #password: 36 | index: cybert 37 | # other available sinks are "elasticsearch", "filesystem" 38 | sink: kafka 39 | # below properties are used when sink is set to filesystem 40 | col_delimiter: ',' 41 | file_extension: '.csv' 42 | output_dir: '/your/output/newdir/path' -------------------------------------------------------------------------------- /examples/streamz/resources/dga_detection.yaml: -------------------------------------------------------------------------------- 1 | # cudf_engine currently supports only flattened json input data 2 | cudf_engine: true 3 | kafka_conf: 4 | input_topic: dga_detection_input 5 | output_topic: dga_detection_output 6 | # consumer topic partitions 7 | n_partitions: 1 8 | producer_conf: 9 | bootstrap.servers: localhost:9092 10 | session.timeout.ms: '10000' 11 | #queue.buffering.max.messages: '250000' 12 | #linger.ms: '100' 13 | #security.protocol: SASL_SSL 14 | #sasl.mechanism: PLAIN 15 | #ssl.ca.location: 16 | #sasl.username: 17 | #sasl.password: 18 | consumer_conf: 19 | bootstrap.servers: localhost:9092 20 | group.id: streamz 21 | session.timeout.ms: '60000' 22 | enable.partition.eof: 'true' 23 | auto.offset.reset: earliest 24 | #security.protocol: SASL_SSL 25 | #sasl.mechanism: PLAIN 26 | #ssl.ca.location: 27 | #sasl.username: 28 | #sasl.password: 29 | elasticsearch_conf: 30 | url: localhost #https://{}:{}@test.nvidia.com:{}/ 31 | port: 9200 32 | # below properties are required if elasticsearch cluster is SSL enabled 33 | #cafile: 34 | #username: 35 | #password: 36 | index: dga 37 | # other available sinks are "elasticsearch", "filesystem" 38 | sink: kafka 39 | # below properties are used when sink is set to filesystem 40 | col_delimiter: ',' 41 | file_extension: '.csv' 42 | output_dir: '/your/output/newdir/path' -------------------------------------------------------------------------------- /examples/streamz/resources/phishing_detection.yaml: -------------------------------------------------------------------------------- 1 | # cudf_engine currently supports only flattened json input data 2 | cudf_engine: false 3 | kafka_conf: 4 | input_topic: 
phising_detection_input 5 | output_topic: phising_detection_output 6 | # consumer topic partitions 7 | n_partitions: 1 8 | producer_conf: 9 | bootstrap.servers: localhost:9092 10 | session.timeout.ms: '10000' 11 | #queue.buffering.max.messages: '250000' 12 | #linger.ms: '100' 13 | #security.protocol: SASL_SSL 14 | #sasl.mechanism: PLAIN 15 | #ssl.ca.location: 16 | #sasl.username: 17 | #sasl.password: 18 | consumer_conf: 19 | bootstrap.servers: localhost:9092 20 | group.id: streamz 21 | session.timeout.ms: '60000' 22 | enable.partition.eof: 'true' 23 | auto.offset.reset: earliest 24 | #security.protocol: SASL_SSL 25 | #sasl.mechanism: PLAIN 26 | #ssl.ca.location: 27 | #sasl.username: 28 | #sasl.password: 29 | elasticsearch_conf: 30 | url: localhost #https://{}:{}@test.nvidia.com:{}/ 31 | port: 9200 32 | # below properties are required if elasticsearch cluster is SSL enabled 33 | #cafile: 34 | #username: 35 | #password: 36 | index: phising_detection 37 | # other available sinks are "elasticsearch", "filesystem" 38 | sink: kafka 39 | # below properties are used when sink is set to filesystem 40 | col_delimiter: ',' 41 | file_extension: '.csv' 42 | output_dir: '/your/output/newdir/path' -------------------------------------------------------------------------------- /ci/cpu/upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Adopted from https://github.com/tmcdonell/travis-scripts/blob/dfaac280ac2082cd6bcaba3217428347899f2975/update-accelerate-buildbot.sh 4 | 5 | set -e 6 | 7 | # Setup 'gpuci_retry' for upload retries (results in 4 total attempts) 8 | export GPUCI_RETRY_MAX=3 9 | export GPUCI_RETRY_SLEEP=30 10 | 11 | # Set default label options if they are not defined elsewhere 12 | export LABEL_OPTION=${LABEL_OPTION:-"--label main"} 13 | 14 | # Skip uploads unless BUILD_MODE == "branch" 15 | if [ ${BUILD_MODE} != "branch" ]; then 16 | echo "Skipping upload" 17 | return 0 18 | fi 19 | 20 | # Skip uploads if there is no upload key 21 | if [ -z "$MY_UPLOAD_KEY" ]; then 22 | echo "No upload key" 23 | return 0 24 | fi 25 | 26 | ################################################################################ 27 | # SETUP - Get conda file output locations 28 | ################################################################################ 29 | 30 | gpuci_logger "Get conda file output locations" 31 | 32 | export CLX_FILE=`conda build conda/recipes/clx --python=$PYTHON --output` 33 | 34 | ################################################################################ 35 | # UPLOAD - Conda packages 36 | ################################################################################ 37 | 38 | gpuci_logger "Starting conda uploads" 39 | 40 | if [[ "$BUILD_CLX" == "1" && "$UPLOAD_CLX" == "1" ]]; then 41 | test -e ${CLX_FILE} 42 | echo "Upload clx" 43 | echo ${CLX_FILE} 44 | gpuci_retry anaconda -t ${MY_UPLOAD_KEY} upload -u ${CONDA_USERNAME:-rapidsai} ${LABEL_OPTION} --skip-existing ${CLX_FILE} --no-progress 45 | fi 46 | 47 | -------------------------------------------------------------------------------- /python/clx/tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | from clx.utils.data import utils 17 | 18 | test_domains_len = 2 19 | test_input_df = cudf.DataFrame( 20 | {"domain": ["studytour.com.tw", "cnn.com"], "type": [1, 1]} 21 | ) 22 | expected_output_df = cudf.DataFrame( 23 | { 24 | 0: [115, 99], 25 | 1: [116, 110], 26 | 2: [117, 110], 27 | 3: [100, 46], 28 | 4: [121, 99], 29 | 5: [116, 111], 30 | 6: [111, 109], 31 | 7: [117, 0], 32 | 8: [114, 0], 33 | 9: [46, 0], 34 | 10: [99, 0], 35 | 11: [111, 0], 36 | 12: [109, 0], 37 | 13: [46, 0], 38 | 14: [116, 0], 39 | 15: [119, 0], 40 | "len": [16, 7] 41 | }, 42 | dtype="int32" 43 | ) 44 | expected_output_df["type"] = [1, 1] 45 | expected_output_df["domain"] = ["studytour.com.tw", "cnn.com"] 46 | 47 | 48 | def test_str2ascii(): 49 | actual_output_df = utils.str2ascii(test_input_df, 'domain') 50 | assert actual_output_df.equals(expected_output_df) 51 | -------------------------------------------------------------------------------- /siem_integrations/clx_query/bin/clx_query_conf.py: -------------------------------------------------------------------------------- 1 | import splunk.admin as admin 2 | import splunk.entity as en 3 | 4 | """ 5 | Copyright (C) 2005 - 2010 Splunk Inc. All Rights Reserved. 6 | Description: This skeleton python script handles the parameters in the configuration page. 7 | 8 | handleList method: lists configurable parameters in the configuration page 9 | corresponds to handleractions = list in restmap.conf 10 | 11 | handleEdit method: controls the parameters and saves the values 12 | corresponds to handleractions = edit in restmap.conf 13 | 14 | """ 15 | 16 | 17 | class ConfigApp(admin.MConfigHandler): 18 | """ 19 | Set up supported arguments 20 | """ 21 | 22 | def setup(self): 23 | if self.requestedAction == admin.ACTION_EDIT: 24 | for arg in ["clx_hostname", "clx_port", "clx_query_limit"]: 25 | self.supportedArgs.addOptArg(arg) 26 | 27 | """ 28 | Reads configuration from the custom file clx/default/clx_query_setup.conf. 29 | """ 30 | 31 | def handleList(self, confInfo): 32 | confDict = self.readConf("clx_query_setup") 33 | if None != confDict: 34 | for stanza, settings in confDict.items(): 35 | for key, val in settings.items(): 36 | confInfo[stanza].append(key, val) 37 | 38 | def handleEdit(self, confInfo): 39 | name = self.callerArgs.id 40 | args = self.callerArgs 41 | 42 | self.writeConf("clx_query_setup", "setupentity", self.callerArgs.data) 43 | 44 | 45 | # initialize the handler 46 | admin.init(ConfigApp, admin.CONTEXT_NONE) 47 | -------------------------------------------------------------------------------- /python/clx/tests/test_kafka_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | import pytest 17 | from mockito import when, mock, verify 18 | from clx.io.writer.kafka_writer import KafkaWriter 19 | 20 | input_df = cudf.DataFrame( 21 | { 22 | "firstname": ["Emma", "Ava", "Sophia"], 23 | "lastname": ["Olivia", "Isabella", "Charlotte"], 24 | "gender": ["F", "F", "F"], 25 | } 26 | ) 27 | kafka_topic = "publisher_topic_t1" 28 | batch_size = 100 29 | delimiter = "," 30 | producer = mock() 31 | 32 | 33 | @pytest.mark.parametrize("kafka_topic", [kafka_topic]) 34 | @pytest.mark.parametrize("batch_size", [batch_size]) 35 | @pytest.mark.parametrize("delimiter", [delimiter]) 36 | @pytest.mark.parametrize("producer", [producer]) 37 | @pytest.mark.parametrize("input_df", [input_df]) 38 | def test_write_data(kafka_topic, batch_size, delimiter, producer, input_df): 39 | writer = KafkaWriter(kafka_topic, batch_size, delimiter, producer) 40 | when(writer.producer).__len__().thenReturn(1) 41 | writer.write_data(input_df) 42 | verify(writer.producer, times=3).produce(...) 43 | -------------------------------------------------------------------------------- /python/clx/parsers/resources/splunk_notable_regex.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | time: '(^[0-9]+\.?[0-9]*),' 16 | search_name: 'search_name=\"([0-9A-Za-z\s\-\(\)]+)' 17 | orig_time: 'orig_time=\"([0-9]+\.[0-9]+)' 18 | urgency: 'urgency=\"([A-Za-z]+)' 19 | user: 'user=\"([A-Za-z0-9]+)' 20 | owner: 'owner=\"([\w@\.]+)' 21 | security_domain: 'security_domain=\"([A-Za-z]+)' 22 | severity: 'severity=\"([A-Za-z]+)' 23 | src_ip: 'src_ip=\"([\w\.\-]+)' 24 | src_ip2: 'src=\"([\w\.\-]+)' 25 | src_mac: 'smac=([\w\:]+)' 26 | src_port: 'src_port=\"(\d+)' 27 | dest_ip: 'dest_ip=\"([\w\.\-]+)' 28 | dest_ip2: 'dest=\"([\w\.\-]+)' 29 | dest_mac: 'dmac=([\w\:]+)' 30 | dest_port: 'dest_port=\"(\d+)' 31 | dest_priority: 'dest_priority="([A-Za-z]+)' 32 | device_name: 'Device Name:\s([0-9A-Za-z\_\-]+)' 33 | event_name: 'Event Name:\s([A-Za-z\_]+)' 34 | event_type: 'Event Type:\s([A-Za-z]+)' 35 | ip_address: 'IP Address:\s\(([0-9\.]+)' 36 | message_ip: 'message.ip="([\w\.]+)' 37 | message_hostname: 'message.hostname="([\w\.]+)' 38 | message_username: 'message.user_name="([\w\.\@]+)' 39 | message_description: 'message.description="([\w\.\s]+)' -------------------------------------------------------------------------------- /python/clx/tests/test_dga_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | from clx.analytics.dga_dataset import DGADataset 17 | 18 | test_domains_len = 2 19 | test_batchsize = 1 20 | test_input_df = cudf.DataFrame( 21 | {"domain": ["studytour.com.tw", "cnn.com"], "type": [1, 1]} 22 | ) 23 | 24 | expected_output_df = cudf.DataFrame( 25 | { 26 | 0: [115, 99], 27 | 1: [116, 110], 28 | 2: [117, 110], 29 | 3: [100, 46], 30 | 4: [121, 99], 31 | 5: [116, 111], 32 | 6: [111, 109], 33 | 7: [117, 0], 34 | 8: [114, 0], 35 | 9: [46, 0], 36 | 10: [99, 0], 37 | 11: [111, 0], 38 | 12: [109, 0], 39 | 13: [46, 0], 40 | 14: [116, 0], 41 | 15: [119, 0], 42 | "len": [16, 7], 43 | }, 44 | dtype="int32" 45 | ) 46 | expected_output_df["type"] = [1, 1] 47 | expected_output_df["domain"] = ["studytour.com.tw", "cnn.com"] 48 | 49 | 50 | def test_detector_dataset(): 51 | dataset = DGADataset(test_input_df, 100) 52 | assert dataset.length == 2 53 | assert dataset.data.equals(expected_output_df) 54 | -------------------------------------------------------------------------------- /python/clx/analytics/anomaly_detection.py: -------------------------------------------------------------------------------- 1 | import cudf 2 | import cuml 3 | 4 | 5 | def dbscan(feature_dataframe, min_samples=3, eps=0.3): 6 | """ 7 | Pass a feature dataframe to this function to detect anomalies in your feature dataframe. This function uses ``cuML`` DBSCAN to detect anomalies 8 | and outputs associated labels 0,1,-1. 
9 | 10 | Parameters 11 | ---------- 12 | :param feature_dataframe: Feature dataframe to be used for clustering 13 | :type feature_dataframe: cudf.DataFrame 14 | :param min_samples: Minimum samples to use for dbscan 15 | :type min_samples: int 16 | :param eps: Max distance to use for dbscan 17 | :type eps: float 18 | 19 | Examples 20 | -------- 21 | >>> import cudf 22 | >>> import clx.features 23 | >>> import clx.analytics.anomaly_detection 24 | >>> df = cudf.DataFrame( 25 | >>> { 26 | >>> "time": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 27 | >>> "user": ["u1","u1","u1","u1","u1","u1","u1","u1","u1","u1","u5","u4","u2","u3"], 28 | >>> "computer": ["c1","c2","c3","c1","c2","c3","c1","c1","c2","c3","c1","c1","c5","c6"], 29 | >>> } 30 | >>> ) 31 | >>> feature_df = clx.features.frequency(df, entity_id="user", feature_id="computer") 32 | >>> labels = clx.analytics.anomaly_detection.dbscan(feature_df, min_samples=2, eps=0.5) 33 | >>> labels 34 | 0 -1 35 | 1 -1 36 | 2 -1 37 | dtype: int32 38 | """ 39 | dbscan = cuml.cluster.DBSCAN(eps=eps, min_samples=min_samples) 40 | dbscan.fit(feature_dataframe) 41 | # return anomalies only 42 | labels = cudf.Series(dbscan.labels_) 43 | anomalies = labels[labels == -1] 44 | return anomalies 45 | -------------------------------------------------------------------------------- /python/clx/analytics/periodicity_detection.py: -------------------------------------------------------------------------------- 1 | import cupy as cp 2 | 3 | 4 | def to_periodogram(signal): 5 | """ 6 | Returns periodogram of signal for finding frequencies that have high energy. 7 | 8 | :param signal: signal (time domain) 9 | :type signal: cudf.Series 10 | :return: CuPy array representing periodogram 11 | :rtype: cupy.ndarray 12 | """ 13 | 14 | # convert cudf series to cupy array 15 | signal_cp = cp.fromDlpack(signal.to_dlpack()) 16 | 17 | # standardize the signal 18 | signal_cp_std = (signal_cp - cp.mean(signal_cp)) / cp.std(signal_cp) 19 | 20 | # take fourier transform of signal 21 | FFT_data = cp.fft.fft(signal_cp_std) 22 | 23 | # create periodogram 24 | prdg = (1 / len(signal)) * ((cp.absolute(FFT_data)) ** 2) 25 | 26 | return prdg 27 | 28 | 29 | def filter_periodogram(prdg, p_value): 30 | """ 31 | Select important frequencies by filtering periodogram by p-value. Filtered out frequencies are set to zero. 32 | 33 | :param prdg: periodogram to be filtered 34 | :type signal: cudf.Series 35 | :param p_value: p-value to filter by 36 | :type signal: float 37 | :return: CuPy array representing periodogram 38 | :rtype: cupy.ndarray 39 | """ 40 | 41 | filtered_prdg = cp.copy(prdg) 42 | filtered_prdg[filtered_prdg < (cp.mean(filtered_prdg) * (-1) * (cp.log(p_value)))] = 0 43 | 44 | return filtered_prdg 45 | 46 | 47 | def to_time_domain(prdg): 48 | """ 49 | Convert the signal back to time domain. 
50 | 51 | :param prdg: periodogram (frequency domain) 52 | :type prdg: cupy.ndarray 53 | :return: CuPy array representing reconstructed signal 54 | :rtype: cupy.ndarray 55 | """ 56 | 57 | acf = cp.abs(cp.fft.ifft(prdg)) 58 | 59 | return acf 60 | -------------------------------------------------------------------------------- /ci/utils/nbtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MAGIC_OVERRIDE_CODE=" 4 | def my_run_line_magic(*args, **kwargs): 5 | g=globals() 6 | l={} 7 | for a in args: 8 | try: 9 | exec(str(a),g,l) 10 | except Exception as e: 11 | print('WARNING: %s\n While executing this magic function code:\n%s\n continuing...\n' % (e, a)) 12 | else: 13 | g.update(l) 14 | 15 | def my_run_cell_magic(*args, **kwargs): 16 | my_run_line_magic(*args, **kwargs) 17 | 18 | get_ipython().run_line_magic=my_run_line_magic 19 | get_ipython().run_cell_magic=my_run_cell_magic 20 | 21 | " 22 | 23 | NO_COLORS=--colors=NoColor 24 | EXITCODE=0 25 | NBTMPDIR="$WORKSPACE/tmp" 26 | mkdir -p ${NBTMPDIR} 27 | 28 | for nb in $*; do 29 | NBFILENAME=$1 30 | NBNAME=${NBFILENAME%.*} 31 | NBNAME=${NBNAME##*/} 32 | NBTESTSCRIPT=${NBTMPDIR}/${NBNAME}-test.py 33 | shift 34 | 35 | echo -------------------------------------------------------------------------------- 36 | echo STARTING: ${NBNAME} 37 | echo -------------------------------------------------------------------------------- 38 | jupyter nbconvert --to script ${NBFILENAME} --output ${NBTMPDIR}/${NBNAME}-test 39 | echo "${MAGIC_OVERRIDE_CODE}" > ${NBTMPDIR}/tmpfile 40 | cat ${NBTESTSCRIPT} >> ${NBTMPDIR}/tmpfile 41 | mv ${NBTMPDIR}/tmpfile ${NBTESTSCRIPT} 42 | 43 | echo "Running \"ipython ${NO_COLORS} ${NBTESTSCRIPT}\" on $(date)" 44 | echo 45 | time bash -c "ipython ${NO_COLORS} ${NBTESTSCRIPT}; EC=\$?; echo -------------------------------------------------------------------------------- ; echo DONE: ${NBNAME}; exit \$EC" 46 | NBEXITCODE=$? 
47 | echo EXIT CODE: ${NBEXITCODE} 48 | echo 49 | EXITCODE=$((EXITCODE | ${NBEXITCODE})) 50 | done 51 | 52 | exit ${EXITCODE} 53 | -------------------------------------------------------------------------------- /python/clx/tests/test_features.py: -------------------------------------------------------------------------------- 1 | import cudf 2 | import pytest 3 | 4 | import clx.features 5 | 6 | df = cudf.DataFrame( 7 | { 8 | "time": [1, 2, 3, 4, 5, 6, 7], 9 | "user": ["u1", "u2", "u3", "u1", "u1", "u2", "u1"], 10 | "computer": ["c1", "c2", "c3", "c1", "c2", "c3", "c1"], 11 | } 12 | ) 13 | 14 | 15 | def test_binary_features(): 16 | actual = clx.features.binary(df, "user", "computer") 17 | expected = cudf.DataFrame( 18 | {"user": ["u1", "u2", "u3"], "c1": [1, 0, 0], "c2": [1, 1, 0], "c3": [0, 1, 1]} 19 | ) 20 | expected = expected.set_index("user") 21 | expected["c1"] = expected["c1"].astype("int32") 22 | expected["c2"] = expected["c2"].astype("int32") 23 | expected["c3"] = expected["c3"].astype("int32") 24 | expected.columns = cudf.MultiIndex( 25 | names=[None, "computer"], 26 | codes=[[0, 0, 0], [0, 1, 2]], 27 | levels=[["time"], ["c1", "c2", "c3"]], 28 | ) 29 | assert expected.equals(actual) 30 | 31 | 32 | def test_binary_exception(): 33 | with pytest.raises(Exception): 34 | clx.features.binary(df, "user", "a") 35 | 36 | 37 | def test_frequency_features(): 38 | actual = clx.features.frequency(df, "user", "computer") 39 | expected = cudf.DataFrame( 40 | { 41 | "user": ["u1", "u2", "u3"], 42 | "c1": [0.75, 0.00, 0.00], 43 | "c2": [0.25, 0.50, 0.0], 44 | "c3": [0.0, 0.5, 1.0], 45 | } 46 | ) 47 | expected = expected.set_index("user") 48 | expected.columns = cudf.MultiIndex( 49 | names=[None, "computer"], 50 | codes=[[0, 0, 0], [0, 1, 2]], 51 | levels=[["time"], ["c1", "c2", "c3"]], 52 | ) 53 | assert expected.equals(actual) 54 | 55 | 56 | def test_frequency_exception(): 57 | with pytest.raises(Exception): 58 | clx.features.frequency(df, "a", "computer") 59 | -------------------------------------------------------------------------------- /python/clx/eda/analysis.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import json 16 | from abc import ABC, abstractmethod 17 | 18 | 19 | class Analysis(ABC): 20 | def __init__(self, dataframe): 21 | self._analysis = self._generate_analysis(dataframe) 22 | self._charts = self._generate_charts(dataframe) 23 | 24 | @property 25 | def analysis(self): 26 | return self._analysis 27 | 28 | @property 29 | def charts(self): 30 | return self._charts 31 | 32 | @abstractmethod 33 | def _generate_analysis(self, dataframe): 34 | """Abstract function intended to create a dictionary summarizing analysis results of the dataframe""" 35 | pass 36 | 37 | @abstractmethod 38 | def _generate_charts(self, dataframe): 39 | """Abstract function intended to create a list of cuxfilt""" 40 | pass 41 | 42 | def to_json(self): 43 | """Get json version of analysis results""" 44 | return json.dumps(self.analysis, indent=2) 45 | 46 | def save_analysis(self, output_filepath): 47 | """Save analysis to a json file 48 | TODO: Expand to other output types""" 49 | formatted_output = self.to_json() 50 | with open(output_filepath + ".json", "w") as file: 51 | file.write(formatted_output) 52 | -------------------------------------------------------------------------------- /python/clx/tests/test_dataloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import cudf 15 | from clx.utils.data.dataset import Dataset 16 | from clx.utils.data.dataloader import DataLoader 17 | 18 | test_batchsize = 2 19 | test_df = cudf.DataFrame( 20 | { 21 | "domain": [ 22 | "studytour.com.tw", 23 | "cnn.com", 24 | "bakercityherald.com", 25 | "bankmobile.com", 26 | ], 27 | "type": [1, 1, 0, 1], 28 | } 29 | ) 30 | expected_part_df1 = cudf.DataFrame( 31 | { 32 | "domain": [ 33 | "studytour.com.tw", 34 | "cnn.com", 35 | ], 36 | "type": [1, 1], 37 | } 38 | ) 39 | 40 | expected_part_df2 = cudf.DataFrame( 41 | { 42 | "domain": [ 43 | "bakercityherald.com", 44 | "bankmobile.com", 45 | ], 46 | "type": [0, 1], 47 | } 48 | ) 49 | dataset = Dataset(test_df) 50 | dataloader = DataLoader(dataset, batchsize=test_batchsize) 51 | 52 | 53 | def test_get_chunks(): 54 | df_parts = [] 55 | for df_part in dataloader.get_chunks(): 56 | df_parts.append(df_part) 57 | assert len(df_parts) == 2 58 | assert df_parts[0].reset_index(drop=True).equals(expected_part_df1) 59 | assert df_parts[1].reset_index(drop=True).equals(expected_part_df2) 60 | -------------------------------------------------------------------------------- /python/clx/tests/test_whois.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | import datetime 17 | import whois 18 | from clx.osi.whois import WhoIsLookupClient 19 | from mockito import when 20 | 21 | 22 | domains = ["nvidia.com"] 23 | datetime_1 = datetime.datetime(2020, 5, 17) 24 | datetime_2 = datetime.datetime(2020, 5, 18) 25 | client = WhoIsLookupClient() 26 | 27 | response = { 28 | "domain_name": "NVIDIA.COM", 29 | "registrar": "Safenames Ltd", 30 | "emails": [ 31 | "abuse@safenames.net", 32 | "wadmpfvzi5ei@idp.email", 33 | "hostmaster@safenames.net", 34 | ], 35 | "updated_date": [datetime_1, datetime_2], 36 | } 37 | 38 | 39 | @pytest.mark.parametrize("client", [client]) 40 | @pytest.mark.parametrize("domains", [domains]) 41 | def test_whois(client, domains): 42 | expected_output = [{ 43 | "domain_name": "NVIDIA.COM", 44 | "registrar": "Safenames Ltd", 45 | "emails": "abuse@safenames.net,wadmpfvzi5ei@idp.email,hostmaster@safenames.net", 46 | "updated_date": "05-17-2020 00:00:00,05-18-2020 00:00:00", 47 | }] 48 | when(whois).whois(...).thenReturn(response) 49 | actual_output = client.whois(domains) 50 | assert actual_output[0]["domain_name"] == "NVIDIA.COM" 51 | assert len(actual_output) == len(domains) 52 | assert actual_output == expected_output 53 | -------------------------------------------------------------------------------- /python/clx/tests/test_event_parser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import cudf 16 | from clx.parsers.event_parser import EventParser 17 | 18 | 19 | class TestEventParserImpl(EventParser): 20 | def parse(self, dataframe, raw_column): 21 | return None 22 | 23 | 24 | class TestEventParser(object): 25 | def setup(self): 26 | # Create Test Event Parser Implementation 27 | event_name = "eventName" 28 | columns = ["eventTypeId", "username"] 29 | self.event_regex = { 30 | "eventTypeId": r"eventTypeId: ([0-9$]+)", 31 | "username": r"username: ([a-z\.\-0-9$]+)", 32 | } 33 | self.event_parser = TestEventParserImpl(columns, event_name) 34 | 35 | def test_parse_raw_event(self): 36 | test_dataframe = cudf.DataFrame( 37 | { 38 | "Raw": [ 39 | "eventTypeId: 1 \\nusername: foo", 40 | "eventTypeId: 1 \\nusername: bar", 41 | ] 42 | } 43 | ) 44 | parsed_dataframe = self.event_parser.parse_raw_event( 45 | test_dataframe, "Raw", self.event_regex 46 | ) 47 | expected_parsed_dataframe = cudf.DataFrame( 48 | {"eventTypeId": ["1", "1"], "username": ["foo", "bar"]} 49 | ) 50 | 51 | assert parsed_dataframe.equals(expected_parsed_dataframe) 52 | -------------------------------------------------------------------------------- /python/clx/utils/data/dataloader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | class DataLoader(object): 21 | """ 22 | Wrapper class that returns dataframe partitions based on batch size. 23 | """ 24 | 25 | def __init__(self, dataset, batchsize=1000): 26 | """Constructor to create a DataLoader that partitions the input dataset. 27 | 28 | :param dataset: Input dataset. 29 | :type dataset: Dataset 30 | :param batchsize: Maximum number of records per partition. 31 | :type batchsize: int 32 | """ 33 | self.__dataset = dataset 34 | self.__batchsize = batchsize 35 | 36 | @property 37 | def dataset_len(self): 38 | return self.__dataset.length 39 | 40 | @property 41 | def dataset(self): 42 | return self.__dataset 43 | 44 | def get_chunks(self): 45 | """ A generator function that yields each chunk of the original input dataframe based on the batch size. 46 | :return: Partitioned dataframe. 
47 | :rtype: cudf.DataFrame 48 | """ 49 | prev_chunk_offset = 0 50 | while prev_chunk_offset < self.__dataset.length: 51 | curr_chunk_offset = prev_chunk_offset + self.__batchsize 52 | chunk = self.__dataset.data[prev_chunk_offset:curr_chunk_offset:1] 53 | prev_chunk_offset = curr_chunk_offset 54 | yield chunk 55 | -------------------------------------------------------------------------------- /python/clx/analytics/model/tabular_model.py: -------------------------------------------------------------------------------- 1 | # Original code at https://github.com/spro/practical-pytorch 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class TabularModel(nn.Module): 7 | "Basic model for tabular data" 8 | 9 | def __init__(self, emb_szs, n_cont, out_sz, layers, drops, 10 | emb_drop, use_bn, is_reg, is_multi): 11 | super().__init__() 12 | 13 | self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs]) 14 | self.emb_drop = nn.Dropout(emb_drop) 15 | self.bn_cont = nn.BatchNorm1d(n_cont) 16 | n_emb = sum(e.embedding_dim for e in self.embeds) 17 | self.n_emb, self.n_cont = n_emb, n_cont 18 | sizes = [n_emb + n_cont] + layers + [out_sz] 19 | actns = [nn.ReLU(inplace=True)] * (len(sizes) - 2) + [None] 20 | layers = [] 21 | for i, (n_in, n_out, dp, act) in enumerate(zip(sizes[:-1], sizes[1:], [0.] + drops, actns)): 22 | layers += self._bn_drop_lin(n_in, n_out, bn=use_bn and i != 0, p=dp, actn=act) 23 | self.layers = nn.Sequential(*layers) 24 | 25 | def forward(self, x_cat, x_cont): 26 | if self.n_emb != 0: 27 | x = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)] 28 | x = torch.cat(x, 1) 29 | x = self.emb_drop(x) 30 | if self.n_cont != 0: 31 | if self.n_cont == 1: 32 | x_cont = x_cont.unsqueeze(1) 33 | x_cont = self.bn_cont(x_cont) 34 | x = torch.cat([x, x_cont], 1) if self.n_emb != 0 else x_cont 35 | x = self.layers(x) 36 | return x.squeeze() 37 | 38 | def _bn_drop_lin(self, n_in, n_out, bn, p, actn): 39 | "Sequence of batchnorm (if `bn`), dropout (with `p`) and linear (`n_in`,`n_out`) layers followed by `actn`." 40 | layers = [nn.BatchNorm1d(n_in)] if bn else [] 41 | if p != 0: 42 | layers.append(nn.Dropout(p)) 43 | layers.append(nn.Linear(n_in, n_out)) 44 | if actn is not None: 45 | layers.append(actn) 46 | return layers 47 | -------------------------------------------------------------------------------- /python/clx/utils/data/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | import logging 17 | 18 | log = logging.getLogger(__name__) 19 | 20 | 21 | def str2ascii(df, col_name): 22 | """ 23 | This function sorts domain name entries in desc order based on the length of domain and converts domain name to ascii characters. 24 | 25 | :param df: Domains which requires conversion. 26 | :type df: cudf.DataFrame 27 | :param col_name: Name of the column that needs to be transformed. 
:type col_name: str 29 | :return: DataFrame with each character converted to its ASCII code point. 30 | :rtype: cudf.DataFrame 31 | """ 32 | df["len"] = df[col_name].str.len() 33 | df = df.sort_values("len", ascending=False) 34 | split_ser = df[col_name].str.findall("[\w\W\d\D\s\S]") 35 | split_df = split_ser.to_frame() 36 | split_df = cudf.DataFrame(split_df[col_name].to_arrow().to_pylist()) 37 | columns_cnt = len(split_df.columns) 38 | 39 | # Replace null's with ^. 40 | split_df = split_df.fillna("^") 41 | temp_df = cudf.DataFrame() 42 | for col in range(0, columns_cnt): 43 | temp_df[col] = split_df[col].str.code_points() 44 | del split_df 45 | 46 | # Replace ^ ascii value 94 with 0. 47 | temp_df = temp_df.replace(94, 0) 48 | temp_df.index = df.index 49 | temp_df["len"] = df["len"] 50 | if "type" in df.columns: 51 | temp_df["type"] = df["type"] 52 | temp_df[col_name] = df[col_name] 53 | return temp_df 54 | -------------------------------------------------------------------------------- /python/clx/parsers/zeek.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | 17 | type_dict = { 18 | "bool": "bool", 19 | "count": "int64", 20 | "int": "int64", 21 | "double": "float64", 22 | "time": "float64", 23 | "interval": "float64", 24 | "string": "str", 25 | "pattern": "str", 26 | "port": "int64", 27 | "addr": "str", 28 | "subnet": "str", 29 | "enum": "str", 30 | "function": "str", 31 | "event": "str", 32 | "hook": "str", 33 | "file": "str", 34 | "opaque": "str", 35 | "any": "str", 36 | } 37 | 38 | 39 | def parse_log_file(filepath): 40 | """Parse Zeek log file and return cuDF dataframe. Uses header comments to get column names/types and configure parser. 41 | 42 | :param filepath: filepath for Zeek log file 43 | :type filepath: string 44 | :return: Zeek log dataframe 45 | :rtype: cudf.DataFrame 46 | """ 47 | header_gdf = cudf.read_csv(filepath, names=["line"], nrows=8) 48 | lines_gdf = header_gdf["line"].str.split() 49 | 50 | column_names = lines_gdf.to_pandas().iloc[6][1:].tolist() 51 | column_types = lines_gdf.to_pandas().iloc[7][1:].tolist() 52 | column_dtypes = list(map(lambda x: type_dict.get(x, "str"), column_types)) 53 | 54 | log_gdf = cudf.read_csv( 55 | filepath, 56 | delimiter="\t", 57 | dtype=column_dtypes, 58 | names=column_names, 59 | skiprows=8, 60 | skipfooter=1, 61 | ) 62 | return log_gdf 63 | -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/README.md: -------------------------------------------------------------------------------- 1 | # export2kafka 2 | 3 | ## Overview 4 | 5 | This is a Splunk app that installs `export2kafka`, which enables data export from Splunk to a running Kafka instance. 6 | 7 | ## Prerequisites 8 | 9 | 1. 
Install Kafka libraries: 10 | ``` 11 | sudo -i -u splunk bash 12 | source activate root 13 | conda install -c conda-forge python-confluent-kafka 14 | conda remove python-confluent-kafka 15 | conda install -c conda-forge librdkafka=0.11.0 16 | conda install -f -c conda-forge python-confluent-kafka 17 | ``` 18 | 2. Set up `/etc/hosts` for the Kafka brokers 19 | 20 | ## Install 21 | 22 | 1. Git clone this repo into `$SPLUNKHOME/etc/apps` 23 | 2. Copy `splunklib` from [splunk-sdk-python](https://github.com/splunk/splunk-sdk-python) to `$SPLUNKHOME/etc/apps`. Use the tag version that matches your Splunk installation. *Note: Application was tested with Splunk 1.6.x*. 24 | 3. Go to `http://$SPLUNKURL/en-us/debug/refresh` 25 | 4. Click the "Refresh" button to load the app into the Web UI 26 | 27 | ## Usage 28 | ### Config Options 29 | **broker** 30 | Usage - set a Kafka broker to use for bootstrap 31 | Required? - YES 32 | Format - <host>:<port> 33 | Example - broker=10.0.0.0:9092 34 | 35 | **topic** 36 | Usage - set the Kafka topic to publish to 37 | Required? - YES 38 | Format - <topic name> 39 | Example - topic=data_raw 40 | 41 | **batch** 42 | Usage - set the batch size before calling poll on producer 43 | Required? - NO 44 | Format - integer 45 | Default - 2000 records 46 | Example - batch=2000 47 | 48 | **timeout** 49 | Usage - set the timeout of the export in minutes 50 | Required? - NO 51 | Format - integer in minutes 52 | Default - 60 mins 53 | Example - timeout=60 54 | 55 | **pool** 56 | Usage - set the number of producers used, useful when exporting large data sets 57 | Required? - NO 58 | Format - integer 59 | Default - 2 producers 60 | Example - pool=2 61 | 62 | ### Query Example 63 | 64 | ``` 65 | index="my-index" | export2kafka topic=my-topic broker=10.0.0.0:9092 66 | ``` 67 | -------------------------------------------------------------------------------- /ci/docs/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright (c) 2020, NVIDIA CORPORATION. 3 | ################################# 4 | # CLX Docs build script for CI # 5 | ################################# 6 | 7 | if [ -z "$PROJECT_WORKSPACE" ]; then 8 | echo ">>>> ERROR: Could not detect PROJECT_WORKSPACE in environment" 9 | echo ">>>> WARNING: This script contains git commands meant for automated building, do not run locally" 10 | exit 1 11 | fi 12 | 13 | export PATH=/conda/bin:/usr/local/cuda/bin:$PATH 14 | export HOME="$WORKSPACE" 15 | export DOCS_WORKSPACE="$WORKSPACE/docs" 16 | export CUDA_REL=${CUDA_VERSION%.*} 17 | export CUDA_SHORT=${CUDA_REL//./} 18 | export PROJECTS=(clx) 19 | 20 | # Switch to project root; also root of repo checkout 21 | cd "$PROJECT_WORKSPACE" 22 | export GIT_DESCRIBE_TAG=`git describe --tags` 23 | export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` 25 | 26 | gpuci_logger "Check environment" 26 | env 27 | 28 | gpuci_logger "Check GPU usage" 29 | nvidia-smi 30 | 31 | logger "Activate conda env..." 
32 | source activate rapids 33 | conda install --freeze-installed -c rapidsai-nightly -c rapidsai -c nvidia -c pytorch -c conda-forge \ 34 | "pytorch>=1.7" torchvision "transformers=3.5.*" requests yaml python-confluent-kafka python-whois markdown beautifulsoup4 jq 35 | 36 | pip install mockito 37 | pip install "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src" 38 | pip install cupy-cuda${CUDA_SHORT} 39 | 40 | gpuci_logger "Check versions" 41 | python --version 42 | $CC --version 43 | $CXX --version 44 | 45 | gpuci_logger "Show conda info" 46 | conda info 47 | conda config --show-sources 48 | conda list --show-channel-urls 49 | 50 | #clx source build 51 | "$PROJECT_WORKSPACE/build.sh" clx 52 | 53 | #clx Sphinx Build 54 | gpuci_logger "Build clx docs" 55 | cd "$PROJECT_WORKSPACE/docs" 56 | make html 57 | 58 | cd $DOCS_WORKSPACE 59 | 60 | if [ ! -d "api/clx/$BRANCH_VERSION" ]; then 61 | mkdir -p api/clx/$BRANCH_VERSION 62 | fi 63 | 64 | rm -rf api/clx/$BRANCH_VERSION/* 65 | mv "$PROJECT_WORKSPACE/docs/build/html/"* $DOCS_WORKSPACE/api/clx/$BRANCH_VERSION 66 | 67 | 68 | -------------------------------------------------------------------------------- /examples/streamz/python/dga_detection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import time 16 | import dask 17 | from clx_streamz_tools import utils 18 | from clx_streamz_tools import streamz_workflow 19 | 20 | 21 | class DGADetectionWorkflow(streamz_workflow.StreamzWorkflow): 22 | def inference(self, messages_df): 23 | # Messages will be received and run through DGA inferencing 24 | worker = dask.distributed.get_worker() 25 | batch_start_time = int(round(time.time())) 26 | result_size = messages_df.shape[0] 27 | print("Processing batch size: " + str(result_size)) 28 | dd = worker.data["dga_detector"] 29 | preds = dd.predict(messages_df["domain"]) 30 | messages_df["preds"] = preds 31 | return (messages_df, batch_start_time, result_size) 32 | 33 | def worker_init(self): 34 | # Initialization for each dask worker 35 | from clx.analytics.dga_detector import DGADetector 36 | 37 | worker = dask.distributed.get_worker() 38 | dd = DGADetector() 39 | print( 40 | "Initializing Dask worker: " 41 | + str(worker) 42 | + " with dga model. 
Model File: " 43 | + str(self.args.model) 44 | ) 45 | dd.load_model(self.args.model) 46 | # this dict can be used for adding more objects to distributed dask worker 47 | obj_dict = {"dga_detector": dd} 48 | worker = utils.init_dask_workers(worker, self.config, obj_dict) 49 | 50 | 51 | if __name__ == "__main__": 52 | dga_detection = DGADetectionWorkflow() 53 | dga_detection.start() 54 | -------------------------------------------------------------------------------- /python/clx/tests/test_stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import clx.analytics.stats 16 | import cudf 17 | import cupy as cp 18 | 19 | 20 | def test_rzscore(): 21 | sequence = [ 22 | 3, 23 | 4, 24 | 5, 25 | 6, 26 | 1, 27 | 10, 28 | 34, 29 | 2, 30 | 1, 31 | 11, 32 | 45, 33 | 34, 34 | 2, 35 | 9, 36 | 19, 37 | 43, 38 | 24, 39 | 13, 40 | 23, 41 | 10, 42 | 98, 43 | 84, 44 | 10, 45 | ] 46 | series = cudf.Series(sequence) 47 | zscores_df = cudf.DataFrame() 48 | zscores_df["zscore"] = clx.analytics.stats.rzscore(series, 7) 49 | expected_zscores_arr = [ 50 | float(0), 51 | float(0), 52 | float(0), 53 | float(0), 54 | float(0), 55 | float(0), 56 | 2.374423424, 57 | -0.645941275, 58 | -0.683973734, 59 | 0.158832461, 60 | 1.847751909, 61 | 0.880026019, 62 | -0.950835449, 63 | -0.360593742, 64 | 0.111407599, 65 | 1.228914145, 66 | -0.074966331, 67 | -0.570321249, 68 | 0.327849973, 69 | -0.934372308, 70 | 2.296828498, 71 | 1.282966989, 72 | -0.795223674, 73 | ] 74 | expected_zscores_df = cudf.DataFrame() 75 | expected_zscores_df["zscore"] = expected_zscores_arr 76 | 77 | # Check that columns are equal 78 | zscores_df["zscore"] = zscores_df["zscore"].fillna(0) 79 | assert cp.allclose(expected_zscores_df["zscore"], zscores_df["zscore"]) 80 | -------------------------------------------------------------------------------- /ci/release/update-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ######################## 3 | # clx Version Updater # 4 | ######################## 5 | 6 | ## Usage 7 | # bash update-version.sh 8 | 9 | 10 | # Format is YY.MM.PP - no leading 'v' or trailing 'a' 11 | NEXT_FULL_TAG=$1 12 | 13 | # Get current version 14 | CURRENT_TAG=$(git tag --merged HEAD | grep -xE '^v.*' | sort --version-sort | tail -n 1 | tr -d 'v') 15 | CURRENT_MAJOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[1]}') 16 | CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}') 17 | CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}') 18 | CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} 19 | 20 | #Get . 
for next version 21 | NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') 22 | NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') 23 | NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} 24 | 25 | echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" 26 | 27 | # Inplace sed replace; workaround for Linux and Mac 28 | function sed_runner() { 29 | sed -i.bak ''"$1"'' $2 && rm -f ${2}.bak 30 | } 31 | 32 | # Dockerfile update 33 | sed_runner 's/RAPIDS_VERSION=0.*/RAPIDS_VERSION='"${NEXT_SHORT_TAG}"'/g' Dockerfile 34 | # Streamz Dockerfile update 35 | sed_runner 's/RAPIDS_VERSION=0.*/RAPIDS_VERSION='"${NEXT_SHORT_TAG}"'/g' examples/streamz/Dockerfile 36 | 37 | # Sphinx Update 38 | sed_runner 's/version = *.*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/source/conf.py 39 | sed_runner 's/release = *.*.*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/source/conf.py 40 | 41 | # conda environment 42 | for FILE in conda/environments/*.yml; do 43 | sed_runner "s/cugraph=${CURRENT_SHORT_TAG}/cugraph=${NEXT_SHORT_TAG}/g" ${FILE}; 44 | sed_runner "s/cuml=${CURRENT_SHORT_TAG}/cuml=${NEXT_SHORT_TAG}/g" ${FILE}; 45 | sed_runner "s/cuxfilter=${CURRENT_SHORT_TAG}/cuxfilter=${NEXT_SHORT_TAG}/g" ${FILE}; 46 | sed_runner "s/dask-cudf=${CURRENT_SHORT_TAG}/dask-cudf=${NEXT_SHORT_TAG}/g" ${FILE}; 47 | done 48 | 49 | # README.md update 50 | sed_runner "s/rapidsai-clx:${CURRENT_SHORT_TAG}/rapidsai-clx:${NEXT_SHORT_TAG}/g" README.md 51 | sed_runner "s/rapidsai-dev:${CURRENT_SHORT_TAG}/rapidsai-dev:${NEXT_SHORT_TAG}/g" README.md -------------------------------------------------------------------------------- /notebooks/ids_detection/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cuml.metrics import precision_recall_curve, roc_auc_score 3 | from sklearn.metrics import roc_curve 4 | import cupy as cp 5 | import matplotlib.pylab as plt 6 | import pandas as pd 7 | def average_precision_score(y_true, y_score): 8 | """ 9 | Compute average precision score using precision and recall computed from cuml. 
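Concretely, this is the step-function integral of the precision-recall curve, AP = sum_n (R_n - R_{n-1}) * P_n; the leading minus sign in the code below accounts for the recall values coming back in decreasing order.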
10 | """ 11 | precision, recall, _ = precision_recall_curve(y_true, y_score) 12 | # return step function integral 13 | return -cp.sum(cp.diff(recall) * cp.array(precision)[:-1]) 14 | 15 | def metrics(y_true, y_score): 16 | auc = roc_auc_score(y_true=y_true, y_score=y_score) 17 | ap = average_precision_score(y_true, y_score) 18 | return [auc, ap] 19 | 20 | def plot_roc(label, y_scores): 21 | fpr, tpr, _ = roc_curve(y_true=label.values.tolist(), y_score=y_scores.tolist()) 22 | auc = metrics(label, y_scores)[0] 23 | plt.plot(fpr, tpr, label="ROC = " + str(np.round(auc,2))) 24 | plt.plot(np.arange(0,1.1,0.1), np.arange(0,1.1,0.1), 'r-') 25 | plt.ylabel('tpr') 26 | plt.xlabel('fpr') 27 | plt.legend(loc='best') 28 | plt.title('Area under AUC curve') 29 | 30 | def plot_pr(label, y_scores): 31 | ap = metrics(label, y_scores)[1] 32 | precision, recall, _ = precision_recall_curve( label, y_scores) 33 | plt.plot(recall, precision, label='AP = ' + str(np.round(ap,2))) 34 | plt.ylabel('Precision') 35 | plt.xlabel('Recall') 36 | plt.legend(loc='best') 37 | plt.title('Area under PR curve') 38 | 39 | def missing_values_table(df): 40 | mis_val = df.isnull().sum() 41 | mis_val_percent = 100 * df.isnull().sum() / len(df) 42 | mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1) 43 | mis_val_table_ren_columns = mis_val_table.rename( 44 | columns = {0 : 'Missing Values', 1 : '% of Total Values'}) 45 | mis_val_table_ren_columns = mis_val_table_ren_columns[ 46 | mis_val_table_ren_columns.iloc[:,1] != 0].sort_values( 47 | '% of Total Values', ascending=False).round(1) 48 | print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n" 49 | "There are " + str(mis_val_table_ren_columns.shape[0]) + 50 | " columns that have missing values.") 51 | return mis_val_table_ren_columns 52 | -------------------------------------------------------------------------------- /python/clx/io/reader/fs_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | import logging 17 | from clx.io.reader.file_reader import FileReader 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class FileSystemReader(FileReader): 23 | """ 24 | Uses cudf to read from file system based on config object. 
25 | 26 | :param config: dictionary object of config values for **type**, **input_format**, **input_path** (or **output_path**), and cudf reader optional keyword args 27 | """ 28 | def __init__(self, config): 29 | self._config = config 30 | self._has_data = True 31 | 32 | def fetch_data(self): 33 | """ 34 | Fetch data using cudf based on provided config object 35 | """ 36 | df = None 37 | input_format = self.config["input_format"].lower() 38 | filepath = self.config["input_path"] 39 | kwargs = self.config.copy() 40 | del kwargs["type"] 41 | del kwargs["input_format"] 42 | del kwargs["input_path"] 43 | 44 | if "csv" == input_format: 45 | df = cudf.read_csv(filepath, **kwargs) 46 | elif "parquet" == input_format: 47 | df = cudf.read_parquet(filepath, **kwargs) 48 | elif "orc" == input_format: 49 | df = cudf.read_orc(filepath, engine="cudf") 50 | elif "json" == input_format: 51 | df = cudf.read_json(filepath, **kwargs) 52 | else: 53 | raise NotImplementedError("%s is not a supported input_format" % (input_format)) 54 | 55 | self.has_data = False 56 | return df 57 | 58 | def close(self): 59 | """ 60 | Close cudf reader 61 | """ 62 | log.info("Closed fs reader") 63 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/blazingsql_helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dask_cuda import LocalCUDACluster 16 | from dask.distributed import Client 17 | from blazingsql import BlazingContext 18 | import logging 19 | 20 | log = logging.getLogger(__name__) 21 | """ 22 | This class provides functionality to run blazingSQL queires and drop tables. 23 | """ 24 | 25 | 26 | class BlazingSQLHelper: 27 | def __init__(self): 28 | cluster = LocalCUDACluster() 29 | client = Client(cluster) 30 | self._bc = BlazingContext(dask_client = client, network_interface = 'lo') 31 | 32 | """This function runs blazingSQL query. 33 | 34 | :param config: Query related tables configuration. 35 | :type config: dict 36 | :return: Query results. 37 | :rtype: cudf.DataFrame 38 | """ 39 | 40 | def run_query(self, config): 41 | for table in config["tables"]: 42 | table_name = table["table_name"] 43 | file_path = table["input_path"] 44 | kwargs = table.copy() 45 | del kwargs["table_name"] 46 | del kwargs["input_path"] 47 | self._bc.create_table(table_name, file_path, **kwargs) 48 | sql = config["sql"] 49 | log.debug("Executing query: %s" % (sql)) 50 | result = self._bc.sql(sql) 51 | result = result.compute() 52 | return result 53 | 54 | """This function drops blazingSQL tables. 55 | :param table_names: List of table names to drop. 
56 | :type table_names: List 57 | """ 58 | 59 | def drop_table(self, table_names): 60 | for table_name in table_names: 61 | log.debug("Drop table: %s" % (table_name)) 62 | self._bc.drop_table(table_name) -------------------------------------------------------------------------------- /python/clx/analytics/model/rnn_classifier.py: -------------------------------------------------------------------------------- 1 | # Original code at https://github.com/spro/practical-pytorch 2 | import torch 3 | import torch.nn as nn 4 | from torch.nn.utils.rnn import pack_padded_sequence 5 | 6 | DROPOUT = 0.0 7 | 8 | 9 | class RNNClassifier(nn.Module): 10 | def __init__( 11 | self, input_size, hidden_size, output_size, n_layers, bidirectional=True 12 | ): 13 | super(RNNClassifier, self).__init__() 14 | self.input_size = input_size 15 | self.hidden_size = hidden_size 16 | self.output_size = output_size 17 | self.n_layers = n_layers 18 | self.n_directions = int(bidirectional) + 1 19 | self.embedding = nn.Embedding(input_size, hidden_size) 20 | self.gru = nn.GRU( 21 | hidden_size, 22 | hidden_size, 23 | n_layers, 24 | dropout=DROPOUT, 25 | bidirectional=bidirectional, 26 | ) 27 | self.fc = nn.Linear(hidden_size, output_size) 28 | 29 | def forward(self, input, seq_lengths): 30 | # Note: we run this all at once (over the whole input sequence) 31 | # input shape: B x S (input size) 32 | # transpose to make S(sequence) x B (batch) 33 | input = input.t() 34 | batch_size = input.size(1) 35 | 36 | # Make a hidden 37 | hidden = self._init_hidden(batch_size) 38 | 39 | # Embedding S x B -> S x B x I (embedding size) 40 | embedded = self.embedding(input) 41 | 42 | # Pack them up nicely 43 | gru_input = pack_padded_sequence(embedded, seq_lengths.data.cpu().numpy()) 44 | 45 | # To compact weights again call flatten_parameters(). 46 | self.gru.flatten_parameters() 47 | output, hidden = self.gru(gru_input, hidden) 48 | # output = self.dropout(output) 49 | 50 | # Use the last layer output as FC's input 51 | # No need to unpack, since we are going to use hidden 52 | fc_output = self.fc(hidden[-1]) 53 | return fc_output 54 | 55 | def _init_hidden(self, batch_size): 56 | hidden = torch.zeros( 57 | self.n_layers * self.n_directions, batch_size, self.hidden_size 58 | ) 59 | # creating variable 60 | if torch.cuda.is_available(): 61 | return hidden.cuda() 62 | else: 63 | return hidden 64 | -------------------------------------------------------------------------------- /python/clx/io/reader/dask_fs_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import dask_cudf 16 | import logging 17 | from clx.io.reader.file_reader import FileReader 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class DaskFileSystemReader(FileReader): 23 | """ 24 | Uses Dask to read from file system based on config object. 
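    An illustrative config for reading a Parquet dataset (the glob path is a placeholder; remaining keys are forwarded to the matching dask_cudf reader)::

        config = {
            "type": "dask_fs",
            "input_format": "parquet",
            "input_path": "/path/to/data/*.parquet",
        }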
25 | 26 | :param config: dictionary object of config values for **type**, **input_format**, **input_path**, and dask reader optional keyword args 27 | """ 28 | def __init__(self, config): 29 | self._config = config 30 | self._has_data = True 31 | 32 | def fetch_data(self): 33 | """ 34 | Fetch data using dask based on provided config object 35 | """ 36 | df = None 37 | input_format = self.config["input_format"].lower() 38 | filepath = self.config["input_path"] 39 | kwargs = self.config.copy() 40 | del kwargs["type"] 41 | del kwargs["input_format"] 42 | del kwargs["input_path"] 43 | 44 | if "csv" == input_format: 45 | df = dask_cudf.read_csv(filepath, **kwargs) 46 | elif "parquet" == input_format: 47 | df = dask_cudf.read_parquet(filepath, **kwargs) 48 | elif "orc" == input_format: 49 | df = dask_cudf.read_orc(filepath, engine="cudf") 50 | elif "json" == input_format: 51 | df = dask_cudf.read_json(filepath, **kwargs) 52 | else: 53 | raise NotImplementedError("%s is not a supported input_format" % (input_format)) 54 | 55 | self.has_data = False 56 | return df 57 | 58 | def close(self): 59 | """ 60 | Close dask reader 61 | """ 62 | log.info("Closed dask_fs reader") 63 | -------------------------------------------------------------------------------- /ci/gpu/test-notebooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #RAPIDS_DIR=/rapids 4 | NOTEBOOKS_DIR="$WORKSPACE/notebooks" 5 | NBTEST="$WORKSPACE/ci/utils/nbtest.sh" 6 | LIBCUDF_KERNEL_CACHE_PATH="$WORKSPACE/.jitcache" 7 | 8 | cd ${NOTEBOOKS_DIR} 9 | TOPLEVEL_NB_FOLDERS=$(find . -name *.ipynb |cut -d'/' -f2|sort -u) 10 | 11 | # Add notebooks that should be skipped here 12 | # (space-separated list of filenames without paths) 13 | SKIPNBS="FLAIR_DNS_Log_Parsing.ipynb CLX_Workflow_Notebook2.ipynb CLX_Workflow_Notebook3.ipynb Supervised_Asset_Classification.ipynb CLX_Supervised_Asset_Classification.ipynb DGA_Detection.ipynb Predictive_Maintenance_Sequence_Classifier.ipynb IDS_using_LODA.ipynb anomalous_behavior_profiling_supervised.ipynb custream_n_graph.ipynb" 14 | 15 | ## Check env 16 | env 17 | 18 | EXITCODE=0 19 | 20 | # Always run nbtest in all TOPLEVEL_NB_FOLDERS, set EXITCODE to failure 21 | # if any run fails 22 | for folder in ${TOPLEVEL_NB_FOLDERS}; do 23 | echo "========================================" 24 | echo "FOLDER: ${folder}" 25 | echo "========================================" 26 | cd ${NOTEBOOKS_DIR}/${folder} 27 | for nb in $(find . 
-name "*.ipynb"); do 28 | nbBasename=$(basename ${nb}) 29 | # Skip all NBs that use dask (in the code or even in their name) 30 | if ((echo ${nb}|grep -qi dask) || \ 31 | (grep -q dask ${nb})); then 32 | echo "--------------------------------------------------------------------------------" 33 | echo "SKIPPING: ${nb} (suspected Dask usage, not currently automatable)" 34 | echo "--------------------------------------------------------------------------------" 35 | elif (echo " ${SKIPNBS} " | grep -q " ${nbBasename} "); then 36 | echo "--------------------------------------------------------------------------------" 37 | echo "SKIPPING: ${nb} (listed in skip list)" 38 | echo "--------------------------------------------------------------------------------" 39 | else 40 | cd $(dirname ${nb}) 41 | nvidia-smi 42 | ${NBTEST} ${nbBasename} 43 | EXITCODE=$((EXITCODE | $?)) 44 | rm -rf ${LIBCUDF_KERNEL_CACHE_PATH}/* 45 | cd ${NOTEBOOKS_DIR}/${folder} 46 | fi 47 | done 48 | done 49 | 50 | nvidia-smi 51 | 52 | exit ${EXITCODE} 53 | -------------------------------------------------------------------------------- /siem_integrations/clx_query/bin/clx_query.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import logging 16 | import re 17 | import sys, requests, json 18 | from splunklib.searchcommands import ( 19 | dispatch, 20 | GeneratingCommand, 21 | Configuration, 22 | Option, 23 | validators, 24 | ) 25 | import splunklib.client as client 26 | 27 | log = logging.getLogger(__name__) 28 | 29 | REGEX_PATTERN = r"([LIMIT|limit]+.[0-9]+$)" 30 | 31 | @Configuration() 32 | class ClxQuery(GeneratingCommand): 33 | query = Option(require=True) 34 | 35 | def generate(self): 36 | configs = client.Configurations(self.service) 37 | for config in configs: 38 | if config.name == "clx_query_setup": 39 | clx_config = config.iter().next().content 40 | 41 | url = self.construct_url(clx_config) 42 | has_query_limit = re.findall(REGEX_PATTERN, self.query) 43 | 44 | payload = {'query': self.query} 45 | if not has_query_limit and clx_config["clx_query_limit"]: 46 | self.query = "%s LIMIT %s" %(self.query, clx_config["clx_query_limit"]) 47 | payload = {'query': self.query} 48 | response = requests.post(url, data=payload) 49 | 50 | if response.status_code != 200: 51 | yield {"ERROR": response.content} 52 | else: 53 | results = json.loads(json.loads(response.content)) 54 | for result in results: 55 | yield result 56 | 57 | def construct_url(self, config): 58 | url = "http://%s:%s/%s/" % ( 59 | config["clx_hostname"], 60 | config["clx_port"], 61 | 'clxquery' 62 | ) 63 | return url 64 | 65 | 66 | dispatch(ClxQuery, sys.argv, sys.stdin, sys.stdout, __name__) -------------------------------------------------------------------------------- /python/clx/io/factory/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | 17 | from clx.io.factory.kafka_factory import KafkaFactory 18 | from clx.io.factory.fs_factory import FileSystemFactory 19 | from clx.io.factory.dask_fs_factory import DaskFileSystemFactory 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | 24 | class Factory: 25 | 26 | __cls_dict = { 27 | "kafka": "KafkaFactory", 28 | "fs": "FileSystemFactory", 29 | "dask_fs": "DaskFileSystemFactory", 30 | } 31 | 32 | @staticmethod 33 | def cls_dict(): 34 | return Factory.__cls_dict 35 | 36 | class InstanceGenerator(object): 37 | def __init__(self, func): 38 | self.func = func 39 | 40 | def __call__(self, *args, **kwargs): 41 | class_name, config = self.func(*args, **kwargs) 42 | try: 43 | target_cls = globals()[class_name](config) 44 | return target_cls 45 | except KeyError as error: 46 | log.error(error) 47 | log.exception(error) 48 | raise 49 | 50 | @InstanceGenerator 51 | def get_instance(io_comp, config): 52 | io_comp = io_comp.lower() 53 | if io_comp and io_comp in Factory.cls_dict(): 54 | return Factory.cls_dict()[io_comp], config 55 | else: 56 | raise KeyError( 57 | "Dictionary doesn't have { %s } corresponding component class." 
58 | % (io_comp) 59 | ) 60 | 61 | @staticmethod 62 | def get_reader(io_comp, config): 63 | return Factory.get_instance(io_comp, config).get_reader() 64 | 65 | @staticmethod 66 | def get_writer(io_comp, config): 67 | return Factory.get_instance(io_comp, config).get_writer() 68 | -------------------------------------------------------------------------------- /ci/cpu/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2020-2022, NVIDIA CORPORATION. 3 | ################################################################################ 4 | # CLX cpu build 5 | ################################################################################ 6 | set -e 7 | 8 | # Set path and build parallel level 9 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH 10 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} 11 | 12 | # Set home to the job's workspace 13 | export HOME="$WORKSPACE" 14 | 15 | # Switch to project root; also root of repo checkout 16 | cd "$WORKSPACE" 17 | 18 | # If nightly build, append current YYMMDD to version 19 | if [[ "$BUILD_MODE" = "branch" && "$SOURCE_BRANCH" = branch-* ]] ; then 20 | export VERSION_SUFFIX=`date +%y%m%d` 21 | fi 22 | 23 | # Setup 'gpuci_conda_retry' for build retries (results in 2 total attempts) 24 | export GPUCI_CONDA_RETRY_MAX=1 25 | export GPUCI_CONDA_RETRY_SLEEP=30 26 | 27 | ################################################################################ 28 | # SETUP - Check environment 29 | ################################################################################ 30 | 31 | gpuci_logger "Get env" 32 | env 33 | 34 | gpuci_logger "Activate conda env" 35 | . /opt/conda/etc/profile.d/conda.sh 36 | conda activate rapids 37 | 38 | # Remove rapidsai-nightly channel if we are building main branch 39 | if [ "$SOURCE_BRANCH" = "main" ]; then 40 | conda config --system --remove channels rapidsai-nightly 41 | fi 42 | 43 | gpuci_logger "Check versions" 44 | python --version 45 | $CC --version 46 | $CXX --version 47 | 48 | gpuci_logger "Check conda environment" 49 | conda info 50 | conda config --show-sources 51 | conda list --show-channel-urls 52 | 53 | # FIX Added to deal with Anancoda SSL verification issues during conda builds 54 | conda config --set ssl_verify False 55 | 56 | # FIXME: Remove 57 | gpuci_mamba_retry install -c conda-forge boa 58 | 59 | ############################################################################### 60 | # BUILD - Conda package build 61 | ################################################################################ 62 | 63 | gpuci_logger "Build conda pkg for clx" 64 | gpuci_conda_retry mambabuild conda/recipes/clx 65 | 66 | ################################################################################ 67 | # UPLOAD - Conda package 68 | ################################################################################ 69 | 70 | gpuci_logger "Upload packages" 71 | source ci/cpu/upload.sh 72 | -------------------------------------------------------------------------------- /python/clx/tests/test_loda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import cupy 15 | from clx.analytics.loda import Loda 16 | from os import path 17 | 18 | 19 | def test_fit(): 20 | ld = Loda(n_random_cuts=10, n_bins=None) 21 | x = cupy.random.randint(0, 100, size=(200, 10)) 22 | ld.fit(x) 23 | assert ld._histograms is not None 24 | assert isinstance( 25 | ld._histograms, 26 | cupy.ndarray 27 | ) 28 | assert cupy.all(ld._histograms > 0) 29 | 30 | 31 | def test_score(): 32 | ld = Loda(n_random_cuts=10, n_bins=None) 33 | x = cupy.random.randint(0, 100, size=(200, 10)) 34 | ld.fit(x) 35 | scores = ld.score(x) 36 | assert scores is not None 37 | assert isinstance( 38 | scores, 39 | cupy.ndarray 40 | ) 41 | assert cupy.all(scores > 0) 42 | 43 | 44 | def test_explain(): 45 | ld = Loda(n_random_cuts=10, n_bins=None) 46 | x = cupy.random.randint(0, 100, size=(200, 10)) 47 | ld.fit(x) 48 | explanation = ld.explain(x[0]) 49 | assert explanation is not None 50 | assert isinstance( 51 | explanation, 52 | cupy.ndarray 53 | ) 54 | 55 | 56 | def test_save_model(tmpdir): 57 | ld = Loda(n_random_cuts=10, n_bins=None) 58 | x = cupy.random.randint(0, 100, size=(200, 10)) 59 | ld.fit(x) 60 | ipath = path.join(tmpdir, "clx_loda") 61 | opath = path.join(tmpdir, "clx_loda.npz") 62 | ld.save_model(ipath) 63 | assert path.exists(opath) 64 | 65 | 66 | def test_load_model(tmpdir): 67 | ld = Loda(n_random_cuts=10, n_bins=None) 68 | x = cupy.random.randint(0, 100, size=(200, 10)) 69 | ld.fit(x) 70 | ipath = path.join(tmpdir, "clx_loda") 71 | opath = path.join(tmpdir, "clx_loda.npz") 72 | ld.save_model(ipath) 73 | assert path.exists(opath) 74 | 75 | # load model 76 | ld = Loda.load_model(opath) 77 | assert isinstance(ld, Loda) 78 | -------------------------------------------------------------------------------- /examples/streamz/scripts/entrypoint.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #!/bin/bash 16 | set +e 17 | 18 | #***************************** 19 | # This function print logging. 
20 | #***************************** 21 | log(){ 22 | if [[ $# = 2 ]]; then 23 | echo "$(date) [$1] : $2" 24 | fi 25 | } 26 | 27 | source activate rapids 28 | 29 | # kafka broker 30 | BROKER="localhost:9092" 31 | 32 | #********************************** 33 | # Configure Kafka 34 | #********************************** 35 | sed -i "/listeners=PLAINTEXT:\/\//c\listeners=PLAINTEXT:\/\/$BROKER" $KAFKA_HOME/config/server.properties 36 | sed -i "/advertised.listeners=PLAINTEXT:\/\//c\advertised.listeners=PLAINTEXT:\/\/$BROKER" $KAFKA_HOME/config/server.properties 37 | log "INFO" "Kafka configuration updated" 38 | 39 | #********************************** 40 | # Run Kafka and Zookeeper 41 | #********************************** 42 | $KAFKA_HOME/bin/zookeeper-server-start.sh -daemon $KAFKA_HOME/config/zookeeper.properties 43 | sleep 3 44 | $KAFKA_HOME/bin/kafka-server-start.sh -daemon $KAFKA_HOME/config/server.properties 45 | sleep 3 46 | 47 | log "INFO" "Kafka and zookeeper running" 48 | log "INFO" "Kafka broker is running on $BROKER" 49 | log "INFO" "Zookeeper is running on localhost:2181" 50 | 51 | #********************************** 52 | # Create topics and publish data 53 | #********************************** 54 | log "INFO" "Loading cybert input data to 'cybert_input' topic" 55 | . $CLX_STREAMZ_HOME/scripts/kafka_topic_setup.sh \ 56 | -i cybert_input \ 57 | -o cybert_output \ 58 | -d $CLX_STREAMZ_HOME/data/apache_raw_sample_1k.txt 59 | 60 | log "INFO" "Loading dga detection input data to 'dga_detection_input' topic" 61 | . $CLX_STREAMZ_HOME/scripts/kafka_topic_setup.sh \ 62 | -i dga_detection_input \ 63 | -o dga_detection_output \ 64 | -d $CLX_STREAMZ_HOME/data/dga_detection_input.jsonlines 65 | 66 | exec "$@"; 67 | -------------------------------------------------------------------------------- /python/clx/analytics/stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
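# The rolling z-score computed by rzscore() below is, for a window of size w
# ending at position t:
#
#     z_t = (x_t - mean(x[t-w+1 : t+1])) / std(x[t-w+1 : t+1])
#
# where std uses ddof=0 (population standard deviation); the first w-1 entries
# have no complete window and are returned as null.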
14 | 15 | import logging 16 | import math 17 | 18 | log = logging.getLogger(__name__) 19 | 20 | 21 | def rzscore(series, window): 22 | """ 23 | Calculates rolling z-score 24 | 25 | Parameters 26 | ---------- 27 | series : cudf.Series 28 | Series for which to calculate rolling z-score 29 | window : int 30 | Window size 31 | 32 | Returns 33 | ------- 34 | cudf.Series 35 | Series with rolling z-score values 36 | 37 | Examples 38 | -------- 39 | >>> import clx.analytics.stats 40 | >>> import cudf 41 | >>> sequence = [3,4,5,6,1,10,34,2,1,11,45,34,2,9,19,43,24,13,23,10,98,84,10] 42 | >>> series = cudf.Series(sequence) 43 | >>> zscores_df = cudf.DataFrame() 44 | >>> zscores_df['zscore'] = clx.analytics.stats.rzscore(series, 7) 45 | >>> zscores_df 46 | zscore 47 | 0 null 48 | 1 null 49 | 2 null 50 | 3 null 51 | 4 null 52 | 5 null 53 | 6 2.374423424 54 | 7 -0.645941275 55 | 8 -0.683973734 56 | 9 0.158832461 57 | 10 1.847751909 58 | 11 0.880026019 59 | 12 -0.950835449 60 | 13 -0.360593742 61 | 14 0.111407599 62 | 15 1.228914145 63 | 16 -0.074966331 64 | 17 -0.570321249 65 | 18 0.327849973 66 | 19 -0.934372308 67 | 20 2.296828498 68 | 21 1.282966989 69 | 22 -0.795223674 70 | """ 71 | rolling = series.rolling(window=window) 72 | mean = rolling.mean() 73 | std = rolling.apply(__std_func) 74 | 75 | zscore = (series - mean) / std 76 | return zscore 77 | 78 | 79 | def __std_func(A): 80 | """ 81 | Current implementation assumes ddof = 0 82 | """ 83 | sum_of_elem = 0 84 | sum_of_square_elem = 0 85 | 86 | for a in A: 87 | sum_of_elem += a 88 | sum_of_square_elem += a * a 89 | 90 | s = (sum_of_square_elem - ((sum_of_elem * sum_of_elem) / len(A))) / len(A) 91 | return math.sqrt(s) 92 | -------------------------------------------------------------------------------- /examples/streamz/python/phishing_detection.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
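# Workflow sketch: each batch of raw Kafka messages is decoded into a cudf
# "stream" column, scored with the sequence classifier pre-loaded on the dask
# worker in worker_init(), and returned as (predictions/probabilities frame,
# batch start time, batch size) for the base StreamzWorkflow to sink and
# benchmark (that handling lives in clx_streamz_tools and is not shown here).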
14 | 15 | import time 16 | import dask 17 | import cudf 18 | from clx_streamz_tools import utils 19 | from clx_streamz_tools import streamz_workflow 20 | 21 | 22 | class PhishingDetectionWorkflow(streamz_workflow.StreamzWorkflow): 23 | def inference(self, messages): 24 | # Messages will be received and run through sequence classifier inferencing 25 | worker = dask.distributed.get_worker() 26 | batch_start_time = int(round(time.time())) 27 | df = cudf.DataFrame() 28 | if type(messages) == str: 29 | df["stream"] = [messages.decode("utf-8")] 30 | elif type(messages) == list and len(messages) > 0: 31 | df["stream"] = [msg.decode("utf-8") for msg in messages] 32 | else: 33 | print("ERROR: Unknown type encountered in inference") 34 | 35 | result_size = df.shape[0] 36 | print("Processing batch size: " + str(result_size)) 37 | pred, prob = worker.data["seq_classifier"].predict(df["stream"]) 38 | results_gdf = cudf.DataFrame({"pred": pred, "prob": prob}) 39 | return (results_gdf, batch_start_time, result_size) 40 | 41 | def worker_init(self): 42 | # Initialization for each dask worker 43 | from clx.analytics.sequence_classifier import SequenceClassifier 44 | 45 | worker = dask.distributed.get_worker() 46 | seq_classifier = SequenceClassifier() 47 | print( 48 | "Initializing Dask worker: " 49 | + str(worker) 50 | + " with sequence classifier model. Model File: " 51 | + str(self.args.model) 52 | ) 53 | seq_classifier.init_model(self.args.model) 54 | # this dict can be used for adding more objects to distributed dask worker 55 | obj_dict = {"seq_classifier": seq_classifier} 56 | worker = utils.init_dask_workers(worker, self.config, obj_dict) 57 | 58 | 59 | if __name__ == "__main__": 60 | phishing_detection = PhishingDetectionWorkflow() 61 | phishing_detection.start() 62 | -------------------------------------------------------------------------------- /examples/streamz/python/cybert.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
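# Workflow sketch: raw log lines from Kafka are decoded into a cudf "stream"
# column and parsed with the cyBERT model pre-loaded on the dask worker; the
# per-field confidence scores are suffixed with "_confidence" and concatenated
# to the parsed output before (parsed_df, batch_start_time, result_size) is
# handed back to the base StreamzWorkflow.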
14 | 15 | import time 16 | import dask 17 | import cudf 18 | import pandas as pd 19 | from clx_streamz_tools import utils 20 | from clx_streamz_tools import streamz_workflow 21 | 22 | 23 | class CybertWorkflow(streamz_workflow.StreamzWorkflow): 24 | def inference(self, messages): 25 | # Messages will be received and run through cyBERT inferencing 26 | worker = dask.distributed.get_worker() 27 | batch_start_time = int(round(time.time())) 28 | df = cudf.DataFrame() 29 | if type(messages) == str: 30 | df["stream"] = [messages.decode("utf-8")] 31 | elif type(messages) == list and len(messages) > 0: 32 | df["stream"] = [msg.decode("utf-8") for msg in messages] 33 | else: 34 | print("ERROR: Unknown type encountered in inference") 35 | 36 | result_size = df.shape[0] 37 | print("Processing batch size: " + str(result_size)) 38 | parsed_df, confidence_df = worker.data["cybert"].inference(df["stream"]) 39 | confidence_df = confidence_df.add_suffix("_confidence") 40 | parsed_df = pd.concat([parsed_df, confidence_df], axis=1) 41 | return (parsed_df, batch_start_time, result_size) 42 | 43 | def worker_init(self): 44 | # Initialization for each dask worker 45 | from clx.analytics.cybert import Cybert 46 | 47 | worker = dask.distributed.get_worker() 48 | cy = Cybert() 49 | print( 50 | "Initializing Dask worker: " 51 | + str(worker) 52 | + " with cybert model. Model File: " 53 | + str(self.args.model) 54 | + " Label Map: " 55 | + str(self.args.label_map) 56 | ) 57 | cy.load_model(self.args.model, self.args.label_map) 58 | # this dict can be used for adding more objects to distributed dask worker 59 | obj_dict = {"cybert": cy} 60 | worker = utils.init_dask_workers(worker, self.config, obj_dict) 61 | 62 | 63 | if __name__ == "__main__": 64 | cybert = CybertWorkflow() 65 | cybert.start() 66 | -------------------------------------------------------------------------------- /siem_integrations/splunk2kafka/export2kafka/bin/export2kafka.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import json 4 | #import pprint 5 | from splunklib.searchcommands import dispatch, StreamingCommand, Configuration, Option 6 | from confluent_kafka import Producer 7 | import confluent_kafka 8 | import time 9 | 10 | def eprint(*args, **kwargs): 11 | print(*args, file=sys.stderr, **kwargs) 12 | 13 | @Configuration(local=True) 14 | class FileSinkCommand(StreamingCommand): 15 | broker = Option(require=True) 16 | topic = Option(require=True) 17 | batch = Option(require=False, default=2000) 18 | timeout = Option(require=False, default=60) 19 | pool = Option(require=False, default=2) 20 | start_time = int(time.time()) 21 | 22 | def create_producers(self, pool, broker): 23 | producers = [] 24 | for i in range(pool): 25 | producers.append(Producer({'bootstrap.servers': broker, 'session.timeout.ms': 10000})) 26 | eprint("exprot2kafka - producer"+str(i)+" created: "+broker) 27 | return producers 28 | 29 | def stream(self, records): 30 | topic = str(self.topic) 31 | broker = str(self.broker) 32 | batch = int(self.batch) 33 | timeout = int(self.timeout) 34 | pool = int(self.pool) 35 | eprint("export2kafka - starting... 
broker("+broker+") topic("+topic+") batch(" \ 36 | +str(batch)+") timeout("+str(timeout)+" mins) pool("+str(pool)+")") 37 | eprint("export2kafka - stream starting") 38 | producers = self.create_producers(pool, broker) 39 | cnt = 0 40 | 41 | for record in records: 42 | trimmed = {k: v for k, v in record.iteritems()} 43 | #eprint(json.dumps(trimmed)) 44 | producers[cnt % pool].produce(topic, json.dumps(trimmed)) 45 | cnt += 1 46 | 47 | if cnt % batch == 0: 48 | # batch level reached poll to get producer to move messages out 49 | eprint("export2kafka - batch reached, calling poll... processed records: "+str(cnt)) 50 | for p in producers: 51 | p.poll(0) 52 | 53 | if cnt % 10 == 0 and int(time.time()) > (60 * timeout) + self.start_time: 54 | # quit after timeout has been reached, only check every 10 records 55 | eprint("export2kafka - timeout reached, stopping search...") 56 | break 57 | 58 | # return record for display in Splunk 59 | yield record 60 | 61 | eprint("export2kafka - all records processed for stream... processed records: "+str(cnt)) 62 | eprint("export2kafka - calling flush...") 63 | for p in producers: 64 | p.flush() 65 | eprint("export2kafka - flush finished...") 66 | eprint("export2kafka - stream finished") 67 | 68 | if __name__ == "__main__": 69 | dispatch(FileSinkCommand, sys.argv, sys.stdin, sys.stdout, __name__) 70 | -------------------------------------------------------------------------------- /python/clx/io/writer/fs_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | import logging 17 | import os 18 | 19 | from clx.io.writer.file_writer import FileWriter 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | 24 | class FileSystemWriter(FileWriter): 25 | """ 26 | Uses cudf to write to file system based on config object. 27 | 28 | :param config: dictionary object of config values for **type**, **output_format**, **output_path** (or **output_path**), and cudf writer optional keyword args 29 | """ 30 | 31 | def __init__(self, config): 32 | self._config = config 33 | 34 | def write_data(self, df): 35 | """ 36 | Write data to file system using cudf based on provided config object 37 | """ 38 | output_format = self.config["output_format"].lower() 39 | filepath = self.config["output_path"] 40 | kwargs = self.config.copy() 41 | del kwargs["type"] 42 | del kwargs["output_format"] 43 | del kwargs["output_path"] 44 | 45 | dir = os.path.dirname(filepath) 46 | if not os.path.isdir(dir): 47 | log.info("output directory { %s } not exist" % (dir)) 48 | log.info("creating output directory { %s }..." % (dir)) 49 | os.makedirs(dir) 50 | log.info("created output directory { %s }..." 
% (dir)) 51 | if os.path.exists(filepath): 52 | raise IOError("output path { %s } already exist" % (filepath)) 53 | 54 | log.info("writing data to location {%s}" % (filepath)) 55 | 56 | if "csv" == output_format: 57 | df.to_csv(filepath, **kwargs) 58 | elif "parquet" == output_format: 59 | cudf.io.parquet.to_parquet(df, filepath, **kwargs) 60 | elif "orc" == output_format: 61 | cudf.io.orc.to_orc(df, filepath, **kwargs) 62 | elif "json" == output_format: 63 | cudf.io.json.to_json(df, filepath, **kwargs) 64 | else: 65 | raise NotImplementedError("%s is not a supported output_format" % (output_format)) 66 | 67 | def close(self): 68 | """ 69 | Close cudf writer 70 | """ 71 | log.info("Closed writer") 72 | -------------------------------------------------------------------------------- /ci/local/README.md: -------------------------------------------------------------------------------- 1 | ## Purpose 2 | 3 | This script is designed for developer and contributor use. This tool mimics the actions of gpuCI on your local machine. This allows you to test and even debug your code inside a gpuCI base container before pushing your code as a GitHub commit. 4 | The script can be helpful in locally triaging and debugging RAPIDS continuous integration failures. 5 | 6 | ## Requirements 7 | 8 | ``` 9 | nvidia-docker 10 | ``` 11 | 12 | ## Usage 13 | 14 | ``` 15 | bash build.sh [-h] [-H] [-s] [-r ] [-i ] 16 | Build and test your local repository using a base gpuCI Docker image 17 | 18 | where: 19 | -H Show this help text 20 | -r Path to repository (defaults to working directory) 21 | -i Use Docker image (default is gpuci/rapidsai-base:cuda10.0-ubuntu16.04-gcc5-py3.6) 22 | -s Skip building and testing and start an interactive shell in a container of the Docker image 23 | ``` 24 | 25 | Example Usage: 26 | `bash build.sh -r ~/rapids/clx -i gpuci/rapidsai-base:cuda10.1-ubuntu16.04-gcc5-py3.6` 27 | 28 | For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai-base/tags) page. 29 | 30 | Style Check: 31 | ```bash 32 | $ bash ci/local/build.sh -r ~/rapids/clx -s 33 | $ . /opt/conda/etc/profile.d/conda.sh 34 | $ conda activate rapids 35 | $ cd rapids 36 | $ flake8 python 37 | ``` 38 | 39 | ## Information 40 | 41 | There are some caveats to be aware of when using this script, especially if you plan on developing from within the container itself. 42 | 43 | 44 | ### Docker Image Build Repository 45 | 46 | The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/clx/build_rapidsai-base_cuda10.1-ubuntu16.04-gcc5-py3.6/`. Feel free to remove this directory after the script is finished. 47 | 48 | *Note*: The script *will not* override your local build repository. Your local environment stays in tact. 49 | 50 | 51 | ### Where The User is Dumped 52 | 53 | The script will build your repository and run all tests. If any tests fail, it dumps the user into the docker container itself to allow you to debug from within the container. If all the tests pass as expected the container exits and is automatically removed. Remember to exit the container if tests fail and you do not wish to debug within the container itself. 54 | 55 | 56 | ### Container File Structure 57 | 58 | Your repository will be located in the `/rapids/` folder of the container. This folder is volume mounted from the local machine. 
Any changes to the code in this repository are replicated onto the local machine. The `cpp/build` and `python/build` directories within your repository is on a separate mount to avoid conflicting with your local build artifacts. 59 | -------------------------------------------------------------------------------- /python/clx/tests/test_netflow_workflow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | from clx.workflow.netflow_workflow import NetflowWorkflow 17 | 18 | 19 | def test_netflow_workflow(): 20 | """Tests the netflow dataframe enrichment""" 21 | netflow_workflow = NetflowWorkflow("netflow-workflow") 22 | input_df = cudf.DataFrame( 23 | { 24 | "ts time": ["12345678900.12345"], 25 | "uid string": ["123ABC"], 26 | "id.orig_h": ["123.456.789"], 27 | "id.orig_p": ["1000"], 28 | "id.resp_h": ["987.654.321"], 29 | "id.resp_p": ["80"], 30 | "proto": ["tcp"], 31 | "service": ["-"], 32 | "duration": ["2.015"], 33 | "orig_bytes": ["0"], 34 | "resp_bytes": ["0"], 35 | "conn_state": ["SH"], 36 | "local_orig": ["-"], 37 | "local_resp": ["-"], 38 | "missed_bytes": ["0"], 39 | "history": ["F"], 40 | "orig_pkts count": ["2"], 41 | "orig_ip_bytes": ["80"], 42 | "resp_pkts": ["0"], 43 | "resp_ip_bytes": ["0"], 44 | "tunnel_parents": ["-"], 45 | } 46 | ) 47 | actual_df = netflow_workflow.workflow(input_df) 48 | expected_df = cudf.DataFrame( 49 | { 50 | "ts time": ["12345678900.12345"], 51 | "uid string": ["123ABC"], 52 | "id.orig_h": ["123.456.789"], 53 | "id.orig_p": ["1000"], 54 | "id.resp_h": ["987.654.321"], 55 | "id.resp_p": ["80"], 56 | "proto": ["tcp"], 57 | "service": ["-"], 58 | "duration": ["2.015"], 59 | "orig_bytes": ["0"], 60 | "resp_bytes": ["0"], 61 | "conn_state": ["SH"], 62 | "local_orig": ["-"], 63 | "local_resp": ["-"], 64 | "missed_bytes": ["0"], 65 | "history": ["F"], 66 | "orig_pkts count": ["2"], 67 | "orig_ip_bytes": ["80"], 68 | "resp_pkts": ["0"], 69 | "resp_ip_bytes": ["0"], 70 | "tunnel_parents": ["-"], 71 | "netflow_enriched": ["netflow_enriched"], 72 | } 73 | ) 74 | 75 | assert actual_df.equals(expected_df) 76 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clxquery/views.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import logging 4 | 5 | from clxquery import utils 6 | from clxquery.blazingsql_helper import BlazingSQLHelper 7 | from django.http import HttpResponse, JsonResponse 8 | from rest_framework.generics import CreateAPIView 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | class ExecuteClxQuery(CreateAPIView): 13 | 14 | file_path = os.environ.get("BLZ_READER_CONF") 15 | # Load tables configuration 16 | config = utils.load_yaml(file_path) 17 | configured_tables = set([table["table_name"] for table in config["tables"]]) 18 | 19 | 
regex_pattern = r"main.([\w]+)" 20 | blz_helper = BlazingSQLHelper() 21 | 22 | def post(self, request, *args, **kwargs): 23 | query = str(request.data['query']) 24 | # Check for the list of tables used in the query to prevent loading other tables into gpu memory 25 | query_tables = set(re.findall(self.regex_pattern, query)) 26 | # Verify list of tables used in the query to make sure they are included in the configuration file 27 | 28 | if query_tables.issubset(self.configured_tables): 29 | try: 30 | query_config = {} 31 | query_config["tables"] = [] 32 | for table in self.config["tables"]: 33 | if table["table_name"] in query_tables: 34 | query_config["tables"].append(table) 35 | query_config["sql"] = query 36 | # Run query and get the results 37 | df = self.blz_helper.run_query(query_config) 38 | # Drop tables to free up memory 39 | self.blz_helper.drop_table(query_tables) 40 | # Convert cudf to pandas dataframe 41 | df = df.to_pandas() 42 | # Convert results to json format. 43 | results = df.to_json(orient="records") 44 | response = JsonResponse(results, safe=False) 45 | except Exception as e: 46 | stacktrace = str(e) 47 | log.error("Error executing query: %s" % (stacktrace)) 48 | response = JsonResponse( 49 | {"status": "false", "message": stacktrace}, status=500, safe=False 50 | ) 51 | else: 52 | message = ( 53 | "One or more tables used in the query are not available in the server configuration. Please select from this list %s or add new tables to your clx-blazingsql configuration." 54 | % (configured_tables) 55 | ) 56 | response = JsonResponse( 57 | {"status": "false", "message": message}, status=404, safe=False 58 | ) 59 | return response -------------------------------------------------------------------------------- /python/clx/tests/test_dask_fs_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import cudf 16 | import pytest 17 | from clx.io.reader.dask_fs_reader import DaskFileSystemReader 18 | 19 | expected_df = cudf.DataFrame( 20 | { 21 | "firstname": ["Emma", "Ava", "Sophia"], 22 | "lastname": ["Olivia", "Isabella", "Charlotte"], 23 | "gender": ["F", "F", "F"], 24 | } 25 | ) 26 | 27 | 28 | @pytest.mark.parametrize("expected_df", [expected_df]) 29 | def test_fetch_data_csv(tmpdir, expected_df): 30 | fname = tmpdir.mkdir("tmp_test_fs_reader").join("person.csv") 31 | expected_df.to_csv(fname, index=False) 32 | config = { 33 | "type": "dask_fs", 34 | "input_path": fname, 35 | "names": ["firstname", "lastname", "gender"], 36 | "delimiter": ",", 37 | "usecols": ["firstname", "lastname", "gender"], 38 | "dtype": ["str", "str", "str"], 39 | "header": 0, 40 | "input_format": "csv", 41 | } 42 | reader = DaskFileSystemReader(config) 43 | fetched_df = reader.fetch_data().compute() 44 | 45 | assert fetched_df.equals(expected_df) 46 | 47 | 48 | @pytest.mark.parametrize("expected_df", [expected_df]) 49 | def test_fetch_data_parquet(tmpdir, expected_df): 50 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.parquet")) 51 | cudf.io.parquet.to_parquet(expected_df, fname) 52 | config = { 53 | "type": "dask_fs", 54 | "input_path": fname, 55 | "columns": ["firstname", "lastname", "gender"], 56 | "input_format": "parquet", 57 | "gather_statistics": False, 58 | "split_row_groups": False 59 | } 60 | 61 | reader = DaskFileSystemReader(config) 62 | fetched_df = reader.fetch_data().compute() 63 | 64 | assert fetched_df.equals(expected_df) 65 | 66 | 67 | @pytest.mark.parametrize("expected_df", [expected_df]) 68 | def test_fetch_data_orc(tmpdir, expected_df): 69 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.orc")) 70 | cudf.io.orc.to_orc(expected_df, fname) 71 | config = { 72 | "type": "dask_fs", 73 | "input_path": fname, 74 | "input_format": "orc" 75 | } 76 | 77 | reader = DaskFileSystemReader(config) 78 | fetched_df = reader.fetch_data().compute() 79 | 80 | assert fetched_df.equals(expected_df) 81 | -------------------------------------------------------------------------------- /python/clx/tests/test_eda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
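# These tests build a small mixed-dtype cudf DataFrame (int, string, bool and
# datetime columns) and check the EDA summary statistics, the JSON file written
# by save_analysis, and the cuxfilter dashboard object.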
14 | 15 | import json 16 | 17 | import cudf 18 | import cuxfilter 19 | import pandas as pd 20 | import pytest 21 | 22 | from clx.eda import EDA 23 | 24 | 25 | @pytest.fixture 26 | def test_dataframe(): 27 | df = cudf.DataFrame() 28 | df["a"] = [1, 2, 3, 4] 29 | df["b"] = ["a", "b", "c", "c"] 30 | df["c"] = [True, False, True, True] 31 | df["d"] = cudf.Series(pd.date_range("2000-01-01", periods=3, freq="m")) 32 | return df 33 | 34 | 35 | def test_eda_summary_stats(test_dataframe): 36 | """Test EDA Summary statistics""" 37 | expected_output = { 38 | "SummaryStatistics": { 39 | "a": {"dtype": "int64", "summary": {"unique": "4", "total": "4"}}, 40 | "b": {"dtype": "object", "summary": {"unique": "3", "total": "4"}}, 41 | "c": {"dtype": "bool", "summary": {"true_percent": "0.75"}}, 42 | "d": { 43 | "dtype": "datetime64[ns]", 44 | "summary": {"timespan": "60 days, 2880 hours, 0 minutes, 0 seconds"}, 45 | }, 46 | } 47 | } 48 | eda = EDA(test_dataframe) 49 | actual_output = eda.analysis 50 | assert expected_output == actual_output 51 | 52 | 53 | def test_eda_save_analysis(tmpdir, test_dataframe): 54 | """Test saving the analysis to a json file""" 55 | fdir = str(tmpdir.mkdir("tmp_test_eda")) 56 | fname = fdir + "/SummaryStatistics.json" 57 | eda = EDA(test_dataframe) 58 | eda.save_analysis(fdir) 59 | expected_output = { 60 | "a": {"dtype": "int64", "summary": {"unique": "4", "total": "4"}}, 61 | "b": {"dtype": "object", "summary": {"unique": "3", "total": "4"}}, 62 | "c": {"dtype": "bool", "summary": {"true_percent": "0.75"}}, 63 | "d": { 64 | "dtype": "datetime64[ns]", 65 | "summary": {"timespan": "60 days, 2880 hours, 0 minutes, 0 seconds"}, 66 | }, 67 | } 68 | with open(fname) as f: 69 | actual_output = json.load(f) 70 | assert expected_output == actual_output 71 | 72 | 73 | def test_cuxfilter_dashboard(test_dataframe): 74 | """Test generating the dashboard""" 75 | eda = EDA(test_dataframe) 76 | dash = eda.cuxfilter_dashboard() 77 | assert isinstance(dash, cuxfilter.dashboard.DashBoard) 78 | assert len(dash.charts) == 2 79 | assert dash.title == "Exploratory Data Analysis" 80 | -------------------------------------------------------------------------------- /python/clx/io/writer/kafka_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | class KafkaWriter: 21 | """ 22 | Publish to Kafka topic based on config object. 
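    For example, with delimiter "," a dataframe row holding the column values "1.2.3.4" and "GET /index.html" is published to the topic as the single delimited string "1.2.3.4,GET /index.html" (see the generated **delimited_output** column below).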
23 | 24 | :param kafka_topic: Kafka topic 25 | :param batch_size: batch size 26 | :param delimiter: delimiter 27 | :param producer: producer 28 | """ 29 | 30 | # Column name of formatted output messages sent to kafka 31 | output_colname = "delimited_output" 32 | 33 | def __init__(self, kafka_topic, batch_size, delimiter, producer): 34 | self._kafka_topic = kafka_topic 35 | self._batch_size = batch_size 36 | self._delimiter = delimiter 37 | self._producer = producer 38 | 39 | @property 40 | def producer(self): 41 | return self._producer 42 | 43 | @property 44 | def delimiter(self): 45 | return self._delimiter 46 | 47 | def write_data(self, df): 48 | """ 49 | publish messages to kafka topic 50 | 51 | :param df: dataframe to publish 52 | """ 53 | out_df = self._generate_delimited_ouput_col(df) 54 | for rec in out_df.to_records(): 55 | self.producer.produce(self._kafka_topic, rec[self.output_colname]) 56 | if len(self.producer) > self._batch_size: 57 | log.debug( 58 | "batch reached, calling poll... producer unsent: %s", 59 | len(self.producer), 60 | ) 61 | self.producer.poll(0) 62 | 63 | def _generate_delimited_ouput_col(self, gdf): 64 | first_col = gdf.columns[0] 65 | gdf[first_col] = gdf[first_col].astype("str").fillna("") 66 | gdf[self.output_colname] = gdf[first_col].astype("str").str.rstrip() 67 | for col in gdf.columns[1:-1]: 68 | gdf[col] = gdf[col].astype("str").fillna("") 69 | gdf[col] = gdf[col].astype("str").str.rstrip() 70 | gdf[self.output_colname] = gdf[self.output_colname].str.cat( 71 | gdf[col], sep=self.delimiter 72 | ) 73 | return gdf 74 | 75 | def close(self): 76 | """ 77 | Close Kafka writer 78 | """ 79 | log.info("Closing kafka writer...") 80 | if self._producer is not None: 81 | self._producer.flush() 82 | log.info("Closed kafka writer.") 83 | -------------------------------------------------------------------------------- /python/clx/tests/test_kafka_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
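# These tests stub confluent_kafka's Consumer and Message with mockito, so no
# broker is needed; they check the number of poll() calls and the "Raw" column
# returned by KafkaReader.fetch_data for valid messages, a message error, and
# the empty/time-window case.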
14 | 15 | import pytest 16 | 17 | from confluent_kafka import Consumer 18 | from confluent_kafka import Message, KafkaError 19 | from mockito import when, mock, verify 20 | from clx.io.reader.kafka_reader import KafkaReader 21 | 22 | batch_size = 100 23 | message = mock(Message) 24 | kafka_error = mock(KafkaError) 25 | when(kafka_error).code().thenReturn("test") 26 | when(message).value().thenReturn("test message".encode("utf-8")) 27 | 28 | 29 | @pytest.mark.parametrize("batch_size", [batch_size]) 30 | def test_read_data(batch_size): 31 | consumer = mock(Consumer) 32 | reader = KafkaReader(batch_size, consumer) 33 | # Return msg = None 1 time, then return a valid message moving forward 34 | when(reader.consumer).poll(timeout=1.0).thenReturn(None).thenReturn(message) 35 | # Always return no message error 36 | when(message).error().thenReturn(None) 37 | df = reader.fetch_data() 38 | assert df.shape == (100, 1) 39 | assert df.columns == ["Raw"] 40 | assert df["Raw"][0] == "test message" 41 | # Call to poll returned 100(Valid messages) + 1(None message) = 101 42 | verify(reader.consumer, times=101).poll(...) 43 | 44 | 45 | @pytest.mark.parametrize("batch_size", [batch_size]) 46 | def test_read_data_message_error(batch_size): 47 | consumer = mock(Consumer) 48 | reader = KafkaReader(batch_size, consumer) 49 | # Return valid message data 50 | when(reader.consumer).poll(timeout=1.0).thenReturn(message) 51 | # Return no message error 1 time, then an error moving forward 52 | when(message).error().thenReturn(None).thenReturn(kafka_error) 53 | df = reader.fetch_data() 54 | 55 | # Validate consumer polls 56 | # 1 (Valid message) + 1 (Error Message) = 2 Consumer polls 57 | verify(reader.consumer, times=2).poll(...) 58 | 59 | # Validate dataframe output 60 | assert df.shape == (1, 1) 61 | assert df.columns == ["Raw"] 62 | assert df["Raw"].to_arrow().to_pylist() == ["test message"] 63 | 64 | 65 | @pytest.mark.parametrize("batch_size", [5]) 66 | def test_read_data_no_messages(batch_size): 67 | consumer = mock(Consumer) 68 | reader = KafkaReader(batch_size, consumer, time_window=5) 69 | # Return no messages 70 | when(reader.consumer).poll(timeout=1.0).thenReturn(None) 71 | df = reader.fetch_data() 72 | 73 | # Validate dataframe output 74 | assert df.empty 75 | -------------------------------------------------------------------------------- /python/clx/tests/test_fs_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
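# --- Side note on the mockito pattern used in test_kafka_reader.py above --------
# (Illustrative addition, not repository code.) The reader tests lean on
# mockito-python's chained stubbing: each .thenReturn(...) answers one successive
# call and the final value repeats for every call after that, which is how
# "None once, then valid messages" is simulated. A small self-contained sketch,
# assuming only that the mockito package is installed:
from mockito import mock, verify, when


class Poller:
    def poll(self, timeout=1.0):
        raise NotImplementedError


fake = mock(Poller)
when(fake).poll(timeout=1.0).thenReturn(None).thenReturn("msg-1").thenReturn("msg-2")

assert fake.poll(timeout=1.0) is None       # 1st call -> 1st stubbed value
assert fake.poll(timeout=1.0) == "msg-1"    # 2nd call -> 2nd stubbed value
assert fake.poll(timeout=1.0) == "msg-2"    # later calls keep returning the last value
verify(fake, times=3).poll(...)             # same call-count check style as the tests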
14 | 15 | import cudf 16 | import pytest 17 | 18 | from clx.io.writer.fs_writer import FileSystemWriter 19 | 20 | 21 | expected_df = cudf.DataFrame( 22 | { 23 | "firstname": ["Emma", "Ava", "Sophia"], 24 | "lastname": ["Olivia", "Isabella", "Charlotte"], 25 | "gender": ["F", "F", "F"], 26 | } 27 | ) 28 | 29 | 30 | @pytest.mark.parametrize("expected_df", [expected_df]) 31 | def test_write_data_csv(tmpdir, expected_df): 32 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.csv")) 33 | config = { 34 | "type": "fs", 35 | "output_path": fname, 36 | "output_format": "csv", 37 | "index": False 38 | } 39 | writer = FileSystemWriter(config) 40 | writer.write_data(expected_df) 41 | 42 | result_df = cudf.read_csv(fname) 43 | assert result_df.equals(expected_df) 44 | 45 | 46 | @pytest.mark.parametrize("expected_df", [expected_df]) 47 | def test_write_data_parquet(tmpdir, expected_df): 48 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.parquet")) 49 | config = { 50 | "type": "fs", 51 | "output_path": fname, 52 | "output_format": "parquet" 53 | } 54 | writer = FileSystemWriter(config) 55 | writer.write_data(expected_df) 56 | 57 | result_df = cudf.read_parquet(fname) 58 | assert result_df.equals(expected_df) 59 | 60 | 61 | @pytest.mark.parametrize("expected_df", [expected_df]) 62 | def test_write_data_orc(tmpdir, expected_df): 63 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.orc")) 64 | config = { 65 | "type": "fs", 66 | "output_path": fname, 67 | "output_format": "orc", 68 | } 69 | writer = FileSystemWriter(config) 70 | writer.write_data(expected_df) 71 | 72 | result_df = cudf.read_orc(fname) 73 | assert result_df.equals(expected_df) 74 | 75 | 76 | @pytest.mark.parametrize("expected_df", [expected_df]) 77 | def test_write_data_json(tmpdir, expected_df): 78 | fname = str(tmpdir.mkdir("tmp_test_fs_writer").join("person.json")) 79 | config = { 80 | "type": "fs", 81 | "output_path": fname, 82 | "output_format": "json", 83 | "orient": "records" 84 | } 85 | writer = FileSystemWriter(config) 86 | writer.write_data(expected_df) 87 | 88 | result_df = cudf.read_json(fname, orient="records") 89 | assert result_df.equals(expected_df) 90 | -------------------------------------------------------------------------------- /python/clx/tests/test_port_heuristic.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
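# --- Usage sketch for FileSystemWriter (illustrative addition, not repo code) ---
# test_fs_writer.py above shows the config-driven pattern: "output_format"
# selects the cudf writer and the remaining keys (e.g. "index", "orient") appear
# to be passed through to it. A minimal sketch, assuming a writable local path;
# the file name below is a placeholder.
import cudf

from clx.io.writer.fs_writer import FileSystemWriter

df = cudf.DataFrame({"addr": ["10.0.75.1", "10.0.75.2"], "conns": [2, 1]})
config = {
    "type": "fs",
    "output_path": "/tmp/clx_example.parquet",  # hypothetical output location
    "output_format": "parquet",
}
FileSystemWriter(config).write_data(df)

# Reading the file back with cudf yields an equal frame, as the tests assert.
assert cudf.read_parquet("/tmp/clx_example.parquet").equals(df)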
14 | 15 | import cudf 16 | from clx.heuristics import ports 17 | 18 | 19 | def test_major_ports(): 20 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.1", "10.0.75.1", "10.0.75.255", "10.110.104.107"]) 21 | input_port_col = cudf.Series([137, 137, 7680, 137, 7680]) 22 | 23 | expected = cudf.DataFrame() 24 | expected["addr"] = ["10.0.75.1", "10.0.75.255", "10.110.104.107"] 25 | expected["port"] = [137, 137, 7680] 26 | expected["service"] = ["netbios-ns", "netbios-ns", "pando-pub"] 27 | expected["conns"] = [2, 1, 1] 28 | 29 | actual = ports.major_ports(input_addr_col, input_port_col) 30 | 31 | assert actual.equals(expected) 32 | 33 | 34 | def test_major_ports_ephemeral(): 35 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.2", "10.0.75.3", "10.0.75.4"]) 36 | input_port_col = cudf.Series([50000, 60000, 20000, 80]) 37 | 38 | expected = cudf.DataFrame() 39 | expected["addr"] = ["10.0.75.1", "10.0.75.2", "10.0.75.3", "10.0.75.4"] 40 | expected["port"] = [50000, 60000, 20000, 80] 41 | expected["service"] = ["ephemeral", "ephemeral", "dnp", "http"] 42 | expected["conns"] = [1, 1, 1, 1] 43 | 44 | actual = ports.major_ports(input_addr_col, input_port_col, eph_min=50000) 45 | 46 | assert actual.equals(expected) 47 | 48 | 49 | def test_major_ports_min_conns(): 50 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.1", "10.0.75.1", "10.0.75.255", "10.110.104.107"]) 51 | input_port_col = cudf.Series([137, 137, 7680, 137, 7680]) 52 | 53 | expected = cudf.DataFrame() 54 | expected["addr"] = ["10.0.75.1"] 55 | expected["port"] = [137] 56 | expected["service"] = ["netbios-ns"] 57 | expected["conns"] = [2] 58 | 59 | actual = ports.major_ports(input_addr_col, input_port_col, min_conns=2) 60 | 61 | assert actual.equals(expected) 62 | 63 | 64 | def test_major_ports_all_params(): 65 | input_addr_col = cudf.Series(["10.0.75.1", "10.0.75.1", "10.0.75.1", "10.0.75.255", "10.110.104.107", "10.110.104.107"]) 66 | input_port_col = cudf.Series([137, 137, 7680, 137, 7680, 7680]) 67 | 68 | expected = cudf.DataFrame() 69 | expected["addr"] = ["10.0.75.1", "10.110.104.107"] 70 | expected["port"] = [137, 7680] 71 | expected["service"] = ["netbios-ns", "ephemeral"] 72 | expected["conns"] = [2, 2] 73 | 74 | actual = ports.major_ports(input_addr_col, input_port_col, min_conns=2, eph_min=7000) 75 | 76 | assert actual.equals(expected) 77 | -------------------------------------------------------------------------------- /ci/gpu/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2018-2022, NVIDIA CORPORATION. 
3 | ########################################## 4 | # CLX GPU build & testscript for CI # 5 | ########################################## 6 | 7 | set -e 8 | NUMARGS=$# 9 | ARGS=$* 10 | 11 | # Arg parsing function 12 | function hasArg { 13 | (( ${NUMARGS} != 0 )) && (echo " ${ARGS} " | grep -q " $1 ") 14 | } 15 | 16 | # Set path and build parallel level 17 | export PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH 18 | export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} 19 | export CUDA_REL=${CUDA_VERSION%.*} 20 | export CUDA_SHORT=${CUDA_REL//./} 21 | 22 | # Set home to the job's workspace 23 | export HOME="$WORKSPACE" 24 | 25 | # Switch to project root; also root of repo checkout 26 | cd "$WORKSPACE" 27 | export GIT_DESCRIBE_TAG=`git describe --tags` 28 | export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` 29 | unset GIT_DESCRIBE_TAG 30 | 31 | ################################################################################ 32 | # SETUP - Check environment 33 | ################################################################################ 34 | 35 | gpuci_logger "Get env" 36 | env 37 | 38 | gpuci_logger "Activate conda env" 39 | . /opt/conda/etc/profile.d/conda.sh 40 | conda activate rapids 41 | 42 | gpuci_logger "Install conda dependencies" 43 | gpuci_mamba_retry install -y \ 44 | "cuxfilter=${MINOR_VERSION}" \ 45 | "faker" \ 46 | "python-whois" \ 47 | "seqeval=1.2.2" 48 | 49 | pip install -U torch==1.11.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html 50 | pip install "git+https://github.com/rapidsai/cudatashader.git" 51 | pip install "git+https://github.com/slashnext/SlashNext-URL-Analysis-and-Enrichment.git#egg=slashnext-phishing-ir&subdirectory=Python SDK/src" 52 | pip install mockito 53 | pip install wget 54 | 55 | gpuci_logger "Check versions" 56 | python --version 57 | $CC --version 58 | $CXX --version 59 | 60 | gpuci_logger "Show conda info" 61 | conda info 62 | conda config --show-sources 63 | conda list --show-channel-urls 64 | 65 | ################################################################################ 66 | # BUILD - Build clx 67 | ################################################################################ 68 | 69 | #TODO: Move boa installation to gpuci/rapidsai 70 | gpuci_mamba_retry install boa 71 | 72 | gpuci_logger "Build and install clx..." 
73 | cd "${WORKSPACE}" 74 | CONDA_BLD_DIR="${WORKSPACE}/.conda-bld" 75 | gpuci_conda_retry mambabuild --croot "${CONDA_BLD_DIR}" conda/recipes/clx 76 | gpuci_mamba_retry install -c "${CONDA_BLD_DIR}" clx 77 | 78 | ################################################################################ 79 | # TEST - Test python package 80 | ################################################################################ 81 | set +e -Eo pipefail 82 | EXITCODE=0 83 | trap "EXITCODE=1" ERR 84 | 85 | if hasArg --skip-tests; then 86 | gpuci_logger "Skipping Tests" 87 | else 88 | cd "$WORKSPACE/python" 89 | py.test --ignore=ci --cache-clear --junitxml="$WORKSPACE/junit-clx.xml" -v 90 | "$WORKSPACE/ci/gpu/test-notebooks.sh" 2>&1 | tee nbtest.log 91 | python "$WORKSPACE/ci/utils/nbtestlog2junitxml.py" nbtest.log 92 | fi 93 | 94 | return "${EXITCODE}" 95 | -------------------------------------------------------------------------------- /python/clx/analytics/detector.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.nn as nn 4 | from abc import ABC, abstractmethod 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | GPU_COUNT = torch.cuda.device_count() 9 | 10 | 11 | class Detector(ABC): 12 | def __init__(self, lr=0.001): 13 | self.lr = lr 14 | self._model = None 15 | self._optimizer = None 16 | self._criterion = nn.CrossEntropyLoss() 17 | 18 | @property 19 | def model(self): 20 | return self._model 21 | 22 | @property 23 | def optimizer(self): 24 | return self._optimizer 25 | 26 | @property 27 | def criterion(self): 28 | return self._criterion 29 | 30 | @abstractmethod 31 | def init_model(self, char_vocab, hidden_size, n_domain_type, n_layers): 32 | pass 33 | 34 | @abstractmethod 35 | def train_model(self, training_data, labels, batch_size=1000, epochs=1, train_size=0.7): 36 | pass 37 | 38 | @abstractmethod 39 | def predict(self, epoch, train_dataset): 40 | pass 41 | 42 | def load_model(self, file_path): 43 | """ This function load already saved model and sets cuda parameters. 44 | 45 | :param file_path: File path of a model to be loaded. 46 | :type file_path: string 47 | """ 48 | 49 | model = torch.load(file_path) 50 | model.eval() 51 | self._model = model 52 | self._set_model2cuda() 53 | self._set_optimizer() 54 | 55 | def save_model(self, file_path): 56 | """ This function saves model to a given location. 57 | 58 | :param file_path: File path of a model to be saved. 59 | :type file_path: string 60 | """ 61 | 62 | torch.save(self.model, file_path) 63 | 64 | def _save_checkpoint(self, checkpoint, file_path): 65 | torch.save(checkpoint, file_path) 66 | log.info("Pretrained model checkpoint saved to location: '{}'".format(file_path)) 67 | 68 | def _set_parallelism(self): 69 | if GPU_COUNT > 1: 70 | log.info("CUDA device count: {}".format(GPU_COUNT)) 71 | self._model = nn.DataParallel(self.model) 72 | self._set_model2cuda() 73 | else: 74 | self._set_model2cuda() 75 | 76 | def _set_optimizer(self): 77 | self._optimizer = torch.optim.RMSprop( 78 | self.model.parameters(), self.lr, weight_decay=0.0 79 | ) 80 | 81 | def _set_model2cuda(self): 82 | if torch.cuda.is_available(): 83 | log.info("Found GPU's now setting up cuda for the model") 84 | self.model.cuda() 85 | 86 | def leverage_model(self, model): 87 | """This function leverages model by setting parallelism parameters. 88 | 89 | :param model: Model instance. 
90 | :type model: RNNClassifier 91 | """ 92 | model.eval() 93 | self._model = model 94 | self._set_parallelism() 95 | self._set_optimizer() 96 | 97 | def _get_unwrapped_model(self): 98 | if GPU_COUNT > 1: 99 | model = self.model.module 100 | else: 101 | model = self.model 102 | return model 103 | -------------------------------------------------------------------------------- /python/clx/tests/test_fs_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | import pytest 17 | from clx.io.reader.fs_reader import FileSystemReader 18 | 19 | expected_df = cudf.DataFrame( 20 | { 21 | "firstname": ["Emma", "Ava", "Sophia"], 22 | "lastname": ["Olivia", "Isabella", "Charlotte"], 23 | "gender": ["F", "F", "F"], 24 | } 25 | ) 26 | 27 | 28 | @pytest.mark.parametrize("expected_df", [expected_df]) 29 | def test_fetch_data_csv(tmpdir, expected_df): 30 | fname = tmpdir.mkdir("tmp_test_fs_reader").join("person.csv") 31 | expected_df.to_csv(fname, index=False) 32 | 33 | config = { 34 | "type": "fs", 35 | "input_path": fname, 36 | "names": ["firstname", "lastname", "gender"], 37 | "delimiter": ",", 38 | "usecols": ["firstname", "lastname", "gender"], 39 | "dtype": ["str", "str", "str"], 40 | "header": 0, 41 | "input_format": "csv" 42 | } 43 | reader = FileSystemReader(config) 44 | fetched_df = reader.fetch_data() 45 | 46 | assert fetched_df.equals(expected_df) 47 | 48 | 49 | @pytest.mark.parametrize("expected_df", [expected_df]) 50 | def test_fetch_data_parquet(tmpdir, expected_df): 51 | fname = tmpdir.mkdir("tmp_test_fs_reader").join("person.parquet") 52 | cudf.io.parquet.to_parquet(expected_df, fname) 53 | 54 | config = { 55 | "type": "fs", 56 | "input_path": fname, 57 | "input_format": "parquet" 58 | } 59 | 60 | reader = FileSystemReader(config) 61 | fetched_df = reader.fetch_data() 62 | 63 | assert fetched_df.equals(expected_df) 64 | 65 | 66 | @pytest.mark.parametrize("expected_df", [expected_df]) 67 | def test_fetch_data_orc(tmpdir, expected_df): 68 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.orc")) 69 | cudf.io.orc.to_orc(expected_df, fname) 70 | config = { 71 | "type": "fs", 72 | "input_path": fname, 73 | "input_format": "orc" 74 | } 75 | 76 | reader = FileSystemReader(config) 77 | fetched_df = reader.fetch_data() 78 | 79 | assert fetched_df.equals(expected_df) 80 | 81 | 82 | @pytest.mark.parametrize("expected_df", [expected_df]) 83 | def test_fetch_data_json(tmpdir, expected_df): 84 | fname = str(tmpdir.mkdir("tmp_test_fs_reader").join("person.json")) 85 | cudf.io.json.to_json(expected_df, fname, orient="records") 86 | config = { 87 | "type": "fs", 88 | "input_path": fname, 89 | "orient": "records", 90 | "input_format": "json" 91 | } 92 | 93 | reader = FileSystemReader(config) 94 | fetched_df = reader.fetch_data() 95 | 96 | assert fetched_df.equals(expected_df) 97 | 
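# --- Persistence sketch for Detector subclasses (illustrative, not repo code) ---
# Detector.save_model/load_model above serialize and restore the whole torch
# model object. A hedged sketch using DGADetector, configured as in
# examples/run_dga_training.py further below; it assumes a CUDA-capable
# environment and the hypothetical file path shown.
from clx.analytics.dga_detector import DGADetector

dd = DGADetector(lr=0.001)
dd.init_model(n_layers=4, char_vocab=128, hidden_size=100, n_domain_type=2)
dd.save_model("/tmp/rnn_classifier.bin")  # torch.save of the model object

dd_restored = DGADetector()
# load_model: torch.load, eval(), move to GPU if available, rebuild optimizer
dd_restored.load_model("/tmp/rnn_classifier.bin")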
-------------------------------------------------------------------------------- /examples/run_dga_training.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """ 16 | Example Usage: python run_dga_training.py \ 17 | --training-data benign_and_dga_domains.csv \ 18 | --output-dir trained_models \ 19 | --batch-size 10000 \ 20 | --epochs 2 21 | """ 22 | import os 23 | import cudf 24 | import torch 25 | import argparse 26 | from datetime import datetime 27 | from clx.analytics.dga_detector import DGADetector 28 | 29 | LR = 0.001 30 | N_LAYERS = 4 31 | CHAR_VOCAB = 128 32 | HIDDEN_SIZE = 100 33 | N_DOMAIN_TYPE = 2 34 | 35 | def main(): 36 | epochs = int(args["epochs"]) 37 | input_filepath = args["training_data"] 38 | batch_size = int(args["batch_size"]) 39 | output_dir = args["output_dir"] 40 | # load input data to gpu memory 41 | input_df = cudf.read_csv(input_filepath) 42 | train_data = input_df['domain'] 43 | labels = input_df['type'] 44 | del input_df 45 | dd = DGADetector(lr=LR) 46 | dd.init_model( 47 | n_layers=N_LAYERS, 48 | char_vocab=CHAR_VOCAB, 49 | hidden_size=HIDDEN_SIZE, 50 | n_domain_type=N_DOMAIN_TYPE, 51 | ) 52 | dd.train_model(train_data, labels, batch_size=batch_size, epochs=epochs, train_size=0.7) 53 | 54 | if not os.path.exists(output_dir): 55 | print("Creating directory '{}'".format(output_dir)) 56 | os.makedirs(output_dir) 57 | now = datetime.now() 58 | model_filename = "rnn_classifier_{}.bin".format(now.strftime("%Y-%m-%d_%H_%M_%S")) 59 | model_filepath = os.path.join(output_dir, model_filename) 60 | print("Saving trained model to location '{}'".format(model_filepath)) 61 | dd.save_model(model_filepath) 62 | 63 | def parse_cmd_args(): 64 | # construct the argument parse and parse the arguments 65 | ap = argparse.ArgumentParser(description="DGA detection model training script") 66 | ap.add_argument( 67 | "--training-data", required=True, help="CSV with domain and type fields" 68 | ) 69 | ap.add_argument( 70 | "--output-dir", required=True, help="output directory to save new model files" 71 | ) 72 | ap.add_argument( 73 | "--batch-size", 74 | required=True, 75 | help="Dividing dataset into number of batches or sets or parts", 76 | ) 77 | ap.add_argument( 78 | "--epochs", 79 | required=True, 80 | help="One epoch is when an entire dataset is passed forward and backward through the neural network only once", 81 | ) 82 | args = vars(ap.parse_args()) 83 | return args 84 | 85 | 86 | # execution starts here 87 | if __name__ == "__main__": 88 | args = parse_cmd_args() 89 | main() 90 | -------------------------------------------------------------------------------- /python/clx/osi/whois.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # use `pip install python-whois` 16 | import whois 17 | import logging 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class WhoIsLookupClient(object): 23 | 24 | str_arr_keys = ["domain_name", "name_servers", "status", "emails", "dnssec"] 25 | datetime_arr_keys = ["creation_date", "updated_date", "expiration_date"] 26 | 27 | """ 28 | Wrapper class to query WhoIs API. 29 | 30 | :param sep: Delimiter to concat nested list values from the Whois response. 31 | :param datetime_format: Format to convert WhoIs response datetime object. 32 | """ 33 | def __init__(self, sep=",", datetime_format="%m-%d-%Y %H:%M:%S"): 34 | self.sep = sep 35 | self.datetime_format = datetime_format 36 | 37 | def whois(self, domains, arr2str=True): 38 | """ 39 | Function to access parsed WhoIs data for a given domain. 40 | 41 | :param domains: Domains to perform whois lookup. 42 | :type domains: list 43 | :param arr2str: Convert WhoIs lookup response object to list of strings. 44 | :type arr2str: boolean 45 | :return: WhoIs information with respect to given domains. 46 | :rtype: list/obj 47 | 48 | Examples 49 | -------- 50 | >>> from clx.osi.whois import WhoIsLookupClient 51 | >>> domains = ["nvidia.com"] 52 | >>> client = WhoIsLookupClient() 53 | >>> client.whois(domains) 54 | [{'domain_name': 'NVIDIA.COM', 'registrar': 'Safenames Ltd', 'whois_server': 'whois.safenames.net'...}] 55 | """ 56 | result = [] 57 | for domain in domains: 58 | resp = whois.whois(domain) 59 | if arr2str: 60 | resp_keys = resp.keys() 61 | resp = self.__flatten_str_array(resp, resp_keys) 62 | resp = self.__flatten_datetime_array(resp, resp_keys) 63 | result.append(resp) 64 | return result 65 | 66 | def __flatten_str_array(self, resp, resp_keys): 67 | for key in self.str_arr_keys: 68 | if key in resp_keys and isinstance(resp[key], list): 69 | resp[key] = self.sep.join(resp[key]) 70 | return resp 71 | 72 | def __flatten_datetime_array(self, resp, resp_keys): 73 | for key in self.datetime_arr_keys: 74 | values = [] 75 | if key in resp_keys: 76 | if isinstance(resp[key], list): 77 | for ts in resp[key]: 78 | values.append(ts.strftime(self.datetime_format)) 79 | resp[key] = self.sep.join(values) 80 | else: 81 | resp[key] = resp[key].strftime(self.datetime_format) 82 | return resp 83 | -------------------------------------------------------------------------------- /python/clx/io/factory/kafka_factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | 17 | from confluent_kafka import Consumer 18 | from confluent_kafka import Producer 19 | 20 | from clx.io.factory.abstract_factory import AbstractFactory 21 | from clx.io.reader.kafka_reader import KafkaReader 22 | from clx.io.writer.kafka_writer import KafkaWriter 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | class KafkaFactory(AbstractFactory): 28 | def __init__(self, config): 29 | """ 30 | Constructor method 31 | 32 | :param config: dictionary object of config values for **batch_size**, **time_window**, **publisher_kafka_topic**, **output_delimiter**, **kafka_brokers**, and **group_id**. 33 | """ 34 | self._config = config 35 | 36 | def get_reader(self): 37 | """ 38 | Get instance of KafkaReader 39 | """ 40 | consumer = self._create_consumer() 41 | if "time_window" in self.config: 42 | reader = KafkaReader( 43 | self.config["batch_size"], 44 | consumer, 45 | time_window=self.config["time_window"], 46 | ) 47 | else: 48 | reader = KafkaReader(self.config["batch_size"], consumer) 49 | return reader 50 | 51 | def get_writer(self): 52 | """ 53 | Get instance of KafkaWriter 54 | """ 55 | producer = self._create_producer() 56 | writer = KafkaWriter( 57 | self.config["publisher_kafka_topic"], 58 | self.config["batch_size"], 59 | self.config["output_delimiter"], 60 | producer, 61 | ) 62 | return writer 63 | 64 | def _create_consumer(self): 65 | log.info("creating kafka consumer instance") 66 | consumer_conf = { 67 | "bootstrap.servers": self.config["kafka_brokers"], 68 | "group.id": self.config["group_id"], 69 | "session.timeout.ms": 10000, 70 | "default.topic.config": {"auto.offset.reset": "largest"}, 71 | } 72 | 73 | c = Consumer(consumer_conf) 74 | c.subscribe( 75 | self.config["consumer_kafka_topics"], on_assign=self.print_assignment 76 | ) 77 | log.info("created kafka consumer instance") 78 | return c 79 | 80 | def _create_producer(self): 81 | log.info("creating kafka producer instance") 82 | producer_conf = { 83 | "bootstrap.servers": self.config["kafka_brokers"], 84 | "session.timeout.ms": 10000, 85 | } 86 | producer = Producer(producer_conf) 87 | log.info("created producer instance") 88 | return producer 89 | 90 | def print_assignment(self, consumer, partitions): 91 | print("Assignment:", partitions) 92 | -------------------------------------------------------------------------------- /python/clx/tests/test_binary_sequence_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import random 15 | from os import path 16 | 17 | import cudf 18 | import torch 19 | import transformers 20 | from cuml.model_selection import train_test_split 21 | from faker import Faker 22 | 23 | from clx.analytics.binary_sequence_classifier import BinarySequenceClassifier 24 | 25 | sc = BinarySequenceClassifier() 26 | if torch.cuda.is_available(): 27 | sc.init_model("bert-base-uncased") 28 | 29 | 30 | def test_train_model(): 31 | if torch.cuda.is_available(): 32 | fake = Faker() 33 | email_col = [fake.text() for _ in range(200)] 34 | label_col = [random.randint(0, 1) for _ in range(200)] 35 | emails_gdf = cudf.DataFrame(list(zip(email_col, label_col)), columns=["email", "label"]) 36 | X_train, X_test, y_train, y_test = train_test_split( 37 | emails_gdf, "label", train_size=0.8, random_state=10 38 | ) 39 | sc.train_model( 40 | X_train["email"], 41 | y_train, 42 | learning_rate=3e-5, 43 | max_seq_len=128, 44 | batch_size=6, 45 | epochs=1, 46 | ) 47 | assert isinstance( 48 | sc._model.module, 49 | transformers.models.bert.modeling_bert.BertForSequenceClassification, 50 | ) 51 | 52 | 53 | def test_evaluate_model(): 54 | if torch.cuda.is_available(): 55 | X_test = cudf.Series(["email 1", "email 2"]) 56 | y_test = cudf.Series([0, 0]) 57 | accuracy = sc.evaluate_model( 58 | X_test, y_test, max_seq_len=128, batch_size=32 59 | ) 60 | assert accuracy >= 0.0 and accuracy <= 1.0 61 | 62 | 63 | def test_predict(): 64 | if torch.cuda.is_available(): 65 | X_test = cudf.Series(["email 1", "email 2"]) 66 | preds = sc.predict(X_test, max_seq_len=128) 67 | assert preds[0].isin([False, True]).equals(cudf.Series([True, True])) 68 | 69 | 70 | def test_save_model(tmpdir): 71 | if torch.cuda.is_available(): 72 | sc.save_model(tmpdir) 73 | assert path.exists(str(tmpdir.join("config.json"))) 74 | assert path.exists(str(tmpdir.join("pytorch_model.bin"))) 75 | 76 | 77 | def test_save_checkpoint(tmpdir): 78 | if torch.cuda.is_available(): 79 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar")) 80 | sc.save_checkpoint(fname) 81 | assert path.exists(fname) 82 | 83 | 84 | def test_load_checkpoint(tmpdir): 85 | if torch.cuda.is_available(): 86 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar")) 87 | sc.save_checkpoint(fname) 88 | assert path.exists(fname) 89 | sc.load_checkpoint(fname) 90 | assert isinstance( 91 | sc._model.module, 92 | transformers.models.bert.modeling_bert.BertForSequenceClassification, 93 | ) 94 | -------------------------------------------------------------------------------- /python/clx/tests/test_multiclass_sequence_classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
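# --- Usage sketch for the sequence classifiers (illustrative, not repo code) ----
# The binary classifier tests above (and the multiclass tests that follow) guard
# every call with torch.cuda.is_available(); the same applies outside the test
# suite. A hedged sketch assuming a CUDA device and the Hugging Face
# "bert-base-uncased" weights; the example strings are placeholders.
import cudf
import torch

from clx.analytics.binary_sequence_classifier import BinarySequenceClassifier

if torch.cuda.is_available():
    clf = BinarySequenceClassifier()
    clf.init_model("bert-base-uncased")
    # Training follows the train_model(...) call shown in test_train_model above.
    emails = cudf.Series(["urgent: verify your account", "lunch at noon?"])
    preds = clf.predict(emails, max_seq_len=128)
    # Per test_predict above, preds[0] holds the boolean per-row predictions;
    # nothing further about the return value is assumed here.
    print(preds[0])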
14 | import random 15 | from os import path 16 | 17 | import cudf 18 | import torch 19 | import transformers 20 | from cuml.model_selection import train_test_split 21 | from faker import Faker 22 | 23 | from clx.analytics.multiclass_sequence_classifier import MulticlassSequenceClassifier 24 | 25 | sc = MulticlassSequenceClassifier() 26 | if torch.cuda.is_available(): 27 | sc.init_model("bert-base-uncased", num_labels=3) 28 | 29 | 30 | def test_train_model(): 31 | if torch.cuda.is_available(): 32 | fake = Faker() 33 | email_col = [fake.text() for _ in range(200)] 34 | label_col = [random.randint(0, 2) for _ in range(200)] 35 | emails_gdf = cudf.DataFrame(list(zip(email_col, label_col)), columns=["email", "label"]) 36 | X_train, X_test, y_train, y_test = train_test_split( 37 | emails_gdf, "label", train_size=0.8, random_state=10 38 | ) 39 | sc.train_model( 40 | X_train["email"], 41 | y_train, 42 | learning_rate=3e-5, 43 | max_seq_len=128, 44 | batch_size=6, 45 | epochs=1, 46 | ) 47 | assert isinstance( 48 | sc._model.module, 49 | transformers.models.bert.modeling_bert.BertForSequenceClassification, 50 | ) 51 | 52 | 53 | def test_evaluate_model(): 54 | if torch.cuda.is_available(): 55 | X_test = cudf.Series(["email 1", "email 2"]) 56 | y_test = cudf.Series([0, 0]) 57 | accuracy = sc.evaluate_model( 58 | X_test, y_test, max_seq_len=128, batch_size=32 59 | ) 60 | assert accuracy >= 0.0 and accuracy <= 1.0 61 | 62 | 63 | def test_predict(): 64 | if torch.cuda.is_available(): 65 | X_test = cudf.Series(["email 1", "email 2"]) 66 | preds = sc.predict(X_test, max_seq_len=128) 67 | assert preds.isin([0, 1, 2]).equals(cudf.Series([True, True])) 68 | 69 | 70 | def test_save_model(tmpdir): 71 | if torch.cuda.is_available(): 72 | sc.save_model(tmpdir) 73 | assert path.exists(str(tmpdir.join("config.json"))) 74 | assert path.exists(str(tmpdir.join("pytorch_model.bin"))) 75 | 76 | 77 | def test_save_checkpoint(tmpdir): 78 | if torch.cuda.is_available(): 79 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar")) 80 | sc.save_checkpoint(fname) 81 | assert path.exists(fname) 82 | 83 | 84 | def test_load_checkpoint(tmpdir): 85 | if torch.cuda.is_available(): 86 | fname = str(tmpdir.mkdir("tmp_test_sequence_classifier").join("sc_checkpoint.tar")) 87 | sc.save_checkpoint(fname) 88 | assert path.exists(fname) 89 | sc.load_checkpoint(fname) 90 | assert isinstance( 91 | sc._model.module, 92 | transformers.models.bert.modeling_bert.BertForSequenceClassification, 93 | ) 94 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | 5 | IP 6 | -- 7 | .. automodule:: clx.ip 8 | :members: 9 | 10 | Features 11 | -------- 12 | .. automodule:: clx.features 13 | :members: 14 | 15 | Analytics 16 | --------- 17 | .. autoclass:: clx.analytics.asset_classification.AssetClassification 18 | :members: 19 | 20 | .. autoclass:: clx.analytics.binary_sequence_classifier.BinarySequenceClassifier 21 | :members: 22 | :inherited-members: 23 | 24 | .. autoclass:: clx.analytics.cybert.Cybert 25 | :members: 26 | 27 | .. autoclass:: clx.analytics.detector.Detector 28 | :members: 29 | 30 | .. autoclass:: clx.analytics.dga_dataset.DGADataset 31 | :members: 32 | 33 | .. autoclass:: clx.analytics.dga_detector.DGADetector 34 | :members: 35 | 36 | .. autoclass:: clx.analytics.loda.Loda 37 | :members: 38 | 39 | .. 
autoclass:: clx.analytics.model.rnn_classifier.RNNClassifier 40 | :members: 41 | 42 | .. autoclass:: clx.analytics.model.tabular_model.TabularModel 43 | :members: 44 | 45 | .. autoclass:: clx.analytics.multiclass_sequence_classifier.MulticlassSequenceClassifier 46 | :members: 47 | :inherited-members: 48 | 49 | .. automodule:: clx.analytics.anomaly_detection 50 | :members: 51 | 52 | .. automodule:: clx.analytics.perfect_hash 53 | :members: 54 | 55 | .. automodule:: clx.analytics.periodicity_detection 56 | :members: 57 | 58 | .. automodule:: clx.analytics.stats 59 | :members: 60 | 61 | DNS Extractor 62 | ------------- 63 | .. automodule:: clx.dns.dns_extractor 64 | :members: 65 | 66 | Exploratory Data Analysis 67 | ------------------------- 68 | .. autoclass:: clx.eda.EDA 69 | :members: 70 | 71 | Heuristics 72 | ---------- 73 | .. automodule:: clx.heuristics.ports 74 | :members: 75 | 76 | OSI (Open Source Integration) 77 | ----------------------------- 78 | .. autoclass:: clx.osi.farsight.FarsightLookupClient 79 | :members: 80 | 81 | .. autoclass:: clx.osi.virus_total.VirusTotalClient 82 | :members: 83 | 84 | .. autoclass:: clx.osi.whois.WhoIsLookupClient 85 | :members: 86 | 87 | .. autoclass:: clx.osi.slashnext.SlashNextClient 88 | :members: 89 | 90 | Parsers 91 | ------- 92 | 93 | .. autoclass:: clx.parsers.event_parser.EventParser 94 | :members: 95 | 96 | .. autoclass:: clx.parsers.splunk_notable_parser.SplunkNotableParser 97 | :members: 98 | 99 | .. autoclass:: clx.parsers.windows_event_parser.WindowsEventParser 100 | :members: 101 | 102 | .. automodule:: clx.parsers.zeek 103 | :members: 104 | 105 | Utils 106 | ----- 107 | 108 | .. autoclass:: clx.utils.data.dataloader.DataLoader 109 | :members: 110 | 111 | .. autoclass:: clx.utils.data.dataset.Dataset 112 | :members: 113 | 114 | .. autoclass:: clx.utils.data.utils 115 | :members: 116 | 117 | Workflow 118 | -------- 119 | 120 | .. autoclass:: clx.workflow.workflow.Workflow 121 | :members: 122 | 123 | .. autoclass:: clx.workflow.splunk_alert_workflow.SplunkAlertWorkflow 124 | :members: 125 | 126 | I/O 127 | -------- 128 | 129 | .. autoclass:: clx.io.reader.kafka_reader.KafkaReader 130 | :members: 131 | 132 | .. autoclass:: clx.io.reader.dask_fs_reader.DaskFileSystemReader 133 | :members: 134 | 135 | .. autoclass:: clx.io.reader.fs_reader.FileSystemReader 136 | :members: 137 | 138 | .. autoclass:: clx.io.writer.kafka_writer.KafkaWriter 139 | :members: 140 | 141 | .. autoclass:: clx.io.writer.fs_writer.FileSystemWriter 142 | :members: 143 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # CLX Code of Conduct 2 | 3 | CLX has adopted the [Contributor Covenant Conde of Conduct](https://docs.rapids.ai/resources/conduct): 4 | 5 | ## Our Pledge 6 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
7 | 8 | ## Our Standards 9 | ### Examples of behavior that contributes to creating a positive environment include: 10 | 11 | - Using welcoming and inclusive language, 12 | - Being respectful of differing viewpoints and experiences, 13 | - Gracefully accepting constructive criticism, 14 | - Focusing on what is best for the community, and 15 | - Showing empathy towards other community members. 16 | 17 | ### Examples of unacceptable behavior by participants include: 18 | 19 | - The use of sexualized language or imagery and unwelcome sexual attention or advances, 20 | - Trolling, insulting/derogatory comments, and personal or political attacks, 21 | - Public or private harassment, 22 | - Publishing others’ private information, such as a physical or electronic address, without explicit permission, and 23 | - Other conduct which could reasonably be considered inappropriate in a professional setting. 24 | 25 | ## Our Responsibilities 26 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 27 | 28 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 29 | 30 | ## Scope 31 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 32 | 33 | ## Enforcement 34 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at [conduct@rapids.ai](mailto:conduct@rapids.ai). All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 35 | 36 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project’s leadership. 37 | 38 | ## Attribution 39 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at [https://www.contributor-covenant.org/version/1/4/code-of-conduct.html](https://www.contributor-covenant.org/version/1/4/code-of-conduct.html). 40 | 41 | For answers to common questions about this code of conduct, see [https://www.contributor-covenant.org/faq](https://www.contributor-covenant.org/faq). -------------------------------------------------------------------------------- /python/clx/eda/summary_stats.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cuxfilter 16 | 17 | from clx.eda.analysis import Analysis 18 | 19 | 20 | class SummaryStatistics(Analysis): 21 | def __init__(self, dataframe): 22 | super().__init__(dataframe) 23 | 24 | def __summary_obj(self, series): 25 | summary = {} 26 | uniq_count = len(series.unique()) 27 | total = series.notna().sum() 28 | summary["unique"] = str(uniq_count) 29 | summary["total"] = str(total) 30 | return summary 31 | 32 | def __summary_bool(self, series): 33 | summary = {} 34 | true_per = (series == True).sum() # noqa: E712 35 | summary["true_percent"] = str(true_per / len(series)) 36 | return summary 37 | 38 | def __summary_num(self, series): 39 | summary = {} 40 | uniq_count = len(series.unique()) 41 | total = series.notna().sum() 42 | summary["unique"] = str(uniq_count) 43 | summary["total"] = str(total) 44 | return summary 45 | 46 | def __summary_time(self, series): 47 | summary = {} 48 | duration = series.max() - series.min() 49 | days = duration.astype("timedelta64[D]").astype(int) 50 | seconds = duration.astype("timedelta64[s]").astype(int) 51 | hours = days * 24 + seconds // 3600 52 | minutes = (seconds % 3600) // 60 53 | seconds = seconds % 60 54 | msg = "{0} days, {1} hours, {2} minutes, {3} seconds".format( 55 | days, hours, minutes, seconds 56 | ) 57 | summary["timespan"] = msg 58 | return summary 59 | 60 | def _generate_analysis(self, dataframe): 61 | # This function will receive a dataframe and returns a dictionary of summary statistics 62 | summary_dict = {} 63 | for col in dataframe.columns: 64 | summary_dict[col] = {} 65 | summary_dict[col]["dtype"] = str(dataframe[col].dtype) 66 | if dataframe[col].dtype == "object": 67 | summary_dict[col]["summary"] = self.__summary_obj(dataframe[col]) 68 | elif dataframe[col].dtype == "bool": 69 | summary_dict[col]["summary"] = self.__summary_bool(dataframe[col]) 70 | elif dataframe[col].dtype in ["int64", "float64", "int8"]: 71 | summary_dict[col]["summary"] = self.__summary_num(dataframe[col]) 72 | elif dataframe[col].dtype == "datetime64[ns]": 73 | summary_dict[col]["summary"] = self.__summary_time(dataframe[col]) 74 | else: 75 | msg = "\t column type (" + str(dataframe[col].dtype) + ") not supported" 76 | summary_dict[col]["error"] = msg 77 | return summary_dict 78 | 79 | def _generate_charts(self, dataframe): 80 | """Get barcharts for the summary analysis""" 81 | charts = [] 82 | for col in dataframe.columns: 83 | if dataframe[col].dtype == "object": 84 | bars = len(dataframe[col].unique()) 85 | if bars < 30: 86 | if bars > 1: 87 | charts.append(cuxfilter.charts.bar(col)) 88 | return charts 89 | -------------------------------------------------------------------------------- /siem_integrations/clx_query_service/clx_query_service/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for clx_query_service project. 3 | 4 | Generated by 'django-admin startproject' using Django 2.2.6. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/2.2/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/2.2/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/2.2/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = "i6nr8@pzj5$@^(y903w5tc)8%v!(lk!3npl$1z7(%##2zxv" 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 26 | DEBUG = False 27 | 28 | ALLOWED_HOSTS = ["localhost"] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | "django.contrib.admin", 35 | "django.contrib.auth", 36 | "django.contrib.contenttypes", 37 | "django.contrib.sessions", 38 | "django.contrib.messages", 39 | "django.contrib.staticfiles", 40 | "rest_framework", 41 | "clxquery.apps.ClxQueryConfig", 42 | ] 43 | 44 | MIDDLEWARE = [ 45 | "django.middleware.security.SecurityMiddleware", 46 | "django.contrib.sessions.middleware.SessionMiddleware", 47 | "django.middleware.common.CommonMiddleware", 48 | "django.middleware.csrf.CsrfViewMiddleware", 49 | "django.contrib.auth.middleware.AuthenticationMiddleware", 50 | "django.contrib.messages.middleware.MessageMiddleware", 51 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 52 | ] 53 | 54 | ROOT_URLCONF = "clx_query_service.urls" 55 | 56 | TEMPLATES = [ 57 | { 58 | "BACKEND": "django.template.backends.django.DjangoTemplates", 59 | "DIRS": [], 60 | "APP_DIRS": True, 61 | "OPTIONS": { 62 | "context_processors": [ 63 | "django.template.context_processors.debug", 64 | "django.template.context_processors.request", 65 | "django.contrib.auth.context_processors.auth", 66 | "django.contrib.messages.context_processors.messages", 67 | ] 68 | }, 69 | } 70 | ] 71 | 72 | WSGI_APPLICATION = "clx_query_service.wsgi.application" 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/2.2/ref/settings/#databases 77 | 78 | DATABASES = { 79 | "default": { 80 | "ENGINE": "django.db.backends.sqlite3", 81 | "NAME": os.path.join(BASE_DIR, "db.sqlite3"), 82 | } 83 | } 84 | 85 | 86 | # Password validation 87 | # https://docs.djangoproject.com/en/2.2/ref/settings/#auth-password-validators 88 | 89 | AUTH_PASSWORD_VALIDATORS = [ 90 | { 91 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator" 92 | }, 93 | {"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"}, 94 | {"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"}, 95 | {"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"}, 96 | ] 97 | 98 | 99 | # Internationalization 100 | # https://docs.djangoproject.com/en/2.2/topics/i18n/ 101 | 102 | LANGUAGE_CODE = "en-us" 103 | 104 | TIME_ZONE = "UTC" 105 | 106 | USE_I18N = True 107 | 108 | USE_L10N = True 109 | 110 | USE_TZ = True 111 | 112 | 113 | # Static files (CSS, JavaScript, Images) 114 | # https://docs.djangoproject.com/en/2.2/howto/static-files/ 115 | 116 | STATIC_URL = "/static/" 117 | -------------------------------------------------------------------------------- /python/clx/io/reader/kafka_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, 
NVIDIA CORPORATION. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import cudf 16 | import logging 17 | import time 18 | from confluent_kafka import KafkaError 19 | from clx.io.reader.reader import Reader 20 | 21 | log = logging.getLogger(__name__) 22 | 23 | 24 | class KafkaReader(Reader): 25 | """ 26 | Reads from Kafka based on config object. 27 | 28 | :param batch_size: batch size 29 | :param consumer: Kafka consumer 30 | :param time_window: Max window of time that queued events will wait to be pushed to workflow 31 | """ 32 | def __init__(self, batch_size, consumer, time_window=30): 33 | self._batch_size = batch_size 34 | self._consumer = consumer 35 | self._has_data = True 36 | self._time_window = time_window 37 | 38 | @property 39 | def consumer(self): 40 | return self._consumer 41 | 42 | @property 43 | def has_data(self): 44 | return self._has_data 45 | 46 | @property 47 | def time_window(self): 48 | return self._time_window 49 | 50 | def fetch_data(self): 51 | """ 52 | Fetch data from Kafka based on provided config object 53 | """ 54 | events = [] 55 | rec_cnt = 0 56 | running = True 57 | current_time = time.time() 58 | try: 59 | while running: 60 | # First check if batch size or time window has been exceeded 61 | if ( 62 | rec_cnt >= self._batch_size or (time.time() - current_time) >= self.time_window 63 | ): 64 | log.debug( 65 | "Exceeded record count (" + str(rec_cnt) + ") or time window (" + str(time.time() - current_time) + ")" 66 | ) 67 | running = False 68 | # Else poll next message in kafka queue 69 | else: 70 | msg = self.consumer.poll(timeout=1.0) 71 | if msg is None: 72 | log.debug("No message received.") 73 | continue 74 | elif not msg.error(): 75 | data = msg.value().decode("utf-8") 76 | log.debug("Message received.") 77 | events.append(data) 78 | rec_cnt += 1 79 | elif msg.error().code() != KafkaError._PARTITION_EOF: 80 | log.error(msg.error()) 81 | running = False 82 | else: 83 | running = False 84 | df = cudf.DataFrame() 85 | if len(events) > 0: 86 | df["Raw"] = events 87 | log.debug("Kafka reader batch aggregation complete. Dataframe size = " + str(df.shape)) 88 | return df 89 | except Exception: 90 | log.error("Error fetching data from kafka") 91 | raise 92 | 93 | def close(self): 94 | """ 95 | Close Kafka reader 96 | """ 97 | log.info("Closing kafka reader...") 98 | if self.consumer is not None: 99 | self.consumer.close() 100 | log.info("Closed kafka reader.") 101 | --------------------------------------------------------------------------------
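# --- Usage sketch for KafkaReader (illustrative addition, not repository code) --
# Ties KafkaReader to the consumer configuration shown in kafka_factory.py above.
# Assumes a reachable broker at localhost:9092; the topic and group id are
# placeholders only.
from confluent_kafka import Consumer

from clx.io.reader.kafka_reader import KafkaReader

consumer = Consumer({
    "bootstrap.servers": "localhost:9092",
    "group.id": "clx-example",  # hypothetical consumer group
    "session.timeout.ms": 10000,
    "default.topic.config": {"auto.offset.reset": "largest"},
})
consumer.subscribe(["raw_events"])  # hypothetical topic name

reader = KafkaReader(batch_size=100, consumer=consumer, time_window=30)
# fetch_data polls until batch_size messages arrive or time_window seconds pass,
# returning the collected payloads in a single "Raw" column.
df = reader.fetch_data()
reader.close()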