├── CONTRIBUTING.md
├── NOTICE
├── SECURITY.md
├── .github
│   └── workflows
│       ├── integration-test-gcp-pr.yml
│       ├── integration-test-aws-pr.yml
│       ├── integration-test-aws-push.yml
│       ├── integration-test-msa-pr.yml
│       ├── integration-test-gcp-push.yml
│       └── integration-test-msa-push.yml
├── 06_DNS_Analytics_ScoreDomain.py
├── .gitignore
├── 03_DNS_Analytics_Exploring_Data.py
├── LICENSE
├── 02_DNS_Analytics_Enrichment.py
├── util
│   └── Shared_Include.py
├── 05_DNS_Analytics_Streaming.py
├── readme.md
├── RUNME.py
├── 00_README.py
├── 01_DNS_Analytics_Ingest.py
└── 04_DNS_Analytics_Data_Science.py
/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | We happily welcome contributions to this project. We use GitHub Issues to track community-reported issues and GitHub Pull Requests for accepting changes pursuant to a CLA. 2 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright (2022) Databricks, Inc. 2 | 3 | 4 | This Software includes software developed at Databricks (https://www.databricks.com/) and its use is subject to the included LICENSE file. 5 | By using this repository and the notebooks within, you consent to Databricks' collection and use of usage and tracking information in accordance with our privacy policy at www.databricks.com/privacypolicy. 6 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | Please email bugbounty@databricks.com to report any security vulnerabilities. We will acknowledge receipt of your report and strive to send you regular updates about our progress. If you're curious about the status of your disclosure, please feel free to email us again. If you want to encrypt your disclosure email, you can use [this PGP key](https://keybase.io/arikfr/key.asc). 
6 | 7 | -------------------------------------------------------------------------------- /.github/workflows/integration-test-gcp-pr.yml: -------------------------------------------------------------------------------- 1 | name: GCP integration test PR 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | run-databricks-notebook: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repo 11 | uses: actions/checkout@v2 12 | - name: Run a databricks notebook 13 | uses: databricks/run-notebook@v0 14 | with: 15 | local-notebook-path: RUNME.py 16 | git-commit: ${{ github.event.pull_request.head.sha }} 17 | databricks-host: https://416411475796958.8.gcp.databricks.com 18 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_GCP }} 19 | new-cluster-json: > 20 | { 21 | "num_workers": 0, 22 | "spark_version": "10.4.x-scala2.12", 23 | "node_type_id": "n1-highmem-4", 24 | "gcp_attributes": { 25 | "availability": "ON_DEMAND_GCP" 26 | }, 27 | "spark_conf": { 28 | "spark.master": "local[*, 4]", 29 | "spark.databricks.cluster.profile": "singleNode" 30 | }, 31 | "custom_tags": { 32 | "ResourceClass": "SingleNode" 33 | } 34 | } 35 | notebook-params-json: > 36 | { 37 | "run_job": "True" 38 | } 39 | access-control-list-json: > 40 | [ 41 | { 42 | "group_name": "users", 43 | "permission_level": "CAN_VIEW" 44 | } 45 | ] -------------------------------------------------------------------------------- /.github/workflows/integration-test-aws-pr.yml: -------------------------------------------------------------------------------- 1 | name: AWS integration test PR 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | run-databricks-notebook: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repo 11 | uses: actions/checkout@v2 12 | - name: Run a databricks notebook 13 | uses: databricks/run-notebook@v0 14 | with: 15 | local-notebook-path: RUNME.py 16 | git-commit: ${{ github.event.pull_request.head.sha }} 17 | databricks-host: https://e2-demo-west.cloud.databricks.com 18 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_AWS }} 19 | new-cluster-json: > 20 | { 21 | "num_workers": 0, 22 | "spark_version": "10.4.x-scala2.12", 23 | "node_type_id": "i3.xlarge", 24 | "aws_attributes": { 25 | "availability": "ON_DEMAND" 26 | }, 27 | "spark_conf": { 28 | "spark.master": "local[*, 4]", 29 | "spark.databricks.cluster.profile": "singleNode" 30 | }, 31 | "custom_tags": { 32 | "ResourceClass": "SingleNode" 33 | } 34 | } 35 | notebook-params-json: > 36 | { 37 | "run_job": "True" 38 | } 39 | access-control-list-json: > 40 | [ 41 | { 42 | "group_name": "users", 43 | "permission_level": "CAN_VIEW" 44 | } 45 | ] 46 | 47 | -------------------------------------------------------------------------------- /.github/workflows/integration-test-aws-push.yml: -------------------------------------------------------------------------------- 1 | name: AWS integration test push 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | - web-sync 9 | 10 | jobs: 11 | run-databricks-notebook: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout repo 15 | uses: actions/checkout@v2 16 | - name: Run a databricks notebook 17 | uses: databricks/run-notebook@v0 18 | with: 19 | local-notebook-path: RUNME.py 20 | git-commit: ${{ github.sha }} 21 | databricks-host: https://e2-demo-west.cloud.databricks.com 22 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_AWS }} 23 | new-cluster-json: > 24 | { 25 | "num_workers": 0, 26 | "spark_version": "10.4.x-scala2.12", 27 | "node_type_id": "i3.xlarge", 28 | "aws_attributes": { 
29 | "availability": "ON_DEMAND" 30 | }, 31 | "spark_conf": { 32 | "spark.master": "local[*, 4]", 33 | "spark.databricks.cluster.profile": "singleNode" 34 | }, 35 | "custom_tags": { 36 | "ResourceClass": "SingleNode" 37 | } 38 | } 39 | notebook-params-json: > 40 | { 41 | "run_job": "True" 42 | } 43 | access-control-list-json: > 44 | [ 45 | { 46 | "group_name": "users", 47 | "permission_level": "CAN_VIEW" 48 | } 49 | ] 50 | -------------------------------------------------------------------------------- /.github/workflows/integration-test-msa-pr.yml: -------------------------------------------------------------------------------- 1 | name: MSA integration test PR 2 | on: 3 | pull_request: 4 | 5 | jobs: 6 | run-databricks-notebook: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout repo 10 | uses: actions/checkout@v2 11 | - name: Run a databricks notebook 12 | uses: databricks/run-notebook@v0 13 | with: 14 | local-notebook-path: RUNME.py 15 | git-commit: ${{ github.event.pull_request.head.sha }} 16 | databricks-host: https://adb-984752964297111.11.azuredatabricks.net 17 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_MSA }} 18 | new-cluster-json: > 19 | { 20 | "num_workers": 0, 21 | "spark_version": "10.4.x-scala2.12", 22 | "node_type_id": "Standard_DS3_v2", 23 | "azure_attributes": { 24 | "availability": "ON_DEMAND_AZURE" 25 | }, 26 | "spark_conf": { 27 | "spark.master": "local[*, 4]", 28 | "spark.databricks.cluster.profile": "singleNode" 29 | }, 30 | "custom_tags": { 31 | "ResourceClass": "SingleNode" 32 | } 33 | 34 | } 35 | notebook-params-json: > 36 | { 37 | "run_job": "True" 38 | } 39 | access-control-list-json: > 40 | [ 41 | { 42 | "group_name": "users", 43 | "permission_level": "CAN_VIEW" 44 | } 45 | ] -------------------------------------------------------------------------------- /.github/workflows/integration-test-gcp-push.yml: -------------------------------------------------------------------------------- 1 | name: GCP integration test push 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | - web-sync 9 | 10 | jobs: 11 | run-databricks-notebook: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout repo 15 | uses: actions/checkout@v2 16 | - name: Run a databricks notebook 17 | uses: databricks/run-notebook@v0 18 | with: 19 | local-notebook-path: RUNME.py 20 | git-commit: ${{ github.sha }} 21 | databricks-host: https://416411475796958.8.gcp.databricks.com 22 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_GCP }} 23 | new-cluster-json: > 24 | { 25 | "num_workers": 0, 26 | "spark_version": "10.4.x-scala2.12", 27 | "node_type_id": "n1-highmem-4", 28 | "gcp_attributes": { 29 | "availability": "ON_DEMAND_GCP" 30 | }, 31 | "spark_conf": { 32 | "spark.master": "local[*, 4]", 33 | "spark.databricks.cluster.profile": "singleNode" 34 | }, 35 | "custom_tags": { 36 | "ResourceClass": "SingleNode" 37 | } 38 | } 39 | notebook-params-json: > 40 | { 41 | "run_job": "True" 42 | } 43 | access-control-list-json: > 44 | [ 45 | { 46 | "group_name": "users", 47 | "permission_level": "CAN_VIEW" 48 | } 49 | ] -------------------------------------------------------------------------------- /.github/workflows/integration-test-msa-push.yml: -------------------------------------------------------------------------------- 1 | name: MSA integration test push 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - web-sync 8 | 9 | jobs: 10 | run-databricks-notebook: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout repo 14 | 
uses: actions/checkout@v2 15 | - name: Run a databricks notebook 16 | uses: databricks/run-notebook@v0 17 | with: 18 | local-notebook-path: RUNME.py 19 | git-commit: ${{ github.sha }} 20 | databricks-host: https://adb-984752964297111.11.azuredatabricks.net 21 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_MSA }} 22 | new-cluster-json: > 23 | { 24 | "num_workers": 0, 25 | "spark_version": "10.4.x-scala2.12", 26 | "node_type_id": "Standard_D3_v2", 27 | "azure_attributes": { 28 | "availability": "ON_DEMAND_AZURE" 29 | }, 30 | "spark_conf": { 31 | "spark.master": "local[*, 4]", 32 | "spark.databricks.cluster.profile": "singleNode" 33 | }, 34 | "custom_tags": { 35 | "ResourceClass": "SingleNode" 36 | } 37 | } 38 | notebook-params-json: > 39 | { 40 | "run_job": "True" 41 | } 42 | access-control-list-json: > 43 | [ 44 | { 45 | "group_name": "users", 46 | "permission_level": "CAN_VIEW" 47 | } 48 | ] -------------------------------------------------------------------------------- /06_DNS_Analytics_ScoreDomain.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md Read Parameterized inputs 8 | 9 | # COMMAND ---------- 10 | 11 | dbutils.widgets.removeAll() 12 | dbutils.widgets.text("DomainName","","01. Domain to be scored") 13 | 14 | # COMMAND ---------- 15 | 16 | domain=dbutils.widgets.get("DomainName") 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md Download Databricks Trained DGA Detection model file for scoring 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %sh 25 | # MAGIC if [ ! -d /tmp/dga_model ]; then 26 | # MAGIC mkdir -p /tmp/dga_model 27 | # MAGIC curl -o /tmp/dga_model/python_model.pkl https://raw.githubusercontent.com/zaferbil/dns-notebook-datasets/master/model/python_model.pkl 28 | # MAGIC curl -o /tmp/dga_model/MLmodel https://raw.githubusercontent.com/zaferbil/dns-notebook-datasets/master/model/MLmodel 29 | # MAGIC curl -o /tmp/dga_model/conda.yaml https://raw.githubusercontent.com/zaferbil/dns-notebook-datasets/master/model/conda.yaml 30 | # MAGIC fi 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md Load the model using mlflow 35 | 36 | # COMMAND ---------- 37 | 38 | # Load the DGA model. 39 | 40 | # this is an optimization to not to reload model on evey invocation! 41 | import json 42 | ctx = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()) 43 | if spark.conf.get(f"dga_model_is_loaded_{ctx['extraContext']['notebook_path']}", "false") == "false": 44 | 45 | import mlflow 46 | import mlflow.pyfunc 47 | 48 | # you can change to your own path copied from the output of 4th notebook 49 | model_path = 'dbfs:/FileStore/tables/dga_model' 50 | dbutils.fs.cp("file:/tmp/dga_model/", model_path, True) 51 | print(f"loading model from {model_path}") 52 | loaded_model = mlflow.pyfunc.load_model(model_path) 53 | spark.conf.set(f"dga_model_is_loaded_{ctx['extraContext']['notebook_path']}", "true") 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md Score the domain name with the function. 
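As an added aside (not part of the original notebook): before scoring the widget-supplied value in the next cell, it can be useful to sanity-check the loaded pyfunc model against a few hand-picked names. The domain list below is purely illustrative, and the exact labels returned are an assumption — the notebook itself only relies on `loaded_model.predict()` accepting a domain string, just like the commented-out `google.com` test at the bottom.

```python
# Hypothetical sanity check -- the domains are made up for illustration.
sample_domains = ["google.com", "databricks.com", "xkcjqtldntwz.info"]
for d in sample_domains:
    # Same call as the scoring cell below; a DGA-looking name is expected to
    # come back flagged (e.g. 'ioc'), while a benign brand name should not.
    print(f"{d} -> {loaded_model.predict(d)}")
```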
58 | 59 | # COMMAND ---------- 60 | 61 | print(f'Score for Domain {domain} is : {loaded_model.predict(domain)}') 62 | 63 | 64 | # COMMAND ---------- 65 | 66 | #print(f'Test Execution for google.com: {loaded_model.predict("google.com")}') 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .idea 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /03_DNS_Analytics_Exploring_Data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC # 3. Ad-Hoc Analytics: Exploring the data 9 | # MAGIC FINALLY!!!! We have data. And we can start poking around. 
This is an optional section for you to familiarize yourself with the data. And pick up some spark SQL tricks. You can use these tactics to explore and expand on the analytics. 10 | # MAGIC 11 | # MAGIC **Add your own queries!** 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %run ./util/Shared_Include 16 | 17 | # COMMAND ---------- 18 | 19 | # MAGIC %sql 20 | # MAGIC -- Lets take a look at the number of unique domains in our dataset 21 | # MAGIC select count(distinct(domain_name)) from silver_dns 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %sql select count(*) from silver_dns 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %sql 30 | # MAGIC -- ioc is a field we've created as a result of running the DGA model. If the ioc field has a value of ioc, it means that the DGA model has determined the domain to be an ioc (indicator of compromise) 31 | # MAGIC -- The query below is for a total count of rows where the DGA algorithm has detected an ioc. But excludes an domains that have the string 'ip' in it and has a domain name length of more than 10 characters 32 | # MAGIC select count(*), domain_name, country 33 | # MAGIC from silver_dns 34 | # MAGIC where ioc = 'ioc' and domain_name not like '%ip%' and char_length(domain_name) > 8 35 | # MAGIC group by domain_name, country 36 | # MAGIC order by count(*) desc 37 | 38 | # COMMAND ---------- 39 | 40 | # MAGIC %md 41 | # MAGIC Let us check against the known threat feeds 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %sql 46 | # MAGIC -- Query for domains in the silver.dns, silver.EnrichedThreatFeeds tables where there is an ioc match. 47 | # MAGIC -- You may have experienced: many to many match/join is compute cost prohibitive in most SIEM/log aggregation systems. Spark SQL is a lot more efficient. 48 | # MAGIC select count(distinct(domain_name)) 49 | # MAGIC from silver_dns, silver_threat_feeds 50 | # MAGIC where silver_dns.domain_name == silver_threat_feeds.domain 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %sql 55 | # MAGIC -- Query for ioc matches across multiple tables. Similar to previous example but with additional columns in the results table 56 | # MAGIC select domain_name, rrname, country, time_first, time_last, ioc,rrtype,rdata,bailiwick, silver_threat_feeds.* 57 | # MAGIC from silver_dns, silver_threat_feeds 58 | # MAGIC where silver_dns.domain_name == silver_threat_feeds.domain and ioc='ioc' 59 | 60 | # COMMAND ---------- 61 | 62 | # MAGIC %sql 63 | # MAGIC -- Looking for specific rrnames in multiple tables. 64 | # MAGIC select domain_name, rrname, country, time_first, time_last, ioc,rrtype,rdata,bailiwick, silver_threat_feeds.* 65 | # MAGIC from silver_dns, silver_threat_feeds 66 | # MAGIC where silver_dns.domain_name == silver_threat_feeds.domain and (silver_dns.rrname = "ns1.asdklgb.cf." OR silver_dns.rrname LIKE "%cn.") 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %sql describe table silver_threat_feeds 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %sql describe table silver_dns 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | [Project Name] 2 | 3 | Copyright (2022) Databricks, Inc. 4 | 5 | This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant 6 | to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). 
The Object Code version of the 7 | Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, 8 | Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform 9 | Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at 10 | all times with any restrictions applicable to the Downloadable Services and Subscription Services, generally, and must be used in 11 | accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information 12 | under the Agreement. 13 | 14 | Additionally, and notwithstanding anything in the Agreement to the contrary: 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 16 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 17 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 18 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | * you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the 20 | Software. For the avoidance of doubt, you may not make derivative works of Software (or make any any changes to the Source Code 21 | version of the unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license 22 | agreement)). 23 | 24 | If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile 25 | the Source Code of the Software. 26 | 27 | This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. Additionally, 28 | Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all 29 | copies thereof (including the Source Code). 30 | 31 | Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with 32 | respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks 33 | Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee 34 | has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services. 35 | 36 | Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used. 37 | 38 | Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company. 39 | 40 | Object Code: is version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and 41 | executable machine code. 42 | 43 | Source Code: the human readable portion of the Software. 
44 | -------------------------------------------------------------------------------- /02_DNS_Analytics_Enrichment.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC # 2. Loading the data 9 | # MAGIC We admit, that felt like a lot of work to prep URLHaus and dnstwist. But we are now ready for typosquatting detection and threat intel enrichment. 10 | # MAGIC 11 | # MAGIC Now, we can enrich the pDNS data with tldextract, GeoIP lookups, a DGA Classifier, URLHaus, threat intel lookups. 12 | # MAGIC We will do this using Spark SQL. 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %run ./util/Shared_Include 17 | 18 | # COMMAND ---------- 19 | 20 | # Create user defined functions (UDF) for loading and manipulating the Geo data 21 | # The code here will perform Geo-IP lookups using the ip address available in the rdata field in our bronze table 22 | # We use a free geo database from Maxmind: https://dev.maxmind.com/geoip/geoip2/geolite2/ 23 | import geoip2.errors 24 | from geoip2 import database 25 | 26 | import pandas as pd 27 | 28 | from pyspark.sql.functions import pandas_udf 29 | from pyspark import SparkFiles 30 | 31 | # You can download this database from: https://dev.maxmind.com/geoip/geoip2/geolite2/ 32 | # You can upload the GeoLite2_City database file by using the databricks UI. 33 | # Databricks Navigator (lefthand bar) -> Data -> Upload File -> Select 34 | # Note if you receive an error here, you need to check exact location and adjust it 35 | city_db = f'{get_default_path()}/datasets/GeoLite2_City.mmdb' 36 | 37 | if not dbfs_file_exists(city_db): 38 | raise Exception(f'Please download GeoLite2_City database and put into {city_db}') 39 | 40 | def extract_geoip_data(ip: str, geocity): 41 | print(ip) 42 | if ip: 43 | try: 44 | record = geocity.city(ip) 45 | return {'city': record.city.name, 'country': record.country.name, 'country_code': record.country.iso_code} 46 | except geoip2.errors.AddressNotFoundError: 47 | pass 48 | 49 | return {'city': None, 'country': None, 'country_code': None} 50 | 51 | @pandas_udf("city string, country string, country_code string") 52 | def get_geoip_data(ips: pd.Series) -> pd.DataFrame: 53 | # TODO: re-think that into more portable, as this may not work on CE 54 | geocity = database.Reader(f'/dbfs{city_db}') 55 | extracted = ips.apply(lambda ip: extract_geoip_data(ip, geocity)) 56 | 57 | return pd.DataFrame(extracted.values.tolist()) 58 | 59 | spark.udf.register("get_geoip_data", get_geoip_data) 60 | 61 | # COMMAND ---------- 62 | 63 | # Load the DGA model. This is a pre-trained model that we will use to enrich our incoming DNS events. You will see how to train this model in a later step. 
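# (Added context; partly an assumption about the bundled artifact: the model directory is an
# MLflow "pyfunc" model, so mlflow.pyfunc.load_model() returns a generic wrapper whose
# .predict() maps a domain string to a label -- downstream queries filter on the value 'ioc'.
# Registering .predict as a Spark UDF below is what lets the enrichment step call the model
# from DataFrame expressions and Spark SQL while building silver_dns.)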
64 | import mlflow 65 | import mlflow.pyfunc 66 | 67 | model_path = f'dbfs:{get_default_path()}/model' 68 | loaded_model = mlflow.pyfunc.load_model(model_path) 69 | ioc_detect_udf = spark.udf.register("ioc_detect", loaded_model.predict) 70 | 71 | # COMMAND ---------- 72 | 73 | # Filtering on the rrtype of A 74 | dns_table = (spark.table("bronze_dns") 75 | .selectExpr("*", "case when rrtype = 'A' then element_at(rdata, 1) else null end as ip_address ") 76 | ) 77 | 78 | # COMMAND ---------- 79 | 80 | #Enrich the data with city, country, country codes, ioc and domain name 81 | dns_table_enriched = dns_table.withColumn("geoip_data", get_geoip_data(dns_table.ip_address))\ 82 | .selectExpr("*", "geoip_data.*", 83 | "case when char_length(domain_extract(rrname)) > 5 then ioc_detect(string(domain_extract(rrname))) else null end as ioc", 84 | "domain_extract(rrname) as domain_name").drop("geoip_data") 85 | 86 | # COMMAND ---------- 87 | 88 | # Persist the enriched DNS data 89 | (dns_table_enriched.write 90 | .format("delta") 91 | .mode('overwrite') 92 | .option("mergeSchema", True) 93 | .saveAsTable('silver_dns') 94 | ) 95 | 96 | # COMMAND ---------- 97 | 98 | # MAGIC %sql 99 | # MAGIC /* We check to see how many records we have loaded */ 100 | # MAGIC select count(*) from silver_dns 101 | -------------------------------------------------------------------------------- /util/Shared_Include.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 
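For orientation (an added note, not part of the original helper): the analytics notebooks pull this file in with `%run ./util/Shared_Include`, which gives them a per-user database, a per-user DBFS working directory, the `domain_extract`/`registered_domain_extract` UDFs, and a helper that loads the DGA model. A minimal, hypothetical caller-side sketch — the printed values are only examples of what the helpers return:

```python
# In any notebook, after a `%run ./util/Shared_Include` cell:
print(get_default_database())   # e.g. 'jane_doe_dns' -- per-user database used by all notebooks
print(get_default_path())       # e.g. '/tmp/jane_doe/dns_analytics' -- per-user DBFS working directory
dga = get_and_register_ioc_detect_model()   # loads the DGA model and registers the 'ioc_detect' SQL UDF
print(dga.predict("example.com"))           # same pyfunc predict() used throughout the notebooks
```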
4 | 5 | # COMMAND ---------- 6 | 7 | # install our libraries 8 | %pip install tldextract dnstwist geoip2 9 | 10 | # COMMAND ---------- 11 | 12 | import re 13 | import os 14 | 15 | current_user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().get("user").get() 16 | 17 | def get_user_prefix(): 18 | return re.sub(r'[^A-Za-z0-9_]', '_', re.sub(r'^([^@]+)(@.*)?$', r'\1', current_user_name)) 19 | 20 | current_user_name_prefix = get_user_prefix() 21 | 22 | def get_default_path(): 23 | return f'/tmp/{current_user_name_prefix}/dns_analytics' 24 | 25 | try: 26 | dbutils.fs.mkdirs(get_default_path()) 27 | except: 28 | pass 29 | 30 | def dbfs_file_exists(path: str): 31 | try: 32 | dbutils.fs.ls(path) 33 | except Exception as e: 34 | return False 35 | 36 | return True 37 | 38 | if dbfs_file_exists('dbfs:/FileStore/dns_analytics/GeoLite2-City.mmdb') and not dbfs_file_exists(f'dbfs:{get_default_path()}/datasets/GeoLite2_City.mmdb'): 39 | dbutils.fs.cp('dbfs:/FileStore/dns_analytics/GeoLite2-City.mmdb', f'{get_default_path()}/datasets/GeoLite2_City.mmdb') 40 | 41 | if dbfs_file_exists(f'dbfs:{get_default_path()}/datasets/GeoLite2_City.mmdb'): 42 | sc.addFile(f'dbfs:{get_default_path()}/datasets/GeoLite2_City.mmdb') 43 | 44 | def get_default_database(): 45 | return f'{current_user_name_prefix}_dns' 46 | 47 | spark.sql(f'create database if not exists {get_default_database()}') 48 | spark.sql(f'use {get_default_database()}') 49 | 50 | print(f'Default database: {get_default_database()}') 51 | print(f'Files are stored in {get_default_path()}') 52 | 53 | # COMMAND ---------- 54 | 55 | spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %python 60 | # MAGIC # We will extract the registered_domain_extract and domain_extract fields from the URLHaus feeds. 61 | # MAGIC import tldextract 62 | # MAGIC import numpy as np 63 | # MAGIC 64 | # MAGIC def registered_domain_extract(uri): 65 | # MAGIC ext = tldextract.extract(uri) 66 | # MAGIC if (not ext.suffix): 67 | # MAGIC return " " 68 | # MAGIC else: 69 | # MAGIC return ext.registered_domain 70 | # MAGIC 71 | # MAGIC def domain_extract(uri): 72 | # MAGIC ext = tldextract.extract(uri) 73 | # MAGIC if (not ext.suffix): 74 | # MAGIC return " " 75 | # MAGIC else: 76 | # MAGIC return ext.domain 77 | # MAGIC 78 | # MAGIC #The next three lines are registering our user defined functions(UDF) in the Databricks runtime environment 79 | # MAGIC registered_domain_extract = spark.udf.register("registered_domain_extract", registered_domain_extract) 80 | # MAGIC domain_extract_udf = spark.udf.register("domain_extract", domain_extract) 81 | 82 | # COMMAND ---------- 83 | 84 | #Load the DGA model. This is a pre-trained model that we will use to enrich our incoming DNS events. You will see how to train this model in a later step. 
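# (Added note: get_and_register_ioc_detect_model() below prefers a user-trained model under
# .../new_model/dga_model when one exists -- presumably written out by notebook
# 04_DNS_Analytics_Data_Science -- and otherwise falls back to the pre-trained model shipped
# with the sample datasets. Either way it registers the 'ioc_detect' UDF used by the SQL and
# streaming notebooks.)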
85 | import mlflow 86 | import mlflow.pyfunc 87 | 88 | def get_and_register_ioc_detect_model(): 89 | if dbfs_file_exists(f'dbfs:{get_default_path()}/new_model/dga_model'): 90 | model_path = f'dbfs:{get_default_path()}/new_model/dga_model' 91 | else: 92 | model_path = f'dbfs:{get_default_path()}/model' 93 | print(f"Loading model from {model_path}") 94 | loaded_model = mlflow.pyfunc.load_model(model_path) 95 | spark.udf.register("ioc_detect", loaded_model.predict) 96 | return loaded_model 97 | 98 | # COMMAND ---------- 99 | 100 | def cleanup_files_and_database(): 101 | try: 102 | dbutils.fs.rm(get_default_path(), True) 103 | except: 104 | pass 105 | try: 106 | spark.sql(f'drop database if exists {get_default_database()} cascade') 107 | except: 108 | pass 109 | try: 110 | from mlflow.tracking.client import MlflowClient 111 | client = MlflowClient() 112 | model_name = f"{get_user_prefix()}_dns_dga" 113 | client.delete_registered_model(model_name) 114 | except: 115 | pass 116 | try: 117 | dbutils.fs.rm("file:/tmp/dns-notebook-datasets", True) 118 | except: 119 | pass 120 | 121 | # COMMAND ---------- 122 | 123 | import mlflow 124 | experiment_name = f"/Users/{current_user_name}/dns_analytics_experiment" 125 | mlflow.set_experiment(experiment_name) 126 | -------------------------------------------------------------------------------- /05_DNS_Analytics_Streaming.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC # 5. Near Realtime Streaming Analytics 9 | # MAGIC Enrich data with threat intel and Detect malicious activity in real-time using the analytics and enrichments 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %run ./util/Shared_Include 14 | 15 | # COMMAND ---------- 16 | 17 | # Defining the schema for pDNS. 18 | # You can use either the python style syntax or the SQL DDL syntax to define your schema. 19 | 20 | # from pyspark.sql.types import StructType, StructField, StringType, LongType, StringType, ArrayType 21 | # pdns_schema = (StructType() 22 | # .add("rrname", StringType(), True) 23 | # .add("rrtype", StringType(), True) 24 | # .add("time_first", LongType(), True) 25 | # .add("time_last", LongType(), True) 26 | # .add("count", LongType(), True) 27 | # .add("bailiwick", StringType(), True) 28 | # .add("rdata", ArrayType(StringType(), True), True) 29 | # ) 30 | 31 | pdns_schema = """ 32 | rrname string, 33 | rrtype string, 34 | time_first long, 35 | time_last long, 36 | count long, 37 | bailiwick string, 38 | rdata array 39 | """ 40 | 41 | # COMMAND ---------- 42 | 43 | # Load the DGA model from before and make available as a UDF so we can apply it to our dataframe. 44 | import mlflow 45 | import mlflow.pyfunc 46 | 47 | model_path = f'dbfs:{get_default_path()}/model' 48 | loaded_model = mlflow.pyfunc.load_model(model_path) 49 | ioc_detect_udf = spark.udf.register("ioc_detect", loaded_model.predict) 50 | 51 | # COMMAND ---------- 52 | 53 | # Load test data set 54 | # Setting maxFilesPerTrigger to 1 to simulate streaming from a static set of files. You wouldn't normally add this option in production. 
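# (Added note; an assumption about production setups rather than part of this demo: a real
# deployment would typically stream pDNS from a message bus or use Databricks Auto Loader
# (format "cloudFiles") against the landing location, and would drop maxFilesPerTrigger --
# the option is only here so the static sample files arrive one file per micro-batch.)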
55 | df=(spark.readStream 56 | .option("maxFilesPerTrigger", 1) 57 | .json(f"{get_default_path()}/datasets/latest/", schema=pdns_schema) 58 | .withColumn("isioc", ioc_detect_udf(domain_extract_udf("rrname"))) 59 | .withColumn("domain", domain_extract_udf("rrname")) 60 | ) 61 | df.createOrReplaceTempView("dns_latest_stream") 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md 66 | # MAGIC ##6.1 Find threats in DNS Event Stream 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %sql 71 | # MAGIC 72 | # MAGIC SELECT * FROM dns_latest_stream WHERE isioc = 'ioc' 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %sql 77 | # MAGIC -- Phishing or Typosquating? 78 | # MAGIC -- This is where we do typosquatting detection 79 | # MAGIC -- By using dnstwist, we find the suspicious domain, googlee 80 | # MAGIC SELECT silver_twisted_domain_brand.* FROM dns_latest_stream, silver_twisted_domain_brand 81 | # MAGIC WHERE silver_twisted_domain_brand.dnstwisted_domain = dns_latest_stream.domain 82 | 83 | # COMMAND ---------- 84 | 85 | # The next few lines we will be applying our models: 86 | # - To detect the bad domains 87 | # - Create an alerts table 88 | dns_stream_iocs = spark.sql("Select * from dns_latest_stream where isioc = 'ioc'") 89 | # dbutils.fs.rm('dbfs:/tmp/datasets/gold/delta/DNS_IOC_Latest', True) 90 | # spark.sql("drop table if exists DNS_IOC_Latest") 91 | (dns_stream_iocs.writeStream 92 | .format("delta") 93 | .outputMode("append") 94 | .option("checkpointLocation", f"{get_default_path()}/_checkpoints/DNS_IOC_Latest") 95 | .table("DNS_IOC_Latest") 96 | ) 97 | 98 | # COMMAND ---------- 99 | 100 | # MAGIC %md 101 | # MAGIC ## 7. Agent Tesla 102 | # MAGIC Success!!! 103 | # MAGIC - We used the DGA detection model on streaming DNS events, 104 | # MAGIC - Identified a suspicious domain (ioc) in our DNS logs, 105 | # MAGIC - Enriched the ioc with URLHaus 106 | # MAGIC - We can we can see that it this DGA domain is serving up agent tesla 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %sql 111 | # MAGIC -- We found the bad domain - lets see if our enriched threat feeds have intel on this domain? 112 | # MAGIC select * from silver_threat_feeds 113 | # MAGIC where silver_threat_feeds.domain = domain_extract('ns1.asdklgb.cf.') 114 | 115 | # COMMAND ---------- 116 | 117 | # Uncomment this line to remove database & all files 118 | # cleanup_files_and_database() 119 | 120 | # COMMAND ---------- 121 | 122 | # let it execute for 10 more minutes 123 | import time 124 | time.sleep(600) 125 | # Please stop all your streams before you go. This will ensure clusters can timeout and shutdown after class. 126 | for s in spark.streams.active: 127 | s.stop() 128 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Detecting Criminals and Nation States through DNS Analytics 2 | 3 | You are a security practitioner, a data scientist or a security data engineer; you’ve seen the Large Scale Threat Detection and Response talk with Databricks . But you're wondering, “how can I try Databricks in my own security operations?” In this blog post, you will learn how to detect a remote access trojan using passive DNS (pDNS) and threat intel. Along the way, you’ll learn how to store, and analyze DNS data using Delta, Spark and MLFlow. As you well know, APT’s and cyber criminals are known to utilize DNS. 
Threat actors use the DNS protocol for command and control or beaconing or resolution of attacker domains. This is why academic researchers and industry groups advise security teams to collect and analyze DNS events to hunt, detect, investigate and respond to threats. But you know, it's not as easy as it sounds. 4 | 5 | The Complexity, cost, and limitations of legacy technology make detecting DNS security threats challenging for most enterprise organizations. 6 | 7 | 8 | 9 | ## Detecting AgentTeslaRAT with Databricks 10 | Using the notebooks on this solution accelerator, you will be able to detect the Agent Tesla RAT. You will be using analytics for domain generation algorithms (DGA), typosquatting and threat intel enrichments from URLhaus. Along the way you will learn the Databricks concepts of: 11 | 12 | * Data ingestion 13 | * Ad hoc analytics 14 | * How to enrich event data, such as DNS queries 15 | * Model building and 16 | * Batch and Streaming analytics 17 | 18 | Why use Databricks for this? Because the hardest thing about security analytics aren’t the analytics. You already know that analyzing large scale DNS traffic logs is complicated. Colleagues in the security community tell us that the challenges fall into three categories: 19 | 20 | ## Deployment complexity 21 | DNS server data is everywhere. Cloud, hybrid, and multi-cloud deployments make it challenging to collect the data, have a single data store and run analytics consistently across the entire deployment. 22 | Tech limitations: Legacy SIEM and log aggregation solutions can’t scale to cloud data volumes for storage, analytics or ML/AI workloads. Especially, when it comes to joining data like threat intel enrichments. 23 | Cost: SIEMs or log aggregation systems charge by volume of data ingest. With so much data SIEM/log licensing and hardware requirements make DNS analytics cost prohibitive. And moving data from one cloud service provider to another is also costly and time consuming. The hardware pre-commit in the cloud or the expense of physical hardware on-prem are all deterrents for security teams. 24 | In order to address these issues, security teams need a real-time data analytics platform that can handle cloud-scale, analyze data wherever it is, natively support streaming and batch analytics and, have collaborative, content development capabilities. And… if someone could make this entire system elastic to prevent hardware commits… now wouldn’t that be cool! 25 | 26 | You can use this notebook in the Databricks community edition or in your own Databricks deployment. There are lot of lines here but the high level flow is this: 27 | 28 | * Read passive DNS data from AWS S3 bucket 29 | * Specify the schema for DNS and load the data into Delta 30 | * Explore the data with string matches 31 | * Build the DGA detection model. Build the typosquatting model. 32 | * Enrich the output of the DGA and typosquatting with threat intel from URLhaus 33 | * Run the analytics and detect the AgentTesla RAT 34 | 35 | 36 | 37 | ## Getting started 38 | 39 | Although specific solutions can be downloaded as .dbc archives from our websites, we recommend cloning these repositories onto your databricks environment. Not only will you get access to latest code, but you will be part of a community of experts driving industry best practices and re-usable solutions, influencing our respective industries. 40 | 41 | add_repo 42 | 43 | To start using a solution accelerator in Databricks simply follow these steps: 44 | 45 | 1. 
Clone solution accelerator repository in Databricks using [Databricks Repos](https://www.databricks.com/product/repos) 46 | 2. Attach the `RUNME` notebook to any cluster and execute the notebook via Run-All. A multi-step-job describing the accelerator pipeline will be created, and the link will be provided. The job configuration is written in the RUNME notebook in json format. 47 | 3. Execute the multi-step-job to see how the pipeline runs. 48 | 4. You might want to modify the samples in the solution accelerator to your need, collaborate with other users and run the code samples against your own data. To do so start by changing the Git remote of your repository to your organization’s repository vs using our samples repository (learn more). You can now commit and push code, collaborate with other user’s via Git and follow your organization’s processes for code development. 49 | 50 | The cost associated with running the accelerator is the user's responsibility. 51 | 52 | 53 | ## Project support 54 | 55 | Please note the code in this project is provided for your exploration only, and are not formally supported by Databricks with Service Level Agreements (SLAs). They are provided AS-IS and we do not make any guarantees of any kind. Please do not submit a support ticket relating to any issues arising from the use of these projects. The source in this project is provided subject to the Databricks [License](./LICENSE). All included or referenced third party libraries are subject to the licenses set forth below. 56 | 57 | Any issues discovered through the use of this project should be filed as GitHub Issues on the Repo. They will be reviewed as time permits, but there are no formal SLAs for support. -------------------------------------------------------------------------------- /RUNME.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md This notebook sets up the companion cluster(s) to run the solution accelerator. It also creates the Workflow to illustrate the order of execution. Happy exploring! 3 | # MAGIC 🎉 4 | # MAGIC 5 | # MAGIC **Steps** 6 | # MAGIC 1. Simply attach this notebook to a cluster and hit Run-All for this notebook. A multi-step job and the clusters used in the job will be created for you and hyperlinks are printed on the last block of the notebook. 7 | # MAGIC 8 | # MAGIC 2. Run the accelerator notebooks: Feel free to explore the multi-step job page and **run the Workflow**, or **run the notebooks interactively** with the cluster to see how this solution accelerator executes. 9 | # MAGIC 10 | # MAGIC 2a. **Run the Workflow**: Navigate to the Workflow link and hit the `Run Now` 💥. 11 | # MAGIC 12 | # MAGIC 2b. **Run the notebooks interactively**: Attach the notebook with the cluster(s) created and execute as described in the `job_json['tasks']` below. 13 | # MAGIC 14 | # MAGIC **Prerequisites** 15 | # MAGIC 1. You need to have cluster creation permissions in this workspace. 16 | # MAGIC 17 | # MAGIC 2. In case the environment has cluster-policies that interfere with automated deployment, you may need to manually create the cluster in accordance with the workspace cluster policy. The `job_json` definition below still provides valuable information about the configuration these series of notebooks should run with. 18 | # MAGIC 19 | # MAGIC **Notes** 20 | # MAGIC 1. The pipelines, workflows and clusters created in this script are not user-specific. 
Keep in mind that rerunning this script again after modification resets them for other users too. 21 | # MAGIC 22 | # MAGIC 2. If the job execution fails, please confirm that you have set up other environment dependencies as specified in the accelerator notebooks. Accelerators may require the user to set up additional cloud infra or secrets to manage credentials. 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 0,Install util packages 27 | # MAGIC %pip install git+https://github.com/databricks-academy/dbacademy@v1.0.13 git+https://github.com/databricks-industry-solutions/notebook-solution-companion@safe-print-html --quiet --disable-pip-version-check 28 | 29 | # COMMAND ---------- 30 | 31 | from solacc.companion import NotebookSolutionCompanion 32 | 33 | # COMMAND ---------- 34 | 35 | job_json = { 36 | "timeout_seconds": 14400, 37 | "max_concurrent_runs": 1, 38 | "tags": { 39 | "usage": "solacc_automation", 40 | "group": "SEC" 41 | }, 42 | "tasks": [ 43 | { 44 | "job_cluster_key": "dns_cluster", 45 | "notebook_task": { 46 | "notebook_path": f"00_README" 47 | }, 48 | "task_key": "dns_01" 49 | }, 50 | { 51 | "job_cluster_key": "dns_cluster", 52 | "notebook_task": { 53 | "notebook_path": f"01_DNS_Analytics_Ingest" 54 | }, 55 | "task_key": "dns_02", 56 | "depends_on": [ 57 | { 58 | "task_key": "dns_01" 59 | } 60 | ] 61 | }, 62 | { 63 | "job_cluster_key": "dns_cluster", 64 | "notebook_task": { 65 | "notebook_path": f"02_DNS_Analytics_Enrichment" 66 | }, 67 | "task_key": "dns_03", 68 | "depends_on": [ 69 | { 70 | "task_key": "dns_02" 71 | } 72 | ] 73 | }, 74 | { 75 | "job_cluster_key": "dns_cluster", 76 | "notebook_task": { 77 | "notebook_path": f"03_DNS_Analytics_Exploring_Data" 78 | }, 79 | "task_key": "dns_04", 80 | "depends_on": [ 81 | { 82 | "task_key": "dns_03" 83 | } 84 | ] 85 | }, 86 | { 87 | "job_cluster_key": "dns_cluster", 88 | "notebook_task": { 89 | "notebook_path": f"04_DNS_Analytics_Data_Science" 90 | }, 91 | "task_key": "dns_05", 92 | "depends_on": [ 93 | { 94 | "task_key": "dns_04" 95 | } 96 | ] 97 | }, 98 | { 99 | "job_cluster_key": "dns_cluster", 100 | "notebook_task": { 101 | "notebook_path": f"05_DNS_Analytics_Streaming" 102 | }, 103 | "task_key": "dns_06", 104 | "depends_on": [ 105 | { 106 | "task_key": "dns_05" 107 | } 108 | ] 109 | }, 110 | { 111 | "job_cluster_key": "dns_cluster", 112 | "notebook_task": { 113 | "notebook_path": f"06_DNS_Analytics_ScoreDomain" 114 | }, 115 | "task_key": "dns_07", 116 | "depends_on": [ 117 | { 118 | "task_key": "dns_06" 119 | } 120 | ] 121 | } 122 | ], 123 | "job_clusters": [ 124 | { 125 | "job_cluster_key": "dns_cluster", 126 | "new_cluster": { 127 | "spark_version": "12.2.x-cpu-ml-scala2.12", 128 | "spark_conf": { 129 | "spark.databricks.delta.formatCheck.enabled": "false" 130 | }, 131 | "num_workers": 2, 132 | "node_type_id": {"AWS": "i3.xlarge", "MSA": "Standard_DS3_v2", "GCP": "n1-highmem-4"}, # different from standard API 133 | "custom_tags": { 134 | "usage": "solacc_automation" 135 | }, 136 | } 137 | } 138 | ] 139 | } 140 | 141 | 142 | # COMMAND ---------- 143 | 144 | dbutils.widgets.dropdown("run_job", "False", ["True", "False"]) 145 | run_job = dbutils.widgets.get("run_job") == "True" 146 | NotebookSolutionCompanion().deploy_compute(job_json, run_job=run_job) 147 | 148 | # COMMAND ---------- 149 | 150 | 151 | -------------------------------------------------------------------------------- /00_README.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC 
%md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC # Detecting Criminals and Nation States through DNS Analytics 10 | # MAGIC 11 | # MAGIC You are a security practitioner, a data scientist or a security data engineer; you’ve seen the Large Scale Threat Detection and Response talk with Databricks . But you're wondering, “how can I try Databricks in my own security operations?” In this blog post, you will learn how to detect a remote access trojan using passive DNS (pDNS) and threat intel. Along the way, you’ll learn how to store, and analyze DNS data using Delta, Spark and MLFlow. As you well know, APT’s and cyber criminals are known to utilize DNS. Threat actors use the DNS protocol for command and control or beaconing or resolution of attacker domains. This is why academic researchers and industry groups advise security teams to collect and analyze DNS events to hunt, detect, investigate and respond to threats. But you know, it's not as easy as it sounds. 12 | # MAGIC 13 | # MAGIC The Complexity, cost, and limitations of legacy technology make detecting DNS security threats challenging for most enterprise organizations. 14 | # MAGIC 15 | # MAGIC 16 | # MAGIC 17 | # MAGIC ## Detecting AgentTeslaRAT with Databricks 18 | # MAGIC Using the notebooks on this solution accelerator, you will be able to detect the Agent Tesla RAT. You will be using analytics for domain generation algorithms (DGA), typosquatting and threat intel enrichments from URLhaus. Along the way you will learn the Databricks concepts of: 19 | # MAGIC 20 | # MAGIC * Data ingestion 21 | # MAGIC * Ad hoc analytics 22 | # MAGIC * How to enrich event data, such as DNS queries 23 | # MAGIC * Model building and 24 | # MAGIC * Batch and Streaming analytics 25 | # MAGIC 26 | # MAGIC Why use Databricks for this? Because the hardest thing about security analytics aren’t the analytics. You already know that analyzing large scale DNS traffic logs is complicated. Colleagues in the security community tell us that the challenges fall into three categories: 27 | # MAGIC 28 | # MAGIC ## Deployment complexity 29 | # MAGIC DNS server data is everywhere. Cloud, hybrid, and multi-cloud deployments make it challenging to collect the data, have a single data store and run analytics consistently across the entire deployment. 30 | # MAGIC Tech limitations: Legacy SIEM and log aggregation solutions can’t scale to cloud data volumes for storage, analytics or ML/AI workloads. Especially, when it comes to joining data like threat intel enrichments. 31 | # MAGIC Cost: SIEMs or log aggregation systems charge by volume of data ingest. With so much data SIEM/log licensing and hardware requirements make DNS analytics cost prohibitive. And moving data from one cloud service provider to another is also costly and time consuming. The hardware pre-commit in the cloud or the expense of physical hardware on-prem are all deterrents for security teams. 32 | # MAGIC In order to address these issues, security teams need a real-time data analytics platform that can handle cloud-scale, analyze data wherever it is, natively support streaming and batch analytics and, have collaborative, content development capabilities. 
And… if someone could make this entire system elastic to prevent hardware commits… now wouldn’t that be cool! 33 | # MAGIC 34 | # MAGIC You can use this notebook in the Databricks community edition or in your own Databricks deployment. There are lot of lines here but the high level flow is this: 35 | # MAGIC 36 | # MAGIC * Read passive DNS data from AWS S3 bucket 37 | # MAGIC * Specify the schema for DNS and load the data into Delta 38 | # MAGIC * Explore the data with string matches 39 | # MAGIC * Build the DGA detection model. Build the typosquatting model. 40 | # MAGIC * Enrich the output of the DGA and typosquatting with threat intel from URLhaus 41 | # MAGIC * Run the analytics and detect the AgentTesla RAT 42 | # MAGIC 43 | # MAGIC 44 | # MAGIC 45 | # MAGIC ## Getting started 46 | # MAGIC 47 | # MAGIC Although specific solutions can be downloaded as .dbc archives from our websites, we recommend cloning these repositories onto your databricks environment. Not only will you get access to latest code, but you will be part of a community of experts driving industry best practices and re-usable solutions, influencing our respective industries. 48 | # MAGIC 49 | # MAGIC add_repo 50 | # MAGIC 51 | # MAGIC To start using a solution accelerator in Databricks simply follow these steps: 52 | # MAGIC 53 | # MAGIC 1. Clone solution accelerator repository in Databricks using [Databricks Repos](https://www.databricks.com/product/repos) 54 | # MAGIC 2. Attach the `RUNME` notebook to any cluster and execute the notebook via Run-All. A multi-step-job describing the accelerator pipeline will be created, and the link will be provided. The job configuration is written in the RUNME notebook in json format. 55 | # MAGIC 3. Execute the multi-step-job to see how the pipeline runs. 56 | # MAGIC 4. You might want to modify the samples in the solution accelerator to your need, collaborate with other users and run the code samples against your own data. To do so start by changing the Git remote of your repository to your organization’s repository vs using our samples repository (learn more). You can now commit and push code, collaborate with other user’s via Git and follow your organization’s processes for code development. 57 | # MAGIC 58 | # MAGIC The cost associated with running the accelerator is the user's responsibility. 59 | # MAGIC 60 | # MAGIC 61 | # MAGIC ## Project support 62 | # MAGIC 63 | # MAGIC Please note the code in this project is provided for your exploration only, and are not formally supported by Databricks with Service Level Agreements (SLAs). They are provided AS-IS and we do not make any guarantees of any kind. Please do not submit a support ticket relating to any issues arising from the use of these projects. The source in this project is provided subject to the Databricks [License](./LICENSE). All included or referenced third party libraries are subject to the licenses set forth below. 64 | # MAGIC 65 | # MAGIC Any issues discovered through the use of this project should be filed as GitHub Issues on the Repo. They will be reviewed as time permits, but there are no formal SLAs for support. 66 | -------------------------------------------------------------------------------- /01_DNS_Analytics_Ingest.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. 
For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %scala 8 | # MAGIC displayHTML(""" 9 | # MAGIC """) 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %md 14 | # MAGIC ## Data layout 15 | # MAGIC 16 | # MAGIC In this workshop we're using prepared data. We use multiple tables to stage, schematize and store analytics results. Here is the TLDR on table naming: 17 | # MAGIC - **Bronze**: Raw data 18 | # MAGIC - **Silver**: Schematized and enriched data 19 | # MAGIC - **Gold**: Detections and alerts 20 | # MAGIC 21 | # MAGIC Why do this? Short version: so you can always go back to the source, refine your analytics over time, and never lose any data. And the long version. 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %run ./util/Shared_Include 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC 31 | # MAGIC ## Fetching the data & initial model for workshop 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC In this segment, we'll download all of the datasets we need in order to be able to run our notebook. 37 | # MAGIC These datasets include: 38 | # MAGIC * anonymized DNS data 39 | # MAGIC * a GeoIP lookup database 40 | # MAGIC * a threat feed 41 | # MAGIC * domains generated by `dnstwist` for our enrichment pipeline 42 | # MAGIC 43 | # MAGIC We also include: 44 | # MAGIC * the top 100k domains on alexa 45 | # MAGIC * a list of dictionary words 46 | # MAGIC * a list of dga domains to train a DGA model 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %sh 51 | # MAGIC if [ -d /tmp/dns-notebook-datasets ]; then 52 | # MAGIC cd /tmp/dns-notebook-datasets 53 | # MAGIC git pull 54 | # MAGIC else 55 | # MAGIC cd /tmp 56 | # MAGIC git clone --depth 1 https://github.com/zaferbil/dns-notebook-datasets.git 57 | # MAGIC fi 58 | 59 | # COMMAND ---------- 60 | 61 | # Copy the downloaded data into the FileStore for this workspace 62 | print(f'Copying datasets and model to the DBFS: {get_default_path()}') 63 | dbutils.fs.cp("file:///tmp/dns-notebook-datasets/data", f"dbfs:{get_default_path()}/datasets/",True) 64 | dbutils.fs.cp("file:///tmp/dns-notebook-datasets/model", f"dbfs:{get_default_path()}/model/",True) 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC 70 | # MAGIC ## Loading pDNS data 71 | 72 | # COMMAND ---------- 73 | 74 | # Defining the schema for pDNS. 75 | # You can use either the python style syntax or the SQL DDL syntax to define your schema. 76 | 77 | # from pyspark.sql.types import StructType, StructField, StringType, LongType, StringType, ArrayType 78 | # pdns_schema = (StructType() 79 | # .add("rrname", StringType(), True) 80 | # .add("rrtype", StringType(), True) 81 | # .add("time_first", LongType(), True) 82 | # .add("time_last", LongType(), True) 83 | # .add("count", LongType(), True) 84 | # .add("bailiwick", StringType(), True) 85 | # .add("rdata", ArrayType(StringType(), True), True) 86 | # ) 87 | 88 | pdns_schema = """ 89 | rrname string, 90 | rrtype string, 91 | time_first long, 92 | time_last long, 93 | count long, 94 | bailiwick string, 95 | rdata array 96 | """ 97 | 98 | # COMMAND ---------- 99 | 100 | # In this segment, we are specifying where the data is and what type of data it is. 
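# (Aside, not used in this demo: the README describes reading pDNS from an AWS S3 bucket. The
# same reader works against cloud storage when the cluster has access, e.g.
# spark.read.format("json").schema(pdns_schema).load("s3://<your-bucket>/pdns/") -- the bucket
# path is a placeholder. Here we read the bundled sample file from DBFS instead.)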
101 | # You can see the json format and the DBFS path that we populated in the previous step
102 | df = spark.read.format("json").schema(pdns_schema).load(f"{get_default_path()}/datasets/dns_events.json")
103 |
104 | # COMMAND ----------
105 |
106 | # The rdata field holds an array. This isn't very convenient if you want to parse or search it.
107 | # So we create a new field called rdatastr. You can see the difference between the two fields in the sample output below.
108 | from pyspark.sql.functions import col, concat_ws
109 | df_enhanced = df.withColumn("rdatastr", concat_ws(",", col("rdata")))
110 | display(df_enhanced)
111 |
112 | # COMMAND ----------
113 |
114 | # Here we specify the output format (Delta) and write the enriched data.
115 | # Once this cell completes, the raw data is available in the Bronze table, bronze_dns.
116 | df_enhanced.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("bronze_dns")
117 |
118 | # COMMAND ----------
119 |
120 | # MAGIC %md
121 | # MAGIC ## URLhaus threat feed setup
122 | # MAGIC We will be using URLhaus threat feeds with our pDNS data. This section shows you how to ingest the URLhaus feed.
123 | # MAGIC
124 | # MAGIC For this setup, we need to do two things:
125 | # MAGIC - Define field-extraction functions (`registered_domain_extract`, `domain_extract` and `suffix_extract`) so we can pull the registered domain, domain and suffix out of the URLhaus feed. This is done via [user defined functions (UDFs)](https://docs.databricks.com/spark/latest/spark-sql/udf-python.html) that are declared in the `./util/Shared_Include` notebook.
126 | # MAGIC - Create an enriched schema and save it to a silver table.
127 |
128 | # COMMAND ----------
129 |
130 | # We specify the source location of the URLhaus feed, the csv format, and declare that the csv has a header row with field labels
131 | threat_feeds_location = f"{get_default_path()}/datasets/ThreatDataFeed.txt"
132 | threat_feeds_raw = spark.read.csv(threat_feeds_location, header=True)
133 | # Display a sample so we can check that it makes sense
134 | display(threat_feeds_raw)
135 |
136 | # COMMAND ----------
137 |
138 | # We create a new enriched view by extracting the domain name from the URL using the domain_extract user defined function described in the previous section.
139 | threat_feeds_raw.createOrReplaceTempView("threat_feeds_raw")
140 | threat_feeds_enriched_df = spark.sql("""
141 | select *, domain_extract(url) as domain
142 | from threat_feeds_raw
143 | """).filter("char_length(domain) >= 2")
144 | # The sample display shows the new field "domain"
145 | display(threat_feeds_enriched_df)
146 |
147 | # COMMAND ----------
148 |
149 | # We save our new, enriched feed as a silver Delta table
150 | (threat_feeds_enriched_df.write
151 |   .format("delta")
152 |   .mode('overwrite')
153 |   .option("mergeSchema", True)
154 |   .saveAsTable("silver_threat_feeds")
155 | )
156 |
157 | # COMMAND ----------
158 |
159 | # MAGIC %md
160 | # MAGIC ## dnstwist setup for detecting lookalike domains
161 | # MAGIC We will use dnstwist to monitor lookalike domains that adversaries can use to attack you.
162 | # MAGIC Using dnstwist you can detect typosquatters, phishing attacks, fraud, and brand impersonation. Before using the remainder of section 1.b of this notebook, you will have to follow the dnstwist instructions (outside of this notebook) to create a `domains_dnstwists.csv` file. In our example (below) we generated variations for `google.com` using `dnstwist`. You can automate this for your own organization or for any organization of interest.
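# MAGIC
# MAGIC As a practical note, `dnstwist` is distributed on PyPI, so on a workstation outside this notebook the setup is typically just:
# MAGIC ```
# MAGIC pip install dnstwist
# MAGIC ```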
163 | # MAGIC 164 | # MAGIC After installing `dnstwist`, we ran:
165 | # MAGIC ```
166 | # MAGIC dnstwist --registered google.com >> domains_dnstwists.csv
167 | # MAGIC addition googlea.com 184.168.131.241 NS:ns65.domaincontrol.com MX:mailstore1.secureserver.net
168 | # MAGIC addition googleb.com 47.254.33.193 NS:ns3.dns.com
169 | # MAGIC ```
170 | # MAGIC
171 | # MAGIC We formatted domains_dnstwists.csv with a header: `PERMUTATIONTYPE,domain,meta`
172 | # MAGIC
173 | # MAGIC Once you have created `domains_dnstwists.csv`, you can continue:
174 | # MAGIC - load the dnstwisted domains
175 | # MAGIC - enrich the table with domain names (without TLDs)
176 | # MAGIC - load the `dnstwist`-enriched results into a silver table
177 | # MAGIC
178 | # MAGIC We will use these tables later to productionize typosquatting detection.
179 |
180 | # COMMAND ----------
181 |
182 | # NOTE: domains_dnstwists.csv needs to be created outside of this notebook, using the instructions from dnstwist.
183 | # Load domains_dnstwists.csv into a dataframe, brand_domains_monitored_raw_df. Note the csv reader and the header=True option.
184 | brand_domains_monitored_raw_df = spark.read.csv(f"{get_default_path()}/datasets/domains_dnstwists.csv", header=True)
185 |
186 | # COMMAND ----------
187 |
188 | # Display the csv we just read
189 | display(brand_domains_monitored_raw_df)
190 |
191 | # COMMAND ----------
192 |
193 | # Register the dataframe as a temporary view called brand_domains_monitored_raw
194 | brand_domains_monitored_raw_df.createOrReplaceTempView("brand_domains_monitored_raw")
195 |
196 | # COMMAND ----------
197 |
198 | # Extract the domain names using the domain_extract UDF declared in ./util/Shared_Include.
199 | # Create a new table with the dnstwist-extracted domains in a new column, dnstwisted_domain.
200 | # The hardcoded ">=2" is there to accommodate potential empty domain fields
201 | brand_domains_monitored_enriched_df = spark.sql("""
202 | select *, domain_extract(domain) as dnstwisted_domain
203 | from brand_domains_monitored_raw
204 | """).filter("char_length(dnstwisted_domain) >= 2")
205 | display(brand_domains_monitored_enriched_df)
206 |
207 | # COMMAND ----------
208 |
209 | # Define a silver Delta table
210 | (brand_domains_monitored_enriched_df.write
211 |   .format("delta")
212 |   .mode('overwrite')
213 |   .option("mergeSchema", False)
214 |   .saveAsTable("silver_twisted_domain_brand")
215 | )
216 |
217 | # COMMAND ----------
218 |
219 | # MAGIC %sql
220 | # MAGIC /* Query the silver Delta table */
221 | # MAGIC select * from silver_twisted_domain_brand
222 |
--------------------------------------------------------------------------------
/04_DNS_Analytics_Data_Science.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection.
4 |
5 | # COMMAND ----------
6 |
7 | # MAGIC %md
8 | # MAGIC # 4. ML Training and Analytics
9 | # MAGIC In this section we will build the DGA model and the typosquatting model. The links below provide some high-level background on DGAs and typosquatting:
10 | # MAGIC - A detailed discussion on DGA is here: http://www.covert.io/getting-started-with-dga-research/
11 | # MAGIC - A more detailed discussion on typosquatting is here: https://www.mcafee.com/blogs/consumer/what-is-typosquatting/
12 | # MAGIC
13 | # MAGIC At a high level we will:
14 | # MAGIC - Extract the domain names from the data, removing the gTLD (e.g. .com, .org) and ccTLD (e.g. .ru, .cn, .uk, .ca)
15 | # MAGIC - Build the models
16 |
17 | # COMMAND ----------
18 |
19 | # MAGIC %scala
20 | # MAGIC displayHTML("""
21 | # MAGIC """)
22 |
23 | # COMMAND ----------
24 |
25 | # MAGIC %run ./util/Shared_Include
26 |
27 | # COMMAND ----------
28 |
29 | # Read the Alexa list of domains
30 | # Alexa is a list of the most popular domains on the internet, ranked by popularity
31 | # Alexa is not intended as a whitelist.
32 | import pandas as pd
33 | import mlflow
34 | import mlflow.sklearn
35 | import mlflow.pyfunc
36 |
37 | dbutils.fs.cp(f'{get_default_path()}/datasets/alexa_100k.txt', f'file://{get_default_path()}/datasets/alexa_100k.txt')
38 | alexa_dataframe = pd.read_csv(f'{get_default_path()}/datasets/alexa_100k.txt')
39 | display(alexa_dataframe)
40 |
41 | # COMMAND ----------
42 |
43 | # Extract the domain names without the gTLD or ccTLD (generic or country code top-level domain) from the registered domain and subdomains of a URL.
44 | # We only need the domain names for training.
45 | # Example fields in a tldextract result: ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
46 | import tldextract
47 | import numpy as np
48 |
49 | alexa_dataframe['domain'] = [domain_extract(uri) for uri in alexa_dataframe['uri']]
50 | del alexa_dataframe['uri']
51 | del alexa_dataframe['rank']
52 | display(alexa_dataframe)
53 |
54 | # COMMAND ----------
55 |
56 | # Add legitimate domains from Alexa to the training data
57 | # It's possible we have NaNs from blank lines or other artifacts
58 | alexa_dataframe = alexa_dataframe.dropna()
59 | alexa_dataframe = alexa_dataframe.drop_duplicates()
60 |
61 | # Set the class
62 | alexa_dataframe['class'] = 'legit'
63 |
64 | # Shuffle the data (important for training/testing)
65 | alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
66 | alexa_total = alexa_dataframe.shape[0]
67 | print('Total Alexa domains %d' % alexa_total)
68 | display(alexa_dataframe)
69 |
70 | # COMMAND ----------
71 |
72 | file_location = f'{get_default_path()}/datasets/dga_domains_header.txt'
73 | dbutils.fs.cp(f'{get_default_path()}/datasets/dga_domains_header.txt', f'file:{file_location}')
74 |
75 |
76 | # This file is small enough for plain pandas; for big datasets you could use the pandas API on Spark (formerly Koalas), as sketched below.
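# A minimal hedged sketch of that scale-out path (not required for this small sample file):
#   import pyspark.pandas as ps             # pandas API on Spark, the successor to Koalas
#   dga_psdf = ps.read_csv(file_location)   # returns a distributed DataFrame with a pandas-like API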
77 | dga_dataframe = pd.read_csv(file_location, header=0)
78 | # We noticed that the blacklist values just differ by capitalization or .com/.org/.info
79 | dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
80 |
81 | # It's possible we have NaNs from blank lines or other artifacts
82 | dga_dataframe = dga_dataframe.dropna()
83 | dga_dataframe = dga_dataframe.drop_duplicates()
84 | dga_total = dga_dataframe.shape[0]
85 | print('Total DGA domains %d' % dga_total)
86 |
87 | # Set the class
88 | dga_dataframe['class'] = 'ioc'
89 |
90 | print('Number of DGA domains: %d' % dga_dataframe.shape[0])
91 | all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)
92 |
93 | # COMMAND ----------
94 |
95 | # Show the DGA domains in our training dataset
96 | display(dga_dataframe)
97 |
98 | # COMMAND ----------
99 |
100 | # Let's do some feature engineering and add length and entropy features to our dataset.
101 | # We calculate the Shannon entropy of each domain string from its character frequency distribution.
102 | all_domains['length'] = [len(x) for x in all_domains['domain']]
103 | all_domains = all_domains[all_domains['length'] > 6]
104 |
105 | import math
106 | from collections import Counter
107 |
108 | def entropy(s):
109 |     p, lns = Counter(s), float(len(s))
110 |     return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
111 |
112 | all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
113 |
114 | # COMMAND ----------
115 |
116 | # Print the results. The higher the entropy, the higher the potential for DGA. But we aren't done quite yet.
117 | display(all_domains)
118 |
119 | # COMMAND ----------
120 |
121 | # Here we do additional feature engineering: n-gram frequency analysis on our valid domains
122 |
123 | y = np.array(all_domains['class'].tolist())  # the class labels as a numpy array (recomputed before training below)
124 |
125 | import sklearn.ensemble
126 | from sklearn import feature_extraction
127 |
128 | alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)
129 | counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
130 | alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
131 | ngrams_list = alexa_vc.get_feature_names()
132 |
133 | # COMMAND ----------
134 |
135 | # Load dictionary words into a dataframe
136 | dbutils.fs.cp(f'{get_default_path()}/datasets/words.txt', f'file://{get_default_path()}/datasets/words.txt')
137 | file_location = f'{get_default_path()}/datasets/words.txt'
138 | word_dataframe = pd.read_csv(file_location, header=0, sep=';')
139 | word_dataframe = word_dataframe[word_dataframe['words'].map(lambda x: str(x).isalpha())]
140 | word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
141 | word_dataframe = word_dataframe.dropna()
142 | word_dataframe = word_dataframe.drop_duplicates()
143 |
144 | # COMMAND ----------
145 |
146 | # Create a dictionary n-gram vectorizer from the word list
147 | dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
148 | counts_matrix = dict_vc.fit_transform(word_dataframe['words'])
149 | dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
150 | ngrams_list = dict_vc.get_feature_names()
151 |
152 | def ngram_count(domain):
153 |     alexa_match = alexa_counts * alexa_vc.transform([domain]).T  # vector multiply and transpose -- see the note below
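    # transform() returns a sparse row of character n-gram counts for this domain;
    # multiplying by the log10 frequency weights is effectively a dot product that
    # scores how "Alexa-like" (above) and how "dictionary-like" (below) the string is.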
154 |     dict_match = dict_counts * dict_vc.transform([domain]).T
155 |     print(f'Domain: {domain} Alexa match: {alexa_match} Dict match: {dict_match}')
156 |
157 | # Examples:
158 | ngram_count('beyonce')
159 | ngram_count('dominos')
160 | ngram_count('1cb8a5f36f')
161 | ngram_count('zfjknuh38231')
162 | ngram_count('bey6o4ce')
163 | ngram_count('washington')
164 |
165 | # COMMAND ----------
166 |
167 | # Score every domain against the n-grams built from the dictionary and the Alexa 100k list (the matching function and test examples are in the cell above).
168 | # More on ngrams here: https://blog.xrds.acm.org/2017/10/introduction-n-grams-need/
169 |
170 | all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
171 | all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T
172 |
173 | # COMMAND ----------
174 |
175 | # MAGIC %md
176 | # MAGIC ## Build a vectorized model of the n-grams
177 | # MAGIC
178 | # MAGIC The n-gram scores above are the numeric features we will feed to the model. Before training, we flag 'legit' domains with suspiciously low scores as 'weird' so they can be excluded.
179 |
180 | # COMMAND ----------
181 |
182 | weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
183 | weird = all_domains[weird_cond]
184 | print(weird.shape[0])
185 | all_domains.loc[weird_cond, 'class'] = 'weird'
186 | print(all_domains['class'].value_counts())
187 |
188 | # COMMAND ----------
189 |
190 | # MAGIC %md
191 | # MAGIC ## Let's train our model
192 |
193 | # COMMAND ----------
194 |
195 | # Exclude the 'weird' domains and train on the rest
196 | # With the Databricks ML runtime, the packages we need come pre-installed
197 | # With MLflow, we can track our experiments as we iterate
198 |
199 | from sklearn.model_selection import train_test_split
200 | clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)  # number of trees in the forest
201 |
202 | not_weird = all_domains[all_domains['class'] != 'weird']
203 | X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values
204 |
205 | # Labels (scikit-learn uses 'y' for classification labels)
206 | y = np.array(not_weird['class'].tolist())
207 |
208 | with mlflow.start_run() as run:
209 |     mlflow.sklearn.autolog()  # automatically log model parameters
210 |     # We can also add a call to mlflow.spark.autolog() to track the data source, but it requires an additional Jar: https://mlflow.org/docs/latest/tracking.html#spark-experimental
211 |     # Train on an 80/20 split
212 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
213 |     clf.fit(X_train, y_train)
214 |     y_pred = clf.predict(X_test)
215 |     # After checking holdout performance, retrain on the full dataset for the final model
216 |     clf.fit(X, y)
217 |
218 | # Locate the model in the MLflow tracking server
219 | run_id = run.info.run_id
220 | print(f'MLflow run_id: {run_id}, model_uri: runs:/{run_id}/model')
221 |
222 | # COMMAND ----------
223 |
224 | # Build a predict function to be used later to do DGA predictions
225 | # Add in pre- and post-processing for our predict function
226 |
227 | import mlflow.pyfunc
228 |
229 | class vc_transform(mlflow.pyfunc.PythonModel):
230 |     def __init__(self, alexa_vc, dict_vc, ctx):
231 |         self.alexa_vc = alexa_vc
232 |         self.dict_vc = dict_vc
233 |         self.ctx = ctx
234 |
235 |     def predict(self, context, model_input):
236 |         _alexa_match = alexa_counts * self.alexa_vc.transform([model_input]).T  # reuses the global log-frequency weights computed above
237 |         _dict_match = dict_counts * self.dict_vc.transform([model_input]).T
238 |         _X = [len(model_input), entropy(model_input), _alexa_match, _dict_match]
239 |         return str(self.ctx.predict([_X])[0])
240 |
241 | # COMMAND ----------
242 |
243 | # Save our model as an MLflow pyfunc artifact on a DBFS path so it can be registered and reused
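# As a hedged alternative (not what this notebook does), the wrapper could also be logged
# straight from the training run with mlflow.pyfunc.log_model("dga_model", python_model=vc_model);
# saving to a DBFS path, as below, keeps the artifact easy to copy around and register explicitly.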
244 | from mlflow.exceptions import MlflowException
245 | model_path = f'{get_default_path()}/new_model/dga_model'
246 |
247 | dbutils.fs.rm(f'file://{model_path}', True)
248 |
249 | vc_model = vc_transform(alexa_vc, dict_vc, clf)
250 | mlflow.pyfunc.save_model(model_path, python_model=vc_model)
251 | dbutils.fs.cp(f'file://{model_path}', model_path, True)
252 | print(f'New DGA model copied to DBFS at dbfs:{model_path}')
253 |
254 | # COMMAND ----------
255 |
256 | from mlflow.tracking.client import MlflowClient
257 | client = MlflowClient()
258 |
259 | model_name = f"{get_user_prefix()}_dns_dga"
260 |
261 | # Usually it's enough to call mlflow.register_model(model_uri=f"runs:/{run_id}/model", name=model_name),
262 | # but because we saved a custom pyfunc model to DBFS ourselves, we create the registered model and version manually:
263 | try:
264 |     client.get_registered_model(model_name)
265 | except MlflowException:
266 |     client.create_registered_model(model_name)
267 |
268 | model_version = client.create_model_version(model_name, f"dbfs:{model_path}", run.info.run_id, description="DGA detection model")
269 | print(f"Model is registered with name: {model_name}, version: {model_version.version}")
270 | # Uncomment if you want to promote this version into Staging
271 | # client.transition_model_version_stage(name=model_name, version=model_version.version, stage="Staging")
272 |
273 | # COMMAND ----------
274 |
275 | # Run a sample prediction (our wrapper ignores the context argument, so any placeholder works)
276 | vc_model.predict(mlflow.pyfunc.PythonModel, '7ydbdehaaz')
277 |
278 | # COMMAND ----------
279 |
280 | # MAGIC %md
281 | # MAGIC
282 | # MAGIC ## What problems can you spot with this model? How can we improve it?
283 | # MAGIC
284 | # MAGIC **How would you approach this problem?**
285 | # MAGIC
286 | # MAGIC For example:
287 | # MAGIC * domain registration/update/expiration dates?
288 | # MAGIC * information about the DNS registrant?
289 | # MAGIC * information about the autonomous system?
290 | # MAGIC * references from other domains?
291 |
--------------------------------------------------------------------------------