├── CONTRIBUTING.md
├── NOTICE
├── SECURITY.md
├── .github
│   └── workflows
│       ├── integration-test-gcp-pr.yml
│       ├── integration-test-aws-pr.yml
│       ├── integration-test-aws-push.yml
│       ├── integration-test-msa-pr.yml
│       ├── integration-test-gcp-push.yml
│       └── integration-test-msa-push.yml
├── 06_DNS_Analytics_ScoreDomain.py
├── .gitignore
├── 03_DNS_Analytics_Exploring_Data.py
├── LICENSE
├── 02_DNS_Analytics_Enrichment.py
├── util
│   └── Shared_Include.py
├── 05_DNS_Analytics_Streaming.py
├── readme.md
├── RUNME.py
├── 00_README.py
├── 01_DNS_Analytics_Ingest.py
└── 04_DNS_Analytics_Data_Science.py
/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | We happily welcome contributions to this project. We use GitHub Issues to track community-reported issues and GitHub Pull Requests for accepting changes pursuant to a CLA. 2 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright (2022) Databricks, Inc. 2 | 3 | 4 | This Software includes software developed at Databricks (https://www.databricks.com/) and its use is subject to the included LICENSE file. 5 | By using this repository and the notebooks within, you consent to Databricks' collection and use of usage and tracking information in accordance with our privacy policy at www.databricks.com/privacypolicy. 6 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | Please email bugbounty@databricks.com to report any security vulnerabilities. We will acknowledge receipt of your report and strive to send you regular updates about our progress. If you're curious about the status of your disclosure, please feel free to email us again. If you want to encrypt your disclosure email, you can use [this PGP key](https://keybase.io/arikfr/key.asc). 
6 | 7 | -------------------------------------------------------------------------------- /.github/workflows/integration-test-gcp-pr.yml: -------------------------------------------------------------------------------- 1 | name: GCP integration test PR 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | run-databricks-notebook: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repo 11 | uses: actions/checkout@v2 12 | - name: Run a databricks notebook 13 | uses: databricks/run-notebook@v0 14 | with: 15 | local-notebook-path: RUNME.py 16 | git-commit: ${{ github.event.pull_request.head.sha }} 17 | databricks-host: https://416411475796958.8.gcp.databricks.com 18 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_GCP }} 19 | new-cluster-json: > 20 | { 21 | "num_workers": 0, 22 | "spark_version": "10.4.x-scala2.12", 23 | "node_type_id": "n1-highmem-4", 24 | "gcp_attributes": { 25 | "availability": "ON_DEMAND_GCP" 26 | }, 27 | "spark_conf": { 28 | "spark.master": "local[*, 4]", 29 | "spark.databricks.cluster.profile": "singleNode" 30 | }, 31 | "custom_tags": { 32 | "ResourceClass": "SingleNode" 33 | } 34 | } 35 | notebook-params-json: > 36 | { 37 | "run_job": "True" 38 | } 39 | access-control-list-json: > 40 | [ 41 | { 42 | "group_name": "users", 43 | "permission_level": "CAN_VIEW" 44 | } 45 | ] -------------------------------------------------------------------------------- /.github/workflows/integration-test-aws-pr.yml: -------------------------------------------------------------------------------- 1 | name: AWS integration test PR 2 | 3 | on: 4 | pull_request: 5 | 6 | jobs: 7 | run-databricks-notebook: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repo 11 | uses: actions/checkout@v2 12 | - name: Run a databricks notebook 13 | uses: databricks/run-notebook@v0 14 | with: 15 | local-notebook-path: RUNME.py 16 | git-commit: ${{ github.event.pull_request.head.sha }} 17 | databricks-host: https://e2-demo-west.cloud.databricks.com 18 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_AWS }} 19 | new-cluster-json: > 20 | { 21 | "num_workers": 0, 22 | "spark_version": "10.4.x-scala2.12", 23 | "node_type_id": "i3.xlarge", 24 | "aws_attributes": { 25 | "availability": "ON_DEMAND" 26 | }, 27 | "spark_conf": { 28 | "spark.master": "local[*, 4]", 29 | "spark.databricks.cluster.profile": "singleNode" 30 | }, 31 | "custom_tags": { 32 | "ResourceClass": "SingleNode" 33 | } 34 | } 35 | notebook-params-json: > 36 | { 37 | "run_job": "True" 38 | } 39 | access-control-list-json: > 40 | [ 41 | { 42 | "group_name": "users", 43 | "permission_level": "CAN_VIEW" 44 | } 45 | ] 46 | 47 | -------------------------------------------------------------------------------- /.github/workflows/integration-test-aws-push.yml: -------------------------------------------------------------------------------- 1 | name: AWS integration test push 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | - web-sync 9 | 10 | jobs: 11 | run-databricks-notebook: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout repo 15 | uses: actions/checkout@v2 16 | - name: Run a databricks notebook 17 | uses: databricks/run-notebook@v0 18 | with: 19 | local-notebook-path: RUNME.py 20 | git-commit: ${{ github.sha }} 21 | databricks-host: https://e2-demo-west.cloud.databricks.com 22 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_AWS }} 23 | new-cluster-json: > 24 | { 25 | "num_workers": 0, 26 | "spark_version": "10.4.x-scala2.12", 27 | "node_type_id": "i3.xlarge", 28 | "aws_attributes": { 
29 | "availability": "ON_DEMAND" 30 | }, 31 | "spark_conf": { 32 | "spark.master": "local[*, 4]", 33 | "spark.databricks.cluster.profile": "singleNode" 34 | }, 35 | "custom_tags": { 36 | "ResourceClass": "SingleNode" 37 | } 38 | } 39 | notebook-params-json: > 40 | { 41 | "run_job": "True" 42 | } 43 | access-control-list-json: > 44 | [ 45 | { 46 | "group_name": "users", 47 | "permission_level": "CAN_VIEW" 48 | } 49 | ] 50 | -------------------------------------------------------------------------------- /.github/workflows/integration-test-msa-pr.yml: -------------------------------------------------------------------------------- 1 | name: MSA integration test PR 2 | on: 3 | pull_request: 4 | 5 | jobs: 6 | run-databricks-notebook: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout repo 10 | uses: actions/checkout@v2 11 | - name: Run a databricks notebook 12 | uses: databricks/run-notebook@v0 13 | with: 14 | local-notebook-path: RUNME.py 15 | git-commit: ${{ github.event.pull_request.head.sha }} 16 | databricks-host: https://adb-984752964297111.11.azuredatabricks.net 17 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_MSA }} 18 | new-cluster-json: > 19 | { 20 | "num_workers": 0, 21 | "spark_version": "10.4.x-scala2.12", 22 | "node_type_id": "Standard_DS3_v2", 23 | "azure_attributes": { 24 | "availability": "ON_DEMAND_AZURE" 25 | }, 26 | "spark_conf": { 27 | "spark.master": "local[*, 4]", 28 | "spark.databricks.cluster.profile": "singleNode" 29 | }, 30 | "custom_tags": { 31 | "ResourceClass": "SingleNode" 32 | } 33 | 34 | } 35 | notebook-params-json: > 36 | { 37 | "run_job": "True" 38 | } 39 | access-control-list-json: > 40 | [ 41 | { 42 | "group_name": "users", 43 | "permission_level": "CAN_VIEW" 44 | } 45 | ] -------------------------------------------------------------------------------- /.github/workflows/integration-test-gcp-push.yml: -------------------------------------------------------------------------------- 1 | name: GCP integration test push 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: 7 | - main 8 | - web-sync 9 | 10 | jobs: 11 | run-databricks-notebook: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout repo 15 | uses: actions/checkout@v2 16 | - name: Run a databricks notebook 17 | uses: databricks/run-notebook@v0 18 | with: 19 | local-notebook-path: RUNME.py 20 | git-commit: ${{ github.sha }} 21 | databricks-host: https://416411475796958.8.gcp.databricks.com 22 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_GCP }} 23 | new-cluster-json: > 24 | { 25 | "num_workers": 0, 26 | "spark_version": "10.4.x-scala2.12", 27 | "node_type_id": "n1-highmem-4", 28 | "gcp_attributes": { 29 | "availability": "ON_DEMAND_GCP" 30 | }, 31 | "spark_conf": { 32 | "spark.master": "local[*, 4]", 33 | "spark.databricks.cluster.profile": "singleNode" 34 | }, 35 | "custom_tags": { 36 | "ResourceClass": "SingleNode" 37 | } 38 | } 39 | notebook-params-json: > 40 | { 41 | "run_job": "True" 42 | } 43 | access-control-list-json: > 44 | [ 45 | { 46 | "group_name": "users", 47 | "permission_level": "CAN_VIEW" 48 | } 49 | ] -------------------------------------------------------------------------------- /.github/workflows/integration-test-msa-push.yml: -------------------------------------------------------------------------------- 1 | name: MSA integration test push 2 | on: 3 | workflow_dispatch: 4 | push: 5 | branches: 6 | - main 7 | - web-sync 8 | 9 | jobs: 10 | run-databricks-notebook: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout repo 14 | 
uses: actions/checkout@v2 15 | - name: Run a databricks notebook 16 | uses: databricks/run-notebook@v0 17 | with: 18 | local-notebook-path: RUNME.py 19 | git-commit: ${{ github.sha }} 20 | databricks-host: https://adb-984752964297111.11.azuredatabricks.net 21 | databricks-token: ${{ secrets.DEPLOYMENT_TARGET_TOKEN_MSA }} 22 | new-cluster-json: > 23 | { 24 | "num_workers": 0, 25 | "spark_version": "10.4.x-scala2.12", 26 | "node_type_id": "Standard_D3_v2", 27 | "azure_attributes": { 28 | "availability": "ON_DEMAND_AZURE" 29 | }, 30 | "spark_conf": { 31 | "spark.master": "local[*, 4]", 32 | "spark.databricks.cluster.profile": "singleNode" 33 | }, 34 | "custom_tags": { 35 | "ResourceClass": "SingleNode" 36 | } 37 | } 38 | notebook-params-json: > 39 | { 40 | "run_job": "True" 41 | } 42 | access-control-list-json: > 43 | [ 44 | { 45 | "group_name": "users", 46 | "permission_level": "CAN_VIEW" 47 | } 48 | ] -------------------------------------------------------------------------------- /06_DNS_Analytics_ScoreDomain.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md Read Parameterized inputs 8 | 9 | # COMMAND ---------- 10 | 11 | dbutils.widgets.removeAll() 12 | dbutils.widgets.text("DomainName","","01. Domain to be scored") 13 | 14 | # COMMAND ---------- 15 | 16 | domain=dbutils.widgets.get("DomainName") 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %md Download Databricks Trained DGA Detection model file for scoring 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %sh 25 | # MAGIC if [ ! -d /tmp/dga_model ]; then 26 | # MAGIC mkdir -p /tmp/dga_model 27 | # MAGIC curl -o /tmp/dga_model/python_model.pkl https://raw.githubusercontent.com/zaferbil/dns-notebook-datasets/master/model/python_model.pkl 28 | # MAGIC curl -o /tmp/dga_model/MLmodel https://raw.githubusercontent.com/zaferbil/dns-notebook-datasets/master/model/MLmodel 29 | # MAGIC curl -o /tmp/dga_model/conda.yaml https://raw.githubusercontent.com/zaferbil/dns-notebook-datasets/master/model/conda.yaml 30 | # MAGIC fi 31 | 32 | # COMMAND ---------- 33 | 34 | # MAGIC %md Load the model using mlflow 35 | 36 | # COMMAND ---------- 37 | 38 | # Load the DGA model. 39 | 40 | # this is an optimization to not to reload model on evey invocation! 41 | import json 42 | ctx = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson()) 43 | if spark.conf.get(f"dga_model_is_loaded_{ctx['extraContext']['notebook_path']}", "false") == "false": 44 | 45 | import mlflow 46 | import mlflow.pyfunc 47 | 48 | # you can change to your own path copied from the output of 4th notebook 49 | model_path = 'dbfs:/FileStore/tables/dga_model' 50 | dbutils.fs.cp("file:/tmp/dga_model/", model_path, True) 51 | print(f"loading model from {model_path}") 52 | loaded_model = mlflow.pyfunc.load_model(model_path) 53 | spark.conf.set(f"dga_model_is_loaded_{ctx['extraContext']['notebook_path']}", "true") 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md Score the domain name with the function. 
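As an added aside (not part of the original notebook): before scoring the widget-supplied value in the next cell, it can be useful to sanity-check the loaded pyfunc model against a few hand-picked names. The domain list below is purely illustrative, and the exact labels returned are an assumption — the notebook itself only relies on `loaded_model.predict()` accepting a domain string, just like the commented-out `google.com` test at the bottom.

```python
# Hypothetical sanity check -- the domains are made up for illustration.
sample_domains = ["google.com", "databricks.com", "xkcjqtldntwz.info"]
for d in sample_domains:
    # Same call as the scoring cell below; a DGA-looking name is expected to
    # come back flagged (e.g. 'ioc'), while a benign brand name should not.
    print(f"{d} -> {loaded_model.predict(d)}")
```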
58 | 59 | # COMMAND ---------- 60 | 61 | print(f'Score for Domain {domain} is : {loaded_model.predict(domain)}') 62 | 63 | 64 | # COMMAND ---------- 65 | 66 | #print(f'Test Execution for google.com: {loaded_model.predict("google.com")}') 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .DS_Store 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .idea 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /03_DNS_Analytics_Exploring_Data.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC # 3. Ad-Hoc Analytics: Exploring the data 9 | # MAGIC FINALLY!!!! We have data. And we can start poking around. 
This is an optional section for you to familiarize yourself with the data. And pick up some spark SQL tricks. You can use these tactics to explore and expand on the analytics. 10 | # MAGIC 11 | # MAGIC **Add your own queries!** 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %run ./util/Shared_Include 16 | 17 | # COMMAND ---------- 18 | 19 | # MAGIC %sql 20 | # MAGIC -- Lets take a look at the number of unique domains in our dataset 21 | # MAGIC select count(distinct(domain_name)) from silver_dns 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %sql select count(*) from silver_dns 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %sql 30 | # MAGIC -- ioc is a field we've created as a result of running the DGA model. If the ioc field has a value of ioc, it means that the DGA model has determined the domain to be an ioc (indicator of compromise) 31 | # MAGIC -- The query below is for a total count of rows where the DGA algorithm has detected an ioc. But excludes an domains that have the string 'ip' in it and has a domain name length of more than 10 characters 32 | # MAGIC select count(*), domain_name, country 33 | # MAGIC from silver_dns 34 | # MAGIC where ioc = 'ioc' and domain_name not like '%ip%' and char_length(domain_name) > 8 35 | # MAGIC group by domain_name, country 36 | # MAGIC order by count(*) desc 37 | 38 | # COMMAND ---------- 39 | 40 | # MAGIC %md 41 | # MAGIC Let us check against the known threat feeds 42 | 43 | # COMMAND ---------- 44 | 45 | # MAGIC %sql 46 | # MAGIC -- Query for domains in the silver.dns, silver.EnrichedThreatFeeds tables where there is an ioc match. 47 | # MAGIC -- You may have experienced: many to many match/join is compute cost prohibitive in most SIEM/log aggregation systems. Spark SQL is a lot more efficient. 48 | # MAGIC select count(distinct(domain_name)) 49 | # MAGIC from silver_dns, silver_threat_feeds 50 | # MAGIC where silver_dns.domain_name == silver_threat_feeds.domain 51 | 52 | # COMMAND ---------- 53 | 54 | # MAGIC %sql 55 | # MAGIC -- Query for ioc matches across multiple tables. Similar to previous example but with additional columns in the results table 56 | # MAGIC select domain_name, rrname, country, time_first, time_last, ioc,rrtype,rdata,bailiwick, silver_threat_feeds.* 57 | # MAGIC from silver_dns, silver_threat_feeds 58 | # MAGIC where silver_dns.domain_name == silver_threat_feeds.domain and ioc='ioc' 59 | 60 | # COMMAND ---------- 61 | 62 | # MAGIC %sql 63 | # MAGIC -- Looking for specific rrnames in multiple tables. 64 | # MAGIC select domain_name, rrname, country, time_first, time_last, ioc,rrtype,rdata,bailiwick, silver_threat_feeds.* 65 | # MAGIC from silver_dns, silver_threat_feeds 66 | # MAGIC where silver_dns.domain_name == silver_threat_feeds.domain and (silver_dns.rrname = "ns1.asdklgb.cf." OR silver_dns.rrname LIKE "%cn.") 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %sql describe table silver_threat_feeds 71 | 72 | # COMMAND ---------- 73 | 74 | # MAGIC %sql describe table silver_dns 75 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | [Project Name] 2 | 3 | Copyright (2022) Databricks, Inc. 4 | 5 | This library (the "Software") may not be used except in connection with the Licensee's use of the Databricks Platform Services pursuant 6 | to an Agreement (defined below) between Licensee (defined below) and Databricks, Inc. ("Databricks"). 
The Object Code version of the 7 | Software shall be deemed part of the Downloadable Services under the Agreement, or if the Agreement does not define Downloadable Services, 8 | Subscription Services, or if neither are defined then the term in such Agreement that refers to the applicable Databricks Platform 9 | Services (as defined below) shall be substituted herein for “Downloadable Services.” Licensee's use of the Software must comply at 10 | all times with any restrictions applicable to the Downloadable Services and Subscription Services, generally, and must be used in 11 | accordance with any applicable documentation. For the avoidance of doubt, the Software constitutes Databricks Confidential Information 12 | under the Agreement. 13 | 14 | Additionally, and notwithstanding anything in the Agreement to the contrary: 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 16 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 17 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 18 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | * you may view, make limited copies of, and may compile the Source Code version of the Software into an Object Code version of the 20 | Software. For the avoidance of doubt, you may not make derivative works of Software (or make any any changes to the Source Code 21 | version of the unless you have agreed to separate terms with Databricks permitting such modifications (e.g., a contribution license 22 | agreement)). 23 | 24 | If you have not agreed to an Agreement or otherwise do not agree to these terms, you may not use the Software or view, copy or compile 25 | the Source Code of the Software. 26 | 27 | This license terminates automatically upon the termination of the Agreement or Licensee's breach of these terms. Additionally, 28 | Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all 29 | copies thereof (including the Source Code). 30 | 31 | Agreement: the agreement between Databricks and Licensee governing the use of the Databricks Platform Services, which shall be, with 32 | respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks 33 | Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless Licensee 34 | has entered into a separate written agreement with Databricks governing the use of the applicable Databricks Platform Services. 35 | 36 | Databricks Platform Services: the Databricks services or the Databricks Community Edition services, according to where the Software is used. 37 | 38 | Licensee: the user of the Software, or, if the Software is being used on behalf of a company, the company. 39 | 40 | Object Code: is version of the Software produced when an interpreter or a compiler translates the Source Code into recognizable and 41 | executable machine code. 42 | 43 | Source Code: the human readable portion of the Software. 
44 | -------------------------------------------------------------------------------- /02_DNS_Analytics_Enrichment.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC # 2. Loading the data 9 | # MAGIC We admit, that felt like a lot of work to prep URLHaus and dnstwist. But we are now ready for typosquatting detection and threat intel enrichment. 10 | # MAGIC 11 | # MAGIC Now, we can enrich the pDNS data with tldextract, GeoIP lookups, a DGA Classifier, URLHaus, threat intel lookups. 12 | # MAGIC We will do this using Spark SQL. 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %run ./util/Shared_Include 17 | 18 | # COMMAND ---------- 19 | 20 | # Create user defined functions (UDF) for loading and manipulating the Geo data 21 | # The code here will perform Geo-IP lookups using the ip address available in the rdata field in our bronze table 22 | # We use a free geo database from Maxmind: https://dev.maxmind.com/geoip/geoip2/geolite2/ 23 | import geoip2.errors 24 | from geoip2 import database 25 | 26 | import pandas as pd 27 | 28 | from pyspark.sql.functions import pandas_udf 29 | from pyspark import SparkFiles 30 | 31 | # You can download this database from: https://dev.maxmind.com/geoip/geoip2/geolite2/ 32 | # You can upload the GeoLite2_City database file by using the databricks UI. 33 | # Databricks Navigator (lefthand bar) -> Data -> Upload File -> Select 34 | # Note if you receive an error here, you need to check exact location and adjust it 35 | city_db = f'{get_default_path()}/datasets/GeoLite2_City.mmdb' 36 | 37 | if not dbfs_file_exists(city_db): 38 | raise Exception(f'Please download GeoLite2_City database and put into {city_db}') 39 | 40 | def extract_geoip_data(ip: str, geocity): 41 | print(ip) 42 | if ip: 43 | try: 44 | record = geocity.city(ip) 45 | return {'city': record.city.name, 'country': record.country.name, 'country_code': record.country.iso_code} 46 | except geoip2.errors.AddressNotFoundError: 47 | pass 48 | 49 | return {'city': None, 'country': None, 'country_code': None} 50 | 51 | @pandas_udf("city string, country string, country_code string") 52 | def get_geoip_data(ips: pd.Series) -> pd.DataFrame: 53 | # TODO: re-think that into more portable, as this may not work on CE 54 | geocity = database.Reader(f'/dbfs{city_db}') 55 | extracted = ips.apply(lambda ip: extract_geoip_data(ip, geocity)) 56 | 57 | return pd.DataFrame(extracted.values.tolist()) 58 | 59 | spark.udf.register("get_geoip_data", get_geoip_data) 60 | 61 | # COMMAND ---------- 62 | 63 | # Load the DGA model. This is a pre-trained model that we will use to enrich our incoming DNS events. You will see how to train this model in a later step. 
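# (Added context; partly an assumption about the bundled artifact: the model directory is an
# MLflow "pyfunc" model, so mlflow.pyfunc.load_model() returns a generic wrapper whose
# .predict() maps a domain string to a label -- downstream queries filter on the value 'ioc'.
# Registering .predict as a Spark UDF below is what lets the enrichment step call the model
# from DataFrame expressions and Spark SQL while building silver_dns.)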
64 | import mlflow 65 | import mlflow.pyfunc 66 | 67 | model_path = f'dbfs:{get_default_path()}/model' 68 | loaded_model = mlflow.pyfunc.load_model(model_path) 69 | ioc_detect_udf = spark.udf.register("ioc_detect", loaded_model.predict) 70 | 71 | # COMMAND ---------- 72 | 73 | # Filtering on the rrtype of A 74 | dns_table = (spark.table("bronze_dns") 75 | .selectExpr("*", "case when rrtype = 'A' then element_at(rdata, 1) else null end as ip_address ") 76 | ) 77 | 78 | # COMMAND ---------- 79 | 80 | #Enrich the data with city, country, country codes, ioc and domain name 81 | dns_table_enriched = dns_table.withColumn("geoip_data", get_geoip_data(dns_table.ip_address))\ 82 | .selectExpr("*", "geoip_data.*", 83 | "case when char_length(domain_extract(rrname)) > 5 then ioc_detect(string(domain_extract(rrname))) else null end as ioc", 84 | "domain_extract(rrname) as domain_name").drop("geoip_data") 85 | 86 | # COMMAND ---------- 87 | 88 | # Persist the enriched DNS data 89 | (dns_table_enriched.write 90 | .format("delta") 91 | .mode('overwrite') 92 | .option("mergeSchema", True) 93 | .saveAsTable('silver_dns') 94 | ) 95 | 96 | # COMMAND ---------- 97 | 98 | # MAGIC %sql 99 | # MAGIC /* We check to see how many records we have loaded */ 100 | # MAGIC select count(*) from silver_dns 101 | -------------------------------------------------------------------------------- /util/Shared_Include.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 
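For orientation (an added note, not part of the original helper): the analytics notebooks pull this file in with `%run ./util/Shared_Include`, which gives them a per-user database, a per-user DBFS working directory, the `domain_extract`/`registered_domain_extract` UDFs, and a helper that loads the DGA model. A minimal, hypothetical caller-side sketch — the printed values are only examples of what the helpers return:

```python
# In any notebook, after a `%run ./util/Shared_Include` cell:
print(get_default_database())   # e.g. 'jane_doe_dns' -- per-user database used by all notebooks
print(get_default_path())       # e.g. '/tmp/jane_doe/dns_analytics' -- per-user DBFS working directory
dga = get_and_register_ioc_detect_model()   # loads the DGA model and registers the 'ioc_detect' SQL UDF
print(dga.predict("example.com"))           # same pyfunc predict() used throughout the notebooks
```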
4 | 5 | # COMMAND ---------- 6 | 7 | # install our libraries 8 | %pip install tldextract dnstwist geoip2 9 | 10 | # COMMAND ---------- 11 | 12 | import re 13 | import os 14 | 15 | current_user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().get("user").get() 16 | 17 | def get_user_prefix(): 18 | return re.sub(r'[^A-Za-z0-9_]', '_', re.sub(r'^([^@]+)(@.*)?$', r'\1', current_user_name)) 19 | 20 | current_user_name_prefix = get_user_prefix() 21 | 22 | def get_default_path(): 23 | return f'/tmp/{current_user_name_prefix}/dns_analytics' 24 | 25 | try: 26 | dbutils.fs.mkdirs(get_default_path()) 27 | except: 28 | pass 29 | 30 | def dbfs_file_exists(path: str): 31 | try: 32 | dbutils.fs.ls(path) 33 | except Exception as e: 34 | return False 35 | 36 | return True 37 | 38 | if dbfs_file_exists('dbfs:/FileStore/dns_analytics/GeoLite2-City.mmdb') and not dbfs_file_exists(f'dbfs:{get_default_path()}/datasets/GeoLite2_City.mmdb'): 39 | dbutils.fs.cp('dbfs:/FileStore/dns_analytics/GeoLite2-City.mmdb', f'{get_default_path()}/datasets/GeoLite2_City.mmdb') 40 | 41 | if dbfs_file_exists(f'dbfs:{get_default_path()}/datasets/GeoLite2_City.mmdb'): 42 | sc.addFile(f'dbfs:{get_default_path()}/datasets/GeoLite2_City.mmdb') 43 | 44 | def get_default_database(): 45 | return f'{current_user_name_prefix}_dns' 46 | 47 | spark.sql(f'create database if not exists {get_default_database()}') 48 | spark.sql(f'use {get_default_database()}') 49 | 50 | print(f'Default database: {get_default_database()}') 51 | print(f'Files are stored in {get_default_path()}') 52 | 53 | # COMMAND ---------- 54 | 55 | spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 56 | 57 | # COMMAND ---------- 58 | 59 | # MAGIC %python 60 | # MAGIC # We will extract the registered_domain_extract and domain_extract fields from the URLHaus feeds. 61 | # MAGIC import tldextract 62 | # MAGIC import numpy as np 63 | # MAGIC 64 | # MAGIC def registered_domain_extract(uri): 65 | # MAGIC ext = tldextract.extract(uri) 66 | # MAGIC if (not ext.suffix): 67 | # MAGIC return " " 68 | # MAGIC else: 69 | # MAGIC return ext.registered_domain 70 | # MAGIC 71 | # MAGIC def domain_extract(uri): 72 | # MAGIC ext = tldextract.extract(uri) 73 | # MAGIC if (not ext.suffix): 74 | # MAGIC return " " 75 | # MAGIC else: 76 | # MAGIC return ext.domain 77 | # MAGIC 78 | # MAGIC #The next three lines are registering our user defined functions(UDF) in the Databricks runtime environment 79 | # MAGIC registered_domain_extract = spark.udf.register("registered_domain_extract", registered_domain_extract) 80 | # MAGIC domain_extract_udf = spark.udf.register("domain_extract", domain_extract) 81 | 82 | # COMMAND ---------- 83 | 84 | #Load the DGA model. This is a pre-trained model that we will use to enrich our incoming DNS events. You will see how to train this model in a later step. 
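# (Added note: get_and_register_ioc_detect_model() below prefers a user-trained model under
# .../new_model/dga_model when one exists -- presumably written out by notebook
# 04_DNS_Analytics_Data_Science -- and otherwise falls back to the pre-trained model shipped
# with the sample datasets. Either way it registers the 'ioc_detect' UDF used by the SQL and
# streaming notebooks.)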
85 | import mlflow 86 | import mlflow.pyfunc 87 | 88 | def get_and_register_ioc_detect_model(): 89 | if dbfs_file_exists(f'dbfs:{get_default_path()}/new_model/dga_model'): 90 | model_path = f'dbfs:{get_default_path()}/new_model/dga_model' 91 | else: 92 | model_path = f'dbfs:{get_default_path()}/model' 93 | print(f"Loading model from {model_path}") 94 | loaded_model = mlflow.pyfunc.load_model(model_path) 95 | spark.udf.register("ioc_detect", loaded_model.predict) 96 | return loaded_model 97 | 98 | # COMMAND ---------- 99 | 100 | def cleanup_files_and_database(): 101 | try: 102 | dbutils.fs.rm(get_default_path(), True) 103 | except: 104 | pass 105 | try: 106 | spark.sql(f'drop database if exists {get_default_database()} cascade') 107 | except: 108 | pass 109 | try: 110 | from mlflow.tracking.client import MlflowClient 111 | client = MlflowClient() 112 | model_name = f"{get_user_prefix()}_dns_dga" 113 | client.delete_registered_model(model_name) 114 | except: 115 | pass 116 | try: 117 | dbutils.fs.rm("file:/tmp/dns-notebook-datasets", True) 118 | except: 119 | pass 120 | 121 | # COMMAND ---------- 122 | 123 | import mlflow 124 | experiment_name = f"/Users/{current_user_name}/dns_analytics_experiment" 125 | mlflow.set_experiment(experiment_name) 126 | -------------------------------------------------------------------------------- /05_DNS_Analytics_Streaming.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC # 5. Near Realtime Streaming Analytics 9 | # MAGIC Enrich data with threat intel and Detect malicious activity in real-time using the analytics and enrichments 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %run ./util/Shared_Include 14 | 15 | # COMMAND ---------- 16 | 17 | # Defining the schema for pDNS. 18 | # You can use either the python style syntax or the SQL DDL syntax to define your schema. 19 | 20 | # from pyspark.sql.types import StructType, StructField, StringType, LongType, StringType, ArrayType 21 | # pdns_schema = (StructType() 22 | # .add("rrname", StringType(), True) 23 | # .add("rrtype", StringType(), True) 24 | # .add("time_first", LongType(), True) 25 | # .add("time_last", LongType(), True) 26 | # .add("count", LongType(), True) 27 | # .add("bailiwick", StringType(), True) 28 | # .add("rdata", ArrayType(StringType(), True), True) 29 | # ) 30 | 31 | pdns_schema = """ 32 | rrname string, 33 | rrtype string, 34 | time_first long, 35 | time_last long, 36 | count long, 37 | bailiwick string, 38 | rdata array 39 | """ 40 | 41 | # COMMAND ---------- 42 | 43 | # Load the DGA model from before and make available as a UDF so we can apply it to our dataframe. 44 | import mlflow 45 | import mlflow.pyfunc 46 | 47 | model_path = f'dbfs:{get_default_path()}/model' 48 | loaded_model = mlflow.pyfunc.load_model(model_path) 49 | ioc_detect_udf = spark.udf.register("ioc_detect", loaded_model.predict) 50 | 51 | # COMMAND ---------- 52 | 53 | # Load test data set 54 | # Setting maxFilesPerTrigger to 1 to simulate streaming from a static set of files. You wouldn't normally add this option in production. 
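# (Added note; an assumption about production setups rather than part of this demo: a real
# deployment would typically stream pDNS from a message bus or use Databricks Auto Loader
# (format "cloudFiles") against the landing location, and would drop maxFilesPerTrigger --
# the option is only here so the static sample files arrive one file per micro-batch.)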
55 | df=(spark.readStream 56 | .option("maxFilesPerTrigger", 1) 57 | .json(f"{get_default_path()}/datasets/latest/", schema=pdns_schema) 58 | .withColumn("isioc", ioc_detect_udf(domain_extract_udf("rrname"))) 59 | .withColumn("domain", domain_extract_udf("rrname")) 60 | ) 61 | df.createOrReplaceTempView("dns_latest_stream") 62 | 63 | # COMMAND ---------- 64 | 65 | # MAGIC %md 66 | # MAGIC ##6.1 Find threats in DNS Event Stream 67 | 68 | # COMMAND ---------- 69 | 70 | # MAGIC %sql 71 | # MAGIC 72 | # MAGIC SELECT * FROM dns_latest_stream WHERE isioc = 'ioc' 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %sql 77 | # MAGIC -- Phishing or Typosquating? 78 | # MAGIC -- This is where we do typosquatting detection 79 | # MAGIC -- By using dnstwist, we find the suspicious domain, googlee 80 | # MAGIC SELECT silver_twisted_domain_brand.* FROM dns_latest_stream, silver_twisted_domain_brand 81 | # MAGIC WHERE silver_twisted_domain_brand.dnstwisted_domain = dns_latest_stream.domain 82 | 83 | # COMMAND ---------- 84 | 85 | # The next few lines we will be applying our models: 86 | # - To detect the bad domains 87 | # - Create an alerts table 88 | dns_stream_iocs = spark.sql("Select * from dns_latest_stream where isioc = 'ioc'") 89 | # dbutils.fs.rm('dbfs:/tmp/datasets/gold/delta/DNS_IOC_Latest', True) 90 | # spark.sql("drop table if exists DNS_IOC_Latest") 91 | (dns_stream_iocs.writeStream 92 | .format("delta") 93 | .outputMode("append") 94 | .option("checkpointLocation", f"{get_default_path()}/_checkpoints/DNS_IOC_Latest") 95 | .table("DNS_IOC_Latest") 96 | ) 97 | 98 | # COMMAND ---------- 99 | 100 | # MAGIC %md 101 | # MAGIC ## 7. Agent Tesla 102 | # MAGIC Success!!! 103 | # MAGIC - We used the DGA detection model on streaming DNS events, 104 | # MAGIC - Identified a suspicious domain (ioc) in our DNS logs, 105 | # MAGIC - Enriched the ioc with URLHaus 106 | # MAGIC - We can we can see that it this DGA domain is serving up agent tesla 107 | 108 | # COMMAND ---------- 109 | 110 | # MAGIC %sql 111 | # MAGIC -- We found the bad domain - lets see if our enriched threat feeds have intel on this domain? 112 | # MAGIC select * from silver_threat_feeds 113 | # MAGIC where silver_threat_feeds.domain = domain_extract('ns1.asdklgb.cf.') 114 | 115 | # COMMAND ---------- 116 | 117 | # Uncomment this line to remove database & all files 118 | # cleanup_files_and_database() 119 | 120 | # COMMAND ---------- 121 | 122 | # let it execute for 10 more minutes 123 | import time 124 | time.sleep(600) 125 | # Please stop all your streams before you go. This will ensure clusters can timeout and shutdown after class. 126 | for s in spark.streams.active: 127 | s.stop() 128 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Detecting Criminals and Nation States through DNS Analytics 2 | 3 | You are a security practitioner, a data scientist or a security data engineer; you’ve seen the Large Scale Threat Detection and Response talk with Databricks . But you're wondering, “how can I try Databricks in my own security operations?” In this blog post, you will learn how to detect a remote access trojan using passive DNS (pDNS) and threat intel. Along the way, you’ll learn how to store, and analyze DNS data using Delta, Spark and MLFlow. As you well know, APT’s and cyber criminals are known to utilize DNS. 
Threat actors use the DNS protocol for command and control or beaconing or resolution of attacker domains. This is why academic researchers and industry groups advise security teams to collect and analyze DNS events to hunt, detect, investigate and respond to threats. But you know, it's not as easy as it sounds. 4 | 5 | The Complexity, cost, and limitations of legacy technology make detecting DNS security threats challenging for most enterprise organizations. 6 | 7 | 8 | 9 | ## Detecting AgentTeslaRAT with Databricks 10 | Using the notebooks on this solution accelerator, you will be able to detect the Agent Tesla RAT. You will be using analytics for domain generation algorithms (DGA), typosquatting and threat intel enrichments from URLhaus. Along the way you will learn the Databricks concepts of: 11 | 12 | * Data ingestion 13 | * Ad hoc analytics 14 | * How to enrich event data, such as DNS queries 15 | * Model building and 16 | * Batch and Streaming analytics 17 | 18 | Why use Databricks for this? Because the hardest thing about security analytics aren’t the analytics. You already know that analyzing large scale DNS traffic logs is complicated. Colleagues in the security community tell us that the challenges fall into three categories: 19 | 20 | ## Deployment complexity 21 | DNS server data is everywhere. Cloud, hybrid, and multi-cloud deployments make it challenging to collect the data, have a single data store and run analytics consistently across the entire deployment. 22 | Tech limitations: Legacy SIEM and log aggregation solutions can’t scale to cloud data volumes for storage, analytics or ML/AI workloads. Especially, when it comes to joining data like threat intel enrichments. 23 | Cost: SIEMs or log aggregation systems charge by volume of data ingest. With so much data SIEM/log licensing and hardware requirements make DNS analytics cost prohibitive. And moving data from one cloud service provider to another is also costly and time consuming. The hardware pre-commit in the cloud or the expense of physical hardware on-prem are all deterrents for security teams. 24 | In order to address these issues, security teams need a real-time data analytics platform that can handle cloud-scale, analyze data wherever it is, natively support streaming and batch analytics and, have collaborative, content development capabilities. And… if someone could make this entire system elastic to prevent hardware commits… now wouldn’t that be cool! 25 | 26 | You can use this notebook in the Databricks community edition or in your own Databricks deployment. There are lot of lines here but the high level flow is this: 27 | 28 | * Read passive DNS data from AWS S3 bucket 29 | * Specify the schema for DNS and load the data into Delta 30 | * Explore the data with string matches 31 | * Build the DGA detection model. Build the typosquatting model. 32 | * Enrich the output of the DGA and typosquatting with threat intel from URLhaus 33 | * Run the analytics and detect the AgentTesla RAT 34 | 35 | 36 | 37 | ## Getting started 38 | 39 | Although specific solutions can be downloaded as .dbc archives from our websites, we recommend cloning these repositories onto your databricks environment. Not only will you get access to latest code, but you will be part of a community of experts driving industry best practices and re-usable solutions, influencing our respective industries. 40 | 41 | add_repo 42 | 43 | To start using a solution accelerator in Databricks simply follow these steps: 44 | 45 | 1. 
Clone solution accelerator repository in Databricks using [Databricks Repos](https://www.databricks.com/product/repos) 46 | 2. Attach the `RUNME` notebook to any cluster and execute the notebook via Run-All. A multi-step-job describing the accelerator pipeline will be created, and the link will be provided. The job configuration is written in the RUNME notebook in json format. 47 | 3. Execute the multi-step-job to see how the pipeline runs. 48 | 4. You might want to modify the samples in the solution accelerator to your need, collaborate with other users and run the code samples against your own data. To do so start by changing the Git remote of your repository to your organization’s repository vs using our samples repository (learn more). You can now commit and push code, collaborate with other user’s via Git and follow your organization’s processes for code development. 49 | 50 | The cost associated with running the accelerator is the user's responsibility. 51 | 52 | 53 | ## Project support 54 | 55 | Please note the code in this project is provided for your exploration only, and are not formally supported by Databricks with Service Level Agreements (SLAs). They are provided AS-IS and we do not make any guarantees of any kind. Please do not submit a support ticket relating to any issues arising from the use of these projects. The source in this project is provided subject to the Databricks [License](./LICENSE). All included or referenced third party libraries are subject to the licenses set forth below. 56 | 57 | Any issues discovered through the use of this project should be filed as GitHub Issues on the Repo. They will be reviewed as time permits, but there are no formal SLAs for support. -------------------------------------------------------------------------------- /RUNME.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md This notebook sets up the companion cluster(s) to run the solution accelerator. It also creates the Workflow to illustrate the order of execution. Happy exploring! 3 | # MAGIC 🎉 4 | # MAGIC 5 | # MAGIC **Steps** 6 | # MAGIC 1. Simply attach this notebook to a cluster and hit Run-All for this notebook. A multi-step job and the clusters used in the job will be created for you and hyperlinks are printed on the last block of the notebook. 7 | # MAGIC 8 | # MAGIC 2. Run the accelerator notebooks: Feel free to explore the multi-step job page and **run the Workflow**, or **run the notebooks interactively** with the cluster to see how this solution accelerator executes. 9 | # MAGIC 10 | # MAGIC 2a. **Run the Workflow**: Navigate to the Workflow link and hit the `Run Now` 💥. 11 | # MAGIC 12 | # MAGIC 2b. **Run the notebooks interactively**: Attach the notebook with the cluster(s) created and execute as described in the `job_json['tasks']` below. 13 | # MAGIC 14 | # MAGIC **Prerequisites** 15 | # MAGIC 1. You need to have cluster creation permissions in this workspace. 16 | # MAGIC 17 | # MAGIC 2. In case the environment has cluster-policies that interfere with automated deployment, you may need to manually create the cluster in accordance with the workspace cluster policy. The `job_json` definition below still provides valuable information about the configuration these series of notebooks should run with. 18 | # MAGIC 19 | # MAGIC **Notes** 20 | # MAGIC 1. The pipelines, workflows and clusters created in this script are not user-specific. 
Keep in mind that rerunning this script again after modification resets them for other users too. 21 | # MAGIC 22 | # MAGIC 2. If the job execution fails, please confirm that you have set up other environment dependencies as specified in the accelerator notebooks. Accelerators may require the user to set up additional cloud infra or secrets to manage credentials. 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 0,Install util packages 27 | # MAGIC %pip install git+https://github.com/databricks-academy/dbacademy@v1.0.13 git+https://github.com/databricks-industry-solutions/notebook-solution-companion@safe-print-html --quiet --disable-pip-version-check 28 | 29 | # COMMAND ---------- 30 | 31 | from solacc.companion import NotebookSolutionCompanion 32 | 33 | # COMMAND ---------- 34 | 35 | job_json = { 36 | "timeout_seconds": 14400, 37 | "max_concurrent_runs": 1, 38 | "tags": { 39 | "usage": "solacc_automation", 40 | "group": "SEC" 41 | }, 42 | "tasks": [ 43 | { 44 | "job_cluster_key": "dns_cluster", 45 | "notebook_task": { 46 | "notebook_path": f"00_README" 47 | }, 48 | "task_key": "dns_01" 49 | }, 50 | { 51 | "job_cluster_key": "dns_cluster", 52 | "notebook_task": { 53 | "notebook_path": f"01_DNS_Analytics_Ingest" 54 | }, 55 | "task_key": "dns_02", 56 | "depends_on": [ 57 | { 58 | "task_key": "dns_01" 59 | } 60 | ] 61 | }, 62 | { 63 | "job_cluster_key": "dns_cluster", 64 | "notebook_task": { 65 | "notebook_path": f"02_DNS_Analytics_Enrichment" 66 | }, 67 | "task_key": "dns_03", 68 | "depends_on": [ 69 | { 70 | "task_key": "dns_02" 71 | } 72 | ] 73 | }, 74 | { 75 | "job_cluster_key": "dns_cluster", 76 | "notebook_task": { 77 | "notebook_path": f"03_DNS_Analytics_Exploring_Data" 78 | }, 79 | "task_key": "dns_04", 80 | "depends_on": [ 81 | { 82 | "task_key": "dns_03" 83 | } 84 | ] 85 | }, 86 | { 87 | "job_cluster_key": "dns_cluster", 88 | "notebook_task": { 89 | "notebook_path": f"04_DNS_Analytics_Data_Science" 90 | }, 91 | "task_key": "dns_05", 92 | "depends_on": [ 93 | { 94 | "task_key": "dns_04" 95 | } 96 | ] 97 | }, 98 | { 99 | "job_cluster_key": "dns_cluster", 100 | "notebook_task": { 101 | "notebook_path": f"05_DNS_Analytics_Streaming" 102 | }, 103 | "task_key": "dns_06", 104 | "depends_on": [ 105 | { 106 | "task_key": "dns_05" 107 | } 108 | ] 109 | }, 110 | { 111 | "job_cluster_key": "dns_cluster", 112 | "notebook_task": { 113 | "notebook_path": f"06_DNS_Analytics_ScoreDomain" 114 | }, 115 | "task_key": "dns_07", 116 | "depends_on": [ 117 | { 118 | "task_key": "dns_06" 119 | } 120 | ] 121 | } 122 | ], 123 | "job_clusters": [ 124 | { 125 | "job_cluster_key": "dns_cluster", 126 | "new_cluster": { 127 | "spark_version": "12.2.x-cpu-ml-scala2.12", 128 | "spark_conf": { 129 | "spark.databricks.delta.formatCheck.enabled": "false" 130 | }, 131 | "num_workers": 2, 132 | "node_type_id": {"AWS": "i3.xlarge", "MSA": "Standard_DS3_v2", "GCP": "n1-highmem-4"}, # different from standard API 133 | "custom_tags": { 134 | "usage": "solacc_automation" 135 | }, 136 | } 137 | } 138 | ] 139 | } 140 | 141 | 142 | # COMMAND ---------- 143 | 144 | dbutils.widgets.dropdown("run_job", "False", ["True", "False"]) 145 | run_job = dbutils.widgets.get("run_job") == "True" 146 | NotebookSolutionCompanion().deploy_compute(job_json, run_job=run_job) 147 | 148 | # COMMAND ---------- 149 | 150 | 151 | -------------------------------------------------------------------------------- /00_README.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC 
%md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md 8 | # MAGIC 9 | # MAGIC # Detecting Criminals and Nation States through DNS Analytics 10 | # MAGIC 11 | # MAGIC You are a security practitioner, a data scientist or a security data engineer; you’ve seen the Large Scale Threat Detection and Response talk with Databricks . But you're wondering, “how can I try Databricks in my own security operations?” In this blog post, you will learn how to detect a remote access trojan using passive DNS (pDNS) and threat intel. Along the way, you’ll learn how to store, and analyze DNS data using Delta, Spark and MLFlow. As you well know, APT’s and cyber criminals are known to utilize DNS. Threat actors use the DNS protocol for command and control or beaconing or resolution of attacker domains. This is why academic researchers and industry groups advise security teams to collect and analyze DNS events to hunt, detect, investigate and respond to threats. But you know, it's not as easy as it sounds. 12 | # MAGIC 13 | # MAGIC The Complexity, cost, and limitations of legacy technology make detecting DNS security threats challenging for most enterprise organizations. 14 | # MAGIC 15 | # MAGIC 16 | # MAGIC 17 | # MAGIC ## Detecting AgentTeslaRAT with Databricks 18 | # MAGIC Using the notebooks on this solution accelerator, you will be able to detect the Agent Tesla RAT. You will be using analytics for domain generation algorithms (DGA), typosquatting and threat intel enrichments from URLhaus. Along the way you will learn the Databricks concepts of: 19 | # MAGIC 20 | # MAGIC * Data ingestion 21 | # MAGIC * Ad hoc analytics 22 | # MAGIC * How to enrich event data, such as DNS queries 23 | # MAGIC * Model building and 24 | # MAGIC * Batch and Streaming analytics 25 | # MAGIC 26 | # MAGIC Why use Databricks for this? Because the hardest thing about security analytics aren’t the analytics. You already know that analyzing large scale DNS traffic logs is complicated. Colleagues in the security community tell us that the challenges fall into three categories: 27 | # MAGIC 28 | # MAGIC ## Deployment complexity 29 | # MAGIC DNS server data is everywhere. Cloud, hybrid, and multi-cloud deployments make it challenging to collect the data, have a single data store and run analytics consistently across the entire deployment. 30 | # MAGIC Tech limitations: Legacy SIEM and log aggregation solutions can’t scale to cloud data volumes for storage, analytics or ML/AI workloads. Especially, when it comes to joining data like threat intel enrichments. 31 | # MAGIC Cost: SIEMs or log aggregation systems charge by volume of data ingest. With so much data SIEM/log licensing and hardware requirements make DNS analytics cost prohibitive. And moving data from one cloud service provider to another is also costly and time consuming. The hardware pre-commit in the cloud or the expense of physical hardware on-prem are all deterrents for security teams. 32 | # MAGIC In order to address these issues, security teams need a real-time data analytics platform that can handle cloud-scale, analyze data wherever it is, natively support streaming and batch analytics and, have collaborative, content development capabilities. 
And… if someone could make this entire system elastic to prevent hardware commits… now wouldn’t that be cool! 33 | # MAGIC 34 | # MAGIC You can use this notebook in the Databricks community edition or in your own Databricks deployment. There are lot of lines here but the high level flow is this: 35 | # MAGIC 36 | # MAGIC * Read passive DNS data from AWS S3 bucket 37 | # MAGIC * Specify the schema for DNS and load the data into Delta 38 | # MAGIC * Explore the data with string matches 39 | # MAGIC * Build the DGA detection model. Build the typosquatting model. 40 | # MAGIC * Enrich the output of the DGA and typosquatting with threat intel from URLhaus 41 | # MAGIC * Run the analytics and detect the AgentTesla RAT 42 | # MAGIC 43 | # MAGIC 44 | # MAGIC 45 | # MAGIC ## Getting started 46 | # MAGIC 47 | # MAGIC Although specific solutions can be downloaded as .dbc archives from our websites, we recommend cloning these repositories onto your databricks environment. Not only will you get access to latest code, but you will be part of a community of experts driving industry best practices and re-usable solutions, influencing our respective industries. 48 | # MAGIC 49 | # MAGIC add_repo 50 | # MAGIC 51 | # MAGIC To start using a solution accelerator in Databricks simply follow these steps: 52 | # MAGIC 53 | # MAGIC 1. Clone solution accelerator repository in Databricks using [Databricks Repos](https://www.databricks.com/product/repos) 54 | # MAGIC 2. Attach the `RUNME` notebook to any cluster and execute the notebook via Run-All. A multi-step-job describing the accelerator pipeline will be created, and the link will be provided. The job configuration is written in the RUNME notebook in json format. 55 | # MAGIC 3. Execute the multi-step-job to see how the pipeline runs. 56 | # MAGIC 4. You might want to modify the samples in the solution accelerator to your need, collaborate with other users and run the code samples against your own data. To do so start by changing the Git remote of your repository to your organization’s repository vs using our samples repository (learn more). You can now commit and push code, collaborate with other user’s via Git and follow your organization’s processes for code development. 57 | # MAGIC 58 | # MAGIC The cost associated with running the accelerator is the user's responsibility. 59 | # MAGIC 60 | # MAGIC 61 | # MAGIC ## Project support 62 | # MAGIC 63 | # MAGIC Please note the code in this project is provided for your exploration only, and are not formally supported by Databricks with Service Level Agreements (SLAs). They are provided AS-IS and we do not make any guarantees of any kind. Please do not submit a support ticket relating to any issues arising from the use of these projects. The source in this project is provided subject to the Databricks [License](./LICENSE). All included or referenced third party libraries are subject to the licenses set forth below. 64 | # MAGIC 65 | # MAGIC Any issues discovered through the use of this project should be filed as GitHub Issues on the Repo. They will be reviewed as time permits, but there are no formal SLAs for support. 66 | -------------------------------------------------------------------------------- /01_DNS_Analytics_Ingest.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. 
For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection. 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %scala 8 | # MAGIC displayHTML(""" 9 | # MAGIC """) 10 | 11 | # COMMAND ---------- 12 | 13 | # MAGIC %md 14 | # MAGIC ## Data layout 15 | # MAGIC 16 | # MAGIC In this workshop we're using prepared data. We use multiple tables to stage, schematize and store analytics results. Here is the TLDR on table naming: 17 | # MAGIC - **Bronze**: Raw data 18 | # MAGIC - **Silver**: Schematized and enriched data 19 | # MAGIC - **Gold**: Detections and alerts 20 | # MAGIC 21 | # MAGIC Why do this? Short version: so you can always go back to the source, refine your analytics over time, and never lose any data. And the long version. 22 | 23 | # COMMAND ---------- 24 | 25 | # MAGIC %run ./util/Shared_Include 26 | 27 | # COMMAND ---------- 28 | 29 | # MAGIC %md 30 | # MAGIC 31 | # MAGIC ## Fetching the data & initial model for workshop 32 | 33 | # COMMAND ---------- 34 | 35 | # MAGIC %md 36 | # MAGIC In this segment, we'll download all of the datasets we need in order to be able to run our notebook. 37 | # MAGIC These datasets include: 38 | # MAGIC * anonymized DNS data 39 | # MAGIC * a GeoIP lookup database 40 | # MAGIC * a threat feed 41 | # MAGIC * domains generated by `dnstwist` for our enrichment pipeline 42 | # MAGIC 43 | # MAGIC We also include: 44 | # MAGIC * the top 100k domains on alexa 45 | # MAGIC * a list of dictionary words 46 | # MAGIC * a list of dga domains to train a DGA model 47 | 48 | # COMMAND ---------- 49 | 50 | # MAGIC %sh 51 | # MAGIC if [ -d /tmp/dns-notebook-datasets ]; then 52 | # MAGIC cd /tmp/dns-notebook-datasets 53 | # MAGIC git pull 54 | # MAGIC else 55 | # MAGIC cd /tmp 56 | # MAGIC git clone --depth 1 https://github.com/zaferbil/dns-notebook-datasets.git 57 | # MAGIC fi 58 | 59 | # COMMAND ---------- 60 | 61 | # Copy the downloaded data into the FileStore for this workspace 62 | print(f'Copying datasets and model to the DBFS: {get_default_path()}') 63 | dbutils.fs.cp("file:///tmp/dns-notebook-datasets/data", f"dbfs:{get_default_path()}/datasets/",True) 64 | dbutils.fs.cp("file:///tmp/dns-notebook-datasets/model", f"dbfs:{get_default_path()}/model/",True) 65 | 66 | # COMMAND ---------- 67 | 68 | # MAGIC %md 69 | # MAGIC 70 | # MAGIC ## Loading pDNS data 71 | 72 | # COMMAND ---------- 73 | 74 | # Defining the schema for pDNS. 75 | # You can use either the python style syntax or the SQL DDL syntax to define your schema. 76 | 77 | # from pyspark.sql.types import StructType, StructField, StringType, LongType, StringType, ArrayType 78 | # pdns_schema = (StructType() 79 | # .add("rrname", StringType(), True) 80 | # .add("rrtype", StringType(), True) 81 | # .add("time_first", LongType(), True) 82 | # .add("time_last", LongType(), True) 83 | # .add("count", LongType(), True) 84 | # .add("bailiwick", StringType(), True) 85 | # .add("rdata", ArrayType(StringType(), True), True) 86 | # ) 87 | 88 | pdns_schema = """ 89 | rrname string, 90 | rrtype string, 91 | time_first long, 92 | time_last long, 93 | count long, 94 | bailiwick string, 95 | rdata array 96 | """ 97 | 98 | # COMMAND ---------- 99 | 100 | # In this segment, we are specifying where the data is and what type of data it is. 
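# (Aside, not used in this demo: the README describes reading pDNS from an AWS S3 bucket. The
# same reader works against cloud storage when the cluster has access, e.g.
# spark.read.format("json").schema(pdns_schema).load("s3://<your-bucket>/pdns/") -- the bucket
# path is a placeholder. Here we read the bundled sample file from DBFS instead.)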
101 | # You can see the json format and the DBFS path that we populated in the previous step
102 | df = spark.read.format("json").schema(pdns_schema).load(f"{get_default_path()}/datasets/dns_events.json")
103 |
104 | # COMMAND ----------
105 |
106 | # The rdata field holds an array. This isn't very convenient if you want to parse or search it.
107 | # So we create a new field called rdatastr. You can see the difference between the two fields in the sample output below.
108 | from pyspark.sql.functions import col, concat_ws
109 | df_enhanced = df.withColumn("rdatastr", concat_ws(",", col("rdata")))
110 | display(df_enhanced)
111 |
112 | # COMMAND ----------
113 |
114 | # Here we specify the output format (Delta) and write the enriched data.
115 | # Once this cell completes, the raw data is available in the Bronze table, bronze_dns.
116 | df_enhanced.write.format("delta").mode("overwrite").option("mergeSchema", "true").saveAsTable("bronze_dns")
117 |
118 | # COMMAND ----------
119 |
120 | # MAGIC %md
121 | # MAGIC ## URLhaus threat feed setup
122 | # MAGIC We will be using URLhaus threat feeds with our pDNS data. This section shows you how to ingest the URLhaus feed.
123 | # MAGIC
124 | # MAGIC For this setup, we need to do two things:
125 | # MAGIC - Define field-extraction functions (`registered_domain_extract`, `domain_extract` and `suffix_extract`) so we can pull the registered domain, domain and suffix out of the URLhaus feed. This is done via [user defined functions (UDFs)](https://docs.databricks.com/spark/latest/spark-sql/udf-python.html) that are declared in the `./util/Shared_Include` notebook.
126 | # MAGIC - Create an enriched schema and save it to a silver table.
127 |
128 | # COMMAND ----------
129 |
130 | # We specify the source location of the URLhaus feed, the csv format, and declare that the csv has a header row with field labels
131 | threat_feeds_location = f"{get_default_path()}/datasets/ThreatDataFeed.txt"
132 | threat_feeds_raw = spark.read.csv(threat_feeds_location, header=True)
133 | # Display a sample so we can check that it makes sense
134 | display(threat_feeds_raw)
135 |
136 | # COMMAND ----------
137 |
138 | # We create a new enriched view by extracting the domain name from the URL using the domain_extract user defined function described in the previous section.
139 | threat_feeds_raw.createOrReplaceTempView("threat_feeds_raw")
140 | threat_feeds_enriched_df = spark.sql("""
141 | select *, domain_extract(url) as domain
142 | from threat_feeds_raw
143 | """).filter("char_length(domain) >= 2")
144 | # The sample display shows the new field "domain"
145 | display(threat_feeds_enriched_df)
146 |
147 | # COMMAND ----------
148 |
149 | # We save our new, enriched feed as a silver Delta table
150 | (threat_feeds_enriched_df.write
151 |   .format("delta")
152 |   .mode('overwrite')
153 |   .option("mergeSchema", True)
154 |   .saveAsTable("silver_threat_feeds")
155 | )
156 |
157 | # COMMAND ----------
158 |
159 | # MAGIC %md
160 | # MAGIC ## dnstwist setup for detecting lookalike domains
161 | # MAGIC We will use dnstwist to monitor lookalike domains that adversaries can use to attack you.
162 | # MAGIC Using dnstwist you can detect typosquatters, phishing attacks, fraud, and brand impersonation. Before using the remainder of section 1.b of this notebook, you will have to follow the dnstwist instructions (outside of this notebook) to create a `domains_dnstwists.csv` file. In our example (below) we generated variations for `google.com` using `dnstwist`. You can automate this for your own organization or for any organization of interest.
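# MAGIC
# MAGIC As a practical note, `dnstwist` is distributed on PyPI, so on a workstation outside this notebook the setup is typically just:
# MAGIC ```
# MAGIC pip install dnstwist
# MAGIC ```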
163 | # MAGIC 164 | # MAGIC After installing `dnstwist`, we ran:
165 | # MAGIC ```
166 | # MAGIC dnstwist --registered google.com >> domains_dnstwists.csv
167 | # MAGIC addition googlea.com 184.168.131.241 NS:ns65.domaincontrol.com MX:mailstore1.secureserver.net
168 | # MAGIC addition googleb.com 47.254.33.193 NS:ns3.dns.com
169 | # MAGIC ```
170 | # MAGIC
171 | # MAGIC We formatted domains_dnstwists.csv with a header: `PERMUTATIONTYPE,domain,meta`
172 | # MAGIC
173 | # MAGIC Once you have created `domains_dnstwists.csv`, you can continue:
174 | # MAGIC - load the dnstwisted domains
175 | # MAGIC - enrich the table with domain names (without TLDs)
176 | # MAGIC - load the `dnstwist`-enriched results into a silver table
177 | # MAGIC
178 | # MAGIC We will use these tables later to productionize typosquatting detection.
179 |
180 | # COMMAND ----------
181 |
182 | # NOTE: domains_dnstwists.csv needs to be created outside of this notebook, using the instructions from dnstwist.
183 | # Load domains_dnstwists.csv into a dataframe, brand_domains_monitored_raw_df. Note the csv reader and the header=True option.
184 | brand_domains_monitored_raw_df = spark.read.csv(f"{get_default_path()}/datasets/domains_dnstwists.csv", header=True)
185 |
186 | # COMMAND ----------
187 |
188 | # Display the csv we just read
189 | display(brand_domains_monitored_raw_df)
190 |
191 | # COMMAND ----------
192 |
193 | # Register the dataframe as a temporary view called brand_domains_monitored_raw
194 | brand_domains_monitored_raw_df.createOrReplaceTempView("brand_domains_monitored_raw")
195 |
196 | # COMMAND ----------
197 |
198 | # Extract the domain names using the domain_extract UDF declared in ./util/Shared_Include.
199 | # Create a new table with the dnstwist-extracted domains in a new column, dnstwisted_domain.
200 | # The hardcoded ">=2" is there to accommodate potential empty domain fields
201 | brand_domains_monitored_enriched_df = spark.sql("""
202 | select *, domain_extract(domain) as dnstwisted_domain
203 | from brand_domains_monitored_raw
204 | """).filter("char_length(dnstwisted_domain) >= 2")
205 | display(brand_domains_monitored_enriched_df)
206 |
207 | # COMMAND ----------
208 |
209 | # Define a silver Delta table
210 | (brand_domains_monitored_enriched_df.write
211 |   .format("delta")
212 |   .mode('overwrite')
213 |   .option("mergeSchema", False)
214 |   .saveAsTable("silver_twisted_domain_brand")
215 | )
216 |
217 | # COMMAND ----------
218 |
219 | # MAGIC %sql
220 | # MAGIC /* Query the silver Delta table */
221 | # MAGIC select * from silver_twisted_domain_brand
222 |
--------------------------------------------------------------------------------
/04_DNS_Analytics_Data_Science.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # MAGIC %md
3 | # MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/dns-analytics. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/threat-detection.
4 |
5 | # COMMAND ----------
6 |
7 | # MAGIC %md
8 | # MAGIC # 4. ML Training and Analytics
9 | # MAGIC In this section we will build the DGA model and the typosquatting model. The links below provide some high-level background on DGAs and typosquatting:
10 | # MAGIC - A detailed discussion on DGA is here: http://www.covert.io/getting-started-with-dga-research/
11 | # MAGIC - A more detailed discussion on typosquatting is here: https://www.mcafee.com/blogs/consumer/what-is-typosquatting/
12 | # MAGIC
13 | # MAGIC At a high level we will:
14 | # MAGIC - Extract the domain names from the data, removing the gTLD (e.g. .com, .org) and ccTLD (e.g. .ru, .cn, .uk, .ca)
15 | # MAGIC - Build the models
16 |
17 | # COMMAND ----------
18 |
19 | # MAGIC %scala
20 | # MAGIC displayHTML("""
21 | # MAGIC """)
22 |
23 | # COMMAND ----------
24 |
25 | # MAGIC %run ./util/Shared_Include
26 |
27 | # COMMAND ----------
28 |
29 | # Read the Alexa list of domains
30 | # Alexa is a list of the most popular domains on the internet, ranked by popularity
31 | # Alexa is not intended as a whitelist.
32 | import pandas as pd
33 | import mlflow
34 | import mlflow.sklearn
35 | import mlflow.pyfunc
36 |
37 | dbutils.fs.cp(f'{get_default_path()}/datasets/alexa_100k.txt', f'file://{get_default_path()}/datasets/alexa_100k.txt')
38 | alexa_dataframe = pd.read_csv(f'{get_default_path()}/datasets/alexa_100k.txt')
39 | display(alexa_dataframe)
40 |
41 | # COMMAND ----------
42 |
43 | # Extract the domain names without the gTLD or ccTLD (generic or country code top-level domain) from the registered domain and subdomains of a URL.
44 | # We only need the domain names for training.
45 | # Example fields in a tldextract result: ExtractResult(subdomain='forums.news', domain='cnn', suffix='com')
46 | import tldextract
47 | import numpy as np
48 |
49 | alexa_dataframe['domain'] = [domain_extract(uri) for uri in alexa_dataframe['uri']]
50 | del alexa_dataframe['uri']
51 | del alexa_dataframe['rank']
52 | display(alexa_dataframe)
53 |
54 | # COMMAND ----------
55 |
56 | # Add legitimate domains from Alexa to the training data
57 | # It's possible we have NaNs from blank lines or other artifacts
58 | alexa_dataframe = alexa_dataframe.dropna()
59 | alexa_dataframe = alexa_dataframe.drop_duplicates()
60 |
61 | # Set the class
62 | alexa_dataframe['class'] = 'legit'
63 |
64 | # Shuffle the data (important for training/testing)
65 | alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
66 | alexa_total = alexa_dataframe.shape[0]
67 | print('Total Alexa domains %d' % alexa_total)
68 | display(alexa_dataframe)
69 |
70 | # COMMAND ----------
71 |
72 | file_location = f'{get_default_path()}/datasets/dga_domains_header.txt'
73 | dbutils.fs.cp(f'{get_default_path()}/datasets/dga_domains_header.txt', f'file:{file_location}')
74 |
75 |
76 | # This file is small enough for plain pandas; for big datasets you could use the pandas API on Spark (formerly Koalas), as sketched below.
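# A minimal hedged sketch of that scale-out path (not required for this small sample file):
#   import pyspark.pandas as ps             # pandas API on Spark, the successor to Koalas
#   dga_psdf = ps.read_csv(file_location)   # returns a distributed DataFrame with a pandas-like API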
77 | dga_dataframe = pd.read_csv(file_location, header=0)
78 | # We noticed that the blacklist values just differ by capitalization or .com/.org/.info
79 | dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
80 |
81 | # It's possible we have NaNs from blank lines or other artifacts
82 | dga_dataframe = dga_dataframe.dropna()
83 | dga_dataframe = dga_dataframe.drop_duplicates()
84 | dga_total = dga_dataframe.shape[0]
85 | print('Total DGA domains %d' % dga_total)
86 |
87 | # Set the class
88 | dga_dataframe['class'] = 'ioc'
89 |
90 | print('Number of DGA domains: %d' % dga_dataframe.shape[0])
91 | all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)
92 |
93 | # COMMAND ----------
94 |
95 | # Show the DGA domains in our training dataset
96 | display(dga_dataframe)
97 |
98 | # COMMAND ----------
99 |
100 | # Let's do some feature engineering and add length and entropy features to our dataset.
101 | # We calculate the Shannon entropy of each domain string from its character frequency distribution.
102 | all_domains['length'] = [len(x) for x in all_domains['domain']]
103 | all_domains = all_domains[all_domains['length'] > 6]
104 |
105 | import math
106 | from collections import Counter
107 |
108 | def entropy(s):
109 |     p, lns = Counter(s), float(len(s))
110 |     return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
111 |
112 | all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
113 |
114 | # COMMAND ----------
115 |
116 | # Print the results. The higher the entropy, the higher the potential for DGA. But we aren't done quite yet.
117 | display(all_domains)
118 |
119 | # COMMAND ----------
120 |
121 | # Here we do additional feature engineering: n-gram frequency analysis on our valid domains
122 |
123 | y = np.array(all_domains['class'].tolist())  # the class labels as a numpy array (recomputed before training below)
124 |
125 | import sklearn.ensemble
126 | from sklearn import feature_extraction
127 |
128 | alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)
129 | counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
130 | alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
131 | ngrams_list = alexa_vc.get_feature_names()
132 |
133 | # COMMAND ----------
134 |
135 | # Load dictionary words into a dataframe
136 | dbutils.fs.cp(f'{get_default_path()}/datasets/words.txt', f'file://{get_default_path()}/datasets/words.txt')
137 | file_location = f'{get_default_path()}/datasets/words.txt'
138 | word_dataframe = pd.read_csv(file_location, header=0, sep=';')
139 | word_dataframe = word_dataframe[word_dataframe['words'].map(lambda x: str(x).isalpha())]
140 | word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
141 | word_dataframe = word_dataframe.dropna()
142 | word_dataframe = word_dataframe.drop_duplicates()
143 |
144 | # COMMAND ----------
145 |
146 | # Create a dictionary n-gram vectorizer from the word list
147 | dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
148 | counts_matrix = dict_vc.fit_transform(word_dataframe['words'])
149 | dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
150 | ngrams_list = dict_vc.get_feature_names()
151 |
152 | def ngram_count(domain):
153 |     alexa_match = alexa_counts * alexa_vc.transform([domain]).T  # vector multiply and transpose -- see the note below
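    # transform() returns a sparse row of character n-gram counts for this domain;
    # multiplying by the log10 frequency weights is effectively a dot product that
    # scores how "Alexa-like" (above) and how "dictionary-like" (below) the string is.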
154 |     dict_match = dict_counts * dict_vc.transform([domain]).T
155 |     print(f'Domain: {domain} Alexa match: {alexa_match} Dict match: {dict_match}')
156 |
157 | # Examples:
158 | ngram_count('beyonce')
159 | ngram_count('dominos')
160 | ngram_count('1cb8a5f36f')
161 | ngram_count('zfjknuh38231')
162 | ngram_count('bey6o4ce')
163 | ngram_count('washington')
164 |
165 | # COMMAND ----------
166 |
167 | # Score every domain against the n-grams built from the dictionary and the Alexa 100k list (the matching function and test examples are in the cell above).
168 | # More on ngrams here: https://blog.xrds.acm.org/2017/10/introduction-n-grams-need/
169 |
170 | all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
171 | all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T
172 |
173 | # COMMAND ----------
174 |
175 | # MAGIC %md
176 | # MAGIC ## Build a vectorized model of the n-grams
177 | # MAGIC
178 | # MAGIC The n-gram scores above are the numeric features we will feed to the model. Before training, we flag 'legit' domains with suspiciously low scores as 'weird' so they can be excluded.
179 |
180 | # COMMAND ----------
181 |
182 | weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
183 | weird = all_domains[weird_cond]
184 | print(weird.shape[0])
185 | all_domains.loc[weird_cond, 'class'] = 'weird'
186 | print(all_domains['class'].value_counts())
187 |
188 | # COMMAND ----------
189 |
190 | # MAGIC %md
191 | # MAGIC ## Let's train our model
192 |
193 | # COMMAND ----------
194 |
195 | # Exclude the 'weird' domains and train on the rest
196 | # With the Databricks ML runtime, the packages we need come pre-installed
197 | # With MLflow, we can track our experiments as we iterate
198 |
199 | from sklearn.model_selection import train_test_split
200 | clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)  # number of trees in the forest
201 |
202 | not_weird = all_domains[all_domains['class'] != 'weird']
203 | X = not_weird[['length', 'entropy', 'alexa_grams', 'word_grams']].values
204 |
205 | # Labels (scikit-learn uses 'y' for classification labels)
206 | y = np.array(not_weird['class'].tolist())
207 |
208 | with mlflow.start_run() as run:
209 |     mlflow.sklearn.autolog()  # automatically log model parameters
210 |     # We can also add a call to mlflow.spark.autolog() to track the data source, but it requires an additional Jar: https://mlflow.org/docs/latest/tracking.html#spark-experimental
211 |     # Train on an 80/20 split
212 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
213 |     clf.fit(X_train, y_train)
214 |     y_pred = clf.predict(X_test)
215 |     # After checking holdout performance, retrain on the full dataset for the final model
216 |     clf.fit(X, y)
217 |
218 | # Locate the model in the MLflow tracking server
219 | run_id = run.info.run_id
220 | print(f'MLflow run_id: {run_id}, model_uri: runs:/{run_id}/model')
221 |
222 | # COMMAND ----------
223 |
224 | # Build a predict function to be used later to do DGA predictions
225 | # Add in pre- and post-processing for our predict function
226 |
227 | import mlflow.pyfunc
228 |
229 | class vc_transform(mlflow.pyfunc.PythonModel):
230 |     def __init__(self, alexa_vc, dict_vc, ctx):
231 |         self.alexa_vc = alexa_vc
232 |         self.dict_vc = dict_vc
233 |         self.ctx = ctx
234 |
235 |     def predict(self, context, model_input):
236 |         _alexa_match = alexa_counts * self.alexa_vc.transform([model_input]).T  # reuses the global log-frequency weights computed above
237 |         _dict_match = dict_counts * self.dict_vc.transform([model_input]).T
238 |         _X = [len(model_input), entropy(model_input), _alexa_match, _dict_match]
239 |         return str(self.ctx.predict([_X])[0])
240 |
241 | # COMMAND ----------
242 |
243 | # Save our model as an MLflow pyfunc artifact on a DBFS path so it can be registered and reused
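# As a hedged alternative (not what this notebook does), the wrapper could also be logged
# straight from the training run with mlflow.pyfunc.log_model("dga_model", python_model=vc_model);
# saving to a DBFS path, as below, keeps the artifact easy to copy around and register explicitly.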
244 | from mlflow.exceptions import MlflowException
245 | model_path = f'{get_default_path()}/new_model/dga_model'
246 |
247 | dbutils.fs.rm(f'file://{model_path}', True)
248 |
249 | vc_model = vc_transform(alexa_vc, dict_vc, clf)
250 | mlflow.pyfunc.save_model(model_path, python_model=vc_model)
251 | dbutils.fs.cp(f'file://{model_path}', model_path, True)
252 | print(f'New DGA model copied to DBFS at dbfs:{model_path}')
253 |
254 | # COMMAND ----------
255 |
256 | from mlflow.tracking.client import MlflowClient
257 | client = MlflowClient()
258 |
259 | model_name = f"{get_user_prefix()}_dns_dga"
260 |
261 | # Usually it's enough to call mlflow.register_model(model_uri=f"runs:/{run_id}/model", name=model_name),
262 | # but because we saved a custom pyfunc model to DBFS ourselves, we create the registered model and version manually:
263 | try:
264 |     client.get_registered_model(model_name)
265 | except MlflowException:
266 |     client.create_registered_model(model_name)
267 |
268 | model_version = client.create_model_version(model_name, f"dbfs:{model_path}", run.info.run_id, description="DGA detection model")
269 | print(f"Model is registered with name: {model_name}, version: {model_version.version}")
270 | # Uncomment if you want to promote this version into Staging
271 | # client.transition_model_version_stage(name=model_name, version=model_version.version, stage="Staging")
272 |
273 | # COMMAND ----------
274 |
275 | # Run a sample prediction (our wrapper ignores the context argument, so any placeholder works)
276 | vc_model.predict(mlflow.pyfunc.PythonModel, '7ydbdehaaz')
277 |
278 | # COMMAND ----------
279 |
280 | # MAGIC %md
281 | # MAGIC
282 | # MAGIC ## What problems can you spot with this model? How can we improve it?
283 | # MAGIC
284 | # MAGIC **How would you approach this problem?**
285 | # MAGIC
286 | # MAGIC For example:
287 | # MAGIC * domain registration/update/expiration dates?
288 | # MAGIC * information about the DNS registrant?
289 | # MAGIC * information about the autonomous system?
290 | # MAGIC * references from other domains?
291 |
--------------------------------------------------------------------------------