├── dlt_modern_stuff ├── src │ ├── __init__.py │ ├── helpers.py │ ├── ingest_zeek_http.py │ ├── ingest_zeek_conn.py │ ├── setup.py │ ├── ingest_apache_web.py │ ├── detections.py │ └── ingest_aws_cloudtrail.py ├── resources │ ├── .gitkeep │ ├── dlt_cyber_demo_setup.job.yml │ ├── demo_ingest_aws_cloudtrail_data.pipeline.yml │ ├── demo_ingest_apache_data.pipeline.yml │ ├── demo_ingest_zeek_data.pipeline.yml │ ├── demo_detections.pipeline.yml │ └── variables.yml ├── .gitignore ├── images │ └── cyber-pipeline-impl.png ├── databricks.yml └── README.md ├── iocs-ingest ├── README.md ├── IoCs Bronze.py ├── IoCs Common.py └── IoCs Silver.py ├── uc-udfs ├── README.md ├── community_id.sql ├── ocsf.sql └── protocols.sql ├── README.md └── LICENSE /dlt_modern_stuff/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/.gitkeep: -------------------------------------------------------------------------------- 1 | This folder is reserved for Databricks Asset Bundles resource definitions. 2 | -------------------------------------------------------------------------------- /dlt_modern_stuff/.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | -------------------------------------------------------------------------------- /dlt_modern_stuff/images/cyber-pipeline-impl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexott/databricks-cybersecurity-playground/main/dlt_modern_stuff/images/cyber-pipeline-impl.png -------------------------------------------------------------------------------- /iocs-ingest/README.md: -------------------------------------------------------------------------------- 1 | This folder contains Databricks notebooks with source code for the blog post [Ingesting indicators of compromise with Filebeat, Azure Event Hubs & Delta Lake on Databricks](https://alexott.blogspot.com/2022/10/ingesting-indicators-of-compromise-with.html). 2 | -------------------------------------------------------------------------------- /uc-udfs/README.md: -------------------------------------------------------------------------------- 1 | # Cybersecurity-related user-defined functions for Unity Catalog 2 | 3 | There is a number of user-defined functions that could be useful when working with heterogeneous log sources. 4 | 5 | Now available: 6 | 7 | - `protocols.sql` contains two functions `proto_name_to_code` and `proto_code_to_name` to remap network protocol codes and names. 8 | - `ocsf.sql` contains functions that map `activity_id` into `activity_name` for different categories. 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # databricks-cybersecurity-playground 2 | 3 | Different pieces of code related to doing cybersecurity on Databricks 4 | 5 | 6 | * [iocs-ingest](iocs-ingest/) - source code for ingesting data from Filebeat's Threat Intel module. 7 | * [dlt_modern_stuff](dlt_modern_stuff/) - source code that demonstrates use of latest feature of DLT (append flows, sinks, direct publishing). 8 | * [uc-udfs](uc-udfs/) - cybersecurity-related user-defined functions for Unity Catalog. 
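
As a rough illustration of how the `uc-udfs` functions can be called once the SQL files in that folder have been run against a Unity Catalog schema (the schema name `main.cyber_udfs` and the flow-tuple values below are placeholders, not something defined in this repository):

```python
# Assumes a Databricks notebook (where `spark` is predefined) and that the functions from
# uc-udfs/ were already created in the placeholder schema below.
spark.sql("USE main.cyber_udfs")

# ocsf.sql: map an OCSF HTTP activity_id to its name, and an HTTP verb to its numeric code.
spark.sql("SELECT ocsf_http_activity_name(3) AS name, ocsf_http_activity_id('GET') AS id").show()

# community_id.sql: compute the Community ID hash of a flow tuple (protocol 6 = TCP, seed 0).
spark.sql(
    "SELECT community_id_hash('192.168.1.10', 52482, '10.0.0.5', 443, 6, 0) AS community_id"
).show()
```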
9 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/dlt_cyber_demo_setup.job.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | dlt_cyber_demo_setup: 4 | name: 'DLT Cyber Demo: Setup' 5 | tasks: 6 | - task_key: setup 7 | notebook_task: 8 | base_parameters: 9 | catalog_name: ${var.catalog_name} 10 | schema_name: ${var.normalized_schema_name} 11 | volume_path: ${var.log_files_path} 12 | notebook_path: ../src/setup.py 13 | max_concurrent_runs: 1 14 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_ingest_aws_cloudtrail_data.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_ingest_aws_cloudtrail_data: 4 | name: 'DLT Cyber Demo: Ingest AWS Cloudtrail data' 5 | configuration: 6 | "conf.gold_catalog_name": ${var.catalog_name} 7 | "conf.gold_schema_name": ${var.normalized_schema_name} 8 | "conf.aws_cloudtrail_input": "${var.log_files_path}/logs/aws_cloudtrail/" 9 | libraries: 10 | - notebook: 11 | path: ../src/ingest_aws_cloudtrail.py 12 | catalog: ${var.catalog_name} 13 | schema: ${var.silver_schema_name} 14 | channel: CURRENT 15 | # development: true 16 | serverless: true 17 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_ingest_apache_data.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_ingest_apache_data: 4 | name: 'DLT Cyber Demo: Ingest Apache data' 5 | configuration: 6 | "conf.gold_catalog_name": ${var.catalog_name} 7 | "conf.gold_schema_name": ${var.normalized_schema_name} 8 | "conf.apache_web_input": "${var.log_files_path}/logs/apache/" 9 | "conf.nginx_input": "${var.log_files_path}/logs/nginx/" 10 | libraries: 11 | - notebook: 12 | path: ../src/ingest_apache_web.py 13 | catalog: ${var.catalog_name} 14 | schema: ${var.silver_schema_name} 15 | channel: CURRENT 16 | # development: true 17 | serverless: true 18 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_ingest_zeek_data.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_ingest_zeek_data: 4 | name: 'DLT Cyber Demo: Ingest Zeek data' 5 | configuration: 6 | "conf.gold_catalog_name": ${var.catalog_name} 7 | "conf.gold_schema_name": ${var.normalized_schema_name} 8 | "conf.zeek_conn_input": "${var.log_files_path}/logs/zeek_conn/" 9 | "conf.zeek_http_input": "${var.log_files_path}/logs/zeek_http/" 10 | libraries: 11 | - notebook: 12 | path: ../src/ingest_zeek_conn.py 13 | - notebook: 14 | path: ../src/ingest_zeek_http.py 15 | catalog: ${var.catalog_name} 16 | schema: ${var.silver_schema_name} 17 | channel: CURRENT 18 | # development: true 19 | serverless: true 20 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_detections.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_detections: 4 | name: 'DLT Cyber Demo: Detections' 5 | configuration: 6 | "conf.azure_tenant_id": ${var.azure_tenant_id} 7 | "conf.eh_ns": ${var.eventhub_namespace} 8 | "conf.eh_topic": ${var.eventhub_topic} 9 | "conf.gold_catalog_name": ${var.catalog_name} 10 
| "conf.gold_schema_name": ${var.normalized_schema_name} 11 | "conf.push_to_eventhubs": ${var.push_to_eventhubs} 12 | "conf.secret_scope": ${var.secret_scope_name} 13 | "conf.sp_id_key_name": ${var.sp_id_key_name} 14 | "conf.sp_secret_key_name": ${var.sp_secret_key_name} 15 | "conf.push_to_splunk": ${var.push_to_splunk} 16 | "conf.splunk_url": ${var.splunk_url} 17 | "conf.splunk_hec_token": ${var.splunk_hec_token} 18 | libraries: 19 | - notebook: 20 | path: ../src/detections.py 21 | catalog: ${var.catalog_name} 22 | schema: ${var.silver_schema_name} 23 | channel: CURRENT 24 | # development: true 25 | serverless: true 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Alex Ott 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dlt_modern_stuff/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for dlt_modern_stuff. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: dlt_modern_stuff 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | sync: 10 | include: 11 | - src/helpers.py 12 | - src/__init__.py 13 | 14 | targets: 15 | dev: 16 | # The default target uses 'mode: development' to create a development copy. 17 | # - Deployed resources get prefixed with '[dev my_user_name]' 18 | # - Any job schedules and triggers are paused by default. 19 | # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 
20 | # Uncomment this if you run outside of Free Edition 21 | # mode: development 22 | default: true 23 | variables: 24 | catalog_name: cybersecurity 25 | normalized_schema_name: normalized 26 | log_files_path: /Volumes/cybersecurity/logs/logs/demo 27 | silver_schema_name: silver 28 | push_to_eventhubs: "false" 29 | azure_tenant_id: "aaaa" 30 | eventhub_namespace: "aaa" 31 | eventhub_topic: "alerts" 32 | secret_scope_name: "aaaa" 33 | sp_id_key_name: "aaa" 34 | sp_secret_key_name: "aaa" 35 | push_to_splunk: "false" 36 | splunk_url: "http://10.1.0.6:8088/services/collector/event" 37 | splunk_hec_token: "aaaa" 38 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/variables.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | catalog_name: 3 | description: The name of the default UC Catalog 4 | silver_schema_name: 5 | description: The name of UC Schema to put processed data of individual log sources 6 | normalized_schema_name: 7 | description: The name of UC Schema to put tables with normalized data, IoCs and Detections tables. 8 | log_files_path: 9 | description: The path to UC Volume where raw log data will be stored 10 | push_to_eventhubs: 11 | description: Set to "true" to push detections to the EventHubs topic 12 | default: "false" 13 | # The following variables are needed if the `push_to_eventhubs` above is set to "true" 14 | secret_scope_name: 15 | description: The name of the secret scope with SP's ID and secret 16 | default: "" 17 | sp_id_key_name: 18 | description: The name of a secret inside secret scope that holds SP ID 19 | default: "" 20 | sp_secret_key_name: 21 | description: The name of a secret inside secret scope that holds SP Secret 22 | default: "" 23 | azure_tenant_id: 24 | description: The ID of Entra ID tenant where SP is registered 25 | default: "" 26 | eventhub_namespace: 27 | description: The name of EventHubs namespace 28 | default: "" 29 | eventhub_topic: 30 | description: The name of a topic inside EventHubs namespace 31 | default: "" 32 | push_to_splunk: 33 | description: Set to "true" to push detections to a Splunk instance (right now, only on non-serverless) 34 | default: "false" 35 | splunk_url: 36 | description: the URL of Splunk HTTP Event Collector 37 | default: "" 38 | splunk_hec_token: 39 | description: Token that will be used to authenticate to Splunk HTTP Event Collector 40 | default: "" 41 | -------------------------------------------------------------------------------- /uc-udfs/community_id.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE OR REPLACE FUNCTION community_id_hash( 3 | src_ip STRING, 4 | src_port INT, 5 | dst_ip STRING, 6 | dst_port INT, 7 | proto INT, 8 | seed INT 9 | ) 10 | RETURNS STRING 11 | LANGUAGE PYTHON 12 | DETERMINISTIC 13 | ENVIRONMENT ( 14 | dependencies = '["communityid"]', 15 | environment_version = 'None' 16 | ) 17 | AS $$ 18 | import communityid 19 | 20 | cid = communityid.CommunityID() 21 | tpl = communityid.FlowTuple(proto, src_ip, dst_ip, src_port, dst_port) 22 | 23 | return cid.calc(tpl) 24 | $$; 25 | 26 | -- COMMAND ---------- 27 | 28 | -- MAGIC %python 29 | -- MAGIC 30 | -- MAGIC import requests 31 | -- MAGIC import json 32 | -- MAGIC 33 | -- MAGIC # Download the JSON file 34 | -- MAGIC url = "https://raw.githubusercontent.com/corelight/community-id-spec/refs/heads/master/baseline/baseline_deflt.json" 35 | -- MAGIC response = 
requests.get(url) 36 | -- MAGIC data = response.json() 37 | -- MAGIC 38 | -- MAGIC # Prepare data for SQL 39 | -- MAGIC rows = [] 40 | -- MAGIC for entry in data: 41 | -- MAGIC src_ip = entry["saddr"] 42 | -- MAGIC src_port = entry["sport"] 43 | -- MAGIC dst_ip = entry["daddr"] 44 | -- MAGIC dst_port = entry["dport"] 45 | -- MAGIC proto = entry["proto"] 46 | -- MAGIC seed = entry.get("seed", 0) 47 | -- MAGIC expected_id = entry["communityid"] 48 | -- MAGIC rows.append((src_ip, src_port, dst_ip, dst_port, proto, int(seed), expected_id)) 49 | -- MAGIC 50 | -- MAGIC # Create DataFrame 51 | -- MAGIC columns = ["src_ip", "src_port", "dst_ip", "dst_port", "proto", "seed", "expected_id"] 52 | -- MAGIC df = spark.createDataFrame(rows, columns) 53 | -- MAGIC df.createOrReplaceTempView("baseline_data") 54 | -- MAGIC 55 | -- MAGIC # Compute and compare community IDs using the UDF 56 | -- MAGIC result = spark.sql(""" 57 | -- MAGIC SELECT 58 | -- MAGIC src_ip, 59 | -- MAGIC src_port, 60 | -- MAGIC dst_ip, 61 | -- MAGIC dst_port, 62 | -- MAGIC proto, 63 | -- MAGIC seed, 64 | -- MAGIC expected_id, 65 | -- MAGIC community_id_hash(src_ip, src_port, dst_ip, dst_port, proto, seed) AS computed_id, 66 | -- MAGIC CASE WHEN expected_id = community_id_hash(src_ip, src_port, dst_ip, dst_port, proto, seed) THEN 'MATCH' ELSE 'MISMATCH' END AS comparison 67 | -- MAGIC FROM baseline_data 68 | -- MAGIC """) 69 | -- MAGIC display(result) 70 | -------------------------------------------------------------------------------- /uc-udfs/ocsf.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## UDFs to support OCSF mappings 5 | 6 | -- COMMAND ---------- 7 | 8 | CREATE OR REPLACE FUNCTION ocsf_network_activity_name(activity_id INTEGER) 9 | RETURNS STRING 10 | COMMENT 'Maps network activity_id into the name' 11 | RETURN CASE activity_id 12 | WHEN 0 THEN 'Unknown' 13 | WHEN 1 THEN 'Open' 14 | WHEN 2 THEN 'Close' 15 | WHEN 3 THEN 'Reset' 16 | WHEN 4 THEN 'Fail' 17 | WHEN 5 THEN 'Refuse' 18 | WHEN 6 THEN 'Traffic' 19 | WHEN 7 THEN 'Listen' 20 | ELSE 'Other' 21 | END; 22 | 23 | -- COMMAND ---------- 24 | 25 | CREATE OR REPLACE FUNCTION ocsf_dns_activity_name(activity_id INTEGER) 26 | RETURNS STRING 27 | COMMENT 'Maps DNS activity_id into the name' 28 | RETURN CASE activity_id 29 | WHEN 0 THEN 'Unknown' 30 | WHEN 1 THEN 'Query' 31 | WHEN 2 THEN 'Response' 32 | WHEN 6 THEN 'Traffic' 33 | ELSE 'Other' 34 | END; 35 | 36 | -- COMMAND ---------- 37 | 38 | CREATE OR REPLACE FUNCTION ocsf_http_activity_name(activity_id INTEGER) 39 | RETURNS STRING 40 | COMMENT 'Maps HTTP activity_id into the name' 41 | RETURN CASE activity_id 42 | WHEN 0 THEN 'Unknown' 43 | WHEN 1 THEN 'Connect' 44 | WHEN 2 THEN 'Delete' 45 | WHEN 3 THEN 'Get' 46 | WHEN 4 THEN 'Head' 47 | WHEN 5 THEN 'Options' 48 | WHEN 6 THEN 'Post' 49 | WHEN 7 THEN 'Put' 50 | WHEN 8 THEN 'Trace' 51 | WHEN 9 THEN 'Patch' 52 | ELSE 'Other' 53 | END; 54 | 55 | -- COMMAND ---------- 56 | 57 | CREATE OR REPLACE FUNCTION ocsf_file_activity_name(activity_id INTEGER) 58 | RETURNS STRING 59 | COMMENT 'Maps file system activity_id into the name' 60 | RETURN CASE activity_id 61 | WHEN 0 THEN 'Unknown' 62 | WHEN 1 THEN 'Create' 63 | WHEN 2 THEN 'Read' 64 | WHEN 3 THEN 'Update' 65 | WHEN 4 THEN 'Delete' 66 | WHEN 5 THEN 'Rename' 67 | WHEN 6 THEN 'Set Attributes' 68 | WHEN 7 THEN 'Set Security' 69 | WHEN 8 THEN 'Get Attributes' 70 | WHEN 9 THEN 'Get Security' 71 | WHEN 9 THEN 'Encrypt' 72 | 
WHEN 9 THEN 'Decrypt' 73 | WHEN 9 THEN 'Mount' 74 | WHEN 9 THEN 'Unmount' 75 | WHEN 9 THEN 'Open' 76 | ELSE 'Other' 77 | END; 78 | 79 | -- COMMAND ---------- 80 | 81 | CREATE OR REPLACE FUNCTION ocsf_http_activity_id(verb STRING) 82 | RETURNS INT 83 | COMMENT 'Maps HTTP verb into OCSF numeric code' 84 | RETURN CASE lower(verb) 85 | WHEN 'connect' THEN 1 86 | WHEN 'delete' THEN 2 87 | WHEN 'get' THEN 3 88 | WHEN 'head' THEN 4 89 | WHEN 'options' THEN 5 90 | WHEN 'post' THEN 6 91 | WHEN 'put' THEN 7 92 | WHEN 'trace' THEN 8 93 | WHEN 'patch' THEN 9 94 | WHEN '' THEN 0 -- TODO: handle null string 95 | ELSE 99 96 | END; -------------------------------------------------------------------------------- /iocs-ingest/IoCs Bronze.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | from typing import List, Dict, Any, Optional 4 | 5 | from delta.tables import DeltaTable 6 | from pyspark.sql import DataFrame, SparkSession 7 | import pyspark.sql.functions as F 8 | 9 | # COMMAND ---------- 10 | 11 | # Variables... 12 | base_dir = "/mnt/cyberdata" 13 | secret_scope = "..." 14 | evhub_secret_key = "..." 15 | evhub_ns_name = "..." 16 | evhub_topic_name = "iocs" 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %run "./IoCs Common" 21 | 22 | # COMMAND ---------- 23 | 24 | num_executors = sc._jsc.sc().getExecutorMemoryStatus().size()-1 25 | num_cores = sum(sc.parallelize((("")*num_executors), num_executors).mapPartitions(lambda p: [os.cpu_count()]).collect()) 26 | 27 | # COMMAND ---------- 28 | 29 | spark.sql(f"set spark.sql.shuffle.partitions = {num_cores}") 30 | 31 | # COMMAND ---------- 32 | 33 | import datetime 34 | 35 | readConnectionString = dbutils.secrets.get(secret_scope, evhub_secret_key) 36 | eh_sasl = f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{readConnectionString}";' 37 | kafka_options = {"kafka.bootstrap.servers": f"{evhub_ns_name}.servicebus.windows.net:9093", 38 | "kafka.sasl.mechanism": "PLAIN", 39 | "kafka.security.protocol": "SASL_SSL", 40 | "kafka.request.timeout.ms": "60000", 41 | "kafka.session.timeout.ms": "30000", 42 | "startingOffsets": "earliest", 43 | "minPartitions": num_cores, 44 | "kafka.sasl.jaas.config": eh_sasl, 45 | "subscribe": evhub_topic_name, 46 | } 47 | 48 | df = spark.readStream\ 49 | .format("kafka")\ 50 | .options(**kafka_options)\ 51 | .load()\ 52 | .withColumn("value", F.col("value").cast("string")) 53 | 54 | # COMMAND ---------- 55 | 56 | partial_schema = "`@timestamp` timestamp, fileset struct, service struct, message string" 57 | df2 = df.select("*", F.from_json("value", partial_schema).alias("jsn")) \ 58 | .withColumnRenamed("timestamp", "kafka_ts") \ 59 | .selectExpr("*", "jsn.`@timestamp` as timestamp", "jsn.fileset.name as dataset", 60 | "jsn.service.type as service", "sha2(jsn.message, 256) as msg_hash") \ 61 | .drop("jsn", "timestampType") \ 62 | .withColumn("date", F.col("timestamp").cast("date")) 63 | #display(df2) 64 | 65 | # COMMAND ---------- 66 | 67 | def perform_foreach_batch(df: DataFrame, epoch): 68 | return drop_duplicates_with_merge(df, primary_key_columns=["msg_hash"], 69 | path=f"{base_dir}/bronze/threatintel/", 70 | partitionby=["date"], opts={"mergeSchema": "true"}, 71 | additional_merge_cond="update.date >= current_date()-10" 72 | ) 73 | 74 | # COMMAND ---------- 75 | 76 | checkpoint = f"{base_dir}/checkpoints/threatintel-bronze/" 77 | 78 | # COMMAND ---------- 79 | 80 | 
df2.writeStream \ 81 | .option("checkpointLocation", checkpoint) \ 82 | .trigger(availableNow=True) \ 83 | .foreachBatch(perform_foreach_batch) \ 84 | .start() 85 | -------------------------------------------------------------------------------- /iocs-ingest/IoCs Common.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | from typing import List, Dict, Any, Optional 4 | 5 | from delta.tables import DeltaTable 6 | from pyspark.sql import DataFrame, SparkSession 7 | import pyspark.sql.functions as F 8 | 9 | # COMMAND ---------- 10 | 11 | def drop_duplicates_with_merge( 12 | df: DataFrame, 13 | primary_key_columns: List[str], 14 | path: str = "", 15 | table_name: str = "", 16 | partitionby: Optional[List[str]] = None, 17 | opts: Optional[Dict[str, Any]] = None, 18 | additional_merge_cond: Optional[str] = None 19 | ): 20 | """Performs removal of duplicates using the Delta MERGE operation. If table doesn't exist, 21 | it's created by writing the dataframe into a specified location. This function is primarily 22 | designed for use in ``foreachBatch``. 23 | 24 | :param df: new dataframe 25 | :param primary_key_columns: required list of the column names that are used for detection of duplicates 26 | :param path: optional path to table (required if table_name isn't specified) 27 | :param table_name: optional name of the table (required if path isn't specified) 28 | :param partitionby: optional list of columns to partition by 29 | :param opts: optional dictionary with options for creation of Delta table 30 | :param additional_merge_cond: additional merge condition appended to the generated condition using AND 31 | :return: nothing 32 | """ 33 | # print(f"Performing merge for {path=} or {table_name=}") 34 | if opts is None: 35 | opts = {} 36 | if partitionby is None: 37 | partitionby = [] 38 | df = df.dropDuplicates(primary_key_columns) 39 | if path == "" and table_name == "": 40 | raise Exception( 41 | "At least one parameter, 'path' or 'table_name' must be specified" 42 | ) 43 | if not df._jdf.isEmpty(): 44 | try: 45 | spark = SparkSession.getActiveSession() 46 | if table_name != "": 47 | tbl = DeltaTable.forName(spark, table_name) 48 | else: 49 | tbl = DeltaTable.forPath(spark, path) 50 | dname = "dest" 51 | uname = "update" 52 | merge_cond = " and ".join( 53 | [f"{dname}.{col} = {uname}.{col}" for col in primary_key_columns] 54 | ) 55 | if additional_merge_cond: 56 | merge_cond = merge_cond + " AND " + additional_merge_cond 57 | tbl.alias(dname).merge( 58 | df.alias(uname), merge_cond 59 | ).whenNotMatchedInsertAll().execute() 60 | # except AnalysisException as ex: # this happens when table doesn't exist 61 | except Exception as ex: # this happens when table doesn't exist 62 | print(f"Got exception: {ex}") 63 | # print(f"Delta table ({path=}, {table_name=}) doesn't exist, writing all data as new table...") 64 | if table_name != "": 65 | if path != "": 66 | opts["path"] = path 67 | df.write.format("delta").partitionBy(partitionby).options( 68 | **opts 69 | ).saveAsTable(table_name) 70 | else: 71 | df.write.format("delta").partitionBy(partitionby).options(**opts).save( 72 | path 73 | ) 74 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/helpers.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | from typing import Optional 4 | import re 5 | 6 | NETWORK_TABLE_NAME = "network" 7 | 
HTTP_TABLE_NAME = "http" 8 | 9 | __catalog_key_name__ = "conf.{}_catalog_name" 10 | __schema_key_name__ = "conf.{}_schema_name" 11 | 12 | 13 | def get_qualified_table_name(level: str, name: str, spark: Optional[SparkSession] = None) -> str: 14 | """Generates table name with catalog and schema if specified. 15 | 16 | Args: 17 | level (str): The level of the table (silver, gold, bronze, ...). 18 | name (str): The name of the table on the given level. 19 | spark (Optional[SparkSession], optional): Spark session. Defaults to None. 20 | 21 | Raises: 22 | Exception: ValueError if schema is not specified when catalog is specified. 23 | 24 | Returns: 25 | str: The fully qualified table name with catalog and schema. 26 | """ 27 | if not spark: 28 | spark = SparkSession.getActiveSession() 29 | catalog = spark.conf.get(__catalog_key_name__.format(level), "") 30 | schema = spark.conf.get(__schema_key_name__.format(level), "") 31 | if catalog and not schema: 32 | raise ValueError("Schema must be specified if catalog is specified") 33 | base = "" 34 | if catalog: 35 | base += f"{catalog}." 36 | if schema: 37 | base += f"{schema}." 38 | return f"{base}{name}" 39 | 40 | 41 | def sanitize_string_for_flow_name(s: str) -> str: 42 | """Sanitize a string to be used as a flow/function name. 43 | 44 | Args: 45 | s (str): The string to be sanitized. 46 | 47 | Returns: 48 | str: The sanitized string. 49 | """ 50 | return re.sub(r"[^a-zA-Z0-9]+", "_", s)[-20:].strip("_") 51 | 52 | 53 | def get_normalized_table_name(name: str, catalog: Optional[str] = None, schema: Optional[str] = None, 54 | spark: Optional[SparkSession] = None) -> str: 55 | """Get the name for normalized (OCSF) table with catalog and schema. 56 | 57 | Args: 58 | name (str): The base name of the table. 59 | catalog (Optional[str], optional): The catalog name. Defaults to None. 60 | schema (Optional[str], optional): The schema name. Defaults to None. 61 | spark (Optional[SparkSession], optional): Spark session. Defaults to None. 62 | 63 | Raises: 64 | Exception: Exception if catalog or schema are not specified. 65 | 66 | Returns: 67 | str: The normalized table name 68 | """ 69 | if not spark: 70 | spark = SparkSession.getActiveSession() 71 | if not catalog: 72 | catalog = spark.conf.get(__catalog_key_name__.format("gold"), "") 73 | if not schema: 74 | schema = spark.conf.get(__schema_key_name__.format("gold"), "") 75 | if not catalog or not schema: 76 | raise Exception("Catalog and Schema must be specified explicitly or in Spark conf") 77 | return f"{catalog}.{schema}.{name}" 78 | 79 | 80 | def create_normalized_sink(name: str, spark: Optional[SparkSession] = None) -> str: 81 | """Create a DLT sink with a normalized name. 82 | 83 | Args: 84 | name (str): The base name for the sink. 85 | spark (Optional[SparkSession], optional): Spark session. Defaults to None. 86 | 87 | Returns: 88 | str: The name of the created sink. 
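
    Example (illustrative; the flow name is a placeholder):

        sink_name = create_normalized_sink(HTTP_TABLE_NAME)

        @dlt.append_flow(name="my_source_normalized", target=sink_name)
        def write_normalized():
            ...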
89 | """ 90 | import dlt 91 | sink_name = f"{name}_ocsf" 92 | table_name = get_normalized_table_name(name, spark=spark) 93 | dlt.create_sink(sink_name, "delta", { "tableName": table_name, "mergeSchema": "true" }) 94 | return sink_name 95 | 96 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/ingest_zeek_http.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from typing import Optional 9 | 10 | # COMMAND ---------- 11 | 12 | from helpers import HTTP_TABLE_NAME, get_qualified_table_name, create_normalized_sink, sanitize_string_for_flow_name 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC We're using streaming tables + append flows to make sure that we can add more (or remove not used) source locations for this data type. 19 | 20 | # COMMAND ---------- 21 | 22 | zeek_http_table_name = get_qualified_table_name("silver", "zeek_http", spark) 23 | dlt.create_streaming_table( 24 | name = zeek_http_table_name, 25 | cluster_by = ["timestamp"], 26 | ) 27 | 28 | # COMMAND ---------- 29 | 30 | zeek_http_schema_hints = "`id.orig_p` int, `id.resp_p` int, ts double, status_code int" 31 | zeek_http_renames = { 32 | "id.orig_h": "id_origin_host", 33 | "id.orig_p": "id_origin_port", 34 | "id.resp_h": "id_response_host", 35 | "id.resp_p": "id_response_port", 36 | "ts": "timestamp", 37 | } 38 | 39 | def create_zeek_http_flow(input: str, add_opts: Optional[dict] = None): 40 | @dlt.append_flow(name=f"zeek_http_{sanitize_string_for_flow_name(input)}", 41 | target = zeek_http_table_name, 42 | comment = f"Ingesting from {input}") 43 | def flow(): 44 | autoloader_opts = { 45 | "cloudFiles.format": "json", 46 | "cloudFiles.schemaHints": zeek_http_schema_hints, 47 | #"cloudFiles.useManagedFileEvents": "true", 48 | } | (add_opts or {}) 49 | df = spark.readStream.format("cloudFiles").options(**autoloader_opts).load(input) 50 | df = df.withColumnsRenamed(zeek_http_renames) 51 | df = df.withColumns({ 52 | "timestamp": F.col("timestamp").cast("timestamp"), 53 | "ingest_time": F.current_timestamp(), 54 | }) 55 | return df 56 | 57 | # COMMAND ---------- 58 | 59 | zeek_http_input = spark.conf.get("conf.zeek_http_input") 60 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 61 | create_zeek_http_flow(zeek_http_input) 62 | 63 | # COMMAND ---------- 64 | 65 | # DBTITLE 1,Convert into the normalized OCSF format 66 | sink_name = create_normalized_sink(HTTP_TABLE_NAME, spark=spark) 67 | 68 | @dlt.append_flow(name="zeek_http_normalized", target=sink_name) 69 | def write_normalized(): 70 | df = dlt.read_stream(zeek_http_table_name) 71 | # This could be incomplete mapping, but we can improve later 72 | df = df.selectExpr( 73 | "99 as activity_id", 74 | "4 as category_uid", 75 | "4002 as class_uid", 76 | "timestamp as time", 77 | "99 as severity_id", 78 | "400299 as type_uid", 79 | """named_struct( 80 | 'hostname', host, 81 | 'ip', id_response_host, 82 | 'port', id_response_port 83 | ) as dst_endpoint""", 84 | """named_struct( 85 | 'http_method', method, 86 | 'user_agent', user_agent, 87 | 'version', `version`, 88 | 'url', uri 89 | ) as http_request""", 90 | """named_struct( 91 | 'code', status_code 92 | ) as http_response""", 93 | """named_struct( 94 | 'product', 'zeek', 95 | 'version', '1.0.0', 96 | 'uid', uid, 97 | 'processed_time', 
ingest_time 98 | ) as metadata""", 99 | """named_struct ( 100 | 'ip', `id_origin_host`, 101 | 'port', `id_origin_port` 102 | ) as src_endpoint""", 103 | ) 104 | return df 105 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/ingest_zeek_conn.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from typing import Optional 9 | 10 | # COMMAND ---------- 11 | 12 | from helpers import get_qualified_table_name, NETWORK_TABLE_NAME, create_normalized_sink, sanitize_string_for_flow_name 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC We're using streaming tables + append flows to make sure that we can add more (or remove not used) source locations for this data type. 19 | 20 | # COMMAND ---------- 21 | 22 | zeek_conn_table_name = get_qualified_table_name("silver", "zeek_conn", spark) 23 | dlt.create_streaming_table( 24 | name=zeek_conn_table_name, 25 | cluster_by = ["timestamp"], 26 | ) 27 | 28 | # COMMAND ---------- 29 | 30 | zeek_conn_schema_hints = "`id.orig_p` int, `id.resp_p` int, ts double" 31 | zeek_conn_renames = { 32 | "id.orig_h": "id_origin_host", 33 | "id.orig_p": "id_origin_port", 34 | "id.resp_h": "id_response_host", 35 | "id.resp_p": "id_response_port", 36 | "orig_bytes": "origin_bytes", 37 | "resp_bytes": "response_bytes", 38 | "orig_pkts": "origin_packets", 39 | "orig_ip_bytes": "origin_ip_bytes", 40 | "resp_pkts": "response_packets", 41 | "resp_ip_bytes": "response_ip_bytes", 42 | "local_orig": "local_origin", 43 | "local_resp": "local_response", 44 | "resp_l2_addr": "response_l2_address", 45 | "orig_l2_addr": "origin_l2_address", 46 | "ts": "timestamp", 47 | } 48 | 49 | def create_zeek_conn_flow(input: str, add_opts: Optional[dict] = None): 50 | @dlt.append_flow(name=f"zeek_conn_{sanitize_string_for_flow_name(input)}", 51 | target=zeek_conn_table_name, 52 | comment=f"Ingesting from {input}") 53 | def flow(): 54 | autoloader_opts = { 55 | "cloudFiles.format": "json", 56 | "cloudFiles.schemaHints": zeek_conn_schema_hints, 57 | #"cloudFiles.useManagedFileEvents": "true", 58 | } | (add_opts or {}) 59 | df = spark.readStream.format("cloudFiles").options(**autoloader_opts).load(input) 60 | df = df.withColumnsRenamed(zeek_conn_renames) 61 | df = df.withColumns({ 62 | "timestamp": F.col("timestamp").cast("timestamp"), 63 | "ingest_time": F.current_timestamp(), 64 | }) 65 | return df 66 | 67 | # COMMAND ---------- 68 | 69 | zeek_conn_input = spark.conf.get("conf.zeek_conn_input") 70 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 71 | create_zeek_conn_flow(zeek_conn_input) 72 | 73 | # COMMAND ---------- 74 | 75 | sink_name = create_normalized_sink(NETWORK_TABLE_NAME, spark=spark) 76 | 77 | @dlt.append_flow(name="zeek_conn_normalized", target=sink_name) 78 | def write_normalized(): 79 | df = dlt.read_stream(zeek_conn_table_name) 80 | # This could be incomplete mapping, but we can improve later 81 | df = df.selectExpr( 82 | "99 as activity_id", 83 | "4 as category_uid", 84 | "4001 as class_uid", 85 | "timestamp as time", 86 | "99 as severity_id", 87 | "400199 as type_uid", 88 | "duration*1000 as duration", 89 | """named_struct( 90 | 'ip', id_response_host, 91 | 'port', id_response_port 92 | ) as dst_endpoint""", 93 | """named_struct( 94 | 'product', 'zeek', 95 | 'version', '1.0.0', 96 | 'uid', 
uid, 97 | 'processed_time', ingest_time 98 | ) as metadata""", 99 | """named_struct ( 100 | 'ip', id_origin_host, 101 | 'port', id_origin_port 102 | ) as src_endpoint""", 103 | """named_struct( 104 | 'bytes_in', response_bytes, 105 | 'packets_in', response_packets, 106 | 'bytes_out', origin_bytes, 107 | 'packets_out', origin_packets, 108 | 'bytes_missed', missed_bytes, 109 | 'bytes', response_bytes + origin_bytes, 110 | 'packets', response_packets + origin_packets 111 | ) as traffic""", 112 | """named_struct( 113 | 'direction_id', 0, 114 | 'protocol_name', proto, 115 | 'flag_history', history 116 | ) as connection_info""" 117 | ) 118 | return df 119 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/setup.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from helpers import * 3 | import os 4 | 5 | # COMMAND ---------- 6 | 7 | dbutils.widgets.text("catalog_name", "", "Catalog name") 8 | dbutils.widgets.text("schema_name", "", "Schema name") 9 | dbutils.widgets.text("volume_path", "", "UC Volume path for data") 10 | 11 | # COMMAND ---------- 12 | 13 | catalog_name = dbutils.widgets.get("catalog_name") 14 | schema_name = dbutils.widgets.get("schema_name") 15 | volume_path = dbutils.widgets.get("volume_path") 16 | 17 | # COMMAND ---------- 18 | 19 | if not catalog_name or not schema_name or not volume_path: 20 | raise Exception("Catalog name, Schema name and UC Volume path must be provided") 21 | 22 | # COMMAND ---------- 23 | 24 | spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}") 25 | for name in [HTTP_TABLE_NAME, NETWORK_TABLE_NAME]: 26 | table_name = get_normalized_table_name(name, catalog_name, schema_name) 27 | print(f"Creating table {table_name}") 28 | spark.sql(f"""CREATE TABLE IF NOT EXISTS {table_name} ( 29 | activity_id int, 30 | category_uid int, 31 | class_uid int, 32 | time timestamp 33 | )""") 34 | # spark.sql(f"TRUNCATE TABLE {table_name}") 35 | 36 | # COMMAND ---------- 37 | 38 | base_path = os.path.join(volume_path, "logs") 39 | for i in ["apache", "nginx", "zeek_conn", "zeek_http", "aws_cloudtrail"]: 40 | p = os.path.join(base_path, i) 41 | os.makedirs(p, exist_ok=True) 42 | 43 | # COMMAND ---------- 44 | 45 | import urllib.request 46 | 47 | # Apache logs 48 | urllib.request.urlretrieve("https://raw.githubusercontent.com/elastic/examples/refs/heads/master/Common%20Data%20Formats/apache_logs/apache_logs", 49 | os.path.join(base_path, "apache", "log1.txt")) 50 | # Nginx logs 51 | urllib.request.urlretrieve("https://raw.githubusercontent.com/elastic/examples/refs/heads/master/Common%20Data%20Formats/nginx_logs/nginx_logs", 52 | os.path.join(base_path, "nginx", "log1.txt")) 53 | # Zeek HTTP logs 54 | urllib.request.urlretrieve("https://raw.githubusercontent.com/ocsf/examples/refs/heads/main/raw_sample_log_dataset/Zeek/http.log", 55 | os.path.join(base_path, "zeek_http", "log1.txt")) 56 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/03/http.log.gz", 57 | os.path.join(base_path, "zeek_http", "log2.gz")) 58 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/12/http.log.gz", 59 | os.path.join(base_path, "zeek_http", "log3.gz")) 60 | # Zeek Conn logs 61 | urllib.request.urlretrieve("https://raw.githubusercontent.com/ocsf/examples/refs/heads/main/raw_sample_log_dataset/Zeek/conn.log", 62 | 
os.path.join(base_path, "zeek_conn", "log1.txt")) 63 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/12/conn.log.gz", 64 | os.path.join(base_path, "zeek_conn", "log2.gz")) 65 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/03/conn.log.gz", 66 | os.path.join(base_path, "zeek_conn", "log3.gz")) 67 | # AWS Cloudtrail logs 68 | urllib.request.urlretrieve("https://gist.githubusercontent.com/alexott/8ccd963811969d2446d2239e031a7b78/raw/2bae06ade977277dce4f580f214a0ed129f38810/data.jsonl", 69 | os.path.join(base_path, "aws_cloudtrail", "data.json")) 70 | 71 | 72 | # COMMAND ---------- 73 | 74 | table_name = get_normalized_table_name("iocs", catalog_name, schema_name) 75 | spark.sql(f"""CREATE TABLE IF NOT EXISTS {table_name} ( 76 | ioc_type string, 77 | ioc string 78 | ) COMMENT 'These are arbitrary IPs, not related to real IoCs - just for demo purposes' 79 | """) 80 | # These are arbitrary IPs, not related to real IoCs - just for demo purposes 81 | idf = spark.createDataFrame([['IPv4', '205.251.199.192'], 82 | ['IPv4', '54.148.114.85'], 83 | ['IPv4', '95.217.228.176'], 84 | ['IPv4', '190.104.181.125'], 85 | ], schema="ioc_type string, ioc string") 86 | idf.write.mode("overwrite").saveAsTable(table_name) 87 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/ingest_apache_web.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from typing import Optional 9 | 10 | # COMMAND ---------- 11 | 12 | from helpers import HTTP_TABLE_NAME, get_qualified_table_name, create_normalized_sink, sanitize_string_for_flow_name 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC We're using streaming tables + append flows to make sure that we can add more (or remove not used) source locations for this data type. 
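# MAGIC
# MAGIC As an illustrative sketch (not part of the pipeline configuration: `create_apache_web_flow` is defined further below, and the second path is a made-up placeholder), adding one more source location is just one more flow registration against the same streaming table:
# MAGIC
# MAGIC ```python
# MAGIC for p in [spark.conf.get("conf.apache_web_input"), "/Volumes/main/extra/apache_logs/"]:
# MAGIC     create_apache_web_flow(p)
# MAGIC ```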
19 | 20 | # COMMAND ---------- 21 | 22 | apache_web_table_name = get_qualified_table_name("silver", "apache_web", spark) 23 | dlt.create_streaming_table( 24 | name=apache_web_table_name, 25 | comment="Table for data parsed from Apache HTTP server-compatible logs", 26 | cluster_by = ["timestamp"], 27 | ) 28 | 29 | # COMMAND ---------- 30 | 31 | apache_web_regex = ( 32 | r'^(\S+) (\S+) (\S+) \[(.+?)] "(\w+) (\S+) ([^"]+)" (\d{3}) (\d+|-) "(.+)" "(.+)"?$' 33 | ) 34 | 35 | 36 | def read_apache_web(input: str, add_opts: Optional[dict] = None): 37 | autoloader_opts = { 38 | "cloudFiles.format": "text", 39 | #"cloudFiles.useManagedFileEvents": "true", 40 | } | (add_opts or {}) 41 | df = spark.readStream.format("cloudFiles").options(**autoloader_opts).load(input) 42 | df = df.withColumns( 43 | { 44 | "host": F.regexp_extract("value", apache_web_regex, 1), 45 | "user": F.regexp_extract("value", apache_web_regex, 3), 46 | "timestamp": F.unix_timestamp( 47 | F.regexp_extract("value", apache_web_regex, 4), "dd/MMM/yyyy:HH:mm:ss Z" 48 | ).cast("timestamp"), 49 | "method": F.regexp_extract("value", apache_web_regex, 5), 50 | "path": F.regexp_extract("value", apache_web_regex, 6), 51 | "version": F.regexp_extract("value", apache_web_regex, 7), 52 | "code": F.regexp_extract("value", apache_web_regex, 8).cast("int"), 53 | "size": F.regexp_extract("value", apache_web_regex, 9).cast("long"), 54 | "referrer": F.regexp_extract("value", apache_web_regex, 10), 55 | "agent": F.regexp_extract("value", apache_web_regex, 11), 56 | "ingest_time": F.current_timestamp(), 57 | } 58 | ) 59 | return df 60 | 61 | 62 | def create_apache_web_flow(input: str, add_opts: Optional[dict] = None): 63 | @dlt.append_flow( 64 | name=f"apache_web_{sanitize_string_for_flow_name(input)}", 65 | target=apache_web_table_name, 66 | comment=f"Ingesting from {input}", 67 | ) 68 | def flow(): 69 | return read_apache_web(input, add_opts) 70 | 71 | # COMMAND ---------- 72 | 73 | # DBTITLE 1,Handling of Apache Web logs 74 | apache_web_input = spark.conf.get("conf.apache_web_input") 75 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 76 | create_apache_web_flow(apache_web_input) 77 | 78 | # COMMAND ---------- 79 | 80 | # DBTITLE 1,Handling of NGINX logs (compatible with Apache Web) 81 | nginx_input = spark.conf.get("conf.nginx_input") 82 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 83 | create_apache_web_flow(nginx_input) 84 | 85 | # COMMAND ---------- 86 | 87 | sink_name = create_normalized_sink(HTTP_TABLE_NAME, spark=spark) 88 | 89 | @dlt.append_flow( 90 | name="apache_web_normalized", 91 | target=sink_name 92 | ) 93 | def write_normalized(): 94 | df = dlt.read_stream(apache_web_table_name) 95 | # This could be incomplete mapping, but we can improve later 96 | df = df.selectExpr( 97 | "99 as activity_id", 98 | "4 as category_uid", 99 | "4002 as class_uid", 100 | "timestamp as time", 101 | "99 as severity_id", 102 | "400299 as type_uid", 103 | """named_struct( 104 | 'hostname', host, 105 | 'ip', host 106 | ) as src_endpoint""", 107 | """named_struct( 108 | 'http_method', method, 109 | 'user_agent', agent, 110 | 'version', version, 111 | 'url', path, 112 | 'referrer', referrer 113 | ) as http_request""", 114 | """named_struct( 115 | 'code', code, 116 | 'length', size 117 | ) as http_response""", 118 | """named_struct( 119 | 'product', 'apache_web', 120 | 'version', '1.0.0', 121 | 'processed_time', ingest_time 122 | ) as metadata""", 
123 | ) 124 | return df 125 | -------------------------------------------------------------------------------- /dlt_modern_stuff/README.md: -------------------------------------------------------------------------------- 1 | # dlt_modern_stuff 2 | 3 | This directory contains a source code that demonstrates use of latest Delta Live Tables (DLT) features for cybersecurity use cases. You can find more information in the [blog post](https://alexott.blogspot.com/2025/03/effective-use-of-latest-dlt-features.html). 4 | 5 | In general, this project consists of three DLT pipelines that perform data ingestion, normalization to [Open Cybersecurity Schema Framework (OCSF)](https://schema.ocsf.io/), and doing a rudimentary detections against normalized data as it's shown on the image below: 6 | 7 | 1. Ingestion of Apache Web and Nginx logs into `apache_web` table and then normalizing it into a table corresponding to OCSF's HTTP activity. 8 | 2. Ingestion of Zeek data: 9 | * Zeek HTTP data into `zeek_http` table, and then normalizing it into an `http` table corresponding to OCSF's HTTP activity. 10 | * Zeek Conn data into `zeek_conn` table, and then normalizing it into a `network` table corresponding to OCSF's Network activity. 11 | 3. Detection pipeline that does the following: 12 | * Matches network connections data from `network` table against `iocs` table. 13 | * Checks HTTP logs from `http` table for admin pages scans from external parties. 14 | * All matches are stored in the `detections` table, and optionally pushed to EventHubs and/or Splunk. 15 | 16 | ![Implemented pipelines](images/cyber-pipeline-impl.png) 17 | 18 | 19 | ## Setting up & running 20 | 21 | > [!IMPORTANT] 22 | This bundle uses Serverless compute, so make sure that it's enabled for your workspace (works on [Databricks Free Edition](https://www.databricks.com/blog/introducing-databricks-free-edition) as well). If it's not, then you need to adjust parameters of the job and DLT pipelines! 23 | 24 | You can install the project two ways: 25 | 26 | 1. Using Databricks Assset Bundles (DABs) inside the Databricks Workspace (recommended): 27 | 1. Using DABs from the command line of your computer 28 | 29 | ### Setting it up using DABs in workspace 30 | 31 | 1. Create a [Git Folder](https://docs.databricks.com/aws/en/repos/) inside your Databricks workspace by cloning this repository. 32 | 33 | 2. Open the `dlt_modern_stuff/databricks.yaml` inside create Git Folder. 34 | 35 | 3. Adjust the following parameters inside the `databricks.yaml` (create necessary objects before use): 36 | 37 | - `catalog_name` - the name of the existing UC Catalog used in configuration. 38 | - `silver_schema_name` - the name of an existing UC Schema to put processed data of individual log sources. 39 | - `normalized_schema_name` - the name of an existing UC Schema to put tables with normalized data, IoCs and Detections tables. 40 | - `log_files_path` - the path to an existing UC Volume where raw log data will be stored. 41 | 42 | 4. Click **Deploy** button in the **Deployments** tab on the left - this will create necessary jobs and pipelines 43 | 44 | 5. Click **Run** button next to the `DLT Cyber Demo: Setup` job. 45 | 46 | 6. Click **Start pipeline** for DLT pipelines to process data and run detections (in the following order): 47 | 48 | - `DLT Cyber Demo: Ingest Zeek data` 49 | - `DLT Cyber Demo: Ingest Apache data` 50 | - `DLT Cyber Demo: Detections` 51 | 52 | ### Setting it up using DABs locally 53 | 54 | 1. 
Install the latest version of [Databricks CLI](https://docs.databricks.com/dev-tools/cli/databricks-cli.html). 55 | 56 | 2. Authenticate to your Databricks workspace, if you have not done so already: 57 | 58 | ```sh 59 | databricks configure 60 | ``` 61 | 62 | 3. Set environment variable `DATABRICKS_CONFIG_PROFILE` to the name of Databricks CLI profile you configured, and configure necessary variables in the `dev` profile of `databricks.yml` file. You need to specify the following (create necessary objects before use): 63 | 64 | - `catalog_name` - the name of the existing UC Catalog used in configuration. 65 | - `silver_schema_name` - the name of an existing UC Schema to put processed data of individual log sources. 66 | - `normalized_schema_name` - the name of an existing UC Schema to put tables with normalized data, IoCs and Detections tables. 67 | - `log_files_path` - the path to an existing UC Volume where raw log data will be stored. 68 | 69 | 4. To deploy a development copy of this project, type: 70 | 71 | ```sh 72 | databricks bundle deploy 73 | ``` 74 | 75 | 5. Run a job to set up the normalized tables and download sample log files: 76 | 77 | ```sh 78 | databricks bundle run dlt_cyber_demo_setup 79 | ``` 80 | 81 | 6. Run DLT pipelines to ingest data: 82 | 83 | ```sh 84 | databricks bundle run demo_ingest_zeek_data 85 | databricks bundle run demo_ingest_apache_data 86 | ``` 87 | 88 | 7. Run DLT pipeline that emulates detections against normalized data: 89 | 90 | ```sh 91 | databricks bundle run demo_detections 92 | ``` 93 | 94 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/detections.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install -U https://github.com/alexott/cyber-spark-data-connectors/releases/download/v0.0.4/cyber_spark_data_connectors-0.0.4-py3-none-any.whl 3 | 4 | # COMMAND ---------- 5 | 6 | import dlt 7 | 8 | # COMMAND ---------- 9 | 10 | import pyspark.sql.functions as F 11 | 12 | # COMMAND ---------- 13 | 14 | from helpers import get_normalized_table_name, NETWORK_TABLE_NAME, HTTP_TABLE_NAME, get_qualified_table_name 15 | 16 | # COMMAND ---------- 17 | 18 | detections_table_name = get_qualified_table_name("gold", "detections", spark) 19 | dlt.create_streaming_table( 20 | name=detections_table_name, 21 | comment="Streaming table for detections" 22 | ) 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 1,Check network traffic for connections to known IoCs 27 | @dlt.append_flow( 28 | target=detections_table_name, 29 | ) 30 | def match_network_iocs(): 31 | network_table = spark.readStream.table(get_normalized_table_name(NETWORK_TABLE_NAME)) 32 | iocs_table = spark.read.table(get_normalized_table_name("iocs")).filter("ioc_type = 'IPv4'") 33 | matches = network_table.join(iocs_table, network_table.dst_endpoint.ip == iocs_table.ioc) 34 | matches = matches.selectExpr( 35 | "to_json(struct(*)) as details", 36 | "current_timestamp() as detection_time", 37 | "'network' as detection_source", 38 | "'ioc_match' as detection_type", 39 | "'warn' as detection_level" 40 | ) 41 | return matches 42 | 43 | 44 | # COMMAND ---------- 45 | 46 | # DBTITLE 1,Check HTTP logs for scan of admin pages from external IPs 47 | @dlt.append_flow( 48 | target=detections_table_name, 49 | ) 50 | def check_http_logs_admin_scan(): 51 | http_table = spark.readStream.table(get_normalized_table_name(HTTP_TABLE_NAME)) 52 | matches = 
http_table.filter("http_request.url like '/admin%' and not (src_endpoint.ip like '192.168.%' or src_endpoint.ip like '10.%')") 53 | matches = matches.selectExpr( 54 | "to_json(struct(*)) as details", 55 | "current_timestamp() as detection_time", 56 | "'http' as detection_source", 57 | "'http_admin_page_scan' as detection_type", 58 | "'info' as detection_level" 59 | ) 60 | return matches 61 | 62 | 63 | # COMMAND ---------- 64 | 65 | push_to_eventhub = spark.conf.get("conf.push_to_eventhubs", "false") == "true" 66 | if push_to_eventhub: 67 | # name of EH namespace 68 | eh_ns = spark.conf.get("conf.eh_ns") 69 | # name of a topic in EH namespace 70 | eh_topic = spark.conf.get("conf.eh_topic") 71 | # Entra ID Tenant ID where service principal is created 72 | tenant_id = spark.conf.get("conf.azure_tenant_id") 73 | secret_scope = spark.conf.get("conf.secret_scope") 74 | sp_id_key_name = spark.conf.get("conf.sp_id_key_name") 75 | sp_secret_key_name = spark.conf.get("conf.sp_secret_key_name") 76 | client_id = dbutils.secrets.get(secret_scope, sp_id_key_name) # Application ID of service principal 77 | client_secret = dbutils.secrets.get(secret_scope, sp_secret_key_name) # Client secret of service principal 78 | # fully qualified name of the Event Hubs server 79 | eh_server = eh_server = f"{eh_ns}.servicebus.windows.net" 80 | # SASL config for Kafka to connect to Event Hubs 81 | sasl_config = f'kafkashaded.org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule' + \ 82 | f' required clientId="{client_id}" clientSecret="{client_secret}"' + \ 83 | f' scope="https://{eh_server}/.default" ssl.protocol="SSL";' 84 | # Callback class for OAuth authentication 85 | callback_class = "kafkashaded.org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler" 86 | # OAuth endpoint for Entra ID 87 | oauth_endpoint = f"https://login.microsoft.com/{tenant_id}/oauth2/v2.0/token" 88 | 89 | dlt.create_sink( 90 | "eventhubs", 91 | "kafka", 92 | { # Create Kafka options dictionary for connection with OAuth authentication 93 | "kafka.bootstrap.servers": f"{eh_server}:9093", 94 | "topic": eh_topic, 95 | "kafka.security.protocol": "SASL_SSL", 96 | "kafka.sasl.mechanism": "OAUTHBEARER", 97 | "kafka.sasl.jaas.config": sasl_config, 98 | "kafka.sasl.oauthbearer.token.endpoint.url": oauth_endpoint, 99 | "kafka.sasl.login.callback.handler.class": callback_class, 100 | "kafka.request.timeout.ms": "60000", 101 | "kafka.session.timeout.ms": "30000", 102 | } 103 | ) 104 | 105 | @dlt.append_flow(name = "write_alerts", target = "eventhubs") 106 | def flowFunc(): 107 | df = dlt.read_stream(detections_table_name) 108 | return df.select(F.to_json(F.struct("*")).alias("value")) 109 | 110 | # COMMAND ---------- 111 | 112 | push_to_splunk = spark.conf.get("conf.push_to_splunk", "false") == "true" 113 | if push_to_splunk: 114 | from cyber_connectors import * 115 | spark.dataSource.register(SplunkDataSource) 116 | 117 | splunk_opts = { 118 | "url": spark.conf.get("conf.splunk_url") , 119 | "token": spark.conf.get("conf.splunk_hec_token"), 120 | "time_column": "detection_time", 121 | "source": "dlt", 122 | } 123 | dlt.create_sink("splunk", "splunk", splunk_opts) 124 | 125 | @dlt.append_flow(name = "write_to_splunk", target = "splunk") 126 | def flowFunc(): 127 | df = dlt.read_stream(detections_table_name) 128 | df = df.withColumn("details", F.from_json("details", "map")) 129 | return df 130 | -------------------------------------------------------------------------------- 
/dlt_modern_stuff/src/ingest_aws_cloudtrail.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | from pyspark.sql import DataFrame 8 | 9 | from typing import Optional 10 | 11 | import random 12 | import datetime 13 | 14 | # COMMAND ---------- 15 | 16 | from helpers import get_qualified_table_name, NETWORK_TABLE_NAME, create_normalized_sink, sanitize_string_for_flow_name 17 | 18 | # COMMAND ---------- 19 | 20 | aws_cloudtrail_table_name = get_qualified_table_name("silver", "aws_cloudtrail", spark) 21 | dlt.create_streaming_table( 22 | name=aws_cloudtrail_table_name, 23 | cluster_by = ["event_name", "event_time"], 24 | table_properties = { 25 | # These properties are necessary for Variant support 26 | "delta.minWriterVersion": "7", 27 | "delta.enableDeletionVectors": "true", 28 | "delta.minReaderVersion": "3", 29 | "delta.feature.variantType-preview": "supported", 30 | "delta.feature.deletionVectors": "supported", 31 | }, 32 | ) 33 | 34 | # COMMAND ---------- 35 | 36 | def normalize_aws_cloudtrail(df: DataFrame, raw_column_name: str = "_raw") -> DataFrame: 37 | df = df.selectExpr(f"{raw_column_name}:Records::array as _records") 38 | df = df.select(F.explode("_records").alias("_record")) 39 | df = df.selectExpr("*", "_record:resources::variant as resources") 40 | 41 | view_name = f"cloudtrail_{int(datetime.datetime.now().timestamp())}_{random.randint(0, 1000)}" 42 | df.createOrReplaceTempView(view_name) 43 | 44 | # TODO: rewrite to PySpark when we get support for `variant_explode_outer` in DLT 45 | # https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.tvf.TableValuedFunction.variant_explode_outer.html 46 | df = spark.sql(f"""SELECT 47 | _record:awsRegion::string as aws_region, 48 | _record:eventID::string as event_id, 49 | _record:eventName::string as event_name, 50 | _record:eventSource::string as event_source, 51 | _record:eventType::string as event_type, 52 | _record:eventTime::timestamp as event_time, 53 | _record:eventVersion::string as event_version, 54 | _record:recipientAccountId::string as recipient_account_id, 55 | _record:requestID::string as request_id, 56 | _record:requestParameters::variant as request_parameters, 57 | _record:responseElements::variant as response_elements, 58 | _record:responseElements.user.arn::string as response_user_arn, 59 | _record:responseElements.role.arn::string as response_role_arn, 60 | _record:responseElements.policy.arn::string as response_policy_arn, 61 | resource.value:accountId::string as resources_account_id, 62 | _record:sourceIPAddress::string as source_ip_address, 63 | _record:userAgent::string as user_agent, 64 | _record:userIdentity::variant as user_identity, 65 | _record:userIdentity.type::string as user_identity_type, 66 | _record:userIdentity.principalId::string as user_identity_principal, 67 | _record:userIdentity.arn::string as user_identity_arn, 68 | _record:userIdentity.accountId::string as user_identity_account_id, 69 | _record:userIdentity.invokedBy::string as user_identity_invoked_by, 70 | _record:userIdentity.accessKeyId::string as user_identity_access_ke, 71 | _record:userIdentity.userName::string as user_identity_username, 72 | _record:userIdentity.sessionContext.attributes.mfaAuthenticated::boolean as user_identity_session_context_attributes_mfa_authenticated, 73 | _record:userIdentity.sessionContext.attributes.creationDate::string as 
user_identity_session_context_attributes_creation_date, 74 | _record:userIdentity.sessionContext.sessionIssuer.type::string as user_identity_session_context_session_issuer_type, 75 | _record:userIdentity.sessionContext.sessionIssuer.principalId::string as user_identity_session_context_session_issuer_principal_id, 76 | _record:userIdentity.sessionContext.sessionIssuer.arn::string as user_identity_session_context_session_issuer_arn, 77 | _record:userIdentity.sessionContext.sessionIssuer.accountId::string as user_identity_session_context_session_issuer_account_id, 78 | _record:userIdentity.sessionContext.sessionIssuer.userName::string as user_identity_session_context_session_issuer_user_name, 79 | _record:errorCode::string as error_code, 80 | _record:errorMessage::string as error_message, 81 | _record:additionalEventData::variant as additional_event_data, 82 | _record:apiVersion::string as api_version, 83 | _record:readOnly::boolean as read_only, 84 | _record:serviceEventDetails::string as service_event_details, 85 | _record:sharedEventId::string as shared_event_id, 86 | _record:vpcEndpointId::string as vpc_endpoint_id, 87 | _record, resource.value as resource 88 | FROM {view_name}, LATERAL variant_explode_outer(resources) as resource""") 89 | 90 | return df 91 | 92 | # COMMAND ---------- 93 | 94 | def read_aws_cloudtrail(input: str, add_opts: Optional[dict] = None) -> DataFrame: 95 | autoloader_opts = { 96 | "cloudFiles.format": "json", 97 | "singleVariantColumn": "_raw", 98 | #"cloudFiles.useManagedFileEvents": "true", 99 | } | (add_opts or {}) 100 | df = spark.readStream.format("cloudFiles") \ 101 | .options(**autoloader_opts).load(input) 102 | return normalize_aws_cloudtrail(df) 103 | 104 | # COMMAND ---------- 105 | 106 | # #sdf = read_aws_cloudtrail("/Volumes/cybersecurity/logs/logs/aws-cloudtrail/") 107 | # sdf = read_aws_cloudtrail("/Volumes/cybersecurity/logs/logs/demo/logs/aws_cloudtrail/") 108 | # display(sdf) 109 | 110 | # COMMAND ---------- 111 | 112 | def create_aws_cloudtrail_flow(input: str, add_opts: Optional[dict] = None): 113 | @dlt.append_flow( 114 | name=f"aws_cloudtrail_{sanitize_string_for_flow_name(input)}", 115 | target=aws_cloudtrail_table_name, 116 | comment=f"Ingesting from {input}", 117 | ) 118 | def flow(): 119 | return read_aws_cloudtrail(input, add_opts) 120 | 121 | # COMMAND ---------- 122 | 123 | aws_cloudtrail_input = spark.conf.get("conf.aws_cloudtrail_input") 124 | # We're using the input location as-is, but it could also be passed as a list to generate multiple flows 125 | create_aws_cloudtrail_flow(aws_cloudtrail_input) 126 | -------------------------------------------------------------------------------- /iocs-ingest/IoCs Silver.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | from typing import List, Dict, Any, Optional 4 | 5 | from delta.tables import DeltaTable 6 | from pyspark.sql import DataFrame, SparkSession 7 | import pyspark.sql.functions as F 8 | 9 | # COMMAND ---------- 10 | 11 | # Variables...
12 | base_dir = "/mnt/cyberdata" 13 | iocs_table_name = "cyber.iocs" 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %run "./IoCs Common" 18 | 19 | # COMMAND ---------- 20 | 21 | df = spark.readStream\ 22 | .format("delta")\ 23 | .option("ignoreChanges", "true") \ 24 | .load(f"{base_dir}/bronze/threatintel/") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ## TODOs 29 | # MAGIC 30 | # MAGIC * \[X\] Extract `first_seen` timestamp either from data, or based on the first timestamp from the threat intel feed 31 | # MAGIC * \[ \] Think about how to handle `last_seen` over time... For example, set `last_seen` for IPs to "first_seen + N days"? 32 | 33 | # COMMAND ---------- 34 | 35 | # DBTITLE 1,Do the base decode that will be used for all pipelines 36 | message_df = df.select("dataset", "timestamp", "msg_hash", F.from_json("value", "message string").alias("json")) \ 37 | .select("*", "json.message").drop("json") 38 | 39 | # COMMAND ---------- 40 | 41 | spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true") 42 | 43 | # COMMAND ---------- 44 | 45 | def process_single_source(input_df: DataFrame, service: str, transform_func): 46 | checkpoint = f"{base_dir}/checkpoints/threatintel-silver-{service}/" 47 | transform_func(input_df).writeStream.format("delta") \ 48 | .option("checkpointLocation", checkpoint) \ 49 | .trigger(availableNow=True) \ 50 | .foreachBatch(lambda df, epoch: drop_duplicates_with_merge(df, primary_key_columns=["ioc_type", "ioc"], 51 | path=f"{base_dir}/silver/threatintel/", table_name=iocs_table_name, 52 | partitionby=["ioc_type"], opts={"mergeSchema": "true"})) \ 53 | .start().awaitTermination() 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md ## Handle OTX (Open Threat Exchange) feed 58 | 59 | # COMMAND ---------- 60 | 61 | def handle_otx(df: DataFrame) -> DataFrame: 62 | otx_schema = 'STRUCT' 63 | otx_df = df.filter("dataset = 'otx'").select("*", F.from_json("message", otx_schema).alias("jsn")).select("*", "jsn.*").drop("jsn", "message") 64 | otx_df = otx_df.withColumnRenamed("type", "ioc_type") \ 65 | .withColumnRenamed("indicator", "ioc") \ 66 | .withColumn("ioc_id", F.col("id").cast("string")) \ 67 | .withColumnRenamed("description", "ioc_description") \ 68 | .withColumnRenamed("content", "ioc_content") \ 69 | .withColumnRenamed("title", "ioc_title") \ 70 | .withColumn("first_seen", F.col("timestamp")) 71 | return otx_df 72 | 73 | # COMMAND ---------- 74 | 75 | process_single_source(message_df, "otx", handle_otx) 76 | 77 | # COMMAND ---------- 78 | 79 | # MAGIC %md ## Handle AbuseURL feed 80 | 81 | # COMMAND ---------- 82 | 83 | def handle_abuseurl(df: DataFrame) -> DataFrame: 84 | abuseurl_schema = 'STRUCT, date_added: STRING, host: STRING, id: STRING, larted: STRING, reporter: STRING, tags: ARRAY, threat: STRING, url: STRING, url_status: STRING, urlhaus_reference: STRING>' 85 | abuseurl_df = df.filter("dataset = 'abuseurl'").select("*", F.from_json("message", abuseurl_schema).alias("abuseurl")) \ 86 | .selectExpr("*", "abuseurl.url as ioc", "abuseurl.id as ioc_id", "'URL' as ioc_type", 87 | "cast(abuseurl.date_added as timestamp) as first_seen").drop("message") 88 | return abuseurl_df 89 | 90 | # COMMAND ---------- 91 | 92 | process_single_source(message_df, "abuseurl", handle_abuseurl) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md ## Handle Malware Bazaar feed 97 | 98 | # COMMAND ---------- 99 | 100 | def handle_malwarebazaar(df: DataFrame) -> DataFrame: 101 | malwarebazaar_schema = "STRUCT, dhash_icon: STRING, file_name: STRING, file_size:
BIGINT, file_type: STRING, file_type_mime: STRING, first_seen: STRING, imphash: STRING, intelligence: STRUCT, last_seen: STRING, md5_hash: STRING, origin_country: STRING, reporter: STRING, sha1_hash: STRING, sha256_hash: STRING, sha3_384_hash: STRING, signature: STRING, ssdeep: STRING, tags: ARRAY, telfhash: STRING, tlsh: STRING>" 102 | 103 | mb_df = df.filter("dataset = 'malwarebazaar'").select("*", F.from_json("message", malwarebazaar_schema).alias("malwarebazaar")) \ 104 | .select("malwarebazaar.file_name", "malwarebazaar.file_size", "malwarebazaar.file_type", "malwarebazaar.file_type_mime", 105 | F.expr("cast(malwarebazaar.first_seen as timestamp) as first_seen"), 106 | F.expr("cast(malwarebazaar.last_seen as timestamp) as last_seen"), 107 | F.posexplode(F.create_map(F.lit('FileHash-MD5'), "malwarebazaar.md5_hash", 108 | F.lit('FileHash-ImpHash'), "malwarebazaar.imphash", 109 | F.lit('FileHash-SHA1'), "malwarebazaar.sha1_hash", 110 | F.lit('FileHash-SHA256'), "malwarebazaar.sha256_hash", 111 | F.lit('FileHash-SHA384'), "malwarebazaar.sha3_384_hash", 112 | F.lit('FileHash-SSDEEP'), "malwarebazaar.ssdeep", 113 | F.lit('FileHash-TElfHash'), "malwarebazaar.telfhash", 114 | F.lit('FileHash-TLSH'), "malwarebazaar.tlsh").alias("_map_")), 115 | "*").drop("pos", "message").withColumnRenamed("key", "ioc_type").withColumnRenamed("value", "ioc").filter("ioc is not null") 116 | return mb_df 117 | 118 | # COMMAND ---------- 119 | 120 | process_single_source(message_df, "malwarebazaar", handle_malwarebazaar) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md ## Handle Abuse Malware feed 125 | 126 | # COMMAND ---------- 127 | 128 | def handle_abusemalware(df: DataFrame) -> DataFrame: 129 | abusemalware_schema = "STRUCT" 130 | am_df = df.filter("dataset = 'abusemalware'").select("*", F.from_json("message", abusemalware_schema).alias("abusemalware")) \ 131 | .select(F.expr("cast(abusemalware.file_size as long) as file_size"), "abusemalware.file_type", 132 | F.expr("cast(abusemalware.firstseen as timestamp) as first_seen"), 133 | F.posexplode(F.create_map(F.lit('FileHash-MD5'), "abusemalware.md5_hash", 134 | F.lit('FileHash-ImpHash'), "abusemalware.imphash", 135 | F.lit('FileHash-SHA256'), "abusemalware.sha256_hash", 136 | F.lit('FileHash-SSDEEP'), "abusemalware.ssdeep", 137 | F.lit('FileHash-TLSH'), "abusemalware.tlsh").alias("_map_")), 138 | "*").drop("pos", "message").withColumnRenamed("key", "ioc_type").withColumnRenamed("value", "ioc").filter("ioc is not null") 139 | return am_df 140 | 141 | # COMMAND ---------- 142 | 143 | process_single_source(message_df, "abusemalware", handle_abusemalware) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %sql 148 | # MAGIC -- do only after the table is created 149 | # MAGIC -- CREATE BLOOMFILTER INDEX ON TABLE cyber.iocs FOR COLUMNS( ioc ) 150 | -------------------------------------------------------------------------------- /uc-udfs/protocols.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## User-defined functions to work with network protocols 5 | -- MAGIC 6 | -- MAGIC Mapping is as defined by [IANA](https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml) 7 | 8 | -- COMMAND ---------- 9 | 10 | CREATE OR REPLACE FUNCTION proto_name_to_code(name STRING) 11 | RETURNS INT 12 | COMMENT 'Maps network protocol name into a numeric code as per IANA' 13 | RETURN CASE lower(name) 14 | WHEN 'hopopt' THEN 0 15 | WHEN 'icmp' THEN 1 
16 | WHEN 'igmp' THEN 2 17 | WHEN 'ggp' THEN 3 18 | WHEN 'ipv4' THEN 4 19 | WHEN 'st' THEN 5 20 | WHEN 'tcp' THEN 6 21 | WHEN 'cbt' THEN 7 22 | WHEN 'egp' THEN 8 23 | WHEN 'igp' THEN 9 24 | WHEN 'bbn-rcc-mon' THEN 10 25 | WHEN 'nvp-ii' THEN 11 26 | WHEN 'pup' THEN 12 27 | WHEN 'argus' THEN 13 28 | WHEN 'emcon' THEN 14 29 | WHEN 'xnet' THEN 15 30 | WHEN 'chaos' THEN 16 31 | WHEN 'udp' THEN 17 32 | WHEN 'mux' THEN 18 33 | WHEN 'dcn-meas' THEN 19 34 | WHEN 'hmp' THEN 20 35 | WHEN 'prm' THEN 21 36 | WHEN 'xns-idp' THEN 22 37 | WHEN 'trunk-1' THEN 23 38 | WHEN 'trunk-2' THEN 24 39 | WHEN 'leaf-1' THEN 25 40 | WHEN 'leaf-2' THEN 26 41 | WHEN 'rdp' THEN 27 42 | WHEN 'irtp' THEN 28 43 | WHEN 'iso-tp4' THEN 29 44 | WHEN 'netblt' THEN 30 45 | WHEN 'mfe-nsp' THEN 31 46 | WHEN 'merit-inp' THEN 32 47 | WHEN 'dccp' THEN 33 48 | WHEN '3pc' THEN 34 49 | WHEN 'idpr' THEN 35 50 | WHEN 'xtp' THEN 36 51 | WHEN 'ddp' THEN 37 52 | WHEN 'idpr-cmtp' THEN 38 53 | WHEN 'tp++' THEN 39 54 | WHEN 'il' THEN 40 55 | WHEN 'ipv6' THEN 41 56 | WHEN 'sdrp' THEN 42 57 | WHEN 'ipv6-route' THEN 43 58 | WHEN 'ipv6-frag' THEN 44 59 | WHEN 'idrp' THEN 45 60 | WHEN 'rsvp' THEN 46 61 | WHEN 'gre' THEN 47 62 | WHEN 'dsr' THEN 48 63 | WHEN 'bna' THEN 49 64 | WHEN 'esp' THEN 50 65 | WHEN 'ah' THEN 51 66 | WHEN 'i-nlsp' THEN 52 67 | WHEN 'swipe' THEN 53 68 | WHEN 'narp' THEN 54 69 | WHEN 'min-ipv4' THEN 55 70 | WHEN 'tlsp' THEN 56 71 | WHEN 'skip' THEN 57 72 | WHEN 'ipv6-icmp' THEN 58 73 | WHEN 'ipv6-nonxt' THEN 59 74 | WHEN 'ipv6-opts' THEN 60 75 | WHEN 'cftp' THEN 62 76 | WHEN 'sat-expak' THEN 64 77 | WHEN 'kryptolan' THEN 65 78 | WHEN 'rvd' THEN 66 79 | WHEN 'ippc' THEN 67 80 | WHEN 'sat-mon' THEN 69 81 | WHEN 'visa' THEN 70 82 | WHEN 'ipcv' THEN 71 83 | WHEN 'cpnx' THEN 72 84 | WHEN 'cphb' THEN 73 85 | WHEN 'wsn' THEN 74 86 | WHEN 'pvp' THEN 75 87 | WHEN 'br-sat-mon' THEN 76 88 | WHEN 'sun-nd' THEN 77 89 | WHEN 'wb-mon' THEN 78 90 | WHEN 'wb-expak' THEN 79 91 | WHEN 'iso-ip' THEN 80 92 | WHEN 'vmtp' THEN 81 93 | WHEN 'secure-vmtp' THEN 82 94 | WHEN 'vines' THEN 83 95 | WHEN 'iptm' THEN 84 96 | WHEN 'nsfnet-igp' THEN 85 97 | WHEN 'dgp' THEN 86 98 | WHEN 'tcf' THEN 87 99 | WHEN 'eigrp' THEN 88 100 | WHEN 'ospfigp' THEN 89 101 | WHEN 'sprite-rpc' THEN 90 102 | WHEN 'larp' THEN 91 103 | WHEN 'mtp' THEN 92 104 | WHEN 'ax.25' THEN 93 105 | WHEN 'ipip' THEN 94 106 | WHEN 'micp' THEN 95 107 | WHEN 'scc-sp' THEN 96 108 | WHEN 'etherip' THEN 97 109 | WHEN 'encap' THEN 98 110 | WHEN 'gmtp' THEN 100 111 | WHEN 'ifmp' THEN 101 112 | WHEN 'pnni' THEN 102 113 | WHEN 'pim' THEN 103 114 | WHEN 'aris' THEN 104 115 | WHEN 'scps' THEN 105 116 | WHEN 'qnx' THEN 106 117 | WHEN 'a/n' THEN 107 118 | WHEN 'ipcomp' THEN 108 119 | WHEN 'snp' THEN 109 120 | WHEN 'compaq-peer' THEN 110 121 | WHEN 'ipx-in-ip' THEN 111 122 | WHEN 'vrrp' THEN 112 123 | WHEN 'pgm' THEN 113 124 | WHEN 'l2tp' THEN 115 125 | WHEN 'ddx' THEN 116 126 | WHEN 'iatp' THEN 117 127 | WHEN 'stp' THEN 118 128 | WHEN 'srp' THEN 119 129 | WHEN 'uti' THEN 120 130 | WHEN 'smp' THEN 121 131 | WHEN 'sm' THEN 122 132 | WHEN 'ptp' THEN 123 133 | WHEN 'isis over ipv4' THEN 124 134 | WHEN 'fire' THEN 125 135 | WHEN 'crtp' THEN 126 136 | WHEN 'crudp' THEN 127 137 | WHEN 'sscopmce' THEN 128 138 | WHEN 'iplt' THEN 129 139 | WHEN 'sps' THEN 130 140 | WHEN 'pipe' THEN 131 141 | WHEN 'sctp' THEN 132 142 | WHEN 'fc' THEN 133 143 | WHEN 'rsvp-e2e-ignore' THEN 134 144 | WHEN 'mobility header' THEN 135 145 | WHEN 'udplite' THEN 136 146 | WHEN 'mpls-in-ip' THEN 137 147 | WHEN 'manet' THEN 138 148 | WHEN 'hip' 
THEN 139 149 | WHEN 'shim6' THEN 140 150 | WHEN 'wesp' THEN 141 151 | WHEN 'rohc' THEN 142 152 | WHEN 'ethernet' THEN 143 153 | WHEN 'aggfrag' THEN 144 154 | WHEN 'nsh' THEN 145 155 | WHEN 'homa' THEN 146 156 | WHEN 'bit-emu' THEN 147 157 | WHEN 'reserved' THEN 255 158 | ELSE NULL 159 | END; 160 | 161 | -- COMMAND ---------- 162 | 163 | CREATE OR REPLACE FUNCTION proto_code_to_name(code INT) 164 | RETURNS STRING 165 | COMMENT 'Maps network protocol numeric code into the name as per IANA' 166 | RETURN 167 | CASE code 168 | WHEN 0 THEN 'HOPOPT' 169 | WHEN 1 THEN 'ICMP' 170 | WHEN 2 THEN 'IGMP' 171 | WHEN 3 THEN 'GGP' 172 | WHEN 4 THEN 'IPv4' 173 | WHEN 5 THEN 'ST' 174 | WHEN 6 THEN 'TCP' 175 | WHEN 7 THEN 'CBT' 176 | WHEN 8 THEN 'EGP' 177 | WHEN 9 THEN 'IGP' 178 | WHEN 10 THEN 'BBN-RCC-MON' 179 | WHEN 11 THEN 'NVP-II' 180 | WHEN 12 THEN 'PUP' 181 | WHEN 13 THEN 'ARGUS (deprecated)' 182 | WHEN 14 THEN 'EMCON' 183 | WHEN 15 THEN 'XNET' 184 | WHEN 16 THEN 'CHAOS' 185 | WHEN 17 THEN 'UDP' 186 | WHEN 18 THEN 'MUX' 187 | WHEN 19 THEN 'DCN-MEAS' 188 | WHEN 20 THEN 'HMP' 189 | WHEN 21 THEN 'PRM' 190 | WHEN 22 THEN 'XNS-IDP' 191 | WHEN 23 THEN 'TRUNK-1' 192 | WHEN 24 THEN 'TRUNK-2' 193 | WHEN 25 THEN 'LEAF-1' 194 | WHEN 26 THEN 'LEAF-2' 195 | WHEN 27 THEN 'RDP' 196 | WHEN 28 THEN 'IRTP' 197 | WHEN 29 THEN 'ISO-TP4' 198 | WHEN 30 THEN 'NETBLT' 199 | WHEN 31 THEN 'MFE-NSP' 200 | WHEN 32 THEN 'MERIT-INP' 201 | WHEN 33 THEN 'DCCP' 202 | WHEN 34 THEN '3PC' 203 | WHEN 35 THEN 'IDPR' 204 | WHEN 36 THEN 'XTP' 205 | WHEN 37 THEN 'DDP' 206 | WHEN 38 THEN 'IDPR-CMTP' 207 | WHEN 39 THEN 'TP++' 208 | WHEN 40 THEN 'IL' 209 | WHEN 41 THEN 'IPv6' 210 | WHEN 42 THEN 'SDRP' 211 | WHEN 43 THEN 'IPv6-Route' 212 | WHEN 44 THEN 'IPv6-Frag' 213 | WHEN 45 THEN 'IDRP' 214 | WHEN 46 THEN 'RSVP' 215 | WHEN 47 THEN 'GRE' 216 | WHEN 48 THEN 'DSR' 217 | WHEN 49 THEN 'BNA' 218 | WHEN 50 THEN 'ESP' 219 | WHEN 51 THEN 'AH' 220 | WHEN 52 THEN 'I-NLSP' 221 | WHEN 53 THEN 'SWIPE (deprecated)' 222 | WHEN 54 THEN 'NARP' 223 | WHEN 55 THEN 'Min-IPv4' 224 | WHEN 56 THEN 'TLSP' 225 | WHEN 57 THEN 'SKIP' 226 | WHEN 58 THEN 'IPv6-ICMP' 227 | WHEN 59 THEN 'IPv6-NoNxt' 228 | WHEN 60 THEN 'IPv6-Opts' 229 | WHEN 61 THEN 'Any host internal protocol' 230 | WHEN 62 THEN 'CFTP' 231 | WHEN 63 THEN 'Any local network' 232 | WHEN 64 THEN 'SAT-EXPAK' 233 | WHEN 65 THEN 'KRYPTOLAN' 234 | WHEN 66 THEN 'RVD' 235 | WHEN 67 THEN 'IPPC' 236 | WHEN 68 THEN 'Any distributed file system' 237 | WHEN 69 THEN 'SAT-MON' 238 | WHEN 70 THEN 'VISA' 239 | WHEN 71 THEN 'IPCV' 240 | WHEN 72 THEN 'CPNX' 241 | WHEN 73 THEN 'CPHB' 242 | WHEN 74 THEN 'WSN' 243 | WHEN 75 THEN 'PVP' 244 | WHEN 76 THEN 'BR-SAT-MON' 245 | WHEN 77 THEN 'SUN-ND' 246 | WHEN 78 THEN 'WB-MON' 247 | WHEN 79 THEN 'WB-EXPAK' 248 | WHEN 80 THEN 'ISO-IP' 249 | WHEN 81 THEN 'VMTP' 250 | WHEN 82 THEN 'SECURE-VMTP' 251 | WHEN 83 THEN 'VINES' 252 | WHEN 84 THEN 'IPTM' 253 | WHEN 85 THEN 'NSFNET-IGP' 254 | WHEN 86 THEN 'DGP' 255 | WHEN 87 THEN 'TCF' 256 | WHEN 88 THEN 'EIGRP' 257 | WHEN 89 THEN 'OSPFIGP' 258 | WHEN 90 THEN 'Sprite-RPC' 259 | WHEN 91 THEN 'LARP' 260 | WHEN 92 THEN 'MTP' 261 | WHEN 93 THEN 'AX.25' 262 | WHEN 94 THEN 'IPIP' 263 | WHEN 95 THEN 'MICP (deprecated)' 264 | WHEN 96 THEN 'SCC-SP' 265 | WHEN 97 THEN 'ETHERIP' 266 | WHEN 98 THEN 'ENCAP' 267 | WHEN 99 THEN 'Any private encryption scheme' 268 | WHEN 100 THEN 'GMTP' 269 | WHEN 101 THEN 'IFMP' 270 | WHEN 102 THEN 'PNNI' 271 | WHEN 103 THEN 'PIM' 272 | WHEN 104 THEN 'ARIS' 273 | WHEN 105 THEN 'SCPS' 274 | WHEN 106 THEN 'QNX' 275 | WHEN 107 THEN 
'A/N' 276 | WHEN 108 THEN 'IPComp' 277 | WHEN 109 THEN 'SNP' 278 | WHEN 110 THEN 'Compaq-Peer' 279 | WHEN 111 THEN 'IPX-in-IP' 280 | WHEN 112 THEN 'VRRP' 281 | WHEN 113 THEN 'PGM' 282 | WHEN 114 THEN 'Any 0-hop protocol' 283 | WHEN 115 THEN 'L2TP' 284 | WHEN 116 THEN 'DDX' 285 | WHEN 117 THEN 'IATP' 286 | WHEN 118 THEN 'STP' 287 | WHEN 119 THEN 'SRP' 288 | WHEN 120 THEN 'UTI' 289 | WHEN 121 THEN 'SMP' 290 | WHEN 122 THEN 'SM (deprecated)' 291 | WHEN 123 THEN 'PTP' 292 | WHEN 124 THEN 'ISIS over IPv4' 293 | WHEN 125 THEN 'FIRE' 294 | WHEN 126 THEN 'CRTP' 295 | WHEN 127 THEN 'CRUDP' 296 | WHEN 128 THEN 'SSCOPMCE' 297 | WHEN 129 THEN 'IPLT' 298 | WHEN 130 THEN 'SPS' 299 | WHEN 131 THEN 'PIPE' 300 | WHEN 132 THEN 'SCTP' 301 | WHEN 133 THEN 'FC' 302 | WHEN 134 THEN 'RSVP-E2E-IGNORE' 303 | WHEN 135 THEN 'Mobility Header' 304 | WHEN 136 THEN 'UDPLite' 305 | WHEN 137 THEN 'MPLS-in-IP' 306 | WHEN 138 THEN 'manet' 307 | WHEN 139 THEN 'HIP' 308 | WHEN 140 THEN 'Shim6' 309 | WHEN 141 THEN 'WESP' 310 | WHEN 142 THEN 'ROHC' 311 | WHEN 143 THEN 'Ethernet' 312 | WHEN 144 THEN 'AGGFRAG' 313 | WHEN 145 THEN 'NSH' 314 | WHEN 146 THEN 'Homa' 315 | WHEN 147 THEN 'BIT-EMU' 316 | WHEN 253 THEN 'Experimentation & testing' 317 | WHEN 254 THEN 'Experimentation & testing' 318 | WHEN 255 THEN 'Reserved' 319 | ELSE 'Unassigned' 320 | END; 321 | --------------------------------------------------------------------------------
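As a quick illustration of how these two functions could be called once they are registered in Unity Catalog (the `main.default` qualification below is an assumption for illustration, not something this repo defines):

# Minimal usage sketch for the protocol-mapping UDFs defined in protocols.sql.
# Assumes the functions were created in the main.default schema; adjust the
# three-level name to the catalog/schema where they were actually registered.
row = spark.sql("""
    SELECT main.default.proto_code_to_name(6)      AS proto_name,   -- 'TCP'
           main.default.proto_name_to_code('udp')  AS proto_code    -- 17
""").first()
print(row.proto_name, row.proto_code)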