├── dlt_modern_stuff ├── src │ ├── __init__.py │ ├── helpers.py │ ├── ingest_zeek_http.py │ ├── ingest_zeek_conn.py │ ├── setup.py │ ├── ingest_apache_web.py │ ├── detections.py │ └── ingest_aws_cloudtrail.py ├── resources │ ├── .gitkeep │ ├── dlt_cyber_demo_setup.job.yml │ ├── demo_ingest_aws_cloudtrail_data.pipeline.yml │ ├── demo_ingest_apache_data.pipeline.yml │ ├── demo_ingest_zeek_data.pipeline.yml │ ├── demo_detections.pipeline.yml │ └── variables.yml ├── .gitignore ├── images │ └── cyber-pipeline-impl.png ├── databricks.yml └── README.md ├── iocs-ingest ├── README.md ├── IoCs Bronze.py ├── IoCs Common.py └── IoCs Silver.py ├── uc-udfs ├── README.md ├── community_id.sql ├── ocsf.sql └── protocols.sql ├── README.md └── LICENSE /dlt_modern_stuff/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/.gitkeep: -------------------------------------------------------------------------------- 1 | This folder is reserved for Databricks Asset Bundles resource definitions. 2 | -------------------------------------------------------------------------------- /dlt_modern_stuff/.gitignore: -------------------------------------------------------------------------------- 1 | .databricks/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info 6 | .venv/ 7 | scratch/** 8 | !scratch/README.md 9 | -------------------------------------------------------------------------------- /dlt_modern_stuff/images/cyber-pipeline-impl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexott/databricks-cybersecurity-playground/main/dlt_modern_stuff/images/cyber-pipeline-impl.png -------------------------------------------------------------------------------- /iocs-ingest/README.md: -------------------------------------------------------------------------------- 1 | This folder contains Databricks notebooks with source code for the blog post [Ingesting indicators of compromise with Filebeat, Azure Event Hubs & Delta Lake on Databricks](https://alexott.blogspot.com/2022/10/ingesting-indicators-of-compromise-with.html). 2 | -------------------------------------------------------------------------------- /uc-udfs/README.md: -------------------------------------------------------------------------------- 1 | # Cybersecurity-related user-defined functions for Unity Catalog 2 | 3 | There is a number of user-defined functions that could be useful when working with heterogeneous log sources. 4 | 5 | Now available: 6 | 7 | - `protocols.sql` contains two functions `proto_name_to_code` and `proto_code_to_name` to remap network protocol codes and names. 8 | - `ocsf.sql` contains functions that map `activity_id` into `activity_name` for different categories. 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # databricks-cybersecurity-playground 2 | 3 | Different pieces of code related to doing cybersecurity on Databricks 4 | 5 | 6 | * [iocs-ingest](iocs-ingest/) - source code for ingesting data from Filebeat's Threat Intel module. 7 | * [dlt_modern_stuff](dlt_modern_stuff/) - source code that demonstrates use of latest feature of DLT (append flows, sinks, direct publishing). 8 | * [uc-udfs](uc-udfs/) - cybersecurity-related user-defined functions for Unity Catalog. 
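
As a rough illustration of how the `uc-udfs` functions can be called once the SQL files in that folder have been run against a Unity Catalog schema (the schema name `main.cyber_udfs` and the flow-tuple values below are placeholders, not something defined in this repository):

```python
# Assumes a Databricks notebook (where `spark` is predefined) and that the functions from
# uc-udfs/ were already created in the placeholder schema below.
spark.sql("USE main.cyber_udfs")

# ocsf.sql: map an OCSF HTTP activity_id to its name, and an HTTP verb to its numeric code.
spark.sql("SELECT ocsf_http_activity_name(3) AS name, ocsf_http_activity_id('GET') AS id").show()

# community_id.sql: compute the Community ID hash of a flow tuple (protocol 6 = TCP, seed 0).
spark.sql(
    "SELECT community_id_hash('192.168.1.10', 52482, '10.0.0.5', 443, 6, 0) AS community_id"
).show()
```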
9 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/dlt_cyber_demo_setup.job.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | jobs: 3 | dlt_cyber_demo_setup: 4 | name: 'DLT Cyber Demo: Setup' 5 | tasks: 6 | - task_key: setup 7 | notebook_task: 8 | base_parameters: 9 | catalog_name: ${var.catalog_name} 10 | schema_name: ${var.normalized_schema_name} 11 | volume_path: ${var.log_files_path} 12 | notebook_path: ../src/setup.py 13 | max_concurrent_runs: 1 14 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_ingest_aws_cloudtrail_data.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_ingest_aws_cloudtrail_data: 4 | name: 'DLT Cyber Demo: Ingest AWS Cloudtrail data' 5 | configuration: 6 | "conf.gold_catalog_name": ${var.catalog_name} 7 | "conf.gold_schema_name": ${var.normalized_schema_name} 8 | "conf.aws_cloudtrail_input": "${var.log_files_path}/logs/aws_cloudtrail/" 9 | libraries: 10 | - notebook: 11 | path: ../src/ingest_aws_cloudtrail.py 12 | catalog: ${var.catalog_name} 13 | schema: ${var.silver_schema_name} 14 | channel: CURRENT 15 | # development: true 16 | serverless: true 17 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_ingest_apache_data.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_ingest_apache_data: 4 | name: 'DLT Cyber Demo: Ingest Apache data' 5 | configuration: 6 | "conf.gold_catalog_name": ${var.catalog_name} 7 | "conf.gold_schema_name": ${var.normalized_schema_name} 8 | "conf.apache_web_input": "${var.log_files_path}/logs/apache/" 9 | "conf.nginx_input": "${var.log_files_path}/logs/nginx/" 10 | libraries: 11 | - notebook: 12 | path: ../src/ingest_apache_web.py 13 | catalog: ${var.catalog_name} 14 | schema: ${var.silver_schema_name} 15 | channel: CURRENT 16 | # development: true 17 | serverless: true 18 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_ingest_zeek_data.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_ingest_zeek_data: 4 | name: 'DLT Cyber Demo: Ingest Zeek data' 5 | configuration: 6 | "conf.gold_catalog_name": ${var.catalog_name} 7 | "conf.gold_schema_name": ${var.normalized_schema_name} 8 | "conf.zeek_conn_input": "${var.log_files_path}/logs/zeek_conn/" 9 | "conf.zeek_http_input": "${var.log_files_path}/logs/zeek_http/" 10 | libraries: 11 | - notebook: 12 | path: ../src/ingest_zeek_conn.py 13 | - notebook: 14 | path: ../src/ingest_zeek_http.py 15 | catalog: ${var.catalog_name} 16 | schema: ${var.silver_schema_name} 17 | channel: CURRENT 18 | # development: true 19 | serverless: true 20 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/demo_detections.pipeline.yml: -------------------------------------------------------------------------------- 1 | resources: 2 | pipelines: 3 | demo_detections: 4 | name: 'DLT Cyber Demo: Detections' 5 | configuration: 6 | "conf.azure_tenant_id": ${var.azure_tenant_id} 7 | "conf.eh_ns": ${var.eventhub_namespace} 8 | "conf.eh_topic": ${var.eventhub_topic} 9 | "conf.gold_catalog_name": ${var.catalog_name} 10 
| "conf.gold_schema_name": ${var.normalized_schema_name} 11 | "conf.push_to_eventhubs": ${var.push_to_eventhubs} 12 | "conf.secret_scope": ${var.secret_scope_name} 13 | "conf.sp_id_key_name": ${var.sp_id_key_name} 14 | "conf.sp_secret_key_name": ${var.sp_secret_key_name} 15 | "conf.push_to_splunk": ${var.push_to_splunk} 16 | "conf.splunk_url": ${var.splunk_url} 17 | "conf.splunk_hec_token": ${var.splunk_hec_token} 18 | libraries: 19 | - notebook: 20 | path: ../src/detections.py 21 | catalog: ${var.catalog_name} 22 | schema: ${var.silver_schema_name} 23 | channel: CURRENT 24 | # development: true 25 | serverless: true 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Alex Ott 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dlt_modern_stuff/databricks.yml: -------------------------------------------------------------------------------- 1 | # This is a Databricks asset bundle definition for dlt_modern_stuff. 2 | # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. 3 | bundle: 4 | name: dlt_modern_stuff 5 | 6 | include: 7 | - resources/*.yml 8 | 9 | sync: 10 | include: 11 | - src/helpers.py 12 | - src/__init__.py 13 | 14 | targets: 15 | dev: 16 | # The default target uses 'mode: development' to create a development copy. 17 | # - Deployed resources get prefixed with '[dev my_user_name]' 18 | # - Any job schedules and triggers are paused by default. 19 | # See also https://docs.databricks.com/dev-tools/bundles/deployment-modes.html. 
20 | # Uncomment this if you run outside of Free Edition 21 | # mode: development 22 | default: true 23 | variables: 24 | catalog_name: cybersecurity 25 | normalized_schema_name: normalized 26 | log_files_path: /Volumes/cybersecurity/logs/logs/demo 27 | silver_schema_name: silver 28 | push_to_eventhubs: "false" 29 | azure_tenant_id: "aaaa" 30 | eventhub_namespace: "aaa" 31 | eventhub_topic: "alerts" 32 | secret_scope_name: "aaaa" 33 | sp_id_key_name: "aaa" 34 | sp_secret_key_name: "aaa" 35 | push_to_splunk: "false" 36 | splunk_url: "http://10.1.0.6:8088/services/collector/event" 37 | splunk_hec_token: "aaaa" 38 | -------------------------------------------------------------------------------- /dlt_modern_stuff/resources/variables.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | catalog_name: 3 | description: The name of the default UC Catalog 4 | silver_schema_name: 5 | description: The name of UC Schema to put processed data of individual log sources 6 | normalized_schema_name: 7 | description: The name of UC Schema to put tables with normalized data, IoCs and Detections tables. 8 | log_files_path: 9 | description: The path to UC Volume where raw log data will be stored 10 | push_to_eventhubs: 11 | description: Set to "true" to push detections to the EventHubs topic 12 | default: "false" 13 | # The following variables are needed if the `push_to_eventhubs` above is set to "true" 14 | secret_scope_name: 15 | description: The name of the secret scope with SP's ID and secret 16 | default: "" 17 | sp_id_key_name: 18 | description: The name of a secret inside secret scope that holds SP ID 19 | default: "" 20 | sp_secret_key_name: 21 | description: The name of a secret inside secret scope that holds SP Secret 22 | default: "" 23 | azure_tenant_id: 24 | description: The ID of Entra ID tenant where SP is registered 25 | default: "" 26 | eventhub_namespace: 27 | description: The name of EventHubs namespace 28 | default: "" 29 | eventhub_topic: 30 | description: The name of a topic inside EventHubs namespace 31 | default: "" 32 | push_to_splunk: 33 | description: Set to "true" to push detections to a Splunk instance (right now, only on non-serverless) 34 | default: "false" 35 | splunk_url: 36 | description: the URL of Splunk HTTP Event Collector 37 | default: "" 38 | splunk_hec_token: 39 | description: Token that will be used to authenticate to Splunk HTTP Event Collector 40 | default: "" 41 | -------------------------------------------------------------------------------- /uc-udfs/community_id.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | CREATE OR REPLACE FUNCTION community_id_hash( 3 | src_ip STRING, 4 | src_port INT, 5 | dst_ip STRING, 6 | dst_port INT, 7 | proto INT, 8 | seed INT 9 | ) 10 | RETURNS STRING 11 | LANGUAGE PYTHON 12 | DETERMINISTIC 13 | ENVIRONMENT ( 14 | dependencies = '["communityid"]', 15 | environment_version = 'None' 16 | ) 17 | AS $$ 18 | import communityid 19 | 20 | cid = communityid.CommunityID() 21 | tpl = communityid.FlowTuple(proto, src_ip, dst_ip, src_port, dst_port) 22 | 23 | return cid.calc(tpl) 24 | $$; 25 | 26 | -- COMMAND ---------- 27 | 28 | -- MAGIC %python 29 | -- MAGIC 30 | -- MAGIC import requests 31 | -- MAGIC import json 32 | -- MAGIC 33 | -- MAGIC # Download the JSON file 34 | -- MAGIC url = "https://raw.githubusercontent.com/corelight/community-id-spec/refs/heads/master/baseline/baseline_deflt.json" 35 | -- MAGIC response = 
requests.get(url) 36 | -- MAGIC data = response.json() 37 | -- MAGIC 38 | -- MAGIC # Prepare data for SQL 39 | -- MAGIC rows = [] 40 | -- MAGIC for entry in data: 41 | -- MAGIC src_ip = entry["saddr"] 42 | -- MAGIC src_port = entry["sport"] 43 | -- MAGIC dst_ip = entry["daddr"] 44 | -- MAGIC dst_port = entry["dport"] 45 | -- MAGIC proto = entry["proto"] 46 | -- MAGIC seed = entry.get("seed", 0) 47 | -- MAGIC expected_id = entry["communityid"] 48 | -- MAGIC rows.append((src_ip, src_port, dst_ip, dst_port, proto, int(seed), expected_id)) 49 | -- MAGIC 50 | -- MAGIC # Create DataFrame 51 | -- MAGIC columns = ["src_ip", "src_port", "dst_ip", "dst_port", "proto", "seed", "expected_id"] 52 | -- MAGIC df = spark.createDataFrame(rows, columns) 53 | -- MAGIC df.createOrReplaceTempView("baseline_data") 54 | -- MAGIC 55 | -- MAGIC # Compute and compare community IDs using the UDF 56 | -- MAGIC result = spark.sql(""" 57 | -- MAGIC SELECT 58 | -- MAGIC src_ip, 59 | -- MAGIC src_port, 60 | -- MAGIC dst_ip, 61 | -- MAGIC dst_port, 62 | -- MAGIC proto, 63 | -- MAGIC seed, 64 | -- MAGIC expected_id, 65 | -- MAGIC community_id_hash(src_ip, src_port, dst_ip, dst_port, proto, seed) AS computed_id, 66 | -- MAGIC CASE WHEN expected_id = community_id_hash(src_ip, src_port, dst_ip, dst_port, proto, seed) THEN 'MATCH' ELSE 'MISMATCH' END AS comparison 67 | -- MAGIC FROM baseline_data 68 | -- MAGIC """) 69 | -- MAGIC display(result) 70 | -------------------------------------------------------------------------------- /uc-udfs/ocsf.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## UDFs to support OCSF mappings 5 | 6 | -- COMMAND ---------- 7 | 8 | CREATE OR REPLACE FUNCTION ocsf_network_activity_name(activity_id INTEGER) 9 | RETURNS STRING 10 | COMMENT 'Maps network activity_id into the name' 11 | RETURN CASE activity_id 12 | WHEN 0 THEN 'Unknown' 13 | WHEN 1 THEN 'Open' 14 | WHEN 2 THEN 'Close' 15 | WHEN 3 THEN 'Reset' 16 | WHEN 4 THEN 'Fail' 17 | WHEN 5 THEN 'Refuse' 18 | WHEN 6 THEN 'Traffic' 19 | WHEN 7 THEN 'Listen' 20 | ELSE 'Other' 21 | END; 22 | 23 | -- COMMAND ---------- 24 | 25 | CREATE OR REPLACE FUNCTION ocsf_dns_activity_name(activity_id INTEGER) 26 | RETURNS STRING 27 | COMMENT 'Maps DNS activity_id into the name' 28 | RETURN CASE activity_id 29 | WHEN 0 THEN 'Unknown' 30 | WHEN 1 THEN 'Query' 31 | WHEN 2 THEN 'Response' 32 | WHEN 6 THEN 'Traffic' 33 | ELSE 'Other' 34 | END; 35 | 36 | -- COMMAND ---------- 37 | 38 | CREATE OR REPLACE FUNCTION ocsf_http_activity_name(activity_id INTEGER) 39 | RETURNS STRING 40 | COMMENT 'Maps HTTP activity_id into the name' 41 | RETURN CASE activity_id 42 | WHEN 0 THEN 'Unknown' 43 | WHEN 1 THEN 'Connect' 44 | WHEN 2 THEN 'Delete' 45 | WHEN 3 THEN 'Get' 46 | WHEN 4 THEN 'Head' 47 | WHEN 5 THEN 'Options' 48 | WHEN 6 THEN 'Post' 49 | WHEN 7 THEN 'Put' 50 | WHEN 8 THEN 'Trace' 51 | WHEN 9 THEN 'Patch' 52 | ELSE 'Other' 53 | END; 54 | 55 | -- COMMAND ---------- 56 | 57 | CREATE OR REPLACE FUNCTION ocsf_file_activity_name(activity_id INTEGER) 58 | RETURNS STRING 59 | COMMENT 'Maps file system activity_id into the name' 60 | RETURN CASE activity_id 61 | WHEN 0 THEN 'Unknown' 62 | WHEN 1 THEN 'Create' 63 | WHEN 2 THEN 'Read' 64 | WHEN 3 THEN 'Update' 65 | WHEN 4 THEN 'Delete' 66 | WHEN 5 THEN 'Rename' 67 | WHEN 6 THEN 'Set Attributes' 68 | WHEN 7 THEN 'Set Security' 69 | WHEN 8 THEN 'Get Attributes' 70 | WHEN 9 THEN 'Get Security' 71 | WHEN 9 THEN 'Encrypt' 72 | 
WHEN 9 THEN 'Decrypt' 73 | WHEN 9 THEN 'Mount' 74 | WHEN 9 THEN 'Unmount' 75 | WHEN 9 THEN 'Open' 76 | ELSE 'Other' 77 | END; 78 | 79 | -- COMMAND ---------- 80 | 81 | CREATE OR REPLACE FUNCTION ocsf_http_activity_id(verb STRING) 82 | RETURNS INT 83 | COMMENT 'Maps HTTP verb into OCSF numeric code' 84 | RETURN CASE lower(verb) 85 | WHEN 'connect' THEN 1 86 | WHEN 'delete' THEN 2 87 | WHEN 'get' THEN 3 88 | WHEN 'head' THEN 4 89 | WHEN 'options' THEN 5 90 | WHEN 'post' THEN 6 91 | WHEN 'put' THEN 7 92 | WHEN 'trace' THEN 8 93 | WHEN 'patch' THEN 9 94 | WHEN '' THEN 0 -- TODO: handle null string 95 | ELSE 99 96 | END; -------------------------------------------------------------------------------- /iocs-ingest/IoCs Bronze.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | from typing import List, Dict, Any, Optional 4 | 5 | from delta.tables import DeltaTable 6 | from pyspark.sql import DataFrame, SparkSession 7 | import pyspark.sql.functions as F 8 | 9 | # COMMAND ---------- 10 | 11 | # Variables... 12 | base_dir = "/mnt/cyberdata" 13 | secret_scope = "..." 14 | evhub_secret_key = "..." 15 | evhub_ns_name = "..." 16 | evhub_topic_name = "iocs" 17 | 18 | # COMMAND ---------- 19 | 20 | # MAGIC %run "./IoCs Common" 21 | 22 | # COMMAND ---------- 23 | 24 | num_executors = sc._jsc.sc().getExecutorMemoryStatus().size()-1 25 | num_cores = sum(sc.parallelize((("")*num_executors), num_executors).mapPartitions(lambda p: [os.cpu_count()]).collect()) 26 | 27 | # COMMAND ---------- 28 | 29 | spark.sql(f"set spark.sql.shuffle.partitions = {num_cores}") 30 | 31 | # COMMAND ---------- 32 | 33 | import datetime 34 | 35 | readConnectionString = dbutils.secrets.get(secret_scope, evhub_secret_key) 36 | eh_sasl = f'kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{readConnectionString}";' 37 | kafka_options = {"kafka.bootstrap.servers": f"{evhub_ns_name}.servicebus.windows.net:9093", 38 | "kafka.sasl.mechanism": "PLAIN", 39 | "kafka.security.protocol": "SASL_SSL", 40 | "kafka.request.timeout.ms": "60000", 41 | "kafka.session.timeout.ms": "30000", 42 | "startingOffsets": "earliest", 43 | "minPartitions": num_cores, 44 | "kafka.sasl.jaas.config": eh_sasl, 45 | "subscribe": evhub_topic_name, 46 | } 47 | 48 | df = spark.readStream\ 49 | .format("kafka")\ 50 | .options(**kafka_options)\ 51 | .load()\ 52 | .withColumn("value", F.col("value").cast("string")) 53 | 54 | # COMMAND ---------- 55 | 56 | partial_schema = "`@timestamp` timestamp, fileset struct, service struct, message string" 57 | df2 = df.select("*", F.from_json("value", partial_schema).alias("jsn")) \ 58 | .withColumnRenamed("timestamp", "kafka_ts") \ 59 | .selectExpr("*", "jsn.`@timestamp` as timestamp", "jsn.fileset.name as dataset", 60 | "jsn.service.type as service", "sha2(jsn.message, 256) as msg_hash") \ 61 | .drop("jsn", "timestampType") \ 62 | .withColumn("date", F.col("timestamp").cast("date")) 63 | #display(df2) 64 | 65 | # COMMAND ---------- 66 | 67 | def perform_foreach_batch(df: DataFrame, epoch): 68 | return drop_duplicates_with_merge(df, primary_key_columns=["msg_hash"], 69 | path=f"{base_dir}/bronze/threatintel/", 70 | partitionby=["date"], opts={"mergeSchema": "true"}, 71 | additional_merge_cond="update.date >= current_date()-10" 72 | ) 73 | 74 | # COMMAND ---------- 75 | 76 | checkpoint = f"{base_dir}/checkpoints/threatintel-bronze/" 77 | 78 | # COMMAND ---------- 79 | 80 | 
df2.writeStream \ 81 | .option("checkpointLocation", checkpoint) \ 82 | .trigger(availableNow=True) \ 83 | .foreachBatch(perform_foreach_batch) \ 84 | .start() 85 | -------------------------------------------------------------------------------- /iocs-ingest/IoCs Common.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | from typing import List, Dict, Any, Optional 4 | 5 | from delta.tables import DeltaTable 6 | from pyspark.sql import DataFrame, SparkSession 7 | import pyspark.sql.functions as F 8 | 9 | # COMMAND ---------- 10 | 11 | def drop_duplicates_with_merge( 12 | df: DataFrame, 13 | primary_key_columns: List[str], 14 | path: str = "", 15 | table_name: str = "", 16 | partitionby: Optional[List[str]] = None, 17 | opts: Optional[Dict[str, Any]] = None, 18 | additional_merge_cond: Optional[str] = None 19 | ): 20 | """Performs removal of duplicates using the Delta MERGE operation. If table doesn't exist, 21 | it's created by writing the dataframe into a specified location. This function is primarily 22 | designed for use in ``foreachBatch``. 23 | 24 | :param df: new dataframe 25 | :param primary_key_columns: required list of the column names that are used for detection of duplicates 26 | :param path: optional path to table (required if table_name isn't specified) 27 | :param table_name: optional name of the table (required if path isn't specified) 28 | :param partitionby: optional list of columns to partition by 29 | :param opts: optional dictionary with options for creation of Delta table 30 | :param additional_merge_cond: additional merge condition appended to the generated condition using AND 31 | :return: nothing 32 | """ 33 | # print(f"Performing merge for {path=} or {table_name=}") 34 | if opts is None: 35 | opts = {} 36 | if partitionby is None: 37 | partitionby = [] 38 | df = df.dropDuplicates(primary_key_columns) 39 | if path == "" and table_name == "": 40 | raise Exception( 41 | "At least one parameter, 'path' or 'table_name' must be specified" 42 | ) 43 | if not df._jdf.isEmpty(): 44 | try: 45 | spark = SparkSession.getActiveSession() 46 | if table_name != "": 47 | tbl = DeltaTable.forName(spark, table_name) 48 | else: 49 | tbl = DeltaTable.forPath(spark, path) 50 | dname = "dest" 51 | uname = "update" 52 | merge_cond = " and ".join( 53 | [f"{dname}.{col} = {uname}.{col}" for col in primary_key_columns] 54 | ) 55 | if additional_merge_cond: 56 | merge_cond = merge_cond + " AND " + additional_merge_cond 57 | tbl.alias(dname).merge( 58 | df.alias(uname), merge_cond 59 | ).whenNotMatchedInsertAll().execute() 60 | # except AnalysisException as ex: # this happens when table doesn't exist 61 | except Exception as ex: # this happens when table doesn't exist 62 | print(f"Got exception: {ex}") 63 | # print(f"Delta table ({path=}, {table_name=}) doesn't exist, writing all data as new table...") 64 | if table_name != "": 65 | if path != "": 66 | opts["path"] = path 67 | df.write.format("delta").partitionBy(partitionby).options( 68 | **opts 69 | ).saveAsTable(table_name) 70 | else: 71 | df.write.format("delta").partitionBy(partitionby).options(**opts).save( 72 | path 73 | ) 74 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/helpers.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | from typing import Optional 4 | import re 5 | 6 | NETWORK_TABLE_NAME = "network" 7 | 
HTTP_TABLE_NAME = "http" 8 | 9 | __catalog_key_name__ = "conf.{}_catalog_name" 10 | __schema_key_name__ = "conf.{}_schema_name" 11 | 12 | 13 | def get_qualified_table_name(level: str, name: str, spark: Optional[SparkSession] = None) -> str: 14 | """Generates table name with catalog and schema if specified. 15 | 16 | Args: 17 | level (str): The level of the table (silver, gold, bronze, ...). 18 | name (str): The name of the table on the given level. 19 | spark (Optional[SparkSession], optional): Spark session. Defaults to None. 20 | 21 | Raises: 22 | Exception: ValueError if schema is not specified when catalog is specified. 23 | 24 | Returns: 25 | str: The fully qualified table name with catalog and schema. 26 | """ 27 | if not spark: 28 | spark = SparkSession.getActiveSession() 29 | catalog = spark.conf.get(__catalog_key_name__.format(level), "") 30 | schema = spark.conf.get(__schema_key_name__.format(level), "") 31 | if catalog and not schema: 32 | raise ValueError("Schema must be specified if catalog is specified") 33 | base = "" 34 | if catalog: 35 | base += f"{catalog}." 36 | if schema: 37 | base += f"{schema}." 38 | return f"{base}{name}" 39 | 40 | 41 | def sanitize_string_for_flow_name(s: str) -> str: 42 | """Sanitize a string to be used as a flow/function name. 43 | 44 | Args: 45 | s (str): The string to be sanitized. 46 | 47 | Returns: 48 | str: The sanitized string. 49 | """ 50 | return re.sub(r"[^a-zA-Z0-9]+", "_", s)[-20:].strip("_") 51 | 52 | 53 | def get_normalized_table_name(name: str, catalog: Optional[str] = None, schema: Optional[str] = None, 54 | spark: Optional[SparkSession] = None) -> str: 55 | """Get the name for normalized (OCSF) table with catalog and schema. 56 | 57 | Args: 58 | name (str): The base name of the table. 59 | catalog (Optional[str], optional): The catalog name. Defaults to None. 60 | schema (Optional[str], optional): The schema name. Defaults to None. 61 | spark (Optional[SparkSession], optional): Spark session. Defaults to None. 62 | 63 | Raises: 64 | Exception: Exception if catalog or schema are not specified. 65 | 66 | Returns: 67 | str: The normalized table name 68 | """ 69 | if not spark: 70 | spark = SparkSession.getActiveSession() 71 | if not catalog: 72 | catalog = spark.conf.get(__catalog_key_name__.format("gold"), "") 73 | if not schema: 74 | schema = spark.conf.get(__schema_key_name__.format("gold"), "") 75 | if not catalog or not schema: 76 | raise Exception("Catalog and Schema must be specified explicitly or in Spark conf") 77 | return f"{catalog}.{schema}.{name}" 78 | 79 | 80 | def create_normalized_sink(name: str, spark: Optional[SparkSession] = None) -> str: 81 | """Create a DLT sink with a normalized name. 82 | 83 | Args: 84 | name (str): The base name for the sink. 85 | spark (Optional[SparkSession], optional): Spark session. Defaults to None. 86 | 87 | Returns: 88 | str: The name of the created sink. 
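
    Example (illustrative; the flow name is a placeholder):

        sink_name = create_normalized_sink(HTTP_TABLE_NAME)

        @dlt.append_flow(name="my_source_normalized", target=sink_name)
        def write_normalized():
            ...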
89 | """ 90 | import dlt 91 | sink_name = f"{name}_ocsf" 92 | table_name = get_normalized_table_name(name, spark=spark) 93 | dlt.create_sink(sink_name, "delta", { "tableName": table_name, "mergeSchema": "true" }) 94 | return sink_name 95 | 96 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/ingest_zeek_http.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from typing import Optional 9 | 10 | # COMMAND ---------- 11 | 12 | from helpers import HTTP_TABLE_NAME, get_qualified_table_name, create_normalized_sink, sanitize_string_for_flow_name 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC We're using streaming tables + append flows to make sure that we can add more (or remove not used) source locations for this data type. 19 | 20 | # COMMAND ---------- 21 | 22 | zeek_http_table_name = get_qualified_table_name("silver", "zeek_http", spark) 23 | dlt.create_streaming_table( 24 | name = zeek_http_table_name, 25 | cluster_by = ["timestamp"], 26 | ) 27 | 28 | # COMMAND ---------- 29 | 30 | zeek_http_schema_hints = "`id.orig_p` int, `id.resp_p` int, ts double, status_code int" 31 | zeek_http_renames = { 32 | "id.orig_h": "id_origin_host", 33 | "id.orig_p": "id_origin_port", 34 | "id.resp_h": "id_response_host", 35 | "id.resp_p": "id_response_port", 36 | "ts": "timestamp", 37 | } 38 | 39 | def create_zeek_http_flow(input: str, add_opts: Optional[dict] = None): 40 | @dlt.append_flow(name=f"zeek_http_{sanitize_string_for_flow_name(input)}", 41 | target = zeek_http_table_name, 42 | comment = f"Ingesting from {input}") 43 | def flow(): 44 | autoloader_opts = { 45 | "cloudFiles.format": "json", 46 | "cloudFiles.schemaHints": zeek_http_schema_hints, 47 | #"cloudFiles.useManagedFileEvents": "true", 48 | } | (add_opts or {}) 49 | df = spark.readStream.format("cloudFiles").options(**autoloader_opts).load(input) 50 | df = df.withColumnsRenamed(zeek_http_renames) 51 | df = df.withColumns({ 52 | "timestamp": F.col("timestamp").cast("timestamp"), 53 | "ingest_time": F.current_timestamp(), 54 | }) 55 | return df 56 | 57 | # COMMAND ---------- 58 | 59 | zeek_http_input = spark.conf.get("conf.zeek_http_input") 60 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 61 | create_zeek_http_flow(zeek_http_input) 62 | 63 | # COMMAND ---------- 64 | 65 | # DBTITLE 1,Convert into the normalized OCSF format 66 | sink_name = create_normalized_sink(HTTP_TABLE_NAME, spark=spark) 67 | 68 | @dlt.append_flow(name="zeek_http_normalized", target=sink_name) 69 | def write_normalized(): 70 | df = dlt.read_stream(zeek_http_table_name) 71 | # This could be incomplete mapping, but we can improve later 72 | df = df.selectExpr( 73 | "99 as activity_id", 74 | "4 as category_uid", 75 | "4002 as class_uid", 76 | "timestamp as time", 77 | "99 as severity_id", 78 | "400299 as type_uid", 79 | """named_struct( 80 | 'hostname', host, 81 | 'ip', id_response_host, 82 | 'port', id_response_port 83 | ) as dst_endpoint""", 84 | """named_struct( 85 | 'http_method', method, 86 | 'user_agent', user_agent, 87 | 'version', `version`, 88 | 'url', uri 89 | ) as http_request""", 90 | """named_struct( 91 | 'code', status_code 92 | ) as http_response""", 93 | """named_struct( 94 | 'product', 'zeek', 95 | 'version', '1.0.0', 96 | 'uid', uid, 97 | 'processed_time', 
ingest_time 98 | ) as metadata""", 99 | """named_struct ( 100 | 'ip', `id_origin_host`, 101 | 'port', `id_origin_port` 102 | ) as src_endpoint""", 103 | ) 104 | return df 105 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/ingest_zeek_conn.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from typing import Optional 9 | 10 | # COMMAND ---------- 11 | 12 | from helpers import get_qualified_table_name, NETWORK_TABLE_NAME, create_normalized_sink, sanitize_string_for_flow_name 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC We're using streaming tables + append flows to make sure that we can add more (or remove not used) source locations for this data type. 19 | 20 | # COMMAND ---------- 21 | 22 | zeek_conn_table_name = get_qualified_table_name("silver", "zeek_conn", spark) 23 | dlt.create_streaming_table( 24 | name=zeek_conn_table_name, 25 | cluster_by = ["timestamp"], 26 | ) 27 | 28 | # COMMAND ---------- 29 | 30 | zeek_conn_schema_hints = "`id.orig_p` int, `id.resp_p` int, ts double" 31 | zeek_conn_renames = { 32 | "id.orig_h": "id_origin_host", 33 | "id.orig_p": "id_origin_port", 34 | "id.resp_h": "id_response_host", 35 | "id.resp_p": "id_response_port", 36 | "orig_bytes": "origin_bytes", 37 | "resp_bytes": "response_bytes", 38 | "orig_pkts": "origin_packets", 39 | "orig_ip_bytes": "origin_ip_bytes", 40 | "resp_pkts": "response_packets", 41 | "resp_ip_bytes": "response_ip_bytes", 42 | "local_orig": "local_origin", 43 | "local_resp": "local_response", 44 | "resp_l2_addr": "response_l2_address", 45 | "orig_l2_addr": "origin_l2_address", 46 | "ts": "timestamp", 47 | } 48 | 49 | def create_zeek_conn_flow(input: str, add_opts: Optional[dict] = None): 50 | @dlt.append_flow(name=f"zeek_conn_{sanitize_string_for_flow_name(input)}", 51 | target=zeek_conn_table_name, 52 | comment=f"Ingesting from {input}") 53 | def flow(): 54 | autoloader_opts = { 55 | "cloudFiles.format": "json", 56 | "cloudFiles.schemaHints": zeek_conn_schema_hints, 57 | #"cloudFiles.useManagedFileEvents": "true", 58 | } | (add_opts or {}) 59 | df = spark.readStream.format("cloudFiles").options(**autoloader_opts).load(input) 60 | df = df.withColumnsRenamed(zeek_conn_renames) 61 | df = df.withColumns({ 62 | "timestamp": F.col("timestamp").cast("timestamp"), 63 | "ingest_time": F.current_timestamp(), 64 | }) 65 | return df 66 | 67 | # COMMAND ---------- 68 | 69 | zeek_conn_input = spark.conf.get("conf.zeek_conn_input") 70 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 71 | create_zeek_conn_flow(zeek_conn_input) 72 | 73 | # COMMAND ---------- 74 | 75 | sink_name = create_normalized_sink(NETWORK_TABLE_NAME, spark=spark) 76 | 77 | @dlt.append_flow(name="zeek_conn_normalized", target=sink_name) 78 | def write_normalized(): 79 | df = dlt.read_stream(zeek_conn_table_name) 80 | # This could be incomplete mapping, but we can improve later 81 | df = df.selectExpr( 82 | "99 as activity_id", 83 | "4 as category_uid", 84 | "4001 as class_uid", 85 | "timestamp as time", 86 | "99 as severity_id", 87 | "400199 as type_uid", 88 | "duration*1000 as duration", 89 | """named_struct( 90 | 'ip', id_response_host, 91 | 'port', id_response_port 92 | ) as dst_endpoint""", 93 | """named_struct( 94 | 'product', 'zeek', 95 | 'version', '1.0.0', 96 | 'uid', 
uid, 97 | 'processed_time', ingest_time 98 | ) as metadata""", 99 | """named_struct ( 100 | 'ip', id_origin_host, 101 | 'port', id_origin_port 102 | ) as src_endpoint""", 103 | """named_struct( 104 | 'bytes_in', response_bytes, 105 | 'packets_in', response_packets, 106 | 'bytes_out', origin_bytes, 107 | 'packets_out', origin_packets, 108 | 'bytes_missed', missed_bytes, 109 | 'bytes', response_bytes + origin_bytes, 110 | 'packets', response_packets + origin_packets 111 | ) as traffic""", 112 | """named_struct( 113 | 'direction_id', 0, 114 | 'protocol_name', proto, 115 | 'flag_history', history 116 | ) as connection_info""" 117 | ) 118 | return df 119 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/setup.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from helpers import * 3 | import os 4 | 5 | # COMMAND ---------- 6 | 7 | dbutils.widgets.text("catalog_name", "", "Catalog name") 8 | dbutils.widgets.text("schema_name", "", "Schema name") 9 | dbutils.widgets.text("volume_path", "", "UC Volume path for data") 10 | 11 | # COMMAND ---------- 12 | 13 | catalog_name = dbutils.widgets.get("catalog_name") 14 | schema_name = dbutils.widgets.get("schema_name") 15 | volume_path = dbutils.widgets.get("volume_path") 16 | 17 | # COMMAND ---------- 18 | 19 | if not catalog_name or not schema_name or not volume_path: 20 | raise Exception("Catalog name, Schema name and UC Volume path must be provided") 21 | 22 | # COMMAND ---------- 23 | 24 | spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}") 25 | for name in [HTTP_TABLE_NAME, NETWORK_TABLE_NAME]: 26 | table_name = get_normalized_table_name(name, catalog_name, schema_name) 27 | print(f"Creating table {table_name}") 28 | spark.sql(f"""CREATE TABLE IF NOT EXISTS {table_name} ( 29 | activity_id int, 30 | category_uid int, 31 | class_uid int, 32 | time timestamp 33 | )""") 34 | # spark.sql(f"TRUNCATE TABLE {table_name}") 35 | 36 | # COMMAND ---------- 37 | 38 | base_path = os.path.join(volume_path, "logs") 39 | for i in ["apache", "nginx", "zeek_conn", "zeek_http", "aws_cloudtrail"]: 40 | p = os.path.join(base_path, i) 41 | os.makedirs(p, exist_ok=True) 42 | 43 | # COMMAND ---------- 44 | 45 | import urllib.request 46 | 47 | # Apache logs 48 | urllib.request.urlretrieve("https://raw.githubusercontent.com/elastic/examples/refs/heads/master/Common%20Data%20Formats/apache_logs/apache_logs", 49 | os.path.join(base_path, "apache", "log1.txt")) 50 | # Nginx logs 51 | urllib.request.urlretrieve("https://raw.githubusercontent.com/elastic/examples/refs/heads/master/Common%20Data%20Formats/nginx_logs/nginx_logs", 52 | os.path.join(base_path, "nginx", "log1.txt")) 53 | # Zeek HTTP logs 54 | urllib.request.urlretrieve("https://raw.githubusercontent.com/ocsf/examples/refs/heads/main/raw_sample_log_dataset/Zeek/http.log", 55 | os.path.join(base_path, "zeek_http", "log1.txt")) 56 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/03/http.log.gz", 57 | os.path.join(base_path, "zeek_http", "log2.gz")) 58 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/12/http.log.gz", 59 | os.path.join(base_path, "zeek_http", "log3.gz")) 60 | # Zeek Conn logs 61 | urllib.request.urlretrieve("https://raw.githubusercontent.com/ocsf/examples/refs/heads/main/raw_sample_log_dataset/Zeek/conn.log", 62 | 
os.path.join(base_path, "zeek_conn", "log1.txt")) 63 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/12/conn.log.gz", 64 | os.path.join(base_path, "zeek_conn", "log2.gz")) 65 | urllib.request.urlretrieve("https://raw.githubusercontent.com/lipyeow-lim/security-datasets01/refs/heads/main/maccdc-2012/03/conn.log.gz", 66 | os.path.join(base_path, "zeek_conn", "log3.gz")) 67 | # AWS Cloudtrail logs 68 | urllib.request.urlretrieve("https://gist.githubusercontent.com/alexott/8ccd963811969d2446d2239e031a7b78/raw/2bae06ade977277dce4f580f214a0ed129f38810/data.jsonl", 69 | os.path.join(base_path, "aws_cloudtrail", "data.json")) 70 | 71 | 72 | # COMMAND ---------- 73 | 74 | table_name = get_normalized_table_name("iocs", catalog_name, schema_name) 75 | spark.sql(f"""CREATE TABLE IF NOT EXISTS {table_name} ( 76 | ioc_type string, 77 | ioc string 78 | ) COMMENT 'These are arbitrary IPs, not related to real IoCs - just for demo purposes' 79 | """) 80 | # These are arbitrary IPs, not related to real IoCs - just for demo purposes 81 | idf = spark.createDataFrame([['IPv4', '205.251.199.192'], 82 | ['IPv4', '54.148.114.85'], 83 | ['IPv4', '95.217.228.176'], 84 | ['IPv4', '190.104.181.125'], 85 | ], schema="ioc_type string, ioc string") 86 | idf.write.mode("overwrite").saveAsTable(table_name) 87 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/ingest_apache_web.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | 8 | from typing import Optional 9 | 10 | # COMMAND ---------- 11 | 12 | from helpers import HTTP_TABLE_NAME, get_qualified_table_name, create_normalized_sink, sanitize_string_for_flow_name 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC We're using streaming tables + append flows to make sure that we can add more (or remove not used) source locations for this data type. 
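# MAGIC
# MAGIC As an illustrative sketch (not part of the pipeline configuration: `create_apache_web_flow` is defined further below, and the second path is a made-up placeholder), adding one more source location is just one more flow registration against the same streaming table:
# MAGIC
# MAGIC ```python
# MAGIC for p in [spark.conf.get("conf.apache_web_input"), "/Volumes/main/extra/apache_logs/"]:
# MAGIC     create_apache_web_flow(p)
# MAGIC ```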
19 | 20 | # COMMAND ---------- 21 | 22 | apache_web_table_name = get_qualified_table_name("silver", "apache_web", spark) 23 | dlt.create_streaming_table( 24 | name=apache_web_table_name, 25 | comment="Table for data parsed from Apache HTTP server-compatible logs", 26 | cluster_by = ["timestamp"], 27 | ) 28 | 29 | # COMMAND ---------- 30 | 31 | apache_web_regex = ( 32 | r'^(\S+) (\S+) (\S+) \[(.+?)] "(\w+) (\S+) ([^"]+)" (\d{3}) (\d+|-) "(.+)" "(.+)"?$' 33 | ) 34 | 35 | 36 | def read_apache_web(input: str, add_opts: Optional[dict] = None): 37 | autoloader_opts = { 38 | "cloudFiles.format": "text", 39 | #"cloudFiles.useManagedFileEvents": "true", 40 | } | (add_opts or {}) 41 | df = spark.readStream.format("cloudFiles").options(**autoloader_opts).load(input) 42 | df = df.withColumns( 43 | { 44 | "host": F.regexp_extract("value", apache_web_regex, 1), 45 | "user": F.regexp_extract("value", apache_web_regex, 3), 46 | "timestamp": F.unix_timestamp( 47 | F.regexp_extract("value", apache_web_regex, 4), "dd/MMM/yyyy:HH:mm:ss Z" 48 | ).cast("timestamp"), 49 | "method": F.regexp_extract("value", apache_web_regex, 5), 50 | "path": F.regexp_extract("value", apache_web_regex, 6), 51 | "version": F.regexp_extract("value", apache_web_regex, 7), 52 | "code": F.regexp_extract("value", apache_web_regex, 8).cast("int"), 53 | "size": F.regexp_extract("value", apache_web_regex, 9).cast("long"), 54 | "referrer": F.regexp_extract("value", apache_web_regex, 10), 55 | "agent": F.regexp_extract("value", apache_web_regex, 11), 56 | "ingest_time": F.current_timestamp(), 57 | } 58 | ) 59 | return df 60 | 61 | 62 | def create_apache_web_flow(input: str, add_opts: Optional[dict] = None): 63 | @dlt.append_flow( 64 | name=f"apache_web_{sanitize_string_for_flow_name(input)}", 65 | target=apache_web_table_name, 66 | comment=f"Ingesting from {input}", 67 | ) 68 | def flow(): 69 | return read_apache_web(input, add_opts) 70 | 71 | # COMMAND ---------- 72 | 73 | # DBTITLE 1,Handling of Apache Web logs 74 | apache_web_input = spark.conf.get("conf.apache_web_input") 75 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 76 | create_apache_web_flow(apache_web_input) 77 | 78 | # COMMAND ---------- 79 | 80 | # DBTITLE 1,Handling of NGINX logs (compatible with Apache Web) 81 | nginx_input = spark.conf.get("conf.nginx_input") 82 | # We're using input location as-is, but we can pass it as a list, and generate multiple flows from it 83 | create_apache_web_flow(nginx_input) 84 | 85 | # COMMAND ---------- 86 | 87 | sink_name = create_normalized_sink(HTTP_TABLE_NAME, spark=spark) 88 | 89 | @dlt.append_flow( 90 | name="apache_web_normalized", 91 | target=sink_name 92 | ) 93 | def write_normalized(): 94 | df = dlt.read_stream(apache_web_table_name) 95 | # This could be incomplete mapping, but we can improve later 96 | df = df.selectExpr( 97 | "99 as activity_id", 98 | "4 as category_uid", 99 | "4002 as class_uid", 100 | "timestamp as time", 101 | "99 as severity_id", 102 | "400299 as type_uid", 103 | """named_struct( 104 | 'hostname', host, 105 | 'ip', host 106 | ) as src_endpoint""", 107 | """named_struct( 108 | 'http_method', method, 109 | 'user_agent', agent, 110 | 'version', version, 111 | 'url', path, 112 | 'referrer', referrer 113 | ) as http_request""", 114 | """named_struct( 115 | 'code', code, 116 | 'length', size 117 | ) as http_response""", 118 | """named_struct( 119 | 'product', 'apache_web', 120 | 'version', '1.0.0', 121 | 'processed_time', ingest_time 122 | ) as metadata""", 
123 | ) 124 | return df 125 | -------------------------------------------------------------------------------- /dlt_modern_stuff/README.md: -------------------------------------------------------------------------------- 1 | # dlt_modern_stuff 2 | 3 | This directory contains a source code that demonstrates use of latest Delta Live Tables (DLT) features for cybersecurity use cases. You can find more information in the [blog post](https://alexott.blogspot.com/2025/03/effective-use-of-latest-dlt-features.html). 4 | 5 | In general, this project consists of three DLT pipelines that perform data ingestion, normalization to [Open Cybersecurity Schema Framework (OCSF)](https://schema.ocsf.io/), and doing a rudimentary detections against normalized data as it's shown on the image below: 6 | 7 | 1. Ingestion of Apache Web and Nginx logs into `apache_web` table and then normalizing it into a table corresponding to OCSF's HTTP activity. 8 | 2. Ingestion of Zeek data: 9 | * Zeek HTTP data into `zeek_http` table, and then normalizing it into an `http` table corresponding to OCSF's HTTP activity. 10 | * Zeek Conn data into `zeek_conn` table, and then normalizing it into a `network` table corresponding to OCSF's Network activity. 11 | 3. Detection pipeline that does the following: 12 | * Matches network connections data from `network` table against `iocs` table. 13 | * Checks HTTP logs from `http` table for admin pages scans from external parties. 14 | * All matches are stored in the `detections` table, and optionally pushed to EventHubs and/or Splunk. 15 | 16 | ![Implemented pipelines](images/cyber-pipeline-impl.png) 17 | 18 | 19 | ## Setting up & running 20 | 21 | > [!IMPORTANT] 22 | This bundle uses Serverless compute, so make sure that it's enabled for your workspace (works on [Databricks Free Edition](https://www.databricks.com/blog/introducing-databricks-free-edition) as well). If it's not, then you need to adjust parameters of the job and DLT pipelines! 23 | 24 | You can install the project two ways: 25 | 26 | 1. Using Databricks Assset Bundles (DABs) inside the Databricks Workspace (recommended): 27 | 1. Using DABs from the command line of your computer 28 | 29 | ### Setting it up using DABs in workspace 30 | 31 | 1. Create a [Git Folder](https://docs.databricks.com/aws/en/repos/) inside your Databricks workspace by cloning this repository. 32 | 33 | 2. Open the `dlt_modern_stuff/databricks.yaml` inside create Git Folder. 34 | 35 | 3. Adjust the following parameters inside the `databricks.yaml` (create necessary objects before use): 36 | 37 | - `catalog_name` - the name of the existing UC Catalog used in configuration. 38 | - `silver_schema_name` - the name of an existing UC Schema to put processed data of individual log sources. 39 | - `normalized_schema_name` - the name of an existing UC Schema to put tables with normalized data, IoCs and Detections tables. 40 | - `log_files_path` - the path to an existing UC Volume where raw log data will be stored. 41 | 42 | 4. Click **Deploy** button in the **Deployments** tab on the left - this will create necessary jobs and pipelines 43 | 44 | 5. Click **Run** button next to the `DLT Cyber Demo: Setup` job. 45 | 46 | 6. Click **Start pipeline** for DLT pipelines to process data and run detections (in the following order): 47 | 48 | - `DLT Cyber Demo: Ingest Zeek data` 49 | - `DLT Cyber Demo: Ingest Apache data` 50 | - `DLT Cyber Demo: Detections` 51 | 52 | ### Setting it up using DABs locally 53 | 54 | 1. 
Install the latest version of [Databricks CLI](https://docs.databricks.com/dev-tools/cli/databricks-cli.html). 55 | 56 | 2. Authenticate to your Databricks workspace, if you have not done so already: 57 | 58 | ```sh 59 | databricks configure 60 | ``` 61 | 62 | 3. Set environment variable `DATABRICKS_CONFIG_PROFILE` to the name of Databricks CLI profile you configured, and configure necessary variables in the `dev` profile of `databricks.yml` file. You need to specify the following (create necessary objects before use): 63 | 64 | - `catalog_name` - the name of the existing UC Catalog used in configuration. 65 | - `silver_schema_name` - the name of an existing UC Schema to put processed data of individual log sources. 66 | - `normalized_schema_name` - the name of an existing UC Schema to put tables with normalized data, IoCs and Detections tables. 67 | - `log_files_path` - the path to an existing UC Volume where raw log data will be stored. 68 | 69 | 4. To deploy a development copy of this project, type: 70 | 71 | ```sh 72 | databricks bundle deploy 73 | ``` 74 | 75 | 5. Run a job to set up the normalized tables and download sample log files: 76 | 77 | ```sh 78 | databricks bundle run dlt_cyber_demo_setup 79 | ``` 80 | 81 | 6. Run DLT pipelines to ingest data: 82 | 83 | ```sh 84 | databricks bundle run demo_ingest_zeek_data 85 | databricks bundle run demo_ingest_apache_data 86 | ``` 87 | 88 | 7. Run DLT pipeline that emulates detections against normalized data: 89 | 90 | ```sh 91 | databricks bundle run demo_detections 92 | ``` 93 | 94 | -------------------------------------------------------------------------------- /dlt_modern_stuff/src/detections.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install -U https://github.com/alexott/cyber-spark-data-connectors/releases/download/v0.0.4/cyber_spark_data_connectors-0.0.4-py3-none-any.whl 3 | 4 | # COMMAND ---------- 5 | 6 | import dlt 7 | 8 | # COMMAND ---------- 9 | 10 | import pyspark.sql.functions as F 11 | 12 | # COMMAND ---------- 13 | 14 | from helpers import get_normalized_table_name, NETWORK_TABLE_NAME, HTTP_TABLE_NAME, get_qualified_table_name 15 | 16 | # COMMAND ---------- 17 | 18 | detections_table_name = get_qualified_table_name("gold", "detections", spark) 19 | dlt.create_streaming_table( 20 | name=detections_table_name, 21 | comment="Streaming table for detections" 22 | ) 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 1,Check network traffic for connections to known IoCs 27 | @dlt.append_flow( 28 | target=detections_table_name, 29 | ) 30 | def match_network_iocs(): 31 | network_table = spark.readStream.table(get_normalized_table_name(NETWORK_TABLE_NAME)) 32 | iocs_table = spark.read.table(get_normalized_table_name("iocs")).filter("ioc_type = 'IPv4'") 33 | matches = network_table.join(iocs_table, network_table.dst_endpoint.ip == iocs_table.ioc) 34 | matches = matches.selectExpr( 35 | "to_json(struct(*)) as details", 36 | "current_timestamp() as detection_time", 37 | "'network' as detection_source", 38 | "'ioc_match' as detection_type", 39 | "'warn' as detection_level" 40 | ) 41 | return matches 42 | 43 | 44 | # COMMAND ---------- 45 | 46 | # DBTITLE 1,Check HTTP logs for scan of admin pages from external IPs 47 | @dlt.append_flow( 48 | target=detections_table_name, 49 | ) 50 | def check_http_logs_admin_scan(): 51 | http_table = spark.readStream.table(get_normalized_table_name(HTTP_TABLE_NAME)) 52 | matches = 
http_table.filter("http_request.url like '/admin%' and not (src_endpoint.ip like '192.168.%' or src_endpoint.ip like '10.%')") 53 | matches = matches.selectExpr( 54 | "to_json(struct(*)) as details", 55 | "current_timestamp() as detection_time", 56 | "'http' as detection_source", 57 | "'http_admin_page_scan' as detection_type", 58 | "'info' as detection_level" 59 | ) 60 | return matches 61 | 62 | 63 | # COMMAND ---------- 64 | 65 | push_to_eventhub = spark.conf.get("conf.push_to_eventhubs", "false") == "true" 66 | if push_to_eventhub: 67 | # name of EH namespace 68 | eh_ns = spark.conf.get("conf.eh_ns") 69 | # name of a topic in EH namespace 70 | eh_topic = spark.conf.get("conf.eh_topic") 71 | # Entra ID Tenant ID where service principal is created 72 | tenant_id = spark.conf.get("conf.azure_tenant_id") 73 | secret_scope = spark.conf.get("conf.secret_scope") 74 | sp_id_key_name = spark.conf.get("conf.sp_id_key_name") 75 | sp_secret_key_name = spark.conf.get("conf.sp_secret_key_name") 76 | client_id = dbutils.secrets.get(secret_scope, sp_id_key_name) # Application ID of service principal 77 | client_secret = dbutils.secrets.get(secret_scope, sp_secret_key_name) # Client secret of service principal 78 | # fully qualified name of the Event Hubs server 79 | eh_server = eh_server = f"{eh_ns}.servicebus.windows.net" 80 | # SASL config for Kafka to connect to Event Hubs 81 | sasl_config = f'kafkashaded.org.apache.kafka.common.security.oauthbearer.OAuthBearerLoginModule' + \ 82 | f' required clientId="{client_id}" clientSecret="{client_secret}"' + \ 83 | f' scope="https://{eh_server}/.default" ssl.protocol="SSL";' 84 | # Callback class for OAuth authentication 85 | callback_class = "kafkashaded.org.apache.kafka.common.security.oauthbearer.secured.OAuthBearerLoginCallbackHandler" 86 | # OAuth endpoint for Entra ID 87 | oauth_endpoint = f"https://login.microsoft.com/{tenant_id}/oauth2/v2.0/token" 88 | 89 | dlt.create_sink( 90 | "eventhubs", 91 | "kafka", 92 | { # Create Kafka options dictionary for connection with OAuth authentication 93 | "kafka.bootstrap.servers": f"{eh_server}:9093", 94 | "topic": eh_topic, 95 | "kafka.security.protocol": "SASL_SSL", 96 | "kafka.sasl.mechanism": "OAUTHBEARER", 97 | "kafka.sasl.jaas.config": sasl_config, 98 | "kafka.sasl.oauthbearer.token.endpoint.url": oauth_endpoint, 99 | "kafka.sasl.login.callback.handler.class": callback_class, 100 | "kafka.request.timeout.ms": "60000", 101 | "kafka.session.timeout.ms": "30000", 102 | } 103 | ) 104 | 105 | @dlt.append_flow(name = "write_alerts", target = "eventhubs") 106 | def flowFunc(): 107 | df = dlt.read_stream(detections_table_name) 108 | return df.select(F.to_json(F.struct("*")).alias("value")) 109 | 110 | # COMMAND ---------- 111 | 112 | push_to_splunk = spark.conf.get("conf.push_to_splunk", "false") == "true" 113 | if push_to_splunk: 114 | from cyber_connectors import * 115 | spark.dataSource.register(SplunkDataSource) 116 | 117 | splunk_opts = { 118 | "url": spark.conf.get("conf.splunk_url") , 119 | "token": spark.conf.get("conf.splunk_hec_token"), 120 | "time_column": "detection_time", 121 | "source": "dlt", 122 | } 123 | dlt.create_sink("splunk", "splunk", splunk_opts) 124 | 125 | @dlt.append_flow(name = "write_to_splunk", target = "splunk") 126 | def flowFunc(): 127 | df = dlt.read_stream(detections_table_name) 128 | df = df.withColumn("details", F.from_json("details", "map")) 129 | return df 130 | -------------------------------------------------------------------------------- 
/dlt_modern_stuff/src/ingest_aws_cloudtrail.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import dlt 3 | 4 | # COMMAND ---------- 5 | 6 | import pyspark.sql.functions as F 7 | from pyspark.sql import DataFrame 8 | 9 | from typing import Optional 10 | 11 | import random 12 | import datetime 13 | 14 | # COMMAND ---------- 15 | 16 | from helpers import get_qualified_table_name, NETWORK_TABLE_NAME, create_normalized_sink, sanitize_string_for_flow_name 17 | 18 | # COMMAND ---------- 19 | 20 | aws_cloudtrail_table_name = get_qualified_table_name("silver", "aws_cloudtrail", spark) 21 | dlt.create_streaming_table( 22 | name=aws_cloudtrail_table_name, 23 | cluster_by = ["event_name", "event_time"], 24 | table_properties = { 25 | # These properties are necessary for Variant support 26 | "delta.minWriterVersion": "7", 27 | "delta.enableDeletionVectors": "true", 28 | "delta.minReaderVersion": "3", 29 | "delta.feature.variantType-preview": "supported", 30 | "delta.feature.deletionVectors": "supported", 31 | }, 32 | ) 33 | 34 | # COMMAND ---------- 35 | 36 | def normalize_aws_cloudtrail(df: DataFrame, raw_column_name: str = "_raw") -> DataFrame: 37 | df = df.selectExpr(f"{raw_column_name}:Records::array as _records") 38 | df = df.select(F.explode("_records").alias("_record")) 39 | df = df.selectExpr("*", "_record:resources::variant as resources") 40 | 41 | view_name = f"cloudtrail_{int(datetime.datetime.now().timestamp())}_{random.randint(0, 1000)}" 42 | df.createOrReplaceTempView(view_name) 43 | 44 | # TODO: rewrite to PySpark when we get support for `variant_explode_outer` in DLT 45 | # https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.tvf.TableValuedFunction.variant_explode_outer.html 46 | df = spark.sql(f"""SELECT 47 | _record:awsRegion::string as aws_region, 48 | _record:eventID::string as event_id, 49 | _record:eventName::string as event_name, 50 | _record:eventSource::string as event_source, 51 | _record:eventType::string as event_type, 52 | _record:eventTime::timestamp as event_time, 53 | _record:eventVersion::string as event_version, 54 | _record:recipientAccountId::string as recipient_account_id, 55 | _record:requestID::string as request_id, 56 | _record:requestParameters::variant as request_parameters, 57 | _record:responseElements::variant as response_elements, 58 | _record:responseElements.user.arn::string as response_user_arn, 59 | _record:responseElements.role.arn::string as response_role_arn, 60 | _record:responseElements.policy.arn::string as response_policy_arn, 61 | resource.value:accountId::string as resources_account_id, 62 | _record:sourceIPAddress::string as source_ip_address, 63 | _record:userAgent::string as user_agent, 64 | _record:userIdentity::variant as user_identity, 65 | _record:userIdentity.type::string as user_identity_type, 66 | _record:userIdentity.principalId::string as user_identity_principal, 67 | _record:userIdentity.arn::string as user_identity_arn, 68 | _record:userIdentity.accountId::string as user_identity_account_id, 69 | _record:userIdentity.invokedBy::string as user_identity_invoked_by, 70 | _record:userIdentity.accessKeyId::string as user_identity_access_ke, 71 | _record:userIdentity.userName::string as user_identity_username, 72 | _record:userIdentity.sessionContext.attributes.mfaAuthenticated::boolean as user_identity_session_context_attributes_mfa_authenticated, 73 | _record:userIdentity.sessionContext.attributes.creationDate::string as 
user_identity_session_context_attributes_creation_date, 74 | _record:userIdentity.sessionContext.sessionIssuer.type::string as user_identity_session_context_session_issuer_type, 75 | _record:userIdentity.sessionContext.sessionIssuer.principalId::string as user_identity_session_context_session_issuer_principal_id, 76 | _record:userIdentity.sessionContext.sessionIssuer.arn::string as user_identity_session_context_session_issuer_arn, 77 | _record:userIdentity.sessionContext.sessionIssuer.accountId::string as user_identity_session_context_session_issuer_account_id, 78 | _record:userIdentity.sessionContext.sessionIssuer.userName::string as user_identity_session_context_session_issuer_user_name, 79 | _record:errorCode::string as error_code, 80 | _record:errorMessage::string as error_message, 81 | _record:additionalEventData::variant as additional_event_data, 82 | _record:apiVersion::string as api_version, 83 | _record:readOnly::boolean as read_only, 84 | _record:serviceEventDetails::string as service_event_details, 85 | _record:sharedEventId::string as shared_event_id, 86 | _record:vpcEndpointId::string as vpc_endpoint_id, 87 | _record, resource.value as resource 88 | FROM {view_name}, LATERAL variant_explode_outer(resources) as resource""") 89 | 90 | return df 91 | 92 | # COMMAND ---------- 93 | 94 | def read_aws_cloudtrail(input: str, add_opts: Optional[dict] = None) -> DataFrame: 95 | autoloader_opts = { 96 | "cloudFiles.format": "json", 97 | "singleVariantColumn": "_raw", 98 | #"cloudFiles.useManagedFileEvents": "true", 99 | } | (add_opts or {}) 100 | df = spark.readStream.format("cloudFiles") \ 101 | .options(**autoloader_opts).load(input) 102 | return normalize_aws_cloudtrail(df) 103 | 104 | # COMMAND ---------- 105 | 106 | # #sdf = read_aws_cloudtrail("/Volumes/cybersecurity/logs/logs/aws-cloudtrail/") 107 | # sdf = read_aws_cloudtrail("/Volumes/cybersecurity/logs/logs/demo/logs/aws_cloudtrail/") 108 | # display(sdf) 109 | 110 | # COMMAND ---------- 111 | 112 | def create_aws_cloudtrail_flow(input: str, add_opts: Optional[dict] = None): 113 | @dlt.append_flow( 114 | name=f"aws_cloudtrail_{sanitize_string_for_flow_name(input)}", 115 | target=aws_cloudtrail_table_name, 116 | comment=f"Ingesting from {input}", 117 | ) 118 | def flow(): 119 | return read_aws_cloudtrail(input, add_opts) 120 | 121 | # COMMAND ---------- 122 | 123 | aws_cloudtrail_input = spark.conf.get("conf.aws_cloudtrail_input") 124 | # We're using the input location as-is, but it could also be passed as a list to generate multiple flows 125 | create_aws_cloudtrail_flow(aws_cloudtrail_input) 126 | -------------------------------------------------------------------------------- /iocs-ingest/IoCs Silver.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import os 3 | from typing import List, Dict, Any, Optional 4 | 5 | from delta.tables import DeltaTable 6 | from pyspark.sql import DataFrame, SparkSession 7 | import pyspark.sql.functions as F 8 | 9 | # COMMAND ---------- 10 | 11 | # Variables...
12 | base_dir = "/mnt/cyberdata" 13 | iocs_table_name = "cyber.iocs" 14 | 15 | # COMMAND ---------- 16 | 17 | # MAGIC %run "./IoCs Common" 18 | 19 | # COMMAND ---------- 20 | 21 | df = spark.readStream\ 22 | .format("delta")\ 23 | .option("ignoreChanges", "true") \ 24 | .load(f"{base_dir}/bronze/threatintel/") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ## TODOs 29 | # MAGIC 30 | # MAGIC * \[X\] Extract `first_seen` timestamp either from data, or based on the first timestamp from the threat intel feed 31 | # MAGIC * \[ \] Think about how to handle `last_seen` over time... For example, set `last_seen` for IPs to "first_seen + N days"? 32 | 33 | # COMMAND ---------- 34 | 35 | # DBTITLE 1,Do the base decode that will be used for all pipelines 36 | message_df = df.select("dataset", "timestamp", "msg_hash", F.from_json("value", "message string").alias("json")) \ 37 | .select("*", "json.message").drop("json") 38 | 39 | # COMMAND ---------- 40 | 41 | spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "true") 42 | 43 | # COMMAND ---------- 44 | 45 | def process_single_source(input_df: DataFrame, service: str, transform_func): 46 | checkpoint = f"{base_dir}/checkpoints/threatintel-silver-{service}/" 47 | transform_func(input_df).writeStream.format("delta") \ 48 | .option("checkpointLocation", checkpoint) \ 49 | .trigger(availableNow=True) \ 50 | .foreachBatch(lambda df, epoch: drop_duplicates_with_merge(df, primary_key_columns=["ioc_type", "ioc"], 51 | path=f"{base_dir}/silver/threatintel/", table_name=iocs_table_name, 52 | partitionby=["ioc_type"], opts={"mergeSchema": "true"})) \ 53 | .start().awaitTermination() 54 | 55 | # COMMAND ---------- 56 | 57 | # MAGIC %md ## Handle OTX (Open Threat Exchange) feed 58 | 59 | # COMMAND ---------- 60 | 61 | def handle_otx(df: DataFrame) -> DataFrame: 62 | otx_schema = 'STRUCT' 63 | otx_df = df.filter("dataset = 'otx'").select("*", F.from_json("message", otx_schema).alias("jsn")).select("*", "jsn.*").drop("jsn", "message") 64 | otx_df = otx_df.withColumnRenamed("type", "ioc_type") \ 65 | .withColumnRenamed("indicator", "ioc") \ 66 | .withColumn("ioc_id", F.col("id").cast("string")) \ 67 | .withColumnRenamed("description", "ioc_description") \ 68 | .withColumnRenamed("content", "ioc_content") \ 69 | .withColumnRenamed("title", "ioc_title") \ 70 | .withColumn("first_seen", F.col("timestamp")) 71 | return otx_df 72 | 73 | # COMMAND ---------- 74 | 75 | process_single_source(message_df, "otx", handle_otx) 76 | 77 | # COMMAND ---------- 78 | 79 | # MAGIC %md ## Handle AbuseURL feed 80 | 81 | # COMMAND ---------- 82 | 83 | def handle_abuseurl(df: DataFrame) -> DataFrame: 84 | abuseurl_schema = 'STRUCT, date_added: STRING, host: STRING, id: STRING, larted: STRING, reporter: STRING, tags: ARRAY, threat: STRING, url: STRING, url_status: STRING, urlhaus_reference: STRING>' 85 | abuseurl_df = df.filter("dataset = 'abuseurl'").select("*", F.from_json("message", abuseurl_schema).alias("abuseurl")) \ 86 | .selectExpr("*", "abuseurl.url as ioc", "abuseurl.id as ioc_id", "'URL' as ioc_type", 87 | "cast(abuseurl.date_added as timestamp) as first_seen").drop("message") 88 | return abuseurl_df 89 | 90 | # COMMAND ---------- 91 | 92 | process_single_source(message_df, "abuseurl", handle_abuseurl) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md ## Handle Malware Bazaar feed 97 | 98 | # COMMAND ---------- 99 | 100 | def handle_malwarebazaar(df: DataFrame) -> DataFrame: 101 | malwarebazaar_schema = "STRUCT, dhash_icon: STRING, file_name: STRING, file_size:
BIGINT, file_type: STRING, file_type_mime: STRING, first_seen: STRING, imphash: STRING, intelligence: STRUCT, last_seen: STRING, md5_hash: STRING, origin_country: STRING, reporter: STRING, sha1_hash: STRING, sha256_hash: STRING, sha3_384_hash: STRING, signature: STRING, ssdeep: STRING, tags: ARRAY, telfhash: STRING, tlsh: STRING>" 102 | 103 | mb_df = df.filter("dataset = 'malwarebazaar'").select("*", F.from_json("message", malwarebazaar_schema).alias("malwarebazaar")) \ 104 | .select("malwarebazaar.file_name", "malwarebazaar.file_size", "malwarebazaar.file_type", "malwarebazaar.file_type_mime", 105 | F.expr("cast(malwarebazaar.first_seen as timestamp) as first_seen"), 106 | F.expr("cast(malwarebazaar.last_seen as timestamp) as last_seen"), 107 | F.posexplode(F.create_map(F.lit('FileHash-MD5'), "malwarebazaar.md5_hash", 108 | F.lit('FileHash-ImpHash'), "malwarebazaar.imphash", 109 | F.lit('FileHash-SHA1'), "malwarebazaar.sha1_hash", 110 | F.lit('FileHash-SHA256'), "malwarebazaar.sha256_hash", 111 | F.lit('FileHash-SHA384'), "malwarebazaar.sha3_384_hash", 112 | F.lit('FileHash-SSDEEP'), "malwarebazaar.ssdeep", 113 | F.lit('FileHash-TElfHash'), "malwarebazaar.telfhash", 114 | F.lit('FileHash-TLSH'), "malwarebazaar.tlsh").alias("_map_")), 115 | "*").drop("pos", "message").withColumnRenamed("key", "ioc_type").withColumnRenamed("value", "ioc").filter("ioc is not null") 116 | return mb_df 117 | 118 | # COMMAND ---------- 119 | 120 | process_single_source(message_df, "malwarebazaar", handle_malwarebazaar) 121 | 122 | # COMMAND ---------- 123 | 124 | # MAGIC %md ## Handle Abuse Malware feed 125 | 126 | # COMMAND ---------- 127 | 128 | def handle_abusemalware(df: DataFrame) -> DataFrame: 129 | abusemalware_schema = "STRUCT" 130 | am_df = df.filter("dataset = 'abusemalware'").select("*", F.from_json("message", abusemalware_schema).alias("abusemalware")) \ 131 | .select(F.expr("cast(abusemalware.file_size as long) as file_size"), "abusemalware.file_type", 132 | F.expr("cast(abusemalware.firstseen as timestamp) as first_seen"), 133 | F.posexplode(F.create_map(F.lit('FileHash-MD5'), "abusemalware.md5_hash", 134 | F.lit('FileHash-ImpHash'), "abusemalware.imphash", 135 | F.lit('FileHash-SHA256'), "abusemalware.sha256_hash", 136 | F.lit('FileHash-SSDEEP'), "abusemalware.ssdeep", 137 | F.lit('FileHash-TLSH'), "abusemalware.tlsh").alias("_map_")), 138 | "*").drop("pos", "message").withColumnRenamed("key", "ioc_type").withColumnRenamed("value", "ioc").filter("ioc is not null") 139 | return am_df 140 | 141 | # COMMAND ---------- 142 | 143 | process_single_source(message_df, "abusemalware", handle_abusemalware) 144 | 145 | # COMMAND ---------- 146 | 147 | # MAGIC %sql 148 | # MAGIC -- do only after the table is created 149 | # MAGIC -- CREATE BLOOMFILTER INDEX ON TABLE cyber.iocs FOR COLUMNS( ioc ) 150 | -------------------------------------------------------------------------------- /uc-udfs/protocols.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## User-defined functions to work with network protocols 5 | -- MAGIC 6 | -- MAGIC Mapping is as defined by [IANA](https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml) 7 | 8 | -- COMMAND ---------- 9 | 10 | CREATE OR REPLACE FUNCTION proto_name_to_code(name STRING) 11 | RETURNS INT 12 | COMMENT 'Maps network protocol name into a numeric code as per IANA' 13 | RETURN CASE lower(name) 14 | WHEN 'hopopt' THEN 0 15 | WHEN 'icmp' THEN 1 
16 | WHEN 'igmp' THEN 2 17 | WHEN 'ggp' THEN 3 18 | WHEN 'ipv4' THEN 4 19 | WHEN 'st' THEN 5 20 | WHEN 'tcp' THEN 6 21 | WHEN 'cbt' THEN 7 22 | WHEN 'egp' THEN 8 23 | WHEN 'igp' THEN 9 24 | WHEN 'bbn-rcc-mon' THEN 10 25 | WHEN 'nvp-ii' THEN 11 26 | WHEN 'pup' THEN 12 27 | WHEN 'argus' THEN 13 28 | WHEN 'emcon' THEN 14 29 | WHEN 'xnet' THEN 15 30 | WHEN 'chaos' THEN 16 31 | WHEN 'udp' THEN 17 32 | WHEN 'mux' THEN 18 33 | WHEN 'dcn-meas' THEN 19 34 | WHEN 'hmp' THEN 20 35 | WHEN 'prm' THEN 21 36 | WHEN 'xns-idp' THEN 22 37 | WHEN 'trunk-1' THEN 23 38 | WHEN 'trunk-2' THEN 24 39 | WHEN 'leaf-1' THEN 25 40 | WHEN 'leaf-2' THEN 26 41 | WHEN 'rdp' THEN 27 42 | WHEN 'irtp' THEN 28 43 | WHEN 'iso-tp4' THEN 29 44 | WHEN 'netblt' THEN 30 45 | WHEN 'mfe-nsp' THEN 31 46 | WHEN 'merit-inp' THEN 32 47 | WHEN 'dccp' THEN 33 48 | WHEN '3pc' THEN 34 49 | WHEN 'idpr' THEN 35 50 | WHEN 'xtp' THEN 36 51 | WHEN 'ddp' THEN 37 52 | WHEN 'idpr-cmtp' THEN 38 53 | WHEN 'tp++' THEN 39 54 | WHEN 'il' THEN 40 55 | WHEN 'ipv6' THEN 41 56 | WHEN 'sdrp' THEN 42 57 | WHEN 'ipv6-route' THEN 43 58 | WHEN 'ipv6-frag' THEN 44 59 | WHEN 'idrp' THEN 45 60 | WHEN 'rsvp' THEN 46 61 | WHEN 'gre' THEN 47 62 | WHEN 'dsr' THEN 48 63 | WHEN 'bna' THEN 49 64 | WHEN 'esp' THEN 50 65 | WHEN 'ah' THEN 51 66 | WHEN 'i-nlsp' THEN 52 67 | WHEN 'swipe' THEN 53 68 | WHEN 'narp' THEN 54 69 | WHEN 'min-ipv4' THEN 55 70 | WHEN 'tlsp' THEN 56 71 | WHEN 'skip' THEN 57 72 | WHEN 'ipv6-icmp' THEN 58 73 | WHEN 'ipv6-nonxt' THEN 59 74 | WHEN 'ipv6-opts' THEN 60 75 | WHEN 'cftp' THEN 62 76 | WHEN 'sat-expak' THEN 64 77 | WHEN 'kryptolan' THEN 65 78 | WHEN 'rvd' THEN 66 79 | WHEN 'ippc' THEN 67 80 | WHEN 'sat-mon' THEN 69 81 | WHEN 'visa' THEN 70 82 | WHEN 'ipcv' THEN 71 83 | WHEN 'cpnx' THEN 72 84 | WHEN 'cphb' THEN 73 85 | WHEN 'wsn' THEN 74 86 | WHEN 'pvp' THEN 75 87 | WHEN 'br-sat-mon' THEN 76 88 | WHEN 'sun-nd' THEN 77 89 | WHEN 'wb-mon' THEN 78 90 | WHEN 'wb-expak' THEN 79 91 | WHEN 'iso-ip' THEN 80 92 | WHEN 'vmtp' THEN 81 93 | WHEN 'secure-vmtp' THEN 82 94 | WHEN 'vines' THEN 83 95 | WHEN 'iptm' THEN 84 96 | WHEN 'nsfnet-igp' THEN 85 97 | WHEN 'dgp' THEN 86 98 | WHEN 'tcf' THEN 87 99 | WHEN 'eigrp' THEN 88 100 | WHEN 'ospfigp' THEN 89 101 | WHEN 'sprite-rpc' THEN 90 102 | WHEN 'larp' THEN 91 103 | WHEN 'mtp' THEN 92 104 | WHEN 'ax.25' THEN 93 105 | WHEN 'ipip' THEN 94 106 | WHEN 'micp' THEN 95 107 | WHEN 'scc-sp' THEN 96 108 | WHEN 'etherip' THEN 97 109 | WHEN 'encap' THEN 98 110 | WHEN 'gmtp' THEN 100 111 | WHEN 'ifmp' THEN 101 112 | WHEN 'pnni' THEN 102 113 | WHEN 'pim' THEN 103 114 | WHEN 'aris' THEN 104 115 | WHEN 'scps' THEN 105 116 | WHEN 'qnx' THEN 106 117 | WHEN 'a/n' THEN 107 118 | WHEN 'ipcomp' THEN 108 119 | WHEN 'snp' THEN 109 120 | WHEN 'compaq-peer' THEN 110 121 | WHEN 'ipx-in-ip' THEN 111 122 | WHEN 'vrrp' THEN 112 123 | WHEN 'pgm' THEN 113 124 | WHEN 'l2tp' THEN 115 125 | WHEN 'ddx' THEN 116 126 | WHEN 'iatp' THEN 117 127 | WHEN 'stp' THEN 118 128 | WHEN 'srp' THEN 119 129 | WHEN 'uti' THEN 120 130 | WHEN 'smp' THEN 121 131 | WHEN 'sm' THEN 122 132 | WHEN 'ptp' THEN 123 133 | WHEN 'isis over ipv4' THEN 124 134 | WHEN 'fire' THEN 125 135 | WHEN 'crtp' THEN 126 136 | WHEN 'crudp' THEN 127 137 | WHEN 'sscopmce' THEN 128 138 | WHEN 'iplt' THEN 129 139 | WHEN 'sps' THEN 130 140 | WHEN 'pipe' THEN 131 141 | WHEN 'sctp' THEN 132 142 | WHEN 'fc' THEN 133 143 | WHEN 'rsvp-e2e-ignore' THEN 134 144 | WHEN 'mobility header' THEN 135 145 | WHEN 'udplite' THEN 136 146 | WHEN 'mpls-in-ip' THEN 137 147 | WHEN 'manet' THEN 138 148 | WHEN 'hip' 
THEN 139 149 | WHEN 'shim6' THEN 140 150 | WHEN 'wesp' THEN 141 151 | WHEN 'rohc' THEN 142 152 | WHEN 'ethernet' THEN 143 153 | WHEN 'aggfrag' THEN 144 154 | WHEN 'nsh' THEN 145 155 | WHEN 'homa' THEN 146 156 | WHEN 'bit-emu' THEN 147 157 | WHEN 'reserved' THEN 255 158 | ELSE NULL 159 | END; 160 | 161 | -- COMMAND ---------- 162 | 163 | CREATE OR REPLACE FUNCTION proto_code_to_name(code INT) 164 | RETURNS STRING 165 | COMMENT 'Maps network protocol numeric code into the name as per IANA' 166 | RETURN 167 | CASE code 168 | WHEN 0 THEN 'HOPOPT' 169 | WHEN 1 THEN 'ICMP' 170 | WHEN 2 THEN 'IGMP' 171 | WHEN 3 THEN 'GGP' 172 | WHEN 4 THEN 'IPv4' 173 | WHEN 5 THEN 'ST' 174 | WHEN 6 THEN 'TCP' 175 | WHEN 7 THEN 'CBT' 176 | WHEN 8 THEN 'EGP' 177 | WHEN 9 THEN 'IGP' 178 | WHEN 10 THEN 'BBN-RCC-MON' 179 | WHEN 11 THEN 'NVP-II' 180 | WHEN 12 THEN 'PUP' 181 | WHEN 13 THEN 'ARGUS (deprecated)' 182 | WHEN 14 THEN 'EMCON' 183 | WHEN 15 THEN 'XNET' 184 | WHEN 16 THEN 'CHAOS' 185 | WHEN 17 THEN 'UDP' 186 | WHEN 18 THEN 'MUX' 187 | WHEN 19 THEN 'DCN-MEAS' 188 | WHEN 20 THEN 'HMP' 189 | WHEN 21 THEN 'PRM' 190 | WHEN 22 THEN 'XNS-IDP' 191 | WHEN 23 THEN 'TRUNK-1' 192 | WHEN 24 THEN 'TRUNK-2' 193 | WHEN 25 THEN 'LEAF-1' 194 | WHEN 26 THEN 'LEAF-2' 195 | WHEN 27 THEN 'RDP' 196 | WHEN 28 THEN 'IRTP' 197 | WHEN 29 THEN 'ISO-TP4' 198 | WHEN 30 THEN 'NETBLT' 199 | WHEN 31 THEN 'MFE-NSP' 200 | WHEN 32 THEN 'MERIT-INP' 201 | WHEN 33 THEN 'DCCP' 202 | WHEN 34 THEN '3PC' 203 | WHEN 35 THEN 'IDPR' 204 | WHEN 36 THEN 'XTP' 205 | WHEN 37 THEN 'DDP' 206 | WHEN 38 THEN 'IDPR-CMTP' 207 | WHEN 39 THEN 'TP++' 208 | WHEN 40 THEN 'IL' 209 | WHEN 41 THEN 'IPv6' 210 | WHEN 42 THEN 'SDRP' 211 | WHEN 43 THEN 'IPv6-Route' 212 | WHEN 44 THEN 'IPv6-Frag' 213 | WHEN 45 THEN 'IDRP' 214 | WHEN 46 THEN 'RSVP' 215 | WHEN 47 THEN 'GRE' 216 | WHEN 48 THEN 'DSR' 217 | WHEN 49 THEN 'BNA' 218 | WHEN 50 THEN 'ESP' 219 | WHEN 51 THEN 'AH' 220 | WHEN 52 THEN 'I-NLSP' 221 | WHEN 53 THEN 'SWIPE (deprecated)' 222 | WHEN 54 THEN 'NARP' 223 | WHEN 55 THEN 'Min-IPv4' 224 | WHEN 56 THEN 'TLSP' 225 | WHEN 57 THEN 'SKIP' 226 | WHEN 58 THEN 'IPv6-ICMP' 227 | WHEN 59 THEN 'IPv6-NoNxt' 228 | WHEN 60 THEN 'IPv6-Opts' 229 | WHEN 61 THEN 'Any host internal protocol' 230 | WHEN 62 THEN 'CFTP' 231 | WHEN 63 THEN 'Any local network' 232 | WHEN 64 THEN 'SAT-EXPAK' 233 | WHEN 65 THEN 'KRYPTOLAN' 234 | WHEN 66 THEN 'RVD' 235 | WHEN 67 THEN 'IPPC' 236 | WHEN 68 THEN 'Any distributed file system' 237 | WHEN 69 THEN 'SAT-MON' 238 | WHEN 70 THEN 'VISA' 239 | WHEN 71 THEN 'IPCV' 240 | WHEN 72 THEN 'CPNX' 241 | WHEN 73 THEN 'CPHB' 242 | WHEN 74 THEN 'WSN' 243 | WHEN 75 THEN 'PVP' 244 | WHEN 76 THEN 'BR-SAT-MON' 245 | WHEN 77 THEN 'SUN-ND' 246 | WHEN 78 THEN 'WB-MON' 247 | WHEN 79 THEN 'WB-EXPAK' 248 | WHEN 80 THEN 'ISO-IP' 249 | WHEN 81 THEN 'VMTP' 250 | WHEN 82 THEN 'SECURE-VMTP' 251 | WHEN 83 THEN 'VINES' 252 | WHEN 84 THEN 'IPTM' 253 | WHEN 85 THEN 'NSFNET-IGP' 254 | WHEN 86 THEN 'DGP' 255 | WHEN 87 THEN 'TCF' 256 | WHEN 88 THEN 'EIGRP' 257 | WHEN 89 THEN 'OSPFIGP' 258 | WHEN 90 THEN 'Sprite-RPC' 259 | WHEN 91 THEN 'LARP' 260 | WHEN 92 THEN 'MTP' 261 | WHEN 93 THEN 'AX.25' 262 | WHEN 94 THEN 'IPIP' 263 | WHEN 95 THEN 'MICP (deprecated)' 264 | WHEN 96 THEN 'SCC-SP' 265 | WHEN 97 THEN 'ETHERIP' 266 | WHEN 98 THEN 'ENCAP' 267 | WHEN 99 THEN 'Any private encryption scheme' 268 | WHEN 100 THEN 'GMTP' 269 | WHEN 101 THEN 'IFMP' 270 | WHEN 102 THEN 'PNNI' 271 | WHEN 103 THEN 'PIM' 272 | WHEN 104 THEN 'ARIS' 273 | WHEN 105 THEN 'SCPS' 274 | WHEN 106 THEN 'QNX' 275 | WHEN 107 THEN 
'A/N' 276 | WHEN 108 THEN 'IPComp' 277 | WHEN 109 THEN 'SNP' 278 | WHEN 110 THEN 'Compaq-Peer' 279 | WHEN 111 THEN 'IPX-in-IP' 280 | WHEN 112 THEN 'VRRP' 281 | WHEN 113 THEN 'PGM' 282 | WHEN 114 THEN 'Any 0-hop protocol' 283 | WHEN 115 THEN 'L2TP' 284 | WHEN 116 THEN 'DDX' 285 | WHEN 117 THEN 'IATP' 286 | WHEN 118 THEN 'STP' 287 | WHEN 119 THEN 'SRP' 288 | WHEN 120 THEN 'UTI' 289 | WHEN 121 THEN 'SMP' 290 | WHEN 122 THEN 'SM (deprecated)' 291 | WHEN 123 THEN 'PTP' 292 | WHEN 124 THEN 'ISIS over IPv4' 293 | WHEN 125 THEN 'FIRE' 294 | WHEN 126 THEN 'CRTP' 295 | WHEN 127 THEN 'CRUDP' 296 | WHEN 128 THEN 'SSCOPMCE' 297 | WHEN 129 THEN 'IPLT' 298 | WHEN 130 THEN 'SPS' 299 | WHEN 131 THEN 'PIPE' 300 | WHEN 132 THEN 'SCTP' 301 | WHEN 133 THEN 'FC' 302 | WHEN 134 THEN 'RSVP-E2E-IGNORE' 303 | WHEN 135 THEN 'Mobility Header' 304 | WHEN 136 THEN 'UDPLite' 305 | WHEN 137 THEN 'MPLS-in-IP' 306 | WHEN 138 THEN 'manet' 307 | WHEN 139 THEN 'HIP' 308 | WHEN 140 THEN 'Shim6' 309 | WHEN 141 THEN 'WESP' 310 | WHEN 142 THEN 'ROHC' 311 | WHEN 143 THEN 'Ethernet' 312 | WHEN 144 THEN 'AGGFRAG' 313 | WHEN 145 THEN 'NSH' 314 | WHEN 146 THEN 'Homa' 315 | WHEN 147 THEN 'BIT-EMU' 316 | WHEN 253 THEN 'Experimentation & testing' 317 | WHEN 254 THEN 'Experimentation & testing' 318 | WHEN 255 THEN 'Reserved' 319 | ELSE 'Unassigned' 320 | END; 321 | --------------------------------------------------------------------------------
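As a quick illustration of how these two functions could be called once they are registered in Unity Catalog (the `main.default` qualification below is an assumption for illustration, not something this repo defines):

# Minimal usage sketch for the protocol-mapping UDFs defined in protocols.sql.
# Assumes the functions were created in the main.default schema; adjust the
# three-level name to the catalog/schema where they were actually registered.
row = spark.sql("""
    SELECT main.default.proto_code_to_name(6)      AS proto_name,   -- 'TCP'
           main.default.proto_name_to_code('udp')  AS proto_code    -- 17
""").first()
print(row.proto_name, row.proto_code)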