├── .gitignore ├── 1_🏠HomePage.py ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── SECURITY.md ├── SUPPORT.md ├── data └── kql_queries.json ├── dev-notebooks ├── .ipynb_checkpoints │ ├── kql_query_harvester-checkpoint.ipynb │ └── repos-checkpoint.yaml ├── KQLHarvester-oldversion.ipynb ├── Kqlquery-schema.png ├── SampleDataStoreUse.ipynb ├── az-monitor-schemas.ipynb ├── db_code.ipynb ├── db_pandas_store.ipynb ├── db_schema.py ├── kql_query_harvester.ipynb ├── kqlquery.db └── repos.yaml ├── images └── DataFlowDiagram.png ├── kqlextraction ├── KqlExtraction │ ├── KqlExtraction.cs │ ├── KqlExtraction.csproj │ └── KqlExtraction.sln ├── Readme.txt ├── extract.py └── tests │ ├── test1.kql │ ├── test2.kql │ ├── test3.kql │ ├── test4.kql │ └── test5.kql ├── pages ├── 2_🔎KQL_interactive_search.py ├── 3_🛡️Schema_Browser.py ├── 4_ 📊KQL_Store_Insights.py └── 5_💬Contact_Us.py ├── requirements.txt ├── src ├── __init__.py ├── az_mon_schema.py ├── conf.txt ├── create_kql_db.py ├── data_store.py ├── extract.py ├── ian_test.kql ├── kql_download.py ├── kql_extract.py ├── kql_file_parser.py ├── kql_query.py ├── kqlextraction │ └── tests │ │ ├── test1.kql │ │ ├── test2.kql │ │ ├── test3.kql │ │ ├── test4.kql │ │ └── test5.kql ├── repos.yaml ├── test_data │ ├── test1.kql │ ├── test2.kql │ ├── test3.kql │ ├── test4.kql │ ├── test5.kql │ ├── test_10.json │ └── test_json.json ├── test_data_store.py ├── test_kql_download.py ├── test_kql_extract.py └── test_kql_query.py └── test_runs ├── kql_query_db-022-09-23_00_44_55.json ├── kql_query_db-2022-09-23-22-30-15.json ├── kql_query_db-2022-09-23-22-30-16.pkl ├── kql_query_db-2022-09-24-02-51-49.json ├── kql_query_db-2022-09-24-02-51-50.pkl └── kql_query_df--022-09-23_00_44_55.pkl /.gitignore: -------------------------------------------------------------------------------- 1 | **/.vs/** 2 | **/bin/Debug/** 3 | **/bin/Release/** 4 | **/obj/Debug/** 5 | **/obj/Release/** 6 | **/__pycache__/** 7 | **/obj/** 8 | -------------------------------------------------------------------------------- /1_🏠HomePage.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | 4 | from pathlib import Path 5 | 6 | 7 | def main() -> None: 8 | st.title(":mag_right: Interactive KQL Query Store") 9 | 10 | with st.expander("Expand to read more about the project"): 11 | st.write(Path("README.md").read_text()) 12 | 13 | st.success(":point_left: Select a page on the left sidebar to navigate between pages") 14 | 15 | 16 | if __name__ == "__main__": 17 | st.set_page_config( 18 | "Interactive KQL Query Store by MSTIC", 19 | "🔎", 20 | initial_sidebar_state="expanded", 21 | layout="wide", 22 | ) 23 | main() 24 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # app/Dockerfile 2 | 3 | FROM python:3.9-slim 4 | 5 | EXPOSE 8501 6 | 7 | WORKDIR /app 8 | 9 | RUN apt-get update && apt-get install -y \ 10 | build-essential \ 11 | software-properties-common \ 12 | git \ 13 | && rm -rf /var/lib/apt/lists/* 14 | 15 | RUN git clone https://github.com/microsoft/kql-query-store.git . 16 | 17 | RUN pip3 install -r requirements.txt 18 | 19 | ENTRYPOINT ["streamlit", "run", "1_🏠HomePage.py", "--server.port=8501", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Interactive KQL Query Store 2 | 3 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://aka.ms/kql-query-store) 4 | 5 | Many KQL queries are currently published on GitHub by Microsoft and the security community. These queries are scattered as unstructured data across many different places, making them difficult for defenders and detection authors to discover. 6 | 7 | The GitHub search interface is not flexible enough to satisfy defenders' custom search needs, such as searching KQL queries by data source, KQL operators, parsing of complex fields in data sources, or custom tags where available. Making existing queries easy to discover helps defenders reference prior work while writing new queries, reuse complex parsing examples for specific data sources, and much more. 8 | 9 | ## Project Goals 10 | 11 | - An organized, structured data store of KQL queries. 12 | - Easy discoverability of KQL queries based on tags, KQL operators, data source, etc. 
13 | - Point to relevant sources and GitHub links. 14 | - Interactive dashboard to explore the structured data. 15 | - Insights into the various KQL queries published for Azure Sentinel. 16 | 17 | ## Architecture 18 | ![raw_image](https://raw.github.com/microsoft/kql-query-store/master/images/DataFlowDiagram.png) 19 | 20 | 21 | ## Docker instructions 22 | If you wish to host this locally or in-house, you can use the instructions below to build the Docker image and run it. For more detailed instructions, check out the Streamlit docs: [Deploy Streamlit using Docker](https://docs.streamlit.io/knowledge-base/tutorials/deploy/docker) 23 | 24 | Build the image 25 | 26 | `docker build -t kql-query-store .` 27 | 28 | Run the Docker container 29 | 30 | `docker run -p 8501:8501 kql-query-store` 31 | 32 | ## Contributing 33 | 34 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 35 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 36 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 37 | 38 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 39 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 40 | provided by the bot. You will only need to do this once across all repos using our CLA. 41 | 42 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 43 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 44 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # TODO: The maintainer of this repo has not yet edited this file 2 | 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project? 4 | 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help. 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps. 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide. 8 | 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.* 10 | 11 | # Support 12 | 13 | ## How to file issues and get help 14 | 15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 17 | feature request as a new Issue. 18 | 19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER 21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**. 22 | 23 | ## Microsoft Support Policy 24 | 25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above. 
26 | -------------------------------------------------------------------------------- /dev-notebooks/.ipynb_checkpoints/repos-checkpoint.yaml: -------------------------------------------------------------------------------- 1 | - Github: 2 | branch: main 3 | repo: reprise99/Sentinel-Queries 4 | - Github: 5 | branch: main 6 | repo: ugurkocde/KQL_Intune -------------------------------------------------------------------------------- /dev-notebooks/KQLHarvester-oldversion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "319188e6-ccfe-43e7-bd41-665e1f6450c3", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import warnings\n", 11 | "from pathlib import Path\n", 12 | "import requests\n", 13 | "import io\n", 14 | "import zipfile\n", 15 | "from requests.exceptions import HTTPError\n", 16 | "import glob\n", 17 | "import pandas as pd\n", 18 | "import yaml\n", 19 | "from pandas import json_normalize\n", 20 | "\n", 21 | "def get_repo_urls(filename, branch_name):\n", 22 | "    git_url = 'https://github.com/'\n", 23 | "    file_name = f'{branch_name}.zip'\n", 24 | "    suffix_string = 'archive/' + file_name\n", 25 | "    with open(filename, 'r', encoding='UTF-8') as f:\n", 26 | "        repos = [git_url + line.rstrip() for line in f]\n", 27 | "    repo_archive_urls = [line + suffix_string for line in repos]\n", 28 | "    \n", 29 | "    return repo_archive_urls\n", 30 | "\n", 31 | "def download_git_archive(git_url, output_dir):\n", 32 | "    print(f\"Downloading from {git_url}, may take a few mins..\")\n", 33 | "    try:\n", 34 | "        r = requests.get(git_url)\n", 35 | "        repo_zip = io.BytesIO(r.content)\n", 36 | "        archive = zipfile.ZipFile(repo_zip, mode=\"r\")\n", 37 | "        for file in archive.namelist():\n", 38 | "            archive.extract(file, path=output_dir)\n", 39 | "        print(\"Downloaded and Extracted Files successfully\")\n", 40 | "    except HTTPError as http_err:\n", 41 | "        warnings.warn(f\"HTTP error occurred trying to download from GitHub: {http_err}\")\n", 42 | "    \n", 43 | "def get_sentinel_queries_from_github(git_url, outputdir):\n", 44 | "    print(\"Downloading from Azure Sentinel GitHub, may take 2-3 mins..\")\n", 45 | "    try:\n", 46 | "        r = requests.get(git_url)\n", 47 | "        repo_zip = io.BytesIO(r.content)\n", 48 | "        archive = zipfile.ZipFile(repo_zip, mode=\"r\")\n", 49 | "        # Only extract Detections and Hunting Queries Folder\n", 50 | "        for file in archive.namelist():\n", 51 | "            if file.startswith(\n", 52 | "                (\n", 53 | "                    \"Azure-Sentinel-master/Detections/\",\n", 54 | "                    \"Azure-Sentinel-master/Hunting Queries/\",\n", 55 | "                    \"Azure-Sentinel-master/Solutions/\"\n", 56 | "                )\n", 57 | "            ):\n", 58 | "                archive.extract(file, path=outputdir)\n", 59 | "        print(\"Downloaded and Extracted Files successfully\")\n", 60 | "    except HTTPError as http_err:\n", 61 | "        warnings.warn(f\"HTTP error occurred trying to download from GitHub: {http_err}\")\n", 62 | "    \n", 63 | "def parse_yaml(parent_dir, child_dir):\n", 64 | "\n", 65 | "    sentinel_repourl = \"https://github.com/Azure/Azure-Sentinel/blob/master\"\n", 66 | "\n", 67 | "    # Collect list of files recursively under a folder\n", 68 | "    yaml_queries = glob.glob(f\"{parent_dir}/{child_dir}/**/*.yaml\", recursive=True)\n", 69 | "    df = pd.DataFrame()\n", 70 | "\n", 71 | "    # Recursively load yaml files and append to dataframe\n", 72 | "    for query in yaml_queries:\n", 73 | "        with open(query, \"r\", encoding=\"utf-8\", errors=\"ignore\") as f:\n", 74 | "            parsed_yaml_df = 
json_normalize(yaml.load(f, Loader=yaml.FullLoader))\n", 75 | " parsed_yaml_df[\"DetectionURL\"] = query.replace(parent_dir, sentinel_repourl)\n", 76 | " frames = [df, parsed_yaml_df]\n", 77 | " df = pd.concat(frames, ignore_index=True, sort=True)\n", 78 | "\n", 79 | " if child_dir == \"Detections\":\n", 80 | " df[\"DetectionType\"] = \"Analytics\"\n", 81 | " elif child_dir == \"Hunting Queries\":\n", 82 | " df[\"DetectionType\"] = \"Hunting\"\n", 83 | " elif child_dir == \"Solutions\":\n", 84 | " df[\"DetectionType\"] = \"Solutions\"\n", 85 | "\n", 86 | " df[\"DetectionService\"] = \"Azure Sentinel Community Github\"\n", 87 | "\n", 88 | " return df" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 2, 94 | "id": "6a38f199-2250-45fa-ae24-bcd8dcfbde70", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Downloading from https://github.com/reprise99/Sentinel-Queries/archive/main.zip, may take few mins..\n", 102 | "Downloaded and Extracted Files successfully\n", 103 | "Downloading from https://github.com/ugurkocde/KQL_Intune/archive/main.zip, may take few mins..\n", 104 | "Downloaded and Extracted Files successfully\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "repo_archive_urls = get_repo_urls('repo.conf', 'main')\n", 110 | "#Set output dir\n", 111 | "output_dir = Path.cwd()\n", 112 | "\n", 113 | "#download git repos\n", 114 | "for url in repo_archive_urls:\n", 115 | " download_git_archive(url, output_dir)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "2ae3721d-87de-4d3a-ad1b-e259db980f42", 122 | "metadata": { 123 | "scrolled": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "#Download and Parses Microsoft Sentinel Repos\n", 128 | "azsentinel_git_url = \"https://github.com/Azure/Azure-Sentinel/archive/master.zip\"\n", 129 | "get_sentinel_queries_from_github(git_url=azsentinel_git_url, outputdir=output_dir)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "id": "bbfb8388-2acb-4565-9fad-8e04d1c1146f", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "tmp_path = str(Path.cwd())\n", 140 | "\n", 141 | "base_dir = tmp_path + \"/Azure-Sentinel-master\"\n", 142 | "columns = ['id', 'description', 'DetectionURL','query','tags','tactics','techniques']\n", 143 | "detections_df = parse_yaml(parent_dir=base_dir, child_dir=\"Detections\")\n", 144 | "detections_df = detections_df[columns]\n", 145 | "# hunting_df = parse_yaml(parent_dir=base_dir, child_dir=\"Hunting Queries\")\n", 146 | "# hunting_df = hunting_df[columns]\n", 147 | "solutions_df = parse_yaml(parent_dir=base_dir, child_dir=\"Solutions\")\n", 148 | "solutions_df = solutions_df[columns]\n", 149 | "\n", 150 | "frames = [detections_df, solutions_df]\n", 151 | "sentinel_df = pd.concat(frames, ignore_index=True, sort=True)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "8501dc5e-0e77-44d2-a63c-1899c9da8e2b", 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "sentinel_df.head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 226, 167 | "id": "7463daec", 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "def parse_markdown():\n", 172 | " df = pd.DataFrame()\n", 173 | " \n", 174 | " # Collect list of files recursively under a folder\n", 175 | " parent_dir = tmp_path + \"/KQL_Intune-main\"\n", 176 | " md_queries = 
glob.glob(f\"{parent_dir}/**/*.md\", recursive=True)\n", 177 | " parent_dir = tmp_path + \"/Sentinel-Queries-main\"\n", 178 | " md_queries = md_queries + glob.glob(f\"{parent_dir}/**/*.md\", recursive=True)\n", 179 | " \n", 180 | " df = pd.DataFrame(columns=['title', 'kql_query'])\n", 181 | " \n", 182 | " # Recursively load md Files and append to dataframe\n", 183 | " for query in md_queries:\n", 184 | " print(\"loading file:\", query)\n", 185 | " lines = Path(query).read_text(encoding=\"utf-8\").split('\\n')\n", 186 | "# print(lines)\n", 187 | "# kql_lines = re.findall(\"```kql([^```]*)\", lines)\n", 188 | "# ret.extend(kql_lines)\n", 189 | " ct = 0\n", 190 | " kql = False\n", 191 | " kql_collect = []\n", 192 | " title_collect = []\n", 193 | " cur_kql = []\n", 194 | " title = \"n/a\"\n", 195 | " while ct < len(lines):\n", 196 | " if kql:\n", 197 | " cur_kql.append(l[ct])\n", 198 | " if (lines[ct].startswith(\"#\") and lines[ct+2] == \"```kql\"):\n", 199 | " # print(l[ct])\n", 200 | " kql = True\n", 201 | " title = lines[ct]\n", 202 | " elif (lines[ct] == \"```kql\"):\n", 203 | " kql = True\n", 204 | " elif lines[ct] == \"```\":\n", 205 | " kql = False\n", 206 | " cur_kql = \"\\n\".join(cur_kql)\n", 207 | " kql_collect.append(cur_kql)\n", 208 | " title_collect.append(title)\n", 209 | " title = \"n/a\"\n", 210 | " cur_kql = []\n", 211 | " ct+=1\n", 212 | " test_df = pd.DataFrame(list(zip(title_collect, kql_collect)), columns=['title', 'kql_query'])\n", 213 | "# df.append(test_df)\n", 214 | " df = pd.concat([df, test_df])\n", 215 | " \n", 216 | " return df\n", 217 | " " 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 227, 223 | "id": "2a4926cd", 224 | "metadata": { 225 | "scrolled": true 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/KQL_Intune-main\\README.md\n", 233 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/KQL_Intune-main\\Azure Workbook\\readme.md\n", 234 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/KQL_Intune-main\\Query Pack\\readme.md\n", 235 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\README.md\n", 236 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Azure AD Abuse Detection\\README.md\n", 237 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Functions\\README.md\n", 238 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Query Pack\\README.md\n", 239 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Sentinel vs Advanced Hunting\\README.md\n", 240 | "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Workbooks\\README.md\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "md_queries = parse_markdown()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 228, 251 | "id": "fb209b79", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "100" 258 | ] 259 | }, 260 | "execution_count": 228, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "len(md_queries.index)" 267 | ] 268 
| }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 229, 272 | "id": "53c8d231", 273 | "metadata": { 274 | "scrolled": false 275 | }, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/html": [ 280 | "
\n", 281 | "\n", 294 | "\n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
titlekql_query
0n/aSigninLogs\\n| where TimeGenerated > ago(14d)\\n...
1n/aSigninLogs\\n```
2n/aSigninLogs\\n| where TimeGenerated > ago(14d)\\n```
3n/aSigninLogs\\n| where TimeGenerated > ago(14d)\\n...
4n/aSigninLogs\\n| where TimeGenerated > ago(14d)\\n...
.........
3n/aMicrosoft Sentinel will then run through your ...
4n/a| where AppDisplayName == \"Microsoft Teams\"\\n`...
5n/a| where TimeGenerated > ago(14d)\\n| where User...
6n/aThat is how you build queries, now the basics....
7n/a\\n```kql\\nSigninLogs
\n", 360 | "

100 rows × 2 columns

\n", 361 | "
" 362 | ], 363 | "text/plain": [ 364 | " title kql_query\n", 365 | "0 n/a SigninLogs\\n| where TimeGenerated > ago(14d)\\n...\n", 366 | "1 n/a SigninLogs\\n```\n", 367 | "2 n/a SigninLogs\\n| where TimeGenerated > ago(14d)\\n```\n", 368 | "3 n/a SigninLogs\\n| where TimeGenerated > ago(14d)\\n...\n", 369 | "4 n/a SigninLogs\\n| where TimeGenerated > ago(14d)\\n...\n", 370 | ".. ... ...\n", 371 | "3 n/a Microsoft Sentinel will then run through your ...\n", 372 | "4 n/a | where AppDisplayName == \"Microsoft Teams\"\\n`...\n", 373 | "5 n/a | where TimeGenerated > ago(14d)\\n| where User...\n", 374 | "6 n/a That is how you build queries, now the basics....\n", 375 | "7 n/a \\n```kql\\nSigninLogs\n", 376 | "\n", 377 | "[100 rows x 2 columns]" 378 | ] 379 | }, 380 | "metadata": {}, 381 | "output_type": "display_data" 382 | } 383 | ], 384 | "source": [ 385 | "display(md_queries)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 230, 391 | "id": "bc55367d", 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/html": [ 397 | "
\n", 398 | "\n", 411 | "\n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | "
titlekql_query
0### Detection Query (User as actor)\\nWe want to use KQL to create accurate and ef...
1### Detection Query (User as actor)So first we have chosen our SigninLogs table.\\...
2### Detection Query (User as actor)Then we look for only logs where the ResultTyp...
3### Detection Query (Service principal as actor)SigninLogs\\n| where TimeGenerated > ago(14d)\\n...
4### Detection Query (User as actor)```\\n\\nIs much more efficient than searching f...
5### Detection Query (Service principal as actor)SigninLogs\\n| where TimeGenerated > ago(14d)\\n...
6### Detection Query (User as actor)SigninLogs\\n| where TimeGenerated between (ago...
7### Detection Query (Service principal as actor)SigninLogs\\n| where TimeGenerated between (ago...
8### Detection Query (User as actor)\\nInstead of equals, we can also use contains....
9### Detection Query (Service principal as actor)```kql\\nSigninLogs\\n| where TimeGenerated > ag...
10### Detection Query (User as actor)If you are searching for multiple words you ca...
11### Detection Query (Service principal as actor)| where AppDisplayName has_all (\"Teams\",\"Outlo...
12### Detection Query (User as actor, user as ta...This query would find all SigninLogs where the...
13### Detection Query (User as actor, service pr...\\nThis query would find SigninLogs where the a...
14### Detection Query (Service principal as acto...This query searches for SigninLogs data from t...
15### Detection Query (Service Principal as acto...\\nThis returns the same data, but changes the ...
16### Detection Query (User as actor, user as ta...```\\n\\nThis query will look up the SigninLogs ...
17### Detection Query (User as actor, Service Pr...\\nInstead of a total count, you can summarize ...
18### Detection Query (Service Principal as acto...| where TimeGenerated > ago(14d)\\n| where User...
19### Detection Query (Service principal as acto...\\nThis is the same but returns the oldest reco...
20### Detection Query (User as actor, user as ta...```\\n\\nThis returns the same data as our first...
21### Detection Query (User as actor, service pr...This is a combination of our countif and bin f...
22### Detection Query (Service principal as acto...SigninLogs\\n| where TimeGenerated > ago(14d)\\n...
23### Detection Query (Service principal as acto...If we use our same example from our Signinlogs...
24### Detection Query (User as actor, user as ta...\\nOr a barchart.\\n\\n```kql\\nSigninLogs\\n| wher...
25### Detection Query (User as actor, service pr...\\n```kql\\nSigninLogs\\n| where TimeGenerated > ...
26### Detection Query (Service principal as acto...```kql\\nSigninLogs\\n| where TimeGenerated > ag...
27### Detection Query (Service Principal as acto...| where TimeGenerated > ago(14d)\\n| where User...
28### Detection Query (User as actor, user as ta...| where TimeGenerated > ago(14d)\\n| where User...
29### Detection Query (User as actor, Service Pr...SigninLogs\\n| where TimeGenerated > ago(14d)\\n...
30### Detection Query (Service Principal as acto...| where UserPrincipalName == \"reprise_99@testd...
31### Detection Query (Service Principal as acto...```\\n\\nThis query searches all signins to your...
\n", 582 | "
" 583 | ], 584 | "text/plain": [ 585 | " title \\\n", 586 | "0 ### Detection Query (User as actor) \n", 587 | "1 ### Detection Query (User as actor) \n", 588 | "2 ### Detection Query (User as actor) \n", 589 | "3 ### Detection Query (Service principal as actor) \n", 590 | "4 ### Detection Query (User as actor) \n", 591 | "5 ### Detection Query (Service principal as actor) \n", 592 | "6 ### Detection Query (User as actor) \n", 593 | "7 ### Detection Query (Service principal as actor) \n", 594 | "8 ### Detection Query (User as actor) \n", 595 | "9 ### Detection Query (Service principal as actor) \n", 596 | "10 ### Detection Query (User as actor) \n", 597 | "11 ### Detection Query (Service principal as actor) \n", 598 | "12 ### Detection Query (User as actor, user as ta... \n", 599 | "13 ### Detection Query (User as actor, service pr... \n", 600 | "14 ### Detection Query (Service principal as acto... \n", 601 | "15 ### Detection Query (Service Principal as acto... \n", 602 | "16 ### Detection Query (User as actor, user as ta... \n", 603 | "17 ### Detection Query (User as actor, Service Pr... \n", 604 | "18 ### Detection Query (Service Principal as acto... \n", 605 | "19 ### Detection Query (Service principal as acto... \n", 606 | "20 ### Detection Query (User as actor, user as ta... \n", 607 | "21 ### Detection Query (User as actor, service pr... \n", 608 | "22 ### Detection Query (Service principal as acto... \n", 609 | "23 ### Detection Query (Service principal as acto... \n", 610 | "24 ### Detection Query (User as actor, user as ta... \n", 611 | "25 ### Detection Query (User as actor, service pr... \n", 612 | "26 ### Detection Query (Service principal as acto... \n", 613 | "27 ### Detection Query (Service Principal as acto... \n", 614 | "28 ### Detection Query (User as actor, user as ta... \n", 615 | "29 ### Detection Query (User as actor, Service Pr... \n", 616 | "30 ### Detection Query (Service Principal as acto... \n", 617 | "31 ### Detection Query (Service Principal as acto... \n", 618 | "\n", 619 | " kql_query \n", 620 | "0 \\nWe want to use KQL to create accurate and ef... \n", 621 | "1 So first we have chosen our SigninLogs table.\\... \n", 622 | "2 Then we look for only logs where the ResultTyp... \n", 623 | "3 SigninLogs\\n| where TimeGenerated > ago(14d)\\n... \n", 624 | "4 ```\\n\\nIs much more efficient than searching f... \n", 625 | "5 SigninLogs\\n| where TimeGenerated > ago(14d)\\n... \n", 626 | "6 SigninLogs\\n| where TimeGenerated between (ago... \n", 627 | "7 SigninLogs\\n| where TimeGenerated between (ago... \n", 628 | "8 \\nInstead of equals, we can also use contains.... \n", 629 | "9 ```kql\\nSigninLogs\\n| where TimeGenerated > ag... \n", 630 | "10 If you are searching for multiple words you ca... \n", 631 | "11 | where AppDisplayName has_all (\"Teams\",\"Outlo... \n", 632 | "12 This query would find all SigninLogs where the... \n", 633 | "13 \\nThis query would find SigninLogs where the a... \n", 634 | "14 This query searches for SigninLogs data from t... \n", 635 | "15 \\nThis returns the same data, but changes the ... \n", 636 | "16 ```\\n\\nThis query will look up the SigninLogs ... \n", 637 | "17 \\nInstead of a total count, you can summarize ... \n", 638 | "18 | where TimeGenerated > ago(14d)\\n| where User... \n", 639 | "19 \\nThis is the same but returns the oldest reco... \n", 640 | "20 ```\\n\\nThis returns the same data as our first... \n", 641 | "21 This is a combination of our countif and bin f... 
\n", 642 | "22 SigninLogs\\n| where TimeGenerated > ago(14d)\\n... \n", 643 | "23 If we use our same example from our Signinlogs... \n", 644 | "24 \\nOr a barchart.\\n\\n```kql\\nSigninLogs\\n| wher... \n", 645 | "25 \\n```kql\\nSigninLogs\\n| where TimeGenerated > ... \n", 646 | "26 ```kql\\nSigninLogs\\n| where TimeGenerated > ag... \n", 647 | "27 | where TimeGenerated > ago(14d)\\n| where User... \n", 648 | "28 | where TimeGenerated > ago(14d)\\n| where User... \n", 649 | "29 SigninLogs\\n| where TimeGenerated > ago(14d)\\n... \n", 650 | "30 | where UserPrincipalName == \"reprise_99@testd... \n", 651 | "31 ```\\n\\nThis query searches all signins to your... " 652 | ] 653 | }, 654 | "execution_count": 230, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "md_queries[md_queries['title'] != 'n/a']" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "id": "59cc5e3e-5b37-4759-a962-902f7049b861", 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "tmp_path = str(Path.cwd())\n", 671 | "csv_files = glob.glob(os.path.join(path, \"*.csv\"))" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": null, 677 | "id": "22d2c705", 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [] 681 | } 682 | ], 683 | "metadata": { 684 | "kernelspec": { 685 | "display_name": "Python 3 (ipykernel)", 686 | "language": "python", 687 | "name": "python3" 688 | }, 689 | "language_info": { 690 | "codemirror_mode": { 691 | "name": "ipython", 692 | "version": 3 693 | }, 694 | "file_extension": ".py", 695 | "mimetype": "text/x-python", 696 | "name": "python", 697 | "nbconvert_exporter": "python", 698 | "pygments_lexer": "ipython3", 699 | "version": "3.8.13" 700 | } 701 | }, 702 | "nbformat": 4, 703 | "nbformat_minor": 5 704 | } 705 | -------------------------------------------------------------------------------- /dev-notebooks/Kqlquery-schema.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/dev-notebooks/Kqlquery-schema.png -------------------------------------------------------------------------------- /dev-notebooks/az-monitor-schemas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Reading schemas for 11 tables...\n" 13 | ] 14 | }, 15 | { 16 | "name": "stderr", 17 | "output_type": "stream", 18 | "text": [ 19 | " 9%|▉ | 1/11 [00:00<00:02, 3.41it/s]" 20 | ] 21 | }, 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "SecurityAlert Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 27 | ] 28 | }, 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | " 18%|█▊ | 2/11 [00:00<00:03, 2.49it/s]" 34 | ] 35 | }, 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "SecurityBaseline Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 41 | ] 42 | }, 43 | { 44 | "name": "stderr", 45 | "output_type": "stream", 46 | "text": [ 47 | " 27%|██▋ | 3/11 [00:01<00:03, 2.63it/s]" 48 | ] 49 | }, 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "SecurityBaselineSummary Index(['Column', 
'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 55 | ] 56 | }, 57 | { 58 | "name": "stderr", 59 | "output_type": "stream", 60 | "text": [ 61 | " 36%|███▋ | 4/11 [00:01<00:02, 2.79it/s]" 62 | ] 63 | }, 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "SecurityDetection Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 69 | ] 70 | }, 71 | { 72 | "name": "stderr", 73 | "output_type": "stream", 74 | "text": [ 75 | " 45%|████▌ | 5/11 [00:01<00:01, 3.21it/s]" 76 | ] 77 | }, 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "SecurityEvent Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 83 | ] 84 | }, 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | " 64%|██████▎ | 7/11 [00:02<00:01, 3.85it/s]" 90 | ] 91 | }, 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "SecurityIoTRawEvent Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n", 97 | "SecurityRecommendation Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 98 | ] 99 | }, 100 | { 101 | "name": "stderr", 102 | "output_type": "stream", 103 | "text": [ 104 | " 73%|███████▎ | 8/11 [00:02<00:00, 3.95it/s]" 105 | ] 106 | }, 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "SentinelAudit Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 112 | ] 113 | }, 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | " 82%|████████▏ | 9/11 [00:02<00:00, 3.85it/s]" 119 | ] 120 | }, 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "SentinelHealth Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 126 | ] 127 | }, 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | " 91%|█████████ | 10/11 [00:02<00:00, 3.77it/s]" 133 | ] 134 | }, 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "SigninLogs Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 140 | ] 141 | }, 142 | { 143 | "name": "stderr", 144 | "output_type": "stream", 145 | "text": [ 146 | "100%|██████████| 11/11 [00:03<00:00, 3.47it/s]" 147 | ] 148 | }, 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Syslog Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n" 154 | ] 155 | }, 156 | { 157 | "name": "stderr", 158 | "output_type": "stream", 159 | "text": [ 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "from typing import Dict\n", 166 | "import pandas as pd\n", 167 | "import requests\n", 168 | "\n", 169 | "import bs4\n", 170 | "from tqdm.auto import tqdm\n", 171 | "\n", 172 | "SCHEMA_CATS_URL = \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category\"\n", 173 | "\n", 174 | "def fetch_az_mon_categories() -> requests.models.Response:\n", 175 | " \"\"\"Return the AzMonitor reference page.\"\"\"\n", 176 | " return requests.get(SCHEMA_CATS_URL)\n", 177 | "\n", 178 | "\n", 179 | "def get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag:\n", 180 | " \"\"\"Extract the list after the security header.\"\"\"\n", 181 | " soup = bs4.BeautifulSoup(resp.text, \"html.parser\")\n", 182 | "\n", 183 | " result = soup.find(\"div\", class_=\"content\")\n", 184 | " sec_header =result.find(\"h2\", id=\"security\")\n", 185 | " return 
sec_header.find_next_sibling()\n", 186 | "\n", 187 | "\n", 188 | "def build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]:\n", 189 | " \"\"\"From the html list, build an index of URLs.\"\"\"\n", 190 | " table_prefix = \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}\"\n", 191 | " return {\n", 192 | " item.a.contents[0]: {\n", 193 | " \"href\": item.a.attrs.get(\"href\"),\n", 194 | " \"url\": table_prefix.format(**(item.a.attrs)),\n", 195 | " }\n", 196 | " for item in security_cat_list.find_all(\"li\")\n", 197 | " }\n", 198 | "\n", 199 | "\n", 200 | "def read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame:\n", 201 | " \"\"\"Read table schema from a URL.\"\"\"\n", 202 | " table_data = pd.read_html(ref[\"url\"])[0]\n", 203 | " table_data[\"Table\"] = table\n", 204 | " table_data[\"Url\"] = ref[\"url\"]\n", 205 | " print(table, table_data.columns)\n", 206 | " return table_data\n", 207 | "\n", 208 | "\n", 209 | "def fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame:\n", 210 | " \"\"\"Combine schema tables into single DF.\"\"\"\n", 211 | " print(f\"Reading schemas for {len(sec_url_dict)} tables...\")\n", 212 | " all_tables = [\n", 213 | " read_table_from_url(table, ref)\n", 214 | " for table, ref in tqdm(sec_url_dict.items())\n", 215 | " ]\n", 216 | " return pd.concat(all_tables, ignore_index=True)\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "sec_cat_list = get_security_category_list(fetch_az_mon_categories())\n", 221 | "sec_url_dict = build_table_index(sec_cat_list)\n", 222 | "sec_url_dict = {key: val for key, val in sec_url_dict.items() if key.startswith(\"S\")}\n", 223 | "comb_tables = fetch_table_schemas(sec_url_dict)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 3, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "ename": "NameError", 233 | "evalue": "name 'comb_tables' is not defined", 234 | "output_type": "error", 235 | "traceback": [ 236 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 237 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 238 | "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_640980\\1382993768.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcomb_tables\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 239 | "\u001b[1;31mNameError\u001b[0m: name 'comb_tables' is not defined" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "comb_tables.head()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 41, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "'{\"SecurityAlert\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityalert\", \"schema\": {\"Column\": \"AlertLink\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityBaseline\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitybaseline\", \"schema\": {\"Column\": \"ActualResult\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityBaselineSummary\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitybaselinesummary\", \"schema\": {\"Column\": \"AssessmentId\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityDetection\": {\"url\": 
\"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitydetection\", \"schema\": {\"Column\": \"AccountsSeen\", \"Type\": \"int\", \"Description\": NaN}}, \"SecurityEvent\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityevent\", \"schema\": {\"Column\": \"AccessMask\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityIoTRawEvent\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityiotrawevent\", \"schema\": {\"Column\": \"AgentVersion\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityRecommendation\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityrecommendation\", \"schema\": {\"Column\": \"AssessedResourceId\", \"Type\": \"string\", \"Description\": NaN}}, \"SentinelAudit\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/sentinelaudit\", \"schema\": {\"Column\": \"CorrelationId\", \"Type\": \"string\", \"Description\": \"A unique record identifier.\"}}, \"SentinelHealth\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/sentinelhealth\", \"schema\": {\"Column\": \"Description\", \"Type\": \"string\", \"Description\": \"The operation description.\"}}, \"SigninLogs\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/signinlogs\", \"schema\": {\"Column\": \"AADTenantId\", \"Type\": \"string\", \"Description\": NaN}}, \"Syslog\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/syslog\", \"schema\": {\"Column\": \"Computer\", \"Type\": \"string\", \"Description\": \"Computer that the event was collected from.\"}}}'" 256 | ] 257 | }, 258 | "metadata": {}, 259 | "output_type": "display_data" 260 | }, 261 | { 262 | "data": { 263 | "text/html": [ 264 | "
\n", 265 | "\n", 278 | "\n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | "
urlschema
SecurityAlerthttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AlertLink', 'Type': 'string', 'Des...
SecurityBaselinehttps://learn.microsoft.com/azure/azure-monito...{'Column': 'ActualResult', 'Type': 'string', '...
SecurityBaselineSummaryhttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AssessmentId', 'Type': 'string', '...
SecurityDetectionhttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AccountsSeen', 'Type': 'int', 'Des...
SecurityEventhttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AccessMask', 'Type': 'string', 'De...
SecurityIoTRawEventhttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AgentVersion', 'Type': 'string', '...
SecurityRecommendationhttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AssessedResourceId', 'Type': 'stri...
SentinelAudithttps://learn.microsoft.com/azure/azure-monito...{'Column': 'CorrelationId', 'Type': 'string', ...
SentinelHealthhttps://learn.microsoft.com/azure/azure-monito...{'Column': 'Description', 'Type': 'string', 'D...
SigninLogshttps://learn.microsoft.com/azure/azure-monito...{'Column': 'AADTenantId', 'Type': 'string', 'D...
Sysloghttps://learn.microsoft.com/azure/azure-monito...{'Column': 'Computer', 'Type': 'string', 'Desc...
\n", 344 | "
" 345 | ], 346 | "text/plain": [ 347 | " url \\\n", 348 | "SecurityAlert https://learn.microsoft.com/azure/azure-monito... \n", 349 | "SecurityBaseline https://learn.microsoft.com/azure/azure-monito... \n", 350 | "SecurityBaselineSummary https://learn.microsoft.com/azure/azure-monito... \n", 351 | "SecurityDetection https://learn.microsoft.com/azure/azure-monito... \n", 352 | "SecurityEvent https://learn.microsoft.com/azure/azure-monito... \n", 353 | "SecurityIoTRawEvent https://learn.microsoft.com/azure/azure-monito... \n", 354 | "SecurityRecommendation https://learn.microsoft.com/azure/azure-monito... \n", 355 | "SentinelAudit https://learn.microsoft.com/azure/azure-monito... \n", 356 | "SentinelHealth https://learn.microsoft.com/azure/azure-monito... \n", 357 | "SigninLogs https://learn.microsoft.com/azure/azure-monito... \n", 358 | "Syslog https://learn.microsoft.com/azure/azure-monito... \n", 359 | "\n", 360 | " schema \n", 361 | "SecurityAlert {'Column': 'AlertLink', 'Type': 'string', 'Des... \n", 362 | "SecurityBaseline {'Column': 'ActualResult', 'Type': 'string', '... \n", 363 | "SecurityBaselineSummary {'Column': 'AssessmentId', 'Type': 'string', '... \n", 364 | "SecurityDetection {'Column': 'AccountsSeen', 'Type': 'int', 'Des... \n", 365 | "SecurityEvent {'Column': 'AccessMask', 'Type': 'string', 'De... \n", 366 | "SecurityIoTRawEvent {'Column': 'AgentVersion', 'Type': 'string', '... \n", 367 | "SecurityRecommendation {'Column': 'AssessedResourceId', 'Type': 'stri... \n", 368 | "SentinelAudit {'Column': 'CorrelationId', 'Type': 'string', ... \n", 369 | "SentinelHealth {'Column': 'Description', 'Type': 'string', 'D... \n", 370 | "SigninLogs {'Column': 'AADTenantId', 'Type': 'string', 'D... \n", 371 | "Syslog {'Column': 'Computer', 'Type': 'string', 'Desc... " 372 | ] 373 | }, 374 | "metadata": {}, 375 | "output_type": "display_data" 376 | }, 377 | { 378 | "data": { 379 | "text/html": [ 380 | "
\n", 381 | "\n", 394 | "\n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | "
SecurityAlert.urlSecurityAlert.schema.ColumnSecurityAlert.schema.TypeSecurityAlert.schema.DescriptionSecurityBaseline.urlSecurityBaseline.schema.ColumnSecurityBaseline.schema.TypeSecurityBaseline.schema.DescriptionSecurityBaselineSummary.urlSecurityBaselineSummary.schema.Column...SentinelHealth.schema.TypeSentinelHealth.schema.DescriptionSigninLogs.urlSigninLogs.schema.ColumnSigninLogs.schema.TypeSigninLogs.schema.DescriptionSyslog.urlSyslog.schema.ColumnSyslog.schema.TypeSyslog.schema.Description
0https://learn.microsoft.com/azure/azure-monito...AlertLinkstringNaNhttps://learn.microsoft.com/azure/azure-monito...ActualResultstringNaNhttps://learn.microsoft.com/azure/azure-monito...AssessmentId...stringThe operation description.https://learn.microsoft.com/azure/azure-monito...AADTenantIdstringNaNhttps://learn.microsoft.com/azure/azure-monito...ComputerstringComputer that the event was collected from.
\n", 448 | "

1 rows × 44 columns

\n", 449 | "
" 450 | ], 451 | "text/plain": [ 452 | " SecurityAlert.url \\\n", 453 | "0 https://learn.microsoft.com/azure/azure-monito... \n", 454 | "\n", 455 | " SecurityAlert.schema.Column SecurityAlert.schema.Type \\\n", 456 | "0 AlertLink string \n", 457 | "\n", 458 | " SecurityAlert.schema.Description \\\n", 459 | "0 NaN \n", 460 | "\n", 461 | " SecurityBaseline.url \\\n", 462 | "0 https://learn.microsoft.com/azure/azure-monito... \n", 463 | "\n", 464 | " SecurityBaseline.schema.Column SecurityBaseline.schema.Type \\\n", 465 | "0 ActualResult string \n", 466 | "\n", 467 | " SecurityBaseline.schema.Description \\\n", 468 | "0 NaN \n", 469 | "\n", 470 | " SecurityBaselineSummary.url \\\n", 471 | "0 https://learn.microsoft.com/azure/azure-monito... \n", 472 | "\n", 473 | " SecurityBaselineSummary.schema.Column ... SentinelHealth.schema.Type \\\n", 474 | "0 AssessmentId ... string \n", 475 | "\n", 476 | " SentinelHealth.schema.Description \\\n", 477 | "0 The operation description. \n", 478 | "\n", 479 | " SigninLogs.url SigninLogs.schema.Column \\\n", 480 | "0 https://learn.microsoft.com/azure/azure-monito... AADTenantId \n", 481 | "\n", 482 | " SigninLogs.schema.Type SigninLogs.schema.Description \\\n", 483 | "0 string NaN \n", 484 | "\n", 485 | " Syslog.url Syslog.schema.Column \\\n", 486 | "0 https://learn.microsoft.com/azure/azure-monito... Computer \n", 487 | "\n", 488 | " Syslog.schema.Type Syslog.schema.Description \n", 489 | "0 string Computer that the event was collected from. \n", 490 | "\n", 491 | "[1 rows x 44 columns]" 492 | ] 493 | }, 494 | "metadata": {}, 495 | "output_type": "display_data" 496 | } 497 | ], 498 | "source": [ 499 | "t_dict = {}\n", 500 | "for table, df in comb_tables.groupby(\"Table\"):\n", 501 | " url = df.iloc[0][\"Url\"]\n", 502 | " t_dict[table] = {\n", 503 | " \"url\": url,\n", 504 | " \"schema\": df.drop(columns=[\"Table\", \"Url\"]).to_dict(orient=\"records\")[0]\n", 505 | " }\n", 506 | "\n", 507 | "t_dict\n", 508 | "import json\n", 509 | "display(json.dumps(t_dict))\n", 510 | "display(pd.read_json(json.dumps(t_dict), orient=\"index\"))\n", 511 | "display(pd.json_normalize(t_dict))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 2, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "ename": "NameError", 521 | "evalue": "name 'comb_tables' is not defined", 522 | "output_type": "error", 523 | "traceback": [ 524 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 525 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 526 | "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_640980\\4275042129.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcomb_tables\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Table\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 527 | "\u001b[1;31mNameError\u001b[0m: name 'comb_tables' is not defined" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "comb_tables[\"Table\"].unique()" 533 | ] 534 | } 535 | ], 536 | "metadata": { 537 | "kernelspec": { 538 | "display_name": "Python 3.9.7 ('msticpy')", 539 | "language": "python", 540 | "name": "python3" 541 | }, 542 | "language_info": { 543 | "codemirror_mode": { 544 | "name": "ipython", 545 | "version": 3 546 | }, 547 | "file_extension": ".py", 548 | "mimetype": "text/x-python", 549 | "name": "python", 550 | 
"nbconvert_exporter": "python", 551 | "pygments_lexer": "ipython3", 552 | "version": "3.9.7" 553 | }, 554 | "orig_nbformat": 4, 555 | "vscode": { 556 | "interpreter": { 557 | "hash": "0f1a8e166ce5c1ec1911a36e4fdbd34b2f623e2a3442791008b8ac429a1d6070" 558 | } 559 | } 560 | }, 561 | "nbformat": 4, 562 | "nbformat_minor": 2 563 | } 564 | -------------------------------------------------------------------------------- /dev-notebooks/db_schema.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text 3 | from sqlalchemy.orm import relationship 4 | from sqlalchemy.ext.declarative import declarative_base 5 | 6 | Base = declarative_base() 7 | metadata = Base.metadata 8 | 9 | 10 | class FieldEntity(Base): 11 | __tablename__ = 'FieldEntity' 12 | 13 | field = Column(String(100), primary_key=True, nullable=False, unique=True) 14 | entity = Column(String(100), primary_key=True, nullable=False) 15 | 16 | querys = relationship('KqlQuery', secondary='QueryField') 17 | 18 | 19 | class KqlQuery(Base): 20 | __tablename__ = 'KqlQuery' 21 | 22 | source_path = Column(String(1000), nullable=False) 23 | query = Column(Text(10000)) 24 | name = Column(String(100)) 25 | query_id = Column(Integer, primary_key=True) 26 | local_path = Column(String(1000), nullable=False) 27 | 28 | 29 | class QueryAttribute(Base): 30 | __tablename__ = 'QueryAttribute' 31 | 32 | query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False) 33 | attribute_name = Column(String(100), primary_key=True, nullable=False) 34 | attribute_value = Column(String(1000)) 35 | 36 | query = relationship('KqlQuery') 37 | 38 | 39 | # t_QueryField = Table( 40 | # 'QueryField', metadata, 41 | # Column('query_id', ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False), 42 | # Column('field', ForeignKey('FieldEntity.field'), primary_key=True, nullable=False, unique=True) 43 | # ) 44 | class QueryField(Base): 45 | __tablename__ = "QueryField" 46 | 47 | query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False) 48 | field = Column(ForeignKey('FieldEntity.field'), primary_key=True, nullable=False, unique=True) 49 | 50 | query = relationship('KqlQuery') 51 | entity = relationship("FieldEntity") 52 | 53 | 54 | class QueryFunction(Base): 55 | __tablename__ = 'QueryFunction' 56 | 57 | query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False) 58 | function = Column(String(100), primary_key=True, nullable=False) 59 | 60 | query = relationship('KqlQuery') 61 | 62 | 63 | class QueryOperator(Base): 64 | __tablename__ = 'QueryOperator' 65 | 66 | query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False, unique=True) 67 | operator = Column(String(100), primary_key=True, nullable=False) 68 | 69 | query = relationship('KqlQuery', uselist=False) 70 | 71 | 72 | class QueryTable(Base): 73 | __tablename__ = 'QueryTable' 74 | 75 | table_name = Column(String(100), primary_key=True, nullable=False, unique=True) 76 | query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False) 77 | 78 | query = relationship('KqlQuery') 79 | 80 | 81 | class OperatorFieldReference(Base): 82 | __tablename__ = 'OperatorFieldReference' 83 | 84 | query_id = Column(ForeignKey('QueryOperator.query_id'), primary_key=True, nullable=False) 85 | field = Column(ForeignKey('QueryField.field'), primary_key=True, nullable=False) 86 | operator = 
Column(String(100), primary_key=True, nullable=False) 87 | 88 | QueryField = relationship('QueryField') 89 | query = relationship('QueryOperator') 90 | 91 | 92 | class OperatorTableReference(Base): 93 | __tablename__ = 'OperatorTableReference' 94 | 95 | query_id = Column(ForeignKey('QueryOperator.query_id'), primary_key=True, nullable=False) 96 | operator = Column(String(100), primary_key=True, nullable=False) 97 | table_name = Column(ForeignKey('QueryTable.table_name'), primary_key=True, nullable=False) 98 | 99 | query = relationship('QueryOperator') 100 | QueryTable = relationship('QueryTable') 101 | -------------------------------------------------------------------------------- /dev-notebooks/kqlquery.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/dev-notebooks/kqlquery.db -------------------------------------------------------------------------------- /dev-notebooks/repos.yaml: -------------------------------------------------------------------------------- 1 | - Github: 2 | branch: main 3 | repo: reprise99/Sentinel-Queries 4 | - Github: 5 | branch: main 6 | repo: ugurkocde/KQL_Intune -------------------------------------------------------------------------------- /images/DataFlowDiagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/images/DataFlowDiagram.png -------------------------------------------------------------------------------- /kqlextraction/KqlExtraction/KqlExtraction.cs: -------------------------------------------------------------------------------- 1 | using Kusto.Language; 2 | using Kusto.Language.Symbols; 3 | using Kusto.Language.Syntax; 4 | using Kusto.Language.Utils; 5 | using System.Runtime.CompilerServices; 6 | using System.Text; 7 | using System.Text.Json; 8 | 9 | namespace Microsoft.Mstic.KqlQuery.Extraction 10 | { 11 | public class KqlExtractionResult 12 | { 13 | public string Id { get; set; } = ""; 14 | public HashSet FunctionCalls { get; set; } = new HashSet(); 15 | public Dictionary> Joins { get; set; } = new Dictionary>(); 16 | public HashSet Operators { get; set; } = new HashSet(); 17 | public HashSet Tables { get; set; } = new HashSet(); 18 | } 19 | 20 | public class KqlExtraction 21 | { 22 | public static void Main(string[] args) 23 | { 24 | string? l = null; 25 | while ((l = Console.ReadLine()) != null) 26 | { 27 | var kqlQuery = l.Split(',', 2); 28 | 29 | var kqlExtractionResult = new KqlExtractionResult(); 30 | if (kqlQuery.Length == 2) 31 | { 32 | try 33 | { 34 | kqlExtractionResult.Id = kqlQuery[0]; 35 | if (RunExtraction(kqlExtractionResult, Encoding.UTF8.GetString(Convert.FromBase64String(kqlQuery[1]))) == 0) 36 | { 37 | Console.WriteLine(JsonSerializer.Serialize(kqlExtractionResult)); 38 | } 39 | } 40 | catch (Exception e) 41 | { 42 | Console.WriteLine("[!] Error: Caught Exception \"{0}\"", e.Message); 43 | } 44 | } 45 | } 46 | } 47 | 48 | private static int RunExtraction(KqlExtractionResult kqlExtractionResult, string kql) 49 | { 50 | try 51 | { 52 | var kustoGlobals = GlobalState.Default.WithClusterList(Array.Empty()); 53 | var kqlQuery = KustoCode.ParseAndAnalyze(kql, globals: kustoGlobals); 54 | 55 | var syntaxDiagnostics = kqlQuery.GetSyntaxDiagnostics(); 56 | if (syntaxDiagnostics.Count > 0) 57 | { 58 | Console.WriteLine("[!] 
Error: Syntax Error(s)"); 59 | foreach (var diagnostic in kqlQuery.GetSyntaxDiagnostics()) 60 | { 61 | Console.WriteLine(" > [{0}:{1}] {2}", diagnostic.Start, diagnostic.End, diagnostic.Message); 62 | } 63 | return 1; 64 | } 65 | 66 | SyntaxElement.WalkNodes(kqlQuery.Syntax, 67 | n => 68 | { 69 | string? joinKind = null; 70 | HashSet? joinTarget = null; 71 | 72 | if (n is FunctionCallExpression fc) 73 | { 74 | kqlExtractionResult.FunctionCalls.Add(fc.Name.SimpleName); 75 | } 76 | else if (n is NameReference nr) 77 | { 78 | if (nr.RawResultType.Kind == SymbolKind.Table) 79 | { 80 | kqlExtractionResult.Tables.Add(nr.Name.SimpleName); 81 | } 82 | } 83 | else if (n.NameInParent == "Operator") 84 | { 85 | if (n is JoinOperator jo) 86 | { 87 | joinKind = "inner"; 88 | joinTarget = new HashSet(); 89 | 90 | var kindParameter = jo.Parameters.Where(p => p.Name.SimpleName == "kind"); 91 | if (kindParameter.Count() == 1) 92 | { 93 | joinKind = kindParameter.First().Expression.ToString(); 94 | } 95 | 96 | if (jo.Expression is NameReference jonr) 97 | { 98 | joinTarget.Add(jonr.SimpleName); 99 | } 100 | else if (jo.Expression is ParenthesizedExpression jopr) 101 | { 102 | if (jopr.Expression is NameReference joprnr) 103 | { 104 | joinTarget.Add(joprnr.SimpleName); 105 | } 106 | } 107 | 108 | if (joinTarget.Count() == 0) 109 | { 110 | joinTarget.Add("(...)"); 111 | } 112 | } 113 | else if (n is LookupOperator lo) 114 | { 115 | joinKind = "leftouter"; 116 | joinTarget = new HashSet(); 117 | 118 | if (lo.Expression is NameReference lonr) 119 | { 120 | joinTarget.Add(lonr.SimpleName); 121 | } 122 | else if (lo.Expression is ParenthesizedExpression lopr) 123 | { 124 | if (lopr.Expression is NameReference loprnr) 125 | { 126 | joinTarget.Add(loprnr.SimpleName); 127 | } 128 | } 129 | 130 | if (joinTarget.Count() == 0) 131 | { 132 | joinTarget.Add("(...)"); 133 | } 134 | } 135 | else 136 | { 137 | kqlExtractionResult.Operators.Add(n.GetFirstToken().Text); 138 | } 139 | } 140 | else if (n is UnionOperator uo) 141 | { 142 | joinKind = "union"; 143 | joinTarget = new HashSet(); 144 | 145 | foreach(var t in uo.Expressions) 146 | { 147 | if (t.Element is NameReference uonr) 148 | { 149 | joinTarget.Add(uonr.SimpleName); 150 | } 151 | } 152 | } 153 | 154 | if ((joinKind != null) && (joinTarget != null)) 155 | { 156 | if (!kqlExtractionResult.Joins.ContainsKey(joinKind)) 157 | { 158 | kqlExtractionResult.Joins[joinKind] = new HashSet(); 159 | } 160 | kqlExtractionResult.Joins[joinKind].AddRange(joinTarget); 161 | } 162 | }); 163 | } 164 | catch (Exception ex) 165 | { 166 | Console.WriteLine("[!] 
Error: Exception '{0}'", ex.Message); 167 | return 2; 168 | } 169 | 170 | return 0; 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /kqlextraction/KqlExtraction/KqlExtraction.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net6.0 6 | enable 7 | enable 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /kqlextraction/KqlExtraction/KqlExtraction.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.3.32901.215 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KqlExtraction", "KqlExtraction.csproj", "{46EB7D0B-BD7E-42D3-B638-5A40B287DF26}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Release|Any CPU.Build.0 = Release|Any CPU 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | GlobalSection(ExtensibilityGlobals) = postSolution 23 | SolutionGuid = {CA95F2D7-E70E-4BD8-8E41-743E27B063FC} 24 | EndGlobalSection 25 | EndGlobal 26 | -------------------------------------------------------------------------------- /kqlextraction/Readme.txt: -------------------------------------------------------------------------------- 1 | Requires .NET 6.0 2 | 3 | > cd .\KqlExtraction\ 4 | > dotnet restore 5 | > dotnet build -c Release 6 | > .\KqlExtraction\bin\Release\net6.0\KqlExtraction.exe tests\test1.kql 7 | 8 | {"FunctionCalls":["count","tostring","make_list","toreal"],"Joins":["rightsemi","leftouter"],"Operators":["where","extend","summarize","mv-expand","project-away","project"],"Tables":["SigninLogs"]} -------------------------------------------------------------------------------- /kqlextraction/extract.py: -------------------------------------------------------------------------------- 1 | from base64 import b64encode 2 | import json 3 | import os 4 | import queue 5 | import subprocess 6 | import threading 7 | import time 8 | from uuid import uuid4 9 | 10 | 11 | worker_exit = threading.Event() 12 | worker_queue = queue.Queue() 13 | worker_results = queue.Queue() 14 | worker_thread = None 15 | 16 | 17 | def _worker_thread_proc(): 18 | try: 19 | kql_extraction = None 20 | 21 | while not worker_exit.is_set(): 22 | try: 23 | if kql_extraction is not None: 24 | if kql_extraction.poll() is not None: 25 | kql_extraction = None 26 | if kql_extraction is None: 27 | kql_extraction = subprocess.Popen([ 28 | 'dotnet', 29 | 'run', 30 | '-c', 31 | 'Release', 32 | '--project', 33 | os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'KqlExtraction', 'KqlExtraction.csproj') 34 | ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 35 | except Exception as ex: 36 | print('[!] 
Exception Starting KqlExtraction Process') 37 | break 38 | 39 | try: 40 | uuid, kql = worker_queue.get(timeout=2.0) 41 | kql_extraction.stdin.write(bytes(f'{uuid},', encoding='utf-8') + b64encode(bytes(kql, encoding='utf-8')) + b'\n') 42 | kql_extraction.stdin.flush() 43 | 44 | kql_extraction_result = kql_extraction.stdout.readline() 45 | worker_results.put(json.loads(kql_extraction_result)) 46 | except queue.Empty: 47 | pass 48 | except Exception as ex: 49 | kql_extraction.kill() 50 | 51 | if kql_extraction.poll() is None: 52 | kql_extraction.kill() 53 | except Exception as ex: 54 | print('[!] Unhandled Exception', str(ex)) 55 | 56 | 57 | def extract_kql(kql): 58 | kql_id = str(uuid4()) 59 | worker_queue.put((kql_id, kql)) 60 | 61 | try: 62 | kql_result = {} 63 | while True: 64 | kql_result = worker_results.get(timeout=5.0) 65 | if 'Id' in kql_result and kql_result['Id'] == kql_id: 66 | break 67 | except Exception: 68 | pass 69 | 70 | return kql_result 71 | 72 | 73 | if __name__ == '__main__': 74 | worker_thread = threading.Thread(target=_worker_thread_proc) 75 | worker_thread.start() 76 | 77 | try: 78 | base_path = os.path.abspath(os.path.split(__file__)[0]) 79 | for kql_file in os.listdir(os.path.join(base_path, 'tests')): 80 | kql_file = os.path.join(base_path, 'tests', kql_file) 81 | 82 | with open(kql_file, 'r') as f: 83 | kql = f.read() 84 | 85 | print(extract_kql(kql)) 86 | except Exception as ex: 87 | print('[!] Unhandled Exception', str(ex)) 88 | 89 | while not worker_queue.empty(): 90 | time.sleep(0.5) 91 | 92 | worker_exit.set() 93 | worker_thread.join() 94 | -------------------------------------------------------------------------------- /kqlextraction/tests/test1.kql: -------------------------------------------------------------------------------- 1 | // https://github.com/Azure/Azure-Sentinel/blob/master/Hunting%20Queries/SigninLogs/UserLoginIPAddressTeleportation.yaml 2 | 3 | let windowTime = 20min / 2; //Window to lookup anomalous logins within 4 | let excludeKnownVPN = dynamic(['127.0.0.1', '0.0.0.0']); //Known VPN IP addresses to exclude 5 | SigninLogs 6 | | where ConditionalAccessStatus =~ "success" 7 | | extend country = LocationDetails['countryOrRegion'] 8 | | where country != "" 9 | | summarize count() by tostring(country) 10 | | join ( 11 | //Get the total number of logins from any country and join it to the previous count in a single table 12 | SigninLogs 13 | | where ConditionalAccessStatus =~ "success" 14 | | extend country = LocationDetails['countryOrRegion'] 15 | | where country != "" 16 | | summarize count(), make_list(tostring(country)) 17 | | mv-expand list_country 18 | | extend country = tostring(list_country) 19 | ) on country 20 | | summarize by country, count_, count_1 21 | //Now calculate each countries prevalence within login events 22 | | extend prevalence = toreal(count_) / toreal(count_1) * 100 23 | | project-away count_1 24 | | where prevalence < 0.01 25 | | join kind=rightsemi( 26 | SigninLogs 27 | //Enable to limit to o365 exchange logins 28 | //| where AppDisplayName =~ "Office 365 Exchange Online" 29 | | where ConditionalAccessStatus =~ "success" 30 | | where IPAddress != "" 31 | | extend country = tostring(LocationDetails['countryOrRegion']) 32 | | summarize count() by TimeGenerated, UserPrincipalName, country, IPAddress 33 | ) on country 34 | | join kind=leftouter ( 35 | SigninLogs 36 | //Enable to limit to o365 exchange logins 37 | //| where AppDisplayName =~ "Office 365 Exchange Online" 38 | | where ConditionalAccessStatus =~ "success" 39 | 
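//Gather this user's other successful logins so the outer query can compare IP/country pairs within the +/- windowTime window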
| extend country = tostring(LocationDetails['countryOrRegion']) 40 | | summarize by TimeGenerated, IPAddress, UserPrincipalName, country 41 | ) on UserPrincipalName 42 | | where IPAddress != IPAddress1 and country != country1 43 | | extend WindowStart = TimeGenerated1 - windowTime 44 | | extend WindowEnd = TimeGenerated1 + windowTime 45 | | where TimeGenerated between (WindowStart .. WindowEnd) 46 | | project Account=UserPrincipalName, AnomalousIP=IPAddress, AnomalousLoginTime=TimeGenerated, AnomalousCountry=country, OtherLoginIP=IPAddress1, OtherLoginCountry=country1, OtherLoginWindowStart=WindowStart, OtherLoginWindowEnd=WindowEnd 47 | | where AnomalousIP !in(excludeKnownVPN) and OtherLoginIP !in(excludeKnownVPN) 48 | | extend timestamp = AnomalousLoginTime, AccountCustomEntity = Account, IPCustomEntity = AnomalousIP -------------------------------------------------------------------------------- /kqlextraction/tests/test2.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | where A == 1 3 | | summarize count() by B -------------------------------------------------------------------------------- /kqlextraction/tests/test3.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | mv-expand Z 3 | | join kind=leftsemi hint.remote=true Bar on T 4 | | join kind=leftsemi ( 5 | Baz 6 | | where X > 5 7 | | project R 8 | ) on R -------------------------------------------------------------------------------- /kqlextraction/tests/test4.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | lookup (Bar) on T -------------------------------------------------------------------------------- /kqlextraction/tests/test5.kql: -------------------------------------------------------------------------------- 1 | union Foo, Bar, Baz -------------------------------------------------------------------------------- /pages/2_🔎KQL_interactive_search.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import sys 4 | 5 | from pathlib import Path 6 | from st_aggrid import AgGrid 7 | from st_aggrid import AgGrid, GridOptionsBuilder 8 | from st_aggrid.shared import GridUpdateMode 9 | 10 | if ".." not in sys.path: 11 | sys.path.append("..") 12 | 13 | from src.data_store import DataStore 14 | 15 | _TEST_JSON = "test_runs/kql_query_db-2022-09-24-02-51-49.json" 16 | ds = DataStore(json_path=_TEST_JSON) 17 | 18 | 19 | @st.cache(suppress_st_warning=True) 20 | def load_data(nrows): 21 | data = ds.to_df().head(nrows) 22 | return data 23 | 24 | 25 | @st.cache 26 | def convert_df(df, file_type): 27 | # IMPORTANT: Cache the conversion to prevent computation on every rerun 28 | if file_type == "csv": 29 | data = df.to_csv().encode("utf-8") 30 | if file_type == "json": 31 | data = df.to_json().encode("utf-8") 32 | 33 | return data 34 | 35 | 36 | def aggrid_interactive_table(df: pd.DataFrame): 37 | """Source : https://github.com/streamlit/example-app-interactive-table 38 | Creates an st-aggrid interactive table based on a dataframe. 
39 | Args: 40 | df (pd.DataFrame]): Source dataframe 41 | Returns: 42 | dict: The selected row 43 | """ 44 | options = GridOptionsBuilder.from_dataframe( 45 | df, enableRowGroup=True, enableValue=True, enablePivot=True 46 | ) 47 | 48 | options.configure_side_bar() 49 | 50 | options.configure_selection("single") 51 | selection = AgGrid( 52 | df, 53 | enable_enterprise_modules=True, 54 | gridOptions=options.build(), 55 | theme="balham", 56 | update_mode=GridUpdateMode.MODEL_CHANGED, 57 | allow_unsafe_jscode=True, 58 | ) 59 | 60 | return selection 61 | 62 | 63 | def main() -> None: 64 | st.title(":mag_right: Interactive KQL Query Store") 65 | 66 | data_load_state = st.text("Loading data...") 67 | data = load_data(5000) 68 | data_disp = load_data(50) 69 | data_load_state.text("Data Loaded and cached !!") 70 | json_export = convert_df(data, "json") 71 | 72 | with st.expander("Raw Dataframe"): 73 | if st.checkbox("Show raw data"): 74 | st.subheader("Raw data") 75 | st.write("Go ahead, click on a row in the table below!") 76 | 77 | selection = aggrid_interactive_table(df=data_disp) 78 | 79 | if selection: 80 | st.write("You selected:") 81 | st.json(selection["selected_rows"]) 82 | 83 | st.download_button( 84 | label="Download data as JSON", 85 | data=json_export, 86 | file_name="kql_query_store-export.json", 87 | mime="json", 88 | ) 89 | 90 | st.sidebar.subheader("Filter by Table Names") 91 | tables = ds.get_filter_lists()["tables"] 92 | table_selections = st.sidebar.multiselect( 93 | "Select Tables to View", options=tables, default="CommonSecurityLog" 94 | ) 95 | 96 | st.sidebar.subheader("Filter by KQL Operators") 97 | 98 | operators = ds.get_filter_lists()["operators"] 99 | operator_selections = st.sidebar.multiselect( 100 | "Select KQL operators to filter by", options=operators, default="mv-expand" 101 | ) 102 | 103 | st.sidebar.subheader("Filter by KQL Function Calls") 104 | 105 | func_calls = ds.get_filter_lists()["functioncalls"] 106 | func_calls_selections = st.sidebar.multiselect( 107 | "Select KQL function calls to filter by", 108 | options=func_calls, 109 | default="series_decompose_anomalies", 110 | ) 111 | 112 | result = ds.find_queries( 113 | # query_name={"contains": "time series"}, 114 | tables=table_selections, # the list values are OR'd - so will return UNION 115 | operators=operator_selections, # the list values are OR'd - so will return UNION 116 | functioncalls=func_calls_selections, 117 | ) 118 | 119 | st.subheader("Filtered Results matching criteria") 120 | selection = aggrid_interactive_table(df=result) 121 | 122 | if selection: 123 | st.write("You selected:") 124 | st.json(selection["selected_rows"]) 125 | 126 | 127 | if __name__ == "__main__": 128 | st.set_page_config( 129 | "Interactive KQL Query Store by MSTIC", 130 | "🔎", 131 | initial_sidebar_state="expanded", 132 | layout="wide", 133 | ) 134 | main() 135 | -------------------------------------------------------------------------------- /pages/3_🛡️Schema_Browser.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from typing import Dict 3 | import pandas as pd 4 | import requests 5 | 6 | import bs4 7 | from tqdm.auto import tqdm 8 | 9 | SCHEMA_CATS_URL = ( 10 | "https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category" 11 | ) 12 | 13 | 14 | def fetch_az_mon_categories() -> requests.models.Response: 15 | """Return the AzMonitor reference page.""" 16 | return requests.get(SCHEMA_CATS_URL) 17 | 18 | 19 | def 
get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag: 20 | """Extract the list after the security header.""" 21 | soup = bs4.BeautifulSoup(resp.text, "html.parser") 22 | 23 | result = soup.find("div", class_="content") 24 | sec_header = result.find("h2", id="security") 25 | return sec_header.find_next_sibling() 26 | 27 | 28 | def build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]: 29 | """From the html list, build an index of URLs.""" 30 | table_prefix = ( 31 | "https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}" 32 | ) 33 | return { 34 | item.a.contents[0]: { 35 | "href": item.a.attrs.get("href"), 36 | "url": table_prefix.format(**(item.a.attrs)), 37 | } 38 | for item in security_cat_list.find_all("li") 39 | } 40 | 41 | 42 | def read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame: 43 | """Read table schema from a URL.""" 44 | table_data = pd.read_html(ref["url"])[0] 45 | table_data["Table"] = table 46 | table_data["Url"] = ref["url"] 47 | print(table, table_data.columns) 48 | return table_data 49 | 50 | 51 | def fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame: 52 | """Combine schema tables into single DF.""" 53 | print(f"Reading schemas for {len(sec_url_dict)} tables...") 54 | all_tables = [ 55 | read_table_from_url(table, ref) for table, ref in tqdm(sec_url_dict.items()) 56 | ] 57 | return pd.concat(all_tables, ignore_index=True) 58 | 59 | 60 | def main() -> None: 61 | st.title(":shield: Schema Browser") 62 | sec_cat_list = get_security_category_list(fetch_az_mon_categories()) 63 | sec_url_dict = build_table_index(sec_cat_list) 64 | sec_url_dict = { 65 | key: val for key, val in sec_url_dict.items() if key.startswith("S") 66 | } 67 | comb_tables = fetch_table_schemas(sec_url_dict) 68 | 69 | # st.sidebar.subheader("Filter by Table Names") 70 | # tables = tuple(comb_tables["Table"].unique()) 71 | # st.write("Tables:", tables) 72 | 73 | # TODO : Recursion error - need to troubleshoot - hardcoded table names 74 | table_selection = st.selectbox( 75 | "Select a Table name to view schema ?", 76 | ( 77 | "SecurityAlert", 78 | "SecurityBaseline", 79 | "SecurityBaselineSummary", 80 | "SecurityDetection", 81 | "SecurityEvent", 82 | "SecurityIoTRawEvent", 83 | "SecurityRecommendation", 84 | "SentinelAudit", 85 | "SentinelHealth", 86 | "SigninLogs", 87 | "Syslog", 88 | ), 89 | ) 90 | 91 | df_schema = comb_tables[comb_tables["Table"] == table_selection] 92 | 93 | st.subheader("Schema for the filtered table name") 94 | st.write(df_schema) 95 | 96 | 97 | if __name__ == "__main__": 98 | st.set_page_config( 99 | "Schema Browser", 100 | "🛡️", 101 | initial_sidebar_state="expanded", 102 | layout="wide", 103 | ) 104 | main() 105 | -------------------------------------------------------------------------------- /pages/4_ 📊KQL_Store_Insights.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import sys 4 | 5 | import altair as alt 6 | 7 | if ".." 
not in sys.path: 8 |     sys.path.append("..") 9 | 10 | from src.data_store import DataStore 11 | 12 | _TEST_JSON = "test_runs/kql_query_db-2022-09-24-02-51-49.json" 13 | ds = DataStore(json_path=_TEST_JSON) 14 | 15 | 16 | @st.cache(suppress_st_warning=True) 17 | def load_data(nrows): 18 |     data = ds.to_df() 19 |     data = data.head(nrows) 20 |     return data 21 | 22 | 23 | def main() -> None: 24 |     st.title(":bar_chart: KQL Store Insights") 25 | 26 |     data = load_data(5000) 27 | 28 |     st.subheader("KQL Query Store Summary") 29 |     st.metric("Total No of Queries", f"{len(data)}") 30 | 31 |     data_sentinel = data[data["repo_name"] == "Azure/Azure-Sentinel"] 32 |     st.metric("Total No of Queries in Azure Sentinel Github", f"{len(data_sentinel)}") 33 | 34 |     st.subheader("Source Type Distribution") 35 | 36 |     df_source_type = ( 37 |         data.groupby("source_type")["query"] 38 |         .count() 39 |         .sort_values(ascending=False) 40 |         .reset_index() 41 |     ) 42 | 43 |     chart = ( 44 |         alt.Chart(df_source_type) 45 |         .mark_bar() 46 |         .encode(x="source_type", y="query") 47 |         .properties(height=400) 48 |     ) 49 | 50 |     st.altair_chart(chart, use_container_width=True) 51 | 52 |     st.subheader("Top 5 Community Repos") 53 |     repo_count = ( 54 |         data.groupby("repo_name")["query"] 55 |         .count() 56 |         .sort_values(ascending=False) 57 |         .reset_index() 58 |     ) 59 |     repo_top = repo_count[repo_count["repo_name"] != "Azure/Azure-Sentinel"].head(5) 60 |     st.write(repo_top) 61 | 62 | 63 | if __name__ == "__main__": 64 |     st.set_page_config( 65 |         "KQL Store Insights", 66 |         "🛡️", 67 |         initial_sidebar_state="expanded", 68 |         layout="wide", 69 |     ) 70 |     main() 71 | -------------------------------------------------------------------------------- /pages/5_💬Contact_Us.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | def main() -> None: 5 |     st.subheader("Reach out to the project team via Github") 6 |     st.subheader("Github: https://github.com/microsoft/kql-query-store") 7 | 8 |     st.write( 9 |         "If you would like to add new Github repositories as source, open an issue on Github" 10 |     ) 11 | 12 | 13 | if __name__ == "__main__": 14 |     st.set_page_config("Contact Us !!", "💬") 15 |     main() 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | streamlit-aggrid 3 | pandas 4 | requests 5 | altair 6 | beautifulsoup4 7 | tqdm 8 | lxml 9 | html5lib -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/src/__init__.py -------------------------------------------------------------------------------- /src/az_mon_schema.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | """Azure Monitor Schema creation.""" 7 | __author__ = "Ian Hellen" 8 | import json 9 | from pathlib import Path 10 | from typing import Any, Dict, Optional, Union 11 | 12 | import bs4 13 | import pandas as pd 14 | import requests 15 | from tqdm.auto import tqdm 16 | 17 | SCHEMA_CATS_URL = ( 18 | "https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category" 19 | ) 20 | 21 | 22 | class AzMonitorSchemas: 23 | """Class to download and store Azure Monitor table schemas.""" 24 | 25 | def __init__( 26 | self, json_path: Union[None, str, Path] = None, json_text: Optional[str] = None 27 | ): 28 | """Initialize the schema class.""" 29 | self.schemas: Optional[pd.DataFrame] = None 30 | if json_path or json_text: 31 | self.schemas = self._df_from_json(json_path=json_path, json_text=json_text) 32 | 33 | def get_az_mon_schemas(self): 34 | """Retrieve Azure monitor schemas""" 35 | sec_cat_list = _get_security_category_list(_fetch_az_mon_categories()) 36 | sec_url_dict = _build_table_index(sec_cat_list) 37 | self.schemas = _fetch_table_schemas(sec_url_dict).reindex( 38 | columns=["Table", "Column", "Type", "Description", "Url"] 39 | ) 40 | 41 | @property 42 | def schema_dict(self) -> Dict[str, Dict[str, Any]]: 43 | """Return the schema as a dictionary.""" 44 | if self.schemas is None: 45 | return {} 46 | table_dict = {} 47 | for table, df in self.schemas.groupby("Table"): 48 | url = df.iloc[0]["Url"] 49 | table_dict[table.casefold()] = { 50 | "url": url, 51 | "table": table, 52 | "schema": df.drop(columns=["Table", "Url"]).to_dict(orient="records"), 53 | } 54 | return table_dict 55 | 56 | def to_json(self): 57 | """Return schemas as JSON string.""" 58 | return json.dumps(self.schema_dict) 59 | 60 | @staticmethod 61 | def _df_from_json( 62 | json_path: Union[None, str, Path] = None, json_text: Optional[str] = None 63 | ) -> pd.DataFrame: 64 | """Create DataFrame from JSON representation.""" 65 | if json_path: 66 | json_text = Path(json_path).read_text(encoding="utf-8") 67 | schema_dict = json.loads(json_text) 68 | rows = [] 69 | for item in schema_dict.values(): 70 | rows.extend( 71 | { 72 | "Table": item["table"], 73 | "Column": schema.get("Column"), 74 | "Type": schema.get("Type"), 75 | "Description": schema.get("Description"), 76 | "Url": item["url"], 77 | } 78 | for schema in item.get("schema", []) 79 | ) 80 | return pd.DataFrame(rows).sort_values(["Table", "Column"]) 81 | 82 | def find_tables(self, tables: Union[str, list]) -> pd.DataFrame: 83 | """ 84 | Return schema entries matching `tables`. 85 | 86 | Parameters 87 | ---------- 88 | tables : Union[str, list] 89 | A table name/regex pattern or a list 90 | of table names to match. 91 | 92 | Returns 93 | ------- 94 | pd.DataFrame 95 | DataFrame of matching schema entries. 96 | 97 | """ 98 | if isinstance(tables, list): 99 | tables = [table.casefold() for table in tables] 100 | return self.schemas[self.schemas["Table"].str.casefold().isin(tables)] 101 | return self.schemas[self.schemas["Table"].str.match(tables, case=False)] 102 | 103 | def find_columns(self, columns: Union[str, list]) -> pd.DataFrame: 104 | """ 105 | Return schema entries matching `columns`. 106 | 107 | Parameters 108 | ---------- 109 | columns : Union[str, list] 110 | A column name/regex pattern or a list 111 | of column names to match. 112 | 113 | Returns 114 | ------- 115 | pd.DataFrame 116 | DataFrame of matching schema entries. 
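
        Examples
        --------
        Illustrative calls (assuming schemas were loaded via
        `get_az_mon_schemas` or from saved JSON):

        >>> schemas = AzMonitorSchemas(json_path="az_mon_schemas.json")
        >>> schemas.find_columns(["AccountUPN", "IPAddress"])  # exact names
        >>> schemas.find_columns("Target.*")  # regex pattern match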
117 | 118 | """ 119 | if isinstance(columns, list): 120 | columns = [column.casefold() for column in columns] 121 | return self.schemas[self.schemas["Column"].str.casefold().isin(columns)] 122 | return self.schemas[self.schemas["Column"].str.match(columns, case=False)] 123 | 124 | 125 | def _fetch_az_mon_categories() -> requests.models.Response: 126 | """Return the AzMonitor reference page.""" 127 | return requests.get(SCHEMA_CATS_URL) 128 | 129 | 130 | def _get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag: 131 | """Extract the list after the security header.""" 132 | soup = bs4.BeautifulSoup(resp.text, "html.parser") 133 | 134 | result = soup.find("div", class_="content") 135 | sec_header = result.find("h2", id="security") 136 | return sec_header.find_next_sibling() 137 | 138 | 139 | def _build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]: 140 | """From the html list, build an index of URLs.""" 141 | table_prefix = ( 142 | "https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}" 143 | ) 144 | return { 145 | item.a.contents[0]: { 146 | "href": item.a.attrs.get("href"), 147 | "url": table_prefix.format(**(item.a.attrs)), 148 | } 149 | for item in security_cat_list.find_all("li") 150 | } 151 | 152 | 153 | def _read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame: 154 | """Read table schema from a URL.""" 155 | table_data = pd.read_html(ref["url"])[0] 156 | table_data["Table"] = table 157 | table_data["Url"] = ref["url"] 158 | return table_data 159 | 160 | 161 | def _fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame: 162 | """Combine schema tables into single DF.""" 163 | print(f"Reading Azure monitor schemas for {len(sec_url_dict)} tables...") 164 | all_tables = [ 165 | _read_table_from_url(table, ref) 166 | for table, ref in tqdm(sec_url_dict.items(), unit="schemas") 167 | ] 168 | return pd.concat(all_tables, ignore_index=True) 169 | -------------------------------------------------------------------------------- /src/conf.txt: -------------------------------------------------------------------------------- 1 | source_1 2 | source_2 3 | -------------------------------------------------------------------------------- /src/create_kql_db.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 5 | # -------------------------------------------------------------------------- 6 | """Main script to fetch KQL queries and create JSON Database.""" 7 | 8 | import argparse 9 | import logging 10 | import sys 11 | from datetime import datetime, timezone 12 | from pathlib import Path 13 | from typing import Any, Dict 14 | 15 | sys.path.append(str(Path(__file__).parent)) 16 | 17 | from tqdm.auto import tqdm 18 | 19 | from . 
import kql_extract as extract 20 | from .az_mon_schema import AzMonitorSchemas 21 | from .data_store import DataStore 22 | 23 | # from .kql_query import KqlQuery 24 | from .kql_download import get_community_queries, get_sentinel_queries 25 | 26 | # ######### MOCK Stuff for stubbing code 27 | # from unittest.mock import MagicMock 28 | # # from .kql_ingest import fetch_queries 29 | # fetch_queries = MagicMock() 30 | # _MOCK_QUERY = "SecurityAlert | take 1" 31 | # _MOCK_RESULTS = [KqlQuery(source_path=f"/x/y/{src}.kql", query=_MOCK_QUERY) for src in range(3)] 32 | # fetch_queries.return_value = _MOCK_RESULTS 33 | # # # from .kql_db_store import DataStore 34 | # DataStore = MagicMock() 35 | # store_instance = MagicMock() 36 | # DataStore.return_value = store_instance 37 | # store_instance.queries = _MOCK_RESULTS 38 | 39 | # _MOCK_KQL_PARSE = {"FunctionCalls":["count","tostring","make_list","toreal"],"Joins":["rightsemi","leftouter"],"Operators":["where","extend","summarize","mv-expand","project-away","project"],"Tables":["SigninLogs"]} 40 | # parse_kql = MagicMock() 41 | # parse_kql.return_value = _MOCK_KQL_PARSE 42 | ########## End Mocks 43 | 44 | 45 | __author__ = "Ian Hellen" 46 | 47 | _OUTPUT_FILE = "kql_query_db" 48 | 49 | 50 | def _add_script_args(): 51 |     parser = argparse.ArgumentParser(description="Kql Query download and build script.") 52 |     parser.add_argument( 53 |         "--conf", "-c", required=True, help="Path to query source config file." 54 |     ) 55 |     parser.add_argument( 56 |         "--out", 57 |         "-o", 58 |         default="output", 59 |         help="Path to output folder.", 60 |     ) 61 |     parser.add_argument( 62 |         "--df", 63 |         "-d", 64 |         action="store_true", 65 |         default=False, 66 |         help="Write a pickled dataframe.", 67 |     ) 68 |     parser.add_argument( 69 |         "--quiet", 70 |         "-q", 71 |         action="store_true", 72 |         default=False, 73 |         help="Show less output of the execution.", 74 |     ) 75 |     parser.add_argument( 76 |         "--verbose", 77 |         "-v", 78 |         action="store_true", 79 |         default=False, 80 |         help="Show debug logging of execution.", 81 |     ) 82 |     parser.add_argument( 83 |         "--timestamp", 84 |         "-t", 85 |         action="store_true", 86 |         default=False, 87 |         help="Add UTC timestamps to output file.", 88 |     ) 89 |     parser.add_argument( 90 |         "--save-stages", 91 |         "-s", 92 |         action="store_true", 93 |         default=False, 94 |         help="Save outputs after initial query load/parsing.", 95 |     ) 96 |     parser.add_argument( 97 |         "--az-schemas", 98 |         "-a", 99 |         action="store_true", 100 |         default=False, 101 |         help="Download and store Azure monitor schema.", 102 |     ) 103 |     return parser 104 | 105 | 106 | def main(args): 107 |     """Main entrypoint for fetching queries and writing to store.""" 108 |     results = [] 109 |     if not Path(args.out).is_dir(): 110 |         if Path(args.out).exists(): 111 |             logging.error("Cannot find or create output folder %s", args.out) 112 |             return 113 |         Path(args.out).mkdir(parents=True, exist_ok=True) 114 | 115 |     # fetch and parse queries 116 |     logging.info("Fetching queries") 117 |     try: 118 |         results.extend(get_sentinel_queries()) 119 |     except Exception as err:  # pylint: disable=broad-except 120 |         logging.exception( 121 |             "Failed to fetch Sentinel queries.", 122 |             exc_info=err, 123 |         ) 124 |     try: 125 |         results.extend(get_community_queries(config=args.conf)) 126 |     except Exception as err:  # pylint: disable=broad-except 127 |         logging.exception( 128 |             "Failed to fetch community queries.", 129 |             exc_info=err, 130 |         ) 131 | 132 |     # add queries to store 133 |     logging.info("Adding %d queries to store.", len(results)) 134 | 135 |     try: 136 |         store = 
DataStore(results) 137 |     except Exception as err:  # pylint: disable=broad-except 138 |         logging.exception( 139 |             "Failed to add queries to store.", 140 |             exc_info=err, 141 |         ) 142 | 143 |     if args.save_stages: 144 |         store.to_json(_get_output_file(args, file_type="p1.json")) 145 | 146 |     # parse Kql for query properties 147 |     logging.info("Getting KQL properties for %d kql queries.", len(results)) 148 |     try: 149 |         extract.start() 150 |         for query in tqdm(store.queries): 151 |             try: 152 |                 kql_properties = extract.extract_kql( 153 |                     kql_query=query.query, query_id=query.query_id 154 |                 ) 155 |             except Exception as err:  # pylint: disable=broad-except 156 |                 logging.exception( 157 |                     "Failed to parse query '%s'.\n %s", 158 |                     query.query_id, 159 |                     query.source_path, 160 |                     exc_info=err, 161 |                 ) 162 |                 continue 163 |             try: 164 |                 if not kql_properties.get("Valid_Query", True): 165 |                     logging.error( 166 |                         "Invalid KQL for query %s (%s)", 167 |                         query.query_id, 168 |                         query.source_path, 169 |                     ) 170 |                 store.add_kql_properties( 171 |                     query_id=query.query_id, kql_properties=kql_properties 172 |                 ) 173 |             except Exception as err:  # pylint: disable=broad-except 174 |                 logging.exception( 175 |                     "Failed to update kql properties for query '%s'.", 176 |                     query.query_id, 177 |                     exc_info=err, 178 |                 ) 179 |     finally: 180 |         extract.stop() 181 |     logging.info("Finished getting KQL properties for %d kql queries.", len(results)) 182 | 183 |     # write output 184 |     out_json_path = _get_output_file(args, "json") 185 |     store.to_json(out_json_path) 186 |     logging.info("Writing JSON output to %s", out_json_path) 187 |     if args.df: 188 |         query_df = store.to_df() 189 |         out_df_path = _get_output_file(args, "pkl") 190 |         query_df.to_pickle(out_df_path) 191 |         logging.info("Writing Pickled dataframe output to %s", out_df_path) 192 | 193 |     # get Azure monitor table schema 194 |     # and write JSON and DF 195 |     if args.az_schemas: 196 |         logging.info("Getting Azure Monitor schema data.") 197 |         az_schemas = AzMonitorSchemas() 198 |         az_schemas.get_az_mon_schemas() 199 |         schema_json = Path(args.out).joinpath("az_mon_schemas.json") 200 |         schema_df = Path(args.out).joinpath("az_mon_schemas.pkl") 201 |         schema_json.write_text(az_schemas.to_json(), encoding="utf-8") 202 |         az_schemas.schemas.to_pickle(schema_df) 203 |         logging.info( 204 |             "Saved schema data to %s and %s.", str(schema_json), str(schema_df) 205 |         ) 206 | 207 |     logging.info("Job completed") 208 |     logging.info("============================================") 209 | 210 | 211 | def _get_output_file(args, file_type): 212 |     """Return formatted path for output files.""" 213 |     if args.timestamp: 214 |         time_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") 215 |         return Path(args.out).joinpath(f"{_OUTPUT_FILE}-{time_stamp}.{file_type}") 216 |     return Path(args.out).joinpath(f"{_OUTPUT_FILE}.{file_type}") 217 | 218 | 219 | def _configure_logging(args): 220 |     logging_args: Dict[str, Any] = { 221 |         "format": "%(asctime)s: %(funcName)s #%(lineno)d %(filename)s %(message)s" 222 |     } 223 |     if args.quiet: 224 |         logging_args["level"] = logging.WARNING 225 |     elif args.verbose: 226 |         logging_args["level"] = logging.DEBUG 227 |     else: 228 |         logging_args["level"] = logging.INFO 229 |     logging.basicConfig(**logging_args) 230 | 231 | 232 | # pylint: disable=invalid-name 233 | if __name__ == "__main__": 234 | 235 |     arg_parser = _add_script_args() 236 |     args = arg_parser.parse_args() 237 | 238 |     _configure_logging(args) 239 |     main(args) 240 | --------------------------------------------------------------------------------
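A minimal end-to-end sketch of the store API used above (illustrative only, not a file in the repo; it assumes KqlQuery auto-generates query_id, and the kql_properties dict mirrors the KqlExtractionResult JSON shown in kqlextraction/Readme.txt):

# demo_store.py - illustrative usage sketch, not part of the repo
from src.data_store import DataStore
from src.kql_query import KqlQuery

# One hand-written query record (source_path/query_name are made up).
query = KqlQuery(
    source_path="https://example.com/demo.kql",
    query_name="Demo signin count",
    query="SigninLogs | summarize count() by AppDisplayName",
)
store = DataStore([query])

# Attach extractor output for this query (same shape as KqlExtractionResult).
store.add_kql_properties(
    query_id=query.query_id,
    kql_properties={
        "Tables": ["SigninLogs"],
        "Operators": ["summarize"],
        "FunctionCalls": ["count"],
        "Joins": {},
    },
)

# Every stored query that references the SigninLogs table.
print(store.find_queries(tables=["SigninLogs"]))

Because add_kql_properties casefolds the keys, the PascalCase names emitted by the C# extractor land in the lowercase indexes (tables, operators, functioncalls, joins) that find_queries and get_filter_lists consume.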
/src/data_store.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 5 | # -------------------------------------------------------------------------- 6 | """DataStore class.""" 7 | import json 8 | from pathlib import Path 9 | from typing import Any, Dict, List, Optional, Set, Union 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | from .kql_query import KqlQuery 15 | 16 | __author__ = "Ian Hellen" 17 | 18 | 19 | # interface 20 | # get_query_ids() 21 | # returns a DF of source_path, query_id, query_hash - the idea here is that you 22 | # (or someone) can check for existing queries based on path. I guess I could also 23 | # do that in the store - i.e. don't add a new one if the hash is the same, 24 | # just overwrite with the new details. Hmm. Maybe you don't need to create a query_id. 25 | # I could just do this checking in the data layer comparing source_path and 26 | # source_index with existing values. LMK what you think. 27 | # 28 | # add_queries(queries: List[Dict[as described above]]) 29 | # add_kql_properties(query_id, properties: Dict[Liam's dict]) 30 | # get_filter_lists() - will return a dictionary of lists of unique values of various properties for the UI filtering 31 | # I could also return lists of unique query names and paths 32 | # find_queries(**kwargs) - this is going to be an interesting one given that we have a flexible set of properties to search on. 33 | # kwargs lets us specify a flexible list of conditions, examples: 34 | # source_path="/some/path - exact string match (prob case insensitive) 35 | # query_name={matches: regex} - match based on a pandas operator like regex, startswith, contains 36 | # table=["table1", "table2"] - intersection of queries that use both these tables 37 | # it will return a DF of query_id + basic properties. 38 | # get_query(query_id) - find_queries will return a list, to get all the props for a query, you'd need to call this. 
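# Illustrative add_queries record shape (fields mirror those built in kql_download):
#   {"source_path": "https://github.com/.../query.yaml", "query_name": "...",
#    "query": "SigninLogs | take 1",
#    "attributes": {"description": "...", "tactics": [...], "techniques": [...]}}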
39 | # get_schema(table) 40 | 41 | QueryDict = Dict[str, Union[str, int, Dict[str, Any]]] 42 | QueryList = List[QueryDict] 43 | 44 | KqlQueryList = List[KqlQuery] 45 | 46 | 47 | class DataStore: 48 | """DataStore class for KqlQuery store.""" 49 | 50 | _ATTRIB_INDEXES: Dict[str, type] = {"tactics": list, "techniques": list} 51 | _KQL_INDEXES: Dict[str, type] = { 52 | "tables": list, 53 | "operators": list, 54 | "fields": list, 55 | "functioncalls": list, 56 | "joins": dict, 57 | "valid_query": bool, 58 | } 59 | _ALL_INDEXES: Dict[str, type] = {**_ATTRIB_INDEXES, **_KQL_INDEXES} 60 | 61 | _OPERATOR = { 62 | "startswith": "^{expr}.*", 63 | "endswith": ".*{expr}$", 64 | "contains": ".*{expr}.*", 65 | "matches": "{expr}", 66 | } 67 | 68 | def __init__( 69 | self, 70 | kql_queries: Union[None, KqlQueryList, QueryList] = None, 71 | json_path: Optional[str] = None, 72 | ): 73 | self._json_path = json_path 74 | if json_path: 75 | self._data = { 76 | query.get("query_id"): KqlQuery(**query) 77 | for query in self._read_json_data(json_path) 78 | } 79 | elif kql_queries: 80 | if isinstance(kql_queries[0], KqlQuery): 81 | self._data = {query.query_id: query for query in kql_queries} 82 | else: 83 | self._data = { 84 | query["query_id"]: KqlQuery(**query) for query in kql_queries 85 | } 86 | else: 87 | self._data = {} 88 | # self.attributes = self._extract_attributes() 89 | if self._data: 90 | self._data_df = pd.DataFrame(self.queries).set_index("query_id") 91 | else: 92 | self._data_df = pd.DataFrame( 93 | self.queries, columns=KqlQuery.field_names() 94 | ).set_index("query_id") 95 | self._indexes: Dict[str, pd.DataFrame] = {} 96 | self._create_indexes("attributes") 97 | self._create_indexes("kql_properties") 98 | 99 | @property 100 | def queries(self) -> List[KqlQuery]: 101 | """Get the list of current queries.""" 102 | return list(self._data.values()) 103 | 104 | @property 105 | def queries_dict(self) -> List[KqlQuery]: 106 | """Get the list of current queries.""" 107 | return [query.asdict() for query in self._data.values()] 108 | 109 | def to_json(self, file_path: Optional[str] = None) -> Optional[str]: 110 | """Return the queries as JSON or save to `file_path`, if specified.""" 111 | if file_path is not None: 112 | Path(file_path).write_text(self.to_json(), encoding="utf-8") 113 | return json.dumps(self.queries_dict) 114 | 115 | def to_df(self) -> pd.DataFrame: 116 | """Return queries as a pandas DataFrame.""" 117 | return pd.DataFrame(self.queries) 118 | 119 | def get_query_ids(self) -> pd.DataFrame: 120 | """Return subset of query columns.""" 121 | columns = ["source_path", "query_name", "query_hash"] 122 | if self._data_df is None: 123 | return pd.DataFrame(columns=columns) 124 | return self._data_df[columns] 125 | 126 | def add_queries(self, queries: KqlQueryList): 127 | """Add a list of queries to the store.""" 128 | self._data.update({query.query_id: query for query in queries}) 129 | self._create_indexes("attributes") 130 | self._create_indexes("kql_properties") 131 | self._data_df = pd.DataFrame(self.queries).set_index("query_id") 132 | 133 | def add_query(self, query: KqlQuery): 134 | """Add a single query to the store.""" 135 | self._data[query.query_id] = query 136 | self._add_item_to_indexes(query) 137 | self._data_df = pd.concat( 138 | [self._data_df, pd.DataFrame(query).set_index("query_id")] 139 | ) 140 | 141 | def add_kql_properties(self, query_id: str, kql_properties: Dict[str, Any]): 142 | """Add Kql properties to a query.""" 143 | kql_props = {key.casefold(): value for key, 
value in kql_properties.items()} 144 | if "valid_query" not in kql_props: 145 | kql_props["valid_query"] = True 146 | self._data[query_id].kql_properties = kql_props 147 | # update indexes 148 | self._add_item_to_indexes(self._data[query_id]) 149 | 150 | def get_filter_lists( 151 | self, categories: Optional[List[str]] = None 152 | ) -> Dict[str, List[str]]: 153 | """Return unique lists of values for each category.""" 154 | return { 155 | attrib: sorted(self._indexes[attrib].index.unique()) 156 | for attrib in {**self._ATTRIB_INDEXES, **self._KQL_INDEXES} 157 | if attrib in self._indexes and (categories is None or attrib in categories) 158 | } 159 | 160 | def find_queries(self, case: bool = False, **kwargs) -> pd.DataFrame: 161 | """ 162 | Return matching values as a pandas DataFrame. 163 | 164 | Parameters 165 | ---------- 166 | case : bool, optional 167 | Use case-sensitive matching, by default False 168 | 169 | Other Parameters 170 | ---------------- 171 | kwargs : 172 | You can specify search criteria in the general form attrib_name=expression. 173 | You can specify multiple criteria - all will be ANDed together. 174 | attrib=value - exact match (case sensitive for strings) 175 | attrib={operator: value} - match based on a string operator (matches, 176 | contains, startswith, endswith) 177 | attrib=["value1", "value2"] - intersection of items that have 178 | matches for ALL items in the list. 179 | 180 | Returns 181 | ------- 182 | pd.DataFrame 183 | DataFrame of matching queries 184 | 185 | Examples 186 | -------- 187 | Some examples of expressions: 188 | 189 | - source_path="/some/path" - exact string match (case insensitive) 190 | - query_name={matches: "AAD.*"} - match based on an operator 191 | like regex, startswith, contains 192 | - table=["table1", "table2"] - the queries that use both these tables 193 | 194 | >>>> ds.find_queries( 195 | query_name={"contains": "AAD"}, 196 | tables=["table1", "table2"], 197 | operations=[...] 
198 |         ) 199 | 200 |         """ 201 |         if self._data_df is None: 202 |             return pd.DataFrame() 203 |         # Create a base criterion where all rows == True 204 |         criteria = self._data_df.index.notna() 205 |         debug = kwargs.pop("debug", False) 206 |         valid_fields = KqlQuery.field_names() + list(self._indexes.keys()) 207 | 208 |         for arg_name, arg_expr in kwargs.items(): 209 |             if arg_name not in valid_fields: 210 |                 raise ValueError( 211 |                     f"Unknown attribute name {arg_name}", 212 |                     f"Search expression: {arg_expr}.", 213 |                 ) 214 |             if isinstance(arg_expr, str): 215 |                 criteria &= self._data_df[arg_name] == arg_expr 216 |             if isinstance(arg_expr, dict): 217 |                 operator, expr = next(iter(arg_expr.items())) 218 |                 crit_expr = self._OPERATOR.get(operator) 219 |                 if crit_expr: 220 |                     criteria &= self._data_df[arg_name].str.match( 221 |                         crit_expr.format(expr=expr), case=case 222 |                     ) 223 |                 if debug: 224 |                     print(arg_expr, criteria.value_counts()) 225 |             if isinstance(arg_expr, list) and arg_name in self._indexes: 226 |                 query_ids = self._get_matching_ids(debug, arg_name, arg_expr) 227 | 228 |                 # Add the matched query IDs to criteria 229 |                 criteria &= self._data_df.index.isin(query_ids) 230 |                 if debug: 231 |                     print(arg_expr, criteria.value_counts()) 232 |         # return the data subset 233 |         if debug: 234 |             print("final criteria:", criteria.value_counts()) 235 |         return self._data_df[criteria] 236 | 237 |     def _get_matching_ids(self, debug, arg_name, arg_expr): 238 |         query_ids: Optional[Set] = None 239 |         # we're looking for queries in the indexes that have a matching value 240 |         for match_value in arg_expr: 241 |             # matched_ids == all query_ids with this property 242 |             matched_ids = set( 243 |                 self._indexes[arg_name][self._indexes[arg_name].index == match_value][ 244 |                     "query_id" 245 |                 ].values 246 |             ) 247 |             if debug: 248 |                 print(len(matched_ids)) 249 |             # AND (intersect) with the ids matched so far (None = first criterion) 250 |             query_ids = matched_ids if query_ids is None else matched_ids & query_ids 251 |         return query_ids if query_ids is not None else set() 252 | 253 |     @staticmethod 254 |     def _read_json_data(json_path: str): 255 |         return json.loads(Path(json_path).read_text(encoding="utf-8")) 256 | 257 |     def _create_indexes(self, sub_key: str): 258 |         """Create indexes for child items in queries.""" 259 |         # create DF with attributes expanded to columns 260 |         if self._data_df is None: 261 |             return 262 |         exp_df = ( 263 |             # avoid rows with null or empty dictionaries 264 |             self._data_df[ 265 |                 ~((self._data_df[sub_key] == {}) | (self._data_df[sub_key].isna())) 266 |             ][[sub_key]].apply( 267 |                 lambda x: pd.Series(x[sub_key]), result_type="expand", axis=1 268 |             ) 269 |         ) 270 |         for key, data_type in self._ALL_INDEXES.items(): 271 |             if key not in exp_df.columns: 272 |                 continue 273 |             if data_type == list: 274 |                 self._indexes[key] = self._create_list_index( 275 |                     data=exp_df, 276 |                     key_col=key, 277 |                 ) 278 |             if data_type == dict: 279 |                 self._indexes[key] = self._create_dict_index( 280 |                     data=exp_df, 281 |                     key_col=key, 282 |                 ) 283 |             if data_type == bool: 284 |                 self._indexes[key] = self._create_bool_index( 285 |                     data=exp_df, 286 |                     key_col=key, 287 |                 ) 288 | 289 |     def _add_item_to_indexes(self, query: KqlQuery): 290 |         """Add attributes and kql_properties to indexes.""" 291 |         index_attribs = {**(query.attributes), **(query.kql_properties)} 292 |         for key in self._ALL_INDEXES: 293 |             if key not in index_attribs: 294 |                 continue 295 |             df_index = ( 296 |                 list(index_attribs[key]) 297 |                 if isinstance(index_attribs[key], (list, dict)) 298 |                 else [index_attribs[key]] 299 |                 if isinstance(index_attribs[key], 
bool) 300 | else None 301 | ) 302 | if df_index is not None: 303 | current_index = self._indexes.get(key) 304 | new_index_items = pd.DataFrame( 305 | data=[{"query_id": query.query_id} for _ in df_index], 306 | index=df_index, 307 | ) 308 | if current_index is None: 309 | self._indexes[key] = new_index_items 310 | else: 311 | self._indexes[key] = pd.concat( 312 | [self._indexes[key], new_index_items] 313 | ) 314 | 315 | @staticmethod 316 | def _create_list_index(data, key_col): 317 | return ( 318 | data[[key_col]].explode(key_col).dropna().reset_index().set_index(key_col) 319 | ) 320 | 321 | @staticmethod 322 | def _create_bool_index(data, key_col): 323 | return data[[key_col]].dropna().reset_index().set_index(key_col) 324 | 325 | @staticmethod 326 | def _extract_dict_keys(row, col_name): 327 | if isinstance(row[col_name], dict): 328 | return { 329 | col_name: [ 330 | inner_val 331 | for val in row[col_name].values() 332 | for inner_val in val 333 | if isinstance(val, dict) and inner_val != np.nan 334 | ] 335 | } 336 | return row 337 | 338 | def _create_dict_index(self, data, key_col): 339 | df_dict_keys = data[[key_col]].apply( 340 | lambda x: self._extract_dict_keys(x, key_col), result_type="expand", axis=1 341 | ) 342 | return self._create_list_index(df_dict_keys, key_col) 343 | -------------------------------------------------------------------------------- /src/extract.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import queue 4 | import subprocess 5 | import threading 6 | import time 7 | from base64 import b64encode 8 | from pathlib import Path 9 | from uuid import uuid4 10 | 11 | worker_exit = threading.Event() 12 | worker_queue = queue.Queue() 13 | worker_results = queue.Queue() 14 | worker_thread = None 15 | 16 | _CS_PROJ_PATH = str( 17 | Path(__file__).parent.joinpath( 18 | "../kqlextraction/KqlExtraction/KqlExtraction.csproj" 19 | ) 20 | ) 21 | 22 | 23 | def _worker_thread_proc(): 24 | try: 25 | kql_extraction = None 26 | 27 | while not worker_exit.is_set(): 28 | try: 29 | if kql_extraction is not None: 30 | if kql_extraction.poll() is not None: 31 | kql_extraction = None 32 | if kql_extraction is None: 33 | kql_extraction = subprocess.Popen( 34 | [ 35 | "dotnet", 36 | "run", 37 | "-c", 38 | "Release", 39 | "--project", 40 | # os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'KqlExtraction', 'KqlExtraction.csproj') 41 | _CS_PROJ_PATH, 42 | ], 43 | stdin=subprocess.PIPE, 44 | stdout=subprocess.PIPE, 45 | stderr=subprocess.PIPE, 46 | ) 47 | except Exception as ex: 48 | print("[!] Exception Starting KqlExtraction Process") 49 | break 50 | 51 | try: 52 | uuid, kql = worker_queue.get(timeout=2.0) 53 | kql_extraction.stdin.write( 54 | bytes(f"{uuid},", encoding="utf-8") 55 | + b64encode(bytes(kql, encoding="utf-8")) 56 | + b"\n" 57 | ) 58 | kql_extraction.stdin.flush() 59 | 60 | kql_extraction_result = kql_extraction.stdout.readline() 61 | worker_results.put(json.loads(kql_extraction_result)) 62 | except queue.Empty: 63 | pass 64 | except Exception as ex: 65 | kql_extraction.kill() 66 | 67 | if kql_extraction.poll() is None: 68 | kql_extraction.kill() 69 | except Exception as ex: 70 | print("[!] 
Unhandled Exception", str(ex)) 71 | 72 | 73 | def extract_kql(kql): 74 | kql_id = str(uuid4()) 75 | worker_queue.put((kql_id, kql)) 76 | 77 | try: 78 | kql_result = {} 79 | while True: 80 | kql_result = worker_results.get(timeout=5.0) 81 | if "Id" in kql_result and kql_result["Id"] == kql_id: 82 | break 83 | except Exception: 84 | pass 85 | 86 | return kql_result 87 | 88 | 89 | def start(): 90 | global worker_thread 91 | worker_thread = threading.Thread(target=_worker_thread_proc) 92 | worker_thread.start() 93 | 94 | 95 | def stop(): 96 | worker_exit.set() 97 | worker_thread.join() 98 | 99 | 100 | if __name__ == "__main__": 101 | worker_thread = threading.Thread(target=_worker_thread_proc) 102 | worker_thread.start() 103 | 104 | try: 105 | base_path = os.path.abspath(os.path.split(__file__)[0]) 106 | for kql_file in os.listdir(os.path.join(base_path, "tests")): 107 | kql_file = os.path.join(base_path, "tests", kql_file) 108 | 109 | with open(kql_file, "r") as f: 110 | kql = f.read() 111 | 112 | print(extract_kql(kql)) 113 | except Exception as ex: 114 | print("[!] Unhandled Exception", str(ex)) 115 | 116 | while not worker_queue.empty(): 117 | time.sleep(0.5) 118 | 119 | worker_exit.set() 120 | worker_thread.join() 121 | -------------------------------------------------------------------------------- /src/ian_test.kql: -------------------------------------------------------------------------------- 1 | //Detects when a user with a privileged Azure AD role has had their on premises Active Directory password changed by someone other than themselves. 2 | 3 | //Data connector required for this query - Windows Security Events via AMA or Security Events via Legacy Agent 4 | //Data connector required for this query - Microsoft Sentinel UEBA 5 | 6 | let timeframe=7d; 7 | //First find any users that hold privileged Azure AD roles 8 | IdentityInfo 9 | | where TimeGenerated > ago(21d) 10 | | where isnotempty(AssignedRoles) 11 | | where AssignedRoles != "[]" 12 | | summarize arg_max(TimeGenerated, *) by AccountUPN 13 | | project AccountUPN, AccountName, AccountSID 14 | //Join those users based on AccountSID to on premises Active Directory password reset events 15 | | join kind=inner ( 16 | SecurityEvent 17 | | where TimeGenerated > ago(timeframe) 18 | | where EventID == "4724" 19 | | project 20 | TimeGenerated, 21 | Activity, 22 | SubjectAccount, 23 | TargetAccount, 24 | TargetSid, 25 | SubjectUserSid 26 | ) 27 | on $left.AccountSID == $right.TargetSid 28 | | where SubjectUserSid != TargetSid 29 | //Summarize event data to make it easy to read 30 | | project ['Time of Password Reset']=TimeGenerated, Activity, Actor=SubjectAccount, ['Target UserPrincipalName']=AccountUPN,['Target AccountName']=TargetAccount -------------------------------------------------------------------------------- /src/kql_download.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # --------------------------------------------------------------------------
6 | """Github download and conversion functions."""
7 | 
8 | import logging
9 | import shutil
10 | from itertools import chain
11 | from pathlib import Path
12 | from typing import List, Union
13 | 
14 | import pandas as pd
15 | 
16 | from .kql_file_parser import (
17 |     download_git_archive,
18 |     format_repo_url,
19 |     get_sentinel_queries_from_github,
20 |     parse_kql_to_dict,
21 |     parse_markdown_to_dict,
22 |     parse_yaml,
23 |     read_config,
24 | )
25 | from .kql_query import KqlQuery
26 | 
27 | __author__ = "Ashwin Patil, Jannie Li, Ian Hellen"
28 | 
29 | 
30 | _CURR_DIR = Path.cwd()
31 | 
32 | 
33 | def get_sentinel_queries(output_path: Path = _CURR_DIR):
34 |     """Return Sentinel queries from repo."""
35 |     # download sentinel github and extract yaml files only
36 |     azsentinel_git_url = "https://github.com/Azure/Azure-Sentinel/archive/master.zip"
37 |     get_sentinel_queries_from_github(git_url=azsentinel_git_url, outputdir=output_path)
38 | 
39 |     # Parse yaml files and convert to dataframes
40 |     base_dir = str(output_path.joinpath("Azure-Sentinel-master"))
41 |     detections_df = parse_yaml(parent_dir=base_dir, child_dir="Detections")
42 |     hunting_df = parse_yaml(parent_dir=base_dir, child_dir="Hunting Queries")
43 |     solutions_df = parse_yaml(parent_dir=base_dir, child_dir="Solutions")
44 | 
45 |     # log the number of queries parsed from each folder
46 |     logging.info(
47 |         "Detections: %d Hunting Queries: %d Solutions: %d",
48 |         len(detections_df),
49 |         len(hunting_df),
50 |         len(solutions_df),
51 |     )
52 |     _remove_tmp_folder(output_path.joinpath("Azure-Sentinel-master"))
53 |     # Filter out yamls with no KQL queries
54 |     query_list = _sent_dfs_to_kql_query_list(
55 |         detections_df=detections_df[detections_df["query"].notnull()],
56 |         hunting_df=hunting_df[hunting_df["query"].notnull()],
57 |         solutions_df=solutions_df[solutions_df["query"].notnull()],
58 |     )
59 |     return [KqlQuery(**query) for query in query_list]
60 | 
61 | 
62 | def _sent_dfs_to_kql_query_list(detections_df, hunting_df, solutions_df):
63 |     # Select specific columns
64 |     columns = [
65 |         "name",
66 |         "GithubURL",
67 |         "query",
68 |         "description",
69 |         "tactics",
70 |         "relevantTechniques",
71 |     ]
72 |     all_dfs = [detections_df[columns], hunting_df[columns], solutions_df[columns]]
73 |     sentinel_github = pd.concat(all_dfs, ignore_index=True, sort=True)
74 | 
75 |     # rename columns to match the schema
76 |     sentinel_github = sentinel_github.rename(
77 |         columns={
78 |             "GithubURL": "source_path",
79 |             "name": "query_name",
80 |             "relevantTechniques": "techniques",
81 |         },
82 |     )
83 | 
84 |     cols = ["description", "techniques", "tactics"]
85 |     # create new column by merging selected columns into dictionary
86 |     sentinel_github["attributes"] = sentinel_github[cols].to_dict(orient="records")
87 | 
88 |     # select the columns required by the KqlQuery schema
89 |     select_columns = ["source_path", "query_name", "query", "attributes"]
90 | 
91 | 
92 |     # return it as list of dictionaries
93 |     return sentinel_github[select_columns].to_dict(orient="records")
94 | 
95 | 
96 | # ### KQL - Community Github Repos
97 | 
98 | 
99 | def get_community_queries(
100 |     output_dir: Path = _CURR_DIR, config: Union[Path, str] = "repos.yaml"
101 | ):
102 |     """Return KqlQuery list from community repos."""
103 |     # Read yaml config file
104 |     repos = read_config(config)
105 | 
106 |     # Compile list of github urls to download
107 |     repo_urls: List[str] = []
108 |     tmp_dirs: List[str] = []
109 |     for item in repos:
110 |         url = format_repo_url(item["Github"]["repo"], item["Github"]["branch"])
111 |         repo_urls.append(url)
112 |         tmp_dirs.append(
113 |             str(
114 |                 output_dir.joinpath(
115 |                     f"{item['Github']['repo']}-{item['Github']['branch']}"
116 |                 )
117 |             )
118 |         )
119 | 
120 |     # download github urls one by one
121 |     for url in repo_urls:
122 |         download_git_archive(url, output_dir)
123 | 
124 |     txt_queries = _read_community_txt_queries(repos, output_dir)
125 |     md_queries = _read_community_md_queries(repos, output_dir)
126 |     to_remove = tmp_dirs.copy()
127 |     for tmp_dir in to_remove:
128 |         _remove_tmp_folder(tmp_dir)
129 |         tmp_dirs.remove(tmp_dir)
130 |     return [
131 |         query if isinstance(query, KqlQuery) else KqlQuery(**query)
132 |         for query in chain(txt_queries, md_queries)
133 |     ]
134 | 
135 | 
136 | def _read_community_txt_queries(repos, src_path):
137 |     """Parse text files."""
138 |     parsed_txt_queries = []
139 | 
140 |     for item in repos:
141 |         repo_name = item["Github"]["repo"]
142 |         branch_name = item["Github"]["branch"]
143 |         list_of_dict = parse_kql_to_dict(repo_name, branch_name, src_path)
144 |         parsed_txt_queries.extend(list_of_dict)
145 |     # log the parsed query count
146 |     logging.info("Parsed %d queries from text files", len(parsed_txt_queries))
147 |     return parsed_txt_queries
148 | 
149 | 
150 | def _read_community_md_queries(repos, src_path):
151 |     """Parse markdown files."""
152 |     parsed_md_queries = []
153 | 
154 |     for item in repos:
155 |         repo_name = item["Github"]["repo"]
156 |         branch_name = item["Github"]["branch"]
157 |         list_of_dict = parse_markdown_to_dict(repo_name, branch_name, src_path)
158 |         parsed_md_queries.extend(list_of_dict)
159 | 
160 |     logging.info("Parsed %d queries from markdown files", len(parsed_md_queries))
161 |     return parsed_md_queries
162 | 
163 | 
164 | def _remove_tmp_folder(tmp_dir):
165 |     if Path(tmp_dir).is_dir():
166 |         try:
167 |             shutil.rmtree(tmp_dir)
168 |         except Exception as err:  # pylint: disable=broad-except
169 |             logging.exception(
170 |                 "Error trying to remove temporary folder '%s'.", tmp_dir, exc_info=err
171 |             )
172 | 
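173 | # Usage sketch (added for illustration - not part of the original module):
174 | # the two download functions above compose into a single query list, e.g.
175 | #     queries = get_sentinel_queries(Path.cwd())
176 | #     queries += get_community_queries(Path.cwd(), config="repos.yaml")
177 | #     query_df = KqlQuery.kql_list_to_df(queries)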
--------------------------------------------------------------------------------
/src/kql_extract.py:
--------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License. See License.txt in the project root for
4 | # license information.
5 | # --------------------------------------------------------------------------
6 | """Kql extract threading interface with .Net Kqlextract."""
7 | import contextlib
8 | import json
9 | import logging
10 | import queue
11 | import subprocess
12 | import threading
13 | import time
14 | from base64 import b64encode
15 | from pathlib import Path
16 | from typing import Optional
17 | from uuid import uuid4
18 | 
19 | __author__ = "Liam Kirton"
20 | 
21 | 
22 | base_path = Path(__file__).parent
23 | CS_PROJ_PATH = base_path.joinpath("../kqlextraction/KqlExtraction/KqlExtraction.csproj")
24 | 
25 | worker_exit = threading.Event()
26 | worker_queue = queue.Queue()  # type: ignore
27 | worker_results = queue.Queue()  # type: ignore
28 | worker_thread = None
29 | 
30 | # pylint: disable=broad-except
31 | 
32 | _EXTRACT_ARGS = [
33 |     "dotnet",
34 |     "run",
35 |     "-c",
36 |     "Release",
37 |     "--project",
38 |     str(CS_PROJ_PATH),
39 | ]
40 | _SYNTAX_ERROR = "[!]"
41 | 
42 | 
43 | def _worker_thread_proc():
44 |     try:
45 |         kql_extraction = None
46 | 
47 |         while not worker_exit.is_set():
48 |             try:
49 |                 if kql_extraction is not None and kql_extraction.poll() is not None:
50 |                     kql_extraction = None
51 |                 if kql_extraction is None:
52 |                     kql_extraction = subprocess.Popen(
53 |                         _EXTRACT_ARGS,
54 |                         stdin=subprocess.PIPE,
55 |                         stdout=subprocess.PIPE,
56 |                         stderr=subprocess.PIPE,
57 |                     )
58 |             except Exception as subp_ex:
59 |                 logging.exception(
60 |                     "[!] Exception Starting KqlExtraction Process.", exc_info=subp_ex
61 |                 )
62 |                 break
63 |             uuid, kql_extraction_result = "", b""  # defaults for the exception handler
64 |             try:
65 |                 uuid, kql = worker_queue.get(timeout=2.0)
66 |                 kql_extraction.stdin.write(
67 |                     bytes(f"{uuid},", encoding="utf-8")
68 |                     + b64encode(bytes(kql, encoding="utf-8"))
69 |                     + b"\n"
70 |                 )
71 |                 kql_extraction.stdin.flush()
72 | 
73 |                 kql_extraction_result = kql_extraction.stdout.readline()
74 |                 if (
75 |                     str(kql_extraction_result, encoding="utf-8")
76 |                     .strip()
77 |                     .startswith(_SYNTAX_ERROR)
78 |                 ):
79 |                     worker_results.put(_syntax_err_result(uuid))
80 |                 else:
81 |                     worker_results.put(json.loads(kql_extraction_result))
82 |                 # try:
83 |                 #     worker_results.put(json.loads(kql_extraction_result))
84 |                 # except json.JSONDecodeError:
85 |                 #     worker_results.put(_syntax_err_result(uuid))
86 |             except queue.Empty:
87 |                 pass
88 |             except Exception as thread_ex:
89 |                 logging.exception(
90 |                     "[!] Unhandled Exception in 'while not worker_exit.is_set', query_id='%s', \ninput sample: %s",
91 |                     uuid,
92 |                     kql_extraction_result[:200],
93 |                     exc_info=thread_ex,
94 |                 )
95 |                 kql_extraction.kill()
96 | 
97 |         if kql_extraction is not None and kql_extraction.poll() is None:
98 |             kql_extraction.kill()
99 |     except Exception as thread_out_ex:
100 |         logging.exception(
101 |             "[!] Unhandled Exception at 'while not worker_exit.is_set()'",
102 |             exc_info=thread_out_ex,
103 |         )
104 | 
105 | 
106 | def extract_kql(kql_query: str, query_id: Optional[str] = None):
107 |     """Extract kql_properties from Kql query."""
108 |     kql_id = query_id or str(uuid4())
109 |     worker_queue.put((kql_id, kql_query))
110 | 
111 |     with contextlib.suppress(Exception):
112 |         kql_result = {}
113 |         while True:
114 |             kql_result = worker_results.get(timeout=5.0)
115 |             if "Id" in kql_result and kql_result["Id"] == kql_id:
116 |                 break
117 |     return kql_result
118 | 
119 | 
120 | def start():
121 |     """Start extractor worker thread."""
122 |     global worker_thread  # pylint: disable=invalid-name, global-statement
123 |     worker_thread = threading.Thread(target=_worker_thread_proc)
124 |     worker_thread.start()
125 |     logging.info("Started kql extractor thread.")
126 | 
127 | 
128 | def stop():
129 |     """Stop worker thread."""
130 |     worker_exit.set()
131 |     worker_thread.join()
132 |     logging.info("Kql extractor thread stopped.")
133 | 
134 | 
135 | def _syntax_err_result(query_id):
136 |     return {
137 |         "Id": query_id,
138 |         "FunctionCalls": [],
139 |         "Joins": {},
140 |         "Operators": [],
141 |         "Tables": [],
142 |         "Valid_query": False,
143 |     }
144 | 
145 | 
146 | if __name__ == "__main__":
147 |     worker_thread = threading.Thread(target=_worker_thread_proc)
148 |     worker_thread.start()
149 | 
150 |     test_path = base_path.joinpath("test_data")
151 |     print("using", test_path)
152 |     print(len(list(test_path.glob("*.kql"))), "kql files")
153 |     try:
154 |         for file_no, kql_file in enumerate(test_path.glob("*.kql")):
155 |             # kql_file = os.path.join(base_path, "tests", kql_file)
156 |             print(f"[{file_no}], {kql_file.name}")
157 |             print(
158 |                 f"[{file_no}]\n".join(
159 |                     kql_file.read_text(encoding="utf-8").split("\n")[:5]
160 |                 )
161 |             )
162 |             with open(kql_file, "r", encoding="utf-8") as f:
163 |                 kql_text = f.read()
164 | 
165 |             print(f"[{file_no}]", extract_kql(kql_text, query_id=str(file_no)))
166 | 
167 |     except Exception as ex:
168 |         print("[!] Unhandled Exception", ex)
169 | 
170 |     while not worker_queue.empty():
171 |         time.sleep(0.5)
172 | 
173 |     worker_exit.set()
174 |     worker_thread.join()
175 | 
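176 | # Usage sketch (added for illustration - not part of the original module):
177 | # callers start the worker thread once, feed queries, then stop it. Assumes
178 | # the .NET KqlExtraction project builds ('dotnet run' is spawned on demand).
179 | #     start()
180 | #     kql_props = extract_kql("SecurityAlert | take 1")
181 | #     stop()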
--------------------------------------------------------------------------------
/src/kql_file_parser.py:
--------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License. See License.txt in the project root for
4 | # license information.
5 | # --------------------------------------------------------------------------
6 | """Query download and parsing functions."""
7 | 
8 | import glob
9 | import io
10 | import logging
11 | import urllib.parse
12 | import warnings
13 | import zipfile
14 | from pathlib import Path
15 | from typing import List
16 | 
17 | import pandas as pd
18 | import requests
19 | import yaml
20 | from pandas import json_normalize
21 | from requests.exceptions import RequestException
22 | from tqdm.auto import tqdm
23 | 
24 | from .kql_query import KqlQuery
25 | 
26 | __author__ = "Ashwin Patil, Jannie Li"
27 | 
28 | 
29 | def read_config(filename):
30 |     with open(filename, "r", encoding="utf-8") as yamlfile:
31 |         data = yaml.safe_load(yamlfile)
32 |     return data
33 | 
34 | 
35 | def format_repo_url(repo_name, branch_name):
36 |     return f"https://github.com/{repo_name}/archive/{branch_name}.zip"
37 | 
38 | 
39 | def download_git_archive(git_url, output_dir):
40 |     logging.info("Downloading %s, may take a few minutes..", git_url)
41 |     try:
42 |         r = requests.get(git_url, timeout=600)
43 |         r.raise_for_status()  # surface HTTP errors rather than unzipping an error page
44 |         repo_zip = io.BytesIO(r.content)
45 |         with zipfile.ZipFile(repo_zip, mode="r") as archive:
46 |             for file in tqdm(archive.namelist()):
47 |                 archive.extract(file, path=output_dir)
48 |         logging.info("Downloaded and Extracted Files successfully")
49 |     except RequestException as http_err:
50 |         warnings.warn(f"Error occurred trying to download from Github: {http_err}")
51 | 
52 | 
53 | def get_sentinel_queries_from_github(git_url, outputdir):
54 |     logging.info("Downloading from Azure Sentinel Github, may take 2-3 minutes..")
55 |     try:
56 |         r = requests.get(git_url, timeout=600)
57 |         r.raise_for_status()  # surface HTTP errors rather than unzipping an error page
58 |         repo_zip = io.BytesIO(r.content)
59 |         with zipfile.ZipFile(repo_zip, mode="r") as archive:
60 |             # Only extract Detections and Hunting Queries Folder
61 |             logging.info("Extracting files..")
62 |             for file in tqdm(archive.namelist()):
63 |                 if file.startswith(
64 |                     (
65 |                         "Azure-Sentinel-master/Detections/",
66 |                         "Azure-Sentinel-master/Hunting Queries/",
67 |                         "Azure-Sentinel-master/Solutions/",
68 |                     )
69 |                 ) and file.endswith(".yaml"):
70 |                     archive.extract(file, path=outputdir)
71 |         logging.info("Downloaded and Extracted Files successfully")
72 |     except RequestException as http_err:
73 |         warnings.warn(f"Error occurred trying to download from Github: {http_err}")
74 | 
75 | 
76 | def parse_yaml(parent_dir, child_dir):
77 |     sentinel_repourl = "https://github.com/Azure/Azure-Sentinel/blob/master"
78 |     bad_yamls = [
79 |         (
80 |             "/home/jovyan/work/Hackathon/kql-query-store/dev-notebooks/"
81 |             "Azure-Sentinel-master/Hunting Queries/Microsoft 365 Defender"
82 |             "/Device Inventory/Find Software By Name and Version.yaml"
83 |         )
84 |     ]
85 |     # Collect list of files recursively under a folder
86 |     yaml_queries = glob.glob(f"{parent_dir}/{child_dir}/**/*.yaml", recursive=True)
87 |     yaml_queries = [query for query in yaml_queries if query not in bad_yamls]
88 | 
89 |     frames: List[pd.DataFrame] = []
90 | 
91 |     # Recursively load yaml Files and append to dataframe
92 |     logging.info("Parsing yaml queries..")
93 |     for query in tqdm(yaml_queries):
94 |         with open(query, "r", encoding="utf-8", errors="ignore") as file_stream:
95 |             try:
96 |                 parsed_yaml_df = json_normalize(yaml.safe_load(file_stream))
97 |             except Exception as err:  # pylint: disable=broad-except
98 |                 logging.exception(
99 |                     "Exception parsing yaml_query %s", query, exc_info=err
100 |                 )
101 |                 continue
102 |             parsed_yaml_df["GithubURL"] = urllib.parse.quote(
103 |                 query.replace(parent_dir, sentinel_repourl), safe=":/"
104 |             )
105 |             # #URL encode
106 |             # parsed_yaml_df["GithubURL"] = urllib.parse.quote(parsed_yaml_df["GithubURL"], safe=':/')
107 |             # parsed_yaml_df = parsed_yaml_df[columns]
108 |             frames.append(parsed_yaml_df)
109 | 
110 |     return pd.concat(frames, ignore_index=True, sort=True)
111 | 
112 | 
113 | def parse_kql_to_dict(repo_name, branch_name, src_path):
114 |     parent_dir = Path(src_path).joinpath(f"{repo_name.split('/')[-1]}-{branch_name}")
115 |     kql_files = glob.glob(f"{parent_dir}/**/*.kql", recursive=True)
116 | 
117 |     git_repo_url = f"https://github.com/{repo_name}/tree/main"
118 | 
119 |     list_of_kql_files_dict = []
120 |     logging.info("Parsing queries..")
121 |     for file in tqdm(kql_files):
122 |         with open(file, "r", encoding="utf-8", errors="ignore") as f:
123 |             kql_query = KqlQuery(
124 |                 query=f.read(),
125 |                 source_path=urllib.parse.quote(
126 |                     file.replace(str(parent_dir), git_repo_url), safe=":/"
127 |                 ),
128 |                 query_name=Path(file).stem,
129 |                 source_type="text",
130 |                 attributes={},
131 |             )
132 |             list_of_kql_files_dict.append(kql_query)
133 | 
134 |     return list_of_kql_files_dict
135 | 
136 | 
137 | def parse_markdown_to_dict(repo_name, branch_name, src_path):
138 |     parent_dir = Path(src_path).joinpath(f"{repo_name.split('/')[-1]}-{branch_name}")
139 |     md_files = glob.glob(f"{parent_dir}/**/*.md", recursive=True)
140 |     logging.info(
141 |         "Processing %d markdown files from repo: %s",
142 |         len(md_files),
143 |         repo_name,
144 |     )
145 |     git_repo_url = f"https://github.com/{repo_name}/tree/main"
146 | 
147 |     # src_path_list = []
148 |     logging.info("Parsing markdown files..")
149 |     kql_query_list: List[KqlQuery] = []
150 |     for file in tqdm(md_files):
151 |         file_path = Path(file)
152 |         lines = file_path.read_text(encoding="utf-8").split("\n")
153 | 
154 |         in_kql = False
155 |         kql_text = []
156 |         last_header = None
157 |         context = []
158 |         qry_index = 0
159 |         for line in lines:
160 |             if line.startswith("```kql"):
161 |                 in_kql = True
162 |                 continue
163 |             if in_kql and line.strip() == "```":
164 |                 kql_query_list.append(
165 |                     KqlQuery(
166 |                         query="\n".join(kql_text),
167 |                         source_path=urllib.parse.quote(
168 |                             str(file_path).replace(str(parent_dir), git_repo_url),
169 |                             safe=":/",
170 |                         ),
171 |                         source_type="markdown",
172 |                         source_index=qry_index,
173 |                         query_name=last_header or f"{file_path.stem}_{qry_index}",
174 |                         context="\n".join(context[-10:]),
175 |                     )
176 |                 )
177 |                 qry_index += 1
178 |                 in_kql = False
179 |                 kql_text = []
180 |                 last_header = None
181 |                 context = []
182 |                 continue
183 |             if not in_kql and line.startswith("#"):
184 |                 last_header = line
185 |             if in_kql:
186 |                 kql_text.append(line)
187 |             else:
188 |                 context.append(line)
189 | 
190 |     # ct = 0
191 |     # kql = False
192 |     # kql_collect = []
193 |     # title_collect = []
194 |     # cur_kql = []
195 |     # title = "n/a"
196 |     # while ct < len(lines):
197 |     #     if kql:
198 |     #         cur_kql.append(lines[ct])
199 |     #     if lines[ct].startswith("#") and lines[ct + 2] == "```kql":
200 |     #         kql = True
201 |     #         title = lines[ct]
202 |     #     elif lines[ct] == "```kql":
203 |     #         kql = True
204 |     #     elif lines[ct] == "```":
205 |     #         kql = False
206 |     #         cur_kql = "\n".join(cur_kql)
207 |     #         kql_collect.append(cur_kql)
208 |     #         title_collect.append(title)
209 |     #         title = "n/a"
210 |     #         cur_kql = []
211 |     #     ct += 1
212 |     # src_path = urllib.parse.quote(
213 |     #     str(file_path).replace(str(parent_dir), git_repo_url), safe=":/"
214 |     # )
215 |     # src_path_list.append(src_path)
216 | 
217 |     # kql_query = KqlQuery(
218 |     #     query_name=title_collect,
219 |     #     query=kql_collect,
220 |     #     source_path=src_path_list,
221 |     #     )
222 |     # df = pd.concat([df, test_df])
223 | 
224 |     return kql_query_list
225 | 
--------------------------------------------------------------------------------
/src/kql_query.py:
--------------------------------------------------------------------------------
1 | # -------------------------------------------------------------------------
2 | # Copyright (c) Microsoft Corporation. All rights reserved.
3 | # Licensed under the MIT License. See License.txt in the project root for
4 | # license information.
5 | # --------------------------------------------------------------------------
6 | """KqlQuery data class."""
7 | import hashlib
8 | import json
9 | import re
10 | import uuid
11 | from dataclasses import asdict, dataclass, field, fields
12 | from typing import Any, Dict, List, Literal, Optional
13 | 
14 | import pandas as pd
15 | 
16 | __author__ = "Ian Hellen"
17 | 
18 | 
19 | _SOURCE_TYPES = ["text", "markdown", "sentinel_yaml", "api", "other"]
20 | SourceType = Literal["text", "markdown", "sentinel_yaml", "api", "other"]
21 | _REPO_NAME = re.compile(r"https://github\.com/(?P<name>[^/]+/[^/]+)/.*", re.IGNORECASE)
22 | 
23 | 
24 | def _uuid_str():
25 |     return str(uuid.uuid4())
26 | 
27 | 
28 | @dataclass
29 | class KqlQuery:
30 |     """
31 |     Data format for KqlQuery record.
32 | 
33 |     Attributes
34 |     ----------
35 |     source_path : str
36 |         The path to the original file or API identifier.
37 |     query : str
38 |         The raw query string
39 |     source_type : SourceType, optional
40 |         String - the source file/data type. Valid types are:
41 |         text, markdown, sentinel_yaml, api, other
42 |     source_index : int, optional
43 |         The index (0-based) if the query is one of several in the
44 |         file pointed to by source_path. The default is 0.
45 |     query_name : Optional[str]
46 |         The name of the query. If None this will be derived from
47 |         the last element of source_path
48 |     attributes: Dict[str, Any], optional
49 |         Dictionary of any metadata attributes read from the source
50 |         file.
51 |     kql_properties: Dict[str, Any], optional
52 |         Dictionary of properties derived from the KQL query
53 |     query_id: Optional[str], optional
54 |         UUID used to identify the query
55 |     query_hash: str, optional
56 |         SHA256 hash of the query text
57 |     query_version: int, optional
58 |         Query version, not currently used. Default is 0
59 | 
60 |     Examples
61 |     --------
62 |     Create a KqlQuery instance
63 |     >>> kql = KqlQuery(
64 |     ...     source_path="https://github.com/a/b/file.kql",
65 |     ...     query="SecurityAlert | take 1"
66 |     ... )
67 | 
68 |     Create a KqlQuery instance from a dict
69 |     >>> attribs = {
70 |     ...     "source_path": "https://github.com/a/b/file.kql",
71 |     ...     "query": "SecurityAlert | take 1",
72 |     ... }
73 |     >>> kql = KqlQuery(**attribs)
74 | 
75 |     Default representation
76 |     >>> kql
77 |     KqlQuery(source_path='https://github.com/a/b/file.kql', query='SecurityAlert... query_version=0)
78 | 
79 |     As a dict
80 |     >>> print(kql.asdict())
81 |     {'source_path': 'https://github.com/a/b/file.kql', 'query': 'SecurityAlert... 'query_version': 0}
82 | 
83 |     As JSON
84 |     >>> print(kql.to_json())
85 |     {"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert... "query_version": 0}
86 | 
87 |     Class method to convert a list of KqlQuery instances to a list of dicts
88 |     >>> KqlQuery.kql_list_to_pylist([kql, kql])
89 | 
90 |     Class method to convert a list of KqlQuery instances to JSON
91 |     >>> KqlQuery.kql_list_to_json([kql, kql])
92 |     '[{"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert... 
"query_version": 0}]' 93 | 94 | Class method to convert list of KqlQuery instances to a DataFrame 95 | """ 96 | 97 | source_path: str 98 | query: str 99 | source_type: SourceType = "text" 100 | source_index: int = 0 101 | repo_name: Optional[str] = None 102 | query_name: Optional[str] = None 103 | context: Optional[str] = None 104 | attributes: Dict[str, Any] = field(default_factory=dict) 105 | kql_properties: Dict[str, Any] = field(default_factory=dict) 106 | query_id: str = field(default_factory=_uuid_str) 107 | query_hash: int = 0 108 | query_version: int = 0 109 | 110 | def __post_init__(self): 111 | """Run post""" 112 | if self.query_name is None and self.source_path is not None: 113 | self.query_name = self.source_path.rsplit("/", maxsplit=1)[-1] 114 | if self.query: 115 | self.query_hash = hashlib.sha256( 116 | bytes(self.query, encoding="utf-8"), 117 | # usedforsecurity=False 118 | ).hexdigest() 119 | if self.repo_name is None and self.source_path is not None: 120 | match = _REPO_NAME.match(self.source_path) 121 | if match: 122 | self.repo_name = match["name"] 123 | 124 | def asdict(self): 125 | """Return a dictionary of attributes.""" 126 | return asdict(self) 127 | 128 | def to_json(self): 129 | """Return JSON representation of attributes.""" 130 | return json.dumps(self.asdict()) 131 | 132 | # helper methods and properties 133 | @property 134 | def source_types(self): 135 | """Return list of acceptable source_types.""" 136 | del self 137 | return _SOURCE_TYPES 138 | 139 | @classmethod 140 | def field_names(cls) -> List[str]: 141 | """Return list of fields.""" 142 | return [field.name for field in fields(cls)] 143 | 144 | @staticmethod 145 | def kql_list_to_pylist(kql_queries: List["KqlQuery"]): 146 | """Return a list of Python dicts from a list of KqlQuery instances.""" 147 | return [kql.asdict() for kql in kql_queries] 148 | 149 | @classmethod 150 | def kql_list_to_json(cls, kql_queries: List["KqlQuery"]): 151 | """Return JSON from a list of KqlQuery instances.""" 152 | return json.dumps(cls.kql_list_to_pylist(kql_queries)) 153 | 154 | @classmethod 155 | def kql_list_to_df(cls, kql_queries: List["KqlQuery"]): 156 | """Return a pandas DataFrame from a list of KqlQuery instances.""" 157 | return pd.DataFrame(cls.kql_list_to_pylist(kql_queries)) 158 | -------------------------------------------------------------------------------- /src/kqlextraction/tests/test1.kql: -------------------------------------------------------------------------------- 1 | // https://github.com/Azure/Azure-Sentinel/blob/master/Hunting%20Queries/SigninLogs/UserLoginIPAddressTeleportation.yaml 2 | 3 | let windowTime = 20min / 2; //Window to lookup anomalous logins within 4 | let excludeKnownVPN = dynamic(['127.0.0.1', '0.0.0.0']); //Known VPN IP addresses to exclude 5 | SigninLogs 6 | | where ConditionalAccessStatus =~ "success" 7 | | extend country = LocationDetails['countryOrRegion'] 8 | | where country != "" 9 | | summarize count() by tostring(country) 10 | | join ( 11 | //Get the total number of logins from any country and join it to the previous count in a single table 12 | SigninLogs 13 | | where ConditionalAccessStatus =~ "success" 14 | | extend country = LocationDetails['countryOrRegion'] 15 | | where country != "" 16 | | summarize count(), make_list(tostring(country)) 17 | | mv-expand list_country 18 | | extend country = tostring(list_country) 19 | ) on country 20 | | summarize by country, count_, count_1 21 | //Now calculate each countries prevalence within login events 22 | | extend prevalence = 
toreal(count_) / toreal(count_1) * 100 23 | | project-away count_1 24 | | where prevalence < 0.01 25 | | join kind=rightsemi( 26 | SigninLogs 27 | //Enable to limit to o365 exchange logins 28 | //| where AppDisplayName =~ "Office 365 Exchange Online" 29 | | where ConditionalAccessStatus =~ "success" 30 | | where IPAddress != "" 31 | | extend country = tostring(LocationDetails['countryOrRegion']) 32 | | summarize count() by TimeGenerated, UserPrincipalName, country, IPAddress 33 | ) on country 34 | | join kind=leftouter ( 35 | SigninLogs 36 | //Enable to limit to o365 exchange logins 37 | //| where AppDisplayName =~ "Office 365 Exchange Online" 38 | | where ConditionalAccessStatus =~ "success" 39 | | extend country = tostring(LocationDetails['countryOrRegion']) 40 | | summarize by TimeGenerated, IPAddress, UserPrincipalName, country 41 | ) on UserPrincipalName 42 | | where IPAddress != IPAddress1 and country != country1 43 | | extend WindowStart = TimeGenerated1 - windowTime 44 | | extend WindowEnd = TimeGenerated1 + windowTime 45 | | where TimeGenerated between (WindowStart .. WindowEnd) 46 | | project Account=UserPrincipalName, AnomalousIP=IPAddress, AnomalousLoginTime=TimeGenerated, AnomalousCountry=country, OtherLoginIP=IPAddress1, OtherLoginCountry=country1, OtherLoginWindowStart=WindowStart, OtherLoginWindowEnd=WindowEnd 47 | | where AnomalousIP !in(excludeKnownVPN) and OtherLoginIP !in(excludeKnownVPN) 48 | | extend timestamp = AnomalousLoginTime, AccountCustomEntity = Account, IPCustomEntity = AnomalousIP -------------------------------------------------------------------------------- /src/kqlextraction/tests/test2.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | where A == 1 3 | | summarize count() by B -------------------------------------------------------------------------------- /src/kqlextraction/tests/test3.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | mv-expand Z 3 | | join kind=leftsemi hint.remote=true Bar on T 4 | | join kind=leftsemi ( 5 | Baz 6 | | where X > 5 7 | | project R 8 | ) on R -------------------------------------------------------------------------------- /src/kqlextraction/tests/test4.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | lookup (Bar) on T -------------------------------------------------------------------------------- /src/kqlextraction/tests/test5.kql: -------------------------------------------------------------------------------- 1 | union Foo, Bar, Baz -------------------------------------------------------------------------------- /src/repos.yaml: -------------------------------------------------------------------------------- 1 | - Github: 2 | branch: main 3 | repo: reprise99/Sentinel-Queries 4 | - Github: 5 | branch: main 6 | repo: ugurkocde/KQL_Intune 7 | - Github: 8 | branch: master 9 | repo: alexverboon/MDATP 10 | - Github: 11 | branch: master 12 | repo: eshlomo1/Microsoft-Sentinel-4-SecOps 13 | - Github: 14 | branch: master 15 | repo: FalconForceTeam/FalconFriday 16 | - Github: 17 | branch: master 18 | repo: Kaidja/Microsoft-Sentinel 19 | - Github: 20 | branch: main 21 | repo: Cyb3r-Monk/Threat-Hunting-and-Detection 22 | - Github: 23 | branch: main 24 | repo: rod-trent/MustLearnKQL -------------------------------------------------------------------------------- /src/test_data/test1.kql: -------------------------------------------------------------------------------- 1 | // 
https://github.com/Azure/Azure-Sentinel/blob/master/Hunting%20Queries/SigninLogs/UserLoginIPAddressTeleportation.yaml 2 | 3 | let windowTime = 20min / 2; //Window to lookup anomalous logins within 4 | let excludeKnownVPN = dynamic(['127.0.0.1', '0.0.0.0']); //Known VPN IP addresses to exclude 5 | SigninLogs 6 | | where ConditionalAccessStatus =~ "success" 7 | | extend country = LocationDetails['countryOrRegion'] 8 | | where country != "" 9 | | summarize count() by tostring(country) 10 | | join ( 11 | //Get the total number of logins from any country and join it to the previous count in a single table 12 | SigninLogs 13 | | where ConditionalAccessStatus =~ "success" 14 | | extend country = LocationDetails['countryOrRegion'] 15 | | where country != "" 16 | | summarize count(), make_list(tostring(country)) 17 | | mv-expand list_country 18 | | extend country = tostring(list_country) 19 | ) on country 20 | | summarize by country, count_, count_1 21 | //Now calculate each countries prevalence within login events 22 | | extend prevalence = toreal(count_) / toreal(count_1) * 100 23 | | project-away count_1 24 | | where prevalence < 0.01 25 | | join kind=rightsemi( 26 | SigninLogs 27 | //Enable to limit to o365 exchange logins 28 | //| where AppDisplayName =~ "Office 365 Exchange Online" 29 | | where ConditionalAccessStatus =~ "success" 30 | | where IPAddress != "" 31 | | extend country = tostring(LocationDetails['countryOrRegion']) 32 | | summarize count() by TimeGenerated, UserPrincipalName, country, IPAddress 33 | ) on country 34 | | join kind=leftouter ( 35 | SigninLogs 36 | //Enable to limit to o365 exchange logins 37 | //| where AppDisplayName =~ "Office 365 Exchange Online" 38 | | where ConditionalAccessStatus =~ "success" 39 | | extend country = tostring(LocationDetails['countryOrRegion']) 40 | | summarize by TimeGenerated, IPAddress, UserPrincipalName, country 41 | ) on UserPrincipalName 42 | | where IPAddress != IPAddress1 and country != country1 43 | | extend WindowStart = TimeGenerated1 - windowTime 44 | | extend WindowEnd = TimeGenerated1 + windowTime 45 | | where TimeGenerated between (WindowStart .. 
WindowEnd) 46 | | project Account=UserPrincipalName, AnomalousIP=IPAddress, AnomalousLoginTime=TimeGenerated, AnomalousCountry=country, OtherLoginIP=IPAddress1, OtherLoginCountry=country1, OtherLoginWindowStart=WindowStart, OtherLoginWindowEnd=WindowEnd 47 | | where AnomalousIP !in(excludeKnownVPN) and OtherLoginIP !in(excludeKnownVPN) 48 | | extend timestamp = AnomalousLoginTime, AccountCustomEntity = Account, IPCustomEntity = AnomalousIP -------------------------------------------------------------------------------- /src/test_data/test2.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | where A == 1 3 | | summarize count() by B -------------------------------------------------------------------------------- /src/test_data/test3.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | mv-expand Z 3 | | join kind=leftsemi hint.remote=true Bar on T 4 | | join kind=leftsemi ( 5 | Baz 6 | | where X > 5 7 | | project R 8 | ) on R -------------------------------------------------------------------------------- /src/test_data/test4.kql: -------------------------------------------------------------------------------- 1 | Foo 2 | | lookup (Bar) on T -------------------------------------------------------------------------------- /src/test_data/test5.kql: -------------------------------------------------------------------------------- 1 | union Foo, Bar, Baz -------------------------------------------------------------------------------- /src/test_data/test_10.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CDefender%20for%20Cloud%20Apps%5CDCA-PivotTableAdminActions.kql", 4 | "query": "//Create a pivot table of all actions in Defender for Cloud Apps by your privileged users over the last 7 days\n//Lookup the IdentityInfo table for any users holding a privileged role\n\n//Data connector required for this query - M365 Defender - CloudAppEvents\n\n//Microsoft Sentinel query\nlet privusers=\n IdentityInfo\n | where TimeGenerated > ago(21d)\n | summarize arg_max(TimeGenerated, *) by AccountUPN\n //Add any roles that you are interested in auditing\n | where AssignedRoles has_any (\"Global Administrator\", \"Security Administrator\", \"SharePoint Administrator\")\n | distinct AccountUPN;\nCloudAppEvents\n| where TimeGenerated > ago(7d)\n| extend Operation = tostring(RawEventData.Operation)\n| extend UserId = tostring(RawEventData.UserId)\n| extend Workload = tostring(RawEventData.Workload)\n//Create a new column that adds workload and operation together to make the events more readable\n| extend Activity = strcat(Workload, \" - \", Operation)\n| where UserId in~ (privusers)\n//Create pivot table of all actions by each user\n| evaluate pivot(Activity, count(), UserId)\n\n//Advanced hunting query\n\n//Data connector required for this query - Advanced Hunting license\n\nCloudAppEvents\n| where Timestamp > ago(7d)\n| extend Operation = tostring(RawEventData.Operation)\n| extend UserId = tostring(RawEventData.UserId)\n| extend Workload = tostring(RawEventData.Workload)\n//Advanced hunting doesn't retain role information about users, but you can add a list of users in manually to create a table\n| where UserId in~ (\"admin1@domain.com\", \"admin2@domain.com\")\n//Create a new column that adds workload and operation together to make the events more readable\n| extend Activity = strcat(Workload, 
\" - \", Operation)\n//Create pivot table of all actions by each user\n| evaluate pivot(Activity, count(), UserId)", 5 | "source_type": "text", 6 | "source_index": 0, 7 | "query_name": "DCA-PivotTableAdminActions", 8 | "context": null, 9 | "attributes": {}, 10 | "kql_properties": {}, 11 | "query_id": "e0ebd9f6-aab9-4928-b34f-5c8d089b715f", 12 | "query_hash": "d8eb17f554e939949114f13fb911adb86178861b2a775f952afd9e72b1b6a35b", 13 | "query_version": 0 14 | }, 15 | { 16 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Activity%5CAzure-ResourceLockAddedorRemoved.kql", 17 | "query": "//Detect when a resource lock is added or removed from an Azure resource\n\n//Data connector required for this query - Azure Activity \n\nAzureActivity\n| where OperationNameValue in (\"MICROSOFT.AUTHORIZATION/LOCKS/WRITE\", \"MICROSOFT.AUTHORIZATION/LOCKS/DELETE\")\n| where ActivityStatusValue == \"Success\"\n| extend Activity = case(OperationNameValue == \"MICROSOFT.AUTHORIZATION/LOCKS/WRITE\", strcat(\"Resource Lock Added\"),\n OperationNameValue == \"MICROSOFT.AUTHORIZATION/LOCKS/DELETE\", strcat(\"Resource Lock Removed\"),\n \"unknown\")\n| extend ResourceGroup = tostring(parse_json(Properties).resourceGroup)\n| extend AzureResource = tostring(parse_json(Properties).resourceProviderValue)\n| extend x = tostring(parse_json(Properties).resource)\n| parse x with ResourceName \"/\" *\n| parse x with * \"microsoft.authorization/\" LockName\n| project\n TimeGenerated,\n Activity,\n ResourceName,\n ['Azure Resource']=AzureResource,\n ['Azure Subscription Id']=SubscriptionId,\n ['Azure Resource Group']=ResourceGroup,\n LockName", 18 | "source_type": "text", 19 | "source_index": 0, 20 | "query_name": "Azure-ResourceLockAddedorRemoved", 21 | "context": null, 22 | "attributes": {}, 23 | "kql_properties": {}, 24 | "query_id": "b0fba90e-6fa8-4124-bcc1-f076321d5eb3", 25 | "query_hash": "88b82e644765270392b56b57febf143a0ca39d784457ade747f97bbe7454d66f", 26 | "query_version": 0 27 | }, 28 | { 29 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CREADME.md", 30 | "query": "let ExampleText = datatable(TestData:string)\n[\n'Name=Reprise99,UPNSuffix=testdomain.com,AadTenantId=345c1234-a833-43e4-1d34-123440a5bcdd1,AadUserId=cf6f2df6-b754-48dc-b7bc-c8339caf211,DisplayName=Test User,Type=account',\n'Name=Reprise103,UPNSuffix=testdomain.com,AadTenantId=331c1234-a841-43e5-1d31-12220a5bcee1,AadUserId=cf6f2df6-b754-48dc-b7bc-c8339caf211,DisplayName=Test User 2,Type=account'\n]\n;\nExampleText\n| extend Name = split(TestData,',')[0]\n| extend DomainSuffix = split(TestData,',')[1]\n| extend AzureADTenantId = split(TestData,',')[2]\n| extend AzureADUserId = split(TestData,',')[3]\n| extend DisplayName = split(TestData,',')[4]\n| extend AccountType = split(TestData,',')[5]\n| project Name, DomainSuffix, AzureADTenantId, AzureADUserId, DisplayName, AccountType\n| where Name contains \"Reprise99\"", 31 | "source_type": "markdown", 32 | "source_index": 59, 33 | "query_name": "README_59", 34 | "context": "\nIf we know our data location within the string then we can split it directly into named columns.\n\n![Split 2](https://github.com/reprise99/Sentinel-Queries/blob/main/Diagrams/split2.png?raw=true)\n\nOnce we have split our data, we can query it as though it was structured from the outset. 
So if we add a second record to our data, then query on specifc matches we will find what we are after.\n", 35 | "attributes": {}, 36 | "kql_properties": {}, 37 | "query_id": "d7bdc719-db17-4534-98d7-46a6dbe78ee8", 38 | "query_hash": "e04587d898f723719a9d361f3b5d43eeeed92acadce7d1f7e37a6b309091907c", 39 | "query_version": 0 40 | }, 41 | { 42 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5COffice%20365%5COfficeActivity-SummarizeTeamsCreatedDeleted.kql", 43 | "query": "//Create a weekly summary of Teams created and deleted in your Office 365 tenant\n\n//Data connector required for this query - Office 365\n\nOfficeActivity\n| where TimeGenerated > ago(30d)\n| where Operation in (\"TeamCreated\", \"TeamDeleted\")\n| summarize\n ['Count of Teams Created']=dcountif(TeamName, Operation == \"TeamCreated\"),\n ['List of Teams Created']=make_set_if(TeamName, Operation == \"TeamCreated\"),\n ['Count of Teams Deleted']=dcountif(TeamName, Operation == \"TeamDeleted\"),\n ['List of Teams Deleted']=make_set_if(TeamName, Operation == \"TeamDeleted\")\n by Week=startofweek(TimeGenerated)\n| sort by Week desc ", 44 | "source_type": "text", 45 | "source_index": 0, 46 | "query_name": "OfficeActivity-SummarizeTeamsCreatedDeleted", 47 | "context": null, 48 | "attributes": {}, 49 | "kql_properties": {}, 50 | "query_id": "fe16fc73-a049-461e-9500-1d7cb9007290", 51 | "query_hash": "2a88adceb83744b56d1974dae10c42098f44da070f891d5cf67ec8b9a9a9630d", 52 | "query_version": 0 53 | }, 54 | { 55 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Active%20Directory%5CIdentity-SummarizeGuestDomainbyType.kql", 56 | "query": "//Summarize guest activity by external Azure AD guests (those that belong to another Azure AD tenant) vs External Guests (such as Gmail) to your tenant\n//For each domain list the total number of signins and distinct user \n\n//Data connector required for this query - Azure Active Directory - Signin Logs\n\nSigninLogs\n| where TimeGenerated > ago (30d)\n| where UserType == \"Guest\"\n| where ResultType == 0\n| extend ['Guest Domain'] = tostring(split(UserPrincipalName, \"@\")[1])\n| summarize\n ['External Azure AD Guest Logins']=countif(ResourceTenantId != HomeTenantId),\n ['External Azure AD Guest Distinct Users']=dcountif(UserPrincipalName, ResourceTenantId != HomeTenantId),\n ['External Guest Logins']=countif(ResourceTenantId == HomeTenantId),\n ['External Guest Distinct Users']=dcountif(UserPrincipalName, ResourceTenantId == HomeTenantId)\n by ['Guest Domain']\n", 57 | "source_type": "text", 58 | "source_index": 0, 59 | "query_name": "Identity-SummarizeGuestDomainbyType", 60 | "context": null, 61 | "attributes": {}, 62 | "kql_properties": {}, 63 | "query_id": "11c1b9ca-6c1c-4bdc-8eaa-a3facc3d3ed6", 64 | "query_hash": "4a44ca5e719fb084262265a3c617ec7d8f023a19f669ee61b79aa919b28e5fec", 65 | "query_version": 0 66 | }, 67 | { 68 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20AD%20Abuse%20Detection%5CREADME.md", 69 | "query": "AuditLogs\n| where OperationName == \"Add owner to service principal\"\n| extend ['Actor IP Address'] = tostring(parse_json(tostring(InitiatedBy.user)).ipAddress)\n| extend Actor = tostring(parse_json(tostring(InitiatedBy.user)).userPrincipalName)\n| extend ['Service Principal Name'] = tostring(parse_json(tostring(parse_json(tostring(TargetResources[0].modifiedProperties))[1].newValue)))\n| extend ['Service Principal ObjectId'] = tostring(TargetResources[1].id)\n| extend Target = 
tostring(TargetResources[0].userPrincipalName)\n| where TargetResources[0].type == \"User\"\n| where isnotempty(Actor)\n| project TimeGenerated, OperationName, Actor, ['Actor IP Address'], Target, ['Service Principal Name'], ['Service Principal ObjectId']", 70 | "source_type": "markdown", 71 | "source_index": 16, 72 | "query_name": "### Detection Query (User as actor, user as target)", 73 | "context": "\n\n## BARK function - Test-MGAddSelfAsOwnerOfSP \n\nOwners of service principals can change settings on that object, for instance they can add or remove users who have access to sign into that service principal. They can change SSO settings and change permissions on the service principal.\n\nFor this abuse, the actor can be either a user or a service principal. The target can also be either a user or a service principal.\n\n### Detection Query (User as actor, user as target)\n", 74 | "attributes": {}, 75 | "kql_properties": {}, 76 | "query_id": "47bc03e9-d2ce-427a-a799-d0edcec62cb1", 77 | "query_hash": "d7cc7b54dfe0e752885b2d833672c587df342564ea955e62e785d5edbc66b869", 78 | "query_version": 0 79 | }, 80 | { 81 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Active%20Directory%5CIdentity-SummarizeConditionalAccessPoliciesfailures.kql", 82 | "query": "//Create a summary showing which of your Azure AD conditional access policies are preventing the most signins and for what reasons\n\n//Data connector required for this query - Azure Active Directory - Signin Logs\n\nSigninLogs\n| where TimeGenerated > ago (7d)\n| project TimeGenerated, ConditionalAccessPolicies, ResultType, ResultDescription\n| mv-expand ConditionalAccessPolicies\n| extend CAResult = tostring(ConditionalAccessPolicies.result)\n| extend ['Conditional Access Policy Name'] = tostring(ConditionalAccessPolicies.displayName)\n| where CAResult == \"failure\"\n| summarize ['Count of Failures']=count()by ['Conditional Access Policy Name'], ResultType, ResultDescription\n| sort by ['Count of Failures'] desc ", 83 | "source_type": "text", 84 | "source_index": 0, 85 | "query_name": "Identity-SummarizeConditionalAccessPoliciesfailures", 86 | "context": null, 87 | "attributes": {}, 88 | "kql_properties": {}, 89 | "query_id": "7c8e52c0-def4-4751-a8e9-671eebc20296", 90 | "query_hash": "1007d7955776d29deb1cfd7ff8ad3ea5ea5e021dd2863238da224c175b376ebd", 91 | "query_version": 0 92 | }, 93 | { 94 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CDefender%20for%20Endpoint%5CDevice-DetectCertUtilConnectingExternally.kql", 95 | "query": "//Detects when certutil is used to connect to a public IP. 
This could indicate abuse of cert util, see - https://www.avira.com/en/blog/certutil-abused-by-attackers-to-spread-threats\n\n//Data connector required for this query - M365 Defender - Device* tables\n\n//Microsoft Sentinel query\nDeviceNetworkEvents\n| where TimeGenerated > ago (7d)\n| project\n TimeGenerated,\n DeviceName,\n InitiatingProcessAccountName,\n InitiatingProcessCommandLine,\n LocalIPType,\n LocalIP,\n RemoteIPType,\n RemoteIP,\n RemoteUrl,\n RemotePort\n| where InitiatingProcessCommandLine contains \"certutil\"\n| where RemoteIPType == \"Public\"\n\n//Advanced Hunting query\n\n//Data connector required for this query - Advanced Hunting license\n\nDeviceNetworkEvents\n| where TimeGenerated > ago (7d)\n| project\n TimeGenerated,\n DeviceName,\n InitiatingProcessAccountName,\n InitiatingProcessCommandLine,\n LocalIPType,\n LocalIP,\n RemoteIPType,\n RemoteIP,\n RemoteUrl,\n RemotePort\n| where InitiatingProcessCommandLine contains \"certutil\"\n| where RemoteIPType == \"Public\"", 96 | "source_type": "text", 97 | "source_index": 0, 98 | "query_name": "Device-DetectCertUtilConnectingExternally", 99 | "context": null, 100 | "attributes": {}, 101 | "kql_properties": {}, 102 | "query_id": "4eb1a989-83d2-44a2-9f6c-f4dfb1f31ee6", 103 | "query_hash": "c76da23b26d172981b5d324232edae919c14585c8131640566ad5fa7cf6bcbfa", 104 | "query_version": 0 105 | }, 106 | { 107 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Active%20Directory%5CIdentity-VisualizeExternalAADGuestsvsExternalGuests.kql", 108 | "query": "//Visualize signins from External Azure AD guests (those that belong to another Azure AD tenant) vs External Guests (such as Gmail) to your tenant\n\n//Data connector required for this query - Azure Active Directory - Signin Logs\n\nSigninLogs\n| where TimeGenerated > ago (45d)\n| where UserType == \"Guest\"\n| summarize\n ['External Guests']=countif(ResourceTenantId == HomeTenantId),\n ['External Azure AD Guests']=countif(ResourceTenantId != HomeTenantId)\n by bin(TimeGenerated, 1d)\n| render timechart with (title=\"External Azure AD Guests vs External Guests\", ytitle=\"Count\")\n", 109 | "source_type": "text", 110 | "source_index": 0, 111 | "query_name": "Identity-VisualizeExternalAADGuestsvsExternalGuests", 112 | "context": null, 113 | "attributes": {}, 114 | "kql_properties": {}, 115 | "query_id": "da36cf45-4fba-484e-ac9a-a98088b0836a", 116 | "query_hash": "bd81f7d826576984985f2ab36ac58f2a6e2859e6cf0e358dbd22b6472bf8d86c", 117 | "query_version": 0 118 | }, 119 | { 120 | "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CDefender%20for%20Endpoint%5CDevice-SummarizeRDPConnections.kql", 121 | "query": "//Summarize your devices by their RDP activity. The data is sorted to show total outbound RDP connections, a count of distinct RDP connections and the list of IP's connected to.\n\n//Data connector required for this query - M365 Defender - Device* tables\n\n//Data is sorted by the devices with the most unique outbound RDP connections. 
Those devices have the biggest lateral movement blast radius.\n//Microsoft Sentinel query\nDeviceNetworkEvents\n| where TimeGenerated > ago(30d)\n| where ActionType == \"ConnectionSuccess\"\n| where RemotePort == \"3389\"\n//Exclude Defender for Identity that uses an initial RDP connection to map your network\n| where InitiatingProcessCommandLine <> \"\\\"Microsoft.Tri.Sensor.exe\\\"\"\n| summarize\n ['RDP Outbound Connection Count']=count(),\n ['RDP Distinct Outbound Endpoint Count']=dcount(RemoteIP),\n ['RDP Outbound Endpoints']=make_set(RemoteIP)\n by DeviceName\n| sort by ['RDP Distinct Outbound Endpoint Count'] desc \n\n//Advanced Hunting query\n\n//Data connector required for this query - Advanced Hunting license\n\nDeviceNetworkEvents\n| where Timestamp > ago(30d)\n| where ActionType == \"ConnectionSuccess\"\n| where RemotePort == \"3389\"\n//Exclude Defender for Identity that uses an initial RDP connection to map your network\n| where InitiatingProcessCommandLine <> \"\\\"Microsoft.Tri.Sensor.exe\\\"\"\n| summarize\n ['RDP Outbound Connection Count']=count(),\n ['RDP Distinct Outbound Endpoint Count']=dcount(RemoteIP),\n ['RDP Outbound Endpoints']=make_set(RemoteIP)\n by DeviceName\n| sort by ['RDP Distinct Outbound Endpoint Count'] desc ", 122 | "source_type": "text", 123 | "source_index": 0, 124 | "query_name": "Device-SummarizeRDPConnections", 125 | "context": null, 126 | "attributes": {}, 127 | "kql_properties": {}, 128 | "query_id": "2fa11654-ce7b-4452-9646-700afce24375", 129 | "query_hash": "8c2fca65cd7884333babfb8166724b4c2c9ecc15d1b16a4e38d4ce285e56fd99", 130 | "query_version": 0 131 | } 132 | ] -------------------------------------------------------------------------------- /src/test_data_store.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See License.txt in the project root for 4 | # license information. 
5 | # -------------------------------------------------------------------------- 6 | """Module docstring.""" 7 | import json 8 | import random 9 | import uuid 10 | from pathlib import Path 11 | 12 | import pytest 13 | 14 | from .data_store import DataStore 15 | from .kql_query import KqlQuery 16 | 17 | __author__ = "Ian Hellen" 18 | 19 | # pylint: disable=redefined-outer-name 20 | 21 | json_query_data = """ 22 | { 23 | "query_id": "1234291720927310", 24 | "source_path": "/github.com/foo", 25 | "source_type": "text", 26 | "source_index": 0, 27 | "name": "query_1", 28 | "query": "SecurityAlert\\n| Where foo == bar", 29 | "context": "text from markdown", 30 | "attributes": { 31 | "description": "Query one description", 32 | "tactics": ["Exploitation", "Compromise"], 33 | "techniques": ["T.1055", "T.1345"] 34 | } 35 | } 36 | """ 37 | 38 | json_kql_parse = """ 39 | { 40 | "FunctionCalls":["count","tostring","make_list","toreal"], 41 | "Joins":["rightsemi","leftouter"], 42 | "Operators":["where","extend","summarize","mv-expand","project-away","project"], 43 | "Tables":["SigninLogs"] 44 | } 45 | """ 46 | 47 | table_names = [ 48 | "AADB2CRequestLogs", 49 | "AADDomainServicesAccountLogon", 50 | "AADDomainServicesAccountManagement", 51 | "AADDomainServicesDirectoryServiceAccess", 52 | "AADDomainServicesLogonLogoff", 53 | "AADDomainServicesPolicyChange", 54 | "AADDomainServicesPrivilegeUse", 55 | "AADManagedIdentitySignInLogs", 56 | "AADNonInteractiveUserSignInLogs", 57 | "AADProvisioningLogs", 58 | "AADRiskyServicePrincipals", 59 | "AADRiskyUsers", 60 | "AADServicePrincipalRiskEvents", 61 | "AADServicePrincipalSignInLogs", 62 | "AADUserRiskEvents", 63 | "ADFSSignInLogs", 64 | "AlertEvidence", 65 | "Anomalies", 66 | "AppServiceIPSecAuditLogs", 67 | "AppServiceServerlessSecurityPluginData", 68 | "ASimDnsActivityLogs", 69 | "AuditLogs", 70 | "AWSCloudTrail", 71 | "AWSGuardDuty", 72 | "AWSVPCFlow", 73 | "AZFWApplicationRule", 74 | "AZFWApplicationRuleAggregation", 75 | "AZFWDnsQuery", 76 | "AZFWIdpsSignature", 77 | "AZFWInternalFqdnResolutionFailure", 78 | "AZFWNatRule", 79 | "AZFWNatRuleAggregation", 80 | "AZFWNetworkRule", 81 | "AZFWNetworkRuleAggregation", 82 | "AZFWThreatIntel", 83 | "AzureActivity", 84 | "AzureDiagnostics", 85 | "BehaviorAnalytics", 86 | "CloudAppEvents", 87 | "CommonSecurityLog", 88 | "ConfidentialWatchlist", 89 | "DeviceEvents", 90 | "DeviceFileCertificateInfo", 91 | "DeviceFileEvents", 92 | "DeviceImageLoadEvents", 93 | "DeviceInfo", 94 | "DeviceLogonEvents", 95 | "DeviceNetworkEvents", 96 | "DeviceNetworkInfo", 97 | "DeviceProcessEvents", 98 | "DeviceRegistryEvents", 99 | "DeviceTvmSecureConfigurationAssessment", 100 | "DeviceTvmSoftwareInventory", 101 | "DeviceTvmSoftwareVulnerabilities", 102 | "DSMAzureBlobStorageLogs", 103 | "DSMDataClassificationLogs", 104 | "DSMDataLabelingLogs", 105 | "DynamicEventCollection", 106 | "EmailAttachmentInfo", 107 | "EmailEvents", 108 | "EmailPostDeliveryEvents", 109 | "EmailUrlInfo", 110 | "GCPAuditLogs", 111 | "HDInsightSecurityLogs", 112 | "HuntingBookmark", 113 | "IdentityDirectoryEvents", 114 | "IdentityLogonEvents", 115 | "IdentityQueryEvents", 116 | "LinuxAuditLog", 117 | "McasShadowItReporting", 118 | "NetworkAccessTraffic", 119 | "NetworkSessions", 120 | "NSPAccessLogs", 121 | "OfficeActivity", 122 | "PowerBIActivity", 123 | "ProjectActivity", 124 | "ProtectionStatus", 125 | "PurviewDataSensitivityLogs", 126 | "SecurityAlert", 127 | "SecurityBaseline", 128 | "SecurityBaselineSummary", 129 | "SecurityDetection", 130 | "SecurityEvent", 
131 | "SecurityIoTRawEvent", 132 | "SecurityRecommendation", 133 | "SentinelAudit", 134 | "SentinelHealth", 135 | "SigninLogs", 136 | "Syslog", 137 | "ThreatIntelligenceIndicator", 138 | "Update", 139 | "UrlClickEvents", 140 | "UserAccessAnalytics", 141 | "UserPeerAnalytics", 142 | "Watchlist", 143 | "WindowsEvent", 144 | "WindowsFirewall", 145 | "WireData", 146 | ] 147 | 148 | field_names = [ 149 | "SourceType", 150 | "DomainBehaviorVersion", 151 | "OperationName", 152 | "BookmarkName", 153 | "SentinelResourceId", 154 | "OSName", 155 | "ActualResult", 156 | "CreatedBy", 157 | "CreatedDateTime", 158 | "LatencySamplingTimeStamp", 159 | "Environment", 160 | "CorrelationId", 161 | "MachineGroup", 162 | "SumResponseBodySize", 163 | "RecordId", 164 | "DstUserUpn", 165 | "ResourceId", 166 | "InitiatingProcessSHA1", 167 | "ObjectId", 168 | "AssetType", 169 | "Title", 170 | "InitiatingProcessAccountDomain", 171 | "AuthorizationInfo", 172 | "TargetContextId", 173 | "LogonId", 174 | "CveTags", 175 | "SourceComputerId", 176 | "ResourceIdentity", 177 | "ClusterName", 178 | "TdoAttributes", 179 | "EntityMapping", 180 | "DnssecOkBit", 181 | "DeviceCustomString5", 182 | "TransmittedServices", 183 | "DeviceCustomDate2Label", 184 | ] 185 | 186 | 187 | def get_random_items(data=table_names, count=3): 188 | return list({random.choice(data) for _ in range(count)}) 189 | 190 | 191 | def get_random_query(index=0): 192 | tactic_idx = index % 7 193 | return { 194 | "query_id": str(uuid.uuid4()), 195 | "source_path": f"/github.com/foo/{index}", 196 | "source_type": "text", 197 | "source_index": random.randint(0, 7), 198 | "query_name": f"query_{index}", 199 | "query": "SecurityAlert\\n| Where foo == bar", 200 | # "context": "text from markdown", 201 | "attributes": { 202 | "description": "Query one description", 203 | "tactics": get_random_items( 204 | data=["Exploitation", "Compromise", "LateralMovement"], count=2 205 | ), 206 | "techniques": [f"T10{tactic_idx:0>2d}", f"T1{tactic_idx:0>2d}5"], 207 | "test_dict": { 208 | "joins": {"inner": ["one", "two"], "outer": ["three", "four"]} 209 | }, 210 | }, 211 | } 212 | 213 | 214 | @pytest.fixture 215 | def get_raw_queries(): 216 | 217 | return [get_random_query(i) for i in range(5)] 218 | 219 | 220 | @pytest.fixture 221 | def get_kqlquery_list(): 222 | return [KqlQuery(**get_random_query(i)) for i in range(5)] 223 | 224 | 225 | def test_datastore_init(get_kqlquery_list, get_raw_queries): 226 | 227 | ds = DataStore(get_kqlquery_list) 228 | all_items_len = len(get_kqlquery_list) 229 | assert len(ds._data) == all_items_len 230 | assert len(ds._data) == all_items_len 231 | assert len(ds._indexes) == 2 232 | 233 | ds = DataStore(get_raw_queries) 234 | all_items_len = len(get_kqlquery_list) 235 | assert len(ds._data) == all_items_len 236 | assert len(ds._data) == all_items_len 237 | assert len(ds._indexes) == 2 238 | 239 | json_text = ds.to_json() 240 | output_dict = json.loads(json_text) 241 | assert len(output_dict) == len(get_raw_queries) 242 | 243 | out_df = ds.to_df() 244 | assert len(out_df) == all_items_len 245 | 246 | 247 | def test_datastore_find(get_kqlquery_list): 248 | 249 | ds = DataStore(get_kqlquery_list) 250 | all_items_len = len(get_kqlquery_list) 251 | assert len(ds.find_queries(query_name="query_0")) == 1 252 | assert all_items_len > len(ds.find_queries(tactics=["Compromise"])) 253 | assert len(ds.find_queries(tactics=["BadTactic"])) == 0 254 | assert len(ds.find_queries(query_name={"matches": "query.*"})) == all_items_len 255 | 
--------------------------------------------------------------------------------
/src/test_kql_download.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------
 2 | # Copyright (c) Microsoft Corporation. All rights reserved.
 3 | # Licensed under the MIT License. See License.txt in the project root for
 4 | # license information.
 5 | # --------------------------------------------------------------------------
 6 | """Test Query downloader."""
 7 | import random
 8 | from pathlib import Path
 9 | 
 10 | from .data_store import DataStore
 11 | from .kql_download import get_community_queries, get_sentinel_queries
 12 | 
 13 | __author__ = "Ian Hellen"
 14 | 
 15 | # pylint: disable=protected-access
 16 | 
 17 | 
 18 | def test_get_sentinel_queries(tmp_path):
 19 |     """Test downloading sentinel queries."""
 20 |     queries = get_sentinel_queries(tmp_path)
 21 |     ds = DataStore(queries)
 22 |     assert ds is not None
 23 |     assert len(ds.queries) > 2000
 24 |     assert len(ds._indexes["tactics"]) > 1000
 25 |     assert len(ds._indexes["techniques"]) > 1000
 26 | 
 27 |     indexes = [random.randint(0, len(ds.queries) - 1) for _ in range(10)]
 28 |     for attrib in ["source_path", "query", "query_id", "attributes"]:
 29 |         for idx in indexes:
 30 |             assert hasattr(ds.queries[idx], attrib)
 31 | 
 32 | 
 33 | def test_get_community_queries(tmp_path):
 34 |     """Test downloading community queries."""
 35 |     conf_path = Path(__file__).parent.joinpath("repos.yaml")
 36 |     queries = get_community_queries(tmp_path, config=conf_path)
 37 |     ds = DataStore(queries)
 38 |     assert ds is not None
 39 |     assert len(ds.queries) > 100
 40 | 
 41 |     indexes = [random.randint(0, len(ds.queries) - 1) for _ in range(10)]
 42 |     for attrib in ["source_path", "query", "query_id"]:
 43 |         for idx in indexes:
 44 |             assert hasattr(ds.queries[idx], attrib)
 45 | 
--------------------------------------------------------------------------------
/src/test_kql_extract.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------
 2 | # Copyright (c) Microsoft Corporation. All rights reserved.
 3 | # Licensed under the MIT License. See License.txt in the project root for
 4 | # license information.
 5 | # --------------------------------------------------------------------------
 6 | """Test KQL extraction integration."""
 7 | 
 8 | from datetime import datetime, timezone
 9 | from pathlib import Path
 10 | 
 11 | import pytest
 12 | 
 13 | from . import kql_extract as extract
14 | from .data_store import DataStore
 15 | from .kql_query import KqlQuery
 16 | from .test_data_store import get_random_query
 17 | 
 18 | __author__ = "Ian Hellen"
 19 | 
 20 | # pylint: disable=redefined-outer-name, protected-access
 21 | 
 22 | 
 23 | _TEST_KQL = Path(__file__).parent.joinpath("test_data")
 24 | 
 25 | 
 26 | @pytest.fixture
 27 | def get_queries_with_kql():
 28 |     queries = []
 29 |     for file in Path(_TEST_KQL).glob("*.kql"):
 30 |         # attach each sample KQL file's text to two randomized query records
 31 |         query_text = file.read_text(encoding="utf-8")
 32 |         for query in [KqlQuery(**get_random_query(i)) for i in range(2)]:
 33 |             query.query = query_text
 34 |             queries.append(query)
 35 |     return queries
 36 | 
 37 | 
 38 | def test_extract_from_ds_query(get_queries_with_kql):
 39 |     """Test extracting KQL properties and adding them to the DataStore."""
 40 | 
 41 |     queries = get_queries_with_kql
 42 |     assert len(queries) > 0
 43 |     ds = DataStore(queries)
 44 |     assert len(ds.queries) == len(get_queries_with_kql)
 45 | 
 46 |     try:
 47 |         extract.start()
 48 |         start = datetime.now(timezone.utc)
 49 |         print(start)
 50 |         for query in ds.queries:
 51 |             result = extract.extract_kql(query.query, query_id=query.query_id)
 52 |             print(result)
 53 |             ds.add_kql_properties(query_id=query.query_id, kql_properties=result)
 54 |         end = datetime.now(timezone.utc)
 55 |         print(end, "total time", end - start)
 56 |     finally:
 57 |         extract.stop()
 58 |     print([len(query.kql_properties) for query in ds.queries])
 59 |     assert all(len(query.kql_properties) for query in ds.queries)
 60 |     assert len(ds._indexes) >= 6
 61 |     assert all(item in ds._indexes for item in ["tactics", "tables", "operators"])
 62 |     assert len(ds._indexes["tables"]) >= len(ds.queries)
 63 |     assert len(ds._indexes["operators"]) >= len(ds.queries)
--------------------------------------------------------------------------------
/src/test_kql_query.py:
--------------------------------------------------------------------------------
 1 | from .kql_query import KqlQuery
 2 | 
 3 | 
 4 | def test_kql_query():
 5 |     kql = KqlQuery(
 6 |         source_path="https://github.com/a/b/file.kql", query="SecurityAlert | take 1"
 7 |     )
 8 |     print(kql)
 9 |     print(kql.asdict())
 10 |     print(kql.to_json())
 11 | 
 12 |     KqlQuery.kql_list_to_pylist([kql, kql])
 13 | 
 14 |     KqlQuery.kql_list_to_json([kql, kql])
 15 | 
 16 |     KqlQuery.kql_list_to_df([kql, kql])
--------------------------------------------------------------------------------
/test_runs/kql_query_db-2022-09-23-22-30-16.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/test_runs/kql_query_db-2022-09-23-22-30-16.pkl
--------------------------------------------------------------------------------
/test_runs/kql_query_db-2022-09-24-02-51-50.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/test_runs/kql_query_db-2022-09-24-02-51-50.pkl
--------------------------------------------------------------------------------
/test_runs/kql_query_df--022-09-23_00_44_55.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/test_runs/kql_query_df--022-09-23_00_44_55.pkl
--------------------------------------------------------------------------------