├── .gitignore
├── 1_🏠HomePage.py
├── CODE_OF_CONDUCT.md
├── Dockerfile
├── LICENSE
├── README.md
├── SECURITY.md
├── SUPPORT.md
├── data
    └── kql_queries.json
├── dev-notebooks
    ├── .ipynb_checkpoints
    │   ├── kql_query_harvester-checkpoint.ipynb
    │   └── repos-checkpoint.yaml
    ├── KQLHarvester-oldversion.ipynb
    ├── Kqlquery-schema.png
    ├── SampleDataStoreUse.ipynb
    ├── az-monitor-schemas.ipynb
    ├── db_code.ipynb
    ├── db_pandas_store.ipynb
    ├── db_schema.py
    ├── kql_query_harvester.ipynb
    ├── kqlquery.db
    └── repos.yaml
├── images
    └── DataFlowDiagram.png
├── kqlextraction
    ├── KqlExtraction
    │   ├── KqlExtraction.cs
    │   ├── KqlExtraction.csproj
    │   └── KqlExtraction.sln
    ├── Readme.txt
    ├── extract.py
    └── tests
    │   ├── test1.kql
    │   ├── test2.kql
    │   ├── test3.kql
    │   ├── test4.kql
    │   └── test5.kql
├── pages
    ├── 2_🔎KQL_interactive_search.py
    ├── 3_🛡️Schema_Browser.py
    ├── 4_ 📊KQL_Store_Insights.py
    └── 5_💬Contact_Us.py
├── requirements.txt
├── src
    ├── __init__.py
    ├── az_mon_schema.py
    ├── conf.txt
    ├── create_kql_db.py
    ├── data_store.py
    ├── extract.py
    ├── ian_test.kql
    ├── kql_download.py
    ├── kql_extract.py
    ├── kql_file_parser.py
    ├── kql_query.py
    ├── kqlextraction
    │   └── tests
    │   │   ├── test1.kql
    │   │   ├── test2.kql
    │   │   ├── test3.kql
    │   │   ├── test4.kql
    │   │   └── test5.kql
    ├── repos.yaml
    ├── test_data
    │   ├── test1.kql
    │   ├── test2.kql
    │   ├── test3.kql
    │   ├── test4.kql
    │   ├── test5.kql
    │   ├── test_10.json
    │   └── test_json.json
    ├── test_data_store.py
    ├── test_kql_download.py
    ├── test_kql_extract.py
    └── test_kql_query.py
└── test_runs
    ├── kql_query_db-022-09-23_00_44_55.json
    ├── kql_query_db-2022-09-23-22-30-15.json
    ├── kql_query_db-2022-09-23-22-30-16.pkl
    ├── kql_query_db-2022-09-24-02-51-49.json
    ├── kql_query_db-2022-09-24-02-51-50.pkl
    └── kql_query_df--022-09-23_00_44_55.pkl


/.gitignore:
--------------------------------------------------------------------------------
1 | **/.vs/**
2 | **/bin/Debug/**
3 | **/bin/Release/**
4 | **/obj/Debug/**
5 | **/obj/Release/**
6 | **/__pycache__/**
7 | **/obj/**
8 | 


--------------------------------------------------------------------------------
/1_🏠HomePage.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import pandas as pd
 3 | 
 4 | from pathlib import Path
 5 | 
 6 | 
 7 | def main() -> None:
 8 |     st.title(":mag_right: Interactive KQL Query Store")
 9 | 
10 |     with st.expander("Expand to Read more about the Project"):
11 |         st.write(Path("README.md").read_text())
12 | 
13 |     st.success(":point_left: Select a page on left side bar to naviagate pages")
14 | 
15 | 
16 | if __name__ == "__main__":
17 |     st.set_page_config(
18 |         "Interactive KQL Query Store by MSTIC",
19 |         "🔎",
20 |         initial_sidebar_state="expanded",
21 |         layout="wide",
22 |     )
23 |     main()
24 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # app/Dockerfile
 2 | 
 3 | FROM python:3.9-slim
 4 | 
 5 | EXPOSE 8501
 6 | 
 7 | WORKDIR /app
 8 | 
 9 | RUN apt-get update && apt-get install -y \
10 |     build-essential \
11 |     software-properties-common \
12 |     git \
13 |     && rm -rf /var/lib/apt/lists/*
14 | 
15 | RUN git clone https://github.com/microsoft/kql-query-store.git .
16 | 
17 | RUN pip3 install -r requirements.txt
18 | 
19 | ENTRYPOINT ["streamlit", "run", "1_🏠HomePage.py", "--server.port=8501", "--server.address=0.0.0.0"]


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Interactive KQL Query Store
 2 | 
 3 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://aka.ms/kql-query-store)
 4 | 
 5 | Currently many KQL queries are published on GitHub by Microsoft and Security Community on GitHub. All the queries are scattered as unstructured data and disorganized in various places making it difficult to discover for defenders and detection authors. 
 6 | 
 7 | GitHub search interface is not flexible to satisfy various custom search needs for defenders to effectively search various KQL queries by datasource , KQL operators , parsing of complex fields in data sources, custom tags if available etc. Having it easy to discover will help defenders in referencing existing work while writing new queries, reuse complex parsing examples in specific data sources and much more. 
 8 | 
 9 | ## Project Goals
10 | 
11 | - Organized data store of KQL queries as a structured data store
12 | - Easy discoverability of KQL Queries based on tags, KQL operators, Datasource etc. 
13 | - Point to relevant sources and GitHub links. 
14 | - Interactive dashboard to explore the structured data.
15 | - Insights on various KQL queries from Azure Sentinel
16 | 
17 | ## Architecture
18 | ![raw_image](https://raw.github.com/microsoft/kql-query-store/master/images/DataFlowDiagram.png)
19 | 
20 | 
21 | ## Docker instruction
22 | if you wish to host this locally/in-house, you can use below instructions to build docker images and host it. For more detailed instructions, check out Streamlit docs. [Deploy Streamlit using Docker](https://docs.streamlit.io/knowledge-base/tutorials/deploy/docker)
23 | 
24 | Build image
25 | 
26 | `docker build -t kql-query-store .`
27 | 
28 | Run the docker container
29 | 
30 | `docker run -p 8501:8501 kql-query-store`
31 | 
32 | ## Contributing
33 | 
34 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
35 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
36 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
37 | 
38 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
39 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
40 | provided by the bot. You will only need to do this once across all repos using our CLA.
41 | 
42 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
43 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
44 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.8 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # TODO: The maintainer of this repo has not yet edited this file
 2 | 
 3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
 4 | 
 5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
 6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
 7 | - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
 8 | 
 9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help  
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 
16 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or 
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE 
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy  
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 


--------------------------------------------------------------------------------
/dev-notebooks/.ipynb_checkpoints/repos-checkpoint.yaml:
--------------------------------------------------------------------------------
1 | - Github:
2 |     branch: main
3 |     repo: reprise99/Sentinel-Queries
4 | - Github:
5 |     branch: main
6 |     repo: ugurkocde/KQL_Intune


--------------------------------------------------------------------------------
/dev-notebooks/KQLHarvester-oldversion.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "319188e6-ccfe-43e7-bd41-665e1f6450c3",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "import logging\n",
 11 |     "from pathlib import Path\n",
 12 |     "import requests\n",
 13 |     "import io\n",
 14 |     "import zipfile\n",
 15 |     "from requests.exceptions import HTTPError\n",
 16 |     "import glob\n",
 17 |     "import pandas as pd\n",
 18 |     "import yaml\n",
 19 |     "from pandas import json_normalize\n",
 20 |     "\n",
 21 |     "def get_repo_urls(filename, branch_name):\n",
 22 |     "    git_url = 'https://github.com/'\n",
 23 |     "    file_name = f'{branch_name}.zip'\n",
 24 |     "    suffix_string = 'archive/'+ file_name\n",
 25 |     "    with open(filename, 'r', encoding='UTF-8') as f:\n",
 26 |     "        repos = [git_url + line.rstrip() for line in f]\n",
 27 |     "        repo_archive_urls = [line + suffix_string for line in repos]\n",
 28 |     "    \n",
 29 |     "    return repo_archive_urls\n",
 30 |     "\n",
 31 |     "def download_git_archive(git_url, output_dir):\n",
 32 |     "    print(f\"Downloading from {git_url}, may take few mins..\")\n",
 33 |     "    try:\n",
 34 |     "        r = requests.get(git_url)\n",
 35 |     "        repo_zip = io.BytesIO(r.content)\n",
 36 |     "        archive = zipfile.ZipFile(repo_zip, mode=\"r\")\n",
 37 |     "        for file in archive.namelist():\n",
 38 |     "            archive.extract(file, path=output_dir)\n",
 39 |     "        print(\"Downloaded and Extracted Files successfully\")\n",
 40 |     "    except HTTPError as http_err:\n",
 41 |     "        warnings.warn(f\"HTTP error occurred trying to download from Github: {http_err}\")\n",
 42 |     "        \n",
 43 |     "def get_sentinel_queries_from_github(git_url, outputdir):\n",
 44 |     "    print(f\"Downloading from Azure Sentinel Github, may take 2-3 mins..\")\n",
 45 |     "    try:\n",
 46 |     "        r = requests.get(git_url)\n",
 47 |     "        repo_zip = io.BytesIO(r.content)\n",
 48 |     "        archive = zipfile.ZipFile(repo_zip, mode=\"r\")\n",
 49 |     "        # Only extract Detections and Hunting Queries Folder\n",
 50 |     "        for file in archive.namelist():\n",
 51 |     "            if file.startswith(\n",
 52 |     "                (\n",
 53 |     "                    \"Azure-Sentinel-master/Detections/\",\n",
 54 |     "                    \"Azure-Sentinel-master/Hunting Queries/\",\n",
 55 |     "                    \"Azure-Sentinel-master/Solutions/\"\n",
 56 |     "                )\n",
 57 |     "            ):\n",
 58 |     "                archive.extract(file, path=outputdir)\n",
 59 |     "        print(\"Downloaded and Extracted Files successfully\")\n",
 60 |     "    except HTTPError as http_err:\n",
 61 |     "        warnings.warn(f\"HTTP error occurred trying to download from Github: {http_err}\")\n",
 62 |     "        \n",
 63 |     "def parse_yaml(parent_dir, child_dir):\n",
 64 |     "\n",
 65 |     "    sentinel_repourl = \"https://github.com/Azure/Azure-Sentinel/blob/master\"\n",
 66 |     "\n",
 67 |     "    # Collect list of files recusrively uinder a folder\n",
 68 |     "    yaml_queries = glob.glob(f\"{parent_dir}/{child_dir}/**/*.yaml\", recursive=True)\n",
 69 |     "    df = pd.DataFrame()\n",
 70 |     "\n",
 71 |     "    # Recursively load yaml Files and append to dataframe\n",
 72 |     "    for query in yaml_queries:\n",
 73 |     "        with open(query, \"r\", encoding=\"utf-8\", errors=\"ignore\") as f:\n",
 74 |     "            parsed_yaml_df = json_normalize(yaml.load(f, Loader=yaml.FullLoader))\n",
 75 |     "            parsed_yaml_df[\"DetectionURL\"] = query.replace(parent_dir, sentinel_repourl)\n",
 76 |     "            frames = [df, parsed_yaml_df]\n",
 77 |     "            df = pd.concat(frames, ignore_index=True, sort=True)\n",
 78 |     "\n",
 79 |     "    if child_dir == \"Detections\":\n",
 80 |     "        df[\"DetectionType\"] = \"Analytics\"\n",
 81 |     "    elif child_dir == \"Hunting Queries\":\n",
 82 |     "        df[\"DetectionType\"] = \"Hunting\"\n",
 83 |     "    elif child_dir == \"Solutions\":\n",
 84 |     "        df[\"DetectionType\"] = \"Solutions\"\n",
 85 |     "\n",
 86 |     "    df[\"DetectionService\"] = \"Azure Sentinel Community Github\"\n",
 87 |     "\n",
 88 |     "    return df"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 2,
 94 |    "id": "6a38f199-2250-45fa-ae24-bcd8dcfbde70",
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stdout",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "Downloading from https://github.com/reprise99/Sentinel-Queries/archive/main.zip, may take few mins..\n",
102 |       "Downloaded and Extracted Files successfully\n",
103 |       "Downloading from https://github.com/ugurkocde/KQL_Intune/archive/main.zip, may take few mins..\n",
104 |       "Downloaded and Extracted Files successfully\n"
105 |      ]
106 |     }
107 |    ],
108 |    "source": [
109 |     "repo_archive_urls = get_repo_urls('repo.conf', 'main')\n",
110 |     "#Set output dir\n",
111 |     "output_dir = Path.cwd()\n",
112 |     "\n",
113 |     "#download git repos\n",
114 |     "for url in repo_archive_urls:\n",
115 |     "    download_git_archive(url, output_dir)"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "id": "2ae3721d-87de-4d3a-ad1b-e259db980f42",
122 |    "metadata": {
123 |     "scrolled": true
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "#Download and Parses Microsoft Sentinel Repos\n",
128 |     "azsentinel_git_url = \"https://github.com/Azure/Azure-Sentinel/archive/master.zip\"\n",
129 |     "get_sentinel_queries_from_github(git_url=azsentinel_git_url, outputdir=output_dir)"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "id": "bbfb8388-2acb-4565-9fad-8e04d1c1146f",
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "tmp_path = str(Path.cwd())\n",
140 |     "\n",
141 |     "base_dir = tmp_path + \"/Azure-Sentinel-master\"\n",
142 |     "columns = ['id', 'description', 'DetectionURL','query','tags','tactics','techniques']\n",
143 |     "detections_df = parse_yaml(parent_dir=base_dir, child_dir=\"Detections\")\n",
144 |     "detections_df = detections_df[columns]\n",
145 |     "# hunting_df = parse_yaml(parent_dir=base_dir, child_dir=\"Hunting Queries\")\n",
146 |     "# hunting_df = hunting_df[columns]\n",
147 |     "solutions_df = parse_yaml(parent_dir=base_dir, child_dir=\"Solutions\")\n",
148 |     "solutions_df = solutions_df[columns]\n",
149 |     "\n",
150 |     "frames = [detections_df, solutions_df]\n",
151 |     "sentinel_df = pd.concat(frames, ignore_index=True, sort=True)"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "id": "8501dc5e-0e77-44d2-a63c-1899c9da8e2b",
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": [
161 |     "sentinel_df.head()"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 226,
167 |    "id": "7463daec",
168 |    "metadata": {},
169 |    "outputs": [],
170 |    "source": [
171 |     "def parse_markdown():\n",
172 |     "    df = pd.DataFrame()\n",
173 |     "    \n",
174 |     "    # Collect list of files recursively under a folder\n",
175 |     "    parent_dir = tmp_path + \"/KQL_Intune-main\"\n",
176 |     "    md_queries = glob.glob(f\"{parent_dir}/**/*.md\", recursive=True)\n",
177 |     "    parent_dir = tmp_path + \"/Sentinel-Queries-main\"\n",
178 |     "    md_queries = md_queries + glob.glob(f\"{parent_dir}/**/*.md\", recursive=True)\n",
179 |     "    \n",
180 |     "    df = pd.DataFrame(columns=['title', 'kql_query'])\n",
181 |     "    \n",
182 |     "    # Recursively load md Files and append to dataframe\n",
183 |     "    for query in md_queries:\n",
184 |     "        print(\"loading file:\", query)\n",
185 |     "        lines = Path(query).read_text(encoding=\"utf-8\").split('\\n')\n",
186 |     "#         print(lines)\n",
187 |     "#         kql_lines = re.findall(\"```kql([^```]*)\", lines)\n",
188 |     "#         ret.extend(kql_lines)\n",
189 |     "        ct = 0\n",
190 |     "        kql = False\n",
191 |     "        kql_collect = []\n",
192 |     "        title_collect = []\n",
193 |     "        cur_kql = []\n",
194 |     "        title = \"n/a\"\n",
195 |     "        while ct < len(lines):\n",
196 |     "            if kql:\n",
197 |     "                cur_kql.append(l[ct])\n",
198 |     "            if (lines[ct].startswith(\"#\") and lines[ct+2] == \"```kql\"):\n",
199 |     "        #         print(l[ct])\n",
200 |     "                kql = True\n",
201 |     "                title = lines[ct]\n",
202 |     "            elif  (lines[ct] == \"```kql\"):\n",
203 |     "                kql = True\n",
204 |     "            elif lines[ct] == \"```\":\n",
205 |     "                kql = False\n",
206 |     "                cur_kql = \"\\n\".join(cur_kql)\n",
207 |     "                kql_collect.append(cur_kql)\n",
208 |     "                title_collect.append(title)\n",
209 |     "                title = \"n/a\"\n",
210 |     "                cur_kql = []\n",
211 |     "            ct+=1\n",
212 |     "        test_df = pd.DataFrame(list(zip(title_collect, kql_collect)), columns=['title', 'kql_query'])\n",
213 |     "#         df.append(test_df)\n",
214 |     "        df = pd.concat([df, test_df])\n",
215 |     "    \n",
216 |     "    return df\n",
217 |     "    "
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 227,
223 |    "id": "2a4926cd",
224 |    "metadata": {
225 |     "scrolled": true
226 |    },
227 |    "outputs": [
228 |     {
229 |      "name": "stdout",
230 |      "output_type": "stream",
231 |      "text": [
232 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/KQL_Intune-main\\README.md\n",
233 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/KQL_Intune-main\\Azure Workbook\\readme.md\n",
234 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/KQL_Intune-main\\Query Pack\\readme.md\n",
235 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\README.md\n",
236 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Azure AD Abuse Detection\\README.md\n",
237 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Functions\\README.md\n",
238 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Query Pack\\README.md\n",
239 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Sentinel vs Advanced Hunting\\README.md\n",
240 |       "loading file: C:\\Users\\jannieli\\OneDrive - Microsoft\\Documents\\hackathon2022/Sentinel-Queries-main\\Workbooks\\README.md\n"
241 |      ]
242 |     }
243 |    ],
244 |    "source": [
245 |     "md_queries = parse_markdown()"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 228,
251 |    "id": "fb209b79",
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "data": {
256 |       "text/plain": [
257 |        "100"
258 |       ]
259 |      },
260 |      "execution_count": 228,
261 |      "metadata": {},
262 |      "output_type": "execute_result"
263 |     }
264 |    ],
265 |    "source": [
266 |     "len(md_queries.index)"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 229,
272 |    "id": "53c8d231",
273 |    "metadata": {
274 |     "scrolled": false
275 |    },
276 |    "outputs": [
277 |     {
278 |      "data": {
279 |       "text/html": [
280 |        "<div>\n",
281 |        "<style scoped>\n",
282 |        "    .dataframe tbody tr th:only-of-type {\n",
283 |        "        vertical-align: middle;\n",
284 |        "    }\n",
285 |        "\n",
286 |        "    .dataframe tbody tr th {\n",
287 |        "        vertical-align: top;\n",
288 |        "    }\n",
289 |        "\n",
290 |        "    .dataframe thead th {\n",
291 |        "        text-align: right;\n",
292 |        "    }\n",
293 |        "</style>\n",
294 |        "<table border=\"1\" class=\"dataframe\">\n",
295 |        "  <thead>\n",
296 |        "    <tr style=\"text-align: right;\">\n",
297 |        "      <th></th>\n",
298 |        "      <th>title</th>\n",
299 |        "      <th>kql_query</th>\n",
300 |        "    </tr>\n",
301 |        "  </thead>\n",
302 |        "  <tbody>\n",
303 |        "    <tr>\n",
304 |        "      <th>0</th>\n",
305 |        "      <td>n/a</td>\n",
306 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
307 |        "    </tr>\n",
308 |        "    <tr>\n",
309 |        "      <th>1</th>\n",
310 |        "      <td>n/a</td>\n",
311 |        "      <td>SigninLogs\\n```</td>\n",
312 |        "    </tr>\n",
313 |        "    <tr>\n",
314 |        "      <th>2</th>\n",
315 |        "      <td>n/a</td>\n",
316 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n```</td>\n",
317 |        "    </tr>\n",
318 |        "    <tr>\n",
319 |        "      <th>3</th>\n",
320 |        "      <td>n/a</td>\n",
321 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
322 |        "    </tr>\n",
323 |        "    <tr>\n",
324 |        "      <th>4</th>\n",
325 |        "      <td>n/a</td>\n",
326 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
327 |        "    </tr>\n",
328 |        "    <tr>\n",
329 |        "      <th>...</th>\n",
330 |        "      <td>...</td>\n",
331 |        "      <td>...</td>\n",
332 |        "    </tr>\n",
333 |        "    <tr>\n",
334 |        "      <th>3</th>\n",
335 |        "      <td>n/a</td>\n",
336 |        "      <td>Microsoft Sentinel will then run through your ...</td>\n",
337 |        "    </tr>\n",
338 |        "    <tr>\n",
339 |        "      <th>4</th>\n",
340 |        "      <td>n/a</td>\n",
341 |        "      <td>| where AppDisplayName == \"Microsoft Teams\"\\n`...</td>\n",
342 |        "    </tr>\n",
343 |        "    <tr>\n",
344 |        "      <th>5</th>\n",
345 |        "      <td>n/a</td>\n",
346 |        "      <td>| where TimeGenerated &gt; ago(14d)\\n| where User...</td>\n",
347 |        "    </tr>\n",
348 |        "    <tr>\n",
349 |        "      <th>6</th>\n",
350 |        "      <td>n/a</td>\n",
351 |        "      <td>That is how you build queries, now the basics....</td>\n",
352 |        "    </tr>\n",
353 |        "    <tr>\n",
354 |        "      <th>7</th>\n",
355 |        "      <td>n/a</td>\n",
356 |        "      <td>\\n```kql\\nSigninLogs</td>\n",
357 |        "    </tr>\n",
358 |        "  </tbody>\n",
359 |        "</table>\n",
360 |        "<p>100 rows × 2 columns</p>\n",
361 |        "</div>"
362 |       ],
363 |       "text/plain": [
364 |        "   title                                          kql_query\n",
365 |        "0    n/a  SigninLogs\\n| where TimeGenerated > ago(14d)\\n...\n",
366 |        "1    n/a                                    SigninLogs\\n```\n",
367 |        "2    n/a  SigninLogs\\n| where TimeGenerated > ago(14d)\\n```\n",
368 |        "3    n/a  SigninLogs\\n| where TimeGenerated > ago(14d)\\n...\n",
369 |        "4    n/a  SigninLogs\\n| where TimeGenerated > ago(14d)\\n...\n",
370 |        "..   ...                                                ...\n",
371 |        "3    n/a  Microsoft Sentinel will then run through your ...\n",
372 |        "4    n/a  | where AppDisplayName == \"Microsoft Teams\"\\n`...\n",
373 |        "5    n/a  | where TimeGenerated > ago(14d)\\n| where User...\n",
374 |        "6    n/a  That is how you build queries, now the basics....\n",
375 |        "7    n/a                               \\n```kql\\nSigninLogs\n",
376 |        "\n",
377 |        "[100 rows x 2 columns]"
378 |       ]
379 |      },
380 |      "metadata": {},
381 |      "output_type": "display_data"
382 |     }
383 |    ],
384 |    "source": [
385 |     "display(md_queries)"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": 230,
391 |    "id": "bc55367d",
392 |    "metadata": {},
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/html": [
397 |        "<div>\n",
398 |        "<style scoped>\n",
399 |        "    .dataframe tbody tr th:only-of-type {\n",
400 |        "        vertical-align: middle;\n",
401 |        "    }\n",
402 |        "\n",
403 |        "    .dataframe tbody tr th {\n",
404 |        "        vertical-align: top;\n",
405 |        "    }\n",
406 |        "\n",
407 |        "    .dataframe thead th {\n",
408 |        "        text-align: right;\n",
409 |        "    }\n",
410 |        "</style>\n",
411 |        "<table border=\"1\" class=\"dataframe\">\n",
412 |        "  <thead>\n",
413 |        "    <tr style=\"text-align: right;\">\n",
414 |        "      <th></th>\n",
415 |        "      <th>title</th>\n",
416 |        "      <th>kql_query</th>\n",
417 |        "    </tr>\n",
418 |        "  </thead>\n",
419 |        "  <tbody>\n",
420 |        "    <tr>\n",
421 |        "      <th>0</th>\n",
422 |        "      <td>### Detection Query (User as actor)</td>\n",
423 |        "      <td>\\nWe want to use KQL to create accurate and ef...</td>\n",
424 |        "    </tr>\n",
425 |        "    <tr>\n",
426 |        "      <th>1</th>\n",
427 |        "      <td>### Detection Query (User as actor)</td>\n",
428 |        "      <td>So first we have chosen our SigninLogs table.\\...</td>\n",
429 |        "    </tr>\n",
430 |        "    <tr>\n",
431 |        "      <th>2</th>\n",
432 |        "      <td>### Detection Query (User as actor)</td>\n",
433 |        "      <td>Then we look for only logs where the ResultTyp...</td>\n",
434 |        "    </tr>\n",
435 |        "    <tr>\n",
436 |        "      <th>3</th>\n",
437 |        "      <td>### Detection Query (Service principal as actor)</td>\n",
438 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
439 |        "    </tr>\n",
440 |        "    <tr>\n",
441 |        "      <th>4</th>\n",
442 |        "      <td>### Detection Query (User as actor)</td>\n",
443 |        "      <td>```\\n\\nIs much more efficient than searching f...</td>\n",
444 |        "    </tr>\n",
445 |        "    <tr>\n",
446 |        "      <th>5</th>\n",
447 |        "      <td>### Detection Query (Service principal as actor)</td>\n",
448 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
449 |        "    </tr>\n",
450 |        "    <tr>\n",
451 |        "      <th>6</th>\n",
452 |        "      <td>### Detection Query (User as actor)</td>\n",
453 |        "      <td>SigninLogs\\n| where TimeGenerated between (ago...</td>\n",
454 |        "    </tr>\n",
455 |        "    <tr>\n",
456 |        "      <th>7</th>\n",
457 |        "      <td>### Detection Query (Service principal as actor)</td>\n",
458 |        "      <td>SigninLogs\\n| where TimeGenerated between (ago...</td>\n",
459 |        "    </tr>\n",
460 |        "    <tr>\n",
461 |        "      <th>8</th>\n",
462 |        "      <td>### Detection Query (User as actor)</td>\n",
463 |        "      <td>\\nInstead of equals, we can also use contains....</td>\n",
464 |        "    </tr>\n",
465 |        "    <tr>\n",
466 |        "      <th>9</th>\n",
467 |        "      <td>### Detection Query (Service principal as actor)</td>\n",
468 |        "      <td>```kql\\nSigninLogs\\n| where TimeGenerated &gt; ag...</td>\n",
469 |        "    </tr>\n",
470 |        "    <tr>\n",
471 |        "      <th>10</th>\n",
472 |        "      <td>### Detection Query (User as actor)</td>\n",
473 |        "      <td>If you are searching for multiple words you ca...</td>\n",
474 |        "    </tr>\n",
475 |        "    <tr>\n",
476 |        "      <th>11</th>\n",
477 |        "      <td>### Detection Query (Service principal as actor)</td>\n",
478 |        "      <td>| where AppDisplayName has_all (\"Teams\",\"Outlo...</td>\n",
479 |        "    </tr>\n",
480 |        "    <tr>\n",
481 |        "      <th>12</th>\n",
482 |        "      <td>### Detection Query (User as actor, user as ta...</td>\n",
483 |        "      <td>This query would find all SigninLogs where the...</td>\n",
484 |        "    </tr>\n",
485 |        "    <tr>\n",
486 |        "      <th>13</th>\n",
487 |        "      <td>### Detection Query (User as actor, service pr...</td>\n",
488 |        "      <td>\\nThis query would find SigninLogs where the a...</td>\n",
489 |        "    </tr>\n",
490 |        "    <tr>\n",
491 |        "      <th>14</th>\n",
492 |        "      <td>### Detection Query (Service principal as acto...</td>\n",
493 |        "      <td>This query searches for SigninLogs data from t...</td>\n",
494 |        "    </tr>\n",
495 |        "    <tr>\n",
496 |        "      <th>15</th>\n",
497 |        "      <td>### Detection Query (Service Principal as acto...</td>\n",
498 |        "      <td>\\nThis returns the same data, but changes the ...</td>\n",
499 |        "    </tr>\n",
500 |        "    <tr>\n",
501 |        "      <th>16</th>\n",
502 |        "      <td>### Detection Query (User as actor, user as ta...</td>\n",
503 |        "      <td>```\\n\\nThis query will look up the SigninLogs ...</td>\n",
504 |        "    </tr>\n",
505 |        "    <tr>\n",
506 |        "      <th>17</th>\n",
507 |        "      <td>### Detection Query (User as actor, Service Pr...</td>\n",
508 |        "      <td>\\nInstead of a total count, you can summarize ...</td>\n",
509 |        "    </tr>\n",
510 |        "    <tr>\n",
511 |        "      <th>18</th>\n",
512 |        "      <td>### Detection Query (Service Principal as acto...</td>\n",
513 |        "      <td>| where TimeGenerated &gt; ago(14d)\\n| where User...</td>\n",
514 |        "    </tr>\n",
515 |        "    <tr>\n",
516 |        "      <th>19</th>\n",
517 |        "      <td>### Detection Query (Service principal as acto...</td>\n",
518 |        "      <td>\\nThis is the same but returns the oldest reco...</td>\n",
519 |        "    </tr>\n",
520 |        "    <tr>\n",
521 |        "      <th>20</th>\n",
522 |        "      <td>### Detection Query (User as actor, user as ta...</td>\n",
523 |        "      <td>```\\n\\nThis returns the same data as our first...</td>\n",
524 |        "    </tr>\n",
525 |        "    <tr>\n",
526 |        "      <th>21</th>\n",
527 |        "      <td>### Detection Query (User as actor, service pr...</td>\n",
528 |        "      <td>This is a combination of our countif and bin f...</td>\n",
529 |        "    </tr>\n",
530 |        "    <tr>\n",
531 |        "      <th>22</th>\n",
532 |        "      <td>### Detection Query (Service principal as acto...</td>\n",
533 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
534 |        "    </tr>\n",
535 |        "    <tr>\n",
536 |        "      <th>23</th>\n",
537 |        "      <td>### Detection Query (Service principal as acto...</td>\n",
538 |        "      <td>If we use our same example from our Signinlogs...</td>\n",
539 |        "    </tr>\n",
540 |        "    <tr>\n",
541 |        "      <th>24</th>\n",
542 |        "      <td>### Detection Query (User as actor, user as ta...</td>\n",
543 |        "      <td>\\nOr a barchart.\\n\\n```kql\\nSigninLogs\\n| wher...</td>\n",
544 |        "    </tr>\n",
545 |        "    <tr>\n",
546 |        "      <th>25</th>\n",
547 |        "      <td>### Detection Query (User as actor, service pr...</td>\n",
548 |        "      <td>\\n```kql\\nSigninLogs\\n| where TimeGenerated &gt; ...</td>\n",
549 |        "    </tr>\n",
550 |        "    <tr>\n",
551 |        "      <th>26</th>\n",
552 |        "      <td>### Detection Query (Service principal as acto...</td>\n",
553 |        "      <td>```kql\\nSigninLogs\\n| where TimeGenerated &gt; ag...</td>\n",
554 |        "    </tr>\n",
555 |        "    <tr>\n",
556 |        "      <th>27</th>\n",
557 |        "      <td>### Detection Query (Service Principal as acto...</td>\n",
558 |        "      <td>| where TimeGenerated &gt; ago(14d)\\n| where User...</td>\n",
559 |        "    </tr>\n",
560 |        "    <tr>\n",
561 |        "      <th>28</th>\n",
562 |        "      <td>### Detection Query (User as actor, user as ta...</td>\n",
563 |        "      <td>| where TimeGenerated &gt; ago(14d)\\n| where User...</td>\n",
564 |        "    </tr>\n",
565 |        "    <tr>\n",
566 |        "      <th>29</th>\n",
567 |        "      <td>### Detection Query (User as actor, Service Pr...</td>\n",
568 |        "      <td>SigninLogs\\n| where TimeGenerated &gt; ago(14d)\\n...</td>\n",
569 |        "    </tr>\n",
570 |        "    <tr>\n",
571 |        "      <th>30</th>\n",
572 |        "      <td>### Detection Query (Service Principal as acto...</td>\n",
573 |        "      <td>| where UserPrincipalName == \"reprise_99@testd...</td>\n",
574 |        "    </tr>\n",
575 |        "    <tr>\n",
576 |        "      <th>31</th>\n",
577 |        "      <td>### Detection Query (Service Principal as acto...</td>\n",
578 |        "      <td>```\\n\\nThis query searches all signins to your...</td>\n",
579 |        "    </tr>\n",
580 |        "  </tbody>\n",
581 |        "</table>\n",
582 |        "</div>"
583 |       ],
584 |       "text/plain": [
585 |        "                                                title  \\\n",
586 |        "0                 ### Detection Query (User as actor)   \n",
587 |        "1                 ### Detection Query (User as actor)   \n",
588 |        "2                 ### Detection Query (User as actor)   \n",
589 |        "3    ### Detection Query (Service principal as actor)   \n",
590 |        "4                 ### Detection Query (User as actor)   \n",
591 |        "5    ### Detection Query (Service principal as actor)   \n",
592 |        "6                 ### Detection Query (User as actor)   \n",
593 |        "7    ### Detection Query (Service principal as actor)   \n",
594 |        "8                 ### Detection Query (User as actor)   \n",
595 |        "9    ### Detection Query (Service principal as actor)   \n",
596 |        "10                ### Detection Query (User as actor)   \n",
597 |        "11   ### Detection Query (Service principal as actor)   \n",
598 |        "12  ### Detection Query (User as actor, user as ta...   \n",
599 |        "13  ### Detection Query (User as actor, service pr...   \n",
600 |        "14  ### Detection Query (Service principal as acto...   \n",
601 |        "15  ### Detection Query (Service Principal as acto...   \n",
602 |        "16  ### Detection Query (User as actor, user as ta...   \n",
603 |        "17  ### Detection Query (User as actor, Service Pr...   \n",
604 |        "18  ### Detection Query (Service Principal as acto...   \n",
605 |        "19  ### Detection Query (Service principal as acto...   \n",
606 |        "20  ### Detection Query (User as actor, user as ta...   \n",
607 |        "21  ### Detection Query (User as actor, service pr...   \n",
608 |        "22  ### Detection Query (Service principal as acto...   \n",
609 |        "23  ### Detection Query (Service principal as acto...   \n",
610 |        "24  ### Detection Query (User as actor, user as ta...   \n",
611 |        "25  ### Detection Query (User as actor, service pr...   \n",
612 |        "26  ### Detection Query (Service principal as acto...   \n",
613 |        "27  ### Detection Query (Service Principal as acto...   \n",
614 |        "28  ### Detection Query (User as actor, user as ta...   \n",
615 |        "29  ### Detection Query (User as actor, Service Pr...   \n",
616 |        "30  ### Detection Query (Service Principal as acto...   \n",
617 |        "31  ### Detection Query (Service Principal as acto...   \n",
618 |        "\n",
619 |        "                                            kql_query  \n",
620 |        "0   \\nWe want to use KQL to create accurate and ef...  \n",
621 |        "1   So first we have chosen our SigninLogs table.\\...  \n",
622 |        "2   Then we look for only logs where the ResultTyp...  \n",
623 |        "3   SigninLogs\\n| where TimeGenerated > ago(14d)\\n...  \n",
624 |        "4   ```\\n\\nIs much more efficient than searching f...  \n",
625 |        "5   SigninLogs\\n| where TimeGenerated > ago(14d)\\n...  \n",
626 |        "6   SigninLogs\\n| where TimeGenerated between (ago...  \n",
627 |        "7   SigninLogs\\n| where TimeGenerated between (ago...  \n",
628 |        "8   \\nInstead of equals, we can also use contains....  \n",
629 |        "9   ```kql\\nSigninLogs\\n| where TimeGenerated > ag...  \n",
630 |        "10  If you are searching for multiple words you ca...  \n",
631 |        "11  | where AppDisplayName has_all (\"Teams\",\"Outlo...  \n",
632 |        "12  This query would find all SigninLogs where the...  \n",
633 |        "13  \\nThis query would find SigninLogs where the a...  \n",
634 |        "14  This query searches for SigninLogs data from t...  \n",
635 |        "15  \\nThis returns the same data, but changes the ...  \n",
636 |        "16  ```\\n\\nThis query will look up the SigninLogs ...  \n",
637 |        "17  \\nInstead of a total count, you can summarize ...  \n",
638 |        "18  | where TimeGenerated > ago(14d)\\n| where User...  \n",
639 |        "19  \\nThis is the same but returns the oldest reco...  \n",
640 |        "20  ```\\n\\nThis returns the same data as our first...  \n",
641 |        "21  This is a combination of our countif and bin f...  \n",
642 |        "22  SigninLogs\\n| where TimeGenerated > ago(14d)\\n...  \n",
643 |        "23  If we use our same example from our Signinlogs...  \n",
644 |        "24  \\nOr a barchart.\\n\\n```kql\\nSigninLogs\\n| wher...  \n",
645 |        "25  \\n```kql\\nSigninLogs\\n| where TimeGenerated > ...  \n",
646 |        "26  ```kql\\nSigninLogs\\n| where TimeGenerated > ag...  \n",
647 |        "27  | where TimeGenerated > ago(14d)\\n| where User...  \n",
648 |        "28  | where TimeGenerated > ago(14d)\\n| where User...  \n",
649 |        "29  SigninLogs\\n| where TimeGenerated > ago(14d)\\n...  \n",
650 |        "30  | where UserPrincipalName == \"reprise_99@testd...  \n",
651 |        "31  ```\\n\\nThis query searches all signins to your...  "
652 |       ]
653 |      },
654 |      "execution_count": 230,
655 |      "metadata": {},
656 |      "output_type": "execute_result"
657 |     }
658 |    ],
659 |    "source": [
660 |     "md_queries[md_queries['title'] != 'n/a']"
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "code",
665 |    "execution_count": null,
666 |    "id": "59cc5e3e-5b37-4759-a962-902f7049b861",
667 |    "metadata": {},
668 |    "outputs": [],
669 |    "source": [
670 |     "tmp_path = str(Path.cwd())\n",
671 |     "csv_files = glob.glob(os.path.join(path, \"*.csv\"))"
672 |    ]
673 |   },
674 |   {
675 |    "cell_type": "code",
676 |    "execution_count": null,
677 |    "id": "22d2c705",
678 |    "metadata": {},
679 |    "outputs": [],
680 |    "source": []
681 |   }
682 |  ],
683 |  "metadata": {
684 |   "kernelspec": {
685 |    "display_name": "Python 3 (ipykernel)",
686 |    "language": "python",
687 |    "name": "python3"
688 |   },
689 |   "language_info": {
690 |    "codemirror_mode": {
691 |     "name": "ipython",
692 |     "version": 3
693 |    },
694 |    "file_extension": ".py",
695 |    "mimetype": "text/x-python",
696 |    "name": "python",
697 |    "nbconvert_exporter": "python",
698 |    "pygments_lexer": "ipython3",
699 |    "version": "3.8.13"
700 |   }
701 |  },
702 |  "nbformat": 4,
703 |  "nbformat_minor": 5
704 | }
705 | 


--------------------------------------------------------------------------------
/dev-notebooks/Kqlquery-schema.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/dev-notebooks/Kqlquery-schema.png


--------------------------------------------------------------------------------
/dev-notebooks/az-monitor-schemas.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 18,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Reading schemas for 11 tables...\n"
 13 |      ]
 14 |     },
 15 |     {
 16 |      "name": "stderr",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "  9%|▉         | 1/11 [00:00<00:02,  3.41it/s]"
 20 |      ]
 21 |     },
 22 |     {
 23 |      "name": "stdout",
 24 |      "output_type": "stream",
 25 |      "text": [
 26 |       "SecurityAlert Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
 27 |      ]
 28 |     },
 29 |     {
 30 |      "name": "stderr",
 31 |      "output_type": "stream",
 32 |      "text": [
 33 |       " 18%|█▊        | 2/11 [00:00<00:03,  2.49it/s]"
 34 |      ]
 35 |     },
 36 |     {
 37 |      "name": "stdout",
 38 |      "output_type": "stream",
 39 |      "text": [
 40 |       "SecurityBaseline Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
 41 |      ]
 42 |     },
 43 |     {
 44 |      "name": "stderr",
 45 |      "output_type": "stream",
 46 |      "text": [
 47 |       " 27%|██▋       | 3/11 [00:01<00:03,  2.63it/s]"
 48 |      ]
 49 |     },
 50 |     {
 51 |      "name": "stdout",
 52 |      "output_type": "stream",
 53 |      "text": [
 54 |       "SecurityBaselineSummary Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
 55 |      ]
 56 |     },
 57 |     {
 58 |      "name": "stderr",
 59 |      "output_type": "stream",
 60 |      "text": [
 61 |       " 36%|███▋      | 4/11 [00:01<00:02,  2.79it/s]"
 62 |      ]
 63 |     },
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "SecurityDetection Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
 69 |      ]
 70 |     },
 71 |     {
 72 |      "name": "stderr",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       " 45%|████▌     | 5/11 [00:01<00:01,  3.21it/s]"
 76 |      ]
 77 |     },
 78 |     {
 79 |      "name": "stdout",
 80 |      "output_type": "stream",
 81 |      "text": [
 82 |       "SecurityEvent Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
 83 |      ]
 84 |     },
 85 |     {
 86 |      "name": "stderr",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       " 64%|██████▎   | 7/11 [00:02<00:01,  3.85it/s]"
 90 |      ]
 91 |     },
 92 |     {
 93 |      "name": "stdout",
 94 |      "output_type": "stream",
 95 |      "text": [
 96 |       "SecurityIoTRawEvent Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n",
 97 |       "SecurityRecommendation Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
 98 |      ]
 99 |     },
100 |     {
101 |      "name": "stderr",
102 |      "output_type": "stream",
103 |      "text": [
104 |       " 73%|███████▎  | 8/11 [00:02<00:00,  3.95it/s]"
105 |      ]
106 |     },
107 |     {
108 |      "name": "stdout",
109 |      "output_type": "stream",
110 |      "text": [
111 |       "SentinelAudit Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
112 |      ]
113 |     },
114 |     {
115 |      "name": "stderr",
116 |      "output_type": "stream",
117 |      "text": [
118 |       " 82%|████████▏ | 9/11 [00:02<00:00,  3.85it/s]"
119 |      ]
120 |     },
121 |     {
122 |      "name": "stdout",
123 |      "output_type": "stream",
124 |      "text": [
125 |       "SentinelHealth Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
126 |      ]
127 |     },
128 |     {
129 |      "name": "stderr",
130 |      "output_type": "stream",
131 |      "text": [
132 |       " 91%|█████████ | 10/11 [00:02<00:00,  3.77it/s]"
133 |      ]
134 |     },
135 |     {
136 |      "name": "stdout",
137 |      "output_type": "stream",
138 |      "text": [
139 |       "SigninLogs Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
140 |      ]
141 |     },
142 |     {
143 |      "name": "stderr",
144 |      "output_type": "stream",
145 |      "text": [
146 |       "100%|██████████| 11/11 [00:03<00:00,  3.47it/s]"
147 |      ]
148 |     },
149 |     {
150 |      "name": "stdout",
151 |      "output_type": "stream",
152 |      "text": [
153 |       "Syslog Index(['Column', 'Type', 'Description', 'Table', 'Url'], dtype='object')\n"
154 |      ]
155 |     },
156 |     {
157 |      "name": "stderr",
158 |      "output_type": "stream",
159 |      "text": [
160 |       "\n"
161 |      ]
162 |     }
163 |    ],
164 |    "source": [
165 |     "from typing import Dict\n",
166 |     "import pandas as pd\n",
167 |     "import requests\n",
168 |     "\n",
169 |     "import bs4\n",
170 |     "from tqdm.auto import tqdm\n",
171 |     "\n",
172 |     "SCHEMA_CATS_URL = \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category\"\n",
173 |     "\n",
174 |     "def fetch_az_mon_categories() -> requests.models.Response:\n",
175 |     "    \"\"\"Return the AzMonitor reference page.\"\"\"\n",
176 |     "    return requests.get(SCHEMA_CATS_URL)\n",
177 |     "\n",
178 |     "\n",
179 |     "def get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag:\n",
180 |     "    \"\"\"Extract the list after the security header.\"\"\"\n",
181 |     "    soup = bs4.BeautifulSoup(resp.text, \"html.parser\")\n",
182 |     "\n",
183 |     "    result = soup.find(\"div\", class_=\"content\")\n",
184 |     "    sec_header =result.find(\"h2\", id=\"security\")\n",
185 |     "    return sec_header.find_next_sibling()\n",
186 |     "\n",
187 |     "\n",
188 |     "def build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]:\n",
189 |     "    \"\"\"From the html list, build an index of URLs.\"\"\"\n",
190 |     "    table_prefix = \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}\"\n",
191 |     "    return {\n",
192 |     "        item.a.contents[0]: {\n",
193 |     "            \"href\": item.a.attrs.get(\"href\"),\n",
194 |     "            \"url\": table_prefix.format(**(item.a.attrs)),\n",
195 |     "        }\n",
196 |     "        for item in security_cat_list.find_all(\"li\")\n",
197 |     "    }\n",
198 |     "\n",
199 |     "\n",
200 |     "def read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame:\n",
201 |     "    \"\"\"Read table schema from a URL.\"\"\"\n",
202 |     "    table_data = pd.read_html(ref[\"url\"])[0]\n",
203 |     "    table_data[\"Table\"] = table\n",
204 |     "    table_data[\"Url\"] = ref[\"url\"]\n",
205 |     "    print(table, table_data.columns)\n",
206 |     "    return table_data\n",
207 |     "\n",
208 |     "\n",
209 |     "def fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame:\n",
210 |     "    \"\"\"Combine schema tables into single DF.\"\"\"\n",
211 |     "    print(f\"Reading schemas for {len(sec_url_dict)} tables...\")\n",
212 |     "    all_tables = [\n",
213 |     "        read_table_from_url(table, ref)\n",
214 |     "        for table, ref in tqdm(sec_url_dict.items())\n",
215 |     "    ]\n",
216 |     "    return pd.concat(all_tables, ignore_index=True)\n",
217 |     "\n",
218 |     "\n",
219 |     "\n",
220 |     "sec_cat_list = get_security_category_list(fetch_az_mon_categories())\n",
221 |     "sec_url_dict = build_table_index(sec_cat_list)\n",
222 |     "sec_url_dict = {key: val for key, val in sec_url_dict.items() if key.startswith(\"S\")}\n",
223 |     "comb_tables = fetch_table_schemas(sec_url_dict)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 3,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "ename": "NameError",
233 |      "evalue": "name 'comb_tables' is not defined",
234 |      "output_type": "error",
235 |      "traceback": [
236 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
237 |       "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
238 |       "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_640980\\1382993768.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcomb_tables\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
239 |       "\u001b[1;31mNameError\u001b[0m: name 'comb_tables' is not defined"
240 |      ]
241 |     }
242 |    ],
243 |    "source": [
244 |     "comb_tables.head()"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 41,
250 |    "metadata": {},
251 |    "outputs": [
252 |     {
253 |      "data": {
254 |       "text/plain": [
255 |        "'{\"SecurityAlert\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityalert\", \"schema\": {\"Column\": \"AlertLink\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityBaseline\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitybaseline\", \"schema\": {\"Column\": \"ActualResult\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityBaselineSummary\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitybaselinesummary\", \"schema\": {\"Column\": \"AssessmentId\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityDetection\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securitydetection\", \"schema\": {\"Column\": \"AccountsSeen\", \"Type\": \"int\", \"Description\": NaN}}, \"SecurityEvent\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityevent\", \"schema\": {\"Column\": \"AccessMask\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityIoTRawEvent\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityiotrawevent\", \"schema\": {\"Column\": \"AgentVersion\", \"Type\": \"string\", \"Description\": NaN}}, \"SecurityRecommendation\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/securityrecommendation\", \"schema\": {\"Column\": \"AssessedResourceId\", \"Type\": \"string\", \"Description\": NaN}}, \"SentinelAudit\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/sentinelaudit\", \"schema\": {\"Column\": \"CorrelationId\", \"Type\": \"string\", \"Description\": \"A unique record identifier.\"}}, \"SentinelHealth\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/sentinelhealth\", \"schema\": {\"Column\": \"Description\", \"Type\": \"string\", \"Description\": \"The operation description.\"}}, \"SigninLogs\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/signinlogs\", \"schema\": {\"Column\": \"AADTenantId\", \"Type\": \"string\", \"Description\": NaN}}, \"Syslog\": {\"url\": \"https://learn.microsoft.com/azure/azure-monitor/reference/tables/syslog\", \"schema\": {\"Column\": \"Computer\", \"Type\": \"string\", \"Description\": \"Computer that the event was collected from.\"}}}'"
256 |       ]
257 |      },
258 |      "metadata": {},
259 |      "output_type": "display_data"
260 |     },
261 |     {
262 |      "data": {
263 |       "text/html": [
264 |        "<div>\n",
265 |        "<style scoped>\n",
266 |        "    .dataframe tbody tr th:only-of-type {\n",
267 |        "        vertical-align: middle;\n",
268 |        "    }\n",
269 |        "\n",
270 |        "    .dataframe tbody tr th {\n",
271 |        "        vertical-align: top;\n",
272 |        "    }\n",
273 |        "\n",
274 |        "    .dataframe thead th {\n",
275 |        "        text-align: right;\n",
276 |        "    }\n",
277 |        "</style>\n",
278 |        "<table border=\"1\" class=\"dataframe\">\n",
279 |        "  <thead>\n",
280 |        "    <tr style=\"text-align: right;\">\n",
281 |        "      <th></th>\n",
282 |        "      <th>url</th>\n",
283 |        "      <th>schema</th>\n",
284 |        "    </tr>\n",
285 |        "  </thead>\n",
286 |        "  <tbody>\n",
287 |        "    <tr>\n",
288 |        "      <th>SecurityAlert</th>\n",
289 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
290 |        "      <td>{'Column': 'AlertLink', 'Type': 'string', 'Des...</td>\n",
291 |        "    </tr>\n",
292 |        "    <tr>\n",
293 |        "      <th>SecurityBaseline</th>\n",
294 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
295 |        "      <td>{'Column': 'ActualResult', 'Type': 'string', '...</td>\n",
296 |        "    </tr>\n",
297 |        "    <tr>\n",
298 |        "      <th>SecurityBaselineSummary</th>\n",
299 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
300 |        "      <td>{'Column': 'AssessmentId', 'Type': 'string', '...</td>\n",
301 |        "    </tr>\n",
302 |        "    <tr>\n",
303 |        "      <th>SecurityDetection</th>\n",
304 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
305 |        "      <td>{'Column': 'AccountsSeen', 'Type': 'int', 'Des...</td>\n",
306 |        "    </tr>\n",
307 |        "    <tr>\n",
308 |        "      <th>SecurityEvent</th>\n",
309 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
310 |        "      <td>{'Column': 'AccessMask', 'Type': 'string', 'De...</td>\n",
311 |        "    </tr>\n",
312 |        "    <tr>\n",
313 |        "      <th>SecurityIoTRawEvent</th>\n",
314 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
315 |        "      <td>{'Column': 'AgentVersion', 'Type': 'string', '...</td>\n",
316 |        "    </tr>\n",
317 |        "    <tr>\n",
318 |        "      <th>SecurityRecommendation</th>\n",
319 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
320 |        "      <td>{'Column': 'AssessedResourceId', 'Type': 'stri...</td>\n",
321 |        "    </tr>\n",
322 |        "    <tr>\n",
323 |        "      <th>SentinelAudit</th>\n",
324 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
325 |        "      <td>{'Column': 'CorrelationId', 'Type': 'string', ...</td>\n",
326 |        "    </tr>\n",
327 |        "    <tr>\n",
328 |        "      <th>SentinelHealth</th>\n",
329 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
330 |        "      <td>{'Column': 'Description', 'Type': 'string', 'D...</td>\n",
331 |        "    </tr>\n",
332 |        "    <tr>\n",
333 |        "      <th>SigninLogs</th>\n",
334 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
335 |        "      <td>{'Column': 'AADTenantId', 'Type': 'string', 'D...</td>\n",
336 |        "    </tr>\n",
337 |        "    <tr>\n",
338 |        "      <th>Syslog</th>\n",
339 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
340 |        "      <td>{'Column': 'Computer', 'Type': 'string', 'Desc...</td>\n",
341 |        "    </tr>\n",
342 |        "  </tbody>\n",
343 |        "</table>\n",
344 |        "</div>"
345 |       ],
346 |       "text/plain": [
347 |        "                                                                       url  \\\n",
348 |        "SecurityAlert            https://learn.microsoft.com/azure/azure-monito...   \n",
349 |        "SecurityBaseline         https://learn.microsoft.com/azure/azure-monito...   \n",
350 |        "SecurityBaselineSummary  https://learn.microsoft.com/azure/azure-monito...   \n",
351 |        "SecurityDetection        https://learn.microsoft.com/azure/azure-monito...   \n",
352 |        "SecurityEvent            https://learn.microsoft.com/azure/azure-monito...   \n",
353 |        "SecurityIoTRawEvent      https://learn.microsoft.com/azure/azure-monito...   \n",
354 |        "SecurityRecommendation   https://learn.microsoft.com/azure/azure-monito...   \n",
355 |        "SentinelAudit            https://learn.microsoft.com/azure/azure-monito...   \n",
356 |        "SentinelHealth           https://learn.microsoft.com/azure/azure-monito...   \n",
357 |        "SigninLogs               https://learn.microsoft.com/azure/azure-monito...   \n",
358 |        "Syslog                   https://learn.microsoft.com/azure/azure-monito...   \n",
359 |        "\n",
360 |        "                                                                    schema  \n",
361 |        "SecurityAlert            {'Column': 'AlertLink', 'Type': 'string', 'Des...  \n",
362 |        "SecurityBaseline         {'Column': 'ActualResult', 'Type': 'string', '...  \n",
363 |        "SecurityBaselineSummary  {'Column': 'AssessmentId', 'Type': 'string', '...  \n",
364 |        "SecurityDetection        {'Column': 'AccountsSeen', 'Type': 'int', 'Des...  \n",
365 |        "SecurityEvent            {'Column': 'AccessMask', 'Type': 'string', 'De...  \n",
366 |        "SecurityIoTRawEvent      {'Column': 'AgentVersion', 'Type': 'string', '...  \n",
367 |        "SecurityRecommendation   {'Column': 'AssessedResourceId', 'Type': 'stri...  \n",
368 |        "SentinelAudit            {'Column': 'CorrelationId', 'Type': 'string', ...  \n",
369 |        "SentinelHealth           {'Column': 'Description', 'Type': 'string', 'D...  \n",
370 |        "SigninLogs               {'Column': 'AADTenantId', 'Type': 'string', 'D...  \n",
371 |        "Syslog                   {'Column': 'Computer', 'Type': 'string', 'Desc...  "
372 |       ]
373 |      },
374 |      "metadata": {},
375 |      "output_type": "display_data"
376 |     },
377 |     {
378 |      "data": {
379 |       "text/html": [
380 |        "<div>\n",
381 |        "<style scoped>\n",
382 |        "    .dataframe tbody tr th:only-of-type {\n",
383 |        "        vertical-align: middle;\n",
384 |        "    }\n",
385 |        "\n",
386 |        "    .dataframe tbody tr th {\n",
387 |        "        vertical-align: top;\n",
388 |        "    }\n",
389 |        "\n",
390 |        "    .dataframe thead th {\n",
391 |        "        text-align: right;\n",
392 |        "    }\n",
393 |        "</style>\n",
394 |        "<table border=\"1\" class=\"dataframe\">\n",
395 |        "  <thead>\n",
396 |        "    <tr style=\"text-align: right;\">\n",
397 |        "      <th></th>\n",
398 |        "      <th>SecurityAlert.url</th>\n",
399 |        "      <th>SecurityAlert.schema.Column</th>\n",
400 |        "      <th>SecurityAlert.schema.Type</th>\n",
401 |        "      <th>SecurityAlert.schema.Description</th>\n",
402 |        "      <th>SecurityBaseline.url</th>\n",
403 |        "      <th>SecurityBaseline.schema.Column</th>\n",
404 |        "      <th>SecurityBaseline.schema.Type</th>\n",
405 |        "      <th>SecurityBaseline.schema.Description</th>\n",
406 |        "      <th>SecurityBaselineSummary.url</th>\n",
407 |        "      <th>SecurityBaselineSummary.schema.Column</th>\n",
408 |        "      <th>...</th>\n",
409 |        "      <th>SentinelHealth.schema.Type</th>\n",
410 |        "      <th>SentinelHealth.schema.Description</th>\n",
411 |        "      <th>SigninLogs.url</th>\n",
412 |        "      <th>SigninLogs.schema.Column</th>\n",
413 |        "      <th>SigninLogs.schema.Type</th>\n",
414 |        "      <th>SigninLogs.schema.Description</th>\n",
415 |        "      <th>Syslog.url</th>\n",
416 |        "      <th>Syslog.schema.Column</th>\n",
417 |        "      <th>Syslog.schema.Type</th>\n",
418 |        "      <th>Syslog.schema.Description</th>\n",
419 |        "    </tr>\n",
420 |        "  </thead>\n",
421 |        "  <tbody>\n",
422 |        "    <tr>\n",
423 |        "      <th>0</th>\n",
424 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
425 |        "      <td>AlertLink</td>\n",
426 |        "      <td>string</td>\n",
427 |        "      <td>NaN</td>\n",
428 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
429 |        "      <td>ActualResult</td>\n",
430 |        "      <td>string</td>\n",
431 |        "      <td>NaN</td>\n",
432 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
433 |        "      <td>AssessmentId</td>\n",
434 |        "      <td>...</td>\n",
435 |        "      <td>string</td>\n",
436 |        "      <td>The operation description.</td>\n",
437 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
438 |        "      <td>AADTenantId</td>\n",
439 |        "      <td>string</td>\n",
440 |        "      <td>NaN</td>\n",
441 |        "      <td>https://learn.microsoft.com/azure/azure-monito...</td>\n",
442 |        "      <td>Computer</td>\n",
443 |        "      <td>string</td>\n",
444 |        "      <td>Computer that the event was collected from.</td>\n",
445 |        "    </tr>\n",
446 |        "  </tbody>\n",
447 |        "</table>\n",
448 |        "<p>1 rows × 44 columns</p>\n",
449 |        "</div>"
450 |       ],
451 |       "text/plain": [
452 |        "                                   SecurityAlert.url  \\\n",
453 |        "0  https://learn.microsoft.com/azure/azure-monito...   \n",
454 |        "\n",
455 |        "  SecurityAlert.schema.Column SecurityAlert.schema.Type  \\\n",
456 |        "0                   AlertLink                    string   \n",
457 |        "\n",
458 |        "   SecurityAlert.schema.Description  \\\n",
459 |        "0                               NaN   \n",
460 |        "\n",
461 |        "                                SecurityBaseline.url  \\\n",
462 |        "0  https://learn.microsoft.com/azure/azure-monito...   \n",
463 |        "\n",
464 |        "  SecurityBaseline.schema.Column SecurityBaseline.schema.Type  \\\n",
465 |        "0                   ActualResult                       string   \n",
466 |        "\n",
467 |        "   SecurityBaseline.schema.Description  \\\n",
468 |        "0                                  NaN   \n",
469 |        "\n",
470 |        "                         SecurityBaselineSummary.url  \\\n",
471 |        "0  https://learn.microsoft.com/azure/azure-monito...   \n",
472 |        "\n",
473 |        "  SecurityBaselineSummary.schema.Column  ... SentinelHealth.schema.Type  \\\n",
474 |        "0                          AssessmentId  ...                     string   \n",
475 |        "\n",
476 |        "   SentinelHealth.schema.Description  \\\n",
477 |        "0         The operation description.   \n",
478 |        "\n",
479 |        "                                      SigninLogs.url SigninLogs.schema.Column  \\\n",
480 |        "0  https://learn.microsoft.com/azure/azure-monito...              AADTenantId   \n",
481 |        "\n",
482 |        "  SigninLogs.schema.Type  SigninLogs.schema.Description  \\\n",
483 |        "0                 string                            NaN   \n",
484 |        "\n",
485 |        "                                          Syslog.url Syslog.schema.Column  \\\n",
486 |        "0  https://learn.microsoft.com/azure/azure-monito...             Computer   \n",
487 |        "\n",
488 |        "  Syslog.schema.Type                    Syslog.schema.Description  \n",
489 |        "0             string  Computer that the event was collected from.  \n",
490 |        "\n",
491 |        "[1 rows x 44 columns]"
492 |       ]
493 |      },
494 |      "metadata": {},
495 |      "output_type": "display_data"
496 |     }
497 |    ],
498 |    "source": [
499 |     "t_dict = {}\n",
500 |     "for table, df in comb_tables.groupby(\"Table\"):\n",
501 |     "    url = df.iloc[0][\"Url\"]\n",
502 |     "    t_dict[table] = {\n",
503 |     "        \"url\": url,\n",
504 |     "        \"schema\": df.drop(columns=[\"Table\", \"Url\"]).to_dict(orient=\"records\")[0]\n",
505 |     "    }\n",
506 |     "\n",
507 |     "t_dict\n",
508 |     "import json\n",
509 |     "display(json.dumps(t_dict))\n",
510 |     "display(pd.read_json(json.dumps(t_dict), orient=\"index\"))\n",
511 |     "display(pd.json_normalize(t_dict))"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": 2,
517 |    "metadata": {},
518 |    "outputs": [
519 |     {
520 |      "ename": "NameError",
521 |      "evalue": "name 'comb_tables' is not defined",
522 |      "output_type": "error",
523 |      "traceback": [
524 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
525 |       "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
526 |       "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_640980\\4275042129.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mcomb_tables\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"Table\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
527 |       "\u001b[1;31mNameError\u001b[0m: name 'comb_tables' is not defined"
528 |      ]
529 |     }
530 |    ],
531 |    "source": [
532 |     "comb_tables[\"Table\"].unique()"
533 |    ]
534 |   }
535 |  ],
536 |  "metadata": {
537 |   "kernelspec": {
538 |    "display_name": "Python 3.9.7 ('msticpy')",
539 |    "language": "python",
540 |    "name": "python3"
541 |   },
542 |   "language_info": {
543 |    "codemirror_mode": {
544 |     "name": "ipython",
545 |     "version": 3
546 |    },
547 |    "file_extension": ".py",
548 |    "mimetype": "text/x-python",
549 |    "name": "python",
550 |    "nbconvert_exporter": "python",
551 |    "pygments_lexer": "ipython3",
552 |    "version": "3.9.7"
553 |   },
554 |   "orig_nbformat": 4,
555 |   "vscode": {
556 |    "interpreter": {
557 |     "hash": "0f1a8e166ce5c1ec1911a36e4fdbd34b2f623e2a3442791008b8ac429a1d6070"
558 |    }
559 |   }
560 |  },
561 |  "nbformat": 4,
562 |  "nbformat_minor": 2
563 | }
564 | 


--------------------------------------------------------------------------------
/dev-notebooks/db_schema.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | from sqlalchemy import Column, ForeignKey, Integer, String, Table, Text
  3 | from sqlalchemy.orm import relationship
  4 | from sqlalchemy.ext.declarative import declarative_base
  5 | 
  6 | Base = declarative_base()
  7 | metadata = Base.metadata
  8 | 
  9 | 
 10 | class FieldEntity(Base):
 11 |     __tablename__ = 'FieldEntity'
 12 | 
 13 |     field = Column(String(100), primary_key=True, nullable=False, unique=True)
 14 |     entity = Column(String(100), primary_key=True, nullable=False)
 15 | 
 16 |     querys = relationship('KqlQuery', secondary='QueryField')
 17 | 
 18 | 
 19 | class KqlQuery(Base):
 20 |     __tablename__ = 'KqlQuery'
 21 | 
 22 |     source_path = Column(String(1000), nullable=False)
 23 |     query = Column(Text(10000))
 24 |     name = Column(String(100))
 25 |     query_id = Column(Integer, primary_key=True)
 26 |     local_path = Column(String(1000), nullable=False)
 27 | 
 28 | 
 29 | class QueryAttribute(Base):
 30 |     __tablename__ = 'QueryAttribute'
 31 | 
 32 |     query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False)
 33 |     attribute_name = Column(String(100), primary_key=True, nullable=False)
 34 |     attribute_value = Column(String(1000))
 35 | 
 36 |     query = relationship('KqlQuery')
 37 | 
 38 | 
 39 | # t_QueryField = Table(
 40 | #     'QueryField', metadata,
 41 | #     Column('query_id', ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False),
 42 | #     Column('field', ForeignKey('FieldEntity.field'), primary_key=True, nullable=False, unique=True)
 43 | # )
 44 | class QueryField(Base):
 45 |     __tablename__ = "QueryField"
 46 | 
 47 |     query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False)
 48 |     field = Column(ForeignKey('FieldEntity.field'), primary_key=True, nullable=False, unique=True)
 49 | 
 50 |     query = relationship('KqlQuery')
 51 |     entity = relationship("FieldEntity")
 52 | 
 53 | 
 54 | class QueryFunction(Base):
 55 |     __tablename__ = 'QueryFunction'
 56 | 
 57 |     query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False)
 58 |     function = Column(String(100), primary_key=True, nullable=False)
 59 | 
 60 |     query = relationship('KqlQuery')
 61 | 
 62 | 
 63 | class QueryOperator(Base):
 64 |     __tablename__ = 'QueryOperator'
 65 | 
 66 |     query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False, unique=True)
 67 |     operator = Column(String(100), primary_key=True, nullable=False)
 68 | 
 69 |     query = relationship('KqlQuery', uselist=False)
 70 | 
 71 | 
 72 | class QueryTable(Base):
 73 |     __tablename__ = 'QueryTable'
 74 | 
 75 |     table_name = Column(String(100), primary_key=True, nullable=False, unique=True)
 76 |     query_id = Column(ForeignKey('KqlQuery.query_id'), primary_key=True, nullable=False)
 77 | 
 78 |     query = relationship('KqlQuery')
 79 | 
 80 | 
 81 | class OperatorFieldReference(Base):
 82 |     __tablename__ = 'OperatorFieldReference'
 83 | 
 84 |     query_id = Column(ForeignKey('QueryOperator.query_id'), primary_key=True, nullable=False)
 85 |     field = Column(ForeignKey('QueryField.field'), primary_key=True, nullable=False)
 86 |     operator = Column(String(100), primary_key=True, nullable=False)
 87 | 
 88 |     QueryField = relationship('QueryField')
 89 |     query = relationship('QueryOperator')
 90 | 
 91 | 
 92 | class OperatorTableReference(Base):
 93 |     __tablename__ = 'OperatorTableReference'
 94 | 
 95 |     query_id = Column(ForeignKey('QueryOperator.query_id'), primary_key=True, nullable=False)
 96 |     operator = Column(String(100), primary_key=True, nullable=False)
 97 |     table_name = Column(ForeignKey('QueryTable.table_name'), primary_key=True, nullable=False)
 98 | 
 99 |     query = relationship('QueryOperator')
100 |     QueryTable = relationship('QueryTable')
101 | 


--------------------------------------------------------------------------------
/dev-notebooks/kqlquery.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/dev-notebooks/kqlquery.db


--------------------------------------------------------------------------------
/dev-notebooks/repos.yaml:
--------------------------------------------------------------------------------
1 | - Github:
2 |     branch: main
3 |     repo: reprise99/Sentinel-Queries
4 | - Github:
5 |     branch: main
6 |     repo: ugurkocde/KQL_Intune


--------------------------------------------------------------------------------
/images/DataFlowDiagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/images/DataFlowDiagram.png


--------------------------------------------------------------------------------
/kqlextraction/KqlExtraction/KqlExtraction.cs:
--------------------------------------------------------------------------------
  1 | ﻿using Kusto.Language;
  2 | using Kusto.Language.Symbols;
  3 | using Kusto.Language.Syntax;
  4 | using Kusto.Language.Utils;
  5 | using System.Runtime.CompilerServices;
  6 | using System.Text;
  7 | using System.Text.Json;
  8 | 
  9 | namespace Microsoft.Mstic.KqlQuery.Extraction
 10 | {
 11 |     public class KqlExtractionResult
 12 |     {
 13 |         public string Id { get; set; } = "";
 14 |         public HashSet<string> FunctionCalls { get; set; } = new HashSet<string>();
 15 |         public Dictionary<string, HashSet<string>> Joins { get; set; } = new Dictionary<string, HashSet<string>>();
 16 |         public HashSet<string> Operators { get; set; } = new HashSet<string>();
 17 |         public HashSet<string> Tables { get; set; } = new HashSet<string>();
 18 |     }
 19 | 
 20 |     public class KqlExtraction
 21 |     {
 22 |         public static void Main(string[] args)
 23 |         {
 24 |             string? l = null;
 25 |             while ((l = Console.ReadLine()) != null)
 26 |             {
 27 |                 var kqlQuery = l.Split(',', 2);
 28 | 
 29 |                 var kqlExtractionResult = new KqlExtractionResult();
 30 |                 if (kqlQuery.Length == 2)
 31 |                 {
 32 |                     try
 33 |                     {
 34 |                         kqlExtractionResult.Id = kqlQuery[0];
 35 |                         if (RunExtraction(kqlExtractionResult, Encoding.UTF8.GetString(Convert.FromBase64String(kqlQuery[1]))) == 0)
 36 |                         {
 37 |                             Console.WriteLine(JsonSerializer.Serialize(kqlExtractionResult));
 38 |                         }
 39 |                     }
 40 |                     catch (Exception e)
 41 |                     {
 42 |                         Console.WriteLine("[!] Error: Caught Exception \"{0}\"", e.Message);
 43 |                     }
 44 |                 }
 45 |             }
 46 |         }
 47 | 
 48 |         private static int RunExtraction(KqlExtractionResult kqlExtractionResult, string kql)
 49 |         {
 50 |             try
 51 |             {
 52 |                 var kustoGlobals = GlobalState.Default.WithClusterList(Array.Empty<ClusterSymbol>());
 53 |                 var kqlQuery = KustoCode.ParseAndAnalyze(kql, globals: kustoGlobals);
 54 | 
 55 |                 var syntaxDiagnostics = kqlQuery.GetSyntaxDiagnostics();
 56 |                 if (syntaxDiagnostics.Count > 0)
 57 |                 {
 58 |                     Console.WriteLine("[!] Error: Syntax Error(s)");
 59 |                     foreach (var diagnostic in kqlQuery.GetSyntaxDiagnostics())
 60 |                     {
 61 |                         Console.WriteLine("  > [{0}:{1}] {2}", diagnostic.Start, diagnostic.End, diagnostic.Message);
 62 |                     }
 63 |                     return 1;
 64 |                 }
 65 | 
 66 |                 SyntaxElement.WalkNodes(kqlQuery.Syntax,
 67 |                     n =>
 68 |                     {
 69 |                         string? joinKind = null;
 70 |                         HashSet<string>? joinTarget = null;
 71 | 
 72 |                         if (n is FunctionCallExpression fc)
 73 |                         {
 74 |                             kqlExtractionResult.FunctionCalls.Add(fc.Name.SimpleName);
 75 |                         }
 76 |                         else if (n is NameReference nr)
 77 |                         {
 78 |                             if (nr.RawResultType.Kind == SymbolKind.Table)
 79 |                             {
 80 |                                 kqlExtractionResult.Tables.Add(nr.Name.SimpleName);
 81 |                             }
 82 |                         }
 83 |                         else if (n.NameInParent == "Operator")
 84 |                         {
 85 |                             if (n is JoinOperator jo)
 86 |                             {
 87 |                                 joinKind = "inner";
 88 |                                 joinTarget = new HashSet<string>();
 89 | 
 90 |                                 var kindParameter = jo.Parameters.Where(p => p.Name.SimpleName == "kind");
 91 |                                 if (kindParameter.Count() == 1)
 92 |                                 {
 93 |                                     joinKind = kindParameter.First().Expression.ToString();
 94 |                                 }
 95 | 
 96 |                                 if (jo.Expression is NameReference jonr)
 97 |                                 {
 98 |                                     joinTarget.Add(jonr.SimpleName);
 99 |                                 }
100 |                                 else if (jo.Expression is ParenthesizedExpression jopr)
101 |                                 {
102 |                                     if (jopr.Expression is NameReference joprnr)
103 |                                     {
104 |                                         joinTarget.Add(joprnr.SimpleName);
105 |                                     }
106 |                                 }
107 | 
108 |                                 if (joinTarget.Count() == 0)
109 |                                 {
110 |                                     joinTarget.Add("(...)");
111 |                                 }
112 |                             }
113 |                             else if (n is LookupOperator lo)
114 |                             {
115 |                                 joinKind = "leftouter";
116 |                                 joinTarget = new HashSet<string>();
117 | 
118 |                                 if (lo.Expression is NameReference lonr)
119 |                                 {
120 |                                     joinTarget.Add(lonr.SimpleName);
121 |                                 }
122 |                                 else if (lo.Expression is ParenthesizedExpression lopr)
123 |                                 {
124 |                                     if (lopr.Expression is NameReference loprnr)
125 |                                     {
126 |                                         joinTarget.Add(loprnr.SimpleName);
127 |                                     }
128 |                                 }
129 | 
130 |                                 if (joinTarget.Count() == 0)
131 |                                 {
132 |                                     joinTarget.Add("(...)");
133 |                                 }
134 |                             }
135 |                             else
136 |                             {
137 |                                 kqlExtractionResult.Operators.Add(n.GetFirstToken().Text);
138 |                             }
139 |                         }
140 |                         else if (n is UnionOperator uo)
141 |                         {
142 |                             joinKind = "union";
143 |                             joinTarget = new HashSet<string>();
144 | 
145 |                             foreach(var t in uo.Expressions)
146 |                             {
147 |                                 if (t.Element is NameReference uonr)
148 |                                 {
149 |                                     joinTarget.Add(uonr.SimpleName);
150 |                                 }
151 |                             }
152 |                         }
153 | 
154 |                         if ((joinKind != null) && (joinTarget != null))
155 |                         {
156 |                             if (!kqlExtractionResult.Joins.ContainsKey(joinKind))
157 |                             {
158 |                                 kqlExtractionResult.Joins[joinKind] = new HashSet<string>();
159 |                             }
160 |                             kqlExtractionResult.Joins[joinKind].AddRange(joinTarget);
161 |                         }
162 |                     });
163 |             }
164 |             catch (Exception ex)
165 |             {
166 |                 Console.WriteLine("[!] Error: Exception '{0}'", ex.Message);
167 |                 return 2;
168 |             }
169 | 
170 |             return 0;
171 |         }
172 |     }
173 | }
174 | 


--------------------------------------------------------------------------------
/kqlextraction/KqlExtraction/KqlExtraction.csproj:
--------------------------------------------------------------------------------
 1 | <Project Sdk="Microsoft.NET.Sdk">
 2 | 
 3 |   <PropertyGroup>
 4 |     <OutputType>Exe</OutputType>
 5 |     <TargetFramework>net6.0</TargetFramework>
 6 |     <ImplicitUsings>enable</ImplicitUsings>
 7 |     <Nullable>enable</Nullable>
 8 |   </PropertyGroup>
 9 | 
10 |   <ItemGroup>
11 |     <PackageReference Include="Microsoft.Azure.Kusto.Data" Version="11.0.0" />
12 |     <PackageReference Include="Microsoft.Azure.Kusto.Language" Version="11.0.0" />
13 |   </ItemGroup>
14 | 
15 | </Project>
16 | 


--------------------------------------------------------------------------------
/kqlextraction/KqlExtraction/KqlExtraction.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio Version 17
 4 | VisualStudioVersion = 17.3.32901.215
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "KqlExtraction", "KqlExtraction.csproj", "{46EB7D0B-BD7E-42D3-B638-5A40B287DF26}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Any CPU = Debug|Any CPU
11 | 		Release|Any CPU = Release|Any CPU
12 | 	EndGlobalSection
13 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | 		{46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | 		{46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | 		{46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | 		{46EB7D0B-BD7E-42D3-B638-5A40B287DF26}.Release|Any CPU.Build.0 = Release|Any CPU
18 | 	EndGlobalSection
19 | 	GlobalSection(SolutionProperties) = preSolution
20 | 		HideSolutionNode = FALSE
21 | 	EndGlobalSection
22 | 	GlobalSection(ExtensibilityGlobals) = postSolution
23 | 		SolutionGuid = {CA95F2D7-E70E-4BD8-8E41-743E27B063FC}
24 | 	EndGlobalSection
25 | EndGlobal
26 | 


--------------------------------------------------------------------------------
/kqlextraction/Readme.txt:
--------------------------------------------------------------------------------
1 | Requires .NET 6.0
2 | 
3 | > cd .\KqlExtraction\
4 | > dotnet restore
5 | > dotnet build -c Release
6 | > .\KqlExtraction\bin\Release\net6.0\KqlExtraction.exe tests\test1.kql
7 | 
8 | {"FunctionCalls":["count","tostring","make_list","toreal"],"Joins":["rightsemi","leftouter"],"Operators":["where","extend","summarize","mv-expand","project-away","project"],"Tables":["SigninLogs"]}


--------------------------------------------------------------------------------
/kqlextraction/extract.py:
--------------------------------------------------------------------------------
 1 | from base64 import b64encode
 2 | import json
 3 | import os
 4 | import queue
 5 | import subprocess
 6 | import threading
 7 | import time
 8 | from uuid import uuid4
 9 | 
10 | 
11 | worker_exit = threading.Event()
12 | worker_queue = queue.Queue()
13 | worker_results = queue.Queue()
14 | worker_thread = None
15 | 
16 | 
17 | def _worker_thread_proc():
18 |     try:
19 |         kql_extraction = None
20 | 
21 |         while not worker_exit.is_set():
22 |             try:
23 |                 if kql_extraction is not None:
24 |                     if kql_extraction.poll() is not None:
25 |                         kql_extraction = None
26 |                 if kql_extraction is None:
27 |                     kql_extraction = subprocess.Popen([
28 |                         'dotnet',
29 |                         'run',
30 |                         '-c',
31 |                         'Release',
32 |                         '--project',
33 |                         os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'KqlExtraction', 'KqlExtraction.csproj')
34 |                     ], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
35 |             except Exception as ex:
36 |                 print('[!] Exception Starting KqlExtraction Process')
37 |                 break
38 | 
39 |             try:
40 |                 uuid, kql = worker_queue.get(timeout=2.0)
41 |                 kql_extraction.stdin.write(bytes(f'{uuid},', encoding='utf-8') + b64encode(bytes(kql, encoding='utf-8')) + b'\n')
42 |                 kql_extraction.stdin.flush()
43 | 
44 |                 kql_extraction_result = kql_extraction.stdout.readline()
45 |                 worker_results.put(json.loads(kql_extraction_result))
46 |             except queue.Empty:
47 |                 pass
48 |             except Exception as ex:
49 |                 kql_extraction.kill()
50 | 
51 |         if kql_extraction.poll() is None:
52 |             kql_extraction.kill()
53 |     except Exception as ex:
54 |         print('[!] Unhandled Exception', str(ex))
55 | 
56 | 
57 | def extract_kql(kql):
58 |     kql_id = str(uuid4())
59 |     worker_queue.put((kql_id, kql))
60 | 
61 |     try:
62 |         kql_result = {}
63 |         while True:
64 |             kql_result = worker_results.get(timeout=5.0)
65 |             if 'Id' in kql_result and kql_result['Id'] == kql_id:
66 |                 break
67 |     except Exception:
68 |         pass
69 | 
70 |     return kql_result
71 | 
72 | 
73 | if __name__ == '__main__':
74 |     worker_thread = threading.Thread(target=_worker_thread_proc)
75 |     worker_thread.start()
76 | 
77 |     try:
78 |         base_path = os.path.abspath(os.path.split(__file__)[0])
79 |         for kql_file in os.listdir(os.path.join(base_path, 'tests')):
80 |             kql_file = os.path.join(base_path, 'tests', kql_file)
81 | 
82 |             with open(kql_file, 'r') as f:
83 |                 kql = f.read()
84 | 
85 |             print(extract_kql(kql))
86 |     except Exception as ex:
87 |         print('[!] Unhandled Exception', str(ex))
88 | 
89 |     while not worker_queue.empty():
90 |         time.sleep(0.5)
91 | 
92 |     worker_exit.set()
93 |     worker_thread.join()
94 | 


--------------------------------------------------------------------------------
/kqlextraction/tests/test1.kql:
--------------------------------------------------------------------------------
 1 | // https://github.com/Azure/Azure-Sentinel/blob/master/Hunting%20Queries/SigninLogs/UserLoginIPAddressTeleportation.yaml
 2 | 
 3 | let windowTime = 20min / 2; //Window to lookup anomalous logins within
 4 | let excludeKnownVPN = dynamic(['127.0.0.1', '0.0.0.0']); //Known VPN IP addresses to exclude
 5 | SigninLogs
 6 | | where ConditionalAccessStatus =~ "success"
 7 | | extend country = LocationDetails['countryOrRegion']
 8 | | where country != ""
 9 | | summarize count() by tostring(country)
10 | | join (
11 |     //Get the total number of logins from any country and join it to the previous count in a single table
12 |     SigninLogs
13 |     | where ConditionalAccessStatus =~ "success"
14 |     | extend country = LocationDetails['countryOrRegion']
15 |     | where country != ""
16 |     | summarize count(), make_list(tostring(country))
17 |     | mv-expand list_country
18 |     | extend country = tostring(list_country)
19 | ) on country
20 | | summarize by country, count_, count_1
21 | //Now calculate each countries prevalence within login events
22 | | extend prevalence = toreal(count_) / toreal(count_1) * 100
23 | | project-away count_1
24 | | where prevalence < 0.01
25 | | join kind=rightsemi(
26 |     SigninLogs
27 |     //Enable to limit to o365 exchange logins
28 |     //| where AppDisplayName =~ "Office 365 Exchange Online"
29 |     | where ConditionalAccessStatus =~ "success"
30 |     | where IPAddress != ""
31 |     | extend country = tostring(LocationDetails['countryOrRegion'])
32 |     | summarize count() by TimeGenerated, UserPrincipalName, country, IPAddress
33 | ) on country
34 | | join kind=leftouter (
35 |     SigninLogs
36 |     //Enable to limit to o365 exchange logins
37 |     //| where AppDisplayName =~ "Office 365 Exchange Online"
38 |     | where ConditionalAccessStatus =~ "success"
39 |     | extend country = tostring(LocationDetails['countryOrRegion'])
40 |     | summarize by TimeGenerated, IPAddress, UserPrincipalName, country
41 | ) on UserPrincipalName
42 | | where IPAddress != IPAddress1 and country != country1
43 | | extend WindowStart = TimeGenerated1 - windowTime
44 | | extend WindowEnd = TimeGenerated1 + windowTime
45 | | where TimeGenerated between (WindowStart .. WindowEnd)
46 | | project Account=UserPrincipalName, AnomalousIP=IPAddress, AnomalousLoginTime=TimeGenerated, AnomalousCountry=country, OtherLoginIP=IPAddress1, OtherLoginCountry=country1, OtherLoginWindowStart=WindowStart, OtherLoginWindowEnd=WindowEnd
47 | | where AnomalousIP !in(excludeKnownVPN) and OtherLoginIP !in(excludeKnownVPN)
48 | | extend timestamp = AnomalousLoginTime, AccountCustomEntity = Account, IPCustomEntity = AnomalousIP


--------------------------------------------------------------------------------
/kqlextraction/tests/test2.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | where A == 1
3 | | summarize count() by B


--------------------------------------------------------------------------------
/kqlextraction/tests/test3.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | mv-expand Z
3 | | join kind=leftsemi hint.remote=true Bar on T
4 | | join kind=leftsemi (
5 |     Baz
6 |     | where X > 5
7 |     | project R
8 |   ) on R


--------------------------------------------------------------------------------
/kqlextraction/tests/test4.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | lookup (Bar) on T


--------------------------------------------------------------------------------
/kqlextraction/tests/test5.kql:
--------------------------------------------------------------------------------
1 | union Foo, Bar, Baz


--------------------------------------------------------------------------------
/pages/2_🔎KQL_interactive_search.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | import pandas as pd
  3 | import sys
  4 | 
  5 | from pathlib import Path
  6 | from st_aggrid import AgGrid
  7 | from st_aggrid import AgGrid, GridOptionsBuilder
  8 | from st_aggrid.shared import GridUpdateMode
  9 | 
 10 | if ".." not in sys.path:
 11 |     sys.path.append("..")
 12 | 
 13 | from src.data_store import DataStore
 14 | 
 15 | _TEST_JSON = "test_runs/kql_query_db-2022-09-24-02-51-49.json"
 16 | ds = DataStore(json_path=_TEST_JSON)
 17 | 
 18 | 
 19 | @st.cache(suppress_st_warning=True)
 20 | def load_data(nrows):
 21 |     data = ds.to_df().head(nrows)
 22 |     return data
 23 | 
 24 | 
 25 | @st.cache
 26 | def convert_df(df, file_type):
 27 |     # IMPORTANT: Cache the conversion to prevent computation on every rerun
 28 |     if file_type == "csv":
 29 |         data = df.to_csv().encode("utf-8")
 30 |     if file_type == "json":
 31 |         data = df.to_json().encode("utf-8")
 32 | 
 33 |     return data
 34 | 
 35 | 
 36 | def aggrid_interactive_table(df: pd.DataFrame):
 37 |     """Source : https://github.com/streamlit/example-app-interactive-table
 38 |     Creates an st-aggrid interactive table based on a dataframe.
 39 |     Args:
 40 |         df (pd.DataFrame]): Source dataframe
 41 |     Returns:
 42 |         dict: The selected row
 43 |     """
 44 |     options = GridOptionsBuilder.from_dataframe(
 45 |         df, enableRowGroup=True, enableValue=True, enablePivot=True
 46 |     )
 47 | 
 48 |     options.configure_side_bar()
 49 | 
 50 |     options.configure_selection("single")
 51 |     selection = AgGrid(
 52 |         df,
 53 |         enable_enterprise_modules=True,
 54 |         gridOptions=options.build(),
 55 |         theme="balham",
 56 |         update_mode=GridUpdateMode.MODEL_CHANGED,
 57 |         allow_unsafe_jscode=True,
 58 |     )
 59 | 
 60 |     return selection
 61 | 
 62 | 
 63 | def main() -> None:
 64 |     st.title(":mag_right: Interactive KQL Query Store")
 65 | 
 66 |     data_load_state = st.text("Loading data...")
 67 |     data = load_data(5000)
 68 |     data_disp = load_data(50)
 69 |     data_load_state.text("Data Loaded and cached !!")
 70 |     json_export = convert_df(data, "json")
 71 | 
 72 |     with st.expander("Raw Dataframe"):
 73 |         if st.checkbox("Show raw data"):
 74 |             st.subheader("Raw data")
 75 |             st.write("Go ahead, click on a row in the table below!")
 76 | 
 77 |             selection = aggrid_interactive_table(df=data_disp)
 78 | 
 79 |             if selection:
 80 |                 st.write("You selected:")
 81 |                 st.json(selection["selected_rows"])
 82 | 
 83 |             st.download_button(
 84 |                 label="Download data as JSON",
 85 |                 data=json_export,
 86 |                 file_name="kql_query_store-export.json",
 87 |                 mime="json",
 88 |             )
 89 | 
 90 |     st.sidebar.subheader("Filter by Table Names")
 91 |     tables = ds.get_filter_lists()["tables"]
 92 |     table_selections = st.sidebar.multiselect(
 93 |         "Select Tables to View", options=tables, default="CommonSecurityLog"
 94 |     )
 95 | 
 96 |     st.sidebar.subheader("Filter by KQL Operators")
 97 | 
 98 |     operators = ds.get_filter_lists()["operators"]
 99 |     operator_selections = st.sidebar.multiselect(
100 |         "Select KQL operators to filter by", options=operators, default="mv-expand"
101 |     )
102 | 
103 |     st.sidebar.subheader("Filter by KQL Function Calls")
104 | 
105 |     func_calls = ds.get_filter_lists()["functioncalls"]
106 |     func_calls_selections = st.sidebar.multiselect(
107 |         "Select KQL function calls to filter by",
108 |         options=func_calls,
109 |         default="series_decompose_anomalies",
110 |     )
111 | 
112 |     result = ds.find_queries(
113 |         # query_name={"contains": "time series"},
114 |         tables=table_selections,  # the list values are OR'd - so will return UNION
115 |         operators=operator_selections,  # the list values are OR'd - so will return UNION
116 |         functioncalls=func_calls_selections,
117 |     )
118 | 
119 |     st.subheader("Filtered Results matching criteria")
120 |     selection = aggrid_interactive_table(df=result)
121 | 
122 |     if selection:
123 |         st.write("You selected:")
124 |         st.json(selection["selected_rows"])
125 | 
126 | 
127 | if __name__ == "__main__":
128 |     st.set_page_config(
129 |         "Interactive KQL Query Store by MSTIC",
130 |         "🔎",
131 |         initial_sidebar_state="expanded",
132 |         layout="wide",
133 |     )
134 |     main()
135 | 


--------------------------------------------------------------------------------
/pages/3_🛡️Schema_Browser.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | from typing import Dict
  3 | import pandas as pd
  4 | import requests
  5 | 
  6 | import bs4
  7 | from tqdm.auto import tqdm
  8 | 
  9 | SCHEMA_CATS_URL = (
 10 |     "https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category"
 11 | )
 12 | 
 13 | 
 14 | def fetch_az_mon_categories() -> requests.models.Response:
 15 |     """Return the AzMonitor reference page."""
 16 |     return requests.get(SCHEMA_CATS_URL)
 17 | 
 18 | 
 19 | def get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag:
 20 |     """Extract the list after the security header."""
 21 |     soup = bs4.BeautifulSoup(resp.text, "html.parser")
 22 | 
 23 |     result = soup.find("div", class_="content")
 24 |     sec_header = result.find("h2", id="security")
 25 |     return sec_header.find_next_sibling()
 26 | 
 27 | 
 28 | def build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]:
 29 |     """From the html list, build an index of URLs."""
 30 |     table_prefix = (
 31 |         "https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}"
 32 |     )
 33 |     return {
 34 |         item.a.contents[0]: {
 35 |             "href": item.a.attrs.get("href"),
 36 |             "url": table_prefix.format(**(item.a.attrs)),
 37 |         }
 38 |         for item in security_cat_list.find_all("li")
 39 |     }
 40 | 
 41 | 
 42 | def read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame:
 43 |     """Read table schema from a URL."""
 44 |     table_data = pd.read_html(ref["url"])[0]
 45 |     table_data["Table"] = table
 46 |     table_data["Url"] = ref["url"]
 47 |     print(table, table_data.columns)
 48 |     return table_data
 49 | 
 50 | 
 51 | def fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame:
 52 |     """Combine schema tables into single DF."""
 53 |     print(f"Reading schemas for {len(sec_url_dict)} tables...")
 54 |     all_tables = [
 55 |         read_table_from_url(table, ref) for table, ref in tqdm(sec_url_dict.items())
 56 |     ]
 57 |     return pd.concat(all_tables, ignore_index=True)
 58 | 
 59 | 
 60 | def main() -> None:
 61 |     st.title(":shield: Schema Browser")
 62 |     sec_cat_list = get_security_category_list(fetch_az_mon_categories())
 63 |     sec_url_dict = build_table_index(sec_cat_list)
 64 |     sec_url_dict = {
 65 |         key: val for key, val in sec_url_dict.items() if key.startswith("S")
 66 |     }
 67 |     comb_tables = fetch_table_schemas(sec_url_dict)
 68 | 
 69 |     # st.sidebar.subheader("Filter by Table Names")
 70 |     # tables = tuple(comb_tables["Table"].unique())
 71 |     # st.write("Tables:", tables)
 72 | 
 73 |     # TODO : Recursion error - need to troubleshoot - hardcoded table names
 74 |     table_selection = st.selectbox(
 75 |         "Select a Table name to view schema ?",
 76 |         (
 77 |             "SecurityAlert",
 78 |             "SecurityBaseline",
 79 |             "SecurityBaselineSummary",
 80 |             "SecurityDetection",
 81 |             "SecurityEvent",
 82 |             "SecurityIoTRawEvent",
 83 |             "SecurityRecommendation",
 84 |             "SentinelAudit",
 85 |             "SentinelHealth",
 86 |             "SigninLogs",
 87 |             "Syslog",
 88 |         ),
 89 |     )
 90 | 
 91 |     df_schema = comb_tables[comb_tables["Table"] == table_selection]
 92 | 
 93 |     st.subheader("Schema for the filtered table name")
 94 |     st.write(df_schema)
 95 | 
 96 | 
 97 | if __name__ == "__main__":
 98 |     st.set_page_config(
 99 |         "Schema Browser",
100 |         "🛡️",
101 |         initial_sidebar_state="expanded",
102 |         layout="wide",
103 |     )
104 |     main()
105 | 


--------------------------------------------------------------------------------
/pages/4_ 📊KQL_Store_Insights.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import pandas as pd
 3 | import sys
 4 | 
 5 | import altair as alt
 6 | 
 7 | if ".." not in sys.path:
 8 |     sys.path.append("..")
 9 | 
10 | from src.data_store import DataStore
11 | 
12 | _TEST_JSON = "test_runs/kql_query_db-2022-09-24-02-51-49.json"
13 | ds = DataStore(json_path=_TEST_JSON)
14 | 
15 | 
16 | @st.cache(suppress_st_warning=True)
17 | def load_data(nrows):
18 |     data = ds.to_df()
19 |     data = data.head(nrows)
20 |     return data
21 | 
22 | 
23 | def main() -> None:
24 |     st.title(":bar_chart: KQL Store Insights")
25 | 
26 |     data = load_data(5000)
27 | 
28 |     st.subheader("KQL Query Store Summary")
29 |     st.metric("Total No of Queries", f"{len(data)}")
30 | 
31 |     data_sentinel = data[data["repo_name"] == "Azure/Azure-Sentinel"]
32 |     st.metric("Total No of Queries in Azure Sentinel Github", f"{len(data_sentinel)}")
33 | 
34 |     st.subheader("Source Type Ditribution")
35 | 
36 |     df_source_type = (
37 |         data.groupby("source_type")["query"]
38 |         .count()
39 |         .sort_values(ascending=False)
40 |         .reset_index()
41 |     )
42 | 
43 |     chart = (
44 |         alt.Chart(df_source_type)
45 |         .mark_bar()
46 |         .encode(x="source_type", y="query")
47 |         .properties(height=400)
48 |     )
49 | 
50 |     st.altair_chart(chart, use_container_width=True)
51 | 
52 |     st.subheader(f"Top 5 Community Repos")
53 |     repo_count = (
54 |         data.groupby("repo_name")["query"]
55 |         .count()
56 |         .sort_values(ascending=False)
57 |         .reset_index()
58 |     )
59 |     repo_top = repo_count[repo_count["repo_name"] != "Azure/Azure-Sentinel"].head(5)
60 |     st.write(repo_top)
61 | 
62 | 
63 | if __name__ == "__main__":
64 |     st.set_page_config(
65 |         "KQL Store Insights",
66 |         "🛡️",
67 |         initial_sidebar_state="expanded",
68 |         layout="wide",
69 |     )
70 |     main()
71 | 


--------------------------------------------------------------------------------
/pages/5_💬Contact_Us.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | 
 3 | 
 4 | def main() -> None:
 5 |     st.subheader("Reach out to Project team via Github")
 6 |     st.subheader("Github: https://github.com/microsoft/kql-query-store")
 7 | 
 8 |     st.write(
 9 |         "If you would like to add new Github repositories as source, open a issue on Github"
10 |     )
11 | 
12 | 
13 | if __name__ == "__main__":
14 |     st.set_page_config("Contact Us !!", "💬")
15 |     main()
16 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | streamlit-aggrid
3 | pandas
4 | requests
5 | altair
6 | beautifulsoup4
7 | tqdm
8 | lxml
9 | html5lib


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/src/__init__.py


--------------------------------------------------------------------------------
/src/az_mon_schema.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """Azure Monitor Schema creation."""
  7 | __author__ = "Ian Hellen"
  8 | import json
  9 | from pathlib import Path
 10 | from typing import Any, Dict, Optional, Union
 11 | 
 12 | import bs4
 13 | import pandas as pd
 14 | import requests
 15 | from tqdm.auto import tqdm
 16 | 
 17 | SCHEMA_CATS_URL = (
 18 |     "https://learn.microsoft.com/azure/azure-monitor/reference/tables/tables-category"
 19 | )
 20 | 
 21 | 
 22 | class AzMonitorSchemas:
 23 |     """Class to download and store Azure Monitor table schemas."""
 24 | 
 25 |     def __init__(
 26 |         self, json_path: Union[None, str, Path] = None, json_text: Optional[str] = None
 27 |     ):
 28 |         """Initialize the schema class."""
 29 |         self.schemas: Optional[pd.DataFrame] = None
 30 |         if json_path or json_text:
 31 |             self.schemas = self._df_from_json(json_path=json_path, json_text=json_text)
 32 | 
 33 |     def get_az_mon_schemas(self):
 34 |         """Retrieve Azure monitor schemas"""
 35 |         sec_cat_list = _get_security_category_list(_fetch_az_mon_categories())
 36 |         sec_url_dict = _build_table_index(sec_cat_list)
 37 |         self.schemas = _fetch_table_schemas(sec_url_dict).reindex(
 38 |             columns=["Table", "Column", "Type", "Description", "Url"]
 39 |         )
 40 | 
 41 |     @property
 42 |     def schema_dict(self) -> Dict[str, Dict[str, Any]]:
 43 |         """Return the schema as a dictionary."""
 44 |         if self.schemas is None:
 45 |             return {}
 46 |         table_dict = {}
 47 |         for table, df in self.schemas.groupby("Table"):
 48 |             url = df.iloc[0]["Url"]
 49 |             table_dict[table.casefold()] = {
 50 |                 "url": url,
 51 |                 "table": table,
 52 |                 "schema": df.drop(columns=["Table", "Url"]).to_dict(orient="records"),
 53 |             }
 54 |         return table_dict
 55 | 
 56 |     def to_json(self):
 57 |         """Return schemas as JSON string."""
 58 |         return json.dumps(self.schema_dict)
 59 | 
 60 |     @staticmethod
 61 |     def _df_from_json(
 62 |         json_path: Union[None, str, Path] = None, json_text: Optional[str] = None
 63 |     ) -> pd.DataFrame:
 64 |         """Create DataFrame from JSON representation."""
 65 |         if json_path:
 66 |             json_text = Path(json_path).read_text(encoding="utf-8")
 67 |         schema_dict = json.loads(json_text)
 68 |         rows = []
 69 |         for item in schema_dict.values():
 70 |             rows.extend(
 71 |                 {
 72 |                     "Table": item["table"],
 73 |                     "Column": schema.get("Column"),
 74 |                     "Type": schema.get("Type"),
 75 |                     "Description": schema.get("Description"),
 76 |                     "Url": item["url"],
 77 |                 }
 78 |                 for schema in item.get("schema", [])
 79 |             )
 80 |         return pd.DataFrame(rows).sort_values(["Table", "Column"])
 81 | 
 82 |     def find_tables(self, tables: Union[str, list]) -> pd.DataFrame:
 83 |         """
 84 |         Return schema entries matching `tables`.
 85 | 
 86 |         Parameters
 87 |         ----------
 88 |         tables : Union[str, list]
 89 |             A table name/regex pattern or a list
 90 |             of table names to match.
 91 | 
 92 |         Returns
 93 |         -------
 94 |         pd.DataFrame
 95 |             DataFrame of matching schema entries.
 96 | 
 97 |         """
 98 |         if isinstance(tables, list):
 99 |             tables = [table.casefold() for table in tables]
100 |             return self.schemas[self.schemas["Table"].str.casefold().isin(tables)]
101 |         return self.schemas[self.schemas["Table"].str.match(tables, case=False)]
102 | 
103 |     def find_columns(self, columns: Union[str, list]) -> pd.DataFrame:
104 |         """
105 |         Return schema entries matching `columns`.
106 | 
107 |         Parameters
108 |         ----------
109 |         columns : Union[str, list]
110 |             A column name/regex pattern or a list
111 |             of column names to match.
112 | 
113 |         Returns
114 |         -------
115 |         pd.DataFrame
116 |             DataFrame of matching schema entries.
117 | 
118 |         """
119 |         if isinstance(columns, list):
120 |             columns = [column.casefold() for column in columns]
121 |             return self.schemas[self.schemas["Column"].str.casefold().isin(columns)]
122 |         return self.schemas[self.schemas["Column"].str.match(columns, case=False)]
123 | 
124 | 
125 | def _fetch_az_mon_categories() -> requests.models.Response:
126 |     """Return the AzMonitor reference page."""
127 |     return requests.get(SCHEMA_CATS_URL)
128 | 
129 | 
130 | def _get_security_category_list(resp: requests.models.Response) -> bs4.element.Tag:
131 |     """Extract the list after the security header."""
132 |     soup = bs4.BeautifulSoup(resp.text, "html.parser")
133 | 
134 |     result = soup.find("div", class_="content")
135 |     sec_header = result.find("h2", id="security")
136 |     return sec_header.find_next_sibling()
137 | 
138 | 
139 | def _build_table_index(security_cat_list: bs4.element.Tag) -> Dict[str, Dict[str, str]]:
140 |     """From the html list, build an index of URLs."""
141 |     table_prefix = (
142 |         "https://learn.microsoft.com/azure/azure-monitor/reference/tables/{href}"
143 |     )
144 |     return {
145 |         item.a.contents[0]: {
146 |             "href": item.a.attrs.get("href"),
147 |             "url": table_prefix.format(**(item.a.attrs)),
148 |         }
149 |         for item in security_cat_list.find_all("li")
150 |     }
151 | 
152 | 
153 | def _read_table_from_url(table: str, ref: Dict[str, str]) -> pd.DataFrame:
154 |     """Read table schema from a URL."""
155 |     table_data = pd.read_html(ref["url"])[0]
156 |     table_data["Table"] = table
157 |     table_data["Url"] = ref["url"]
158 |     return table_data
159 | 
160 | 
161 | def _fetch_table_schemas(sec_url_dict: Dict[str, Dict[str, str]]) -> pd.DataFrame:
162 |     """Combine schema tables into single DF."""
163 |     print(f"Reading Azure monitor schemas for {len(sec_url_dict)} tables...")
164 |     all_tables = [
165 |         _read_table_from_url(table, ref)
166 |         for table, ref in tqdm(sec_url_dict.items(), unit="schemas")
167 |     ]
168 |     return pd.concat(all_tables, ignore_index=True)
169 | 


--------------------------------------------------------------------------------
/src/conf.txt:
--------------------------------------------------------------------------------
1 | source_1 
2 | source_2 
3 | 


--------------------------------------------------------------------------------
/src/create_kql_db.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """Main script to fetch KQL queries and create JSON Database."""
  7 | 
  8 | import argparse
  9 | import logging
 10 | import sys
 11 | from datetime import datetime, timezone
 12 | from pathlib import Path
 13 | from typing import Any, Dict
 14 | 
 15 | sys.path.append(str(Path(__file__).parent))
 16 | 
 17 | from tqdm.auto import tqdm
 18 | 
 19 | from . import kql_extract as extract
 20 | from .az_mon_schema import AzMonitorSchemas
 21 | from .data_store import DataStore
 22 | 
 23 | # from .kql_query import KqlQuery
 24 | from .kql_download import get_community_queries, get_sentinel_queries
 25 | 
 26 | # ######### MOCK Stuff for stubbing code
 27 | # from unittest.mock import MagicMock
 28 | # # from .kql_ingest import fetch_queries
 29 | # fetch_queries = MagicMock()
 30 | # _MOCK_QUERY = "SecurityAlert | take 1"
 31 | # _MOCK_RESULTS = [KqlQuery(source_path=f"/x/y/{src}.kql", query=_MOCK_QUERY) for src in range(3)]
 32 | # fetch_queries.return_value = _MOCK_RESULTS
 33 | # # # from .kql_db_store import DataStore
 34 | # DataStore = MagicMock()
 35 | # store_instance = MagicMock()
 36 | # DataStore.return_value = store_instance
 37 | # store_instance.queries = _MOCK_RESULTS
 38 | 
 39 | # _MOCK_KQL_PARSE = {"FunctionCalls":["count","tostring","make_list","toreal"],"Joins":["rightsemi","leftouter"],"Operators":["where","extend","summarize","mv-expand","project-away","project"],"Tables":["SigninLogs"]}
 40 | # parse_kql = MagicMock()
 41 | # parse_kql.return_value = _MOCK_KQL_PARSE
 42 | ########## End Mocks
 43 | 
 44 | 
 45 | __author__ = "Ian Hellen"
 46 | 
 47 | _OUTPUT_FILE = "kql_query_db"
 48 | 
 49 | 
 50 | def _add_script_args():
 51 |     parser = argparse.ArgumentParser(description="Kql Query download and build script.")
 52 |     parser.add_argument(
 53 |         "--conf", "-c", required=True, help="Path to query source config file."
 54 |     )
 55 |     parser.add_argument(
 56 |         "--out",
 57 |         "-o",
 58 |         default="output",
 59 |         help="Path to output folder.",
 60 |     )
 61 |     parser.add_argument(
 62 |         "--df",
 63 |         "-d",
 64 |         action="store_true",
 65 |         default=False,
 66 |         help="Write a pickled dataframe.",
 67 |     )
 68 |     parser.add_argument(
 69 |         "--quiet",
 70 |         "-q",
 71 |         action="store_true",
 72 |         default=False,
 73 |         help="Show less output of the execution.",
 74 |     )
 75 |     parser.add_argument(
 76 |         "--verbose",
 77 |         "-v",
 78 |         action="store_true",
 79 |         default=False,
 80 |         help="Show debug logging of execution.",
 81 |     )
 82 |     parser.add_argument(
 83 |         "--timestamp",
 84 |         "-t",
 85 |         action="store_true",
 86 |         default=False,
 87 |         help="Add UTC timestamps to output file.",
 88 |     )
 89 |     parser.add_argument(
 90 |         "--save-stages",
 91 |         "-s",
 92 |         action="store_true",
 93 |         default=False,
 94 |         help="Save outputs after initial query load/parsing.",
 95 |     )
 96 |     parser.add_argument(
 97 |         "--az-schemas",
 98 |         "-a",
 99 |         action="store_true",
100 |         default=False,
101 |         help="Download and store Azure monitor schema.",
102 |     )
103 |     return parser
104 | 
105 | 
106 | def main(args):
107 |     """Main entrypoint for fetching queries and writing to store."""
108 |     results = []
109 |     if not Path(args.out).is_dir():
110 |         if Path(args.out).exists():
111 |             logging.error("Cannot find or create output folder %s", args.out)
112 |             return
113 |         Path.mkdir(args.out, parents=True, exist_ok=True)
114 | 
115 |     # fetch and parse queries
116 |     logging.info("Fetching queries")
117 |     try:
118 |         results.extend(get_sentinel_queries())
119 |     except Exception as err:  # pylint: disable=broad-except
120 |         logging.exception(
121 |             "Failed to fetch Sentinel queries.",
122 |             exc_info=err,
123 |         )
124 |     try:
125 |         results.extend(get_community_queries(config=args.conf))
126 |     except Exception as err:  # pylint: disable=broad-except
127 |         logging.exception(
128 |             "Failed to fetch community queries.",
129 |             exc_info=err,
130 |         )
131 | 
132 |     # add queries to store
133 |     logging.info("Adding %d queries to store.", len(results))
134 | 
135 |     try:
136 |         store = DataStore(results)
137 |     except Exception as err:  # pylint: disable=broad-except
138 |         logging.exception(
139 |             "Failed to queries to store.",
140 |             exc_info=err,
141 |         )
142 | 
143 |     if args.save_stages:
144 |         store.to_json(_get_output_file(args, file_type="p1.json"))
145 | 
146 |     # parse Kql for query properties
147 |     logging.info("Getting KQL properties for %d kql queries.", len(results))
148 |     try:
149 |         extract.start()
150 |         for query in tqdm(store.queries):
151 |             try:
152 |                 kql_properties = extract.extract_kql(
153 |                     kql_query=query.query, query_id=query.query_id
154 |                 )
155 |             except Exception as err:  # pylint: disable=broad-except
156 |                 logging.exception(
157 |                     "Failed to parse query '%s'.\n %s",
158 |                     query.query_id,
159 |                     query.source_path,
160 |                     exc_info=err,
161 |                 )
162 |                 continue
163 |             try:
164 |                 if not kql_properties.get("Valid_Query", True):
165 |                     logging.error(
166 |                         "Invalid KQL for query %s (%s)",
167 |                         query.query_id,
168 |                         query.source_path,
169 |                     )
170 |                 store.add_kql_properties(
171 |                     query_id=query.query_id, kql_properties=kql_properties
172 |                 )
173 |             except Exception as err:  # pylint: disable=broad-except
174 |                 logging.exception(
175 |                     "Failed to update kql properties for query '%s'.",
176 |                     query.query_id,
177 |                     exc_info=err,
178 |                 )
179 |     finally:
180 |         extract.stop()
181 |     logging.info("Finished getting KQL properties for %d kql queries.", len(results))
182 | 
183 |     # write output
184 |     out_json_path = _get_output_file(args, "json")
185 |     store.to_json(out_json_path)
186 |     logging.info("Writing JSON output to %s", out_json_path)
187 |     if args.df:
188 |         query_df = store.to_df()
189 |         out_df_path = _get_output_file(args, "pkl")
190 |         query_df.to_pickle(out_df_path)
191 |         logging.info("Writing Pickled dataframe output to %s", out_df_path)
192 | 
193 |     # get Azure monitor table schema
194 |     # and write JSON and DF
195 |     if args.az_schemas:
196 |         logging.info("Getting Azure Monitor schema data.")
197 |         az_schemas = AzMonitorSchemas()
198 |         az_schemas.get_az_mon_schemas()
199 |         schema_json = Path(args.out).joinpath("az_mon_schemas.json")
200 |         schema_df = Path(args.out).joinpath("az_mon_schemas.pkl")
201 |         schema_json.write_text(az_schemas.to_json(), encoding="utf-8")
202 |         az_schemas.schemas.to_pickle(schema_df)
203 |         logging.info(
204 |             "Saved schema data to %s and %s.", str(schema_json), str(schema_df)
205 |         )
206 | 
207 |     logging.info("Job completed")
208 |     logging.info("============================================")
209 | 
210 | 
211 | def _get_output_file(args, file_type):
212 |     """Return formatted path for output files."""
213 |     if args.timestamp:
214 |         time_stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S")
215 |         return Path(args.out).joinpath(f"{_OUTPUT_FILE}-{time_stamp}.{file_type}")
216 |     return Path(args.out).joinpath(f"{_OUTPUT_FILE}.{file_type}")
217 | 
218 | 
219 | def _configure_logging(args):
220 |     logging_args: Dict[str, Any] = {
221 |         "format": "%(asctime)s: %(funcName)s #%(lineno)d %(filename)s %(message)s"
222 |     }
223 |     if args.quiet:
224 |         logging_args["level"] = logging.WARNING
225 |     elif args.verbose:
226 |         logging_args["level"] = logging.DEBUG
227 |     else:
228 |         logging_args["level"] = logging.INFO
229 |     logging.basicConfig(**logging_args)
230 | 
231 | 
232 | # pylint: disable=invalid-name
233 | if __name__ == "__main__":
234 | 
235 |     arg_parser = _add_script_args()
236 |     args = arg_parser.parse_args()
237 | 
238 |     _configure_logging(args)
239 |     main(args)
240 | 


--------------------------------------------------------------------------------
/src/data_store.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """DataStore class."""
  7 | import json
  8 | from pathlib import Path
  9 | from typing import Any, Dict, List, Optional, Set, Union
 10 | 
 11 | import numpy as np
 12 | import pandas as pd
 13 | 
 14 | from .kql_query import KqlQuery
 15 | 
 16 | __author__ = "Ian Hellen"
 17 | 
 18 | 
 19 | # interface
 20 | # get_query_ids()
 21 | # returns a DF of source_path, query_id, query_hash - the idea here is that you
 22 | # (or someone) can check for existing queries based on path. I guess I could also
 23 | # do that in the store - i.e. don't add a new one if the hash is the same,
 24 | # just overwrite with the new details. Hmm. Maybe you don't need to create a query_id.
 25 | # I could just do this checking in the data layer comparing source_path and
 26 | # source_index with existing values. LMK what you think.
 27 | #
 28 | # add_queries(queries: List[Dict[as described above]])
 29 | # add_kql_properties(query_id, properties: Dict[Liam's dict])
 30 | # get_filter_lists() - will return a dictionary of lists of unique values of various properties for the UI filtering
 31 | #     I could also return lists of unique query names and paths
 32 | # find_queries(**kwargs) - this is going to be an interesting one given that we have a flexible set of properties to search on.
 33 | #     kwargs lets us specify a flexible list of conditions, examples:
 34 | #         source_path="/some/path - exact string match (prob case insensitive)
 35 | #         query_name={matches: regex} - match based on a pandas operator like regex, startswith, contains
 36 | #         table=["table1", "table2"] - intersection of queries that use both these tables
 37 | #     it will return a DF of query_id + basic properties.
 38 | # get_query(query_id) - find_queries will return a list, to get all the props for a query, you'd need to call this.
 39 | # get_schema(table)
 40 | 
 41 | QueryDict = Dict[str, Union[str, int, Dict[str, Any]]]
 42 | QueryList = List[QueryDict]
 43 | 
 44 | KqlQueryList = List[KqlQuery]
 45 | 
 46 | 
 47 | class DataStore:
 48 |     """DataStore class for KqlQuery store."""
 49 | 
 50 |     _ATTRIB_INDEXES: Dict[str, type] = {"tactics": list, "techniques": list}
 51 |     _KQL_INDEXES: Dict[str, type] = {
 52 |         "tables": list,
 53 |         "operators": list,
 54 |         "fields": list,
 55 |         "functioncalls": list,
 56 |         "joins": dict,
 57 |         "valid_query": bool,
 58 |     }
 59 |     _ALL_INDEXES: Dict[str, type] = {**_ATTRIB_INDEXES, **_KQL_INDEXES}
 60 | 
 61 |     _OPERATOR = {
 62 |         "startswith": "^{expr}.*",
 63 |         "endswith": ".*{expr}$",
 64 |         "contains": ".*{expr}.*",
 65 |         "matches": "{expr}",
 66 |     }
 67 | 
 68 |     def __init__(
 69 |         self,
 70 |         kql_queries: Union[None, KqlQueryList, QueryList] = None,
 71 |         json_path: Optional[str] = None,
 72 |     ):
 73 |         self._json_path = json_path
 74 |         if json_path:
 75 |             self._data = {
 76 |                 query.get("query_id"): KqlQuery(**query)
 77 |                 for query in self._read_json_data(json_path)
 78 |             }
 79 |         elif kql_queries:
 80 |             if isinstance(kql_queries[0], KqlQuery):
 81 |                 self._data = {query.query_id: query for query in kql_queries}
 82 |             else:
 83 |                 self._data = {
 84 |                     query["query_id"]: KqlQuery(**query) for query in kql_queries
 85 |                 }
 86 |         else:
 87 |             self._data = {}
 88 |         # self.attributes = self._extract_attributes()
 89 |         if self._data:
 90 |             self._data_df = pd.DataFrame(self.queries).set_index("query_id")
 91 |         else:
 92 |             self._data_df = pd.DataFrame(
 93 |                 self.queries, columns=KqlQuery.field_names()
 94 |             ).set_index("query_id")
 95 |         self._indexes: Dict[str, pd.DataFrame] = {}
 96 |         self._create_indexes("attributes")
 97 |         self._create_indexes("kql_properties")
 98 | 
 99 |     @property
100 |     def queries(self) -> List[KqlQuery]:
101 |         """Get the list of current queries."""
102 |         return list(self._data.values())
103 | 
104 |     @property
105 |     def queries_dict(self) -> List[KqlQuery]:
106 |         """Get the list of current queries."""
107 |         return [query.asdict() for query in self._data.values()]
108 | 
109 |     def to_json(self, file_path: Optional[str] = None) -> Optional[str]:
110 |         """Return the queries as JSON or save to `file_path`, if specified."""
111 |         if file_path is not None:
112 |             Path(file_path).write_text(self.to_json(), encoding="utf-8")
113 |         return json.dumps(self.queries_dict)
114 | 
115 |     def to_df(self) -> pd.DataFrame:
116 |         """Return queries as a pandas DataFrame."""
117 |         return pd.DataFrame(self.queries)
118 | 
119 |     def get_query_ids(self) -> pd.DataFrame:
120 |         """Return subset of query columns."""
121 |         columns = ["source_path", "query_name", "query_hash"]
122 |         if self._data_df is None:
123 |             return pd.DataFrame(columns=columns)
124 |         return self._data_df[columns]
125 | 
126 |     def add_queries(self, queries: KqlQueryList):
127 |         """Add a list of queries to the store."""
128 |         self._data.update({query.query_id: query for query in queries})
129 |         self._create_indexes("attributes")
130 |         self._create_indexes("kql_properties")
131 |         self._data_df = pd.DataFrame(self.queries).set_index("query_id")
132 | 
133 |     def add_query(self, query: KqlQuery):
134 |         """Add a single query to the store."""
135 |         self._data[query.query_id] = query
136 |         self._add_item_to_indexes(query)
137 |         self._data_df = pd.concat(
138 |             [self._data_df, pd.DataFrame(query).set_index("query_id")]
139 |         )
140 | 
141 |     def add_kql_properties(self, query_id: str, kql_properties: Dict[str, Any]):
142 |         """Add Kql properties to a query."""
143 |         kql_props = {key.casefold(): value for key, value in kql_properties.items()}
144 |         if "valid_query" not in kql_props:
145 |             kql_props["valid_query"] = True
146 |         self._data[query_id].kql_properties = kql_props
147 |         # update indexes
148 |         self._add_item_to_indexes(self._data[query_id])
149 | 
150 |     def get_filter_lists(
151 |         self, categories: Optional[List[str]] = None
152 |     ) -> Dict[str, List[str]]:
153 |         """Return unique lists of values for each category."""
154 |         return {
155 |             attrib: sorted(self._indexes[attrib].index.unique())
156 |             for attrib in {**self._ATTRIB_INDEXES, **self._KQL_INDEXES}
157 |             if attrib in self._indexes and (categories is None or attrib in categories)
158 |         }
159 | 
160 |     def find_queries(self, case: bool = False, **kwargs) -> pd.DataFrame:
161 |         """
162 |         Return matching values as a pandas DataFrame.
163 | 
164 |         Parameters
165 |         ----------
166 |         case : bool, optional
167 |             Use case-sensitive matching, by default False
168 | 
169 |         Other Parameters
170 |         ----------------
171 |         kwargs :
172 |             You can specify search criteria in the general form attrib_name=expression.
173 |             You can specify multiple criteria - all will be ANDed together.
174 |             attrib=value - exact match (case sensitive for strings)
175 |             attrib={operator: value} - match based on a string operator (matches,
176 |             contains, startswith, endswith)
177 |             attrib=["value1", "value2"] - intersection of items that have
178 |             matches for ALL items in the list.
179 | 
180 |         Returns
181 |         -------
182 |         pd.DataFrame
183 |             DataFrame of matching queries
184 | 
185 |         Examples
186 |         --------
187 |         Some examples of expressions:
188 | 
189 |         - source_path="/some/path" - exact string match (case insensitive)
190 |         - query_name={matches: "AAD.*"} - match based on an operator
191 |           like regex, startswith, contains
192 |         - table=["table1", "table2"] - the queries that use both these tables
193 | 
194 |         >>>> ds.find_queries(
195 |                  query_name={"contains": "AAD"},
196 |                  tables=["table1", "table2"],
197 |                  operations=[...]
198 |             )
199 | 
200 |         """
201 |         if self._data_df is None:
202 |             return pd.DataFrame()
203 |         # Create a base criterion where all rows == True
204 |         criteria = self._data_df.index.notna()
205 |         debug = kwargs.pop("debug", False)
206 |         valid_fields = KqlQuery.field_names() + list(self._indexes.keys())
207 | 
208 |         for arg_name, arg_expr in kwargs.items():
209 |             if arg_name not in valid_fields:
210 |                 raise ValueError(
211 |                     f"Unknown attribute name {arg_name}",
212 |                     f"Search expression: {arg_expr}.",
213 |                 )
214 |             if isinstance(arg_expr, str):
215 |                 criteria &= self._data_df[arg_name] == arg_expr
216 |             if isinstance(arg_expr, dict):
217 |                 operator, expr = next(iter(arg_expr.items()))
218 |                 crit_expr = self._OPERATOR.get(operator)
219 |                 if crit_expr:
220 |                     criteria &= self._data_df[arg_name].str.match(
221 |                         crit_expr.format(expr=expr), case=case
222 |                     )
223 |                     if debug:
224 |                         print(arg_expr, criteria.value_counts())
225 |             if isinstance(arg_expr, list) and arg_name in self._indexes:
226 |                 query_ids = self._get_matching_ids(debug, arg_name, arg_expr)
227 | 
228 |                 # Add the matched query IDs to criteria
229 |                 criteria &= self._data_df.index.isin(query_ids)
230 |                 if debug:
231 |                     print(arg_expr, criteria.value_counts())
232 |         # return the data subset
233 |         if debug:
234 |             print("final criteria:", criteria.value_counts())
235 |         return self._data_df[criteria]
236 | 
237 |     def _get_matching_ids(self, debug, arg_name, arg_expr):
238 |         query_ids: Optional[Set] = None
239 |         # we're looking for queries in the indexes that have a matching value
240 |         for match_value in arg_expr:
241 |             # matched_ids == all query_ids with this property
242 |             matched_ids = set(
243 |                 self._indexes[arg_name][self._indexes[arg_name].index == match_value][
244 |                     "query_id"
245 |                 ].values
246 |             )
247 |         if debug:
248 |             print(len(matched_ids))
249 |         # AND this with query_ids (unless None, then just use this as the
250 |         # first criterion)
251 |         return matched_ids if query_ids is None else matched_ids | query_ids
252 | 
253 |     @staticmethod
254 |     def _read_json_data(json_path: str):
255 |         return json.loads(Path(json_path).read_text(encoding="utf-8"))
256 | 
257 |     def _create_indexes(self, sub_key: str):
258 |         """Create indexes for child items in queries."""
259 |         # create DF with attributes expanded to columns
260 |         if self._data_df is None:
261 |             return
262 |         exp_df = (
263 |             # avoid rows with null or empty dictionaries
264 |             self._data_df[
265 |                 ~((self._data_df[sub_key] == {}) | (self._data_df[sub_key].isna()))
266 |             ][[sub_key]].apply(
267 |                 lambda x: pd.Series(x[sub_key]), result_type="expand", axis=1
268 |             )
269 |         )
270 |         for key, data_type in self._ALL_INDEXES.items():
271 |             if key not in exp_df.columns:
272 |                 continue
273 |             if data_type == list:
274 |                 self._indexes[key] = self._create_list_index(
275 |                     data=exp_df,
276 |                     key_col=key,
277 |                 )
278 |             if data_type == dict:
279 |                 self._indexes[key] = self._create_dict_index(
280 |                     data=exp_df,
281 |                     key_col=key,
282 |                 )
283 |             if data_type == bool:
284 |                 self._indexes[key] = self._create_bool_index(
285 |                     data=exp_df,
286 |                     key_col=key,
287 |                 )
288 | 
289 |     def _add_item_to_indexes(self, query: KqlQuery):
290 |         """Add attributes and kql_properties to indexes."""
291 |         index_attribs = {**(query.attributes), **(query.kql_properties)}
292 |         for key in self._ALL_INDEXES:
293 |             if key not in index_attribs:
294 |                 continue
295 |             df_index = (
296 |                 list(index_attribs[key])
297 |                 if isinstance(index_attribs[key], (list, dict))
298 |                 else [index_attribs[key]]
299 |                 if isinstance(index_attribs[key], bool)
300 |                 else None
301 |             )
302 |             if df_index is not None:
303 |                 current_index = self._indexes.get(key)
304 |                 new_index_items = pd.DataFrame(
305 |                     data=[{"query_id": query.query_id} for _ in df_index],
306 |                     index=df_index,
307 |                 )
308 |                 if current_index is None:
309 |                     self._indexes[key] = new_index_items
310 |                 else:
311 |                     self._indexes[key] = pd.concat(
312 |                         [self._indexes[key], new_index_items]
313 |                     )
314 | 
315 |     @staticmethod
316 |     def _create_list_index(data, key_col):
317 |         return (
318 |             data[[key_col]].explode(key_col).dropna().reset_index().set_index(key_col)
319 |         )
320 | 
321 |     @staticmethod
322 |     def _create_bool_index(data, key_col):
323 |         return data[[key_col]].dropna().reset_index().set_index(key_col)
324 | 
325 |     @staticmethod
326 |     def _extract_dict_keys(row, col_name):
327 |         if isinstance(row[col_name], dict):
328 |             return {
329 |                 col_name: [
330 |                     inner_val
331 |                     for val in row[col_name].values()
332 |                     for inner_val in val
333 |                     if isinstance(val, dict) and inner_val != np.nan
334 |                 ]
335 |             }
336 |         return row
337 | 
338 |     def _create_dict_index(self, data, key_col):
339 |         df_dict_keys = data[[key_col]].apply(
340 |             lambda x: self._extract_dict_keys(x, key_col), result_type="expand", axis=1
341 |         )
342 |         return self._create_list_index(df_dict_keys, key_col)
343 | 


--------------------------------------------------------------------------------
/src/extract.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import queue
  4 | import subprocess
  5 | import threading
  6 | import time
  7 | from base64 import b64encode
  8 | from pathlib import Path
  9 | from uuid import uuid4
 10 | 
 11 | worker_exit = threading.Event()
 12 | worker_queue = queue.Queue()
 13 | worker_results = queue.Queue()
 14 | worker_thread = None
 15 | 
 16 | _CS_PROJ_PATH = str(
 17 |     Path(__file__).parent.joinpath(
 18 |         "../kqlextraction/KqlExtraction/KqlExtraction.csproj"
 19 |     )
 20 | )
 21 | 
 22 | 
 23 | def _worker_thread_proc():
 24 |     try:
 25 |         kql_extraction = None
 26 | 
 27 |         while not worker_exit.is_set():
 28 |             try:
 29 |                 if kql_extraction is not None:
 30 |                     if kql_extraction.poll() is not None:
 31 |                         kql_extraction = None
 32 |                 if kql_extraction is None:
 33 |                     kql_extraction = subprocess.Popen(
 34 |                         [
 35 |                             "dotnet",
 36 |                             "run",
 37 |                             "-c",
 38 |                             "Release",
 39 |                             "--project",
 40 |                             # os.path.join(os.path.abspath(os.path.split(__file__)[0]), 'KqlExtraction', 'KqlExtraction.csproj')
 41 |                             _CS_PROJ_PATH,
 42 |                         ],
 43 |                         stdin=subprocess.PIPE,
 44 |                         stdout=subprocess.PIPE,
 45 |                         stderr=subprocess.PIPE,
 46 |                     )
 47 |             except Exception as ex:
 48 |                 print("[!] Exception Starting KqlExtraction Process")
 49 |                 break
 50 | 
 51 |             try:
 52 |                 uuid, kql = worker_queue.get(timeout=2.0)
 53 |                 kql_extraction.stdin.write(
 54 |                     bytes(f"{uuid},", encoding="utf-8")
 55 |                     + b64encode(bytes(kql, encoding="utf-8"))
 56 |                     + b"\n"
 57 |                 )
 58 |                 kql_extraction.stdin.flush()
 59 | 
 60 |                 kql_extraction_result = kql_extraction.stdout.readline()
 61 |                 worker_results.put(json.loads(kql_extraction_result))
 62 |             except queue.Empty:
 63 |                 pass
 64 |             except Exception as ex:
 65 |                 kql_extraction.kill()
 66 | 
 67 |         if kql_extraction.poll() is None:
 68 |             kql_extraction.kill()
 69 |     except Exception as ex:
 70 |         print("[!] Unhandled Exception", str(ex))
 71 | 
 72 | 
 73 | def extract_kql(kql):
 74 |     kql_id = str(uuid4())
 75 |     worker_queue.put((kql_id, kql))
 76 | 
 77 |     try:
 78 |         kql_result = {}
 79 |         while True:
 80 |             kql_result = worker_results.get(timeout=5.0)
 81 |             if "Id" in kql_result and kql_result["Id"] == kql_id:
 82 |                 break
 83 |     except Exception:
 84 |         pass
 85 | 
 86 |     return kql_result
 87 | 
 88 | 
 89 | def start():
 90 |     global worker_thread
 91 |     worker_thread = threading.Thread(target=_worker_thread_proc)
 92 |     worker_thread.start()
 93 | 
 94 | 
 95 | def stop():
 96 |     worker_exit.set()
 97 |     worker_thread.join()
 98 | 
 99 | 
100 | if __name__ == "__main__":
101 |     worker_thread = threading.Thread(target=_worker_thread_proc)
102 |     worker_thread.start()
103 | 
104 |     try:
105 |         base_path = os.path.abspath(os.path.split(__file__)[0])
106 |         for kql_file in os.listdir(os.path.join(base_path, "tests")):
107 |             kql_file = os.path.join(base_path, "tests", kql_file)
108 | 
109 |             with open(kql_file, "r") as f:
110 |                 kql = f.read()
111 | 
112 |             print(extract_kql(kql))
113 |     except Exception as ex:
114 |         print("[!] Unhandled Exception", str(ex))
115 | 
116 |     while not worker_queue.empty():
117 |         time.sleep(0.5)
118 | 
119 |     worker_exit.set()
120 |     worker_thread.join()
121 | 


--------------------------------------------------------------------------------
/src/ian_test.kql:
--------------------------------------------------------------------------------
 1 | //Detects when a user with a privileged Azure AD role has had their on premises Active Directory password changed by someone other than themselves.
 2 | 
 3 | //Data connector required for this query - Windows Security Events via AMA or Security Events via Legacy Agent
 4 | //Data connector required for this query - Microsoft Sentinel UEBA
 5 | 
 6 | let timeframe=7d;
 7 | //First find any users that hold privileged Azure AD roles
 8 | IdentityInfo
 9 | | where TimeGenerated > ago(21d)
10 | | where isnotempty(AssignedRoles)
11 | | where AssignedRoles != "[]"
12 | | summarize arg_max(TimeGenerated, *) by AccountUPN
13 | | project AccountUPN, AccountName, AccountSID
14 | //Join those users based on AccountSID to on premises Active Directory password reset events
15 | | join kind=inner (
16 |     SecurityEvent
17 |     | where TimeGenerated > ago(timeframe)
18 |     | where EventID == "4724"
19 |     | project
20 |         TimeGenerated,
21 |         Activity,
22 |         SubjectAccount,
23 |         TargetAccount,
24 |         TargetSid,
25 |         SubjectUserSid
26 |     )
27 |     on $left.AccountSID == $right.TargetSid
28 | | where SubjectUserSid != TargetSid
29 | //Summarize event data to make it easy to read
30 | | project ['Time of Password Reset']=TimeGenerated, Activity, Actor=SubjectAccount, ['Target UserPrincipalName']=AccountUPN,['Target AccountName']=TargetAccount


--------------------------------------------------------------------------------
/src/kql_download.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """Github download and conversion functions."""
  7 | 
  8 | import logging
  9 | import shutil
 10 | from itertools import chain
 11 | from pathlib import Path
 12 | from typing import List, Union
 13 | 
 14 | import pandas as pd
 15 | 
 16 | from .kql_file_parser import (
 17 |     download_git_archive,
 18 |     format_repo_url,
 19 |     get_sentinel_queries_from_github,
 20 |     parse_kql_to_dict,
 21 |     parse_markdown_to_dict,
 22 |     parse_yaml,
 23 |     read_config,
 24 | )
 25 | from .kql_query import KqlQuery
 26 | 
 27 | __author__ = "Ashwin Patil, Jannie Li, Ian Hellen"
 28 | 
 29 | 
 30 | _CURR_DIR = Path.cwd()
 31 | 
 32 | 
 33 | def get_sentinel_queries(output_path: Path = _CURR_DIR):
 34 |     """Return Sentinel queries from repo."""
 35 |     # download sentinel github and extract yaml files only
 36 |     azsentinel_git_url = "https://github.com/Azure/Azure-Sentinel/archive/master.zip"
 37 |     get_sentinel_queries_from_github(git_url=azsentinel_git_url, outputdir=output_path)
 38 | 
 39 |     # Parsing yaml files and converting to dataframe
 40 |     base_dir = str(output_path.joinpath("Azure-Sentinel-master"))
 41 |     detections_df = parse_yaml(parent_dir=base_dir, child_dir="Detections")
 42 |     hunting_df = parse_yaml(parent_dir=base_dir, child_dir="Hunting Queries")
 43 |     solutions_df = parse_yaml(parent_dir=base_dir, child_dir="Solutions")
 44 | 
 45 |     # tmp dirs
 46 |     logging.info(
 47 |         "Detections: %d Hunting Queries: %d Solutions: %d",
 48 |         len(detections_df),
 49 |         len(hunting_df),
 50 |         len(hunting_df),
 51 |     )
 52 |     _remove_tmp_folder(output_path.joinpath("Azure-Sentinel-master"))
 53 |     # Filtering yamls with no KQL queries
 54 |     query_list = _sent_dfs_to_kql_query_list(
 55 |         detections_df=detections_df[detections_df["query"].notnull()],
 56 |         hunting_df=hunting_df[hunting_df["query"].notnull()],
 57 |         solutions_df=solutions_df[solutions_df["query"].notnull()],
 58 |     )
 59 |     return [KqlQuery(**query) for query in query_list]
 60 | 
 61 | 
 62 | def _sent_dfs_to_kql_query_list(detections_df, hunting_df, solutions_df):
 63 |     # Selecting specific columns
 64 |     columns = [
 65 |         "name",
 66 |         "GithubURL",
 67 |         "query",
 68 |         "description",
 69 |         "tactics",
 70 |         "relevantTechniques",
 71 |     ]
 72 |     all_dfs = [detections_df[columns], hunting_df[columns], solutions_df[columns]]
 73 |     sentinel_github = pd.concat(all_dfs, ignore_index=True, sort=True)
 74 | 
 75 |     # renaming to columns to match with schema
 76 |     sentinel_github = sentinel_github.rename(
 77 |         columns={
 78 |             "GithubURL": "source_path",
 79 |             "name": "query_name",
 80 |             "relevantTechniques": "techniques",
 81 |         },
 82 |     )
 83 | 
 84 |     cols = ["description", "techniques", "tactics"]
 85 |     # create new column by merging selected columns into dictionary
 86 |     sentinel_github["attributes"] = sentinel_github[cols].to_dict(orient="records")
 87 | 
 88 |     # select columns and display sample dataframe records
 89 |     select_columns = ["source_path", "query_name", "query", "attributes"]
 90 |     sentinel_github[select_columns].head()
 91 | 
 92 |     # return it as list of dictionary
 93 |     return sentinel_github[select_columns].to_dict(orient="records")
 94 | 
 95 | 
 96 | # ### KQL - Community Github Repos
 97 | 
 98 | 
 99 | def get_community_queries(
100 |     output_dir: Path = _CURR_DIR, config: Union[Path, str] = "repos.yaml"
101 | ):
102 |     """Return KqlQuery list from community repos."""
103 |     # Read yaml config file
104 |     repos = read_config(config)
105 | 
106 |     # Compile list of github urls to download
107 |     repo_urls: List[str] = []
108 |     tmp_dirs: List[str] = []
109 |     for item in repos:
110 |         url = format_repo_url(item["Github"]["repo"], item["Github"]["branch"])
111 |         repo_urls.append(url)
112 |         tmp_dirs.append(
113 |             str(
114 |                 output_dir.joinpath(
115 |                     f"{item['Github']['repo']}-{item['Github']['branch']}"
116 |                 )
117 |             )
118 |         )
119 | 
120 |     # download github urls one by one
121 |     for url in repo_urls:
122 |         download_git_archive(url, output_dir)
123 | 
124 |     txt_queries = _read_community_txt_queries(repos, output_dir)
125 |     md_queries = _read_community_md_queries(repos, output_dir)
126 |     to_remove = tmp_dirs.copy()
127 |     for tmp_dir in to_remove:
128 |         _remove_tmp_folder(tmp_dir)
129 |         tmp_dirs.remove(tmp_dir)
130 |     return [
131 |         query if isinstance(query, KqlQuery) else KqlQuery(**query)
132 |         for query in chain(txt_queries, md_queries)
133 |     ]
134 | 
135 | 
136 | def _read_community_txt_queries(repos, src_path):
137 |     """Parse text files."""
138 |     parsed_txt_queries = []
139 | 
140 |     for item in repos:
141 |         repo_name = item["Github"]["repo"]
142 |         branch_name = item["Github"]["branch"]
143 |         list_of_dict = parse_kql_to_dict(repo_name, branch_name, src_path)
144 |         parsed_txt_queries.extend(list_of_dict)
145 |     # display parsed sample record
146 |     logging.info("Parsed %d queries from text files", len(parsed_txt_queries))
147 |     return parsed_txt_queries
148 | 
149 | 
150 | def _read_community_md_queries(repos, src_path):
151 |     """Parses markdown files."""
152 |     parsed_md_queries = []
153 | 
154 |     for item in repos:
155 |         repo_name = item["Github"]["repo"]
156 |         branch_name = item["Github"]["branch"]
157 |         list_of_dict = parse_markdown_to_dict(repo_name, branch_name, src_path)
158 |         parsed_md_queries.extend(list_of_dict)
159 | 
160 |     logging.info("Parsed %d queries from text files", len(parsed_md_queries))
161 |     return parsed_md_queries
162 | 
163 | 
164 | def _remove_tmp_folder(tmp_dir):
165 |     if Path(tmp_dir).is_dir():
166 |         try:
167 |             shutil.rmtree(tmp_dir)
168 |         except Exception as err:  # pylint: disable=broad-except
169 |             logging.exception(
170 |                 "Error trying to remove temporary folder '%s'.", tmp_dir, exc_info=err
171 |             )
172 | 


--------------------------------------------------------------------------------
/src/kql_extract.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """Kql extract threading interface with .Net Kqlextract."""
  7 | import contextlib
  8 | import json
  9 | import logging
 10 | import queue
 11 | import subprocess
 12 | import threading
 13 | import time
 14 | from base64 import b64encode
 15 | from pathlib import Path
 16 | from typing import Optional
 17 | from uuid import uuid4
 18 | 
 19 | __author__ = "Liam Kirton"
 20 | 
 21 | 
 22 | base_path = Path(__file__).parent
 23 | CS_PROJ_PATH = base_path.joinpath("../kqlextraction/KqlExtraction/KqlExtraction.csproj")
 24 | 
 25 | worker_exit = threading.Event()
 26 | worker_queue = queue.Queue()  # type: ignore
 27 | worker_results = queue.Queue()  # type: ignore
 28 | worker_thread = None
 29 | 
 30 | # pylint: disable=broad-except
 31 | 
 32 | _EXTRACT_ARGS = [
 33 |     "dotnet",
 34 |     "run",
 35 |     "-c",
 36 |     "Release",
 37 |     "--project",
 38 |     str(CS_PROJ_PATH),
 39 | ]
 40 | _SYNTAX_ERROR = "[!]"
 41 | 
 42 | 
 43 | def _worker_thread_proc():
 44 |     try:
 45 |         kql_extraction = None
 46 | 
 47 |         while not worker_exit.is_set():
 48 |             try:
 49 |                 if kql_extraction is not None and kql_extraction.poll() is not None:
 50 |                     kql_extraction = None
 51 |                 if kql_extraction is None:
 52 |                     kql_extraction = subprocess.Popen(
 53 |                         _EXTRACT_ARGS,
 54 |                         stdin=subprocess.PIPE,
 55 |                         stdout=subprocess.PIPE,
 56 |                         stderr=subprocess.PIPE,
 57 |                     )
 58 |             except Exception as subp_ex:
 59 |                 logging.exception(
 60 |                     "[!] Exception Starting KqlExtraction Process.", exc_info=subp_ex
 61 |                 )
 62 |                 break
 63 | 
 64 |             try:
 65 |                 uuid, kql = worker_queue.get(timeout=2.0)
 66 |                 kql_extraction.stdin.write(
 67 |                     bytes(f"{uuid},", encoding="utf-8")
 68 |                     + b64encode(bytes(kql, encoding="utf-8"))
 69 |                     + b"\n"
 70 |                 )
 71 |                 kql_extraction.stdin.flush()
 72 | 
 73 |                 kql_extraction_result = kql_extraction.stdout.readline()
 74 |                 if (
 75 |                     str(kql_extraction_result, encoding="utf-8")
 76 |                     .strip()
 77 |                     .startswith(_SYNTAX_ERROR)
 78 |                 ):
 79 |                     worker_results.put(_syntax_err_result(uuid))
 80 |                 else:
 81 |                     worker_results.put(json.loads(kql_extraction_result))
 82 |                     # try:
 83 |                     #     worker_results.put(json.loads(kql_extraction_result))
 84 |                     # except json.JSONDecodeError:
 85 |                     #     worker_results.put(_syntax_err_result(uuid))
 86 |             except queue.Empty:
 87 |                 pass
 88 |             except Exception as thread_ex:
 89 |                 logging.exception(
 90 |                     "[!] Unhandled Exception in 'while not worker_exit.is_set', query_id='%s', \ninput sample: %s",
 91 |                     uuid,
 92 |                     kql_extraction_result[:200],
 93 |                     exc_info=thread_ex,
 94 |                 )
 95 |                 kql_extraction.kill()
 96 | 
 97 |         if kql_extraction is not None and kql_extraction.poll() is None:
 98 |             kql_extraction.kill()
 99 |     except Exception as thread_out_ex:
100 |         logging.exception(
101 |             "[!] Unhandled Exception at 'while not worker_exit.is_set()'",
102 |             exc_info=thread_out_ex,
103 |         )
104 | 
105 | 
106 | def extract_kql(kql_query: str, query_id: Optional[str] = None):
107 |     """Extract kql_properties from Kql query."""
108 |     kql_id = query_id or str(uuid4())
109 |     worker_queue.put((kql_id, kql_query))
110 | 
111 |     with contextlib.suppress(Exception):
112 |         kql_result = {}
113 |         while True:
114 |             kql_result = worker_results.get(timeout=5.0)
115 |             if "Id" in kql_result and kql_result["Id"] == kql_id:
116 |                 break
117 |     return kql_result
118 | 
119 | 
120 | def start():
121 |     """Start extractor worker thread."""
122 |     global worker_thread  # pylint: disable=invalid-name, global-use
123 |     worker_thread = threading.Thread(target=_worker_thread_proc)
124 |     worker_thread.start()
125 |     logging.info("Started kql extractor thread.")
126 | 
127 | 
128 | def stop():
129 |     """Stop worker thread."""
130 |     worker_exit.set()
131 |     worker_thread.join()
132 |     logging.info("Kql extractor thread stopped.")
133 | 
134 | 
135 | def _syntax_err_result(query_id):
136 |     return {
137 |         "Id": query_id,
138 |         "FunctionCalls": [],
139 |         "Joins": {},
140 |         "Operators": [],
141 |         "Tables": [],
142 |         "Valid_query": False,
143 |     }
144 | 
145 | 
146 | if __name__ == "__main__":
147 |     worker_thread = threading.Thread(target=_worker_thread_proc)
148 |     worker_thread.start()
149 | 
150 |     test_path = base_path.joinpath("test_data")
151 |     print("using", test_path)
152 |     print(len(list(test_path.glob("*.kql"))), "kql files")
153 |     try:
154 |         for file_no, kql_file in enumerate(test_path.glob("*.kql")):
155 |             # kql_file = os.path.join(base_path, "tests", kql_file)
156 |             print(f"[{file_no}], {kql_file.name}")
157 |             print(
158 |                 f"[{file_no}]\n".join(
159 |                     kql_file.read_text(encoding="utf-8").split("\n")[:5]
160 |                 )
161 |             )
162 |             with open(kql_file, "r", encoding="utf-8") as f:
163 |                 kql_text = f.read()
164 | 
165 |             print(f"[{file_no}]", extract_kql(kql_text, query_id=file_no))
166 | 
167 |     except Exception as ex:
168 |         print("[!] Unhandled Exception", ex)
169 | 
170 |     while not worker_queue.empty():
171 |         time.sleep(0.5)
172 | 
173 |     worker_exit.set()
174 |     worker_thread.join()
175 | 


--------------------------------------------------------------------------------
/src/kql_file_parser.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """Query download and parsing functions."""
  7 | 
  8 | import glob
  9 | import io
 10 | import logging
 11 | import urllib.parse
 12 | import warnings
 13 | import zipfile
 14 | from pathlib import Path
 15 | from typing import List
 16 | 
 17 | import pandas as pd
 18 | import requests
 19 | import yaml
 20 | from pandas import json_normalize
 21 | from requests.exceptions import HTTPError
 22 | from tqdm.auto import tqdm
 23 | 
 24 | from .kql_query import KqlQuery
 25 | 
 26 | __author__ = "Ashwin Patil, Jannie Li"
 27 | 
 28 | 
 29 | def read_config(filename):
 30 |     with open(filename, "r", encoding="utf-8") as yamlfile:
 31 |         data = yaml.safe_load(yamlfile)
 32 |     return data
 33 | 
 34 | 
 35 | def format_repo_url(repo_name, branch_name):
 36 |     return f"https://github.com/{repo_name}/archive/{branch_name}.zip"
 37 | 
 38 | 
 39 | def download_git_archive(git_url, output_dir):
 40 |     logging.info("Downloading %s, may take few mins..", git_url)
 41 |     try:
 42 |         r = requests.get(git_url)
 43 |         repo_zip = io.BytesIO(r.content)
 44 |         logging.info("Extracting files..")
 45 |         with zipfile.ZipFile(repo_zip, mode="r") as archive:
 46 |             for file in tqdm(archive.namelist()):
 47 |                 archive.extract(file, path=output_dir)
 48 |         logging.info("Downloaded and Extracted Files successfully")
 49 |     except HTTPError as http_err:
 50 |         warnings.warn(f"HTTP error occurred trying to download from Github: {http_err}")
 51 | 
 52 | 
 53 | def get_sentinel_queries_from_github(git_url, outputdir):
 54 |     logging.info("Downloading from Azure Sentinel Github, may take 2-3 mins..")
 55 |     try:
 56 |         r = requests.get(git_url)
 57 |         repo_zip = io.BytesIO(r.content)
 58 | 
 59 |         with zipfile.ZipFile(repo_zip, mode="r") as archive:
 60 |             # Only extract Detections and Hunting Queries Folder
 61 |             logging.info("Extracting files..")
 62 |             for file in tqdm(archive.namelist()):
 63 |                 if file.startswith(
 64 |                     (
 65 |                         "Azure-Sentinel-master/Detections/",
 66 |                         "Azure-Sentinel-master/Hunting Queries/",
 67 |                         "Azure-Sentinel-master/Solutions/",
 68 |                     )
 69 |                 ) and file.endswith(".yaml"):
 70 |                     archive.extract(file, path=outputdir)
 71 |         logging.info("Downloaded and Extracted Files successfully")
 72 |     except HTTPError as http_err:
 73 |         warnings.warn(f"HTTP error occurred trying to download from Github: {http_err}")
 74 | 
 75 | 
 76 | def parse_yaml(parent_dir, child_dir):
 77 |     sentinel_repourl = "https://github.com/Azure/Azure-Sentinel/blob/master"
 78 |     bad_yamls = [
 79 |         (
 80 |             "/home/jovyan/work/Hackathon/kql-query-store/dev-notebooks/"
 81 |             "Azure-Sentinel-master/Hunting Queries/Microsoft 365 Defender"
 82 |             "/Device Inventory/Find Software By Name and Version.yaml"
 83 |         )
 84 |     ]
 85 |     # Collect list of files recursively under a folder
 86 |     yaml_queries = glob.glob(f"{parent_dir}/{child_dir}/**/*.yaml", recursive=True)
 87 |     yaml_queries = [query for query in yaml_queries if query not in bad_yamls]
 88 | 
 89 |     frames: List[pd.DataFrame] = []
 90 | 
 91 |     # Recursively load yaml Files and append to dataframe
 92 |     logging.info("Parsing yaml queries..")
 93 |     for query in tqdm(yaml_queries):
 94 |         with open(query, "r", encoding="utf-8", errors="ignore") as file_stream:
 95 |             try:
 96 |                 parsed_yaml_df = json_normalize(yaml.safe_load(file_stream))
 97 |             except Exception as err:  # pylint: disable=broad-except
 98 |                 logging.exception(
 99 |                     "Exception parsing yaml_query %s", query, exc_info=err
100 |                 )
101 |                 continue
102 |             parsed_yaml_df["GithubURL"] = urllib.parse.quote(
103 |                 query.replace(parent_dir, sentinel_repourl), safe=":/"
104 |             )
105 |             # #URL encode
106 |             # parsed_yaml_df["GithubURL"] = urllib.parse.quote(parsed_yaml_df["GithubURL"], safe=':/')
107 |             # parsed_yaml_df = parsed_yaml_df[columns]
108 |             frames.append(parsed_yaml_df)
109 | 
110 |     return pd.concat(frames, ignore_index=True, sort=True)
111 | 
112 | 
113 | def parse_kql_to_dict(repo_name, branch_name, src_path):
114 |     parent_dir = Path(src_path).joinpath(f"{repo_name.split('/')[-1]}-{branch_name}")
115 |     kql_files = glob.glob(f"{parent_dir}/**/*.kql", recursive=True)
116 | 
117 |     git_repo_url = f"https://github.com/{repo_name}/tree/main"
118 | 
119 |     list_of_kql_files_dict = []
120 |     logging.info("Parsing queries..")
121 |     for file in tqdm(kql_files):
122 |         with open(file, "r", encoding="utf-8", errors="ignore") as f:
123 |             kql_query = KqlQuery(
124 |                 query=f.read(),
125 |                 source_path=urllib.parse.quote(
126 |                     file.replace(str(parent_dir), git_repo_url), safe=":/"
127 |                 ),
128 |                 query_name=Path(file).stem,
129 |                 source_type="text",
130 |                 attributes={},
131 |             )
132 |             list_of_kql_files_dict.append(kql_query)
133 | 
134 |     return list_of_kql_files_dict
135 | 
136 | 
137 | def parse_markdown_to_dict(repo_name, branch_name, src_path):
138 |     parent_dir = Path(src_path).joinpath(f"{repo_name.split('/')[-1]}-{branch_name}")
139 |     md_files = glob.glob(f"{parent_dir}/**/*.md", recursive=True)
140 |     logging.info(
141 |         "Processing %d markdown files from repo: %s",
142 |         len(md_files),
143 |         repo_name,
144 |     )
145 |     git_repo_url = f"https://github.com/{repo_name}/tree/main"
146 | 
147 |     # src_path_list = []
148 |     logging.info("Parsing markdown files..")
149 |     kql_query_list: List[KqlQuery] = []
150 |     for file in tqdm(md_files):
151 |         file_path = Path(file)
152 |         lines = file_path.read_text(encoding="utf-8").split("\n")
153 | 
154 |         in_kql = False
155 |         kql_text = []
156 |         last_header = None
157 |         context = []
158 |         qry_index = 0
159 |         for line in lines:
160 |             if line.startswith("```kql"):
161 |                 in_kql = True
162 |                 continue
163 |             if line.strip() == "```":
164 |                 kql_query_list.append(
165 |                     KqlQuery(
166 |                         query="\n".join(kql_text),
167 |                         source_path=urllib.parse.quote(
168 |                             str(file_path).replace(str(parent_dir), git_repo_url),
169 |                             safe=":/",
170 |                         ),
171 |                         source_type="markdown",
172 |                         source_index=qry_index,
173 |                         query_name=last_header or f"{file_path.stem}_{qry_index}",
174 |                         context="\n".join(context[-10:]),
175 |                     )
176 |                 )
177 |                 qry_index += 1
178 |                 in_kql = False
179 |                 kql_text = []
180 |                 last_header = None
181 |                 context = []
182 |                 continue
183 |             if not in_kql and line.startswith("#"):
184 |                 last_header = line
185 |             if in_kql:
186 |                 kql_text.append(line)
187 |             else:
188 |                 context.append(line)
189 | 
190 |         # ct = 0
191 |         # kql = False
192 |         # kql_collect = []
193 |         # title_collect = []
194 |         # cur_kql = []
195 |         # title = "n/a"
196 |         # while ct < len(lines):
197 |         #     if kql:
198 |         #         cur_kql.append(lines[ct])
199 |         #     if lines[ct].startswith("#") and lines[ct + 2] == "```kql":
200 |         #         kql = True
201 |         #         title = lines[ct]
202 |         #     elif lines[ct] == "```kql":
203 |         #         kql = True
204 |         #     elif lines[ct] == "```":
205 |         #         kql = False
206 |         #         cur_kql = "\n".join(cur_kql)
207 |         #         kql_collect.append(cur_kql)
208 |         #         title_collect.append(title)
209 |         #         title = "n/a"
210 |         #         cur_kql = []
211 |         #     ct += 1
212 |         #     src_path = urllib.parse.quote(
213 |         #         str(file_path).replace(str(parent_dir), git_repo_url), safe=":/"
214 |         #     )
215 |         #     src_path_list.append(src_path)
216 | 
217 |         #     kql_query = KqlQuery(
218 |         #         query_name=title_collect,
219 |         #         query=kql_collect,
220 |         #         source_path=src_path_list,
221 |         #     )
222 |         #     df = pd.concat([df, test_df])
223 | 
224 |     return kql_query_list
225 | 


--------------------------------------------------------------------------------
/src/kql_query.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """KqlQuery data class."""
  7 | import hashlib
  8 | import json
  9 | import re
 10 | import uuid
 11 | from dataclasses import asdict, dataclass, field, fields
 12 | from typing import Any, Dict, List, Literal, Optional
 13 | 
 14 | import pandas as pd
 15 | 
 16 | __author__ = "Ian Hellen"
 17 | 
 18 | 
 19 | _SOURCE_TYPES = ["text", "markdown", "sentinel_yaml", "api", "other"]
 20 | SourceType = Literal["text", "markdown", "sentinel_yaml", "api", "other"]
 21 | _REPO_NAME = re.compile(r"https://github\.com/(?P<name>[^/]+/[^/]+)/.*", re.IGNORECASE)
 22 | 
 23 | 
 24 | def _uuid_str():
 25 |     return str(uuid.uuid4())
 26 | 
 27 | 
 28 | @dataclass
 29 | class KqlQuery:
 30 |     """
 31 |     Data format for KqlQuery record.
 32 | 
 33 |     Attributes
 34 |     ----------
 35 |     source_path : str
 36 |         The path to the original file or API identifier.
 37 |     query : str
 38 |         The raw query string
 39 |     source_type : SourceType, optional
 40 |         String - the source file/data type. Valid types are:
 41 |         text, markdown, sentinel_yaml, api, other
 42 |     source_index : int, optional
 43 |         The index (0-based) if the query is one of several in the
 44 |         file pointed to by source_path. The default is 0.
 45 |     query_name : Optional[str]
 46 |         The name of the query. If None this will be derived from
 47 |         the last element of source_path
 48 |     attributes: Dict[str, Any], optional
 49 |         Dictionary of any metadata attributes read from the source
 50 |         file.
 51 |     kql_properties: Dict[str, Any], optional
 52 |         Dictionary of properties derived from the KQL query
 53 |     query_id: Optional[str], optional
 54 |         UUID used to identify the query
 55 |     query_hash: int, optional
 56 |         Hash of the query text
 57 |     query_version: int, optional
 58 |         Query version, not currently used. Default is 0
 59 | 
 60 |     Examples
 61 |     --------
 62 |     Create a KqlQuery instance
 63 |     >>>> kql = KqlQuery(
 64 |     ...     source_path="https://github.com/a/b/file.kql",
 65 |     ...     query="SecurityAlert | take 1"
 66 |     ... )
 67 | 
 68 |     Create a KqlQuery instance from a dict
 69 |     >>>> attribs = {
 70 |     ...     "source_path": "https://github.com/a/b/file.kql",
 71 |     ...     "query": "SecurityAlert | take 1",
 72 |     ... }
 73 |     ... kql = KqlQuery(**attribs)
 74 | 
 75 |     Different default representation
 76 |     >>>> kql
 77 |     KqlQuery(source_path='https://github.com/a/b/file.kql', query='SecurityAlert... query_version=0)
 78 | 
 79 |     As a dict
 80 |     >>>> print(kql.asdict())
 81 |     {'source_path': 'https://github.com/a/b/file.kql', 'query': 'SecurityAlert... 'query_version': 0}
 82 | 
 83 |     As JSON
 84 |     print(kql.to_json())
 85 |     {"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert... "query_version": 0}
 86 | 
 87 |     Class method to convert a list of KqlQuery instances to a list of dicts
 88 |     >>>> KqlQuery.kql_list_to_pylist([kql, kql])
 89 | 
 90 |     Class method to convert a list of KqlQuery instances to JSON
 91 |     >>>> KqlQuery.kql_list_to_json([kql, kql])
 92 |     '[{"source_path": "https://github.com/a/b/file.kql", "query": "SecurityAlert... "query_version": 0}]'
 93 | 
 94 |     Class method to convert list of KqlQuery instances to a DataFrame
 95 |     """
 96 | 
 97 |     source_path: str
 98 |     query: str
 99 |     source_type: SourceType = "text"
100 |     source_index: int = 0
101 |     repo_name: Optional[str] = None
102 |     query_name: Optional[str] = None
103 |     context: Optional[str] = None
104 |     attributes: Dict[str, Any] = field(default_factory=dict)
105 |     kql_properties: Dict[str, Any] = field(default_factory=dict)
106 |     query_id: str = field(default_factory=_uuid_str)
107 |     query_hash: int = 0
108 |     query_version: int = 0
109 | 
110 |     def __post_init__(self):
111 |         """Run post"""
112 |         if self.query_name is None and self.source_path is not None:
113 |             self.query_name = self.source_path.rsplit("/", maxsplit=1)[-1]
114 |         if self.query:
115 |             self.query_hash = hashlib.sha256(
116 |                 bytes(self.query, encoding="utf-8"),
117 |                 # usedforsecurity=False
118 |             ).hexdigest()
119 |         if self.repo_name is None and self.source_path is not None:
120 |             match = _REPO_NAME.match(self.source_path)
121 |             if match:
122 |                 self.repo_name = match["name"]
123 | 
124 |     def asdict(self):
125 |         """Return a dictionary of attributes."""
126 |         return asdict(self)
127 | 
128 |     def to_json(self):
129 |         """Return JSON representation of attributes."""
130 |         return json.dumps(self.asdict())
131 | 
132 |     # helper methods and properties
133 |     @property
134 |     def source_types(self):
135 |         """Return list of acceptable source_types."""
136 |         del self
137 |         return _SOURCE_TYPES
138 | 
139 |     @classmethod
140 |     def field_names(cls) -> List[str]:
141 |         """Return list of fields."""
142 |         return [field.name for field in fields(cls)]
143 | 
144 |     @staticmethod
145 |     def kql_list_to_pylist(kql_queries: List["KqlQuery"]):
146 |         """Return a list of Python dicts from a list of KqlQuery instances."""
147 |         return [kql.asdict() for kql in kql_queries]
148 | 
149 |     @classmethod
150 |     def kql_list_to_json(cls, kql_queries: List["KqlQuery"]):
151 |         """Return JSON from a list of KqlQuery instances."""
152 |         return json.dumps(cls.kql_list_to_pylist(kql_queries))
153 | 
154 |     @classmethod
155 |     def kql_list_to_df(cls, kql_queries: List["KqlQuery"]):
156 |         """Return a pandas DataFrame from a list of KqlQuery instances."""
157 |         return pd.DataFrame(cls.kql_list_to_pylist(kql_queries))
158 | 


--------------------------------------------------------------------------------
/src/kqlextraction/tests/test1.kql:
--------------------------------------------------------------------------------
 1 | // https://github.com/Azure/Azure-Sentinel/blob/master/Hunting%20Queries/SigninLogs/UserLoginIPAddressTeleportation.yaml
 2 | 
 3 | let windowTime = 20min / 2; //Window to lookup anomalous logins within
 4 | let excludeKnownVPN = dynamic(['127.0.0.1', '0.0.0.0']); //Known VPN IP addresses to exclude
 5 | SigninLogs
 6 | | where ConditionalAccessStatus =~ "success"
 7 | | extend country = LocationDetails['countryOrRegion']
 8 | | where country != ""
 9 | | summarize count() by tostring(country)
10 | | join (
11 |     //Get the total number of logins from any country and join it to the previous count in a single table
12 |     SigninLogs
13 |     | where ConditionalAccessStatus =~ "success"
14 |     | extend country = LocationDetails['countryOrRegion']
15 |     | where country != ""
16 |     | summarize count(), make_list(tostring(country))
17 |     | mv-expand list_country
18 |     | extend country = tostring(list_country)
19 | ) on country
20 | | summarize by country, count_, count_1
21 | //Now calculate each countries prevalence within login events
22 | | extend prevalence = toreal(count_) / toreal(count_1) * 100
23 | | project-away count_1
24 | | where prevalence < 0.01
25 | | join kind=rightsemi(
26 |     SigninLogs
27 |     //Enable to limit to o365 exchange logins
28 |     //| where AppDisplayName =~ "Office 365 Exchange Online"
29 |     | where ConditionalAccessStatus =~ "success"
30 |     | where IPAddress != ""
31 |     | extend country = tostring(LocationDetails['countryOrRegion'])
32 |     | summarize count() by TimeGenerated, UserPrincipalName, country, IPAddress
33 | ) on country
34 | | join kind=leftouter (
35 |     SigninLogs
36 |     //Enable to limit to o365 exchange logins
37 |     //| where AppDisplayName =~ "Office 365 Exchange Online"
38 |     | where ConditionalAccessStatus =~ "success"
39 |     | extend country = tostring(LocationDetails['countryOrRegion'])
40 |     | summarize by TimeGenerated, IPAddress, UserPrincipalName, country
41 | ) on UserPrincipalName
42 | | where IPAddress != IPAddress1 and country != country1
43 | | extend WindowStart = TimeGenerated1 - windowTime
44 | | extend WindowEnd = TimeGenerated1 + windowTime
45 | | where TimeGenerated between (WindowStart .. WindowEnd)
46 | | project Account=UserPrincipalName, AnomalousIP=IPAddress, AnomalousLoginTime=TimeGenerated, AnomalousCountry=country, OtherLoginIP=IPAddress1, OtherLoginCountry=country1, OtherLoginWindowStart=WindowStart, OtherLoginWindowEnd=WindowEnd
47 | | where AnomalousIP !in(excludeKnownVPN) and OtherLoginIP !in(excludeKnownVPN)
48 | | extend timestamp = AnomalousLoginTime, AccountCustomEntity = Account, IPCustomEntity = AnomalousIP


--------------------------------------------------------------------------------
/src/kqlextraction/tests/test2.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | where A == 1
3 | | summarize count() by B


--------------------------------------------------------------------------------
/src/kqlextraction/tests/test3.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | mv-expand Z
3 | | join kind=leftsemi hint.remote=true Bar on T
4 | | join kind=leftsemi (
5 |     Baz
6 |     | where X > 5
7 |     | project R
8 |   ) on R


--------------------------------------------------------------------------------
/src/kqlextraction/tests/test4.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | lookup (Bar) on T


--------------------------------------------------------------------------------
/src/kqlextraction/tests/test5.kql:
--------------------------------------------------------------------------------
1 | union Foo, Bar, Baz


--------------------------------------------------------------------------------
/src/repos.yaml:
--------------------------------------------------------------------------------
 1 | - Github:
 2 |     branch: main
 3 |     repo: reprise99/Sentinel-Queries
 4 | - Github:
 5 |     branch: main
 6 |     repo: ugurkocde/KQL_Intune
 7 | - Github:
 8 |     branch: master
 9 |     repo: alexverboon/MDATP
10 | - Github:
11 |     branch: master
12 |     repo: eshlomo1/Microsoft-Sentinel-4-SecOps
13 | - Github:
14 |     branch: master
15 |     repo: FalconForceTeam/FalconFriday
16 | - Github:
17 |     branch: master
18 |     repo: Kaidja/Microsoft-Sentinel
19 | - Github:
20 |     branch: main
21 |     repo: Cyb3r-Monk/Threat-Hunting-and-Detection
22 | - Github:
23 |     branch: main
24 |     repo: rod-trent/MustLearnKQL


--------------------------------------------------------------------------------
/src/test_data/test1.kql:
--------------------------------------------------------------------------------
 1 | // https://github.com/Azure/Azure-Sentinel/blob/master/Hunting%20Queries/SigninLogs/UserLoginIPAddressTeleportation.yaml
 2 | 
 3 | let windowTime = 20min / 2; //Window to lookup anomalous logins within
 4 | let excludeKnownVPN = dynamic(['127.0.0.1', '0.0.0.0']); //Known VPN IP addresses to exclude
 5 | SigninLogs
 6 | | where ConditionalAccessStatus =~ "success"
 7 | | extend country = LocationDetails['countryOrRegion']
 8 | | where country != ""
 9 | | summarize count() by tostring(country)
10 | | join (
11 |     //Get the total number of logins from any country and join it to the previous count in a single table
12 |     SigninLogs
13 |     | where ConditionalAccessStatus =~ "success"
14 |     | extend country = LocationDetails['countryOrRegion']
15 |     | where country != ""
16 |     | summarize count(), make_list(tostring(country))
17 |     | mv-expand list_country
18 |     | extend country = tostring(list_country)
19 | ) on country
20 | | summarize by country, count_, count_1
21 | //Now calculate each countries prevalence within login events
22 | | extend prevalence = toreal(count_) / toreal(count_1) * 100
23 | | project-away count_1
24 | | where prevalence < 0.01
25 | | join kind=rightsemi(
26 |     SigninLogs
27 |     //Enable to limit to o365 exchange logins
28 |     //| where AppDisplayName =~ "Office 365 Exchange Online"
29 |     | where ConditionalAccessStatus =~ "success"
30 |     | where IPAddress != ""
31 |     | extend country = tostring(LocationDetails['countryOrRegion'])
32 |     | summarize count() by TimeGenerated, UserPrincipalName, country, IPAddress
33 | ) on country
34 | | join kind=leftouter (
35 |     SigninLogs
36 |     //Enable to limit to o365 exchange logins
37 |     //| where AppDisplayName =~ "Office 365 Exchange Online"
38 |     | where ConditionalAccessStatus =~ "success"
39 |     | extend country = tostring(LocationDetails['countryOrRegion'])
40 |     | summarize by TimeGenerated, IPAddress, UserPrincipalName, country
41 | ) on UserPrincipalName
42 | | where IPAddress != IPAddress1 and country != country1
43 | | extend WindowStart = TimeGenerated1 - windowTime
44 | | extend WindowEnd = TimeGenerated1 + windowTime
45 | | where TimeGenerated between (WindowStart .. WindowEnd)
46 | | project Account=UserPrincipalName, AnomalousIP=IPAddress, AnomalousLoginTime=TimeGenerated, AnomalousCountry=country, OtherLoginIP=IPAddress1, OtherLoginCountry=country1, OtherLoginWindowStart=WindowStart, OtherLoginWindowEnd=WindowEnd
47 | | where AnomalousIP !in(excludeKnownVPN) and OtherLoginIP !in(excludeKnownVPN)
48 | | extend timestamp = AnomalousLoginTime, AccountCustomEntity = Account, IPCustomEntity = AnomalousIP


--------------------------------------------------------------------------------
/src/test_data/test2.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | where A == 1
3 | | summarize count() by B


--------------------------------------------------------------------------------
/src/test_data/test3.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | mv-expand Z
3 | | join kind=leftsemi hint.remote=true Bar on T
4 | | join kind=leftsemi (
5 |     Baz
6 |     | where X > 5
7 |     | project R
8 |   ) on R


--------------------------------------------------------------------------------
/src/test_data/test4.kql:
--------------------------------------------------------------------------------
1 | Foo
2 | | lookup (Bar) on T


--------------------------------------------------------------------------------
/src/test_data/test5.kql:
--------------------------------------------------------------------------------
1 | union Foo, Bar, Baz


--------------------------------------------------------------------------------
/src/test_data/test_10.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     {
  3 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CDefender%20for%20Cloud%20Apps%5CDCA-PivotTableAdminActions.kql",
  4 |         "query": "//Create a pivot table of all actions in Defender for Cloud Apps by your privileged users over the last 7 days\n//Lookup the IdentityInfo table for any users holding a privileged role\n\n//Data connector required for this query - M365 Defender - CloudAppEvents\n\n//Microsoft Sentinel query\nlet privusers=\n    IdentityInfo\n    | where TimeGenerated > ago(21d)\n    | summarize arg_max(TimeGenerated, *) by AccountUPN\n    //Add any roles that you are interested in auditing\n    | where AssignedRoles has_any (\"Global Administrator\", \"Security Administrator\", \"SharePoint Administrator\")\n    | distinct AccountUPN;\nCloudAppEvents\n| where TimeGenerated > ago(7d)\n| extend Operation = tostring(RawEventData.Operation)\n| extend UserId = tostring(RawEventData.UserId)\n| extend Workload = tostring(RawEventData.Workload)\n//Create a new column that adds workload and operation together to make the events more readable\n| extend Activity = strcat(Workload, \" - \", Operation)\n| where UserId in~ (privusers)\n//Create pivot table of all actions by each user\n| evaluate pivot(Activity, count(), UserId)\n\n//Advanced hunting query\n\n//Data connector required for this query - Advanced Hunting license\n\nCloudAppEvents\n| where Timestamp > ago(7d)\n| extend Operation = tostring(RawEventData.Operation)\n| extend UserId = tostring(RawEventData.UserId)\n| extend Workload = tostring(RawEventData.Workload)\n//Advanced hunting doesn't retain role information about users, but you can add a list of users in manually to create a table\n| where UserId in~ (\"admin1@domain.com\", \"admin2@domain.com\")\n//Create a new column that adds workload and operation together to make the events more readable\n| extend Activity = strcat(Workload, \" - \", Operation)\n//Create pivot table of all actions by each user\n| evaluate pivot(Activity, count(), UserId)",
  5 |         "source_type": "text",
  6 |         "source_index": 0,
  7 |         "query_name": "DCA-PivotTableAdminActions",
  8 |         "context": null,
  9 |         "attributes": {},
 10 |         "kql_properties": {},
 11 |         "query_id": "e0ebd9f6-aab9-4928-b34f-5c8d089b715f",
 12 |         "query_hash": "d8eb17f554e939949114f13fb911adb86178861b2a775f952afd9e72b1b6a35b",
 13 |         "query_version": 0
 14 |     },
 15 |     {
 16 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Activity%5CAzure-ResourceLockAddedorRemoved.kql",
 17 |         "query": "//Detect when a resource lock is added or removed from an Azure resource\n\n//Data connector required for this query - Azure Activity \n\nAzureActivity\n| where OperationNameValue in (\"MICROSOFT.AUTHORIZATION/LOCKS/WRITE\", \"MICROSOFT.AUTHORIZATION/LOCKS/DELETE\")\n| where ActivityStatusValue == \"Success\"\n| extend Activity = case(OperationNameValue == \"MICROSOFT.AUTHORIZATION/LOCKS/WRITE\", strcat(\"Resource Lock Added\"),\n    OperationNameValue == \"MICROSOFT.AUTHORIZATION/LOCKS/DELETE\", strcat(\"Resource Lock Removed\"),\n    \"unknown\")\n| extend ResourceGroup = tostring(parse_json(Properties).resourceGroup)\n| extend AzureResource = tostring(parse_json(Properties).resourceProviderValue)\n| extend x = tostring(parse_json(Properties).resource)\n| parse x with ResourceName \"/\" *\n| parse x with * \"microsoft.authorization/\" LockName\n| project\n    TimeGenerated,\n    Activity,\n    ResourceName,\n    ['Azure Resource']=AzureResource,\n    ['Azure Subscription Id']=SubscriptionId,\n    ['Azure Resource Group']=ResourceGroup,\n    LockName",
 18 |         "source_type": "text",
 19 |         "source_index": 0,
 20 |         "query_name": "Azure-ResourceLockAddedorRemoved",
 21 |         "context": null,
 22 |         "attributes": {},
 23 |         "kql_properties": {},
 24 |         "query_id": "b0fba90e-6fa8-4124-bcc1-f076321d5eb3",
 25 |         "query_hash": "88b82e644765270392b56b57febf143a0ca39d784457ade747f97bbe7454d66f",
 26 |         "query_version": 0
 27 |     },
 28 |     {
 29 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CREADME.md",
 30 |         "query": "let ExampleText = datatable(TestData:string)\n[\n'Name=Reprise99,UPNSuffix=testdomain.com,AadTenantId=345c1234-a833-43e4-1d34-123440a5bcdd1,AadUserId=cf6f2df6-b754-48dc-b7bc-c8339caf211,DisplayName=Test User,Type=account',\n'Name=Reprise103,UPNSuffix=testdomain.com,AadTenantId=331c1234-a841-43e5-1d31-12220a5bcee1,AadUserId=cf6f2df6-b754-48dc-b7bc-c8339caf211,DisplayName=Test User 2,Type=account'\n]\n;\nExampleText\n| extend Name = split(TestData,',')[0]\n| extend DomainSuffix = split(TestData,',')[1]\n| extend AzureADTenantId = split(TestData,',')[2]\n| extend AzureADUserId = split(TestData,',')[3]\n| extend DisplayName = split(TestData,',')[4]\n| extend AccountType = split(TestData,',')[5]\n| project Name, DomainSuffix, AzureADTenantId, AzureADUserId, DisplayName, AccountType\n| where Name contains \"Reprise99\"",
 31 |         "source_type": "markdown",
 32 |         "source_index": 59,
 33 |         "query_name": "README_59",
 34 |         "context": "\nIf we know our data location within the string then we can split it directly into named columns.\n\n![Split 2](https://github.com/reprise99/Sentinel-Queries/blob/main/Diagrams/split2.png?raw=true)\n\nOnce we have split our data, we can query it as though it was structured from the outset. So if we add a second record to our data, then query on specifc matches we will find what we are after.\n",
 35 |         "attributes": {},
 36 |         "kql_properties": {},
 37 |         "query_id": "d7bdc719-db17-4534-98d7-46a6dbe78ee8",
 38 |         "query_hash": "e04587d898f723719a9d361f3b5d43eeeed92acadce7d1f7e37a6b309091907c",
 39 |         "query_version": 0
 40 |     },
 41 |     {
 42 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5COffice%20365%5COfficeActivity-SummarizeTeamsCreatedDeleted.kql",
 43 |         "query": "//Create a weekly summary of Teams created and deleted in your Office 365 tenant\n\n//Data connector required for this query - Office 365\n\nOfficeActivity\n| where TimeGenerated > ago(30d)\n| where Operation in (\"TeamCreated\", \"TeamDeleted\")\n| summarize\n    ['Count of Teams Created']=dcountif(TeamName, Operation == \"TeamCreated\"),\n    ['List of Teams Created']=make_set_if(TeamName, Operation == \"TeamCreated\"),\n    ['Count of Teams Deleted']=dcountif(TeamName, Operation == \"TeamDeleted\"),\n    ['List of Teams Deleted']=make_set_if(TeamName, Operation == \"TeamDeleted\")\n    by Week=startofweek(TimeGenerated)\n| sort by Week desc ",
 44 |         "source_type": "text",
 45 |         "source_index": 0,
 46 |         "query_name": "OfficeActivity-SummarizeTeamsCreatedDeleted",
 47 |         "context": null,
 48 |         "attributes": {},
 49 |         "kql_properties": {},
 50 |         "query_id": "fe16fc73-a049-461e-9500-1d7cb9007290",
 51 |         "query_hash": "2a88adceb83744b56d1974dae10c42098f44da070f891d5cf67ec8b9a9a9630d",
 52 |         "query_version": 0
 53 |     },
 54 |     {
 55 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Active%20Directory%5CIdentity-SummarizeGuestDomainbyType.kql",
 56 |         "query": "//Summarize guest activity by external Azure AD guests (those that belong to another Azure AD tenant) vs External Guests (such as Gmail) to your tenant\n//For each domain list the total number of signins and distinct user \n\n//Data connector required for this query - Azure Active Directory - Signin Logs\n\nSigninLogs\n| where TimeGenerated > ago (30d)\n| where UserType == \"Guest\"\n| where ResultType == 0\n| extend ['Guest Domain'] = tostring(split(UserPrincipalName, \"@\")[1])\n| summarize\n    ['External Azure AD Guest Logins']=countif(ResourceTenantId != HomeTenantId),\n    ['External Azure AD Guest Distinct Users']=dcountif(UserPrincipalName, ResourceTenantId != HomeTenantId),\n    ['External Guest Logins']=countif(ResourceTenantId == HomeTenantId),\n    ['External Guest Distinct Users']=dcountif(UserPrincipalName, ResourceTenantId == HomeTenantId)\n    by ['Guest Domain']\n",
 57 |         "source_type": "text",
 58 |         "source_index": 0,
 59 |         "query_name": "Identity-SummarizeGuestDomainbyType",
 60 |         "context": null,
 61 |         "attributes": {},
 62 |         "kql_properties": {},
 63 |         "query_id": "11c1b9ca-6c1c-4bdc-8eaa-a3facc3d3ed6",
 64 |         "query_hash": "4a44ca5e719fb084262265a3c617ec7d8f023a19f669ee61b79aa919b28e5fec",
 65 |         "query_version": 0
 66 |     },
 67 |     {
 68 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20AD%20Abuse%20Detection%5CREADME.md",
 69 |         "query": "AuditLogs\n| where OperationName == \"Add owner to service principal\"\n| extend ['Actor IP Address'] = tostring(parse_json(tostring(InitiatedBy.user)).ipAddress)\n| extend Actor = tostring(parse_json(tostring(InitiatedBy.user)).userPrincipalName)\n| extend ['Service Principal Name'] = tostring(parse_json(tostring(parse_json(tostring(TargetResources[0].modifiedProperties))[1].newValue)))\n| extend ['Service Principal ObjectId'] = tostring(TargetResources[1].id)\n| extend Target = tostring(TargetResources[0].userPrincipalName)\n| where TargetResources[0].type == \"User\"\n| where isnotempty(Actor)\n| project TimeGenerated, OperationName, Actor, ['Actor IP Address'], Target, ['Service Principal Name'], ['Service Principal ObjectId']",
 70 |         "source_type": "markdown",
 71 |         "source_index": 16,
 72 |         "query_name": "### Detection Query (User as actor, user as target)",
 73 |         "context": "\n\n## BARK function - Test-MGAddSelfAsOwnerOfSP \n\nOwners of service principals can change settings on that object, for instance they can add or remove users who have access to sign into that service principal. They can change SSO settings and change permissions on the service principal.\n\nFor this abuse, the actor can be either a user or a service principal. The target can also be either a user or a service principal.\n\n### Detection Query (User as actor, user as target)\n",
 74 |         "attributes": {},
 75 |         "kql_properties": {},
 76 |         "query_id": "47bc03e9-d2ce-427a-a799-d0edcec62cb1",
 77 |         "query_hash": "d7cc7b54dfe0e752885b2d833672c587df342564ea955e62e785d5edbc66b869",
 78 |         "query_version": 0
 79 |     },
 80 |     {
 81 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Active%20Directory%5CIdentity-SummarizeConditionalAccessPoliciesfailures.kql",
 82 |         "query": "//Create a summary showing which of your Azure AD conditional access policies are preventing the most signins and for what reasons\n\n//Data connector required for this query - Azure Active Directory - Signin Logs\n\nSigninLogs\n| where TimeGenerated > ago (7d)\n| project TimeGenerated, ConditionalAccessPolicies, ResultType, ResultDescription\n| mv-expand ConditionalAccessPolicies\n| extend CAResult = tostring(ConditionalAccessPolicies.result)\n| extend ['Conditional Access Policy Name'] = tostring(ConditionalAccessPolicies.displayName)\n| where CAResult == \"failure\"\n| summarize ['Count of Failures']=count()by ['Conditional Access Policy Name'], ResultType, ResultDescription\n| sort by ['Count of Failures'] desc ",
 83 |         "source_type": "text",
 84 |         "source_index": 0,
 85 |         "query_name": "Identity-SummarizeConditionalAccessPoliciesfailures",
 86 |         "context": null,
 87 |         "attributes": {},
 88 |         "kql_properties": {},
 89 |         "query_id": "7c8e52c0-def4-4751-a8e9-671eebc20296",
 90 |         "query_hash": "1007d7955776d29deb1cfd7ff8ad3ea5ea5e021dd2863238da224c175b376ebd",
 91 |         "query_version": 0
 92 |     },
 93 |     {
 94 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CDefender%20for%20Endpoint%5CDevice-DetectCertUtilConnectingExternally.kql",
 95 |         "query": "//Detects when certutil is used to connect to a public IP. This could indicate abuse of cert util, see - https://www.avira.com/en/blog/certutil-abused-by-attackers-to-spread-threats\n\n//Data connector required for this query - M365 Defender - Device* tables\n\n//Microsoft Sentinel query\nDeviceNetworkEvents\n| where TimeGenerated > ago (7d)\n| project\n    TimeGenerated,\n    DeviceName,\n    InitiatingProcessAccountName,\n    InitiatingProcessCommandLine,\n    LocalIPType,\n    LocalIP,\n    RemoteIPType,\n    RemoteIP,\n    RemoteUrl,\n    RemotePort\n| where InitiatingProcessCommandLine contains \"certutil\"\n| where RemoteIPType == \"Public\"\n\n//Advanced Hunting query\n\n//Data connector required for this query - Advanced Hunting license\n\nDeviceNetworkEvents\n| where TimeGenerated > ago (7d)\n| project\n    TimeGenerated,\n    DeviceName,\n    InitiatingProcessAccountName,\n    InitiatingProcessCommandLine,\n    LocalIPType,\n    LocalIP,\n    RemoteIPType,\n    RemoteIP,\n    RemoteUrl,\n    RemotePort\n| where InitiatingProcessCommandLine contains \"certutil\"\n| where RemoteIPType == \"Public\"",
 96 |         "source_type": "text",
 97 |         "source_index": 0,
 98 |         "query_name": "Device-DetectCertUtilConnectingExternally",
 99 |         "context": null,
100 |         "attributes": {},
101 |         "kql_properties": {},
102 |         "query_id": "4eb1a989-83d2-44a2-9f6c-f4dfb1f31ee6",
103 |         "query_hash": "c76da23b26d172981b5d324232edae919c14585c8131640566ad5fa7cf6bcbfa",
104 |         "query_version": 0
105 |     },
106 |     {
107 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CAzure%20Active%20Directory%5CIdentity-VisualizeExternalAADGuestsvsExternalGuests.kql",
108 |         "query": "//Visualize signins from External Azure AD guests (those that belong to another Azure AD tenant) vs External Guests (such as Gmail) to your tenant\n\n//Data connector required for this query - Azure Active Directory - Signin Logs\n\nSigninLogs\n| where TimeGenerated > ago (45d)\n| where UserType == \"Guest\"\n| summarize\n    ['External Guests']=countif(ResourceTenantId == HomeTenantId),\n    ['External Azure AD Guests']=countif(ResourceTenantId != HomeTenantId)\n    by bin(TimeGenerated, 1d)\n| render timechart with (title=\"External Azure AD Guests vs External Guests\", ytitle=\"Count\")\n",
109 |         "source_type": "text",
110 |         "source_index": 0,
111 |         "query_name": "Identity-VisualizeExternalAADGuestsvsExternalGuests",
112 |         "context": null,
113 |         "attributes": {},
114 |         "kql_properties": {},
115 |         "query_id": "da36cf45-4fba-484e-ac9a-a98088b0836a",
116 |         "query_hash": "bd81f7d826576984985f2ab36ac58f2a6e2859e6cf0e358dbd22b6472bf8d86c",
117 |         "query_version": 0
118 |     },
119 |     {
120 |         "source_path": "https://github.com/reprise99/Sentinel-Queries/tree/main%5CDefender%20for%20Endpoint%5CDevice-SummarizeRDPConnections.kql",
121 |         "query": "//Summarize your devices by their RDP activity. The data is sorted to show total outbound RDP connections, a count of distinct RDP connections and the list of IP's connected to.\n\n//Data connector required for this query - M365 Defender - Device* tables\n\n//Data is sorted by the devices with the most unique outbound RDP connections. Those devices have the biggest lateral movement blast radius.\n//Microsoft Sentinel query\nDeviceNetworkEvents\n| where TimeGenerated > ago(30d)\n| where ActionType == \"ConnectionSuccess\"\n| where RemotePort == \"3389\"\n//Exclude Defender for Identity that uses an initial RDP connection to map your network\n| where InitiatingProcessCommandLine <> \"\\\"Microsoft.Tri.Sensor.exe\\\"\"\n| summarize\n    ['RDP Outbound Connection Count']=count(),\n    ['RDP Distinct Outbound Endpoint Count']=dcount(RemoteIP),\n    ['RDP Outbound Endpoints']=make_set(RemoteIP)\n    by DeviceName\n| sort by ['RDP Distinct Outbound Endpoint Count'] desc \n\n//Advanced Hunting query\n\n//Data connector required for this query - Advanced Hunting license\n\nDeviceNetworkEvents\n| where Timestamp > ago(30d)\n| where ActionType == \"ConnectionSuccess\"\n| where RemotePort == \"3389\"\n//Exclude Defender for Identity that uses an initial RDP connection to map your network\n| where InitiatingProcessCommandLine <> \"\\\"Microsoft.Tri.Sensor.exe\\\"\"\n| summarize\n    ['RDP Outbound Connection Count']=count(),\n    ['RDP Distinct Outbound Endpoint Count']=dcount(RemoteIP),\n    ['RDP Outbound Endpoints']=make_set(RemoteIP)\n    by DeviceName\n| sort by ['RDP Distinct Outbound Endpoint Count'] desc ",
122 |         "source_type": "text",
123 |         "source_index": 0,
124 |         "query_name": "Device-SummarizeRDPConnections",
125 |         "context": null,
126 |         "attributes": {},
127 |         "kql_properties": {},
128 |         "query_id": "2fa11654-ce7b-4452-9646-700afce24375",
129 |         "query_hash": "8c2fca65cd7884333babfb8166724b4c2c9ecc15d1b16a4e38d4ce285e56fd99",
130 |         "query_version": 0
131 |     }
132 | ]


--------------------------------------------------------------------------------
/src/test_data_store.py:
--------------------------------------------------------------------------------
  1 | # -------------------------------------------------------------------------
  2 | # Copyright (c) Microsoft Corporation. All rights reserved.
  3 | # Licensed under the MIT License. See License.txt in the project root for
  4 | # license information.
  5 | # --------------------------------------------------------------------------
  6 | """Module docstring."""
  7 | import json
  8 | import random
  9 | import uuid
 10 | from pathlib import Path
 11 | 
 12 | import pytest
 13 | 
 14 | from .data_store import DataStore
 15 | from .kql_query import KqlQuery
 16 | 
 17 | __author__ = "Ian Hellen"
 18 | 
 19 | # pylint: disable=redefined-outer-name
 20 | 
 21 | json_query_data = """
 22 | {
 23 |     "query_id": "1234291720927310",
 24 |     "source_path": "/github.com/foo",
 25 |     "source_type": "text",
 26 |     "source_index": 0,
 27 |     "name": "query_1",
 28 |     "query": "SecurityAlert\\n| Where foo == bar",
 29 |     "context": "text from markdown",
 30 |     "attributes": {
 31 |         "description": "Query one description",
 32 |         "tactics": ["Exploitation", "Compromise"],
 33 |         "techniques": ["T.1055", "T.1345"]
 34 |     }
 35 | }
 36 | """
 37 | 
 38 | json_kql_parse = """
 39 | {
 40 |     "FunctionCalls":["count","tostring","make_list","toreal"],
 41 |     "Joins":["rightsemi","leftouter"],
 42 |     "Operators":["where","extend","summarize","mv-expand","project-away","project"],
 43 |     "Tables":["SigninLogs"]
 44 | }
 45 | """
 46 | 
 47 | table_names = [
 48 |     "AADB2CRequestLogs",
 49 |     "AADDomainServicesAccountLogon",
 50 |     "AADDomainServicesAccountManagement",
 51 |     "AADDomainServicesDirectoryServiceAccess",
 52 |     "AADDomainServicesLogonLogoff",
 53 |     "AADDomainServicesPolicyChange",
 54 |     "AADDomainServicesPrivilegeUse",
 55 |     "AADManagedIdentitySignInLogs",
 56 |     "AADNonInteractiveUserSignInLogs",
 57 |     "AADProvisioningLogs",
 58 |     "AADRiskyServicePrincipals",
 59 |     "AADRiskyUsers",
 60 |     "AADServicePrincipalRiskEvents",
 61 |     "AADServicePrincipalSignInLogs",
 62 |     "AADUserRiskEvents",
 63 |     "ADFSSignInLogs",
 64 |     "AlertEvidence",
 65 |     "Anomalies",
 66 |     "AppServiceIPSecAuditLogs",
 67 |     "AppServiceServerlessSecurityPluginData",
 68 |     "ASimDnsActivityLogs",
 69 |     "AuditLogs",
 70 |     "AWSCloudTrail",
 71 |     "AWSGuardDuty",
 72 |     "AWSVPCFlow",
 73 |     "AZFWApplicationRule",
 74 |     "AZFWApplicationRuleAggregation",
 75 |     "AZFWDnsQuery",
 76 |     "AZFWIdpsSignature",
 77 |     "AZFWInternalFqdnResolutionFailure",
 78 |     "AZFWNatRule",
 79 |     "AZFWNatRuleAggregation",
 80 |     "AZFWNetworkRule",
 81 |     "AZFWNetworkRuleAggregation",
 82 |     "AZFWThreatIntel",
 83 |     "AzureActivity",
 84 |     "AzureDiagnostics",
 85 |     "BehaviorAnalytics",
 86 |     "CloudAppEvents",
 87 |     "CommonSecurityLog",
 88 |     "ConfidentialWatchlist",
 89 |     "DeviceEvents",
 90 |     "DeviceFileCertificateInfo",
 91 |     "DeviceFileEvents",
 92 |     "DeviceImageLoadEvents",
 93 |     "DeviceInfo",
 94 |     "DeviceLogonEvents",
 95 |     "DeviceNetworkEvents",
 96 |     "DeviceNetworkInfo",
 97 |     "DeviceProcessEvents",
 98 |     "DeviceRegistryEvents",
 99 |     "DeviceTvmSecureConfigurationAssessment",
100 |     "DeviceTvmSoftwareInventory",
101 |     "DeviceTvmSoftwareVulnerabilities",
102 |     "DSMAzureBlobStorageLogs",
103 |     "DSMDataClassificationLogs",
104 |     "DSMDataLabelingLogs",
105 |     "DynamicEventCollection",
106 |     "EmailAttachmentInfo",
107 |     "EmailEvents",
108 |     "EmailPostDeliveryEvents",
109 |     "EmailUrlInfo",
110 |     "GCPAuditLogs",
111 |     "HDInsightSecurityLogs",
112 |     "HuntingBookmark",
113 |     "IdentityDirectoryEvents",
114 |     "IdentityLogonEvents",
115 |     "IdentityQueryEvents",
116 |     "LinuxAuditLog",
117 |     "McasShadowItReporting",
118 |     "NetworkAccessTraffic",
119 |     "NetworkSessions",
120 |     "NSPAccessLogs",
121 |     "OfficeActivity",
122 |     "PowerBIActivity",
123 |     "ProjectActivity",
124 |     "ProtectionStatus",
125 |     "PurviewDataSensitivityLogs",
126 |     "SecurityAlert",
127 |     "SecurityBaseline",
128 |     "SecurityBaselineSummary",
129 |     "SecurityDetection",
130 |     "SecurityEvent",
131 |     "SecurityIoTRawEvent",
132 |     "SecurityRecommendation",
133 |     "SentinelAudit",
134 |     "SentinelHealth",
135 |     "SigninLogs",
136 |     "Syslog",
137 |     "ThreatIntelligenceIndicator",
138 |     "Update",
139 |     "UrlClickEvents",
140 |     "UserAccessAnalytics",
141 |     "UserPeerAnalytics",
142 |     "Watchlist",
143 |     "WindowsEvent",
144 |     "WindowsFirewall",
145 |     "WireData",
146 | ]
147 | 
148 | field_names = [
149 |     "SourceType",
150 |     "DomainBehaviorVersion",
151 |     "OperationName",
152 |     "BookmarkName",
153 |     "SentinelResourceId",
154 |     "OSName",
155 |     "ActualResult",
156 |     "CreatedBy",
157 |     "CreatedDateTime",
158 |     "LatencySamplingTimeStamp",
159 |     "Environment",
160 |     "CorrelationId",
161 |     "MachineGroup",
162 |     "SumResponseBodySize",
163 |     "RecordId",
164 |     "DstUserUpn",
165 |     "ResourceId",
166 |     "InitiatingProcessSHA1",
167 |     "ObjectId",
168 |     "AssetType",
169 |     "Title",
170 |     "InitiatingProcessAccountDomain",
171 |     "AuthorizationInfo",
172 |     "TargetContextId",
173 |     "LogonId",
174 |     "CveTags",
175 |     "SourceComputerId",
176 |     "ResourceIdentity",
177 |     "ClusterName",
178 |     "TdoAttributes",
179 |     "EntityMapping",
180 |     "DnssecOkBit",
181 |     "DeviceCustomString5",
182 |     "TransmittedServices",
183 |     "DeviceCustomDate2Label",
184 | ]
185 | 
186 | 
187 | def get_random_items(data=table_names, count=3):
188 |     return list({random.choice(data) for _ in range(count)})
189 | 
190 | 
191 | def get_random_query(index=0):
192 |     tactic_idx = index % 7
193 |     return {
194 |         "query_id": str(uuid.uuid4()),
195 |         "source_path": f"/github.com/foo/{index}",
196 |         "source_type": "text",
197 |         "source_index": random.randint(0, 7),
198 |         "query_name": f"query_{index}",
199 |         "query": "SecurityAlert\\n| Where foo == bar",
200 |         # "context": "text from markdown",
201 |         "attributes": {
202 |             "description": "Query one description",
203 |             "tactics": get_random_items(
204 |                 data=["Exploitation", "Compromise", "LateralMovement"], count=2
205 |             ),
206 |             "techniques": [f"T10{tactic_idx:0>2d}", f"T1{tactic_idx:0>2d}5"],
207 |             "test_dict": {
208 |                 "joins": {"inner": ["one", "two"], "outer": ["three", "four"]}
209 |             },
210 |         },
211 |     }
212 | 
213 | 
214 | @pytest.fixture
215 | def get_raw_queries():
216 | 
217 |     return [get_random_query(i) for i in range(5)]
218 | 
219 | 
220 | @pytest.fixture
221 | def get_kqlquery_list():
222 |     return [KqlQuery(**get_random_query(i)) for i in range(5)]
223 | 
224 | 
225 | def test_datastore_init(get_kqlquery_list, get_raw_queries):
226 | 
227 |     ds = DataStore(get_kqlquery_list)
228 |     all_items_len = len(get_kqlquery_list)
229 |     assert len(ds._data) == all_items_len
230 |     assert len(ds._data) == all_items_len
231 |     assert len(ds._indexes) == 2
232 | 
233 |     ds = DataStore(get_raw_queries)
234 |     all_items_len = len(get_kqlquery_list)
235 |     assert len(ds._data) == all_items_len
236 |     assert len(ds._data) == all_items_len
237 |     assert len(ds._indexes) == 2
238 | 
239 |     json_text = ds.to_json()
240 |     output_dict = json.loads(json_text)
241 |     assert len(output_dict) == len(get_raw_queries)
242 | 
243 |     out_df = ds.to_df()
244 |     assert len(out_df) == all_items_len
245 | 
246 | 
247 | def test_datastore_find(get_kqlquery_list):
248 | 
249 |     ds = DataStore(get_kqlquery_list)
250 |     all_items_len = len(get_kqlquery_list)
251 |     assert len(ds.find_queries(query_name="query_0")) == 1
252 |     assert all_items_len > len(ds.find_queries(tactics=["Compromise"]))
253 |     assert len(ds.find_queries(tactics=["BadTactic"])) == 0
254 |     assert len(ds.find_queries(query_name={"matches": "query.*"})) == all_items_len
255 | 


--------------------------------------------------------------------------------
/src/test_kql_download.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------
 2 | # Copyright (c) Microsoft Corporation. All rights reserved.
 3 | # Licensed under the MIT License. See License.txt in the project root for
 4 | # license information.
 5 | # --------------------------------------------------------------------------
 6 | """Test Query downloader."""
 7 | import random
 8 | from pathlib import Path
 9 | 
10 | from .data_store import DataStore
11 | from .kql_download import get_community_queries, get_sentinel_queries
12 | 
13 | __author__ = "Ian Hellen"
14 | 
15 | # pylint: disable=protected-access
16 | 
17 | 
18 | def test_get_sentinel_queries(tmp_path):
19 |     """Test downloading sentinel queries."""
20 |     queries = get_sentinel_queries(tmp_path)
21 |     ds = DataStore(queries)
22 |     assert ds is not None
23 |     assert len(ds.queries) > 2000
24 |     assert len(ds._indexes["tactics"]) > 1000
25 |     assert len(ds._indexes["techniques"]) > 1000
26 | 
27 |     indexes = [random.randint(0, len(ds.queries)) for _ in range(10)]
28 |     for attrib in ["source_path", "query", "query_id", "attributes"]:
29 |         for idx in indexes:
30 |             assert hasattr(ds.queries[idx], attrib)
31 | 
32 | 
33 | def test_get_community_queries(tmp_path):
34 |     """Test downloading sentinel queries."""
35 |     conf_path = Path(__file__).parent.joinpath("repos.yaml")
36 |     queries = get_community_queries(tmp_path, config=conf_path)
37 |     ds = DataStore(queries)
38 |     assert ds is not None
39 |     assert len(ds.queries) > 100
40 | 
41 |     indexes = [random.randint(0, len(ds.queries)) for _ in range(10)]
42 |     for attrib in ["source_path", "query", "query_id"]:
43 |         for idx in indexes:
44 |             assert hasattr(ds.queries[idx], attrib)
45 | 


--------------------------------------------------------------------------------
/src/test_kql_extract.py:
--------------------------------------------------------------------------------
 1 | # -------------------------------------------------------------------------
 2 | # Copyright (c) Microsoft Corporation. All rights reserved.
 3 | # Licensed under the MIT License. See License.txt in the project root for
 4 | # license information.
 5 | # --------------------------------------------------------------------------
 6 | """Test kql extraction integration."""
 7 | 
 8 | from datetime import datetime, timezone
 9 | from pathlib import Path
10 | 
11 | import pytest
12 | 
13 | from . import kql_extract as extract
14 | from .data_store import DataStore
15 | from .kql_query import KqlQuery
16 | from .test_data_store import get_random_query
17 | 
18 | __author__ = "Ian Hellen"
19 | 
20 | # pylint: disable=redefined-outer-name, protected-access
21 | 
22 | 
23 | _TEST_KQL = Path(__file__).parent.joinpath("test_data")
24 | 
25 | 
26 | @pytest.fixture
27 | def get_queries_with_kql():
28 |     queries = []
29 |     for file in Path(_TEST_KQL).glob("*.kql"):
30 | 
31 |         query_text = file.read_text(encoding="utf-8")
32 |         for query in [KqlQuery(**get_random_query(i)) for i in range(2)]:
33 |             query.query = query_text
34 |             queries.append(query)
35 |     return queries
36 | 
37 | 
38 | def test_extract_from_ds_query(get_queries_with_kql):
39 |     """Function_docstring."""
40 | 
41 |     queries = get_queries_with_kql
42 |     assert len(queries) > 0
43 |     ds = DataStore(queries)
44 |     assert len(ds.queries) == len(get_queries_with_kql)
45 | 
46 |     try:
47 |         extract.start()
48 |         start = datetime.now(timezone.utc)
49 |         print(start)
50 |         for query in ds.queries:
51 |             result = extract.extract_kql(query.query, query_id=query.query_id)
52 |             print(result)
53 |             ds.add_kql_properties(query_id=query.query_id, kql_properties=result)
54 |         end = datetime.now(timezone.utc)
55 |         print(end, "total time", end - start)
56 |     finally:
57 |         extract.stop()
58 |     print([len(query.kql_properties) for query in ds.queries])
59 |     assert all(len(query.kql_properties) for query in ds.queries)
60 |     assert len(ds._indexes) >= 6
61 |     assert all(item in ds._indexes for item in ["tactics", "tables", "operators"])
62 |     assert len(ds._indexes["tables"]) >= len(ds.queries)
63 |     assert len(ds._indexes["operators"]) >= len(ds.queries)
64 | 


--------------------------------------------------------------------------------
/src/test_kql_query.py:
--------------------------------------------------------------------------------
 1 | from .kql_query import KqlQuery
 2 | 
 3 | 
 4 | def test_kql_query():
 5 |     kql = KqlQuery(
 6 |         source_path="https://github.com/a/b/file.kql", query="SecurityAlert | take 1"
 7 |     )
 8 |     print(kql)
 9 |     print(kql.asdict())
10 |     print(kql.to_json())
11 | 
12 |     KqlQuery.kql_list_to_pylist([kql, kql])
13 | 
14 |     KqlQuery.kql_list_to_json([kql, kql])
15 | 
16 |     KqlQuery.kql_list_to_df([kql, kql])
17 | 


--------------------------------------------------------------------------------
/test_runs/kql_query_db-2022-09-23-22-30-16.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/test_runs/kql_query_db-2022-09-23-22-30-16.pkl


--------------------------------------------------------------------------------
/test_runs/kql_query_db-2022-09-24-02-51-50.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/test_runs/kql_query_db-2022-09-24-02-51-50.pkl


--------------------------------------------------------------------------------
/test_runs/kql_query_df--022-09-23_00_44_55.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/kql-query-store/b2b2e82fe3a80a2baf8355b90d3a88db98b1a472/test_runs/kql_query_df--022-09-23_00_44_55.pkl


--------------------------------------------------------------------------------