├── .gitattributes ├── .github └── workflows │ ├── CheckNotebookContents.yml │ ├── delays.yml │ ├── impact.yml │ ├── ja.yml │ ├── run.yml │ └── sql.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode └── settings.json ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── delays.ipynb ├── impact.ipynb ├── ja.ipynb ├── params.json ├── queries ├── ActiveIncidents.csl ├── HealthAgentActions.csl ├── LocationName.csl ├── MDMAccount.csl ├── SlowActivities.csl ├── SlowSql.csl ├── WhatChanged.csl ├── delays │ ├── Abusers.csl │ ├── AffectedAccounts.csl │ ├── DelayedAccountsAreAbusers.csl │ ├── Load.csl │ ├── LoadPerHost.csl │ ├── OrchestrationLogSpike.csl │ ├── OrchestrationLogSpikeTip.csl │ ├── Parallelism.csl │ └── WhatDelayed.csl ├── impact │ ├── CommandsAT.csl │ ├── CommandsDb.csl │ ├── CommandsReason.csl │ ├── Dependencies.csl │ └── Gen2GCSpikes.csl ├── ja │ └── JASqlTime.csl ├── run │ ├── PlanInfo.csl │ └── WhatHappened.csl ├── sla │ ├── SLADurationAnalysis.csl │ └── SLAVisualization.csl └── sql │ ├── CpuActivity.csl │ ├── CpuJob.csl │ ├── CpuTop.csl │ ├── CpuXEvent.csl │ ├── GetData.csl │ └── WhatsSlow.csl ├── run.ipynb ├── sla.ipynb └── sql.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto -------------------------------------------------------------------------------- /.github/workflows/CheckNotebookContents.yml: -------------------------------------------------------------------------------- 1 | name: Check notebook contents 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@master 12 | - name: Ensure dependencies 13 | run: | 14 | # install nbstripout and register it as a global git filter 15 | sudo pip install --upgrade nbstripout 16 | which python 17 | pip show nbstripout 18 | python -m site 19 | #sudo python /home/runner/.local/lib/python2.7/site-packages/nbstripout/_nbstripout.py --install --global 20 | nbstripout --install --global 21 | - name: Check contents 22 | run: | 23 | # Strip each notebook's outputs and diff the result against the committed file 24 | hasError=false 25 | for f in ${{runner.workspace}}/devops-pipelines/*.ipynb 26 | do 27 | echo "Processing $f..." 28 | newFileName="${f}_stripped" 29 | cat $f | nbstripout > $newFileName 30 | diff --strip-trailing-cr $f $newFileName > /dev/null 2>&1 31 | error=$? 32 | if [ $error -eq 0 ] 33 | then 34 | echo "$f seems okay..." 35 | elif [ $error -eq 1 ] 36 | then 37 | echo "$f contains outputs..." 38 | hasError=true 39 | else 40 | echo "Sorry, there was something wrong with the diff command..." 41 | fi 42 | done 43 | 44 | if [ "$hasError" = "true" ]; then 45 | echo "Notebooks shouldn't contain any outputs; please install the pre-commit hook (see the README)."
46 | exit 1 47 | fi 48 | -------------------------------------------------------------------------------- /.github/workflows/delays.yml: -------------------------------------------------------------------------------- 1 | name: Delays analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/delays/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy==1.17.0 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "delays.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/impact.yml: -------------------------------------------------------------------------------- 1 | name: Impact analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/impact/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy==1.17.0 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "impact.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/ja.yml: -------------------------------------------------------------------------------- 1 | name: Jobagent analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/ja/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "ja.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/run.yml: -------------------------------------------------------------------------------- 1 | name: Run analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/run/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy 18 | - 
uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "run.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/sql.yml: -------------------------------------------------------------------------------- 1 | name: Sql analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/sql/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "sql.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Kqlmagic_temp_files 2 | .ipynb_checkpoints/* 3 | *.pyc 4 | *.sh 5 | .venv 6 | .DS_Store -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/kynan/nbstripout 3 | rev: master 4 | hooks: 5 | - id: nbstripout 6 | files: ".ipynb" -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "workbench.colorCustomizations": {} 3 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Devops-pipelines 2 | A warehouse of notebooks for producing root-cause analyses of Azure DevOps pipeline delays. 3 | 4 | Uses both [Azure Data Explorer](https://docs.microsoft.com/en-us/azure/data-explorer/) and [Azure Notebooks](https://docs.microsoft.com/en-us/azure/notebooks/). 5 | 6 | # Usage 7 | ## Commands 8 | ``` 9 | # Initialize 10 | !pip install --upgrade pip Kqlmagic nimport azure.kusto.data[pandas] 11 | %load_ext nimport 12 | ``` 13 | 14 | ``` 15 | # Let's clone our repo; the path is not relevant here, this just clones the whole repo 16 | %nimport container="microsoft/devops-pipelines" path="delays.ipynb" provider="github" providerOptions={"clone":"true"} 17 | ``` 18 | 19 | ``` 20 | # If you have a URL from which you want to parse parameters (see the worked example at the end of this README)... 21 | from nimport.utils import open_nb, parse_params 22 | params = parse_params(currentUrl) 23 | display(params) 24 | ``` 25 | 26 | ``` 27 | # Open the notebook by replacing the parameters 28 | open_nb("devops-pipelines/delays.ipynb", params) 29 | ``` 30 | 31 | # Contributing 32 | 33 | ## Requirements 34 | - Commands: 35 | 36 | `pip install pre-commit` 37 | 38 | `pre-commit install` 39 | 40 | - Open PRs! 41 | 42 | ## Notice 43 | 44 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 45 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 46 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 47 | 48 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 49 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 50 | provided by the bot. You will only need to do this once across all repos using our CLA. 51 | 52 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 53 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 54 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
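# Example

Putting the Usage commands together: a minimal sketch, assuming `parse_params` extracts query-string parameters from a notebook URL. The URL below is a placeholder, and the parameter names are the ones `delays.ipynb` declares in its `parameters` cell.

```
# initialize and clone the repo (see Usage above)
!pip install --upgrade pip Kqlmagic nimport azure.kusto.data[pandas]
%load_ext nimport
%nimport container="microsoft/devops-pipelines" path="delays.ipynb" provider="github" providerOptions={"clone":"true"}

from nimport.utils import open_nb, parse_params

# placeholder URL carrying the parameters delays.ipynb expects
currentUrl = "https://notebooksv2.azure.com/you/projects/XXXX/delays.ipynb?service=pipelines&su=pipelines-ghub-eus2-2&start=2019-10-15T20:21:54Z&end=2019-10-15T20:52:21Z"
params = parse_params(currentUrl)
display(params)

# open delays.ipynb with its parameters cell pre-filled
open_nb("devops-pipelines/delays.ipynb", params)
```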
55 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously. This includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /delays.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Orchestration Delays Investigation\n", 7 | "1. Run all cells.\n", 8 | "1. Scroll down to check for any authentication messages.\n", 9 | "1. View report at the bottom."
10 | ], 11 | "metadata": {} 12 | }, 13 | { 14 | "cell_type": "code", 15 | "source": [ 16 | "# These are just defaults; they will be overwritten when the notebook is opened via nimport\n", 17 | "start = \"2019-10-15T20:21:54.0330000Z\"\n", 18 | "end = \"2019-10-15T20:52:21.5370169Z\"\n", 19 | "service = \"pipelines\"\n", 20 | "su = \"pipelines-ghub-eus2-2\"\n", 21 | "hub = \"Actions\"\n", 22 | "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n", 23 | "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"" 24 | ], 25 | "outputs": [], 26 | "execution_count": null, 27 | "metadata": { 28 | "inputHidden": false, 29 | "outputHidden": false, 30 | "tags": [ 31 | "parameters" 32 | ] 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "source": [ 38 | "%%capture\n", 39 | "!pip install --upgrade nimport azure-kusto-notebooks" 40 | ], 41 | "outputs": [], 42 | "execution_count": null, 43 | "metadata": { 44 | "inputHidden": false, 45 | "outputHidden": false, 46 | "tags": [ 47 | "debug" 48 | ] 49 | } 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "# Import the things we use\n", 55 | "\n", 56 | "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n", 57 | "# %kql is single line magic\n", 58 | "# %%kql is cell magic\n", 59 | "\n", 60 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 61 | "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n", 62 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 63 | "\n", 64 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 65 | "import pandas as pd\n", 66 | "pd.options.display.html.table_schema = True\n", 67 | "from pandas import Series, DataFrame\n", 68 | "from datetime import datetime, timedelta, timezone\n", 69 | "from urllib.parse import urlencode, quote_plus\n", 70 | "from requests.utils import requote_uri\n", 71 | "import time\n", 72 | "import numpy as np\n", 73 | "from matplotlib import pyplot as plt\n", 74 | "from nimport.utils import tokenize, open_nb\n", 75 | "import json\n", 76 | "import os\n", 77 | "import calendar as cal\n", 78 | "import concurrent.futures\n", 79 | "from azure.kusto.notebooks import utils as akn" 80 | ], 81 | "outputs": [], 82 | "execution_count": null, 83 | "metadata": { 84 | "inputHidden": false, 85 | "outputHidden": false 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "source": [ 91 | "params = {\n", 92 | " \"su\": su,\n", 93 | " \"start\": start,\n", 94 | " \"end\": end,\n", 95 | " \"url\": url,\n", 96 | " \"baseUrl\": baseUrl,\n", 97 | " \"service\": service,\n", 98 | " \"hub\": hub\n", 99 | "}\n", 100 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 101 | "queryPath = os.path.join(root, 'queries')" 102 | ], 103 | "outputs": [], 104 | "execution_count": null, 105 | "metadata": { 106 | "inputHidden": false, 107 | "outputHidden": false 108 | } 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "# authenticate kusto client\n", 114 | "# you will need to copy the token into a browser window for AAD auth. 
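The sign-in prompt appears in this cell's output, so scroll down and watch for it.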
\n", 115 | "client = akn.get_client('https://vso.kusto.windows.net')" 116 | ], 117 | "outputs": [], 118 | "execution_count": null, 119 | "metadata": { 120 | "inputHidden": false, 121 | "outputHidden": false 122 | } 123 | }, 124 | { 125 | "cell_type": "code", 126 | "source": [ 127 | "# authenticate kusto client\n", 128 | "# you will need to copy the token into a browser window for AAD auth. \n", 129 | "icm_client = akn.get_client('https://icmcluster.kusto.windows.net')" 130 | ], 131 | "outputs": [], 132 | "execution_count": null, 133 | "metadata": { 134 | "inputHidden": false, 135 | "outputHidden": false 136 | } 137 | }, 138 | { 139 | "cell_type": "code", 140 | "source": [ 141 | "q_loc = os.path.join(queryPath, \"LocationName.csl\")\n", 142 | "q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n", 143 | "q_haActions = os.path.join(queryPath, \"HealthAgentActions.csl\")\n", 144 | "q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n", 145 | "\n", 146 | "delaysPath = os.path.join(queryPath, \"delays\")\n", 147 | "q_affectedAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n", 148 | "q_abusers = os.path.join(delaysPath, \"Abusers.csl\")\n", 149 | "q_affAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n", 150 | "q_delayedAccountsAreAbusers = os.path.join(delaysPath, \"DelayedAccountsAreAbusers.csl\")\n", 151 | "q_whatDelayed = os.path.join(delaysPath, \"WhatDelayed.csl\")\n", 152 | "q_load = os.path.join(delaysPath, \"Load.csl\")\n", 153 | "\n", 154 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 155 | " # materialize location name immediately as we need this for other queries\n", 156 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n", 157 | " locationNameResult = akn.to_dataframe_from_future(p1)\n", 158 | " locationName = locationNameResult[\"Tenant\"][0]\n", 159 | " params[\"locationName\"] = locationName\n", 160 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n", 161 | " p4 = executor.submit(akn.execute_file, client, 'VSO', q_haActions, params) \n", 162 | " \n", 163 | " p5 = executor.submit(akn.execute_file, client, 'VSO', q_affectedAccounts, params)\n", 164 | " p6 = executor.submit(akn.execute_file, client, 'VSO', q_abusers, params)\n", 165 | " p7 = executor.submit(akn.execute_file, client, 'VSO', q_affAccounts, params)\n", 166 | " p8 = executor.submit(akn.execute_file, client, 'VSO', q_delayedAccountsAreAbusers, params)\n", 167 | " p9 = executor.submit(akn.execute_file, client, 'VSO', q_whatDelayed, params)\n", 168 | " p10 = executor.submit(akn.execute_file, client, 'VSO', q_load, params)\n", 169 | " \n", 170 | " p11 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n", 171 | " os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n", 172 | " p12 = executor.submit(akn.execute_file, client, 'VSO', q_mdm, params)\n", 173 | "\n", 174 | "q_whatChanged_df = akn.to_dataframe_from_future(p2)\n", 175 | "q_haActions_df = akn.to_dataframe_from_future(p4)\n", 176 | "q_affectedAccountsResultDf = akn.to_dataframe_from_future(p5)\n", 177 | "\n", 178 | "abusersDf = akn.to_dataframe_from_future(p6)\n", 179 | "finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values);\n", 180 | "\n", 181 | "q_affAccounts_df = akn.to_dataframe_from_future(p7)\n", 182 | "q_delayedAccountsAreAbusers_df = akn.to_dataframe_from_future(p8)\n", 183 | "q_whatDelayedResultDf = akn.to_dataframe_from_future(p9)\n", 184 | "q_loadResultDf = 
akn.to_dataframe_from_future(p10)\n", 185 | "\n", 186 | "q_activeIncidentsResultDf = akn.to_dataframe_from_future(p11)\n", 187 | "\n", 188 | "q_mdmDf = akn.to_dataframe_from_future(p12)\n", 189 | "params[\"mdmAccount\"] = q_mdmDf[\"monitoringAccount\"][0]" 190 | ], 191 | "outputs": [], 192 | "execution_count": null, 193 | "metadata": { 194 | "inputHidden": false, 195 | "outputHidden": false 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "q_spike = os.path.join(delaysPath, \"OrchestrationLogSpike.csl\")\n", 202 | "q_parallelism = os.path.join(delaysPath, \"Parallelism.csl\")\n", 203 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 204 | " sfs = [executor.submit(akn.execute_file, client, 'VSO', q_spike, \n", 205 | " {\n", 206 | " **params,\n", 207 | " \"hostId\": r\n", 208 | " }) for r in q_delayedAccountsAreAbusers_df[\"HostId\"].values]\n", 209 | " sfsResults = [s.result() for s in concurrent.futures.as_completed(sfs)]\n", 210 | " pfs = [executor.submit(akn.execute_file, client, 'VSO', q_parallelism, \n", 211 | " {\n", 212 | " **params,\n", 213 | " \"hostId\": r\n", 214 | " }) for r in q_delayedAccountsAreAbusers_df[\"HostId\"].values]\n", 215 | " pfsResults = [s.result() for s in concurrent.futures.as_completed(pfs)]\n", 216 | "\n", 217 | "# convert to data frames\n", 218 | "s_primary_results = [s.primary_results[0] for s in sfsResults]\n", 219 | "spikeResultsDfs = None\n", 220 | "\n", 221 | "p_primary_results = [s.primary_results[0] for s in pfsResults]\n", 222 | "parResultsDfs = None\n", 223 | "\n", 224 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 225 | " s_dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in s_primary_results]\n", 226 | " spikeResultsDfs = [dff.result() for dff in concurrent.futures.as_completed(s_dataframe_futures)]\n", 227 | " p_dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in p_primary_results]\n", 228 | " parResultsDfs = [dff.result() for dff in concurrent.futures.as_completed(p_dataframe_futures)]\n", 229 | "sfsResults = None\n", 230 | "pfsResults = None" 231 | ], 232 | "outputs": [], 233 | "execution_count": null, 234 | "metadata": { 235 | "inputHidden": false, 236 | "outputHidden": false 237 | } 238 | }, 239 | { 240 | "cell_type": "code", 241 | "source": [ 242 | "q_loadPerHost = os.path.join(delaysPath, \"LoadPerHost.csl\")\n", 243 | "# utility functions\n", 244 | "from itertools import groupby\n", 245 | "content = ''\n", 246 | "def r(*args):\n", 247 | " '''construct a markdown report'''\n", 248 | " global content\n", 249 | " content += ''.join([str(a) for a in args]) + '\\n'\n", 250 | "\n", 251 | "startTime = akn.to_datetime(start)\n", 252 | "t0 = startTime.replace(tzinfo=None)\n", 253 | "\n", 254 | "# report! 
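r() accumulates markdown lines in the global 'content' string; the Markdown(content) call at the very end of this cell renders the finished report.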
\n", 255 | "r('# OK SO WHAT HAPPENED')\n", 256 | "r('|parameter|value|')\n", 257 | "r('|---|---|')\n", 258 | "r('|startTime|', startTime, '|')\n", 259 | "r('|endTime|', akn.to_datetime(end), '|')\n", 260 | "r('|scale unit|', su, '|')\n", 261 | "r('|service|', service, '|')\n", 262 | "\n", 263 | "# jarvis params\n", 264 | "jarvisParams = {\n", 265 | " 'su': su, \n", 266 | " 'start': akn.get_time(start, -10), \n", 267 | " 'end': akn.get_time(end, 10), \n", 268 | " 'service': service,\n", 269 | " 'location': locationName,\n", 270 | " 'account': params[\"mdmAccount\"]\n", 271 | "}\n", 272 | "\n", 273 | "# abuse detection?\n", 274 | "r('## What users are impacted?')\n", 275 | "if len(finalabusersList) > 0:\n", 276 | " r('INSIGHT: Found abusers -- this alert is likely a false alarm.')\n", 277 | "r(akn.pandas_df_to_markdown_table(q_delayedAccountsAreAbusers_df)) \n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | "# what changed? analysis\n", 282 | "r('## What changed?')\n", 283 | "if q_whatChanged_df.empty:\n", 284 | " r(\"...no relevant config changes recorded during this period.\")\n", 285 | "else:\n", 286 | " # compute relative times and relevant changes\n", 287 | " history = q_whatChanged_df\n", 288 | " history['RelativeSeconds'] = history.apply(lambda row: (row.TIMESTAMP.replace(tzinfo=None) - t0).total_seconds(), axis=1)\n", 289 | " relevant = history[abs(history.RelativeSeconds) < 3600]\n", 290 | " \n", 291 | " # analysis\n", 292 | " upgrade = False\n", 293 | " mitigation = False\n", 294 | " vip_swap = False\n", 295 | " ffs = False\n", 296 | " for t in relevant.title.values:\n", 297 | " l = t.lower()\n", 298 | " upgrade = upgrade or 'upgrade' in l\n", 299 | " mitigation = mitigation or 'mitigation' in l\n", 300 | " vip_swap = vip_swap or 'vip' in l\n", 301 | " ffs = ffs or 'feature flag' in l\n", 302 | " \n", 303 | " if upgrade:\n", 304 | " r('INSIGHT: there were database upgrades in progress')\n", 305 | " if mitigation:\n", 306 | " r('INSIGHT: there were mitigations in progress')\n", 307 | " if vip_swap:\n", 308 | " r('INSIGHT: there was a vip swap just before this period.')\n", 309 | " if ffs:\n", 310 | " r('INSIGHT: there were feature flag changes right before this period.')\n", 311 | " \n", 312 | " # full table\n", 313 | " r(akn.pandas_df_to_markdown_table(relevant[['TIMESTAMP', 'RelativeSeconds', 'title']]))\n", 314 | " \n", 315 | " \n", 316 | "# active incidents?\n", 317 | "r('## Active incidents?')\n", 318 | "otherIncidentsCount = 0;\n", 319 | "\n", 320 | "if q_activeIncidentsResultDf is not None and not q_activeIncidentsResultDf.empty:\n", 321 | " for index, row in q_activeIncidentsResultDf.iterrows():\n", 322 | " if(row.Title.find(\"Kalypso: Build Orchestrator Delays ICM\") == -1):\n", 323 | " otherIncidentsCount += 1\n", 324 | " \n", 325 | " if otherIncidentsCount > 0:\n", 326 | " r(\"INSIGHT: There were incidents recorded during this period. 
These might be related:\")\n", 327 | " newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: \"\"\"[%s](https://icm.ad.msft.net/imp/v3/incidents/details/%s/home)\"\"\" % (x,x), q_activeIncidentsResultDf.IncidentId)]) \n", 328 | " r(\"\\n\")\n", 329 | " r(akn.pandas_df_to_markdown_table(newDf[['URL','Severity','Title']]))\n", 330 | " else:\n", 331 | " r(\"...no relevant incidents during this period.\") \n", 332 | " \n", 333 | " \n", 334 | "r('## Queue Load')\n", 335 | "ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n", 336 | "queuedGreaterThan500 = np.where(ar > 500)\n", 337 | "ar_max = np.amax(ar) if len(ar) else '?'\n", 338 | "if len(queuedGreaterThan500[0]) > 0:\n", 339 | " r('INSIGHT: There was a high rate of jobs queued during this period (max: ', ar_max, ' / minute)...')\n", 340 | "else: \n", 341 | " r('...everything looks good? (max: ', ar_max, ' / minute)')\n", 342 | "\n", 343 | " \n", 344 | "r('## Parallelism')\n", 345 | "for parResultsDf in parResultsDfs:\n", 346 | " if len(parResultsDf.C.values) > 0: \n", 347 | " usage = parResultsDf.C.values[0]\n", 348 | " times = parResultsDf.sampleTime.values[0]\n", 349 | " hostId = parResultsDf.HostId[0]\n", 350 | " maxindex = np.argmax(usage)\n", 351 | " maxvalue = usage[maxindex]\n", 352 | " atTime = times[maxindex]\n", 353 | " results = {value: len(list(freq)) for value, freq in groupby(sorted(usage))}\n", 354 | " printed = False\n", 355 | " r(\"\"\"\\nFor host: **%s**...\"\"\" % (hostId))\n", 356 | " for key, value in results.items():\n", 357 | " if key > 10:\n", 358 | " r(\"\"\"\\nRunning plans (per 1min) %s : number of occurrences during incident time %s\"\"\"%(key, value))\n", 359 | " printed = True\n", 360 | " if not printed:\n", 361 | " r(\"\\nNothing found greater than 10\")\n", 362 | " else:\n", 363 | " r(\"\\n-\")\n", 364 | " \n", 365 | "\n", 366 | "r('## Orchestration phase Load')\n", 367 | "for spikeResultDf in spikeResultsDfs:\n", 368 | " countResult = spikeResultDf.C.describe()\n", 369 | " hostId = spikeResultDf[\"HostId\"].values[0]\n", 370 | " upper = countResult[\"75%\"]\n", 371 | " lower = countResult[\"25%\"]\n", 372 | " # Wondering what's going on here? We detect anomalies, see https://www.purplemath.com/modules/boxwhisk3.htm\n", 373 | " IQR = upper - lower\n", 374 | " countResultOfInterest = spikeResultDf[spikeResultDf[\"C\"] > upper + 1.5 * IQR ].head(5)\n", 375 | " unqCommands = list(dict.fromkeys(countResultOfInterest[\"Command\"].values).keys())\n", 376 | " if len(unqCommands) > 0:\n", 377 | " r(\"\"\"INSIGHT: Found anomalies for these phases, in order from highest to lowest, for host: **%s**\"\"\" % hostId)\n", 378 | "\n", 379 | " # print commands table\n", 380 | " r(akn.pandas_df_to_markdown_table(countResultOfInterest[[\"Command\", \"C\"]])) \n", 381 | " \n", 382 | " \n", 383 | " if \"PlanCompleted\" in unqCommands:\n", 384 | " if \"StartPlan\" in unqCommands or \"PlanStarted\" in unqCommands:\n", 385 | " r(\"\\nTIP: An unusual number of plans were started during this period.\")\n", 386 | " else:\n", 387 | " r(\"\\nTIP: Jobs that were queued long ago may only now be completing, creating this spike.\") \n",
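" # tokenize() fills the .csl query templates with newParams, so the collapsible details links below carry ready-to-run Kusto\n",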
388 | " \n", 389 | " newParams = dict(params)\n", 390 | " newParams[\"command\"] = next(iter(unqCommands)) \n", 391 | " newParams[\"hostId\"] = hostId\n", 392 | " r(akn.details_md('Kusto query for analyzing spike:', \n", 393 | " tokenize(os.path.join(os.path.join(queryPath, \"delays\"), \"OrchestrationLogSpikeTip.csl\"), newParams)))\n", 394 | " r(akn.details_md('Kusto for analyzing load:', tokenize(q_loadPerHost, newParams)))\n", 395 | " \n", 396 | " else:\n", 397 | " r('...everything looks good?') \n", 398 | " \n", 399 | "# ja load\n", 400 | "r()\n", 401 | "r('## JA Load')\n", 402 | "q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == \"\\JobService(_Total)\\Total Pending Jobs\"]\n", 403 | "pendingGreaterThan50Result = np.where(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values > 50)\n", 404 | "if len(pendingGreaterThan50Result[0]) > 0:\n", 405 | " max_pending_jobs = np.max(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values)\n", 406 | " r(\"INSIGHT: There was a high number of pending jobs during this period (max was %s). Note that this counts jobs of all priorities (even low-priority ones)\" % (max_pending_jobs)) \n", 407 | " \n", 408 | " open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n", 409 | " jaUrl = baseUrl + \"/devops-pipelines/ja.ipynb\"\n", 410 | " r('\\n\\n[JobAgent investigation notebook](', requote_uri(jaUrl), ')')\n", 411 | "\n", 412 | " jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n", 413 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 414 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 415 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 416 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 417 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 418 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 419 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 420 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n", 421 | " r('\\n\\n[JobAgent health dashboard](', requote_uri(jaJarvisLink), ')')\n", 422 | "else:\n", 423 | " r('...everything looks good?')\n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | "# more analysis? 
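The open_nb calls below pre-fill each linked notebook's parameters, so the links open ready to run.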
\n", 429 | "r('## What should we look at next?')\n", 430 | "url = baseUrl + \"/devops-pipelines/sla.ipynb\"\n", 431 | "SLAParams = {\n", 432 | " \"triggerTime\": params[\"start\"],\n", 433 | " \"scaleUnit\": params[\"su\"],\n", 434 | " \"service\": params[\"service\"],\n", 435 | " \"lookback\": \"1h\",\n", 436 | " \"region\": \"\"\n", 437 | "}\n", 438 | "open_nb(os.path.join(root, 'sla.ipynb'), SLAParams, redirect=False)\n", 439 | "r('\\n\\n[SLA investigation notebook](', requote_uri(url), ')') \n", 440 | "\n", 441 | "url = baseUrl + \"/devops-pipelines/impact.ipynb\"\n", 442 | "open_nb(os.path.join(root, 'impact.ipynb'), params, redirect=False)\n", 443 | "r('\\n\\n[Customer impact investigation notebook](', requote_uri(url), ')') \n", 444 | "\n", 445 | "# Scale unit health\n", 446 | "jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/DevOpsReports\"\"\" \\\n", 447 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 448 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 449 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 450 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 451 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 452 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 453 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 454 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n", 455 | "r('\\n\\n[Scale unit health dashboard (' + su + ', ' + service + ')](', requote_uri(jarvisLink), ')')\n", 456 | "\n", 457 | "\n", 458 | "Markdown(content)\n", 459 | "# print(content)" 460 | ], 461 | "outputs": [], 462 | "execution_count": null, 463 | "metadata": { 464 | "inputHidden": false, 465 | "outputHidden": false 466 | } 467 | }, 468 | { 469 | "cell_type": "code", 470 | "source": [ 471 | "# visualize delays\n", 472 | "import plotly\n", 473 | "from plotly import graph_objs as go\n", 474 | "delays = go.Scatter(\n", 475 | " x=q_affAccounts_df[\"PreciseTimeStamp\"],\n", 476 | " y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n", 477 | " mode = 'lines',\n", 478 | " name = 'Delays in seconds',\n", 479 | " text= q_affAccounts_df['Name']\n", 480 | ")\n", 481 | "\n", 482 | "changed = go.Scatter(\n", 483 | " x=q_whatChanged_df[\"TIMESTAMP\"],\n", 484 | " y=np.repeat(50, len(q_whatChanged_df[\"TIMESTAMP\"].values)),\n", 485 | " mode = 'lines+markers',\n", 486 | " name = 'What Changed',\n", 487 | " text = q_whatChanged_df[\"Name\"],\n", 488 | " marker=dict(\n", 489 | " size=32,\n", 490 | " color = np.random.randn(500),\n", 491 | " colorscale='Viridis'\n", 492 | " )\n", 493 | ")\n", 494 | "\n", 495 | "mitigations = go.Scatter(\n", 496 | " x=q_haActions_df[\"PreciseTimeStamp\"],\n", 497 | " y=np.repeat(50, len(q_haActions_df[\"PreciseTimeStamp\"].values)),\n", 498 | " mode = 'markers',\n", 499 | " name = 'Mitigations',\n", 500 | " text = q_haActions_df[[\"MitigationName\", \"RoleInstance\"]].apply(lambda x: ''.join(x), axis=1),\n", 501 | " marker = dict(\n", 502 | " size = 10,\n", 503 | " color = 'rgba(152, 0, 0, .8)',\n", 504 | " line = dict(\n", 505 | " width = 2,\n", 506 | " color = 
'rgb(0, 0, 0)'\n", 507 | " )\n", 508 | " )\n", 509 | ")\n", 510 | "\n", 511 | "data = [delays, changed, mitigations]\n", 512 | "plotly.offline.iplot(data)" 513 | ], 514 | "outputs": [], 515 | "execution_count": null, 516 | "metadata": { 517 | "inputHidden": false, 518 | "outputHidden": false 519 | } 520 | }, 521 | { 522 | "cell_type": "code", 523 | "source": [], 524 | "outputs": [], 525 | "execution_count": null, 526 | "metadata": { 527 | "inputHidden": false, 528 | "outputHidden": false 529 | } 530 | } 531 | ], 532 | "metadata": { 533 | "kernel_info": { 534 | "name": "python3" 535 | }, 536 | "kernelspec": { 537 | "name": "python3", 538 | "language": "python", 539 | "display_name": "Python 3" 540 | }, 541 | "language_info": { 542 | "name": "python", 543 | "version": "3.7.4", 544 | "mimetype": "text/x-python", 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 3 548 | }, 549 | "pygments_lexer": "ipython3", 550 | "nbconvert_exporter": "python", 551 | "file_extension": ".py" 552 | }, 553 | "nteract": { 554 | "version": "0.14.5" 555 | } 556 | }, 557 | "nbformat": 4, 558 | "nbformat_minor": 0 559 | } 560 | -------------------------------------------------------------------------------- /impact.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Impact Investigation\n", 8 | "1. Run all cells.\n", 9 | "1. View report at the bottom." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "inputHidden": false, 17 | "outputHidden": false, 18 | "tags": [ 19 | "parameters" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "su = \"pipelines-ghub-eus2-2\"\n", 25 | "start = \"2019-10-15T20:21:54.0330000Z\"\n", 26 | "end = \"2019-10-15T20:52:21.5370169Z\"\n", 27 | "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n", 28 | "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n", 29 | "service = \"pipelines\"\n", 30 | "hub = \"Actions\"\n", 31 | "locationName = \"pipelinesghubeus22\"\n", 32 | "mdmAccount = \"VSO-Pipelines\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "inputHidden": false, 40 | "outputHidden": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "%%capture\n", 45 | "!pip install --upgrade nimport azure-kusto-notebooks" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "inputHidden": false, 53 | "outputHidden": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Import the things we use\n", 58 | "\n", 59 | "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n", 60 | "# %kql is single line magic\n", 61 | "# %%kql is cell magic\n", 62 | "\n", 63 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 64 | "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n", 65 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 66 | "\n", 67 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 68 | "import pandas as pd\n", 69 | "pd.options.display.html.table_schema = False\n", 70 | "from pandas import Series, DataFrame\n", 71 | "from datetime import datetime, timedelta, timezone\n", 72 | "from urllib.parse import urlencode, quote_plus\n", 73 | "from 
requests.utils import requote_uri\n", 74 | "import time\n", 75 | "import numpy as np\n", 76 | "from matplotlib import pyplot as plt\n", 77 | "from nimport.utils import tokenize, open_nb\n", 78 | "import json\n", 79 | "import os\n", 80 | "import calendar as cal\n", 81 | "import concurrent.futures\n", 82 | "from azure.kusto.notebooks import utils as akn" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "inputHidden": false, 90 | "outputHidden": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "params = {\n", 95 | " \"su\": su,\n", 96 | " \"start\": start,\n", 97 | " \"end\": end,\n", 98 | " \"url\": url,\n", 99 | " \"baseUrl\": baseUrl,\n", 100 | " \"service\": service\n", 101 | "}\n", 102 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 103 | "queryPath = os.path.join(root, 'queries')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "inputHidden": false, 111 | "outputHidden": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# authenticate kusto client\n", 116 | "# you will need to copy the token into a browser window for AAD auth. \n", 117 | "client = akn.get_client('https://vso.kusto.windows.net')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "inputHidden": false, 125 | "outputHidden": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "# authenticate kusto client\n", 130 | "# you will need to copy the token into a browser window for AAD auth. \n", 131 | "icm_client = akn.get_client('https://icmcluster.kusto.windows.net')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "inputHidden": false, 139 | "outputHidden": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "q_loc = os.path.join(queryPath, \"LocationName.csl\")\n", 144 | "q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n", 145 | "q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n", 146 | "\n", 147 | "impactPath = os.path.join(queryPath, \"impact\")\n", 148 | "q_commands = os.path.join(impactPath, \"CommandsReason.csl\")\n", 149 | "q_commandsAT = os.path.join(impactPath, \"CommandsAT.csl\")\n", 150 | "q_commandsDb = os.path.join(impactPath, \"CommandsDb.csl\")\n", 151 | "q_gen2 = os.path.join(impactPath, \"Gen2GCSpikes.csl\")\n", 152 | "q_dep = os.path.join(impactPath, \"Dependencies.csl\")\n", 153 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 154 | " # materialize location name immediately as we need this for other queries\n", 155 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n", 156 | " locationNameResult = akn.to_dataframe_from_future(p1)\n", 157 | " locationName = locationNameResult[\"Tenant\"][0]\n", 158 | " params[\"locationName\"] = locationName\n", 159 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n", 160 | " \n", 161 | " p4 = executor.submit(akn.execute_file, client, 'VSO', q_commandsAT, params)\n", 162 | " p5 = executor.submit(akn.execute_file, client, 'VSO', q_commandsDb, params) \n", 163 | " p6 = executor.submit(akn.execute_file, client, 'VSO', q_commands, params)\n", 164 | " \n", 165 | " p7 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n", 166 | " os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n", 167 | " \n", 168 | " p8 = executor.submit(akn.execute_file, client, 'VSO', q_gen2, 
params)\n", 169 | " p9 = executor.submit(akn.execute_file, client, 'VSO', q_mdm, params)\n", 170 | " p10 = executor.submit(akn.execute_file, client, 'VSO', q_dep, params)\n", 171 | "\n", 172 | "q_whatChanged_df = akn.to_dataframe_from_future(p2)\n", 173 | "\n", 174 | "q_commandsAT_df = akn.to_dataframe_from_future(p4)\n", 175 | "\n", 176 | "q_commandsDb_df = akn.to_dataframe_from_future(p5)\n", 177 | "\n", 178 | "q_commands_df = akn.to_dataframe_from_future(p6)\n", 179 | "\n", 180 | "q_activeIncidentsResultDf = akn.to_dataframe_from_future(p7)\n", 181 | "\n", 182 | "q_gen2Df = akn.to_dataframe_from_future(p8)\n", 183 | "\n", 184 | "q_mdmDf = akn.to_dataframe_from_future(p9)\n", 185 | "params[\"mdmAccount\"] = q_mdmDf[\"monitoringAccount\"][0]\n", 186 | "\n", 187 | "q_depDf = akn.to_dataframe_from_future(p10)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "inputHidden": false, 195 | "outputHidden": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "print('=' * 50)\n", 200 | "print('Report!')\n", 201 | "print('=' * 50, '\\n\\n')\n", 202 | "\n", 203 | "# jarvis params\n", 204 | "jarvisParams = {\n", 205 | " 'su': su, \n", 206 | " 'start': akn.get_time(start, -10), \n", 207 | " 'end': akn.get_time(end, 10), \n", 208 | " 'service': service,\n", 209 | " 'location': locationName,\n", 210 | " 'account': params[\"mdmAccount\"]\n", 211 | "}\n", 212 | "\n", 213 | "display(params)\n", 214 | "\n", 215 | "startTime = akn.to_datetime(start)\n", 216 | "# jarvis\n", 217 | "jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/DevOpsReports\"\"\" \\\n", 218 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 219 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 220 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 221 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 222 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 223 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 224 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 225 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n", 226 | "print('Jarvis dashboard link:\\n', requote_uri(jarvisLink), '\\n')\n", 227 | " \n", 228 | "# slow failed reason analysis\n", 229 | "print()\n", 230 | "print('Is it slow commands or failed commands? 
=============================')\n", 231 | "freq = q_commands_df[\"Frequency\"]\n", 232 | "coefficientOfVariance = freq.std()/freq.mean()\n", 233 | "failedCount = q_commands_df[q_commands_df[\"Reason\"] == \"failed\"][\"Frequency\"].values[0]\n", 234 | "slowCount = q_commands_df[q_commands_df[\"Reason\"] == \"slow\"][\"Frequency\"].values[0]\n", 235 | "reason = \"failed or slow\"\n", 236 | "if coefficientOfVariance > 0.5:\n", 237 | " if failedCount > slowCount:\n", 238 | " reason = \"failed\"\n", 239 | " else:\n", 240 | " reason = \"slow\"\n", 241 | "else:\n", 242 | " print(\"Slow and failed commands are too close to call; both might be contributing...\")\n", 243 | "if reason:\n", 244 | " print(\"Probably due to %s commands; Failed - %s, Slow - %s\" % (reason, failedCount, slowCount))\n", 245 | "\n", 246 | "# slow failed reason for AT?\n", 247 | "print()\n", 248 | "print('Is it %s because of AT? =============================' % (reason))\n", 249 | "failed = q_commandsAT_df[q_commandsAT_df[\"Reason\"] == \"failed\"]\n", 250 | "slow = q_commandsAT_df[q_commandsAT_df[\"Reason\"] == \"slow\"]\n", 251 | "data = q_commandsAT_df\n", 252 | "if reason == \"failed\":\n", 253 | " data = failed\n", 254 | "elif reason == \"slow\":\n", 255 | " data = slow\n", 256 | "\n", 257 | "coefficientOfVariance = data[\"Frequency\"].std()/data[\"Frequency\"].mean()\n", 258 | " \n", 259 | "if coefficientOfVariance > 0.5:\n", 260 | " print(\"Found variance across ATs for %s commands\" % (reason))\n", 261 | " print(data.head(30))\n", 262 | "else:\n", 263 | " print(\"Seems to be the same across ATs for %s commands\" % (reason))\n", 264 | " \n", 265 | "# slow failed reason for Db?\n", 266 | "print()\n", 267 | "print('Is it %s because of Db? =============================' % (reason))\n", 268 | "failed = q_commandsDb_df[q_commandsDb_df[\"Reason\"] == \"failed\"]\n", 269 | "slow = q_commandsDb_df[q_commandsDb_df[\"Reason\"] == \"slow\"]\n", 270 | "data = q_commandsDb_df\n", 271 | "if reason == \"failed\":\n", 272 | " data = failed\n", 273 | "elif reason == \"slow\":\n", 274 | " data = slow\n", 275 | "\n", 276 | "coefficientOfVariance = data[\"Frequency\"].std()/data[\"Frequency\"].mean()\n", 277 | " \n", 278 | "if coefficientOfVariance > 0.5:\n", 279 | " print(\"Found variance across DBs for %s commands\" % (reason))\n", 280 | " print(\"Append '%s' to the database server name\" % (\".database.windows.net\"))\n", 281 | " print(\"Prepend '%s' to the database name\" % (params[\"service\"] + \"_\" + params[\"locationName\"] + \"_\"))\n", 282 | " print(data.head(30))\n", 283 | "else:\n", 284 | " print(\"Seems to be the same across DBs for %s commands\" % (reason)) \n", 285 | " \n", 286 | "# what changed? analysis\n", 287 | "print()\n", 288 | "print('What changed? =============================')\n",
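"# looks for config changes (upgrades, mitigations, VIP swaps, feature flags) recorded around the incident window\n",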
289 | "if(len(q_whatChanged_df.index) == 0):\n", 290 | " print(\"...no relevant config changes recorded during this period.\")\n", 291 | "else:\n", 292 | " up_prefix = \"\"\n", 293 | " mit_prefix = \"\"\n", 294 | " vip_prefix = \"\"\n", 295 | " f_prefix = \"\"\n", 296 | " text = \"\"\n", 297 | " for index, row in q_whatChanged_df.iterrows():\n", 298 | " delta = startTime.replace(tzinfo=None) - row.TIMESTAMP.replace(tzinfo=None)\n", 299 | " when = \"before\"\n", 300 | " if delta.total_seconds() < 0:\n", 301 | " when = \"after\"\n", 302 | " delta = row.TIMESTAMP.replace(tzinfo=None) - startTime.replace(tzinfo=None)\n", 303 | " hoursHappened = delta.total_seconds()//3600\n", 304 | " considerTime = hoursHappened <= 1\n", 305 | " def getText(row):\n", 306 | " return \"\"\"%s %s %s (%s days %s hours %s minutes %s the start time) \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber, delta.days, delta.seconds//3600, (delta.seconds % 3600)//60, when)\n", 307 | " if(row.title.lower().find('upgrade') != -1):\n", 308 | " if not up_prefix:\n", 309 | " up_prefix += \"Looks like there's an upgrade...\\n\\n\"\n", 310 | " text += getText(row)\n", 311 | " if(row.title.lower().find('mitigation') != -1):\n", 312 | " if considerTime and not mit_prefix:\n", 313 | " mit_prefix += \"Looks like there are some mitigations by the health agent...\\n\\n\"\n", 314 | " text += getText(row)\n", 315 | " if(row.title.lower().find('vip') != -1):\n", 316 | " if considerTime and not vip_prefix:\n", 317 | " vip_prefix += \"Looks like there is a VIP swap...\\n\\n\"\n", 318 | " text += getText(row) \n", 319 | " if(row.title.lower().find('feature flag') != -1):\n", 320 | " if considerTime and not f_prefix:\n", 321 | " f_prefix += \"Looks like some feature flags were enabled...\\n\\n\"\n", 322 | " text += getText(row)\n", 323 | " if text:\n", 324 | " print(up_prefix + mit_prefix + f_prefix + vip_prefix + text)\n", 325 | " else:\n", 326 | " print(\"...no relevant changes during this period.\")\n", 327 | " \n", 328 | "# active incidents?\n", 329 | "print()\n", 330 | "print('Active incidents? =============================')\n",
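"# lists open IcM incidents in the window, skipping this monitor's own Customer Impact Monitor alert\n",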
331 | "otherIncidentsCount = 0\n", 332 | "for index, row in q_activeIncidentsResultDf.iterrows():\n", 333 | " if(row.Title.find(\"Customer Impact Monitor\") == -1):\n", 334 | " otherIncidentsCount += 1\n", 335 | " \n", 336 | "if(otherIncidentsCount > 0):\n", 337 | " print(\"We found some incidents during this time period; check if they are related...\")\n", 338 | " # styling\n", 339 | " def make_clickable(url, text):\n", 340 | " return '<a href=\"{0}\">{1}</a>'.format(url, text)\n", 341 | "\n", 342 | " newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n", 343 | " print(\"ICM link to copy - \" + \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\")\n", 344 | " print(newDf[['IncidentId','Severity','Title']])\n", 345 | "else:\n", 346 | " print(\"No related active incidents were found...\") \n", 347 | " \n", 348 | "print()\n", 349 | "print('Dependencies insights =============================')\n", 350 | "r = q_depDf.describe()\n", 351 | "redis = r[\"avg_RedisExecutionTimeInMs\"]\n", 352 | "s2s = r[\"avg_VssClientExecutionTimeInMs\"]\n", 353 | "sql = r[\"avg_SqlExecutionTimeInMs\"]\n", 354 | "queue = r[\"avg_QueueTimeInMs\"]\n", 355 | "maxs = [redis[\"max\"], s2s[\"max\"], sql[\"max\"], queue[\"max\"]]\n", 356 | "means = [redis[\"mean\"], s2s[\"mean\"], sql[\"mean\"], queue[\"mean\"]]\n", 357 | "up = redis[\"75%\"]\n", 358 | "lo = redis[\"25%\"]\n", 359 | "IQR = up - lo\n", 360 | "redisAnom = q_depDf[q_depDf[\"avg_RedisExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_RedisExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp') \n", 361 | "up = s2s[\"75%\"]\n", 362 | "lo = s2s[\"25%\"]\n", 363 | "IQR = up - lo\n", 364 | "s2sAnom = q_depDf[q_depDf[\"avg_VssClientExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_VssClientExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n", 365 | "up = sql[\"75%\"]\n", 366 | "lo = sql[\"25%\"]\n", 367 | "IQR = up - lo\n", 368 | "sqlAnom = q_depDf[q_depDf[\"avg_SqlExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_SqlExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n", 369 | "up = queue[\"75%\"]\n", 370 | "lo = queue[\"25%\"]\n", 371 | "IQR = up - lo\n", 372 | "queueAnom = q_depDf[q_depDf[\"avg_QueueTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_QueueTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n", 373 | "reasons = [\"Redis\", \"S2S\", \"Sql\", \"RequestsQueued\"]\n", 374 | "anomdata = [redisAnom, s2sAnom, sqlAnom, queueAnom]\n", 375 | "anom = [len(redisAnom), len(s2sAnom), len(sqlAnom), len(queueAnom)]\n", 376 | "top2Anom = np.argsort(anom)[::-1][:2]\n", 377 | "whenMax = [\n", 378 | " redisAnom[redisAnom[\"avg_RedisExecutionTimeInMs\"]==maxs[0]],\n", 379 | " s2sAnom[s2sAnom[\"avg_VssClientExecutionTimeInMs\"]==maxs[1]],\n", 380 | " sqlAnom[sqlAnom[\"avg_SqlExecutionTimeInMs\"]==maxs[2]],\n", 381 | " queueAnom[queueAnom[\"avg_QueueTimeInMs\"]==maxs[3]]\n", 382 | "]\n", 383 | "if len(top2Anom) > 0:\n", 384 | " print(\"Found top anomalies...\")\n", 385 | " for i in top2Anom:\n", 386 | " # Wow, such a pain to convert numpy time to python time...\n", 387 | " dt64 = whenMax[i][\"PreciseTimeStamp\"].values[0]\n", 388 | " ts = (dt64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')\n", 389 | " whenMaxTime = datetime.utcfromtimestamp(ts)\n", 390 | " delta = startTime.replace(tzinfo=None) - 
whenMaxTime.replace(tzinfo=None)\n", 391 | " when = \"before\"\n", 392 | " if delta.total_seconds() < 0:\n", 393 | " when = \"after\"\n", 394 | " delta = whenMaxTime.replace(tzinfo=None) - startTime.replace(tzinfo=None)\n", 395 | " whenAnom = \"\"\"%s days %s hours %s minutes %s the start time - %s\"\"\" % (delta.days, delta.seconds//3600, (delta.seconds % 3600)//60, when, startTime)\n", 396 | " print(\" %s (#buckets %s) (max %s) (mean %s) (Max is %s)\" % (reasons[i], anom[i], maxs[i], means[i], whenAnom)) \n", 397 | " display(anomdata[i])\n", 398 | "else:\n", 399 | " print(\"No clear dependency anomalies found...\")\n", 400 | "\n", 401 | "# additional insights\n", 402 | "print()\n", 403 | "print('Additional insights =============================')\n", 404 | "w3wpGc = q_gen2Df[ q_gen2Df[\"ProcessName\"] == \"w3wp\" ].sort_values(by=['Number'], ascending=False)\n", 405 | "jaGc = q_gen2Df[ q_gen2Df[\"ProcessName\"] == \"TfsJobAgent\" ].sort_values(by=['Number'], ascending=False)\n", 406 | "now3wpGc = True\n", 407 | "if len(w3wpGc) > 0:\n", 408 | " x = w3wpGc[:2]\n", 409 | " if len(x) == 1:\n", 410 | " xCount = x[\"Count\"].values\n", 411 | " if xCount[0] > 25:\n", 412 | " print(\"INSIGHT: There's a new spike in w3wp gc...\")\n", 413 | " now3wpGc = False\n", 414 | " if len(x) == 2:\n", 415 | " xCount = x[\"Count\"].values\n", 416 | " if xCount[1] > xCount[0]:\n", 417 | " print(\"INSIGHT: There's a spike in w3wp gc...\")\n", 418 | " now3wpGc = False\n", 419 | "if now3wpGc:\n", 420 | " print(\"INSIGHT: No gc spikes found from w3wp...\")\n", 421 | " \n", 422 | "nojaGc = True \n", 423 | "if len(jaGc) > 0:\n", 424 | " x = jaGc[:2]\n", 425 | " if len(x) == 1:\n", 426 | " xCount = x[\"Count\"].values\n", 427 | " if xCount[0] > 25:\n", 428 | " print(\"INSIGHT: There's a new spike in ja gc...\")\n", 429 | " nojaGc = False\n", 430 | " if len(x) == 2:\n", 431 | " xCount = x[\"Count\"].values\n", 432 | " if xCount[1] > xCount[0] and xCount[1] > 25:\n", 433 | " print(\"INSIGHT: There's a spike in ja gc...\") \n", 434 | " nojaGc = False\n", 435 | "if nojaGc:\n", 436 | " print(\"INSIGHT: No gc spikes found from TfsJobAgent...\") " 437 | ] 438 | } 439 | ], 440 | "metadata": { 441 | "kernel_info": { 442 | "name": "python3" 443 | }, 444 | "kernelspec": { 445 | "display_name": "Python 3", 446 | "language": "python", 447 | "name": "python3" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.7.4" 460 | }, 461 | "nteract": { 462 | "version": "0.15.0" 463 | } 464 | }, 465 | "nbformat": 4, 466 | "nbformat_minor": 0 467 | } 468 | -------------------------------------------------------------------------------- /ja.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# JobAgent Investigation\n", 8 | "1. Run all cells.\n", 9 | "1. View report at the bottom."
62 | "\n", 63 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 64 | "# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n", 65 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 66 | "\n", 67 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 68 | "import pandas as pd\n", 69 | "pd.options.display.html.table_schema = True\n", 70 | "from pandas import Series, DataFrame\n", 71 | "from datetime import datetime, timedelta, timezone\n", 72 | "from urllib.parse import urlencode, quote_plus\n", 73 | "from requests.utils import requote_uri\n", 74 | "import time\n", 75 | "import numpy as np\n", 76 | "from matplotlib import pyplot as plt\n", 77 | "from nimport.utils import tokenize, open_nb\n", 78 | "import json\n", 79 | "import os\n", 80 | "import calendar as cal\n", 81 | "import concurrent.futures\n", 82 | "from azure.kusto.notebooks import utils as akn" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "inputHidden": false, 90 | "outputHidden": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "params = {\n", 95 | " \"su\": su,\n", 96 | " \"start\": start,\n", 97 | " \"end\": end,\n", 98 | " \"url\": url,\n", 99 | " \"baseUrl\": baseUrl,\n", 100 | " \"service\": service\n", 101 | "}\n", 102 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 103 | "queryPath = os.path.join(root, 'queries') " 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "inputHidden": false, 111 | "outputHidden": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# authenticate kusto client\n", 116 | "# you will need to copy the token into a browser window for AAD auth. 
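\n", "# Under the hood this is AAD device-code auth; a rough equivalent with the raw\n", "# SDK (a sketch -- assumes the azure-kusto-data package) would be:\n", "#   from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder\n", "#   kcsb = KustoConnectionStringBuilder.with_aad_device_authentication('https://vso.kusto.windows.net')\n", "#   client = KustoClient(kcsb)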
\n", 117 | "client = akn.get_client('https://vso.kusto.windows.net')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "inputHidden": false, 125 | "outputHidden": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "q_slow = os.path.join(queryPath, \"SlowActivities.csl\")\n", 130 | "q_sqlSlow = os.path.join(queryPath, \"SlowSql.csl\")\n", 131 | "\n", 132 | "jaPath = os.path.join(queryPath, 'ja')\n", 133 | "q_jobSql = os.path.join(jaPath, \"JASqlTime.csl\")\n", 134 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 135 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_slow, params)\n", 136 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_sqlSlow, params)\n", 137 | " p3 = executor.submit(akn.execute_file, client, 'VSO', q_jobSql, params)\n", 138 | "\n", 139 | "q_slowResult_df = akn.to_dataframe_from_future(p1)\n", 140 | "\n", 141 | "q_sqlSlowResult_df = akn.to_dataframe_from_future(p2)\n", 142 | "\n", 143 | "q_jobSqlResult_df = akn.to_dataframe_from_future(p3) \n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "inputHidden": false, 151 | "outputHidden": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "print('=' * 50)\n", 156 | "print('Report!')\n", 157 | "print('=' * 50, '\\n\\n')\n", 158 | "\n", 159 | "# jarvis params\n", 160 | "jarvisParams = {\n", 161 | " 'su': su, \n", 162 | " 'start': akn.get_time(start, -10), \n", 163 | " 'end': akn.get_time(end, 10), \n", 164 | " 'service': service,\n", 165 | " 'location': locationName,\n", 166 | " 'account': mdmAccount\n", 167 | "}\n", 168 | "\n", 169 | "jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n", 170 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 171 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 172 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 173 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 174 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 175 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 176 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 177 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n", 178 | "print('Jarvis dashboard link for job agents:\\n', requote_uri(jaJarvisLink), '\\n')\n", 179 | "\n", 180 | "print('Top slow activities:')\n", 181 | "display(q_slowResult_df)\n", 182 | "\n", 183 | "print('Top sql slow activities:')\n", 184 | "display(q_sqlSlowResult_df)\n", 185 | "\n", 186 | "print('Top sql execution times from jobs:')\n", 187 | "display(q_jobSqlResult_df)" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "kernel_info": { 193 | "name": "python3" 194 | }, 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | 
"nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.7.4" 211 | }, 212 | "nteract": { 213 | "version": "0.15.0" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 0 218 | } 219 | -------------------------------------------------------------------------------- /params.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /queries/ActiveIncidents.csl: -------------------------------------------------------------------------------- 1 | //%kql AzureDataExplorer://tenant="Microsoft.com";code;cluster='Icmcluster';database='IcMDataWarehouse' 2 | let scaleUnit = "{su}"; 3 | let startTime = todatetime("{start}") - 5hr; 4 | let endTime = todatetime("{end}") + 5hr; 5 | //38 is the VSTS Tenant ID in IcM 6 | getincidents(38,startTime, endTime) 7 | //| where Severity < 3 and IsOutage == 1 and isnull(ParentIncidentId) 8 | | project IncidentId, Severity, CreateDate, Title, Status, OwningTeamName 9 | | where Title contains scaleUnit 10 | | order by CreateDate desc -------------------------------------------------------------------------------- /queries/HealthAgentActions.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | let service = "{service}"; 5 | let threshold = 10; 6 | let step = totimespan("1m"); 7 | let window = totimespan("5m"); 8 | VssHealthAgentActions 9 | | where PreciseTimeStamp between (startTime .. endTime) 10 | | where Service =~ service 11 | | where ScaleUnit =~ scaleUnit 12 | | project PreciseTimeStamp, RoleInstance, MitigationName, Directory, ActionName -------------------------------------------------------------------------------- /queries/LocationName.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let service = "{service}"; 3 | ActivityLog 4 | | where ScaleUnit == scaleUnit 5 | | where Service =~ service 6 | | project Tenant 7 | | take 1 -------------------------------------------------------------------------------- /queries/MDMAccount.csl: -------------------------------------------------------------------------------- 1 | let service = "{service}"; 2 | CosmosMdmMetrics 3 | | where Service == service 4 | | project monitoringAccount 5 | | take 1 -------------------------------------------------------------------------------- /queries/SlowActivities.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let interval = 1m; 6 | ProductTrace 7 | | where PreciseTimeStamp between (startTime .. 
endTime) 8 | | where Service =~ service 9 | | where ScaleUnit =~ scaleUnit 10 | | where Role == "JobAgent" 11 | | where Tracepoint == 36109 //slow activities from request context 12 | | extend K = strcat(RoleInstance, "-", DeploymentId, "-", DeploymentSlot, "-", Method) 13 | | summarize count() by K 14 | | top 25 by count_ desc 15 | //| render timechart -------------------------------------------------------------------------------- /queries/SlowSql.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let interval = 1m; 6 | ProductTrace 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where Service =~ service 9 | | where ScaleUnit =~ scaleUnit 10 | | where Role == "JobAgent" 11 | | where Tracepoint == 64038 //slow sql queries, i.e. queries taking longer than usual to execute 12 | | extend K = strcat(RoleInstance, "-", DeploymentId, "-", DeploymentSlot, "-", Method) 13 | | summarize count() by K 14 | | top 25 by count_ desc 15 | //| render timechart -------------------------------------------------------------------------------- /queries/WhatChanged.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | let threshold = 10; 5 | let step = totimespan("1m"); 6 | let window = totimespan("5m"); 7 | let tenant = "{locationName}"; 8 | WhatChangedRange(tenant=tenant, startTime, endTime-startTime) 9 | | extend Name=strcat(['title'], "@", tostring(TIMESTAMP)) 10 | | order by start asc -------------------------------------------------------------------------------- /queries/delays/Abusers.csl: -------------------------------------------------------------------------------- 1 | TraceLightRailLog 2 | | where ServiceName =~ 'mms' 3 | | where Command == 'Stop-ServiceHost' 4 | | where Message startswith 'HostId = ' 5 | | extend HostId = tostring(split(Message, ' ')[2]) 6 | | summarize by HostId 7 | | union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId) 8 | | distinct HostId -------------------------------------------------------------------------------- /queries/delays/AffectedAccounts.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let hubName = "{hub}"; 6 | let threshold = 10; 7 | let step = totimespan("1m"); 8 | let window = totimespan("5m"); 9 | let affectedAccounts = 10 | ProductTrace 11 | | where PreciseTimeStamp between (startTime .. 
endTime) 12 | | where Service =~ service 13 | | where ScaleUnit =~ scaleUnit 14 | | where Tracepoint == 15010000 15 | | where Layer matches regex strcat(hubName, ".*_ActivityDispatcher") 16 | | extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan)) 17 | | extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan)) 18 | | extend MessageDelay = DbDelay + BufferDelay 19 | | summarize avg(MessageDelay) by ServiceHost, Layer, bin(PreciseTimeStamp, step) 20 | | extend Threshold = strcat(threshold, 's') 21 | | where avg_MessageDelay > totimespan(Threshold); 22 | ServiceHostAggregated() 23 | | join (affectedAccounts) on $left.HostId == $right.ServiceHost 24 | | where Service =~ service 25 | | where HostType == 4 26 | | project Name, HostId, Layer, PreciseTimeStamp, MessageDelayInSeconds = avg_MessageDelay / 1s, DatabaseName, Threshold 27 | | order by PreciseTimeStamp desc 28 | // | order by MessageDelayInSeconds desc -------------------------------------------------------------------------------- /queries/delays/DelayedAccountsAreAbusers.csl: -------------------------------------------------------------------------------- 1 | // Impacted accounts in time window, and are they known abusers 2 | // 3 | let startTime = todatetime("{start}") - 15m; 4 | let endTime = todatetime("{end}") + 15m; 5 | let service = "{service}"; 6 | let hubName = "{hub}"; 7 | let scaleUnit = "{su}"; 8 | let Abusers = TraceLightRailLog 9 | | where ServiceName =~ 'mms' 10 | | where Command == 'Stop-ServiceHost' 11 | | where Message startswith 'HostId = ' 12 | | extend HostId = tostring(split(Message, ' ')[2]) 13 | | summarize by HostId 14 | | union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId) 15 | | distinct HostId; 16 | let ActivityDispatcherDelays = ProductTrace 17 | | where PreciseTimeStamp between (startTime .. endTime) 18 | | where Service =~ service 19 | | where ScaleUnit =~ scaleUnit 20 | | where Tracepoint == 15010000 21 | | where Layer matches regex strcat(hubName, ".*_ActivityDispatcher") 22 | | extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan)) 23 | | extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan)) 24 | | extend MessageDelayInSeconds = toint((DbDelay + BufferDelay) / 1s) 25 | | join kind=leftouter (ServiceHostAggregated() | where Service =~ service | where HostType == 4 | summarize by HostId, Name) 26 | on $left.ServiceHost == $right.HostId; 27 | // table 28 | // 29 | ActivityDispatcherDelays 30 | | summarize AvgMessageDelay=round(avg(MessageDelayInSeconds)) by Name, HostId 31 | | extend Abuser = iff(HostId in (Abusers), "yep", "") 32 | | order by AvgMessageDelay desc -------------------------------------------------------------------------------- /queries/delays/Load.csl: -------------------------------------------------------------------------------- 1 | // orchestrator kpi's 2 | let scaleUnit = "{su}"; 3 | let startTime = todatetime("{start}") - 15min; 4 | let endTime = todatetime("{end}") + 15min; 5 | let service = "{service}"; 6 | let hubName = "{hub}"; 7 | let interval = 1m; 8 | KPI 9 | | where PreciseTimeStamp between (startTime .. 
endTime) 10 | | where Service =~ service 11 | | where ScaleUnit =~ scaleUnit 12 | | where Metrics contains "DTPlan" or Metrics contains "DTAgent" or Metrics contains "DTJob" 13 | | extend DataObj = parsejson(Metrics) 14 | | extend MetricsObjArr = parsejson(DataObj.metrics) 15 | | extend MetricsObj = MetricsObjArr[0] 16 | | extend HostId = tostring(DataObj.hostId) 17 | | extend Name = tostring(MetricsObj.name) 18 | | extend DisplayName = MetricsObj.displayName 19 | | extend Value = todouble(MetricsObj.value) 20 | | project PreciseTimeStamp, DataObj, Name, DisplayName, Value, MetricsObj 21 | | summarize sum(Value) by Name, bin(PreciseTimeStamp, interval) 22 | | render timechart -------------------------------------------------------------------------------- /queries/delays/LoadPerHost.csl: -------------------------------------------------------------------------------- 1 | // orchestrator kpi's 2 | let scaleUnit = "{su}"; 3 | let startTime = todatetime("{start}") - 15min; 4 | let endTime = todatetime("{end}") + 15min; 5 | let service = "{service}"; 6 | let hubName = "{hub}"; 7 | let interval = 1m; 8 | let hostId = "{hostId}"; 9 | KPI 10 | | where PreciseTimeStamp between (startTime .. endTime) 11 | | where Service =~ service 12 | | where ScaleUnit =~ scaleUnit 13 | | where Metrics contains "DTPlan" or Metrics contains "DTAgent" or Metrics contains "DTJob" 14 | | extend DataObj = parsejson(Metrics) 15 | | extend MetricsObjArr = parsejson(DataObj.metrics) 16 | | extend MetricsObj = MetricsObjArr[0] 17 | | extend HostId = tostring(DataObj.hostId) 18 | | extend Name = tostring(MetricsObj.name) 19 | | extend DisplayName = MetricsObj.displayName 20 | | extend Value = todouble(MetricsObj.value) 21 | | where HostId startswith hostId 22 | | project PreciseTimeStamp, DataObj, Name, DisplayName, Value, MetricsObj 23 | | summarize sum(Value) by Name, bin(PreciseTimeStamp, interval) 24 | | render timechart -------------------------------------------------------------------------------- /queries/delays/OrchestrationLogSpike.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 8hr; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let hubName = "{hub}"; 6 | let threshold = 10; 7 | let step = totimespan("1m"); 8 | let window = totimespan("5m"); 9 | let interval = 1m; 10 | let hostId = "{hostId}"; 11 | OrchestrationLog 12 | | where PreciseTimeStamp between (startTime .. 
endTime) 13 | | where Service =~ service 14 | | where ScaleUnit =~ scaleUnit 15 | | where HostId startswith hostId 16 | | where Command contains "CIPlatform" 17 | | summarize C=count() by Command, HostId, bin(PreciseTimeStamp, 15min) 18 | | order by C desc -------------------------------------------------------------------------------- /queries/delays/OrchestrationLogSpikeTip.csl: -------------------------------------------------------------------------------- 1 | // Use this to sample a few OrchestrationIds 2 | // Run those OrchestrationIds through https://github.com/microsoft/devops-pipelines/blob/master/queries/run/WhatHappened.csl 3 | // Or https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/run.ipynb?planId=e75d6056-bfba-4906-b454-02ba3b7880e7 (change planId) 4 | let scaleUnit = "{su}"; 5 | let startTime = todatetime("{start}") - 15min; 6 | let endTime = todatetime("{end}") + 15min; 7 | let service = "{service}"; 8 | let hubName = "{hub}"; 9 | let command = "{command}"; 10 | let threshold = 10; 11 | let step = totimespan("1m"); 12 | let window = totimespan("5m"); 13 | let interval = 1m; 14 | let hostId = "{hostId}"; 15 | OrchestrationLog 16 | | where PreciseTimeStamp between (startTime .. endTime) 17 | | where Service =~ service 18 | | where ScaleUnit =~ scaleUnit 19 | | where HostId startswith hostId 20 | | where Command contains command 21 | | project PreciseTimeStamp, OrchestrationId, ExceptionMessage, Feature, HostId 22 | | top 1000 by PreciseTimeStamp desc 23 | -------------------------------------------------------------------------------- /queries/delays/Parallelism.csl: -------------------------------------------------------------------------------- 1 | // orchestrator kpi's 2 | let hostId = "{hostId}"; 3 | let startTime = todatetime("{start}") - 15min; 4 | let endTime = todatetime("{end}") + 15min; 5 | let sampleInterval = 1m; 6 | range sampleTime from startTime to endTime step sampleInterval 7 | | extend dummyKey=1 8 | | join kind=inner ( // cartesian product, really 9 | AgentPoolRequestHistory 10 | | where HostId == hostId 11 | | where StartTime != '1601-01-01T00:00:00Z' // Exclude the jobs that were never started 12 | | where StartTime < endTime and FinishTime > startTime // Exclude upfront the jobs that weren't running during our window. 13 | | where bin(StartTime, 1m) != bin(FinishTime, 1m) // Exclude upfront jobs that started and completed within the same minute interval. 14 | | extend dummyKey=1 15 | ) on dummyKey 16 | | where StartTime < sampleTime and FinishTime > sampleTime 17 | | make-series 18 | C=count() 19 | on sampleTime 20 | in range(startTime, endTime, sampleInterval) 21 | | extend HostId = hostId -------------------------------------------------------------------------------- /queries/delays/WhatDelayed.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let hubName = "{hub}"; 6 | let interval = 1m; 7 | CounterEvent 8 | | where PreciseTimeStamp between (startTime .. 
endTime) 9 | | where Service =~ service 10 | | where ScaleUnit =~ scaleUnit 11 | | where Role == 'JobAgent' 12 | | where CounterName startswith strcat("\\TFS Services:Orchestration(", hubName) or CounterName startswith "\\TFS Services:JobService(_Total)" 13 | | extend NameOnly = extract("\\)\\\\(.*)$", 1, CounterName, typeof(string)) 14 | | where NameOnly in ( 15 | 'Total Pending Jobs', 16 | 'Pending Job Age', 17 | 'Average Activity Message Delay', 18 | 'Average Activity Job Delay', 19 | 'Average Activity Execution Time' 20 | ) 21 | | extend Pivot = replace("(TFS Services:)|(Orchestration\\(Build-)","", CounterName) 22 | | summarize avg(CounterValue) by Pivot, bin(PreciseTimeStamp, interval) 23 | | render timechart -------------------------------------------------------------------------------- /queries/impact/CommandsAT.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let service = "{service}"; 5 | ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | summarize Frequency = count() by RoleInstance, Reason 12 | | order by Frequency desc -------------------------------------------------------------------------------- /queries/impact/CommandsDb.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let service = "{service}"; 5 | ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. 
endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | join ( 12 | ServiceHostAggregated | where Service == service and HostType == 4 and ScaleUnit == scaleUnit 13 | ) on HostId 14 | | extend Server = replace(".database.windows.net", "", ServerName) 15 | | extend Database = replace(strcat("{service}", "_", "{locationName}", "_"), "", tolower(DatabaseName)) 16 | | summarize Frequency = count() by Server, Database, Reason 17 | | order by Frequency desc -------------------------------------------------------------------------------- /queries/impact/CommandsReason.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let service = "{service}"; 5 | ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | summarize Frequency = count() by Reason 12 | | order by Frequency desc -------------------------------------------------------------------------------- /queries/impact/Dependencies.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 8hr; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | func_ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | summarize avg(RedisExecutionTimeInMs), avg(VssClientExecutionTimeInMs), avg(SqlExecutionTimeInMs), avg(QueueTimeInMs) by bin(PreciseTimeStamp, 5min) -------------------------------------------------------------------------------- /queries/impact/Gen2GCSpikes.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 8hr; 3 | let endTime = todatetime("{end}") + 15min; 4 | let diff = endTime - startTime; 5 | let service = "{service}"; 6 | let interval = 1m; 7 | VssHealthAgentGarbageCollection 8 | | where PreciseTimeStamp >= startTime and PreciseTimeStamp <= endTime 9 | | where Service =~ service and ScaleUnit =~ scaleUnit 10 | | where Environment == "PROD" 11 | | where DeploymentSlot == "Production" 12 | | where Generation == 2 13 | | where ProcessName in ("w3wp", "TfsJobAgent") 14 | | make-series Count = count() on PreciseTimeStamp in range(startTime, endTime, diff/4) by ProcessName 15 | | mv-expand PreciseTimeStamp, Count 16 | | where Count > 0 17 | | extend Number = toint(Count) 18 | | extend Time = todatetime(PreciseTimeStamp) 19 | | order by Time desc -------------------------------------------------------------------------------- /queries/ja/JASqlTime.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let interval = 1m; 6 | JobHistory 7 | | where PreciseTimeStamp between (startTime .. 
endTime) 8 | | where Service =~ service 9 | | where ScaleUnit =~ scaleUnit 10 | | summarize sum(SqlExecutionTime) by Plugin, JobName, JobSource 11 | | sort by sum_SqlExecutionTime desc 12 | | limit 25 -------------------------------------------------------------------------------- /queries/run/PlanInfo.csl: -------------------------------------------------------------------------------- 1 | // Pull a useful collection of data about a specific plan 2 | // 3 | let oid = {OrchestrationId}; 4 | let pid = substring(oid, 0, 36); 5 | let HostNames = ServiceHostAggregated 6 | | where Service in ("tfs", "pipelines") 7 | | summarize by HostId, Name; 8 | let RingMap = union Ring_Mapping("tfs"), Ring_Mapping("pipelines"); 9 | OrchestrationPlanContext 10 | | where PlanId == pid 11 | | summarize RoleInstances=make_set(RoleInstance), JobOrchestrationIds=make_set(OrchestrationId) by HostId, ScaleUnit, Region, Tenant, ProjectName, PlanType, DefinitionId, DefinitionName 12 | | join kind=leftouter HostNames on HostId | project-away HostId1 13 | | join kind=leftouter RingMap on ScaleUnit| project-away ScaleUnit1 -------------------------------------------------------------------------------- /queries/run/WhatHappened.csl: -------------------------------------------------------------------------------- 1 | // OK SO WHAT HAPPENED: an end-to-end analysis based on DistributedTask OrchestrationId 2 | // zacox@microsoft.com 3 | // 4 | let oid = {OrchestrationId}; 5 | let planId = substring(oid, 0, 36); 6 | let hostId = tostring(toscalar(OrchestrationPlanContext | where PlanId == planId | summarize by HostId)); 7 | let hostName = tostring(toscalar(ServiceHostAggregated() | where HostId == hostId | take 1 | project Name)); 8 | find in (ActivityLog, AgentPoolRequestHistory, HttpOutgoingRequests, OrchestrationLog, OrchestrationPlanContext, ProductTrace) 9 | where OrchestrationId startswith planId 10 | project PreciseTimeStamp, 11 | OrchestrationId, 12 | Command, 13 | Message 14 | // Tracepoint, 15 | // UrlPath, 16 | // ResponseCode, 17 | // Level, // for coloring :) 18 | // ActivityId, 19 | // ExceptionMessage 20 | // | extend HostName = hostName 21 | | order by PreciseTimeStamp asc -------------------------------------------------------------------------------- /queries/sla/SLADurationAnalysis.csl: -------------------------------------------------------------------------------- 1 | // Identify all orchestrations that are currently out of SLA 2 | // zacox@microsoft.com 3 | // 4 | let triggerTime = {TriggerTime}; // now(), datetime(2019-07-07 20:20:20Z) 5 | let service = {Service}; // "tfs", "releasemanagement"; 6 | let scaleUnit = {ScaleUnit}; // "tfs-wus-0" 7 | let lookback = {Lookback}; // how often does the query run? 8 | // 9 | let startTime = triggerTime - 15m; // allow for kusto ingestion 10 | let maxPhaseDuration = 2h; // maximum amount of time before treating the phase as missing telemetry 11 | let searchStartTime = startTime - lookback; 12 | let slaLimit = 5m; 13 | let prodTracePartitions = 12; 14 | let sev2Threshold = 25; // how many need to break SLA per scale unit to alert? 
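// timeline (sketch): searchStartTime ..(lookback).. startTime ..(15m ingestion headroom).. triggerTime // i.e. completed plans are picked up if they finished inside [startTime - lookback .. startTime]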
15 | // 16 | // 17 | // Consider running plans, and any that completed since the last trigger 18 | let StartedPlans = ProductTrace 19 | | where PreciseTimeStamp < startTime 20 | and Tracepoint == 10015547 // TaskHub tracepoint 21 | and (isempty(service) or Service == service) 22 | | project 23 | PlanId = OrchestrationId, 24 | StartTime = PreciseTimeStamp; 25 | let FinishedPlans = ProductTrace 26 | | where PreciseTimeStamp < startTime 27 | and Tracepoint == 0 // TODO: add a tracepoint :( 28 | and (isempty(service) or Service == service) 29 | and Message startswith "Completed orchestration with result" 30 | | project 31 | PlanId = OrchestrationId, 32 | FinishTime = PreciseTimeStamp; 33 | let RecentlyCompletedPlans = FinishedPlans 34 | | where FinishTime > searchStartTime 35 | | join hint.strategy=shuffle hint.num_partitions = prodTracePartitions 36 | StartedPlans on PlanId 37 | | project 38 | PlanId, 39 | Completed = true, 40 | PlanDuration = FinishTime - StartTime; 41 | let RunningPlans = StartedPlans 42 | | join hint.strategy=shuffle hint.num_partitions = prodTracePartitions 43 | kind=leftanti 44 | FinishedPlans on PlanId 45 | | project 46 | PlanId, 47 | Completed = false, 48 | PlanDuration = startTime - StartTime; 49 | let PlansToConsider = union RecentlyCompletedPlans //, RunningPlans // zacox: ignore running plans -- telemetry is too flakey 50 | | where PlanDuration > slaLimit; // only consider plans that could be out of range 51 | let PlanIds = PlansToConsider | project PlanId; 52 | // PlansToConsider | summarize hint.strategy = shuffle count() by Completed 53 | // 54 | // 55 | // Only phases with execution time limits are considered to contribute to the SLA 56 | let OrchestrationLogSubset = OrchestrationLog 57 | | where PreciseTimeStamp < startTime 58 | | extend PlanId = substring(OrchestrationId, 0, 36) 59 | | where PlanId in (PlanIds); // only consider plans in our subset 60 | let OrchestrationLogWithSLA = PhaseExecutionTimeOverrides() 61 | | where Application == "Pipelines" 62 | | join 63 | kind=rightouter 64 | OrchestrationLogSubset 65 | on Application, Feature, Command 66 | | extend IsSLA = isnotnull(ExecutionTimeThresholdOverrideInMicroseconds) 67 | | project PreciseTimeStamp, Service, Region, ScaleUnit, Application, Feature, Command, ExecutionTimeThreshold, 68 | OrchestrationId, StartTime, EndTime, IsExceptionExpected, ExceptionMessage, ExceptionType, 69 | PlanId, IsSLA; 70 | // 71 | // 72 | // big ol' map/reduce 73 | let NullTime = datetime(1601-01-01 00:00:00.0000000); 74 | let IsValidDate = (dt:datetime) { isnotnull(dt) and dt != NullTime }; 75 | let parallelism = 8; 76 | let PhaseData = range p from 1 to parallelism step 1 | partition by p 77 | { 78 | OrchestrationLogWithSLA 79 | | where hash(PlanId, parallelism) == toscalar(p) 80 | | extend EventTime = max_of(StartTime, EndTime) 81 | | order by OrchestrationId, EndTime asc, StartTime asc 82 | // 83 | // generate logical variables 84 | | extend IsNextSameOrchestration = (next(OrchestrationId) == OrchestrationId) 85 | | extend NextEndTime = next(EndTime) 86 | | extend IsEndPhase = IsValidDate(EndTime) 87 | | extend IsLastPhase = IsNextSameOrchestration and IsValidDate(NextEndTime) 88 | | extend NextStartTime = next(StartTime) 89 | // 90 | // generate useful variables 91 | | extend PhaseStartTime = EventTime 92 | | extend PhaseEndTime = case( 93 | IsEndPhase, EndTime, 94 | IsLastPhase, NextEndTime, 95 | IsNextSameOrchestration, NextStartTime, // marks the end of an Orchestration 96 | // min_of(startTime, PhaseStartTime + 
MaxPhaseDuration)) // original 97 | startTime) // currently running or dropped-telemetry phases are set to 0s 98 | | extend PhaseEndTime = iff(PhaseEndTime - PhaseStartTime > maxPhaseDuration, PhaseStartTime, PhaseEndTime) 99 | | extend OidComponents = split(OrchestrationId, ".") 100 | | extend OidLookback = iff(isnull(tolong(OidComponents[-1])), -2, -3) // ignore attempt numbers 101 | | extend ParentOrchestrationId = strcat_array(array_slice(OidComponents, 1, OidLookback), ".") 102 | | extend SLADuration = iff(IsSLA, PhaseEndTime - PhaseStartTime, 0s) 103 | | project Service, Region, ScaleUnit, PhaseStartTime, SLADuration, PhaseEndTime, 104 | PlanId, OrchestrationId, ParentOrchestrationId 105 | }; 106 | // 107 | // 108 | // Collect precise phase duration data by plan id. 109 | let PlanData = PhaseData 110 | | summarize hint.strategy=shuffle 111 | ShortOrchestrationIds = make_list(substring(OrchestrationId, 37)), // remove the guid 112 | PhaseEndTimes = make_list(PhaseEndTime), 113 | SLADurations = make_list(SLADuration) 114 | by PlanId; 115 | // 116 | // 117 | // Compute final SLA violation table 118 | let Result = PhaseData 119 | | summarize hint.strategy=shuffle 120 | SLADuration = sum(SLADuration), // find total 121 | OrchestrationStartTime = min(PhaseStartTime) 122 | by Service, Region, ScaleUnit, OrchestrationId, ParentOrchestrationId, PlanId 123 | // 124 | // OK: join with table of all phase info by plan id, and sum up all the contributing ancestral SLA durations 125 | | lookup (PlanData) on PlanId 126 | | mv-apply 127 | AncestorPhaseEndTime = PhaseEndTimes to typeof(datetime), 128 | AncestorPhaseDuration = SLADurations to typeof(timespan), 129 | Soid = ShortOrchestrationIds to typeof(string) 130 | on ( 131 | where AncestorPhaseEndTime <= OrchestrationStartTime // only phases that completed before the first phase of this orchestration 132 | | where ParentOrchestrationId startswith Soid // only ancestor orchestrations 133 | | summarize AncestorSLADuration = sum(AncestorPhaseDuration) 134 | ) 135 | | project-away PhaseEndTimes, SLADurations, ShortOrchestrationIds 136 | // 137 | // 138 | // SLA Enforcement 139 | | where SLADuration + AncestorSLADuration between(slaLimit..maxPhaseDuration) 140 | | lookup (PlansToConsider) on PlanId 141 | | project Service, Region, ScaleUnit, PlanId, OrchestrationId, 142 | PlanDuration = PlanDuration / 1s, 143 | TotalSLADuration = (SLADuration + AncestorSLADuration) / 1s 144 | ; 145 | // 146 | // 147 | Result 148 | // 149 | // Kalypso monitor mode: only take any if there are more than a certain number of problems 150 | // | summarize hint.strategy=shuffle 151 | // NumberOfOrchestrations=count(), 152 | // NumberOfPlans = dcount(PlanId), 153 | // AvgSLADuration = avg(TotalSLADuration), 154 | // MaxSLADuration = max(TotalSLADuration), 155 | // MinSLADuration = min(TotalSLADuration) 156 | // by Service, Region, ScaleUnit 157 | // | where NumberOfOrchestrations > sev2Threshold 158 | // | project TriggerTime = triggerTime, Lookback=lookback, Service, Region, ScaleUnit, 159 | // MinSLADuration, AvgSLADuration, MaxSLADuration, NumberOfOrchestrations, NumberOfPlans 160 | -------------------------------------------------------------------------------- /queries/sla/SLAVisualization.csl: -------------------------------------------------------------------------------- 1 | // CIAO phase performance analysis 2 | // zacox@microsoft.com 3 | // 4 | let oid = {OrchestrationId}; // "7d1c09de-35e8-44ed-8720-279b117caf1d.job_1.__default.135"; 5 | let su = {ScaleUnit}; 6 | let currentTime = now() - 15m; // always use the most up-to-date data 7 | let oidComponents = split(oid, "."); 8 | let oidLookback = iff(isnull(tolong(oidComponents[-1])), -2, -3); // ignore attempt numbers
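 // e.g. (sketch) for the oid in the comment above: split gives ["7d1c09de-...", "job_1", "__default", "135"]; the trailing "135" parses as a number, so oidLookback = -3 and parentOid = "7d1c09de-....job_1" -- without an attempt suffix, -2 drops just the ".__default" scope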
9 | let parentOid = strcat_array(array_slice(oidComponents, 0, oidLookback), "."); 10 | let planid = substring(oid, 0, 36); 11 | let useSev2Thresholds = true; 12 | let maxPhaseDuration = 2h; 13 | let definitionName = toscalar(OrchestrationPlanContext 14 | | where OrchestrationId startswith planid 15 | | project DefinitionName | take 1); 16 | // 17 | // 18 | // compute phase performance 19 | let IsValidDate = (dt:datetime) { isnotnull(dt) and dt != datetime(1601-01-01 00:00:00.0000000) }; 20 | OrchestrationLog 21 | | where isempty(su) or ScaleUnit == su 22 | | where Application == "Pipelines" and Feature == "Build" 23 | | where oid == OrchestrationId or (isnotempty(OrchestrationId) and parentOid startswith OrchestrationId) 24 | | lookup 25 | (PhaseExecutionTimeOverrides() | where Application == "Pipelines" and Feature == "Build") 26 | on Application, Feature, Command 27 | | extend ExecutionTimeThreshold = iff( 28 | isnotnull(ExecutionTimeThresholdOverrideInMicroseconds), 29 | ExecutionTimeThresholdOverrideInMicroseconds * 1microsecond, 30 | max_of(0s, ExecutionTimeThreshold * 1microsecond)) 31 | | project OrchestrationId, Command, ExecutionTimeThreshold, StartTime, EndTime, ScaleUnit, Region 32 | // 33 | // compute phase durations 34 | | order by OrchestrationId, EndTime asc, StartTime asc // cluster by orchestration id 35 | | extend IsNextSameOrchestration = (next(OrchestrationId) == OrchestrationId) 36 | | extend NextEndTime = next(EndTime) 37 | | extend IsEndPhase = IsValidDate(EndTime) 38 | | extend IsLastPhase = IsNextSameOrchestration and IsValidDate(NextEndTime) 39 | | extend IsPrevSameOrchestration = (prev(OrchestrationId) == OrchestrationId) 40 | | extend NextStartTime = next(StartTime) 41 | | extend PhaseStartTime = max_of(StartTime, EndTime) 42 | | extend PhaseEndTime = case( 43 | IsEndPhase, EndTime, 44 | IsLastPhase, NextEndTime, 45 | IsNextSameOrchestration, NextStartTime, // Ending of Orchestration 46 | currentTime) 47 | | extend Duration = PhaseEndTime - PhaseStartTime 48 | | extend Difference = Duration - ExecutionTimeThreshold 49 | | extend PercentDifference = 50 | iff(ExecutionTimeThreshold > 0s, 51 | round(100 * todouble(Difference / 1microsecond) / todouble(ExecutionTimeThreshold / 1microsecond), 2), 52 | double(0)) 53 | // 54 | // analytics 55 | | order by EndTime asc, StartTime asc 56 | | project PlanId = planid, 57 | OrchestrationId = strcat_array(array_slice(split(OrchestrationId, '.'), 1, 300), '.'), 58 | DefinitionName = definitionName, 59 | ScaleUnit, 60 | Region, 61 | PhaseName = Command, 62 | // Actual=Duration, 63 | // Expected=ExecutionTimeThreshold, 64 | PercentDifference, 65 | OwningTeam = split(Command, ".")[0], 66 | Level = case( 67 | PercentDifference > 50, 2, // "Very Suspicious" 68 | PercentDifference > 20, 3, // "Suspicious" 69 | 0) // "Normal" -------------------------------------------------------------------------------- /queries/sql/CpuActivity.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let scaleUnit = "{su}"; 5 | let service = "{service}"; 6 | XEventDataRPCCompleted 7 | | where EventTime between (startTime .. 
endTime) 8 | | where Service == service 9 | | where ScaleUnit == scaleUnit 10 | | where DatabaseName == dbName 11 | | join kind=inner 12 | ( 13 | ActivityLog 14 | | where StartTime <= endTime and TIMESTAMP >= startTime // Activity started before endTime and finished after startTime 15 | | where Service == service 16 | | where ScaleUnit == scaleUnit 17 | | extend Agent = iff(UserAgent contains "mozilla", "Browser", UserAgent) 18 | | summarize by Application, Command, ApplicationHash, CommandHash, Agent 19 | ) on ApplicationHash, CommandHash 20 | | summarize sum(CpuTime) by HostId, VSID, Application, Command, Agent, bin(EventTime, 5m) 21 | | order by sum_CpuTime desc -------------------------------------------------------------------------------- /queries/sql/CpuJob.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let scaleUnit = "{su}"; 5 | let service = "{service}"; 6 | XEventDataRPCCompleted 7 | | where EventTime between (startTime .. endTime ) 8 | | where ScaleUnit == scaleUnit and DatabaseName == dbName 9 | | join kind=inner 10 | ( 11 | JobHistory 12 | | where StartTime <= endTime and PreciseTimeStamp >= startTime 13 | | where Service == service and ScaleUnit == scaleUnit 14 | | summarize by Plugin, HostId=JobSource, UniqueIdentifier=JobId 15 | ) on HostId, UniqueIdentifier 16 | | summarize sum(CpuTime) by Plugin, HostId 17 | | top 20 by sum_CpuTime desc -------------------------------------------------------------------------------- /queries/sql/CpuTop.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | QDS 5 | | where TIMESTAMP between (startTime .. endTime) 6 | | where DatabaseName == dbName 7 | | summarize sum(TotalCpuTime), sum(TotalPhysicalReads), sum(TotalLogicalReads), sum(TotalExecutions), sum(TotalExceptions) by QueryText 8 | | top 10 by sum_TotalCpuTime desc -------------------------------------------------------------------------------- /queries/sql/CpuXEvent.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let XEventTypes = datatable (TypeName:string, Type:long) 5 | [ "None", 0, 6 | "Activity", 1 , 7 | "Job", 2, 8 | "Task" , 4, 9 | "JobTask", 6, 10 | "Notification", 8, 11 | "Pipeline", 16, 12 | "PipelineActivity", 17, 13 | "PipelineJob", 18, 14 | "AnonymousActivity", 33, 15 | "PublicActivity", 65, 16 | "Other", 128 ]; 17 | XEventDataRPCCompleted 18 | | where EventTime between (startTime .. endTime) 19 | | where DatabaseName == dbName 20 | | join kind=leftouter (XEventTypes) on Type 21 | | summarize sum(CpuTime) by TypeName, ObjectName 22 | | order by sum_CpuTime desc -------------------------------------------------------------------------------- /queries/sql/GetData.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | DatabasePerformanceStatistics 5 | | where TIMESTAMP between (startTime .. 
endTime) 6 | | where DatabaseName =~ dbName 7 | | take 1 -------------------------------------------------------------------------------- /queries/sql/WhatsSlow.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | DatabasePerformanceStatistics 5 | | where TIMESTAMP between (startTime .. endTime) 6 | | where DatabaseName =~ dbName 7 | | summarize avg(AverageCpuPercentage), avg(AverageMemoryUsagePercentage), avg(AverageLogWriteUtilizationPercentage), max(MaximumWorkerPercentage) by bin(PeriodStart, 1m), ServiceObjective -------------------------------------------------------------------------------- /run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Plan\n", 8 | "Everything we know about the plan.\n", 9 | "\n", 10 | "### Instructions\n", 11 | "1. Run all cells! (click on Menu > Cell > Run All Cells)\n", 12 | "1. View report at the bottom." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "inputHidden": false, 20 | "outputHidden": false, 21 | "tags": [ 22 | "parameters" 23 | ] 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "#planId = \"98db70e2-cee5-4e2d-ae15-dca389fa8f41\"\n", 28 | "planId = \"f38f1a4b-49d7-4f08-a9b9-c81b2c39aff6\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "inputHidden": false, 36 | "outputHidden": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%%capture \n", 41 | "# install packages, setup workspace root\n", 42 | "!pip install --upgrade azure-kusto-notebooks plotly\n", 43 | "import os\n", 44 | "from azure.kusto.notebooks import utils as akn\n", 45 | "import pandas as pd\n", 46 | "pd.options.display.html.table_schema = True\n", 47 | "\n", 48 | "# cwd should be workspace root\n", 49 | "if os.path.basename(os.getcwd()) == 'devops-pipelines':\n", 50 | " os.chdir(os.pardir)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "inputHidden": false, 58 | "outputHidden": false 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# authenticate kusto client\n", 63 | "# you will need to copy the token into a browser window for AAD auth. 
\n", 64 | "client = akn.get_client('https://vso.kusto.windows.net')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "inputHidden": false, 72 | "outputHidden": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# collect basic plan info\n", 77 | "plan_info = akn.Query(\n", 78 | " client, 'VSO', \n", 79 | " path=os.path.join('devops-pipelines', 'queries', 'run', 'PlanInfo.csl'), \n", 80 | " params={'OrchestrationId': akn.quote(planId)})\n", 81 | "\n", 82 | "# collect full plan history\n", 83 | "what_happened = akn.Query(client, 'VSO',\n", 84 | " path=os.path.join('devops-pipelines', 'queries', 'run', 'WhatHappened.csl'),\n", 85 | " params={'OrchestrationId': akn.quote(planId)})\n", 86 | "\n", 87 | "# fetch data in parallel\n", 88 | "akn.run((plan_info, what_happened))" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "inputHidden": false, 96 | "outputHidden": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# draw basic info\n", 101 | "\n", 102 | "# compute relative time stamps\n", 103 | "history = what_happened.dataframe\n", 104 | "t0 = history['PreciseTimeStamp'].iloc[0]\n", 105 | "history['Time'] = history.apply(lambda row: row['PreciseTimeStamp'] - t0, axis=1)\n", 106 | "history.OrchestrationId = history.apply(lambda row: row.OrchestrationId[37:], axis=1)\n", 107 | "\n", 108 | "# record critical times\n", 109 | "def find_time(message):\n", 110 | " r = history[history.Message.str.startswith(message)]\n", 111 | " if len(r.index) > 0:\n", 112 | " return r['PreciseTimeStamp'].iloc[0]\n", 113 | "\n", 114 | "create_time = find_time('Created plan')\n", 115 | "start_time = find_time('Started plan')\n", 116 | "end_time = find_time('Completed orchestration with result')\n", 117 | "total_duration = end_time - start_time if end_time and start_time else None\n", 118 | "\n", 119 | "import importlib\n", 120 | "importlib.reload(akn)\n", 121 | "# info will only exist if the plan has started at least one job :(\n", 122 | "d = akn.pandas_row_to_dictionary(plan_info.dataframe)\n", 123 | "d['create time'] = create_time\n", 124 | "d['start time'] = start_time\n", 125 | "d['end time'] = end_time\n", 126 | "d['total duration'] = total_duration\n", 127 | "r = akn.Report()\n", 128 | "r.write(akn.to_md_table(d))\n", 129 | "\n", 130 | "from IPython.display import Markdown\n", 131 | "Markdown(r.content)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "inputHidden": false, 139 | "outputHidden": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "# SLA analysis\n", 144 | "su = akn.quote(d.get('ScaleUnit', ''))\n", 145 | "oids = [akn.quote(joid) for joid in d.get('JobOrchestrationIds', [])]\n", 146 | "slas = [akn.Query(client, 'VSO', \n", 147 | " os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl'),\n", 148 | " params=dict(ScaleUnit=su, OrchestrationId=oid)) \n", 149 | " for oid in oids]\n", 150 | "akn.run(slas)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "inputHidden": false, 158 | "outputHidden": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "# draw all slas\n", 163 | "from _plotly_future_ import v4_subplots\n", 164 | "from plotly.subplots import make_subplots\n", 165 | "import plotly.graph_objects as go\n", 166 | "import math\n", 167 | "if not slas:\n", 168 | " print(\"There are no jobs associated with this plan.\")\n", 169 | 
"else:\n", 170 | " number_of_graphs = min(25, len(slas))\n", 171 | " names = [n[37:] for n in d.get('JobOrchestrationIds',[])]\n", 172 | " fig = make_subplots(cols=2, rows=int(math.ceil(number_of_graphs / 2)), \n", 173 | " subplot_titles=names,\n", 174 | " shared_xaxes=True, \n", 175 | " vertical_spacing=0.1)\n", 176 | "\n", 177 | " for i in range(len(slas)):\n", 178 | " df = slas[i].dataframe\n", 179 | " row = int(i / 2) + 1\n", 180 | " col = int(i % 2) + 1\n", 181 | " name = names[i]\n", 182 | " \n", 183 | " df = slas[0].dataframe\n", 184 | " fig.add_trace(go.Bar(x=df.PhaseName, y=df.PercentDifference, name=name), \n", 185 | " row=row, col=col)\n", 186 | " fig.update_xaxes(showgrid=False, tickangle=-60, automargin=True)\n", 187 | " fig.update_xaxes(showgrid=True, zeroline=True, automargin=True)\n", 188 | " fig.update_layout(height=150 * number_of_graphs, \n", 189 | " width=1000, showlegend=False,\n", 190 | " title_text=\"Analysis!\")\n", 191 | "\n", 192 | " fig.show()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "inputHidden": false, 200 | "outputHidden": false 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "# draw full history\n", 205 | "columns_to_ignore = ('source_', 'PreciseTimeStamp')\n", 206 | "columns = ['Time'] + [c for c in history.columns if c not in columns_to_ignore and c != 'Time']\n", 207 | "\n", 208 | "from IPython.display import HTML\n", 209 | "HTML(history[columns].to_html(index=False))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "inputHidden": false, 217 | "outputHidden": false 218 | }, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernel_info": { 225 | "name": "python3" 226 | }, 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.7.4" 243 | }, 244 | "nteract": { 245 | "version": "0.15.0" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /sla.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SLA Investigation\n", 8 | "1. Run all cells! (click on Menu > Cell > Run All Cells)\n", 9 | "1. View report at the bottom." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "inputHidden": false, 17 | "outputHidden": false, 18 | "tags": [ 19 | "parameters" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "triggerTime = \"2019-10-15T20:21:54.0330000Z\"\n", 25 | "scaleUnit = \"pipelines-ghub-eus2-2\"\n", 26 | "service = \"pipelines\"\n", 27 | "lookback = \"1h\"\n", 28 | "region = \"\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "inputHidden": false, 36 | "outputHidden": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%%capture \n", 41 | "\n", 42 | "# install packages, setup workspace root\n", 43 | "!pip install --upgrade pip azure-kusto-notebooks\n", 44 | "import os\n", 45 | "import sys\n", 46 | "import datetime\n", 47 | "import pandas as pd\n", 48 | "import numpy as np\n", 49 | "import matplotlib\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "pd.options.display.html.table_schema = True\n", 52 | "import concurrent.futures\n", 53 | "from azure.kusto.notebooks import utils as akn\n", 54 | "\n", 55 | "# cwd should be workspace root\n", 56 | "if os.path.basename(os.getcwd()) == 'devops-pipelines':\n", 57 | " os.chdir(os.pardir)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "inputHidden": false, 65 | "outputHidden": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# authenticate kusto client\n", 70 | "# you will need to copy the token into a browser window for AAD auth. \n", 71 | "client = akn.get_client('https://vso.kusto.windows.net')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "inputHidden": false, 79 | "outputHidden": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# find orchestrations that violate SLA\n", 84 | "params = {\n", 85 | " 'TriggerTime': akn.to_kusto_datetime(triggerTime),\n", 86 | " 'Lookback': akn.to_kusto_timespan(lookback),\n", 87 | " 'Service': '\"' + service + '\"', \n", 88 | " 'Region': '\"' + region + '\"',\n", 89 | " 'ScaleUnit': '\"' + scaleUnit + '\"'\n", 90 | "}\n", 91 | "query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLADurationAnalysis.csl')\n", 92 | "violations = akn.execute_file(client, database='VSO', path=query, params=params)\n", 93 | "# violations" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "inputHidden": false, 101 | "outputHidden": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# collect problematic orchestration ids\n", 106 | "result = violations.primary_results[0]\n", 107 | "oid_column_index = next((c.ordinal for c in result.columns if c.column_name == 'OrchestrationId'), None)\n", 108 | "su_column_index = next((c.ordinal for c in result.columns if c.column_name == 'ScaleUnit'), None)\n", 109 | "\n", 110 | "# group\n", 111 | "by_su = {}\n", 112 | "for r in result.rows:\n", 113 | " su = r[su_column_index]\n", 114 | " oid = r[oid_column_index]\n", 115 | " l = by_su.get(su, [])\n", 116 | " by_su[su] = l\n", 117 | " l.append(oid)\n", 118 | "\n", 119 | "max_scale_units = []\n", 120 | "max_problems = 0\n", 121 | "for k,v in by_su.items():\n", 122 | " c = len(v)\n", 123 | " if c > max_problems:\n", 124 | " max_problems = c\n", 125 | " max_scale_units = [k]\n", 126 | " elif c == max_problems:\n", 127 | " max_scale_units.append(k)\n", 128 | "max_scale_units.sort()\n", 129 | "\n", 130 | "# for su, oids in by_su.items():\n", 131 | "# 
print(su)\n", 132 | "# for oid in oids:\n", 133 | "# print(' ', oid)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "inputHidden": false, 141 | "outputHidden": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# collect visualization data sets\n", 146 | "query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl')\n", 147 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 148 | " hfs = [executor.submit(akn.execute_file, client, 'VSO', query, \n", 149 | " {\n", 150 | " 'ScaleUnit': '\"' + r[su_column_index] + '\"', \n", 151 | " 'OrchestrationId': '\"' + r[oid_column_index] + '\"'\n", 152 | " }) for r in result.rows]\n", 153 | " histories = [h.result() for h in concurrent.futures.as_completed(hfs)]\n", 154 | "\n", 155 | "# convert to data frames\n", 156 | "primary_results = [h.primary_results[0] for h in histories]\n", 157 | "dataframes = None\n", 158 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 159 | " dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in primary_results]\n", 160 | " dataframes = [dff.result() for dff in concurrent.futures.as_completed(dataframe_futures)]\n", 161 | "histories = None\n", 162 | "\n", 163 | "# try to filter out false positives: require that the key phases were actually recorded.\n", 164 | "required_phases = ('RunAgentJob.SendJob', 'RunAgentJob.JobCompleted')\n", 165 | "filtered_dataframes = [df for df in dataframes if all(p in df['PhaseName'].values for p in required_phases)]\n", 166 | "number_of_false_positives = len(dataframes) - len(filtered_dataframes)\n", 167 | "dataframes = filtered_dataframes\n", 168 | "plans_out_of_sla = [df['PlanId'].iat[0] for df in dataframes]\n", 169 | "number_of_violations = len(dataframes)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "inputHidden": false, 177 | "outputHidden": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "worst_phaseName = ''\n", 182 | "worst_count = 0\n", 183 | "worst_team = ''\n", 184 | "\n", 185 | "if dataframes:\n", 186 | " # what was the worst phase?\n", 187 | " combined = pd.concat(dataframes, ignore_index=True)\n", 188 | " df = combined.loc[combined['Level'] == 2].groupby(['PhaseName']).size().to_frame('Count').nlargest(1, 'Count')\n", 189 | " if len(df.index) > 0:\n", 190 | " worst_phaseName = df.index[0]\n", 191 | " worst_count = df.iat[0, 0]\n", 192 | " worst_team = worst_phaseName.split('.')[0]\n", 193 | " \n", 194 | " # what was the worst plan?\n", 195 | " violations_df = akn.to_dataframe(violations.primary_results[0])\n", 196 | " df = violations_df.groupby(['PlanId']).size().to_frame('Count').nlargest(1, 'Count')\n", 197 | " plan_with_most_violations = df.index[0]\n", 198 | " plan_with_most_violations_count = df.iat[0, 0]" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "inputHidden": false, 206 | "outputHidden": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "if number_of_false_positives:\n", 211 | " print(number_of_false_positives, 'plans are likely missing kusto data and were ignored.')\n", 212 | "if number_of_violations <= 0:\n", 213 | " print('no problems detected')\n", 214 | "else:\n", 215 | " for su in max_scale_units:\n", 216 | " print(max_problems, 'of the problems were in', su)\n", 217 | " \n", 218 | " print(number_of_violations, \n", 219 | " 'plans' if number_of_violations > 1 else 'plan', \n",
 220 | " 'had no apparent data problems and', \n", 221 | " 'are' if number_of_violations > 1 else 'is', \n", 222 | " 'out of SLA.')\n", 223 | " \n", 224 | " if plan_with_most_violations in plans_out_of_sla:\n", 225 | " print(plan_with_most_violations, 'had the most violations with', plan_with_most_violations_count)\n", 226 | " \n", 227 | " if worst_phaseName:\n", 228 | " print('\"' + worst_phaseName + '\"', 'was the slowest phase in', worst_count, \n", 229 | " 'of the', number_of_violations, 'SLA violations.')\n", 230 | " \n", 231 | " print('\\nConclusion:')\n", 232 | " if number_of_violations > 5: \n", 233 | " print('This is likely a real problem. Open an ICM against scale units:', max_scale_units)\n", 234 | " print('Initially route it to:', worst_team)\n", 235 | " else: \n", 236 | " print('Too much uncertainty -- do not open any ICMs.')\n", 237 | " \n", 238 | " if number_of_false_positives and float(number_of_false_positives) / float(max_problems) > .5:\n", 239 | " for su in max_scale_units:\n", 240 | " print(su, 'might be unhealthy based on the number of plans missing kusto data.')\n", 241 | " " 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "inputHidden": false, 249 | "outputHidden": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "%matplotlib inline\n", 254 | "plt.rcdefaults()\n", 255 | "\n", 256 | "if dataframes:\n", 257 | " number_of_graphs = min(25, len(dataframes))\n", 258 | " fig, axes = plt.subplots(nrows=number_of_graphs, ncols=1, figsize=(8, 6 * number_of_graphs), constrained_layout=True)\n", 259 | " for i in range(number_of_graphs):\n", 260 | " df = dataframes[i]\n", 261 | " ax = axes[i] if number_of_graphs > 1 else axes\n", 262 | " ax.axhline(0, color='k')\n", 263 | "\n", 264 | " x = df['PhaseName']\n", 265 | " xpos = np.arange(len(x))\n", 266 | " y = df['PercentDifference']\n", 267 | " plan_id = df['PlanId'].iloc[0]\n", 268 | " violation_row = violations_df.loc[violations_df['PlanId'] == plan_id]\n", 269 | " title = '\\n'.join([\n", 270 | " 'plan id: ' + plan_id,\n", 271 | " 'scale unit: ' + str(violation_row['ScaleUnit'].iloc[0]),\n", 272 | " 'definition: ' + str(df['DefinitionName'].iloc[0]),\n", 273 | " 'plan duration: ' + str(violation_row['PlanDuration'].iloc[0]),\n", 274 | " 'sla duration: ' + str(violation_row['TotalSLADuration'].iloc[0]),\n", 275 | " ])\n", 276 | " ax.title.set_text(title)\n", 277 | "\n", 278 | " ax.bar(x=xpos, height=y)\n", 279 | " ax.set_xticks(xpos)\n", 280 | " ax.set_xticklabels(x, rotation=45, ha=\"right\")\n", 281 | "\n", 282 | "# output_filename = 'analysis.svg'\n", 283 | "# plt.savefig(output_filename, format='svg')" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernel_info": { 289 | "name": "python3" 290 | }, 291 | "kernelspec": { 292 | "display_name": "Python 3", 293 | "language": "python", 294 | "name": "python3" 295 | }, 296 | "language_info": { 297 | "codemirror_mode": { 298 | "name": "ipython", 299 | "version": 3 300 | }, 301 | "file_extension": ".py", 302 | "mimetype": "text/x-python", 303 | "name": "python", 304 | "nbconvert_exporter": "python", 305 | "pygments_lexer": "ipython3", 306 | "version": "3.7.4" 307 | }, 308 | "nteract": { 309 | "version": "0.15.0" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /sql.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SQL Investigation\n", 8 | "1. Run all cells.\n", 9 | "1. View report at the bottom." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "inputHidden": false, 17 | "outputHidden": false, 18 | "tags": [ 19 | "parameters" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# These are just defaults will be overwritten if you use nimport pip\n", 25 | "db = \"Tfs_tfsprodcus2_37253a68-972a-4bf4-8c5f-a259ba4d42cd\"\n", 26 | "start = \"2019-07-31T17:30:00.0000000Z\"\n", 27 | "end = \"2019-07-31T18:30:36.0000000Z\"\n", 28 | "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n", 29 | "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "inputHidden": false, 37 | "outputHidden": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "%%capture\n", 42 | "!pip install --upgrade nimport azure-kusto-notebooks" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "inputHidden": false, 50 | "outputHidden": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# Import the things we use\n", 55 | "\n", 56 | "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n", 57 | "# %kql is single line magic\n", 58 | "# %%kql is cell magic\n", 59 | "\n", 60 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 61 | "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n", 62 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 63 | "\n", 64 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 65 | "import pandas as pd\n", 66 | "pd.options.display.html.table_schema = True\n", 67 | "from pandas import Series, DataFrame\n", 68 | "from datetime import datetime, timedelta, timezone\n", 69 | "from urllib.parse import urlencode, quote_plus\n", 70 | "from requests.utils import requote_uri\n", 71 | "import time\n", 72 | "import numpy as np\n", 73 | "from matplotlib import pyplot as plt\n", 74 | "from nimport.utils import tokenize, open_nb\n", 75 | "import json\n", 76 | "import os\n", 77 | "import calendar as cal\n", 78 | "import concurrent.futures\n", 79 | "from azure.kusto.notebooks import utils as akn" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "inputHidden": false, 87 | "outputHidden": false 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "params = {\n", 92 | " \"db\": db,\n", 93 | " \"start\": start,\n", 94 | " \"end\": end,\n", 95 | " \"url\": url,\n", 96 | " \"baseUrl\": baseUrl\n", 97 | "}\n", 98 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 99 | "queryPath = os.path.join(root, 'queries')\n", 100 | " " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "inputHidden": false, 108 | "outputHidden": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "# authenticate kusto client\n", 113 | "# you will need to copy the token into a browser window for AAD auth. 
\n", 114 | "client = akn.get_client('https://vso.kusto.windows.net')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "inputHidden": false, 122 | "outputHidden": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "sqlPath = os.path.join(queryPath, 'sql')\n", 127 | "q_data = os.path.join(sqlPath, \"GetData.csl\")\n", 128 | "q_whatsSlow = os.path.join(sqlPath, \"WhatsSlow.csl\")\n", 129 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 130 | " # materialize so that we have all information we might need\n", 131 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_data, params)\n", 132 | " q_data_df = akn.to_dataframe_from_future(p1)\n", 133 | " params[\"service\"] = q_data_df[\"Service\"][0]\n", 134 | " params[\"su\"] =q_data_df[\"ScaleUnit\"][0]\n", 135 | " \n", 136 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatsSlow, params)\n", 137 | "\n", 138 | "q_whatsSlow_df = akn.to_dataframe_from_future(p2) \n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "inputHidden": false, 146 | "outputHidden": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# Initialize for further analysis later\n", 151 | "q_cpuTop_df = None\n", 152 | "q_cpuXEvent_df = None\n", 153 | "q_cpuJob_df = None\n", 154 | "q_cpuActivity_df = None" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "inputHidden": false, 162 | "outputHidden": false 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "def cpuAnalysis():\n", 167 | " global q_cpuTop_df\n", 168 | " global q_cpuXEvent_df\n", 169 | " q_cpuTop = os.path.join(sqlPath, \"CpuTop.csl\")\n", 170 | " q_cpuXEvent = os.path.join(sqlPath, \"CpuXevent.csl\")\n", 171 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", 172 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_cpuTop, params)\n", 173 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_cpuXEvent, params)\n", 174 | "\n", 175 | " q_cpuTop_df = akn.to_dataframe_from_future(p1)\n", 176 | " \n", 177 | " q_cpuXEvent_df = akn.to_dataframe_from_future(p2)\n", 178 | " maxTime = q_cpuXEvent_df[\"sum_CpuTime\"].max()\n", 179 | " q_cpuXEvent_df['CpuTimeDiff'] = q_cpuXEvent_df[\"sum_CpuTime\"].map(lambda x: x/maxTime)\n", 180 | "\n", 181 | "def cpuAnalysisJob():\n", 182 | " global q_cpuJob_df\n", 183 | " q_cpuJob = os.path.join(sqlPath, \"CpuJob.csl\")\n", 184 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", 185 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_cpuJob, params)\n", 186 | "\n", 187 | " q_cpuJob_df = akn.to_dataframe_from_future(p1)\n", 188 | "\n", 189 | "def cpuAnalysisActivity():\n", 190 | " global q_cpuActivity_df\n", 191 | " q_cpuActivity = os.path.join(sqlPath, \"CpuActivity.csl\")\n", 192 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", 193 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_cpuActivity, params)\n", 194 | "\n", 195 | " q_cpuActivity_df = akn.to_dataframe_from_future(p1)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "inputHidden": false, 203 | "outputHidden": false 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "print('=' * 50)\n", 208 | "print('Report!')\n", 209 | "print('=' * 50, '\\n\\n')\n", 210 | "\n", 211 | "jarvisParams = {'su': params[\"su\"], 'start': akn.get_time(start, -10), 'end': akn.get_time(end, 
10), 'service': params[\"service\"], 'db': db }\n", 212 | "\n", 213 | "jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/SQLAzureDatabase\"\"\" \\\n", 214 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 215 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 216 | " \"\"\"{\"query\":\"//*[id='__DatabaseName']\",\"key\":\"value\",\"replacement\":\"%(db)s\"}]\"\"\" \\\n", 217 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n", 218 | "print('Jarvis dashboard link for sql:\\n', requote_uri(jaJarvisLink), '\\n')\n", 219 | "\n", 220 | "print()\n", 221 | "print(\"Parameters used:\")\n", 222 | "display(params)\n", 223 | "\n", 224 | "print()\n", 225 | "\n", 226 | "## Where is the database at?\n", 227 | "print(\"Database is at: \")\n", 228 | "so = q_whatsSlow_df[\"ServiceObjective\"].unique()\n", 229 | "if so.size > 1:\n", 230 | " print(\"We found different service objectives..looks like db was changed?\")\n", 231 | "print(so) \n", 232 | "\n", 233 | "print()\n", 234 | "\n", 235 | "## What's slow?\n", 236 | "cpu = q_whatsSlow_df[\"avg_AverageCpuPercentage\"]\n", 237 | "memory = q_whatsSlow_df[\"avg_AverageMemoryUsagePercentage\"]\n", 238 | "logWrite= q_whatsSlow_df[\"avg_AverageLogWriteUtilizationPercentage\"]\n", 239 | "worker= q_whatsSlow_df[\"max_MaximumWorkerPercentage\"]\n", 240 | "cpu_coefficientOfVariance = cpu.std()/cpu.mean()\n", 241 | "memory_coefficientOfVariance = memory.std()/memory.mean()\n", 242 | "logWrite_coefficientOfVariance = logWrite.std()/logWrite.mean()\n", 243 | "worker_coefficientOfVariance = worker.std()/worker.mean()\n", 244 | "maxVar = 0.5\n", 245 | "\n", 246 | "reasons = \"Possibly due to: \"\n", 247 | "if cpu_coefficientOfVariance >= maxVar:\n", 248 | " reasons+= \"cpu (max: %s), \" % (cpu.max())\n", 249 | "if memory_coefficientOfVariance >= maxVar:\n", 250 | " reasons+= \"memory (max: %s), \" % (memory.max())\n", 251 | "if logWrite_coefficientOfVariance >= maxVar:\n", 252 | " reasons+= \"logwrite (max: %s), \" % (logWrite.max())\n", 253 | "if worker_coefficientOfVariance >= maxVar:\n", 254 | " reasons+= \"worker (max: %s), \" % (worker.max())\n", 255 | "print(reasons)\n", 256 | "\n", 257 | "if cpu.max() >= 80:\n", 258 | " print(\"We found high CPU, let's start with CPU analysis...\")\n", 259 | " \n", 260 | " cpuAnalysis()\n", 261 | " \n", 262 | " #print()\n", 263 | " #print(\"Top CPU commands:\")\n", 264 | " #display(q_cpuTop_df)\n", 265 | " \n", 266 | " print()\n", 267 | " print(\"Who's causing these commands?:\")\n", 268 | " commandsToConsider = q_cpuXEvent_df[q_cpuXEvent_df[\"CpuTimeDiff\"] >= 0.5]\n", 269 | " jobCommand = commandsToConsider[commandsToConsider[\"TypeName\"].str.contains('Job')]\n", 270 | " if len(jobCommand) >= 1:\n", 271 | " print(\"Possibly due to a job...\")\n", 272 | " display(jobCommand)\n", 273 | " cpuAnalysisJob()\n", 274 | " \n", 275 | " print()\n", 276 | " display(q_cpuJob_df)\n", 277 | " \n", 278 | " activityCommand = commandsToConsider[commandsToConsider[\"TypeName\"].str.contains('Activity')]\n", 279 | " if len(activityCommand) >= 1 and activityCommand[\"ObjectName\"][0]:\n", 280 | " print(\"Possibly due to user activity...\")\n", 281 | " display(activityCommand)\n", 282 | " cpuAnalysisActivity()\n", 283 | " \n", 284 | " print()\n", 285 | " display(q_cpuActivity_df)\n", 286 | " " 287 | ] 288 | } 289 | ], 290 | 
"metadata": { 291 | "kernel_info": { 292 | "name": "python3" 293 | }, 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.7.4" 310 | }, 311 | "nteract": { 312 | "version": "0.15.0" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 0 317 | } 318 | --------------------------------------------------------------------------------