├── .gitattributes ├── .github └── workflows │ ├── CheckNotebookContents.yml │ ├── delays.yml │ ├── impact.yml │ ├── ja.yml │ ├── run.yml │ └── sql.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode └── settings.json ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── SECURITY.md ├── delays.ipynb ├── impact.ipynb ├── ja.ipynb ├── params.json ├── queries ├── ActiveIncidents.csl ├── HealthAgentActions.csl ├── LocationName.csl ├── MDMAccount.csl ├── SlowActivities.csl ├── SlowSql.csl ├── WhatChanged.csl ├── delays │ ├── Abusers.csl │ ├── AffectedAccounts.csl │ ├── DelayedAccountsAreAbusers.csl │ ├── Load.csl │ ├── LoadPerHost.csl │ ├── OrchestrationLogSpike.csl │ ├── OrchestrationLogSpikeTip.csl │ ├── Parallelism.csl │ └── WhatDelayed.csl ├── impact │ ├── CommandsAT.csl │ ├── CommandsDb.csl │ ├── CommandsReason.csl │ ├── Dependencies.csl │ └── Gen2GCSpikes.csl ├── ja │ └── JASqlTime.csl ├── run │ ├── PlanInfo.csl │ └── WhatHappened.csl ├── sla │ ├── SLADurationAnalysis.csl │ └── SLAVisualization.csl └── sql │ ├── CpuActivity.csl │ ├── CpuJob.csl │ ├── CpuTop.csl │ ├── CpuXEvent.csl │ ├── GetData.csl │ └── WhatsSlow.csl ├── run.ipynb ├── sla.ipynb └── sql.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto -------------------------------------------------------------------------------- /.github/workflows/CheckNotebookContents.yml: -------------------------------------------------------------------------------- 1 | name: Check notebook contents 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@master 12 | - name: Ensure dependencies 13 | run: | 14 | # install nbstripout and register it as a global git filter 15 | sudo pip install --upgrade nbstripout 16 | which python 17 | pip show nbstripout 18 | python -m site 19 | #sudo python /home/runner/.local/lib/python2.7/site-packages/nbstripout/_nbstripout.py --install --global 20 | nbstripout --install --global 21 | - name: Check contents 22 | run: | 23 | # Strip each notebook's outputs and diff the result against the committed file 24 | hasError=false 25 | for f in ${{runner.workspace}}/devops-pipelines/*.ipynb 26 | do 27 | echo "Processing $f..." 28 | newFileName="${f}_stripped" 29 | cat $f | nbstripout > $newFileName 30 | diff --strip-trailing-cr $f $newFileName > /dev/null 2>&1 31 | error=$? 32 | if [ $error -eq 0 ] 33 | then 34 | echo "$f seems okay..." 35 | elif [ $error -eq 1 ] 36 | then 37 | echo "$f contains outputs..." 38 | hasError=true 39 | else 40 | echo "Sorry, there was something wrong with the diff command..." 41 | fi 42 | done 43 | 44 | if [ "$hasError" = "true" ]; then 45 | echo "Notebooks shouldn't contain any outputs; please install the pre-commit hook (see the README)."
46 | exit 1 47 | fi 48 | -------------------------------------------------------------------------------- /.github/workflows/delays.yml: -------------------------------------------------------------------------------- 1 | name: Delays analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/delays/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy==1.17.0 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "delays.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/impact.yml: -------------------------------------------------------------------------------- 1 | name: Impact analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/impact/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy==1.17.0 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "impact.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/ja.yml: -------------------------------------------------------------------------------- 1 | name: Jobagent analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/ja/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "ja.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/run.yml: -------------------------------------------------------------------------------- 1 | name: Run analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/run/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly numpy 18 | - 
uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "run.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.github/workflows/sql.yml: -------------------------------------------------------------------------------- 1 | name: Sql analysis 2 | 3 | on: 4 | push: 5 | branches: 6 | - automation/sql/* 7 | 8 | jobs: 9 | run: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v1 13 | - name: Set up Python 14 | uses: actions/setup-python@v1 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install matplotlib nimport azure-kusto-notebooks plotly 18 | - uses: yaananth/run-notebook@v1 19 | env: 20 | RUNNER: ${{ toJson(runner) }} 21 | SECRETS: ${{ toJson(secrets) }} 22 | GITHUB: ${{ toJson(github) }} 23 | with: 24 | notebook: "sql.ipynb" 25 | params: "params.json" 26 | poll: true 27 | - uses: actions/upload-artifact@master 28 | if: always() 29 | with: 30 | name: output 31 | path: ${{ RUNNER.temp }}/nb-runner 32 | env: 33 | RUNNER: ${{ toJson(runner) }} 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Kqlmagic_temp_files 2 | .ipynb_checkpoints/* 3 | *.pyc 4 | *.sh 5 | .venv 6 | .DS_Store -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/kynan/nbstripout 3 | rev: master 4 | hooks: 5 | - id: nbstripout 6 | files: ".ipynb" -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "workbench.colorCustomizations": {} 3 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Devops-pipelines 2 | A warehouse of notebooks for producing root-cause analyses of Azure DevOps pipeline delays. 3 | 4 | Uses both [Azure Data Explorer](https://docs.microsoft.com/en-us/azure/data-explorer/) and [Azure Notebooks](https://docs.microsoft.com/en-us/azure/notebooks/). 5 | 6 | # Usage 7 | ## Commands 8 | ``` 9 | # Initialize 10 | !pip install --upgrade pip Kqlmagic nimport azure.kusto.data[pandas] 11 | %load_ext nimport 12 | ``` 13 | 14 | ``` 15 | # Let's clone our repo; the path is not relevant here, this just clones the whole repo 16 | %nimport container="microsoft/devops-pipelines" path="delays.ipynb" provider="github" providerOptions={"clone":"true"} 17 | ``` 18 | 19 | ``` 20 | # If you have a URL from which you want to parse parameters (see the worked example at the end of this README)... 21 | from nimport.utils import open_nb, parse_params 22 | params = parse_params(currentUrl) 23 | display(params) 24 | ``` 25 | 26 | ``` 27 | # Open the notebook by replacing the parameters 28 | open_nb("devops-pipelines/delays.ipynb", params) 29 | ``` 30 | 31 | # Contributing 32 | 33 | ## Requirements 34 | - Commands: 35 | 36 | `pip install pre-commit` 37 | 38 | `pre-commit install` 39 | 40 | - Open PRs! 41 | 42 | ## Notice 43 | 44 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 45 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 46 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 47 | 48 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 49 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 50 | provided by the bot. You will only need to do this once across all repos using our CLA. 51 | 52 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 53 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 54 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
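# Example

Putting the Usage commands together: a minimal sketch, assuming `parse_params` extracts query-string parameters from a notebook URL. The URL below is a placeholder, and the parameter names are the ones `delays.ipynb` declares in its `parameters` cell.

```
# initialize and clone the repo (see Usage above)
!pip install --upgrade pip Kqlmagic nimport azure.kusto.data[pandas]
%load_ext nimport
%nimport container="microsoft/devops-pipelines" path="delays.ipynb" provider="github" providerOptions={"clone":"true"}

from nimport.utils import open_nb, parse_params

# placeholder URL carrying the parameters delays.ipynb expects
currentUrl = "https://notebooksv2.azure.com/you/projects/XXXX/delays.ipynb?service=pipelines&su=pipelines-ghub-eus2-2&start=2019-10-15T20:21:54Z&end=2019-10-15T20:52:21Z"
params = parse_params(currentUrl)
display(params)

# open delays.ipynb with its parameters cell pre-filled
open_nb("devops-pipelines/delays.ipynb", params)
```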
55 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously. This includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /delays.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Orchestration Delays Investigation\n", 7 | "1. Run all cells.\n", 8 | "1. Scroll down to check for any authentication messages.\n", 9 | "1. View report at the bottom."
10 | ], 11 | "metadata": {} 12 | }, 13 | { 14 | "cell_type": "code", 15 | "source": [ 16 | "# These are just defaults; they will be overwritten when the notebook is opened via nimport\n", 17 | "start = \"2019-10-15T20:21:54.0330000Z\"\n", 18 | "end = \"2019-10-15T20:52:21.5370169Z\"\n", 19 | "service = \"pipelines\"\n", 20 | "su = \"pipelines-ghub-eus2-2\"\n", 21 | "hub = \"Actions\"\n", 22 | "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n", 23 | "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"" 24 | ], 25 | "outputs": [], 26 | "execution_count": null, 27 | "metadata": { 28 | "inputHidden": false, 29 | "outputHidden": false, 30 | "tags": [ 31 | "parameters" 32 | ] 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "source": [ 38 | "%%capture\n", 39 | "!pip install --upgrade nimport azure-kusto-notebooks" 40 | ], 41 | "outputs": [], 42 | "execution_count": null, 43 | "metadata": { 44 | "inputHidden": false, 45 | "outputHidden": false, 46 | "tags": [ 47 | "debug" 48 | ] 49 | } 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "# Import the things we use\n", 55 | "\n", 56 | "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n", 57 | "# %kql is single line magic\n", 58 | "# %%kql is cell magic\n", 59 | "\n", 60 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 61 | "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n", 62 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 63 | "\n", 64 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 65 | "import pandas as pd\n", 66 | "pd.options.display.html.table_schema = True\n", 67 | "from pandas import Series, DataFrame\n", 68 | "from datetime import datetime, timedelta, timezone\n", 69 | "from urllib.parse import urlencode, quote_plus\n", 70 | "from requests.utils import requote_uri\n", 71 | "import time\n", 72 | "import numpy as np\n", 73 | "from matplotlib import pyplot as plt\n", 74 | "from nimport.utils import tokenize, open_nb\n", 75 | "import json\n", 76 | "import os\n", 77 | "import calendar as cal\n", 78 | "import concurrent.futures\n", 79 | "from azure.kusto.notebooks import utils as akn" 80 | ], 81 | "outputs": [], 82 | "execution_count": null, 83 | "metadata": { 84 | "inputHidden": false, 85 | "outputHidden": false 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "source": [ 91 | "params = {\n", 92 | " \"su\": su,\n", 93 | " \"start\": start,\n", 94 | " \"end\": end,\n", 95 | " \"url\": url,\n", 96 | " \"baseUrl\": baseUrl,\n", 97 | " \"service\": service,\n", 98 | " \"hub\": hub\n", 99 | "}\n", 100 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 101 | "queryPath = os.path.join(root, 'queries')" 102 | ], 103 | "outputs": [], 104 | "execution_count": null, 105 | "metadata": { 106 | "inputHidden": false, 107 | "outputHidden": false 108 | } 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "# authenticate kusto client\n", 114 | "# you will need to copy the token into a browser window for AAD auth. 
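The sign-in prompt appears in this cell's output, so scroll down and watch for it.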
\n", 115 | "client = akn.get_client('https://vso.kusto.windows.net')" 116 | ], 117 | "outputs": [], 118 | "execution_count": null, 119 | "metadata": { 120 | "inputHidden": false, 121 | "outputHidden": false 122 | } 123 | }, 124 | { 125 | "cell_type": "code", 126 | "source": [ 127 | "# authenticate kusto client\n", 128 | "# you will need to copy the token into a browser window for AAD auth. \n", 129 | "icm_client = akn.get_client('https://icmcluster.kusto.windows.net')" 130 | ], 131 | "outputs": [], 132 | "execution_count": null, 133 | "metadata": { 134 | "inputHidden": false, 135 | "outputHidden": false 136 | } 137 | }, 138 | { 139 | "cell_type": "code", 140 | "source": [ 141 | "q_loc = os.path.join(queryPath, \"LocationName.csl\")\n", 142 | "q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n", 143 | "q_haActions = os.path.join(queryPath, \"HealthAgentActions.csl\")\n", 144 | "q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n", 145 | "\n", 146 | "delaysPath = os.path.join(queryPath, \"delays\")\n", 147 | "q_affectedAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n", 148 | "q_abusers = os.path.join(delaysPath, \"Abusers.csl\")\n", 149 | "q_affAccounts = os.path.join(delaysPath, \"AffectedAccounts.csl\")\n", 150 | "q_delayedAccountsAreAbusers = os.path.join(delaysPath, \"DelayedAccountsAreAbusers.csl\")\n", 151 | "q_whatDelayed = os.path.join(delaysPath, \"WhatDelayed.csl\")\n", 152 | "q_load = os.path.join(delaysPath, \"Load.csl\")\n", 153 | "\n", 154 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 155 | " # materialize location name immediately as we need this for other queries\n", 156 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n", 157 | " locationNameResult = akn.to_dataframe_from_future(p1)\n", 158 | " locationName = locationNameResult[\"Tenant\"][0]\n", 159 | " params[\"locationName\"] = locationName\n", 160 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n", 161 | " p4 = executor.submit(akn.execute_file, client, 'VSO', q_haActions, params) \n", 162 | " \n", 163 | " p5 = executor.submit(akn.execute_file, client, 'VSO', q_affectedAccounts, params)\n", 164 | " p6 = executor.submit(akn.execute_file, client, 'VSO', q_abusers, params)\n", 165 | " p7 = executor.submit(akn.execute_file, client, 'VSO', q_affAccounts, params)\n", 166 | " p8 = executor.submit(akn.execute_file, client, 'VSO', q_delayedAccountsAreAbusers, params)\n", 167 | " p9 = executor.submit(akn.execute_file, client, 'VSO', q_whatDelayed, params)\n", 168 | " p10 = executor.submit(akn.execute_file, client, 'VSO', q_load, params)\n", 169 | " \n", 170 | " p11 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n", 171 | " os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n", 172 | " p12 = executor.submit(akn.execute_file, client, 'VSO', q_mdm, params)\n", 173 | "\n", 174 | "q_whatChanged_df = akn.to_dataframe_from_future(p2)\n", 175 | "q_haActions_df = akn.to_dataframe_from_future(p4)\n", 176 | "q_affectedAccountsResultDf = akn.to_dataframe_from_future(p5)\n", 177 | "\n", 178 | "abusersDf = akn.to_dataframe_from_future(p6)\n", 179 | "finalabusersList = np.intersect1d(q_affectedAccountsResultDf[\"HostId\"].values, abusersDf[\"HostId\"].values);\n", 180 | "\n", 181 | "q_affAccounts_df = akn.to_dataframe_from_future(p7)\n", 182 | "q_delayedAccountsAreAbusers_df = akn.to_dataframe_from_future(p8)\n", 183 | "q_whatDelayedResultDf = akn.to_dataframe_from_future(p9)\n", 184 | "q_loadResultDf = 
akn.to_dataframe_from_future(p10)\n", 185 | "\n", 186 | "q_activeIncidentsResultDf = akn.to_dataframe_from_future(p11)\n", 187 | "\n", 188 | "q_mdmDf = akn.to_dataframe_from_future(p12)\n", 189 | "params[\"mdmAccount\"] = q_mdmDf[\"monitoringAccount\"][0]" 190 | ], 191 | "outputs": [], 192 | "execution_count": null, 193 | "metadata": { 194 | "inputHidden": false, 195 | "outputHidden": false 196 | } 197 | }, 198 | { 199 | "cell_type": "code", 200 | "source": [ 201 | "q_spike = os.path.join(delaysPath, \"OrchestrationLogSpike.csl\")\n", 202 | "q_parallelism = os.path.join(delaysPath, \"Parallelism.csl\")\n", 203 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 204 | " sfs = [executor.submit(akn.execute_file, client, 'VSO', q_spike, \n", 205 | " {\n", 206 | " **params,\n", 207 | " \"hostId\": r\n", 208 | " }) for r in q_delayedAccountsAreAbusers_df[\"HostId\"].values]\n", 209 | " sfsResults = [s.result() for s in concurrent.futures.as_completed(sfs)]\n", 210 | " pfs = [executor.submit(akn.execute_file, client, 'VSO', q_parallelism, \n", 211 | " {\n", 212 | " **params,\n", 213 | " \"hostId\": r\n", 214 | " }) for r in q_delayedAccountsAreAbusers_df[\"HostId\"].values]\n", 215 | " pfsResults = [s.result() for s in concurrent.futures.as_completed(pfs)]\n", 216 | "\n", 217 | "# convert to data frames\n", 218 | "s_primary_results = [s.primary_results[0] for s in sfsResults]\n", 219 | "spikeResultsDfs = None\n", 220 | "\n", 221 | "p_primary_results = [s.primary_results[0] for s in pfsResults]\n", 222 | "parResultsDfs = None\n", 223 | "\n", 224 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 225 | " s_dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in s_primary_results]\n", 226 | " spikeResultsDfs = [dff.result() for dff in concurrent.futures.as_completed(s_dataframe_futures)]\n", 227 | " p_dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in p_primary_results]\n", 228 | " parResultsDfs = [dff.result() for dff in concurrent.futures.as_completed(p_dataframe_futures)]\n", 229 | "sfsResults = None\n", 230 | "pfsResults = None" 231 | ], 232 | "outputs": [], 233 | "execution_count": null, 234 | "metadata": { 235 | "inputHidden": false, 236 | "outputHidden": false 237 | } 238 | }, 239 | { 240 | "cell_type": "code", 241 | "source": [ 242 | "q_loadPerHost = os.path.join(delaysPath, \"LoadPerHost.csl\")\n", 243 | "# utility functions\n", 244 | "from itertools import groupby\n", 245 | "content = ''\n", 246 | "def r(*args):\n", 247 | " '''construct a markdown report'''\n", 248 | " global content\n", 249 | " content += ''.join([str(a) for a in args]) + '\\n'\n", 250 | "\n", 251 | "startTime = akn.to_datetime(start)\n", 252 | "t0 = startTime.replace(tzinfo=None)\n", 253 | "\n", 254 | "# report! 
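r() accumulates markdown lines in the global 'content' string; the Markdown(content) call at the very end of this cell renders the finished report.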
\n", 255 | "r('# OK SO WHAT HAPPENED')\n", 256 | "r('|parameter|value|')\n", 257 | "r('|---|---|')\n", 258 | "r('|startTime|', startTime, '|')\n", 259 | "r('|endTime|', akn.to_datetime(end), '|')\n", 260 | "r('|scale unit|', su, '|')\n", 261 | "r('|service|', service, '|')\n", 262 | "\n", 263 | "# jarvis params\n", 264 | "jarvisParams = {\n", 265 | " 'su': su, \n", 266 | " 'start': akn.get_time(start, -10), \n", 267 | " 'end': akn.get_time(end, 10), \n", 268 | " 'service': service,\n", 269 | " 'location': locationName,\n", 270 | " 'account': params[\"mdmAccount\"]\n", 271 | "}\n", 272 | "\n", 273 | "# abuse detection?\n", 274 | "r('## What users are impacted?')\n", 275 | "if len(finalabusersList) > 0:\n", 276 | " r('INSIGHT: Found abusers -- this alert is likely a false alarm.')\n", 277 | "r(akn.pandas_df_to_markdown_table(q_delayedAccountsAreAbusers_df)) \n", 278 | "\n", 279 | "\n", 280 | "\n", 281 | "# what changed? analysis\n", 282 | "r('## What changed?')\n", 283 | "if q_whatChanged_df.empty:\n", 284 | " r(\"...no relevant config changes recorded during this period.\")\n", 285 | "else:\n", 286 | " # compute relative times and relevant changes\n", 287 | " history = q_whatChanged_df\n", 288 | " history['RelativeSeconds'] = history.apply(lambda row: (row.TIMESTAMP.replace(tzinfo=None) - t0).total_seconds(), axis=1)\n", 289 | " relevant = history[abs(history.RelativeSeconds) < 3600]\n", 290 | " \n", 291 | " # analysis\n", 292 | " upgrade = False\n", 293 | " mitigation = False\n", 294 | " vip_swap = False\n", 295 | " ffs = False\n", 296 | " for t in relevant.title.values:\n", 297 | " l = t.lower()\n", 298 | " upgrade = upgrade or 'upgrade' in l\n", 299 | " mitigation = mitigation or 'mitigation' in l\n", 300 | " vip_swap = vip_swap or 'vip' in l\n", 301 | " ffs = ffs or 'feature flag' in l\n", 302 | " \n", 303 | " if upgrade:\n", 304 | " r('INSIGHT: there were database upgrades in progress')\n", 305 | " if mitigation:\n", 306 | " r('INSIGHT: there were mitigations in progress')\n", 307 | " if vip_swap:\n", 308 | " r('INSIGHT: there was a vip swap just before this period.')\n", 309 | " if ffs:\n", 310 | " r('INSIGHT: there were feature flag changes right before this period.')\n", 311 | " \n", 312 | " # full table\n", 313 | " r(akn.pandas_df_to_markdown_table(relevant[['TIMESTAMP', 'RelativeSeconds', 'title']]))\n", 314 | " \n", 315 | " \n", 316 | "# active incidents?\n", 317 | "r('## Active incidents?')\n", 318 | "otherIncidentsCount = 0;\n", 319 | "\n", 320 | "if q_activeIncidentsResultDf is not None and not q_activeIncidentsResultDf.empty:\n", 321 | " for index, row in q_activeIncidentsResultDf.iterrows():\n", 322 | " if(row.Title.find(\"Kalypso: Build Orchestrator Delays ICM\") == -1):\n", 323 | " otherIncidentsCount += 1\n", 324 | " \n", 325 | " if otherIncidentsCount > 0:\n", 326 | " r(\"INSIGHT: There were incidents recorded during this period. 
These might be related:\")\n", 327 | " newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: \"\"\"[%s](https://icm.ad.msft.net/imp/v3/incidents/details/%s/home)\"\"\" % (x,x), q_activeIncidentsResultDf.IncidentId)]) \n", 328 | " r(\"\\n\")\n", 329 | " r(akn.pandas_df_to_markdown_table(newDf[['URL','Severity','Title']]))\n", 330 | " else:\n", 331 | " r(\"...no relevant incidents during this period.\") \n", 332 | " \n", 333 | " \n", 334 | "r('## Queue Load')\n", 335 | "ar = q_loadResultDf[q_loadResultDf[\"Name\"] == \"DTPlanQueued\"].values[:, 2]\n", 336 | "queuedGreaterThan500 = np.where(ar > 500)\n", 337 | "ar_max = np.amax(ar) if len(ar) else '?'\n", 338 | "if len(queuedGreaterThan500[0]) > 0:\n", 339 | " r('INSIGHT: There was a high rate of jobs queued during this period (max: ', ar_max, ' / minute)...')\n", 340 | "else: \n", 341 | " r('...everything looks good? (max: ', ar_max, ' / minute)')\n", 342 | "\n", 343 | " \n", 344 | "r('## Parallelism')\n", 345 | "for parResultsDf in parResultsDfs:\n", 346 | " if len(parResultsDf.C.values) > 0: \n", 347 | " usage = parResultsDf.C.values[0]\n", 348 | " times = parResultsDf.sampleTime.values[0]\n", 349 | " hostId = parResultsDf.HostId[0]\n", 350 | " maxindex = np.argmax(usage)\n", 351 | " maxvalue = usage[maxindex]\n", 352 | " atTime = times[maxindex]\n", 353 | " results = {value: len(list(freq)) for value, freq in groupby(sorted(usage))}\n", 354 | " printed = False\n", 355 | " r(\"\"\"\\nFor host: **%s**...\"\"\" % (hostId))\n", 356 | " for key, value in results.items():\n", 357 | " if key > 10:\n", 358 | " r(\"\"\"\\nRunning plans (per 1min) %s : number of occurrences during incident time %s\"\"\"%(key, value))\n", 359 | " printed = True\n", 360 | " if not printed:\n", 361 | " r(\"\\nNothing found greater than 10\")\n", 362 | " else:\n", 363 | " r(\"\\n-\")\n", 364 | " \n", 365 | "\n", 366 | "r('## Orchestration phase Load')\n", 367 | "for spikeResultDf in spikeResultsDfs:\n", 368 | " countResult = spikeResultDf.C.describe()\n", 369 | " hostId = spikeResultDf[\"HostId\"].values[0]\n", 370 | " upper = countResult[\"75%\"]\n", 371 | " lower = countResult[\"25%\"]\n", 372 | " # Wondering what's going on here? We detect anomalies, see https://www.purplemath.com/modules/boxwhisk3.htm\n", 373 | " IQR = upper - lower\n", 374 | " countResultOfInterest = spikeResultDf[spikeResultDf[\"C\"] > upper + 1.5 * IQR ].head(5)\n", 375 | " unqCommands = list(dict.fromkeys(countResultOfInterest[\"Command\"].values).keys())\n", 376 | " if len(unqCommands) > 0:\n", 377 | " r(\"\"\"INSIGHT: Found anomalies for these phases, in order from highest to lowest, for host: **%s**\"\"\" % hostId)\n", 378 | "\n", 379 | " # print commands table\n", 380 | " r(akn.pandas_df_to_markdown_table(countResultOfInterest[[\"Command\", \"C\"]])) \n", 381 | " \n", 382 | " \n", 383 | " if \"PlanCompleted\" in unqCommands:\n", 384 | " if \"StartPlan\" in unqCommands or \"PlanStarted\" in unqCommands:\n", 385 | " r(\"\\nTIP: An unusual number of plans were started during this period.\")\n", 386 | " else:\n", 387 | " r(\"\\nTIP: Jobs that were queued long ago may only now be completing, creating this spike.\") \n",
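" # tokenize() fills the .csl query templates with newParams, so the collapsible details links below carry ready-to-run Kusto\n",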
388 | " \n", 389 | " newParams = dict(params)\n", 390 | " newParams[\"command\"] = next(iter(unqCommands)) \n", 391 | " newParams[\"hostId\"] = hostId\n", 392 | " r(akn.details_md('Kusto query for analyzing spike:', \n", 393 | " tokenize(os.path.join(os.path.join(queryPath, \"delays\"), \"OrchestrationLogSpikeTip.csl\"), newParams)))\n", 394 | " r(akn.details_md('Kusto for analyzing load:', tokenize(q_loadPerHost, newParams)))\n", 395 | " \n", 396 | " else:\n", 397 | " r('...everything looks good?') \n", 398 | " \n", 399 | "# ja load\n", 400 | "r()\n", 401 | "r('## JA Load')\n", 402 | "q_whatDelayedResultPendingJobsDf = q_whatDelayedResultDf[q_whatDelayedResultDf.Pivot == \"\\JobService(_Total)\\Total Pending Jobs\"]\n", 403 | "pendingGreaterThan50Result = np.where(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values > 50)\n", 404 | "if len(pendingGreaterThan50Result[0]) > 0:\n", 405 | " max_pending_jobs = np.max(q_whatDelayedResultPendingJobsDf.avg_CounterValue.values)\n", 406 | " r(\"INSIGHT: There was a high number of pending jobs during this period (max was %s). Note that this counts jobs of all priorities (even low-priority ones)\" % (max_pending_jobs)) \n", 407 | " \n", 408 | " open_nb(os.path.join(root, 'ja.ipynb'), params, redirect=False)\n", 409 | " jaUrl = baseUrl + \"/devops-pipelines/ja.ipynb\"\n", 410 | " r('\\n\\n[JobAgent investigation notebook](', requote_uri(jaUrl), ')')\n", 411 | "\n", 412 | " jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n", 413 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 414 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 415 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 416 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 417 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 418 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 419 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 420 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n", 421 | " r('\\n\\n[JobAgent health dashboard](', requote_uri(jaJarvisLink), ')')\n", 422 | "else:\n", 423 | " r('...everything looks good?')\n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | "# more analysis? 
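The open_nb calls below pre-fill each linked notebook's parameters, so the links open ready to run.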
\n", 429 | "r('## What should we look at next?')\n", 430 | "url = baseUrl + \"/devops-pipelines/sla.ipynb\"\n", 431 | "SLAParams = {\n", 432 | " \"triggerTime\": params[\"start\"],\n", 433 | " \"scaleUnit\": params[\"su\"],\n", 434 | " \"service\": params[\"service\"],\n", 435 | " \"lookback\": \"1h\",\n", 436 | " \"region\": \"\"\n", 437 | "}\n", 438 | "open_nb(os.path.join(root, 'sla.ipynb'), SLAParams, redirect=False)\n", 439 | "r('\\n\\n[SLA investigation notebook](', requote_uri(url), ')') \n", 440 | "\n", 441 | "url = baseUrl + \"/devops-pipelines/impact.ipynb\"\n", 442 | "open_nb(os.path.join(root, 'impact.ipynb'), params, redirect=False)\n", 443 | "r('\\n\\n[Customer impact investigation notebook](', requote_uri(url), ')') \n", 444 | "\n", 445 | "# Scale unit health\n", 446 | "jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/DevOpsReports\"\"\" \\\n", 447 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 448 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 449 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 450 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 451 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 452 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 453 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 454 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n", 455 | "r('\\n\\n[Scale unit health dashboard (' + su + ', ' + service + ')](', requote_uri(jarvisLink), ')')\n", 456 | "\n", 457 | "\n", 458 | "Markdown(content)\n", 459 | "# print(content)" 460 | ], 461 | "outputs": [], 462 | "execution_count": null, 463 | "metadata": { 464 | "inputHidden": false, 465 | "outputHidden": false 466 | } 467 | }, 468 | { 469 | "cell_type": "code", 470 | "source": [ 471 | "# visualize delays\n", 472 | "import plotly\n", 473 | "from plotly import graph_objs as go\n", 474 | "delays = go.Scatter(\n", 475 | " x=q_affAccounts_df[\"PreciseTimeStamp\"],\n", 476 | " y=q_affAccounts_df[\"MessageDelayInSeconds\"],\n", 477 | " mode = 'lines',\n", 478 | " name = 'Delays in seconds',\n", 479 | " text= q_affAccounts_df['Name']\n", 480 | ")\n", 481 | "\n", 482 | "changed = go.Scatter(\n", 483 | " x=q_whatChanged_df[\"TIMESTAMP\"],\n", 484 | " y=np.repeat(50, len(q_whatChanged_df[\"TIMESTAMP\"].values)),\n", 485 | " mode = 'lines+markers',\n", 486 | " name = 'What Changed',\n", 487 | " text = q_whatChanged_df[\"Name\"],\n", 488 | " marker=dict(\n", 489 | " size=32,\n", 490 | " color = np.random.randn(500),\n", 491 | " colorscale='Viridis'\n", 492 | " )\n", 493 | ")\n", 494 | "\n", 495 | "mitigations = go.Scatter(\n", 496 | " x=q_haActions_df[\"PreciseTimeStamp\"],\n", 497 | " y=np.repeat(50, len(q_haActions_df[\"PreciseTimeStamp\"].values)),\n", 498 | " mode = 'markers',\n", 499 | " name = 'Mitigations',\n", 500 | " text = q_haActions_df[[\"MitigationName\", \"RoleInstance\"]].apply(lambda x: ''.join(x), axis=1),\n", 501 | " marker = dict(\n", 502 | " size = 10,\n", 503 | " color = 'rgba(152, 0, 0, .8)',\n", 504 | " line = dict(\n", 505 | " width = 2,\n", 506 | " color = 
'rgb(0, 0, 0)'\n", 507 | " )\n", 508 | " )\n", 509 | ")\n", 510 | "\n", 511 | "data = [delays, changed, mitigations]\n", 512 | "plotly.offline.iplot(data)" 513 | ], 514 | "outputs": [], 515 | "execution_count": null, 516 | "metadata": { 517 | "inputHidden": false, 518 | "outputHidden": false 519 | } 520 | }, 521 | { 522 | "cell_type": "code", 523 | "source": [], 524 | "outputs": [], 525 | "execution_count": null, 526 | "metadata": { 527 | "inputHidden": false, 528 | "outputHidden": false 529 | } 530 | } 531 | ], 532 | "metadata": { 533 | "kernel_info": { 534 | "name": "python3" 535 | }, 536 | "kernelspec": { 537 | "name": "python3", 538 | "language": "python", 539 | "display_name": "Python 3" 540 | }, 541 | "language_info": { 542 | "name": "python", 543 | "version": "3.7.4", 544 | "mimetype": "text/x-python", 545 | "codemirror_mode": { 546 | "name": "ipython", 547 | "version": 3 548 | }, 549 | "pygments_lexer": "ipython3", 550 | "nbconvert_exporter": "python", 551 | "file_extension": ".py" 552 | }, 553 | "nteract": { 554 | "version": "0.14.5" 555 | } 556 | }, 557 | "nbformat": 4, 558 | "nbformat_minor": 0 559 | } 560 | -------------------------------------------------------------------------------- /impact.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Impact Investigation\n", 8 | "1. Run all cells.\n", 9 | "1. View report at the bottom." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "inputHidden": false, 17 | "outputHidden": false, 18 | "tags": [ 19 | "parameters" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "su = \"pipelines-ghub-eus2-2\"\n", 25 | "start = \"2019-10-15T20:21:54.0330000Z\"\n", 26 | "end = \"2019-10-15T20:52:21.5370169Z\"\n", 27 | "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n", 28 | "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"\n", 29 | "service = \"pipelines\"\n", 30 | "hub = \"Actions\"\n", 31 | "locationName = \"pipelinesghubeus22\"\n", 32 | "mdmAccount = \"VSO-Pipelines\"" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "inputHidden": false, 40 | "outputHidden": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "%%capture\n", 45 | "!pip install --upgrade nimport azure-kusto-notebooks" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "inputHidden": false, 53 | "outputHidden": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Import the things we use\n", 58 | "\n", 59 | "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n", 60 | "# %kql is single line magic\n", 61 | "# %%kql is cell magic\n", 62 | "\n", 63 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 64 | "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n", 65 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 66 | "\n", 67 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 68 | "import pandas as pd\n", 69 | "pd.options.display.html.table_schema = False\n", 70 | "from pandas import Series, DataFrame\n", 71 | "from datetime import datetime, timedelta, timezone\n", 72 | "from urllib.parse import urlencode, quote_plus\n", 73 | "from 
requests.utils import requote_uri\n", 74 | "import time\n", 75 | "import numpy as np\n", 76 | "from matplotlib import pyplot as plt\n", 77 | "from nimport.utils import tokenize, open_nb\n", 78 | "import json\n", 79 | "import os\n", 80 | "import calendar as cal\n", 81 | "import concurrent.futures\n", 82 | "from azure.kusto.notebooks import utils as akn" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "inputHidden": false, 90 | "outputHidden": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "params = {\n", 95 | " \"su\": su,\n", 96 | " \"start\": start,\n", 97 | " \"end\": end,\n", 98 | " \"url\": url,\n", 99 | " \"baseUrl\": baseUrl,\n", 100 | " \"service\": service\n", 101 | "}\n", 102 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 103 | "queryPath = os.path.join(root, 'queries')" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "inputHidden": false, 111 | "outputHidden": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# authenticate kusto client\n", 116 | "# you will need to copy the token into a browser window for AAD auth. \n", 117 | "client = akn.get_client('https://vso.kusto.windows.net')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "inputHidden": false, 125 | "outputHidden": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "# authenticate kusto client\n", 130 | "# you will need to copy the token into a browser window for AAD auth. \n", 131 | "icm_client = akn.get_client('https://icmcluster.kusto.windows.net')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "inputHidden": false, 139 | "outputHidden": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "q_loc = os.path.join(queryPath, \"LocationName.csl\")\n", 144 | "q_whatChanged = os.path.join(queryPath, \"WhatChanged.csl\")\n", 145 | "q_mdm = os.path.join(queryPath, \"MDMAccount.csl\")\n", 146 | "\n", 147 | "impactPath = os.path.join(queryPath, \"impact\")\n", 148 | "q_commands = os.path.join(impactPath, \"CommandsReason.csl\")\n", 149 | "q_commandsAT = os.path.join(impactPath, \"CommandsAT.csl\")\n", 150 | "q_commandsDb = os.path.join(impactPath, \"CommandsDb.csl\")\n", 151 | "q_gen2 = os.path.join(impactPath, \"Gen2GCSpikes.csl\")\n", 152 | "q_dep = os.path.join(impactPath, \"Dependencies.csl\")\n", 153 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 154 | " # materialize location name immediately as we need this for other queries\n", 155 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_loc, params)\n", 156 | " locationNameResult = akn.to_dataframe_from_future(p1)\n", 157 | " locationName = locationNameResult[\"Tenant\"][0]\n", 158 | " params[\"locationName\"] = locationName\n", 159 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatChanged, params)\n", 160 | " \n", 161 | " p4 = executor.submit(akn.execute_file, client, 'VSO', q_commandsAT, params)\n", 162 | " p5 = executor.submit(akn.execute_file, client, 'VSO', q_commandsDb, params) \n", 163 | " p6 = executor.submit(akn.execute_file, client, 'VSO', q_commands, params)\n", 164 | " \n", 165 | " p7 = executor.submit(akn.execute_file, icm_client, 'IcmDataWarehouse', \n", 166 | " os.path.join(queryPath, 'ActiveIncidents.csl'), params)\n", 167 | " \n", 168 | " p8 = executor.submit(akn.execute_file, client, 'VSO', q_gen2, 
params)\n", 169 | " p9 = executor.submit(akn.execute_file, client, 'VSO', q_mdm, params)\n", 170 | " p10 = executor.submit(akn.execute_file, client, 'VSO', q_dep, params)\n", 171 | "\n", 172 | "q_whatChanged_df = akn.to_dataframe_from_future(p2)\n", 173 | "\n", 174 | "q_commandsAT_df = akn.to_dataframe_from_future(p4)\n", 175 | "\n", 176 | "q_commandsDb_df = akn.to_dataframe_from_future(p5)\n", 177 | "\n", 178 | "q_commands_df = akn.to_dataframe_from_future(p6)\n", 179 | "\n", 180 | "q_activeIncidentsResultDf = akn.to_dataframe_from_future(p7)\n", 181 | "\n", 182 | "q_gen2Df = akn.to_dataframe_from_future(p8)\n", 183 | "\n", 184 | "q_mdmDf = akn.to_dataframe_from_future(p9)\n", 185 | "params[\"mdmAccount\"] = q_mdmDf[\"monitoringAccount\"][0]\n", 186 | "\n", 187 | "q_depDf = akn.to_dataframe_from_future(p10)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "inputHidden": false, 195 | "outputHidden": false 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "print('=' * 50)\n", 200 | "print('Report!')\n", 201 | "print('=' * 50, '\\n\\n')\n", 202 | "\n", 203 | "# jarvis params\n", 204 | "jarvisParams = {\n", 205 | " 'su': su, \n", 206 | " 'start': akn.get_time(start, -10), \n", 207 | " 'end': akn.get_time(end, 10), \n", 208 | " 'service': service,\n", 209 | " 'location': locationName,\n", 210 | " 'account': params[\"mdmAccount\"]\n", 211 | "}\n", 212 | "\n", 213 | "display(params)\n", 214 | "\n", 215 | "startTime = akn.to_datetime(start)\n", 216 | "# jarvis\n", 217 | "jarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/DevOpsReports/DevOpsReports\"\"\" \\\n", 218 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 219 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 220 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 221 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 222 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 223 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 224 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 225 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n", 226 | "print('Jarvis dashboard link:\\n', requote_uri(jarvisLink), '\\n')\n", 227 | " \n", 228 | "# slow failed reason analysis\n", 229 | "print()\n", 230 | "print('Is it slow commands or failed commands? 
=============================')\n", 231 | "freq = q_commands_df[\"Frequency\"]\n", 232 | "coefficientOfVariance = freq.std()/freq.mean()\n", 233 | "failedCount = q_commands_df[q_commands_df[\"Reason\"] == \"failed\"][\"Frequency\"].values[0]\n", 234 | "slowCount = q_commands_df[q_commands_df[\"Reason\"] == \"slow\"][\"Frequency\"].values[0]\n", 235 | "reason = \"failed or slow\"\n", 236 | "if coefficientOfVariance > 0.5:\n", 237 | " if failedCount > slowCount:\n", 238 | " reason = \"failed\"\n", 239 | " else:\n", 240 | " reason = \"slow\"\n", 241 | "else:\n", 242 | " print(\"Slow and failed commands are too close to call; both might be contributing...\")\n", 243 | "if reason:\n", 244 | " print(\"Probably due to %s commands; Failed - %s, Slow - %s\" % (reason, failedCount, slowCount))\n", 245 | "\n", 246 | "# slow failed reason for AT?\n", 247 | "print()\n", 248 | "print('Is it %s because of AT? =============================' % (reason))\n", 249 | "failed = q_commandsAT_df[q_commandsAT_df[\"Reason\"] == \"failed\"]\n", 250 | "slow = q_commandsAT_df[q_commandsAT_df[\"Reason\"] == \"slow\"]\n", 251 | "data = q_commandsAT_df\n", 252 | "if reason == \"failed\":\n", 253 | " data = failed\n", 254 | "elif reason == \"slow\":\n", 255 | " data = slow\n", 256 | "\n", 257 | "coefficientOfVariance = data[\"Frequency\"].std()/data[\"Frequency\"].mean()\n", 258 | " \n", 259 | "if coefficientOfVariance > 0.5:\n", 260 | " print(\"Found variance across ATs for %s commands\" % (reason))\n", 261 | " print(data.head(30))\n", 262 | "else:\n", 263 | " print(\"Seems to be the same across ATs for %s commands\" % (reason))\n", 264 | " \n", 265 | "# slow failed reason for Db?\n", 266 | "print()\n", 267 | "print('Is it %s because of Db? =============================' % (reason))\n", 268 | "failed = q_commandsDb_df[q_commandsDb_df[\"Reason\"] == \"failed\"]\n", 269 | "slow = q_commandsDb_df[q_commandsDb_df[\"Reason\"] == \"slow\"]\n", 270 | "data = q_commandsDb_df\n", 271 | "if reason == \"failed\":\n", 272 | " data = failed\n", 273 | "elif reason == \"slow\":\n", 274 | " data = slow\n", 275 | "\n", 276 | "coefficientOfVariance = data[\"Frequency\"].std()/data[\"Frequency\"].mean()\n", 277 | " \n", 278 | "if coefficientOfVariance > 0.5:\n", 279 | " print(\"Found variance across DBs for %s commands\" % (reason))\n", 280 | " print(\"Append '%s' to the database server name\" % (\".database.windows.net\"))\n", 281 | " print(\"Prepend '%s' to the database name\" % (params[\"service\"] + \"_\" + params[\"locationName\"] + \"_\"))\n", 282 | " print(data.head(30))\n", 283 | "else:\n", 284 | " print(\"Seems to be the same across DBs for %s commands\" % (reason)) \n", 285 | " \n", 286 | "# what changed? analysis\n", 287 | "print()\n", 288 | "print('What changed? =============================')\n",
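"# looks for config changes (upgrades, mitigations, VIP swaps, feature flags) recorded around the incident window\n",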
289 | "if(len(q_whatChanged_df.index) == 0):\n", 290 | " print(\"...no relevant config changes recorded during this period.\")\n", 291 | "else:\n", 292 | " up_prefix = \"\"\n", 293 | " mit_prefix = \"\"\n", 294 | " vip_prefix = \"\"\n", 295 | " f_prefix = \"\"\n", 296 | " text = \"\"\n", 297 | " for index, row in q_whatChanged_df.iterrows():\n", 298 | " delta = startTime.replace(tzinfo=None) - row.TIMESTAMP.replace(tzinfo=None)\n", 299 | " when = \"before\"\n", 300 | " if delta.total_seconds() < 0:\n", 301 | " when = \"after\"\n", 302 | " delta = row.TIMESTAMP.replace(tzinfo=None) - startTime.replace(tzinfo=None)\n", 303 | " hoursHappened = delta.total_seconds()//3600\n", 304 | " considerTime = hoursHappened <= 1\n", 305 | " def getText(row):\n", 306 | " return \"\"\"%s %s %s (%s days %s hours %s minutes %s the start time) \\n\\n\"\"\" % (row.TIMESTAMP, row.title, row.buildNumber, delta.days, delta.seconds//3600, (delta.seconds % 3600)//60, when)\n", 307 | " if(row.title.lower().find('upgrade') != -1):\n", 308 | " if not up_prefix:\n", 309 | " up_prefix += \"Looks like there's an upgrade...\\n\\n\"\n", 310 | " text += getText(row)\n", 311 | " if(row.title.lower().find('mitigation') != -1):\n", 312 | " if considerTime and not mit_prefix:\n", 313 | " mit_prefix += \"Looks like there are some mitigations by the health agent...\\n\\n\"\n", 314 | " text += getText(row)\n", 315 | " if(row.title.lower().find('vip') != -1):\n", 316 | " if considerTime and not vip_prefix:\n", 317 | " vip_prefix += \"Looks like there is a VIP swap...\\n\\n\"\n", 318 | " text += getText(row) \n", 319 | " if(row.title.lower().find('feature flag') != -1):\n", 320 | " if considerTime and not f_prefix:\n", 321 | " f_prefix += \"Looks like some feature flags were enabled...\\n\\n\"\n", 322 | " text += getText(row)\n", 323 | " if text:\n", 324 | " print(up_prefix + mit_prefix + f_prefix + vip_prefix + text)\n", 325 | " else:\n", 326 | " print(\"...no relevant changes during this period.\")\n", 327 | " \n", 328 | "# active incidents?\n", 329 | "print()\n", 330 | "print('Active incidents? =============================')\n",
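"# lists open IcM incidents in the window, skipping this monitor's own Customer Impact Monitor alert\n",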
331 | "otherIncidentsCount = 0\n", 332 | "for index, row in q_activeIncidentsResultDf.iterrows():\n", 333 | " if(row.Title.find(\"Customer Impact Monitor\") == -1):\n", 334 | " otherIncidentsCount += 1\n", 335 | " \n", 336 | "if(otherIncidentsCount > 0):\n", 337 | " print(\"We found some incidents during this time period; check if they are related...\")\n", 338 | " # styling\n", 339 | " def make_clickable(url, text):\n", 340 | " return '<a href=\"{0}\">{1}</a>'.format(url, text)\n", 341 | "\n", 342 | " newDf = q_activeIncidentsResultDf.assign(URL=[*map(lambda x: make_clickable(\"\"\"https://icm.ad.msft.net/imp/v3/incidents/details/%s/home\"\"\" % (x), \"ICMLink\"), q_activeIncidentsResultDf.IncidentId)])\n", 343 | " print(\"ICM link to copy - \" + \"https://icm.ad.msft.net/imp/v3/incidents/details/INCIDENTID/home\")\n", 344 | " print(newDf[['IncidentId','Severity','Title']])\n", 345 | "else:\n", 346 | " print(\"No related active incidents were found...\") \n", 347 | " \n", 348 | "print()\n", 349 | "print('Dependencies insights =============================')\n", 350 | "r = q_depDf.describe()\n", 351 | "redis = r[\"avg_RedisExecutionTimeInMs\"]\n", 352 | "s2s = r[\"avg_VssClientExecutionTimeInMs\"]\n", 353 | "sql = r[\"avg_SqlExecutionTimeInMs\"]\n", 354 | "queue = r[\"avg_QueueTimeInMs\"]\n", 355 | "maxs = [redis[\"max\"], s2s[\"max\"], sql[\"max\"], queue[\"max\"]]\n", 356 | "means = [redis[\"mean\"], s2s[\"mean\"], sql[\"mean\"], queue[\"mean\"]]\n", 357 | "up = redis[\"75%\"]\n", 358 | "lo = redis[\"25%\"]\n", 359 | "IQR = up - lo\n", 360 | "redisAnom = q_depDf[q_depDf[\"avg_RedisExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_RedisExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp') \n", 361 | "up = s2s[\"75%\"]\n", 362 | "lo = s2s[\"25%\"]\n", 363 | "IQR = up - lo\n", 364 | "s2sAnom = q_depDf[q_depDf[\"avg_VssClientExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_VssClientExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n", 365 | "up = sql[\"75%\"]\n", 366 | "lo = sql[\"25%\"]\n", 367 | "IQR = up - lo\n", 368 | "sqlAnom = q_depDf[q_depDf[\"avg_SqlExecutionTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_SqlExecutionTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n", 369 | "up = queue[\"75%\"]\n", 370 | "lo = queue[\"25%\"]\n", 371 | "IQR = up - lo\n", 372 | "queueAnom = q_depDf[q_depDf[\"avg_QueueTimeInMs\"] > up + 3.5*IQR][[\"PreciseTimeStamp\", \"avg_QueueTimeInMs\"]].sort_values(by='PreciseTimeStamp')\n", 373 | "reasons = [\"Redis\", \"S2S\", \"Sql\", \"RequestsQueued\"]\n", 374 | "anomdata = [redisAnom, s2sAnom, sqlAnom, queueAnom]\n", 375 | "anom = [len(redisAnom), len(s2sAnom), len(sqlAnom), len(queueAnom)]\n", 376 | "top2Anom = np.argsort(anom)[::-1][:2]\n", 377 | "whenMax = [\n", 378 | " redisAnom[redisAnom[\"avg_RedisExecutionTimeInMs\"]==maxs[0]],\n", 379 | " s2sAnom[s2sAnom[\"avg_VssClientExecutionTimeInMs\"]==maxs[1]],\n", 380 | " sqlAnom[sqlAnom[\"avg_SqlExecutionTimeInMs\"]==maxs[2]],\n", 381 | " queueAnom[queueAnom[\"avg_QueueTimeInMs\"]==maxs[3]]\n", 382 | "]\n", 383 | "if len(top2Anom) > 0:\n", 384 | " print(\"Found top anomalies...\")\n", 385 | " for i in top2Anom:\n", 386 | " # Wow, such a pain to convert numpy time to python time...\n", 387 | " dt64 = whenMax[i][\"PreciseTimeStamp\"].values[0]\n", 388 | " ts = (dt64 - np.datetime64('1970-01-01T00:00:00Z')) / np.timedelta64(1, 's')\n", 389 | " whenMaxTime = datetime.utcfromtimestamp(ts)\n", 390 | " delta = startTime.replace(tzinfo=None) - 
whenMaxTime.replace(tzinfo=None)\n", 391 | " when = \"before\"\n", 392 | " if delta.total_seconds() < 0:\n", 393 | " when = \"after\"\n", 394 | " delta = whenMaxTime.replace(tzinfo=None) - startTime.replace(tzinfo=None)\n", 395 | " whenAnom = \"\"\"%s days %s hours %s minutes %s the start time - %s\"\"\" % (delta.days, delta.seconds//3600, (delta.seconds % 3600)//60, when, startTime)\n", 396 | " print(\" %s (#buckets %s) (max %s) (mean %s) (Max is %s)\" % (reasons[i], anom[i], maxs[i], means[i], whenAnom)) \n", 397 | " display(anomdata[i])\n", 398 | "else:\n", 399 | " print(\"No clear dependency anomalies found...\")\n", 400 | "\n", 401 | "# additional insights\n", 402 | "print()\n", 403 | "print('Additional insights =============================')\n", 404 | "w3wpGc = q_gen2Df[ q_gen2Df[\"ProcessName\"] == \"w3wp\" ].sort_values(by=['Number'], ascending=False)\n", 405 | "jaGc = q_gen2Df[ q_gen2Df[\"ProcessName\"] == \"TfsJobAgent\" ].sort_values(by=['Number'], ascending=False)\n", 406 | "now3wpGc = True\n", 407 | "if len(w3wpGc) > 0:\n", 408 | " x = w3wpGc[:2]\n", 409 | " if len(x) == 1:\n", 410 | " xCount = x[\"Count\"].values\n", 411 | " if xCount[0] > 25:\n", 412 | " print(\"INSIGHT: There's a new spike in w3wp gc...\")\n", 413 | " now3wpGc = False\n", 414 | " if len(x) == 2:\n", 415 | " xCount = x[\"Count\"].values\n", 416 | " if xCount[1] > xCount[0]:\n", 417 | " print(\"INSIGHT: There's a spike in w3wp gc...\")\n", 418 | " now3wpGc = False\n", 419 | "if now3wpGc:\n", 420 | " print(\"INSIGHT: No gc spikes found from w3wp...\")\n", 421 | " \n", 422 | "nojaGc = True \n", 423 | "if len(jaGc) > 0:\n", 424 | " x = jaGc[:2]\n", 425 | " if len(x) == 1:\n", 426 | " xCount = x[\"Count\"].values\n", 427 | " if xCount[0] > 25:\n", 428 | " print(\"INSIGHT: There's a new spike in ja gc...\")\n", 429 | " nojaGc = False\n", 430 | " if len(x) == 2:\n", 431 | " xCount = x[\"Count\"].values\n", 432 | " if xCount[1] > xCount[0] and xCount[1] > 25:\n", 433 | " print(\"INSIGHT: There's a spike in ja gc...\") \n", 434 | " nojaGc = False\n", 435 | "if nojaGc:\n", 436 | " print(\"INSIGHT: No gc spikes found from TfsJobAgent...\") " 437 | ] 438 | } 439 | ], 440 | "metadata": { 441 | "kernel_info": { 442 | "name": "python3" 443 | }, 444 | "kernelspec": { 445 | "display_name": "Python 3", 446 | "language": "python", 447 | "name": "python3" 448 | }, 449 | "language_info": { 450 | "codemirror_mode": { 451 | "name": "ipython", 452 | "version": 3 453 | }, 454 | "file_extension": ".py", 455 | "mimetype": "text/x-python", 456 | "name": "python", 457 | "nbconvert_exporter": "python", 458 | "pygments_lexer": "ipython3", 459 | "version": "3.7.4" 460 | }, 461 | "nteract": { 462 | "version": "0.15.0" 463 | } 464 | }, 465 | "nbformat": 4, 466 | "nbformat_minor": 0 467 | } 468 | -------------------------------------------------------------------------------- /ja.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# JobAgent Investigation\n", 8 | "1. Run all cells.\n", 9 | "1. View report at the bottom."
62 | "\n", 63 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 64 | "# https://ipython.readthedocs.io/en/stable/interactive/magics.html\n", 65 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 66 | "\n", 67 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 68 | "import pandas as pd\n", 69 | "pd.options.display.html.table_schema = True\n", 70 | "from pandas import Series, DataFrame\n", 71 | "from datetime import datetime, timedelta, timezone\n", 72 | "from urllib.parse import urlencode, quote_plus\n", 73 | "from requests.utils import requote_uri\n", 74 | "import time\n", 75 | "import numpy as np\n", 76 | "from matplotlib import pyplot as plt\n", 77 | "from nimport.utils import tokenize, open_nb\n", 78 | "import json\n", 79 | "import os\n", 80 | "import calendar as cal\n", 81 | "import concurrent.futures\n", 82 | "from azure.kusto.notebooks import utils as akn" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": { 89 | "inputHidden": false, 90 | "outputHidden": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "params = {\n", 95 | " \"su\": su,\n", 96 | " \"start\": start,\n", 97 | " \"end\": end,\n", 98 | " \"url\": url,\n", 99 | " \"baseUrl\": baseUrl,\n", 100 | " \"service\": service\n", 101 | "}\n", 102 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 103 | "queryPath = os.path.join(root, 'queries') " 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "inputHidden": false, 111 | "outputHidden": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "# authenticate kusto client\n", 116 | "# you will need to copy the token into a browser window for AAD auth. 
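\n", "# Under the hood this is AAD device-code auth; a rough equivalent with the raw\n", "# SDK (a sketch -- assumes the azure-kusto-data package) would be:\n", "#   from azure.kusto.data.request import KustoClient, KustoConnectionStringBuilder\n", "#   kcsb = KustoConnectionStringBuilder.with_aad_device_authentication('https://vso.kusto.windows.net')\n", "#   client = KustoClient(kcsb)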
\n", 117 | "client = akn.get_client('https://vso.kusto.windows.net')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "inputHidden": false, 125 | "outputHidden": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "q_slow = os.path.join(queryPath, \"SlowActivities.csl\")\n", 130 | "q_sqlSlow = os.path.join(queryPath, \"SlowSql.csl\")\n", 131 | "\n", 132 | "jaPath = os.path.join(queryPath, 'ja')\n", 133 | "q_jobSql = os.path.join(jaPath, \"JASqlTime.csl\")\n", 134 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 135 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_slow, params)\n", 136 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_sqlSlow, params)\n", 137 | " p3 = executor.submit(akn.execute_file, client, 'VSO', q_jobSql, params)\n", 138 | "\n", 139 | "q_slowResult_df = akn.to_dataframe_from_future(p1)\n", 140 | "\n", 141 | "q_sqlSlowResult_df = akn.to_dataframe_from_future(p2)\n", 142 | "\n", 143 | "q_jobSqlResult_df = akn.to_dataframe_from_future(p3) \n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "inputHidden": false, 151 | "outputHidden": false 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "print('=' * 50)\n", 156 | "print('Report!')\n", 157 | "print('=' * 50, '\\n\\n')\n", 158 | "\n", 159 | "# jarvis params\n", 160 | "jarvisParams = {\n", 161 | " 'su': su, \n", 162 | " 'start': akn.get_time(start, -10), \n", 163 | " 'end': akn.get_time(end, 10), \n", 164 | " 'service': service,\n", 165 | " 'location': locationName,\n", 166 | " 'account': mdmAccount\n", 167 | "}\n", 168 | "\n", 169 | "jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/Compute-JA\"\"\" \\\n", 170 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 171 | " \"\"\"{\"query\":\"//*[id='RoleInstance']\",\"key\":\"value\",\"replacement\":\"\"},\"\"\" \\\n", 172 | " \"\"\"{\"query\":\"//*[id='LocationName']\",\"key\":\"value\",\"replacement\":\"%(location)s\"},\"\"\" \\\n", 173 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"namespace\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 174 | " \"\"\"{\"query\":\"//dataSources\",\"key\":\"account\",\"replacement\":\"%(account)s\"},\"\"\" \\\n", 175 | " \"\"\"{\"query\":\"//*[id='ApplicationEndpoint']\",\"key\":\"regex\",\"replacement\":\"*%(location)s*\"},\"\"\" \\\n", 176 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"}]\"\"\" \\\n", 177 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams\n", 178 | "print('Jarvis dashboard link for job agents:\\n', requote_uri(jaJarvisLink), '\\n')\n", 179 | "\n", 180 | "print('Top slow activities:')\n", 181 | "display(q_slowResult_df)\n", 182 | "\n", 183 | "print('Top sql slow activities:')\n", 184 | "display(q_sqlSlowResult_df)\n", 185 | "\n", 186 | "print('Top sql execution times from jobs:')\n", 187 | "display(q_jobSqlResult_df)" 188 | ] 189 | } 190 | ], 191 | "metadata": { 192 | "kernel_info": { 193 | "name": "python3" 194 | }, 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | 
"nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.7.4" 211 | }, 212 | "nteract": { 213 | "version": "0.15.0" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 0 218 | } 219 | -------------------------------------------------------------------------------- /params.json: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /queries/ActiveIncidents.csl: -------------------------------------------------------------------------------- 1 | //%kql AzureDataExplorer://tenant="Microsoft.com";code;cluster='Icmcluster';database='IcMDataWarehouse' 2 | let scaleUnit = "{su}"; 3 | let startTime = todatetime("{start}") - 5hr; 4 | let endTime = todatetime("{end}") + 5hr; 5 | //38 is the VSTS Tenant ID in IcM 6 | getincidents(38,startTime, endTime) 7 | //| where Severity < 3 and IsOutage == 1 and isnull(ParentIncidentId) 8 | | project IncidentId, Severity, CreateDate, Title, Status, OwningTeamName 9 | | where Title contains scaleUnit 10 | | order by CreateDate desc -------------------------------------------------------------------------------- /queries/HealthAgentActions.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | let service = "{service}"; 5 | let threshold = 10; 6 | let step = totimespan("1m"); 7 | let window = totimespan("5m"); 8 | VssHealthAgentActions 9 | | where PreciseTimeStamp between (startTime .. endTime) 10 | | where Service =~ service 11 | | where ScaleUnit =~ scaleUnit 12 | | project PreciseTimeStamp, RoleInstance, MitigationName, Directory, ActionName -------------------------------------------------------------------------------- /queries/LocationName.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let service = "{service}"; 3 | ActivityLog 4 | | where ScaleUnit == scaleUnit 5 | | where Service =~ service 6 | | project Tenant 7 | | take 1 -------------------------------------------------------------------------------- /queries/MDMAccount.csl: -------------------------------------------------------------------------------- 1 | let service = "{service}"; 2 | CosmosMdmMetrics 3 | | where Service == service 4 | | project monitoringAccount 5 | | take 1 -------------------------------------------------------------------------------- /queries/SlowActivities.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let interval = 1m; 6 | ProductTrace 7 | | where PreciseTimeStamp between (startTime .. 
endTime) 8 | | where Service =~ service 9 | | where ScaleUnit =~ scaleUnit 10 | | where Role == "JobAgent" 11 | | where Tracepoint == 36109 //slow activities from request context 12 | | extend K = strcat(RoleInstance, "-", DeploymentId, "-", DeploymentSlot, "-", Method) 13 | | summarize count() by K 14 | | top 25 by count_ desc 15 | //| render timechart -------------------------------------------------------------------------------- /queries/SlowSql.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let interval = 1m; 6 | ProductTrace 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where Service =~ service 9 | | where ScaleUnit =~ scaleUnit 10 | | where Role == "JobAgent" 11 | | where Tracepoint == 64038 //slow sql queries, i.e. queries taking longer than usual to execute 12 | | extend K = strcat(RoleInstance, "-", DeploymentId, "-", DeploymentSlot, "-", Method) 13 | | summarize count() by K 14 | | top 25 by count_ desc 15 | //| render timechart -------------------------------------------------------------------------------- /queries/WhatChanged.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | let threshold = 10; 5 | let step = totimespan("1m"); 6 | let window = totimespan("5m"); 7 | let tenant = "{locationName}"; 8 | WhatChangedRange(tenant=tenant, startTime, endTime-startTime) 9 | | extend Name=strcat(['title'], "@", tostring(TIMESTAMP)) 10 | | order by start asc -------------------------------------------------------------------------------- /queries/delays/Abusers.csl: -------------------------------------------------------------------------------- 1 | TraceLightRailLog 2 | | where ServiceName =~ 'mms' 3 | | where Command == 'Stop-ServiceHost' 4 | | where Message startswith 'HostId = ' 5 | | extend HostId = tostring(split(Message, ' ')[2]) 6 | | summarize by HostId 7 | | union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId) 8 | | distinct HostId -------------------------------------------------------------------------------- /queries/delays/AffectedAccounts.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let hubName = "{hub}"; 6 | let threshold = 10; 7 | let step = totimespan("1m"); 8 | let window = totimespan("5m"); 9 | let affectedAccounts = 10 | ProductTrace 11 | | where PreciseTimeStamp between (startTime .. 
endTime) 12 | | where Service =~ service 13 | | where ScaleUnit =~ scaleUnit 14 | | where Tracepoint == 15010000 15 | | where Layer matches regex strcat(hubName, ".*_ActivityDispatcher") 16 | | extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan)) 17 | | extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan)) 18 | | extend MessageDelay = DbDelay + BufferDelay 19 | | summarize avg(MessageDelay) by ServiceHost, Layer, bin(PreciseTimeStamp, step) 20 | | extend Threshold = strcat(threshold, 's') 21 | | where avg_MessageDelay > totimespan(Threshold); 22 | ServiceHostAggregated() 23 | | join (affectedAccounts) on $left.HostId == $right.ServiceHost 24 | | where Service =~ service 25 | | where HostType == 4 26 | | project Name, HostId, Layer, PreciseTimeStamp, MessageDelayInSeconds = avg_MessageDelay / 1s, DatabaseName, Threshold 27 | | order by PreciseTimeStamp desc 28 | // | order by MessageDelayInSeconds desc -------------------------------------------------------------------------------- /queries/delays/DelayedAccountsAreAbusers.csl: -------------------------------------------------------------------------------- 1 | // Impacted accounts in time window, and are they known abusers 2 | // 3 | let startTime = todatetime("{start}") - 15m; 4 | let endTime = todatetime("{end}") + 15m; 5 | let service = "{service}"; 6 | let hubName = "{hub}"; 7 | let scaleUnit = "{su}"; 8 | let Abusers = TraceLightRailLog 9 | | where ServiceName =~ 'mms' 10 | | where Command == 'Stop-ServiceHost' 11 | | where Message startswith 'HostId = ' 12 | | extend HostId = tostring(split(Message, ' ')[2]) 13 | | summarize by HostId 14 | | union (ServiceHostAggregated | where StatusReason in ("abuse", "Abuse") | summarize by HostId) 15 | | distinct HostId; 16 | let ActivityDispatcherDelays = ProductTrace 17 | | where PreciseTimeStamp between (startTime .. endTime) 18 | | where Service =~ service 19 | | where ScaleUnit =~ scaleUnit 20 | | where Tracepoint == 15010000 21 | | where Layer matches regex strcat(hubName, ".*_ActivityDispatcher") 22 | | extend DbDelay = extract('read from db delay (.*),', 1, Message, typeof(timespan)) 23 | | extend BufferDelay = extract('buffer read delay (.*)', 1, Message, typeof(timespan)) 24 | | extend MessageDelayInSeconds = toint((DbDelay + BufferDelay) / 1s) 25 | | join kind=leftouter (ServiceHostAggregated() | where Service =~ service | where HostType == 4 | summarize by HostId, Name) 26 | on $left.ServiceHost == $right.HostId; 27 | // table 28 | // 29 | ActivityDispatcherDelays 30 | | summarize AvgMessageDelay=round(avg(MessageDelayInSeconds)) by Name, HostId 31 | | extend Abuser = iff(HostId in (Abusers), "yep", "") 32 | | order by AvgMessageDelay desc -------------------------------------------------------------------------------- /queries/delays/Load.csl: -------------------------------------------------------------------------------- 1 | // orchestrator kpi's 2 | let scaleUnit = "{su}"; 3 | let startTime = todatetime("{start}") - 15min; 4 | let endTime = todatetime("{end}") + 15min; 5 | let service = "{service}"; 6 | let hubName = "{hub}"; 7 | let interval = 1m; 8 | KPI 9 | | where PreciseTimeStamp between (startTime .. 
endTime) 10 | | where Service =~ service 11 | | where ScaleUnit =~ scaleUnit 12 | | where Metrics contains "DTPlan" or Metrics contains "DTAgent" or Metrics contains "DTJob" 13 | | extend DataObj = parsejson(Metrics) 14 | | extend MetricsObjArr = parsejson(DataObj.metrics) 15 | | extend MetricsObj = MetricsObjArr[0] 16 | | extend HostId = tostring(DataObj.hostId) 17 | | extend Name = tostring(MetricsObj.name) 18 | | extend DisplayName = MetricsObj.displayName 19 | | extend Value = todouble(MetricsObj.value) 20 | | project PreciseTimeStamp, DataObj, Name, DisplayName, Value, MetricsObj 21 | | summarize sum(Value) by Name, bin(PreciseTimeStamp, interval) 22 | | render timechart -------------------------------------------------------------------------------- /queries/delays/LoadPerHost.csl: -------------------------------------------------------------------------------- 1 | // orchestrator kpi's 2 | let scaleUnit = "{su}"; 3 | let startTime = todatetime("{start}") - 15min; 4 | let endTime = todatetime("{end}") + 15min; 5 | let service = "{service}"; 6 | let hubName = "{hub}"; 7 | let interval = 1m; 8 | let hostId = "{hostId}"; 9 | KPI 10 | | where PreciseTimeStamp between (startTime .. endTime) 11 | | where Service =~ service 12 | | where ScaleUnit =~ scaleUnit 13 | | where Metrics contains "DTPlan" or Metrics contains "DTAgent" or Metrics contains "DTJob" 14 | | extend DataObj = parsejson(Metrics) 15 | | extend MetricsObjArr = parsejson(DataObj.metrics) 16 | | extend MetricsObj = MetricsObjArr[0] 17 | | extend HostId = tostring(DataObj.hostId) 18 | | extend Name = tostring(MetricsObj.name) 19 | | extend DisplayName = MetricsObj.displayName 20 | | extend Value = todouble(MetricsObj.value) 21 | | where HostId startswith hostId 22 | | project PreciseTimeStamp, DataObj, Name, DisplayName, Value, MetricsObj 23 | | summarize sum(Value) by Name, bin(PreciseTimeStamp, interval) 24 | | render timechart -------------------------------------------------------------------------------- /queries/delays/OrchestrationLogSpike.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 8hr; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let hubName = "{hub}"; 6 | let threshold = 10; 7 | let step = totimespan("1m"); 8 | let window = totimespan("5m"); 9 | let interval = 1m; 10 | let hostId = "{hostId}"; 11 | OrchestrationLog 12 | | where PreciseTimeStamp between (startTime .. 
endTime) 13 | | where Service =~ service 14 | | where ScaleUnit =~ scaleUnit 15 | | where HostId startswith hostId 16 | | where Command contains "CIPlatform" 17 | | summarize C=count() by Command, HostId, bin(PreciseTimeStamp, 15min) 18 | | order by C desc -------------------------------------------------------------------------------- /queries/delays/OrchestrationLogSpikeTip.csl: -------------------------------------------------------------------------------- 1 | // Use this to sample a few OrchestrationIds 2 | // Run those OrchestrationIds through https://github.com/microsoft/devops-pipelines/blob/master/queries/run/WhatHappened.csl 3 | // Or https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/run.ipynb?planId=e75d6056-bfba-4906-b454-02ba3b7880e7 (change planId) 4 | let scaleUnit = "{su}"; 5 | let startTime = todatetime("{start}") - 15min; 6 | let endTime = todatetime("{end}") + 15min; 7 | let service = "{service}"; 8 | let hubName = "{hub}"; 9 | let command = "{command}"; 10 | let threshold = 10; 11 | let step = totimespan("1m"); 12 | let window = totimespan("5m"); 13 | let interval = 1m; 14 | let hostId = "{hostId}"; 15 | OrchestrationLog 16 | | where PreciseTimeStamp between (startTime .. endTime) 17 | | where Service =~ service 18 | | where ScaleUnit =~ scaleUnit 19 | | where HostId startswith hostId 20 | | where Command contains command 21 | | project PreciseTimeStamp, OrchestrationId, ExceptionMessage, Feature, HostId 22 | | top 1000 by PreciseTimeStamp desc 23 | -------------------------------------------------------------------------------- /queries/delays/Parallelism.csl: -------------------------------------------------------------------------------- 1 | // orchestrator kpi's 2 | let hostId = "{hostId}"; 3 | let startTime = todatetime("{start}") - 15min; 4 | let endTime = todatetime("{end}") + 15min; 5 | let sampleInterval = 1m; 6 | range sampleTime from startTime to endTime step sampleInterval 7 | | extend dummyKey=1 8 | | join kind=inner ( // cartesian product, really 9 | AgentPoolRequestHistory 10 | | where HostId == hostId 11 | | where StartTime != '1601-01-01T00:00:00Z' // Exclude the jobs that were never started 12 | | where StartTime < endTime and FinishTime > startTime // Exclude upfront the jobs that weren't running during our window. 13 | | where bin(StartTime, 1m) != bin(FinishTime, 1m) // Exclude upfront jobs that started and completed within the same minute interval. 14 | | extend dummyKey=1 15 | ) on dummyKey 16 | | where StartTime < sampleTime and FinishTime > sampleTime 17 | | make-series 18 | C=count() 19 | on sampleTime 20 | in range(startTime, endTime, sampleInterval) 21 | | extend HostId = hostId -------------------------------------------------------------------------------- /queries/delays/WhatDelayed.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let hubName = "{hub}"; 6 | let interval = 1m; 7 | CounterEvent 8 | | where PreciseTimeStamp between (startTime .. 
endTime) 9 | | where Service =~ service 10 | | where ScaleUnit =~ scaleUnit 11 | | where Role == 'JobAgent' 12 | | where CounterName startswith strcat("\\TFS Services:Orchestration(", hubName) or CounterName startswith "\\TFS Services:JobService(_Total)" 13 | | extend NameOnly = extract("\\)\\\\(.*)$", 1, CounterName, typeof(string)) 14 | | where NameOnly in ( 15 | 'Total Pending Jobs', 16 | 'Pending Job Age', 17 | 'Average Activity Message Delay', 18 | 'Average Activity Job Delay', 19 | 'Average Activity Execution Time' 20 | ) 21 | | extend Pivot = replace("(TFS Services:)|(Orchestration\\(Build-)","", CounterName) 22 | | summarize avg(CounterValue) by Pivot, bin(PreciseTimeStamp, interval) 23 | | render timechart -------------------------------------------------------------------------------- /queries/impact/CommandsAT.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let service = "{service}"; 5 | ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | summarize Frequency = count() by RoleInstance, Reason 12 | | order by Frequency desc -------------------------------------------------------------------------------- /queries/impact/CommandsDb.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let service = "{service}"; 5 | ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. 
endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | join ( 12 | ServiceHostAggregated | where Service == service and HostType == 4 and ScaleUnit == scaleUnit 13 | ) on HostId 14 | | extend Server = replace(".database.windows.net", "", ServerName) 15 | | extend Database = replace(strcat("{service}", "_", "{locationName}", "_"), "", tolower(DatabaseName)) 16 | | summarize Frequency = count() by Server, Database, Reason 17 | | order by Frequency desc -------------------------------------------------------------------------------- /queries/impact/CommandsReason.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let service = "{service}"; 5 | ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | summarize Frequency = count() by Reason 12 | | order by Frequency desc -------------------------------------------------------------------------------- /queries/impact/Dependencies.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 8hr; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | func_ActivityLog 6 | | where Service =~ service and ScaleUnit =~ scaleUnit 7 | | where PreciseTimeStamp between (startTime .. endTime) 8 | | where ActivityStatus > 0 9 | | extend VSID = iff(isnotempty(AnonymousIdentifier), AnonymousIdentifier, VSID) 10 | | extend Reason = iff(ActivityStatus == 1, "failed", "slow") 11 | | summarize avg(RedisExecutionTimeInMs), avg(VssClientExecutionTimeInMs), avg(SqlExecutionTimeInMs), avg(QueueTimeInMs) by bin(PreciseTimeStamp, 5min) -------------------------------------------------------------------------------- /queries/impact/Gen2GCSpikes.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 8hr; 3 | let endTime = todatetime("{end}") + 15min; 4 | let diff = endTime - startTime; 5 | let service = "{service}"; 6 | let interval = 1m; 7 | VssHealthAgentGarbageCollection 8 | | where PreciseTimeStamp >= startTime and PreciseTimeStamp <= endTime 9 | | where Service =~ service and ScaleUnit =~ scaleUnit 10 | | where Environment == "PROD" 11 | | where DeploymentSlot == "Production" 12 | | where Generation == 2 13 | | where ProcessName in ("w3wp", "TfsJobAgent") 14 | | make-series Count = count() on PreciseTimeStamp in range(startTime, endTime, diff/4) by ProcessName 15 | | mv-expand PreciseTimeStamp, Count 16 | | where Count > 0 17 | | extend Number = toint(Count) 18 | | extend Time = todatetime(PreciseTimeStamp) 19 | | order by Time desc -------------------------------------------------------------------------------- /queries/ja/JASqlTime.csl: -------------------------------------------------------------------------------- 1 | let scaleUnit = "{su}"; 2 | let startTime = todatetime("{start}") - 15min; 3 | let endTime = todatetime("{end}") + 15min; 4 | let service = "{service}"; 5 | let interval = 1m; 6 | JobHistory 7 | | where PreciseTimeStamp between (startTime .. 
endTime) 8 | | where Service =~ service 9 | | where ScaleUnit =~ scaleUnit 10 | | summarize sum(SqlExecutionTime) by Plugin, JobName, JobSource 11 | | sort by sum_SqlExecutionTime desc 12 | | limit 25 -------------------------------------------------------------------------------- /queries/run/PlanInfo.csl: -------------------------------------------------------------------------------- 1 | // Pull a useful collection of data about a specific plan 2 | // 3 | let oid = {OrchestrationId}; 4 | let pid = substring(oid, 0, 36); 5 | let HostNames = ServiceHostAggregated 6 | | where Service in ("tfs", "pipelines") 7 | | summarize by HostId, Name; 8 | let RingMap = union Ring_Mapping("tfs"), Ring_Mapping("pipelines"); 9 | OrchestrationPlanContext 10 | | where PlanId == pid 11 | | summarize RoleInstances=make_set(RoleInstance), JobOrchestrationIds=make_set(OrchestrationId) by HostId, ScaleUnit, Region, Tenant, ProjectName, PlanType, DefinitionId, DefinitionName 12 | | join kind=leftouter HostNames on HostId | project-away HostId1 13 | | join kind=leftouter RingMap on ScaleUnit| project-away ScaleUnit1 -------------------------------------------------------------------------------- /queries/run/WhatHappened.csl: -------------------------------------------------------------------------------- 1 | // OK SO WHAT HAPPENED: an end-to-end analysis based on DistributedTask OrchestrationId 2 | // zacox@microsoft.com 3 | // 4 | let oid = {OrchestrationId}; 5 | let planId = substring(oid, 0, 36); 6 | let hostId = tostring(toscalar(OrchestrationPlanContext | where PlanId == planId | summarize by HostId)); 7 | let hostName = tostring(toscalar(ServiceHostAggregated() | where HostId == hostId | take 1 | project Name)); 8 | find in (ActivityLog, AgentPoolRequestHistory, HttpOutgoingRequests, OrchestrationLog, OrchestrationPlanContext, ProductTrace) 9 | where OrchestrationId startswith planId 10 | project PreciseTimeStamp, 11 | OrchestrationId, 12 | Command, 13 | Message 14 | // Tracepoint, 15 | // UrlPath, 16 | // ResponseCode, 17 | // Level, // for coloring :) 18 | // ActivityId, 19 | // ExceptionMessage 20 | // | extend HostName = hostName 21 | | order by PreciseTimeStamp asc -------------------------------------------------------------------------------- /queries/sla/SLADurationAnalysis.csl: -------------------------------------------------------------------------------- 1 | // Identify all orchestrations that are currently out of SLA 2 | // zacox@microsoft.com 3 | // 4 | let triggerTime = {TriggerTime}; // now(), datetime(2019-07-07 20:20:20Z) 5 | let service = {Service}; // "tfs", "releasemanagement"; 6 | let scaleUnit = {ScaleUnit}; // "tfs-wus-0" 7 | let lookback = {Lookback}; // how often does the query run? 8 | // 9 | let startTime = triggerTime - 15m; // allow for kusto ingestion 10 | let maxPhaseDuration = 2h; // maximum amount of time before treating the phase as missing telemetry 11 | let searchStartTime = startTime - lookback; 12 | let slaLimit = 5m; 13 | let prodTracePartitions = 12; 14 | let sev2Threshold = 25; // how many need to break SLA per scale unit to alert? 
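// timeline (sketch): searchStartTime ..(lookback).. startTime ..(15m ingestion headroom).. triggerTime // i.e. completed plans are picked up if they finished inside [startTime - lookback .. startTime]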
15 | // 16 | // 17 | // Consider running plans, and any that completed since the last trigger 18 | let StartedPlans = ProductTrace 19 | | where PreciseTimeStamp < startTime 20 | and Tracepoint == 10015547 // TaskHub tracepoint 21 | and (isempty(service) or Service == service) 22 | | project 23 | PlanId = OrchestrationId, 24 | StartTime = PreciseTimeStamp; 25 | let FinishedPlans = ProductTrace 26 | | where PreciseTimeStamp < startTime 27 | and Tracepoint == 0 // TODO: add a tracepoint :( 28 | and (isempty(service) or Service == service) 29 | and Message startswith "Completed orchestration with result" 30 | | project 31 | PlanId = OrchestrationId, 32 | FinishTime = PreciseTimeStamp; 33 | let RecentlyCompletedPlans = FinishedPlans 34 | | where FinishTime > searchStartTime 35 | | join hint.strategy=shuffle hint.num_partitions = prodTracePartitions 36 | StartedPlans on PlanId 37 | | project 38 | PlanId, 39 | Completed = true, 40 | PlanDuration = FinishTime - StartTime; 41 | let RunningPlans = StartedPlans 42 | | join hint.strategy=shuffle hint.num_partitions = prodTracePartitions 43 | kind=leftanti 44 | FinishedPlans on PlanId 45 | | project 46 | PlanId, 47 | Completed = false, 48 | PlanDuration = startTime - StartTime; 49 | let PlansToConsider = union RecentlyCompletedPlans //, RunningPlans // zacox: ignore running plans -- telemetry is too flakey 50 | | where PlanDuration > slaLimit; // only consider plans that could be out of range 51 | let PlanIds = PlansToConsider | project PlanId; 52 | // PlansToConsider | summarize hint.strategy = shuffle count() by Completed 53 | // 54 | // 55 | // Only phases with execution time limits are considered to contribute to the SLA 56 | let OrchestrationLogSubset = OrchestrationLog 57 | | where PreciseTimeStamp < startTime 58 | | extend PlanId = substring(OrchestrationId, 0, 36) 59 | | where PlanId in (PlanIds); // only consider plans in our subset 60 | let OrchestrationLogWithSLA = PhaseExecutionTimeOverrides() 61 | | where Application == "Pipelines" 62 | | join 63 | kind=rightouter 64 | OrchestrationLogSubset 65 | on Application, Feature, Command 66 | | extend IsSLA = isnotnull(ExecutionTimeThresholdOverrideInMicroseconds) 67 | | project PreciseTimeStamp, Service, Region, ScaleUnit, Application, Feature, Command, ExecutionTimeThreshold, 68 | OrchestrationId, StartTime, EndTime, IsExceptionExpected, ExceptionMessage, ExceptionType, 69 | PlanId, IsSLA; 70 | // 71 | // 72 | // big ol' map/reduce 73 | let NullTime = datetime(1601-01-01 00:00:00.0000000); 74 | let IsValidDate = (dt:datetime) { isnotnull(dt) and dt != NullTime }; 75 | let parallelism = 8; 76 | let PhaseData = range p from 1 to parallelism step 1 | partition by p 77 | { 78 | OrchestrationLogWithSLA 79 | | where hash(PlanId, parallelism) == toscalar(p) 80 | | extend EventTime = max_of(StartTime, EndTime) 81 | | order by OrchestrationId, EndTime asc, StartTime asc 82 | // 83 | // generate logical variables 84 | | extend IsNextSameOrchestration = (next(OrchestrationId) == OrchestrationId) 85 | | extend NextEndTime = next(EndTime) 86 | | extend IsEndPhase = IsValidDate(EndTime) 87 | | extend IsLastPhase = IsNextSameOrchestration and IsValidDate(NextEndTime) 88 | | extend NextStartTime = next(StartTime) 89 | // 90 | // generate useful variables 91 | | extend PhaseStartTime = EventTime 92 | | extend PhaseEndTime = case( 93 | IsEndPhase, EndTime, 94 | IsLastPhase, NextEndTime, 95 | IsNextSameOrchestration, NextStartTime, // marks the end of an Orchestration 96 | // min_of(startTime, PhaseStartTime + 
MaxPhaseDuration)) // original 97 | startTime) // currently running or dropped-telemetry phases are set to 0s 98 | | extend PhaseEndTime = iff(PhaseEndTime - PhaseStartTime > maxPhaseDuration, PhaseStartTime, PhaseEndTime) 99 | | extend OidComponents = split(OrchestrationId, ".") 100 | | extend OidLookback = iff(isnull(tolong(OidComponents[-1])), -2, -3) // ignore attempt numbers 101 | | extend ParentOrchestrationId = strcat_array(array_slice(OidComponents, 1, OidLookback), ".") 102 | | extend SLADuration = iff(IsSLA, PhaseEndTime - PhaseStartTime, 0s) 103 | | project Service, Region, ScaleUnit, PhaseStartTime, SLADuration, PhaseEndTime, 104 | PlanId, OrchestrationId, ParentOrchestrationId 105 | }; 106 | // 107 | // 108 | // Collect precise phase duration data by plan id. 109 | let PlanData = PhaseData 110 | | summarize hint.strategy=shuffle 111 | ShortOrchestrationIds = make_list(substring(OrchestrationId, 37)), // remove the guid 112 | PhaseEndTimes = make_list(PhaseEndTime), 113 | SLADurations = make_list(SLADuration) 114 | by PlanId; 115 | // 116 | // 117 | // Compute final SLA violation table 118 | let Result = PhaseData 119 | | summarize hint.strategy=shuffle 120 | SLADuration = sum(SLADuration), // find total 121 | OrchestrationStartTime = min(PhaseStartTime) 122 | by Service, Region, ScaleUnit, OrchestrationId, ParentOrchestrationId, PlanId 123 | // 124 | // OK: join with table of all phase info by plan id, and sum up all the contributing ancestral SLA durations 125 | | lookup (PlanData) on PlanId 126 | | mv-apply 127 | AncestorPhaseEndTime = PhaseEndTimes to typeof(datetime), 128 | AncestorPhaseDuration = SLADurations to typeof(timespan), 129 | Soid = ShortOrchestrationIds to typeof(string) 130 | on ( 131 | where AncestorPhaseEndTime <= OrchestrationStartTime // only phases that completed before the first phase of this orchestration 132 | | where ParentOrchestrationId startswith Soid // only ancestor orchestrations 133 | | summarize AncestorSLADuration = sum(AncestorPhaseDuration) 134 | ) 135 | | project-away PhaseEndTimes, SLADurations, ShortOrchestrationIds 136 | // 137 | // 138 | // SLA Enforcement 139 | | where SLADuration + AncestorSLADuration between(slaLimit..maxPhaseDuration) 140 | | lookup (PlansToConsider) on PlanId 141 | | project Service, Region, ScaleUnit, PlanId, OrchestrationId, 142 | PlanDuration = PlanDuration / 1s, 143 | TotalSLADuration = (SLADuration + AncestorSLADuration) / 1s 144 | ; 145 | // 146 | // 147 | Result 148 | // 149 | // Kalypso monitor mode: only take any if there are more than a certain number of problems 150 | // | summarize hint.strategy=shuffle 151 | // NumberOfOrchestrations=count(), 152 | // NumberOfPlans = dcount(PlanId), 153 | // AvgSLADuration = avg(TotalSLADuration), 154 | // MaxSLADuration = max(TotalSLADuration), 155 | // MinSLADuration = min(TotalSLADuration) 156 | // by Service, Region, ScaleUnit 157 | // | where NumberOfOrchestrations > sev2Threshold 158 | // | project TriggerTime = triggerTime, Lookback=lookback, Service, Region, ScaleUnit, 159 | // MinSLADuration, AvgSLADuration, MaxSLADuration, NumberOfOrchestrations, NumberOfPlans 160 | -------------------------------------------------------------------------------- /queries/sla/SLAVisualization.csl: -------------------------------------------------------------------------------- 1 | // CIAO phase performance analysis 2 | // zacox@microsoft.com 3 | // 4 | let oid = {OrchestrationId}; // "7d1c09de-35e8-44ed-8720-279b117caf1d.job_1.__default.135"; 5 | let su = {ScaleUnit}; 6 | let currentTime = now() - 15m; // always use the most up-to-date data 7 | let oidComponents = split(oid, "."); 8 | let oidLookback = iff(isnull(tolong(oidComponents[-1])), -2, -3); // ignore attempt numbers
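 // e.g. (sketch) for the oid in the comment above: split gives ["7d1c09de-...", "job_1", "__default", "135"]; the trailing "135" parses as a number, so oidLookback = -3 and parentOid = "7d1c09de-....job_1" -- without an attempt suffix, -2 drops just the ".__default" scope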
9 | let parentOid = strcat_array(array_slice(oidComponents, 0, oidLookback), "."); 10 | let planid = substring(oid, 0, 36); 11 | let useSev2Thresholds = true; 12 | let maxPhaseDuration = 2h; 13 | let definitionName = toscalar(OrchestrationPlanContext 14 | | where OrchestrationId startswith planid 15 | | project DefinitionName | take 1); 16 | // 17 | // 18 | // compute phase performance 19 | let IsValidDate = (dt:datetime) { isnotnull(dt) and dt != datetime(1601-01-01 00:00:00.0000000) }; 20 | OrchestrationLog 21 | | where isempty(su) or ScaleUnit == su 22 | | where Application == "Pipelines" and Feature == "Build" 23 | | where oid == OrchestrationId or (isnotempty(OrchestrationId) and parentOid startswith OrchestrationId) 24 | | lookup 25 | (PhaseExecutionTimeOverrides() | where Application == "Pipelines" and Feature == "Build") 26 | on Application, Feature, Command 27 | | extend ExecutionTimeThreshold = iff( 28 | isnotnull(ExecutionTimeThresholdOverrideInMicroseconds), 29 | ExecutionTimeThresholdOverrideInMicroseconds * 1microsecond, 30 | max_of(0s, ExecutionTimeThreshold * 1microsecond)) 31 | | project OrchestrationId, Command, ExecutionTimeThreshold, StartTime, EndTime, ScaleUnit, Region 32 | // 33 | // compute phase durations 34 | | order by OrchestrationId, EndTime asc, StartTime asc // cluster by orchestration id 35 | | extend IsNextSameOrchestration = (next(OrchestrationId) == OrchestrationId) 36 | | extend NextEndTime = next(EndTime) 37 | | extend IsEndPhase = IsValidDate(EndTime) 38 | | extend IsLastPhase = IsNextSameOrchestration and IsValidDate(NextEndTime) 39 | | extend IsPrevSameOrchestration = (prev(OrchestrationId) == OrchestrationId) 40 | | extend NextStartTime = next(StartTime) 41 | | extend PhaseStartTime = max_of(StartTime, EndTime) 42 | | extend PhaseEndTime = case( 43 | IsEndPhase, EndTime, 44 | IsLastPhase, NextEndTime, 45 | IsNextSameOrchestration, NextStartTime, // Ending of Orchestration 46 | currentTime) 47 | | extend Duration = PhaseEndTime - PhaseStartTime 48 | | extend Difference = Duration - ExecutionTimeThreshold 49 | | extend PercentDifference = 50 | iff(ExecutionTimeThreshold > 0s, 51 | round(100 * todouble(Difference / 1microsecond) / todouble(ExecutionTimeThreshold / 1microsecond), 2), 52 | double(0)) 53 | // 54 | // analytics 55 | | order by EndTime asc, StartTime asc 56 | | project PlanId = planid, 57 | OrchestrationId = strcat_array(array_slice(split(OrchestrationId, '.'), 1, 300), '.'), 58 | DefinitionName = definitionName, 59 | ScaleUnit, 60 | Region, 61 | PhaseName = Command, 62 | // Actual=Duration, 63 | // Expected=ExecutionTimeThreshold, 64 | PercentDifference, 65 | OwningTeam = split(Command, ".")[0], 66 | Level = case( 67 | PercentDifference > 50, 2, // "Very Suspicious" 68 | PercentDifference > 20, 3, // "Suspicious" 69 | 0) // "Normal" -------------------------------------------------------------------------------- /queries/sql/CpuActivity.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let scaleUnit = "{su}"; 5 | let service = "{service}"; 6 | XEventDataRPCCompleted 7 | | where EventTime between (startTime .. 
endTime) 8 | | where Service == service 9 | | where ScaleUnit == scaleUnit 10 | | where DatabaseName == dbName 11 | | join kind=inner 12 | ( 13 | ActivityLog 14 | | where StartTime <= endTime and TIMESTAMP >= startTime // Activity started before endTime and finished after startTime 15 | | where Service == service 16 | | where ScaleUnit == scaleUnit 17 | | extend Agent = iff(UserAgent contains "mozilla", "Browser", UserAgent) 18 | | summarize by Application, Command, ApplicationHash, CommandHash, Agent 19 | ) on ApplicationHash, CommandHash 20 | | summarize sum(CpuTime) by HostId, VSID, Application, Command, Agent, bin(EventTime, 5m) 21 | | order by sum_CpuTime desc -------------------------------------------------------------------------------- /queries/sql/CpuJob.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let scaleUnit = "{su}"; 5 | let service = "{service}"; 6 | XEventDataRPCCompleted 7 | | where EventTime between (startTime .. endTime ) 8 | | where ScaleUnit == scaleUnit and DatabaseName == dbName 9 | | join kind=inner 10 | ( 11 | JobHistory 12 | | where StartTime <= endTime and PreciseTimeStamp >= startTime 13 | | where Service == service and ScaleUnit == scaleUnit 14 | | summarize by Plugin, HostId=JobSource, UniqueIdentifier=JobId 15 | ) on HostId, UniqueIdentifier 16 | | summarize sum(CpuTime) by Plugin, HostId 17 | | top 20 by sum_CpuTime desc -------------------------------------------------------------------------------- /queries/sql/CpuTop.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | QDS 5 | | where TIMESTAMP between (startTime .. endTime) 6 | | where DatabaseName == dbName 7 | | summarize sum(TotalCpuTime), sum(TotalPhysicalReads), sum(TotalLogicalReads), sum(TotalExecutions), sum(TotalExceptions) by QueryText 8 | | top 10 by sum_TotalCpuTime desc -------------------------------------------------------------------------------- /queries/sql/CpuXEvent.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}"); 3 | let endTime = todatetime("{end}"); 4 | let XEventTypes = datatable (TypeName:string, Type:long) 5 | [ "None", 0, 6 | "Activity", 1 , 7 | "Job", 2, 8 | "Task" , 4, 9 | "JobTask", 6, 10 | "Notification", 8, 11 | "Pipeline", 16, 12 | "PipelineActivity", 17, 13 | "PipelineJob", 18, 14 | "AnonymousActivity", 33, 15 | "PublicActivity", 65, 16 | "Other", 128 ]; 17 | XEventDataRPCCompleted 18 | | where EventTime between (startTime .. endTime) 19 | | where DatabaseName == dbName 20 | | join kind=leftouter (XEventTypes) on Type 21 | | summarize sum(CpuTime) by TypeName, ObjectName 22 | | order by sum_CpuTime desc -------------------------------------------------------------------------------- /queries/sql/GetData.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | DatabasePerformanceStatistics 5 | | where TIMESTAMP between (startTime .. 
endTime) 6 | | where DatabaseName =~ dbName 7 | | take 1 -------------------------------------------------------------------------------- /queries/sql/WhatsSlow.csl: -------------------------------------------------------------------------------- 1 | let dbName = "{db}"; 2 | let startTime = todatetime("{start}") - 5hr; 3 | let endTime = todatetime("{end}") + 5hr; 4 | DatabasePerformanceStatistics 5 | | where TIMESTAMP between (startTime .. endTime) 6 | | where DatabaseName =~ dbName 7 | | summarize avg(AverageCpuPercentage), avg(AverageMemoryUsagePercentage), avg(AverageLogWriteUtilizationPercentage), max(MaximumWorkerPercentage) by bin(PeriodStart, 1m), ServiceObjective -------------------------------------------------------------------------------- /run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The Plan\n", 8 | "Everything we know about the plan.\n", 9 | "\n", 10 | "### Instructions\n", 11 | "1. Run all cells! (click on Menu > Cell > Run All Cells)\n", 12 | "1. View report at the bottom." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "inputHidden": false, 20 | "outputHidden": false, 21 | "tags": [ 22 | "parameters" 23 | ] 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "#planId = \"98db70e2-cee5-4e2d-ae15-dca389fa8f41\"\n", 28 | "planId = \"f38f1a4b-49d7-4f08-a9b9-c81b2c39aff6\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "inputHidden": false, 36 | "outputHidden": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%%capture \n", 41 | "# install packages, setup workspace root\n", 42 | "!pip install --upgrade azure-kusto-notebooks plotly\n", 43 | "import os\n", 44 | "from azure.kusto.notebooks import utils as akn\n", 45 | "import pandas as pd\n", 46 | "pd.options.display.html.table_schema = True\n", 47 | "\n", 48 | "# cwd should be workspace root\n", 49 | "if os.path.basename(os.getcwd()) == 'devops-pipelines':\n", 50 | " os.chdir(os.pardir)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": { 57 | "inputHidden": false, 58 | "outputHidden": false 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# authenticate kusto client\n", 63 | "# you will need to copy the token into a browser window for AAD auth. 
\n", 64 | "client = akn.get_client('https://vso.kusto.windows.net')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "inputHidden": false, 72 | "outputHidden": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "# collect basic plan info\n", 77 | "plan_info = akn.Query(\n", 78 | " client, 'VSO', \n", 79 | " path=os.path.join('devops-pipelines', 'queries', 'run', 'PlanInfo.csl'), \n", 80 | " params={'OrchestrationId': akn.quote(planId)})\n", 81 | "\n", 82 | "# collect full plan history\n", 83 | "what_happened = akn.Query(client, 'VSO',\n", 84 | " path=os.path.join('devops-pipelines', 'queries', 'run', 'WhatHappened.csl'),\n", 85 | " params={'OrchestrationId': akn.quote(planId)})\n", 86 | "\n", 87 | "# fetch data in parallel\n", 88 | "akn.run((plan_info, what_happened))" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "inputHidden": false, 96 | "outputHidden": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# draw basic info\n", 101 | "\n", 102 | "# compute relative time stamps\n", 103 | "history = what_happened.dataframe\n", 104 | "t0 = history['PreciseTimeStamp'].iloc[0]\n", 105 | "history['Time'] = history.apply(lambda row: row['PreciseTimeStamp'] - t0, axis=1)\n", 106 | "history.OrchestrationId = history.apply(lambda row: row.OrchestrationId[37:], axis=1)\n", 107 | "\n", 108 | "# record critical times\n", 109 | "def find_time(message):\n", 110 | " r = history[history.Message.str.startswith(message)]\n", 111 | " if len(r.index) > 0:\n", 112 | " return r['PreciseTimeStamp'].iloc[0]\n", 113 | "\n", 114 | "create_time = find_time('Created plan')\n", 115 | "start_time = find_time('Started plan')\n", 116 | "end_time = find_time('Completed orchestration with result')\n", 117 | "total_duration = end_time - start_time if end_time and start_time else None\n", 118 | "\n", 119 | "import importlib\n", 120 | "importlib.reload(akn)\n", 121 | "# info will only exist if the plan has started at least one job :(\n", 122 | "d = akn.pandas_row_to_dictionary(plan_info.dataframe)\n", 123 | "d['create time'] = create_time\n", 124 | "d['start time'] = start_time\n", 125 | "d['end time'] = end_time\n", 126 | "d['total duration'] = total_duration\n", 127 | "r = akn.Report()\n", 128 | "r.write(akn.to_md_table(d))\n", 129 | "\n", 130 | "from IPython.display import Markdown\n", 131 | "Markdown(r.content)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "inputHidden": false, 139 | "outputHidden": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "# SLA analysis\n", 144 | "su = akn.quote(d.get('ScaleUnit', ''))\n", 145 | "oids = [akn.quote(joid) for joid in d.get('JobOrchestrationIds', [])]\n", 146 | "slas = [akn.Query(client, 'VSO', \n", 147 | " os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl'),\n", 148 | " params=dict(ScaleUnit=su, OrchestrationId=oid)) \n", 149 | " for oid in oids]\n", 150 | "akn.run(slas)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "inputHidden": false, 158 | "outputHidden": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "# draw all slas\n", 163 | "from _plotly_future_ import v4_subplots\n", 164 | "from plotly.subplots import make_subplots\n", 165 | "import plotly.graph_objects as go\n", 166 | "import math\n", 167 | "if not slas:\n", 168 | " print(\"There are no jobs associated with this plan.\")\n", 169 | 
"else:\n", 170 | " number_of_graphs = min(25, len(slas))\n", 171 | " names = [n[37:] for n in d.get('JobOrchestrationIds',[])]\n", 172 | " fig = make_subplots(cols=2, rows=int(math.ceil(number_of_graphs / 2)), \n", 173 | " subplot_titles=names,\n", 174 | " shared_xaxes=True, \n", 175 | " vertical_spacing=0.1)\n", 176 | "\n", 177 | " for i in range(len(slas)):\n", 178 | " df = slas[i].dataframe\n", 179 | " row = int(i / 2) + 1\n", 180 | " col = int(i % 2) + 1\n", 181 | " name = names[i]\n", 182 | " \n", 183 | " df = slas[0].dataframe\n", 184 | " fig.add_trace(go.Bar(x=df.PhaseName, y=df.PercentDifference, name=name), \n", 185 | " row=row, col=col)\n", 186 | " fig.update_xaxes(showgrid=False, tickangle=-60, automargin=True)\n", 187 | " fig.update_xaxes(showgrid=True, zeroline=True, automargin=True)\n", 188 | " fig.update_layout(height=150 * number_of_graphs, \n", 189 | " width=1000, showlegend=False,\n", 190 | " title_text=\"Analysis!\")\n", 191 | "\n", 192 | " fig.show()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "inputHidden": false, 200 | "outputHidden": false 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "# draw full history\n", 205 | "columns_to_ignore = ('source_', 'PreciseTimeStamp')\n", 206 | "columns = ['Time'] + [c for c in history.columns if c not in columns_to_ignore and c != 'Time']\n", 207 | "\n", 208 | "from IPython.display import HTML\n", 209 | "HTML(history[columns].to_html(index=False))" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "inputHidden": false, 217 | "outputHidden": false 218 | }, 219 | "outputs": [], 220 | "source": [] 221 | } 222 | ], 223 | "metadata": { 224 | "kernel_info": { 225 | "name": "python3" 226 | }, 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.7.4" 243 | }, 244 | "nteract": { 245 | "version": "0.15.0" 246 | } 247 | }, 248 | "nbformat": 4, 249 | "nbformat_minor": 2 250 | } 251 | -------------------------------------------------------------------------------- /sla.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SLA Investigation\n", 8 | "1. Run all cells! (click on Menu > Cell > Run All Cells)\n", 9 | "1. View report at the bottom." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "inputHidden": false, 17 | "outputHidden": false, 18 | "tags": [ 19 | "parameters" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "triggerTime = \"2019-10-15T20:21:54.0330000Z\"\n", 25 | "scaleUnit = \"pipelines-ghub-eus2-2\"\n", 26 | "service = \"pipelines\"\n", 27 | "lookback = \"1h\"\n", 28 | "region = \"\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "inputHidden": false, 36 | "outputHidden": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "%%capture \n", 41 | "\n", 42 | "# install packages, setup workspace root\n", 43 | "!pip install --upgrade pip azure-kusto-notebooks\n", 44 | "import os\n", 45 | "import sys\n", 46 | "import datetime\n", 47 | "import pandas as pd\n", 48 | "import numpy as np\n", 49 | "import matplotlib\n", 50 | "import matplotlib.pyplot as plt\n", 51 | "pd.options.display.html.table_schema = True\n", 52 | "import concurrent.futures\n", 53 | "from azure.kusto.notebooks import utils as akn\n", 54 | "\n", 55 | "# cwd should be workspace root\n", 56 | "if os.path.basename(os.getcwd()) == 'devops-pipelines':\n", 57 | " os.chdir(os.pardir)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "inputHidden": false, 65 | "outputHidden": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# authenticate kusto client\n", 70 | "# you will need to copy the token into a browser window for AAD auth. \n", 71 | "client = akn.get_client('https://vso.kusto.windows.net')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "inputHidden": false, 79 | "outputHidden": false 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# find orchestrations that violate SLA\n", 84 | "params = {\n", 85 | " 'TriggerTime': akn.to_kusto_datetime(triggerTime),\n", 86 | " 'Lookback': akn.to_kusto_timespan(lookback),\n", 87 | " 'Service': '\"' + service + '\"', \n", 88 | " 'Region': '\"' + region + '\"',\n", 89 | " 'ScaleUnit': '\"' + scaleUnit + '\"'\n", 90 | "}\n", 91 | "query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLADurationAnalysis.csl')\n", 92 | "violations = akn.execute_file(client, database='VSO', path=query, params=params)\n", 93 | "# violations" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "inputHidden": false, 101 | "outputHidden": false 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# collect problematic orchestration ids\n", 106 | "result = violations.primary_results[0]\n", 107 | "oid_column_index = next((c.ordinal for c in result.columns if c.column_name == 'OrchestrationId'), None)\n", 108 | "su_column_index = next((c.ordinal for c in result.columns if c.column_name == 'ScaleUnit'), None)\n", 109 | "\n", 110 | "# group\n", 111 | "by_su = {}\n", 112 | "for r in result.rows:\n", 113 | " su = r[su_column_index]\n", 114 | " oid = r[oid_column_index]\n", 115 | " l = by_su.get(su, [])\n", 116 | " by_su[su] = l\n", 117 | " l.append(oid)\n", 118 | "\n", 119 | "max_scale_units = []\n", 120 | "max_problems = 0\n", 121 | "for k,v in by_su.items():\n", 122 | " c = len(v)\n", 123 | " if c > max_problems:\n", 124 | " max_problems = c\n", 125 | " max_scale_units = [k]\n", 126 | " elif c == max_problems:\n", 127 | " max_scale_units.append(k)\n", 128 | "max_scale_units.sort()\n", 129 | "\n", 130 | "# for su, oids in by_su.items():\n", 131 | "# 
print(su)\n", 132 | "# for oid in oids:\n", 133 | "# print(' ', oid)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "inputHidden": false, 141 | "outputHidden": false 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# collect visualization data sets\n", 146 | "query = os.path.join('devops-pipelines', 'queries', 'sla', 'SLAVisualization.csl')\n", 147 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 148 | " hfs = [executor.submit(akn.execute_file, client, 'VSO', query, \n", 149 | " {\n", 150 | " 'ScaleUnit': '\"' + r[su_column_index] + '\"', \n", 151 | " 'OrchestrationId': '\"' + r[oid_column_index] + '\"'\n", 152 | " }) for r in result.rows]\n", 153 | " histories = [h.result() for h in concurrent.futures.as_completed(hfs)]\n", 154 | "\n", 155 | "# convert to data frames\n", 156 | "primary_results = [h.primary_results[0] for h in histories]\n", 157 | "dataframes = None\n", 158 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 159 | " dataframe_futures = [executor.submit(akn.to_dataframe, r) for r in primary_results]\n", 160 | " dataframes = [dff.result() for dff in concurrent.futures.as_completed(dataframe_futures)]\n", 161 | "histories = None\n", 162 | "\n", 163 | "# try to filter out false positives: require that the key phases were actually recorded.\n", 164 | "required_phases = ('RunAgentJob.SendJob', 'RunAgentJob.JobCompleted')\n", 165 | "filtered_dataframes = [df for df in dataframes if all(p in df['PhaseName'].values for p in required_phases)]\n", 166 | "number_of_false_positives = len(dataframes) - len(filtered_dataframes)\n", 167 | "dataframes = filtered_dataframes\n", 168 | "plans_out_of_sla = [df['PlanId'].iat[0] for df in dataframes]\n", 169 | "number_of_violations = len(dataframes)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "inputHidden": false, 177 | "outputHidden": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "worst_phaseName = ''\n", 182 | "worst_count = 0\n", 183 | "worst_team = ''\n", 184 | "\n", 185 | "if dataframes:\n", 186 | " # what was the worst phase?\n", 187 | " combined = pd.concat(dataframes, ignore_index=True)\n", 188 | " df = combined.loc[combined['Level'] == 2].groupby(['PhaseName']).size().to_frame('Count').nlargest(1, 'Count')\n", 189 | " if len(df.index) > 0:\n", 190 | " worst_phaseName = df.index[0]\n", 191 | " worst_count = df.iat[0, 0]\n", 192 | " worst_team = worst_phaseName.split('.')[0]\n", 193 | " \n", 194 | " # what was the worst plan?\n", 195 | " violations_df = akn.to_dataframe(violations.primary_results[0])\n", 196 | " df = violations_df.groupby(['PlanId']).size().to_frame('Count').nlargest(1, 'Count')\n", 197 | " plan_with_most_violations = df.index[0]\n", 198 | " plan_with_most_violations_count = df.iat[0, 0]" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "inputHidden": false, 206 | "outputHidden": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "if number_of_false_positives:\n", 211 | " print(number_of_false_positives, 'plans are likely missing kusto data and were ignored.')\n", 212 | "if number_of_violations <= 0:\n", 213 | " print('no problems detected')\n", 214 | "else:\n", 215 | " for su in max_scale_units:\n", 216 | " print(max_problems, 'of the problems were in', su)\n", 217 | " \n", 218 | " print(number_of_violations, \n", 219 | " 'plans' if number_of_violations > 1 else 'plan', \n",
 220 | " 'had no apparent data problems and', \n", 221 | " 'are' if number_of_violations > 1 else 'is', \n", 222 | " 'out of SLA.')\n", 223 | " \n", 224 | " if plan_with_most_violations in plans_out_of_sla:\n", 225 | " print(plan_with_most_violations, 'had the most violations with', plan_with_most_violations_count)\n", 226 | " \n", 227 | " if worst_phaseName:\n", 228 | " print('\"' + worst_phaseName + '\"', 'was the slowest phase in', worst_count, \n", 229 | " 'of the', number_of_violations, 'SLA violations.')\n", 230 | " \n", 231 | " print('\\nConclusion:')\n", 232 | " if number_of_violations > 5: \n", 233 | " print('This is likely a real problem. Open an ICM against scale units:', max_scale_units)\n", 234 | " print('Initially route it to:', worst_team)\n", 235 | " else: \n", 236 | " print('Too much uncertainty -- do not open any ICMs.')\n", 237 | " \n", 238 | " if number_of_false_positives and float(number_of_false_positives) / float(max_problems) > .5:\n", 239 | " for su in max_scale_units:\n", 240 | " print(su, 'might be unhealthy based on the number of plans missing kusto data.')\n", 241 | " " 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "inputHidden": false, 249 | "outputHidden": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "%matplotlib inline\n", 254 | "plt.rcdefaults()\n", 255 | "\n", 256 | "if dataframes:\n", 257 | " number_of_graphs = min(25, len(dataframes))\n", 258 | " fig, axes = plt.subplots(nrows=number_of_graphs, ncols=1, figsize=(8, 6 * number_of_graphs), constrained_layout=True)\n", 259 | " for i in range(number_of_graphs):\n", 260 | " df = dataframes[i]\n", 261 | " ax = axes[i] if number_of_graphs > 1 else axes\n", 262 | " ax.axhline(0, color='k')\n", 263 | "\n", 264 | " x = df['PhaseName']\n", 265 | " xpos = np.arange(len(x))\n", 266 | " y = df['PercentDifference']\n", 267 | " plan_id = df['PlanId'].iloc[0]\n", 268 | " violation_row = violations_df.loc[violations_df['PlanId'] == plan_id]\n", 269 | " title = '\\n'.join([\n", 270 | " 'plan id: ' + plan_id,\n", 271 | " 'scale unit: ' + str(violation_row['ScaleUnit'].iloc[0]),\n", 272 | " 'definition: ' + str(df['DefinitionName'].iloc[0]),\n", 273 | " 'plan duration: ' + str(violation_row['PlanDuration'].iloc[0]),\n", 274 | " 'sla duration: ' + str(violation_row['TotalSLADuration'].iloc[0]),\n", 275 | " ])\n", 276 | " ax.title.set_text(title)\n", 277 | "\n", 278 | " ax.bar(x=xpos, height=y)\n", 279 | " ax.set_xticks(xpos)\n", 280 | " ax.set_xticklabels(x, rotation=45, ha=\"right\")\n", 281 | "\n", 282 | "# output_filename = 'analysis.svg'\n", 283 | "# plt.savefig(output_filename, format='svg')" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernel_info": { 289 | "name": "python3" 290 | }, 291 | "kernelspec": { 292 | "display_name": "Python 3", 293 | "language": "python", 294 | "name": "python3" 295 | }, 296 | "language_info": { 297 | "codemirror_mode": { 298 | "name": "ipython", 299 | "version": 3 300 | }, 301 | "file_extension": ".py", 302 | "mimetype": "text/x-python", 303 | "name": "python", 304 | "nbconvert_exporter": "python", 305 | "pygments_lexer": "ipython3", 306 | "version": "3.7.4" 307 | }, 308 | "nteract": { 309 | "version": "0.15.0" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 2 314 | } 315 | -------------------------------------------------------------------------------- /sql.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SQL Investigation\n", 8 | "1. Run all cells.\n", 9 | "1. View report at the bottom." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "inputHidden": false, 17 | "outputHidden": false, 18 | "tags": [ 19 | "parameters" 20 | ] 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "# These are just defaults will be overwritten if you use nimport pip\n", 25 | "db = \"Tfs_tfsprodcus2_37253a68-972a-4bf4-8c5f-a259ba4d42cd\"\n", 26 | "start = \"2019-07-31T17:30:00.0000000Z\"\n", 27 | "end = \"2019-07-31T18:30:36.0000000Z\"\n", 28 | "url = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK/delays.ipynb\"\n", 29 | "baseUrl = \"https://notebooksv2.azure.com/yaananth/projects/06OasuNRs6rK\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "inputHidden": false, 37 | "outputHidden": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "%%capture\n", 42 | "!pip install --upgrade nimport azure-kusto-notebooks" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "inputHidden": false, 50 | "outputHidden": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# Import the things we use\n", 55 | "\n", 56 | "# Note you can also use kql https://docs.microsoft.com/en-us/azure/data-explorer/kqlmagic\n", 57 | "# %kql is single line magic\n", 58 | "# %%kql is cell magic\n", 59 | "\n", 60 | "# https://nbviewer.jupyter.org/github/ipython/ipython/blob/4.0.x/examples/IPython%20Kernel/Rich%20Output.ipynb#HTML\n", 61 | "# https://ipython.readthedocs.io/en/stable/inte/magics.html\n", 62 | "from IPython.display import display, HTML, Markdown, Javascript, clear_output\n", 63 | "\n", 64 | "# http://pandas-docs.github.io/pandas-docs-travis/user_guide/reshaping.html\n", 65 | "import pandas as pd\n", 66 | "pd.options.display.html.table_schema = True\n", 67 | "from pandas import Series, DataFrame\n", 68 | "from datetime import datetime, timedelta, timezone\n", 69 | "from urllib.parse import urlencode, quote_plus\n", 70 | "from requests.utils import requote_uri\n", 71 | "import time\n", 72 | "import numpy as np\n", 73 | "from matplotlib import pyplot as plt\n", 74 | "from nimport.utils import tokenize, open_nb\n", 75 | "import json\n", 76 | "import os\n", 77 | "import calendar as cal\n", 78 | "import concurrent.futures\n", 79 | "from azure.kusto.notebooks import utils as akn" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "inputHidden": false, 87 | "outputHidden": false 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "params = {\n", 92 | " \"db\": db,\n", 93 | " \"start\": start,\n", 94 | " \"end\": end,\n", 95 | " \"url\": url,\n", 96 | " \"baseUrl\": baseUrl\n", 97 | "}\n", 98 | "root = 'devops-pipelines' if os.path.basename(os.getcwd()) != 'devops-pipelines' else ''\n", 99 | "queryPath = os.path.join(root, 'queries')\n", 100 | " " 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "inputHidden": false, 108 | "outputHidden": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "# authenticate kusto client\n", 113 | "# you will need to copy the token into a browser window for AAD auth. 
\n", 114 | "client = akn.get_client('https://vso.kusto.windows.net')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "inputHidden": false, 122 | "outputHidden": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "sqlPath = os.path.join(queryPath, 'sql')\n", 127 | "q_data = os.path.join(sqlPath, \"GetData.csl\")\n", 128 | "q_whatsSlow = os.path.join(sqlPath, \"WhatsSlow.csl\")\n", 129 | "with concurrent.futures.ThreadPoolExecutor() as executor:\n", 130 | " # materialize so that we have all information we might need\n", 131 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_data, params)\n", 132 | " q_data_df = akn.to_dataframe_from_future(p1)\n", 133 | " params[\"service\"] = q_data_df[\"Service\"][0]\n", 134 | " params[\"su\"] =q_data_df[\"ScaleUnit\"][0]\n", 135 | " \n", 136 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_whatsSlow, params)\n", 137 | "\n", 138 | "q_whatsSlow_df = akn.to_dataframe_from_future(p2) \n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "inputHidden": false, 146 | "outputHidden": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "# Initialize for further analysis later\n", 151 | "q_cpuTop_df = None\n", 152 | "q_cpuXEvent_df = None\n", 153 | "q_cpuJob_df = None\n", 154 | "q_cpuActivity_df = None" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "inputHidden": false, 162 | "outputHidden": false 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "def cpuAnalysis():\n", 167 | " global q_cpuTop_df\n", 168 | " global q_cpuXEvent_df\n", 169 | " q_cpuTop = os.path.join(sqlPath, \"CpuTop.csl\")\n", 170 | " q_cpuXEvent = os.path.join(sqlPath, \"CpuXevent.csl\")\n", 171 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", 172 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_cpuTop, params)\n", 173 | " p2 = executor.submit(akn.execute_file, client, 'VSO', q_cpuXEvent, params)\n", 174 | "\n", 175 | " q_cpuTop_df = akn.to_dataframe_from_future(p1)\n", 176 | " \n", 177 | " q_cpuXEvent_df = akn.to_dataframe_from_future(p2)\n", 178 | " maxTime = q_cpuXEvent_df[\"sum_CpuTime\"].max()\n", 179 | " q_cpuXEvent_df['CpuTimeDiff'] = q_cpuXEvent_df[\"sum_CpuTime\"].map(lambda x: x/maxTime)\n", 180 | "\n", 181 | "def cpuAnalysisJob():\n", 182 | " global q_cpuJob_df\n", 183 | " q_cpuJob = os.path.join(sqlPath, \"CpuJob.csl\")\n", 184 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", 185 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_cpuJob, params)\n", 186 | "\n", 187 | " q_cpuJob_df = akn.to_dataframe_from_future(p1)\n", 188 | "\n", 189 | "def cpuAnalysisActivity():\n", 190 | " global q_cpuActivity_df\n", 191 | " q_cpuActivity = os.path.join(sqlPath, \"CpuActivity.csl\")\n", 192 | " with concurrent.futures.ThreadPoolExecutor() as executor:\n", 193 | " p1 = executor.submit(akn.execute_file, client, 'VSO', q_cpuActivity, params)\n", 194 | "\n", 195 | " q_cpuActivity_df = akn.to_dataframe_from_future(p1)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "inputHidden": false, 203 | "outputHidden": false 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "print('=' * 50)\n", 208 | "print('Report!')\n", 209 | "print('=' * 50, '\\n\\n')\n", 210 | "\n", 211 | "jarvisParams = {'su': params[\"su\"], 'start': akn.get_time(start, -10), 'end': akn.get_time(end, 
10), 'service': params[\"service\"], 'db': db }\n", 212 | "\n", 213 | "jaJarvisLink = \"\"\"https://jarvis-west.dc.ad.msft.net/dashboard/VSO-ServiceInsights/PlatformViews/SQLAzureDatabase\"\"\" \\\n", 214 | " \"\"\"?overrides=[{\"query\":\"//*[id='Service']\",\"key\":\"value\",\"replacement\":\"%(service)s\"},\"\"\" \\\n", 215 | " \"\"\"{\"query\":\"//*[id='ScaleUnit']\",\"key\":\"value\",\"replacement\":\"%(su)s\"},\"\"\" \\\n", 216 | " \"\"\"{\"query\":\"//*[id='__DatabaseName']\",\"key\":\"value\",\"replacement\":\"%(db)s\"}]\"\"\" \\\n", 217 | " \"\"\"&globalStartTime=%(start)s&globalEndTime=%(end)s&pinGlobalTimeRange=true\"\"\" % jarvisParams;\n", 218 | "print('Jarvis dashboard link for sql:\\n', requote_uri(jaJarvisLink), '\\n')\n", 219 | "\n", 220 | "print()\n", 221 | "print(\"Parameters used:\")\n", 222 | "display(params)\n", 223 | "\n", 224 | "print()\n", 225 | "\n", 226 | "## Where is the database at?\n", 227 | "print(\"Database is at: \")\n", 228 | "so = q_whatsSlow_df[\"ServiceObjective\"].unique()\n", 229 | "if so.size > 1:\n", 230 | " print(\"We found different service objectives..looks like db was changed?\")\n", 231 | "print(so) \n", 232 | "\n", 233 | "print()\n", 234 | "\n", 235 | "## What's slow?\n", 236 | "cpu = q_whatsSlow_df[\"avg_AverageCpuPercentage\"]\n", 237 | "memory = q_whatsSlow_df[\"avg_AverageMemoryUsagePercentage\"]\n", 238 | "logWrite= q_whatsSlow_df[\"avg_AverageLogWriteUtilizationPercentage\"]\n", 239 | "worker= q_whatsSlow_df[\"max_MaximumWorkerPercentage\"]\n", 240 | "cpu_coefficientOfVariance = cpu.std()/cpu.mean()\n", 241 | "memory_coefficientOfVariance = memory.std()/memory.mean()\n", 242 | "logWrite_coefficientOfVariance = logWrite.std()/logWrite.mean()\n", 243 | "worker_coefficientOfVariance = worker.std()/worker.mean()\n", 244 | "maxVar = 0.5\n", 245 | "\n", 246 | "reasons = \"Possibly due to: \"\n", 247 | "if cpu_coefficientOfVariance >= maxVar:\n", 248 | " reasons+= \"cpu (max: %s), \" % (cpu.max())\n", 249 | "if memory_coefficientOfVariance >= maxVar:\n", 250 | " reasons+= \"memory (max: %s), \" % (memory.max())\n", 251 | "if logWrite_coefficientOfVariance >= maxVar:\n", 252 | " reasons+= \"logwrite (max: %s), \" % (logWrite.max())\n", 253 | "if worker_coefficientOfVariance >= maxVar:\n", 254 | " reasons+= \"worker (max: %s), \" % (worker.max())\n", 255 | "print(reasons)\n", 256 | "\n", 257 | "if cpu.max() >= 80:\n", 258 | " print(\"We found high CPU, let's start with CPU analysis...\")\n", 259 | " \n", 260 | " cpuAnalysis()\n", 261 | " \n", 262 | " #print()\n", 263 | " #print(\"Top CPU commands:\")\n", 264 | " #display(q_cpuTop_df)\n", 265 | " \n", 266 | " print()\n", 267 | " print(\"Who's causing these commands?:\")\n", 268 | " commandsToConsider = q_cpuXEvent_df[q_cpuXEvent_df[\"CpuTimeDiff\"] >= 0.5]\n", 269 | " jobCommand = commandsToConsider[commandsToConsider[\"TypeName\"].str.contains('Job')]\n", 270 | " if len(jobCommand) >= 1:\n", 271 | " print(\"Possibly due to a job...\")\n", 272 | " display(jobCommand)\n", 273 | " cpuAnalysisJob()\n", 274 | " \n", 275 | " print()\n", 276 | " display(q_cpuJob_df)\n", 277 | " \n", 278 | " activityCommand = commandsToConsider[commandsToConsider[\"TypeName\"].str.contains('Activity')]\n", 279 | " if len(activityCommand) >= 1 and activityCommand[\"ObjectName\"][0]:\n", 280 | " print(\"Possibly due to user activity...\")\n", 281 | " display(activityCommand)\n", 282 | " cpuAnalysisActivity()\n", 283 | " \n", 284 | " print()\n", 285 | " display(q_cpuActivity_df)\n", 286 | " " 287 | ] 288 | } 289 | ], 290 | 
"metadata": { 291 | "kernel_info": { 292 | "name": "python3" 293 | }, 294 | "kernelspec": { 295 | "display_name": "Python 3", 296 | "language": "python", 297 | "name": "python3" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 3 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython3", 309 | "version": "3.7.4" 310 | }, 311 | "nteract": { 312 | "version": "0.15.0" 313 | } 314 | }, 315 | "nbformat": 4, 316 | "nbformat_minor": 0 317 | } 318 | --------------------------------------------------------------------------------