├── .azure-pipelines.yml ├── .gitignore ├── .style.yapf ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── SECURITY.md ├── centos.sh ├── docs ├── configuration.md └── images │ ├── app-id.png │ ├── batchexplorer.png │ ├── inst-key.png │ └── release-links.png ├── go.mod ├── go.sum ├── main.go ├── nodestats.py ├── nvml ├── nvml.go ├── nvml_linux.go └── nvml_windows.go ├── pkg ├── appinsights.go ├── appinsights_test.go ├── batchinsights.go ├── config.go ├── config_test.go ├── cpu │ ├── cpu.go │ ├── cpu_linux.go │ └── cpu_windows.go ├── disk │ ├── disk.go │ ├── disk_linux.go │ └── disk_windows.go ├── gpu_stats_collector.go ├── node_stats.go ├── processes.go ├── utils │ └── io_aggregator.go ├── version.go └── wmi │ └── wmi.go ├── scripts ├── 1.x │ ├── run-linux.sh │ └── run-windows.ps1 ├── README.md ├── dev-windows.ps1 ├── dev.sh ├── gpu-init.sh ├── gpu-linux-test.sh ├── gpu-windows-test.ps1 ├── run-linux.sh └── run-windows.ps1 ├── ubuntu.sh └── windows.ps1 /.azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | name: $(SourceBranch)$(Rev:.r) 2 | jobs: 3 | - job: Linux 4 | pool: 5 | vmImage: ubuntu-16.04 6 | steps: 7 | - task: GoTool@0 8 | displayName: 'Use Go 1.12' 9 | inputs: 10 | version: 1.12 11 | 12 | - script: go build 13 | displayName: Build 14 | 15 | - script: go test ./... 
16 | displayName: Test 17 | 18 | - powershell: | 19 | $branch = $env:BUILD_SOURCEBRANCH 20 | $buildType = "-test" 21 | 22 | If ($branch -like "refs/heads/master") { 23 | $buildType="-master" 24 | } 25 | 26 | $pkgVersion = ./batch-insights --version 27 | $version = "$pkgVersion$buildType.$env:BUILD_NUMBER" 28 | Write-Host "Version is $version" 29 | Write-Host "##vso[build.updatebuildnumber]$version" 30 | displayName: Update build name 31 | 32 | - task: CopyFiles@2 33 | inputs: 34 | contents: batch-insights 35 | targetFolder: $(Build.ArtifactStagingDirectory) 36 | 37 | - task: PublishBuildArtifacts@1 38 | inputs: 39 | artifactName: 'linux' 40 | 41 | 42 | - job: Windows 43 | pool: 44 | vmImage: vs2017-win2016 45 | steps: 46 | - task: GoTool@0 47 | displayName: 'Use Go 1.12' 48 | inputs: 49 | version: 1.12 50 | 51 | - script: | 52 | set CGO_ENABLED=1 53 | set GOOS=windows 54 | set GOARCH=amd64 55 | go env 56 | go build -o ./batch-insights.exe 57 | displayName: Build 64 bit 58 | 59 | - script: go test ./... 
60 | displayName: Test 61 | 62 | - task: CopyFiles@2 63 | inputs: 64 | contents: batch-insights.exe 65 | targetFolder: $(Build.ArtifactStagingDirectory) 66 | 67 | - task: PublishBuildArtifacts@1 68 | inputs: 69 | pathtoPublish: $(Build.ArtifactStagingDirectory) 70 | artifactName: 'windows' 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | 104 | *.exe -------------------------------------------------------------------------------- /.style.yapf: -------------------------------------------------------------------------------- 1 | [style] 2 | based_on_style = pep8 3 | spaces_before_comment = 4 4 | split_before_logical_operator = true 5 | indent_width = 4 6 | column_limit = 120 -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "yapf", 3 | "python.pythonPath": "${workspaceFolder}\\.venv\\Scripts\\python.exe", 4 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Batch Insights 2 | 3 | ## PROJECT STATUS 4 | **This project is no longer actively maintained.** Please see the main [Azure Batch](https://github.com/Azure/Batch) GitHub repository for more information about Azure Batch. 5 | 6 | [![Build Status](https://dev.azure.com/azurebatch/BatchExplorer/_apis/build/status/Batch%20Insights/Batch-Insights-CI?branchName=master)](https://dev.azure.com/azurebatch/BatchExplorer/_build/latest?definitionId=20&branchName=master) 7 | 8 | Azure Batch Insights is a tool used to get system statistics for your Azure Batch account nodes. 9 | 10 | ## Usage (New) 11 | 12 | ### Create Application Insights account 13 | 14 | 1. 
Go to the [Azure portal](https://portal.azure.com) 15 | 2. Search for [Application Insights](https://ms.portal.azure.com/#blade/HubsExtension/Resources/resourceType/microsoft.insights%2Fcomponents) 16 | 3. Create or use an existing one (Application type input doesn't matter) 17 | 18 | ### Configure your Azure Batch pool start task 19 | Set 3 environment variables in your start task. Make sure this is set as a Batch environment variable rather than exporting. Without the Batch environment variable it will not show up in [Batch Explorer](https://azure.github.io/BatchExplorer). Then set the start task user to be `Pool Admin` (`Task admin` might work too) 20 | 21 | * `APP_INSIGHTS_INSTRUMENTATION_KEY`: This is your app insight instrumentation key 22 | 23 | _On the application insight blade in the Azure Portal_ 24 | 25 | ![](docs/images/inst-key.png) 26 | 27 | * `APP_INSIGHTS_APP_ID`: This is your app insight application id 28 | 29 | _On the application insight blade in the Azure Portal_ 30 | 31 | ![](docs/images/app-id.png) 32 | 33 | 34 | * `BATCH_INSIGHTS_DOWNLOAD_URL`: This is the link to the exe to run. 
35 | To find this go to the [releases](https://github.com/Azure/batch-insights/releases) and get the link to the release you need 36 | 37 | For example: 38 | * `Linux`: https://github.com/Azure/batch-insights/releases/download/v1.0.0/batch-insights 39 | * `Windows` : https://github.com/Azure/batch-insights/releases/download/v1.0.0/batch-insights.exe 40 | 41 | ![](docs/images/release-links.png) 42 | 43 | ### Linux 44 | 45 | Add this to your start task 46 | 47 | ```bash 48 | # For version 1.x of batch insights 49 | /bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/1.x/run-linux.sh | bash' 50 | 51 | # For latest version of batch insights 52 | /bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/run-linux.sh | bash' 53 | ``` 54 | 55 | ### Windows 56 | 57 | Add this to your start task 58 | ```powershell 59 | # For version 1.x of batch insights 60 | cmd /c @"%SystemRoot%\System32\WindowsPowerShell\v1.0\powershell.exe" -NoProfile -InputFormat None -ExecutionPolicy Bypass -Command "iex ((New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/1.x/run-windows.ps1'))" 61 | 62 | # For latest version of batch insights 63 | cmd /c @"%SystemRoot%\System32\WindowsPowerShell\v1.0\powershell.exe" -NoProfile -InputFormat None -ExecutionPolicy Bypass -Command "iex ((New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Azure/batch-insights/master/scripts/run-windows.ps1'))" 64 | 65 | ``` 66 | 67 | **Note: The script used above just downloads the executable at the `BATCH_INSIGHTS_DOWNLOAD_URL` URL and run it in the background. 
You can download it some other way and start it separately.** 68 | 69 | ## Python Usage (Old) 70 | 71 | ### Ubuntu 72 | Add this command in your start task `commandLine`: 73 | 74 | ```bash 75 | /bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/master/ubuntu.sh | bash' 76 | ``` 77 | 78 | ### Centos 79 | Add this command in your start task `commandLine`: 80 | ```bash 81 | /bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/master/centos.sh | bash' 82 | ``` 83 | ### Windows 84 | 85 | ```powershell 86 | cmd /c @"%SystemRoot%\System32\WindowsPowerShell\v1.0\powershell.exe" -NoProfile -InputFormat None -ExecutionPolicy Bypass -Command "iex ((New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Azure/batch-insights/master/windows.ps1'))" 87 | ``` 88 | 89 | ### Generic 90 | If you already have a version of python installed you just need to download `nodestats.py` and install dependencies 91 | You can add this to your main script: 92 | 93 | ```sh 94 | pip install psutil python-dateutil applicationinsights==0.11.3 95 | wget --no-cache https://raw.githubusercontent.com/Azure/batch-insights/master/nodestats.py 96 | python --version 97 | python nodestats.py > batch-insights.log 2>&1 & 98 | ``` 99 | 100 | ## Configuration 101 | 102 | [See available configuration options](./docs/configuration.md) 103 | 104 | You can set the `AZ_BATCH_INSIGHTS_ARGS` environment variable to pass parameters to the tool. 105 | e.g. `AZ_BATCH_INSIGHTS_ARGS` > `--disable networkIO --aggregation 5` 106 | 107 | ## View data 108 | 109 | ### Option 1: [Batch Explorer](https://azure.github.io/BatchExplorer) 110 | Batch Explorer is a desktop app used to manage, debug and monitor your Azure Batch accounts. You can download it [here](https://azure.github.io/BatchExplorer) 111 | If you followed the getting started instructions, Batch Explorer should show you the statistics for each of your pools. 
112 | 113 | ![](docs/images/batchexplorer.png) 114 | 115 | ## Option 2: 116 | Use the app insights tools to build your own query on the [Azure Portal](https://ms.portal.azure.com/#blade/HubsExtension/Resources/resourceType/microsoft.insights%2Fcomponents) 117 | 118 | ## Contributing 119 | 120 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 121 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 122 | the rights to use your contribution. For details, visit https://cla.microsoft.com. 123 | 124 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide 125 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions 126 | provided by the bot. You will only need to do this once across all repos using our CLA. 127 | 128 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 129 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 130 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 131 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 
6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /centos.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | yum -y install epel-release 3 | yum -y install gcc python-pip python-devel 4 | 5 | echo "Python version:" 6 | python --version 7 | echo "Pip version:" 8 | pip --version 9 | pip install psutil python-dateutil applicationinsights==0.11.3 10 | 11 | wget --no-cache https://raw.githubusercontent.com/Azure/batch-insights/master/nodestats.py 12 | python nodestats.py > batch-insights.log 2>&1 & -------------------------------------------------------------------------------- /docs/configuration.md: -------------------------------------------------------------------------------- 1 | # BatchInsights Configuration 2 | 3 | Batch Insights provides various configuration option(Version `1.2.0` and above). 4 | 5 | 6 | #### `--poolID ` 7 | Pool ID. Override pool ID provided by the `AZ_BATCH_POOL_ID` environment variable 8 | #### `--nodeID ` 9 | Node ID. Override node ID provided by the `AZ_BATCH_NODE_ID` environment variable 10 | #### `--instKey ` 11 | Instrumentation key. Application Insights instrumentation key to emit the metrics 12 | #### `--disable ` 13 | Comma separated list of metrics to disable. e.g. `--disable networkIO,diskUsage` 14 | 15 | Available metrics names: 16 | - diskIO 17 | - diskUsage 18 | - networkIO 19 | - memory 20 | - CPU 21 | - GPU 22 | 23 | * `--aggregation ` Number in minutes to aggregate the data locally. 
Defaults to 1 minute 24 | 25 | Example: `--aggregation 5` to aggregate for 5 minutes 26 | 27 | #### `--processes ` 28 | Comma separated list of processes to monitor. 29 | 30 | Example: `--processes notepad.exe,explorer.exe` 31 | -------------------------------------------------------------------------------- /docs/images/app-id.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/batch-insights/19a1a832e2eed7e35bc14beef54bd5d6d0795fc7/docs/images/app-id.png -------------------------------------------------------------------------------- /docs/images/batchexplorer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/batch-insights/19a1a832e2eed7e35bc14beef54bd5d6d0795fc7/docs/images/batchexplorer.png -------------------------------------------------------------------------------- /docs/images/inst-key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/batch-insights/19a1a832e2eed7e35bc14beef54bd5d6d0795fc7/docs/images/inst-key.png -------------------------------------------------------------------------------- /docs/images/release-links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Azure/batch-insights/19a1a832e2eed7e35bc14beef54bd5d6d0795fc7/docs/images/release-links.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Azure/batch-insights 2 | 3 | require ( 4 | code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c // indirect 5 | github.com/Microsoft/ApplicationInsights-Go v0.4.2 6 | github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f 7 | github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d 8 | 
github.com/go-ole/go-ole v1.2.1 // indirect 9 | github.com/mattn/go-colorable v0.1.1 10 | github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732 11 | github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d 12 | github.com/pkg/errors v0.8.0 13 | github.com/satori/go.uuid v1.2.0 // indirect 14 | github.com/shirou/gopsutil v2.18.12+incompatible 15 | github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect 16 | github.com/sirupsen/logrus v1.3.0 17 | github.com/stretchr/testify v1.3.0 18 | ) 19 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c h1:5eeuG0BHx1+DHeT3AP+ISKZ2ht1UjGhm581ljqYpVeQ= 2 | code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c/go.mod h1:QD9Lzhd/ux6eNQVUDVRJX/RKTigpewimNYBi7ivZKY8= 3 | github.com/Azure/batch-insights v0.0.0-20180614201012-6d427c8344af h1:egr4mBTro2rKGAXfOAkkW0MRcu+k14V/UiajbY0bqUo= 4 | github.com/Microsoft/ApplicationInsights-Go v0.4.2 h1:HIZoGXMiKNwAtMAgCSSX35j9mP+DjGF9ezfBvxMDLLg= 5 | github.com/Microsoft/ApplicationInsights-Go v0.4.2/go.mod h1:CukZ/G66zxXtI+h/VcVn3eVVDGDHfXM2zVILF7bMmsg= 6 | github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20181114021304-b70474fb8511 h1:A9x/8mtuZ6Sg3QYV5bP2QCHQ53aLVaAd/G8EAZmujtg= 7 | github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20181114021304-b70474fb8511/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA= 8 | github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f h1:5ZfJxyXo8KyX8DgGXC5B7ILL8y51fci/qYz2B4j8iLY= 9 | github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg= 10 | github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= 11 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 12 | github.com/davecgh/go-spew v1.1.1 
h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 13 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 14 | github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d h1:lDrio3iIdNb0Gw9CgH7cQF+iuB5mOOjdJ9ERNJCBgb4= 15 | github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= 16 | github.com/go-ole/go-ole v1.2.1 h1:2lOsA72HgjxAuMlKpFiCbHTvu44PIVkZ5hqm3RSdI/E= 17 | github.com/go-ole/go-ole v1.2.1/go.mod h1:7FAglXiTm7HKlQRDeOQ6ZNUHidzCWXuZWq/1dTyBNF8= 18 | github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk= 19 | github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= 20 | github.com/mattn/go-colorable v0.1.1 h1:G1f5SKeVxmagw/IyvzvtZE4Gybcc4Tr1tf7I8z0XgOg= 21 | github.com/mattn/go-colorable v0.1.1/go.mod h1:FuOcm+DKB9mbwrcAfNl7/TZVBZ6rcnceauSikq3lYCQ= 22 | github.com/mattn/go-isatty v0.0.5 h1:tHXDdz1cpzGaovsTB+TVB8q90WEokoVmfMqoVcrLUgw= 23 | github.com/mattn/go-isatty v0.0.5/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= 24 | github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732 h1:Dl/79RxNt1t6AYIMhKzyofqooXgw6+LZtAN4EIXRLCk= 25 | github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732/go.mod h1:2eu9pRWp8mo84xCg6KswZ+USQHjwgRhNp06sozOdsTY= 26 | github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d h1:lQo1zUtnGr52K2a+Ll3DNDoukmPeuHK11baUNGRDSt0= 27 | github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d/go.mod h1:PS1oTOPfvtFjl9T7nduA/RYrIpqtRh2Nvk++rQCZ2q8= 28 | github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw= 29 | github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 30 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 31 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 32 | 
github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= 33 | github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= 34 | github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAriGFsTZppLXDX93OM= 35 | github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= 36 | github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U= 37 | github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc= 38 | github.com/sirupsen/logrus v1.3.0 h1:hI/7Q+DtNZ2kINb6qt/lS+IyXnHQe9e90POfeewL/ME= 39 | github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= 40 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 41 | github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 42 | github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 43 | github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= 44 | github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= 45 | golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= 46 | golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 47 | golang.org/x/sys v0.0.0-20180907202204-917fdcba135d h1:kWn1hlsqeUrk6JsLJO0ZFyz9bMg8u85voZlIuc68ZU4= 48 | golang.org/x/sys v0.0.0-20180907202204-917fdcba135d/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 49 | golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223 h1:DH4skfRX4EBpamg7iV4ZlCpblAHI6s6TDM39bFZumv8= 50 | golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 51 | 
-------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "github.com/Azure/batch-insights/pkg" 7 | log "github.com/sirupsen/logrus" 8 | "os" 9 | "strings" 10 | ) 11 | 12 | func parseListArgs(value string) []string { 13 | names := strings.Split(value, ",") 14 | for i := range names { 15 | names[i] = strings.TrimSpace(names[i]) 16 | } 17 | return names 18 | } 19 | 20 | func getenv(key string) *string { 21 | value := os.Getenv(key) 22 | if len(value) == 0 { 23 | return nil 24 | } 25 | return &value 26 | } 27 | 28 | func initLogger() { 29 | log.SetFormatter(&log.TextFormatter{ 30 | FullTimestamp: false, 31 | DisableTimestamp: true, 32 | ForceColors: true, 33 | }) 34 | } 35 | 36 | func main() { 37 | initLogger() 38 | disableArg := flag.String("disable", "", "List of metrics to disable") 39 | processArg := flag.String("processes", "", "List of process name to watch") 40 | 41 | envConfig := batchinsights.UserConfig{ 42 | InstrumentationKey: getenv("APP_INSIGHTS_INSTRUMENTATION_KEY"), 43 | PoolID: getenv("AZ_BATCH_POOL_ID"), 44 | NodeID: getenv("AZ_BATCH_NODE_ID"), 45 | } 46 | processEnv := getenv("AZ_BATCH_MONITOR_PROCESSES") 47 | if processEnv != nil { 48 | envConfig.Processes = parseListArgs(*processEnv) 49 | } 50 | argsConfig := batchinsights.UserConfig{ 51 | PoolID: flag.String("poolID", "", "Batch pool ID"), 52 | NodeID: flag.String("nodeID", "", "Batch node ID"), 53 | Aggregation: flag.Int("aggregation", 1, "Aggregation in minutes"), 54 | InstrumentationKey: flag.String("instKey", "", "Application Insights instrumentation KEY"), 55 | } 56 | 57 | version := flag.Bool("version", false, "Print current batch insights version") 58 | 59 | flag.Parse() 60 | 61 | if *version { 62 | fmt.Println(batchinsights.Version) 63 | os.Exit(0) 64 | } 65 | 66 | if processArg != nil { 67 | argsConfig.Processes = 
parseListArgs(*processArg) 68 | } 69 | if disableArg != nil { 70 | argsConfig.Disable = parseListArgs(*disableArg) 71 | } 72 | 73 | config := envConfig.Merge(argsConfig) 74 | 75 | positionalArgs := flag.Args() 76 | if len(positionalArgs) > 0 { 77 | log.Warn("Using postional arguments for Node ID, PoolID, KEY and Process names is deprecated. Use --poolID, --nodeID, --instKey, --process") 78 | log.Warn("It will be removed in 2.0.0") 79 | config.PoolID = &positionalArgs[0] 80 | } 81 | 82 | if len(positionalArgs) > 1 { 83 | config.NodeID = &positionalArgs[1] 84 | } 85 | 86 | if len(positionalArgs) > 2 { 87 | config.InstrumentationKey = &positionalArgs[2] 88 | } 89 | 90 | if len(positionalArgs) > 3 { 91 | config.Processes = parseListArgs(positionalArgs[3]) 92 | } 93 | 94 | config.Print() 95 | 96 | computedConfig, err := batchinsights.ValidateAndBuildConfig(config) 97 | 98 | if err != nil { 99 | log.Error("Invalid config", err) 100 | os.Exit(2) 101 | } 102 | 103 | computedConfig.Print() 104 | batchinsights.PrintSystemInfo() 105 | batchinsights.ListenForStats(computedConfig) 106 | } 107 | -------------------------------------------------------------------------------- /nodestats.py: -------------------------------------------------------------------------------- 1 | """TVM stats""" 2 | 3 | # stdlib imports 4 | import logging 5 | from datetime import datetime 6 | import os 7 | import time 8 | import platform 9 | from collections import namedtuple 10 | import sys 11 | 12 | # non-stdlib imports 13 | import psutil 14 | from applicationinsights import TelemetryClient 15 | 16 | VERSION = "0.0.1.1" 17 | _DEFAULT_STATS_UPDATE_INTERVAL = 5 18 | 19 | 20 | def setup_logger(): 21 | # logger defines 22 | logger = logging.getLogger(__name__) 23 | logger.setLevel(logging.DEBUG) 24 | ch = logging.StreamHandler() 25 | ch.setLevel(logging.DEBUG) 26 | formatter = logging.Formatter('%(asctime)s.%(msecs)03dZ %(levelname)s %(message)s') 27 | ch.setFormatter(formatter) 28 | 
logger.addHandler(ch) 29 | return logger 30 | 31 | 32 | logger = setup_logger() 33 | 34 | # global defines 35 | _IS_PLATFORM_WINDOWS = platform.system() == 'Windows' 36 | 37 | _OS_DISK = None 38 | _USER_DISK = None 39 | 40 | if _IS_PLATFORM_WINDOWS: 41 | _OS_DISK = 'C:/' # This is inverted on Cloud service 42 | _USER_DISK = 'D:/' 43 | else: 44 | _OS_DISK = "/" 45 | _USER_DISK = '/mnt/resources' 46 | if not os.path.exists(_USER_DISK): 47 | _USER_DISK = '/mnt' 48 | 49 | 50 | def python_environment(): # pragma: no cover 51 | """ 52 | Returns the current python environment information 53 | """ 54 | return ' '.join([platform.python_implementation(), platform.python_version()]) 55 | 56 | 57 | def os_environment(): 58 | """ 59 | Get the OS environment 60 | """ 61 | return platform.platform() 62 | 63 | 64 | def is_windows(): 65 | """ 66 | :returns: If running on windows 67 | """ 68 | return _IS_PLATFORM_WINDOWS 69 | 70 | 71 | def avg(list): 72 | """ 73 | Compute the average of a list 74 | """ 75 | return sum(list) / float(len(list)) 76 | 77 | 78 | def pretty_nb(num, suffix=''): 79 | for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']: 80 | if abs(num) < 1000.0: 81 | return "%3.1f%s%s" % (num, unit, suffix) 82 | num /= 1000.0 83 | return "%.1f%s%s" % (num, 'Yi', suffix) 84 | 85 | 86 | NodeIOStats = namedtuple('NodeIOStats', ['read_bps', 'write_bps']) 87 | 88 | 89 | class NodeStats: 90 | """Persistent Task Stats class""" 91 | 92 | def __init__(self, 93 | num_connected_users=0, 94 | num_pids=0, 95 | cpu_count=0, 96 | cpu_percent=None, 97 | mem_total=0, 98 | mem_avail=0, 99 | swap_total=0, 100 | swap_avail=0, 101 | disk_io=None, 102 | disk_usage=None, 103 | net=None): 104 | """ 105 | Map the attributes 106 | """ 107 | self.num_connected_users = num_connected_users 108 | self.num_pids = num_pids 109 | self.cpu_count = cpu_count 110 | self.cpu_percent = cpu_percent 111 | self.mem_total = mem_total 112 | self.mem_avail = mem_avail 113 | self.swap_total = swap_total 114 | 
self.swap_avail = swap_avail 115 | self.disk_io = disk_io or NodeIOStats() 116 | self.disk_usage = disk_usage or dict() 117 | self.net = net or NodeIOStats() 118 | 119 | @property 120 | def mem_used(self): 121 | """ 122 | Return the memory used 123 | """ 124 | return self.mem_total - self.mem_avail 125 | 126 | 127 | class IOThroughputAggregator: 128 | def __init__(self): 129 | self.last_timestamp = None 130 | self.last_read = 0 131 | self.last_write = 0 132 | 133 | def aggregate(self, cur_read, cur_write): 134 | """ 135 | Aggregate with the new values 136 | """ 137 | now = datetime.now() 138 | read_bps = 0 139 | write_bps = 0 140 | if self.last_timestamp: 141 | delta = (now - self.last_timestamp).total_seconds() 142 | read_bps = (cur_read - self.last_read) / delta 143 | write_bps = (cur_write - self.last_write) / delta 144 | 145 | self.last_timestamp = now 146 | self.last_read = cur_read 147 | self.last_write = cur_write 148 | 149 | return NodeIOStats(read_bps, write_bps) 150 | 151 | 152 | class NodeStatsCollector: 153 | """ 154 | Node Stats Manager class 155 | """ 156 | 157 | def __init__(self, pool_id, node_id, refresh_interval=_DEFAULT_STATS_UPDATE_INTERVAL, app_insights_key=None): 158 | self.pool_id = pool_id 159 | self.node_id = node_id 160 | self.telemetry_client = None 161 | self.first_collect = True 162 | self.refresh_interval = refresh_interval 163 | 164 | self.disk = IOThroughputAggregator() 165 | self.network = IOThroughputAggregator() 166 | 167 | if app_insights_key or 'APP_INSIGHTS_INSTRUMENTATION_KEY' in os.environ or 'APP_INSIGHTS_KEY' in os.environ: 168 | key = (app_insights_key or os.environ.get('APP_INSIGHTS_INSTRUMENTATION_KEY') 169 | or os.environ.get('APP_INSIGHTS_KEY')) 170 | 171 | logger.info("Detected instrumentation key. 
Will upload stats to app insights") 172 | self.telemetry_client = TelemetryClient(key) 173 | context = self.telemetry_client.context 174 | context.application.id = 'AzureBatchInsights' 175 | context.application.ver = VERSION 176 | context.device.model = "BatchNode" 177 | context.device.role_name = self.pool_id 178 | context.device.role_instance = self.node_id 179 | else: 180 | logger.info("No instrumentation key detected. Cannot upload to app insights." + 181 | "Make sure you have the APP_INSIGHTS_INSTRUMENTATION_KEY environment variable setup") 182 | 183 | def init(self): 184 | """ 185 | Initialize the monitoring 186 | """ 187 | # start cpu utilization monitoring, first value is ignored 188 | psutil.cpu_percent(interval=None, percpu=True) 189 | 190 | def _get_network_usage(self): 191 | netio = psutil.net_io_counters() 192 | return self.network.aggregate(netio.bytes_recv, netio.bytes_sent) 193 | 194 | def _get_disk_io(self): 195 | diskio = psutil.disk_io_counters() 196 | return self.disk.aggregate(diskio.read_bytes, diskio.write_bytes) 197 | 198 | def _get_disk_usage(self): 199 | disk_usage = dict() 200 | try: 201 | disk_usage[_OS_DISK] = psutil.disk_usage(_OS_DISK) 202 | disk_usage[_USER_DISK] = psutil.disk_usage(_USER_DISK) 203 | except Exception as e: 204 | logger.error('Could not retrieve user disk stats for {0}: {1}'.format(_USER_DISK, e)) 205 | return disk_usage 206 | 207 | def _sample_stats(self): 208 | # get system-wide counters 209 | mem = psutil.virtual_memory() 210 | disk_stats = self._get_disk_io() 211 | disk_usage = self._get_disk_usage() 212 | net_stats = self._get_network_usage() 213 | 214 | swap_total, _, swap_avail, _, _, _ = psutil.swap_memory() 215 | 216 | stats = NodeStats( 217 | cpu_count=psutil.cpu_count(), 218 | cpu_percent=psutil.cpu_percent(interval=None, percpu=True), 219 | num_pids=len(psutil.pids()), 220 | 221 | # Memory 222 | mem_total=mem.total, 223 | mem_avail=mem.available, 224 | swap_total=swap_total, 225 | swap_avail=swap_avail, 
226 | 227 | # Disk IO 228 | disk_io=disk_stats, 229 | 230 | # Disk usage 231 | disk_usage=disk_usage, 232 | 233 | # Net transfer 234 | net=net_stats, 235 | ) 236 | del mem 237 | return stats 238 | 239 | def _collect_stats(self): 240 | """ 241 | Collect the stats and then send to app insights 242 | """ 243 | # collect stats 244 | stats = self._sample_stats() 245 | 246 | if self.first_collect: 247 | self.first_collect = False 248 | return 249 | 250 | if stats is None: 251 | logger.error("Could not sample node stats") 252 | return 253 | 254 | if self.telemetry_client: 255 | self._send_stats(stats) 256 | else: 257 | self._log_stats(stats) 258 | 259 | def _send_stats(self, stats): 260 | """ 261 | Retrieve the current stats and send to app insights 262 | """ 263 | process = psutil.Process(os.getpid()) 264 | 265 | logger.debug("Uploading stats. Mem of this script: %d vs total: %d", process.memory_info().rss, stats.mem_avail) 266 | client = self.telemetry_client 267 | 268 | for cpu_n in range(0, stats.cpu_count): 269 | client.track_metric("Cpu usage", stats.cpu_percent[cpu_n], properties={"Cpu #": cpu_n}) 270 | 271 | for name, disk_usage in stats.disk_usage.items(): 272 | client.track_metric("Disk usage", disk_usage.used, properties={"Disk": name}) 273 | client.track_metric("Disk free", disk_usage.free, properties={"Disk": name}) 274 | 275 | client.track_metric("Memory used", stats.mem_used) 276 | client.track_metric("Memory available", stats.mem_avail) 277 | client.track_metric("Disk read", stats.disk_io.read_bps) 278 | client.track_metric("Disk write", stats.disk_io.write_bps) 279 | client.track_metric("Network read", stats.net.read_bps) 280 | client.track_metric("Network write", stats.net.write_bps) 281 | self.telemetry_client.flush() 282 | 283 | def _log_stats(self, stats): 284 | logger.info("========================= Stats =========================") 285 | logger.info("Cpu percent: %d%% %s", avg(stats.cpu_percent), stats.cpu_percent) 286 | logger.info("Memory used: 
%sB / %sB", pretty_nb(stats.mem_used), pretty_nb(stats.mem_total)) 287 | logger.info("Swap used: %sB / %sB", pretty_nb(stats.swap_avail), pretty_nb(stats.swap_total)) 288 | logger.info("Net read: %sBs", pretty_nb(stats.net.read_bps)) 289 | logger.info("Net write: %sBs", pretty_nb(stats.net.write_bps)) 290 | logger.info("Disk read: %sBs", pretty_nb(stats.disk_io.read_bps)) 291 | logger.info("Disk write: %sBs", pretty_nb(stats.disk_io.write_bps)) 292 | logger.info("Disk usage:") 293 | for name, disk_usage in stats.disk_usage.items(): 294 | logger.info(" - %s: %i/%i (%i%%)", name, disk_usage.used, disk_usage.total, disk_usage.percent) 295 | 296 | logger.info("-------------------------------------") 297 | logger.info("") 298 | 299 | def run(self): 300 | """ 301 | Start collecting information of the system. 302 | """ 303 | logger.debug("Start collecting stats for pool=%s node=%s", self.pool_id, self.node_id) 304 | while True: 305 | self._collect_stats() 306 | time.sleep(self.refresh_interval) 307 | 308 | 309 | def main(): 310 | """ 311 | Main entry point for prism 312 | """ 313 | # log basic info 314 | logger.info("Python args: %s", sys.argv) 315 | logger.info("Python interpreter: %s", python_environment()) 316 | logger.info("Operating system: %s", os_environment()) 317 | logger.info("Cpu count: %s", psutil.cpu_count()) 318 | 319 | pool_id = os.environ.get('AZ_BATCH_POOL_ID', '_test-pool-1') 320 | node_id = os.environ.get('AZ_BATCH_NODE_ID', '_test-node-1') 321 | 322 | # get and set event loop mode 323 | logger.info('enabling event loop debug mode') 324 | 325 | app_insights_key = None 326 | if len(sys.argv) > 2: 327 | pool_id = sys.argv[1] 328 | node_id = sys.argv[2] 329 | if len(sys.argv) > 3: 330 | app_insights_key = sys.argv[3] 331 | 332 | # create node stats manager 333 | collector = NodeStatsCollector(pool_id, node_id, app_insights_key=app_insights_key) 334 | collector.init() 335 | collector.run() 336 | 337 | 338 | if __name__ == '__main__': 339 | main() 340 | 
-------------------------------------------------------------------------------- /nvml/nvml.go: -------------------------------------------------------------------------------- 1 | package nvml 2 | 3 | type NvmlClient interface { 4 | Init() error 5 | Shutdown() error 6 | GetDeviceCount() (uint, error) 7 | 8 | DeviceGetHandleByIndex(index uint) (Device, error) 9 | DeviceGetMemoryInfo(device Device) (Memory, error) 10 | DeviceGetUtilizationRates(device Device) (GPUUtilization, error) 11 | } 12 | 13 | type GPUUtilization struct { 14 | GPU uint 15 | Memory uint 16 | } 17 | 18 | type Memory struct { 19 | Total uint64 // Total installed FB memory (in bytes). 20 | Free uint64 // Unallocated FB memory (in bytes). 21 | Used uint64 // Allocated FB memory (in bytes). 22 | } 23 | 24 | type Device interface { 25 | } 26 | -------------------------------------------------------------------------------- /nvml/nvml_linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package nvml 4 | 5 | import ( 6 | nvml_linux "github.com/mindprince/gonvml" 7 | ) 8 | 9 | type LinuxDevice = nvml_linux.Device 10 | 11 | type LinuxNvmlClient struct { 12 | } 13 | 14 | func New() (*LinuxNvmlClient, error) { 15 | client := LinuxNvmlClient{} 16 | 17 | return &client, nil 18 | } 19 | 20 | func (client *LinuxNvmlClient) Init() error { 21 | return nvml_linux.Initialize() 22 | } 23 | 24 | func (client *LinuxNvmlClient) Shutdown() error { 25 | return nvml_linux.Shutdown() 26 | } 27 | 28 | func (client *LinuxNvmlClient) GetDeviceCount() (uint, error) { 29 | value, err := nvml_linux.DeviceCount() 30 | if err != nil { 31 | return 0, err 32 | } 33 | 34 | return uint(value), nil 35 | } 36 | 37 | func (client *LinuxNvmlClient) DeviceGetUtilizationRates(device Device) (GPUUtilization, error) { 38 | linuxDevice := device.(LinuxDevice) 39 | gpu, memory, err := linuxDevice.UtilizationRates() 40 | if err != nil { 41 | return GPUUtilization{GPU: 0, Memory: 
0}, err 42 | } 43 | 44 | use := GPUUtilization{ 45 | GPU: gpu, 46 | Memory: memory, 47 | } 48 | return use, nil 49 | } 50 | 51 | func (client *LinuxNvmlClient) DeviceGetMemoryInfo(device Device) (Memory, error) { 52 | linuxDevice := device.(LinuxDevice) 53 | total, used, err := linuxDevice.MemoryInfo() 54 | if err != nil { 55 | return Memory{Used: used, Total: total}, err 56 | } 57 | return Memory{Used: used, Total: total, Free: total - used}, nil // MemoryInfo yields only total and used, so Free is derived (assumes used <= total) 58 | } 59 | 60 | func (client *LinuxNvmlClient) DeviceGetHandleByIndex(index uint) (Device, error) { 61 | device, err := nvml_linux.DeviceHandleByIndex(uint(index)) 62 | if err != nil { 63 | return Device(device), err 64 | } 65 | return Device(device), nil 66 | } 67 | -------------------------------------------------------------------------------- /nvml/nvml_windows.go: -------------------------------------------------------------------------------- 1 | // +build windows 2 | 3 | package nvml 4 | 5 | import ( 6 | nvml_win "github.com/mxpv/nvml-go" 7 | ) 8 | 9 | type WinDevice struct { 10 | handle nvml_win.Device 11 | } 12 | 13 | func New() (*WinNvmlClient, error) { 14 | api, err := nvml_win.New("") 15 | 16 | if err != nil { 17 | return nil, err 18 | } 19 | 20 | client := WinNvmlClient{ 21 | api: api, 22 | } 23 | 24 | return &client, nil 25 | } 26 | 27 | type WinNvmlClient struct { 28 | api *nvml_win.API 29 | } 30 | 31 | func (client *WinNvmlClient) Init() error { 32 | return client.api.Init() 33 | } 34 | 35 | func (client *WinNvmlClient) Shutdown() error { 36 | return client.api.Shutdown() 37 | } 38 | 39 | func (client *WinNvmlClient) GetDeviceCount() (uint, error) { 40 | value, err := client.api.DeviceGetCount() 41 | if err != nil { 42 | return 0, err 43 | } 44 | 45 | return uint(value), nil 46 | } 47 | 48 | func (client *WinNvmlClient) DeviceGetUtilizationRates(device Device) (GPUUtilization, error) { 49 | winDevice := device.(WinDevice) 50 | value, err := client.api.DeviceGetUtilizationRates(winDevice.handle) 51 | if err != nil { 52 |
return GPUUtilization{GPU: 0, Memory: 0}, err 53 | } 54 | 55 | use := GPUUtilization{ 56 | GPU: uint(value.GPU), 57 | Memory: uint(value.Memory), 58 | } 59 | return use, nil 60 | } 61 | 62 | func (client *WinNvmlClient) DeviceGetMemoryInfo(device Device) (Memory, error) { 63 | winDevice := device.(WinDevice) 64 | use, err := client.api.DeviceGetMemoryInfo(winDevice.handle) 65 | if err != nil { 66 | return Memory(use), err 67 | } 68 | return Memory(use), nil 69 | } 70 | 71 | func (client *WinNvmlClient) DeviceGetHandleByIndex(index uint) (Device, error) { 72 | handle, err := client.api.DeviceGetHandleByIndex(uint32(index)) 73 | if err != nil { 74 | return Device(WinDevice{handle: handle}), err 75 | } 76 | return Device(WinDevice{handle: handle}), nil 77 | } 78 | -------------------------------------------------------------------------------- /pkg/appinsights.go: -------------------------------------------------------------------------------- 1 | package batchinsights 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "sort" 7 | "strconv" 8 | "time" 9 | 10 | "github.com/Microsoft/ApplicationInsights-Go/appinsights" 11 | ) 12 | 13 | // AppInsightsService service handling the aggregation and upload of metrics 14 | type AppInsightsService struct { 15 | client appinsights.TelemetryClient 16 | aggregation time.Duration 17 | aggregateCollectionStart *time.Time 18 | aggregates map[string]*appinsights.AggregateMetricTelemetry 19 | } 20 | 21 | // NewAppInsightsService create a new instance of the AppInsightsService 22 | func NewAppInsightsService(instrumentationKey string, poolID string, nodeID string, aggregation time.Duration) AppInsightsService { 23 | client := appinsights.NewTelemetryClient(instrumentationKey) 24 | client.Context().Tags.Cloud().SetRole(poolID) 25 | client.Context().Tags.Cloud().SetRoleInstance(nodeID) 26 | 27 | return AppInsightsService{ 28 | client: client, 29 | aggregation: aggregation, 30 | aggregates: make(map[string]*appinsights.AggregateMetricTelemetry), 31 | } 32 | }
32 | 33 | func (service *AppInsightsService) track(metric *appinsights.MetricTelemetry) { 34 | t := time.Now() 35 | 36 | if service.aggregateCollectionStart != nil { 37 | elapsed := t.Sub(*service.aggregateCollectionStart) 38 | 39 | if elapsed > service.aggregation { 40 | for _, aggregate := range service.aggregates { 41 | service.client.Track(aggregate) 42 | } 43 | service.aggregates = make(map[string]*appinsights.AggregateMetricTelemetry) 44 | service.aggregateCollectionStart = &t 45 | } 46 | } else { 47 | service.aggregateCollectionStart = &t 48 | } 49 | 50 | id := GetMetricID(metric) 51 | 52 | aggregate, ok := service.aggregates[id] 53 | if !ok { 54 | aggregate = appinsights.NewAggregateMetricTelemetry(metric.Name) 55 | aggregate.Properties = metric.Properties 56 | service.aggregates[id] = aggregate 57 | } 58 | aggregate.AddData([]float64{metric.Value}) 59 | } 60 | 61 | // UploadStats will register the given stats for upload. They will be first aggregated during the given aggregation interval 62 | func (service *AppInsightsService) UploadStats(stats NodeStats) { 63 | client := service.client 64 | 65 | for cpuN, percent := range stats.CPUPercents { 66 | metric := appinsights.NewMetricTelemetry("Cpu usage", percent) 67 | metric.Properties["CPU #"] = strconv.Itoa(cpuN) 68 | metric.Properties["Core count"] = strconv.Itoa(len(stats.CPUPercents)) 69 | service.track(metric) 70 | } 71 | 72 | for _, usage := range stats.DiskUsage { 73 | usedMetric := appinsights.NewMetricTelemetry("Disk usage", float64(usage.Used)) 74 | usedMetric.Properties["Disk"] = usage.Path 75 | service.track(usedMetric) 76 | freeMetric := appinsights.NewMetricTelemetry("Disk free", float64(usage.Free)) 77 | freeMetric.Properties["Disk"] = usage.Path 78 | service.track(freeMetric) 79 | } 80 | 81 | if stats.Memory != nil { 82 | service.track(appinsights.NewMetricTelemetry("Memory used", float64(stats.Memory.Used))) 83 | service.track(appinsights.NewMetricTelemetry("Memory available", 
float64(stats.Memory.Total-stats.Memory.Used))) 84 | } 85 | if stats.DiskIO != nil { 86 | service.track(appinsights.NewMetricTelemetry("Disk read", float64(stats.DiskIO.ReadBps))) 87 | service.track(appinsights.NewMetricTelemetry("Disk write", float64(stats.DiskIO.WriteBps))) 88 | } 89 | 90 | if stats.NetIO != nil { 91 | service.track(appinsights.NewMetricTelemetry("Network read", float64(stats.NetIO.ReadBps))) 92 | service.track(appinsights.NewMetricTelemetry("Network write", float64(stats.NetIO.WriteBps))) 93 | } 94 | 95 | if len(stats.Gpus) > 0 { 96 | for cpuN, usage := range stats.Gpus { 97 | gpuMetric := appinsights.NewMetricTelemetry("Gpu usage", usage.GPU) 98 | gpuMetric.Properties["GPU #"] = strconv.Itoa(cpuN) 99 | service.track(gpuMetric) 100 | 101 | gpuMemoryMetric := appinsights.NewMetricTelemetry("Gpu memory usage", usage.Memory) 102 | gpuMemoryMetric.Properties["GPU #"] = strconv.Itoa(cpuN) 103 | service.track(gpuMemoryMetric) 104 | } 105 | } 106 | 107 | if len(stats.Processes) > 0 { 108 | for _, processStats := range stats.Processes { 109 | 110 | pidStr := strconv.FormatInt(int64(processStats.pid), 10) 111 | 112 | { 113 | cpuMetric := appinsights.NewMetricTelemetry("Process CPU", processStats.cpu) 114 | cpuMetric.Properties["Process Name"] = processStats.name 115 | cpuMetric.Properties["PID"] = pidStr 116 | service.track(cpuMetric) 117 | } 118 | 119 | { 120 | memMetric := appinsights.NewMetricTelemetry("Process Memory", float64(processStats.memory)) 121 | memMetric.Properties["Process Name"] = processStats.name 122 | memMetric.Properties["PID"] = pidStr 123 | service.track(memMetric) 124 | } 125 | 126 | } 127 | } 128 | 129 | client.Channel().Flush() 130 | } 131 | 132 | // GetMetricID compute an group id for this metric so it can be aggregated 133 | func GetMetricID(metric *appinsights.MetricTelemetry) string { 134 | groupBy := createKeyValuePairs(metric.Properties) 135 | return fmt.Sprintf("%s/%s", metric.Name, groupBy) 136 | } 137 | 138 | func 
createKeyValuePairs(m map[string]string) string { 139 | // Go randomizes map iteration order; sort the keys so identical properties always yield the same metric ID 140 | keys := make([]string, 0, len(m)) 141 | for key := range m { 142 | keys = append(keys, key) 143 | } 144 | sort.Strings(keys) 145 | 146 | b := new(bytes.Buffer) 147 | for i, key := range keys { 148 | if i > 0 { 149 | b.WriteString(",") 150 | } 151 | fmt.Fprintf(b, "%s=%s", key, m[key]) 152 | } 153 | return b.String() 154 | } 155 | -------------------------------------------------------------------------------- /pkg/appinsights_test.go: -------------------------------------------------------------------------------- 1 | package batchinsights_test 2 | 3 | import ( 4 | "github.com/Azure/batch-insights/pkg" 5 | "github.com/Microsoft/ApplicationInsights-Go/appinsights" 6 | "github.com/stretchr/testify/assert" 7 | "testing" 8 | ) 9 | 10 | func TestGetMetricID(t *testing.T) { 11 | metric := appinsights.NewMetricTelemetry("Disk usage", 134) 12 | metric.Properties["Some #"] = "4" 13 | metric.Properties["Other #"] = "5" 14 | 15 | metricID := batchinsights.GetMetricID(metric) 16 | assert.True(t, metricID == "Disk usage/Other #=5,Some #=4" || metricID == "Disk usage/Some #=4,Other #=5") 17 | 18 | metric = appinsights.NewMetricTelemetry("Disk IO", 543) 19 | assert.Equal(t, "Disk IO/", batchinsights.GetMetricID(metric)) 20 | } 21 | -------------------------------------------------------------------------------- /pkg/batchinsights.go: -------------------------------------------------------------------------------- 1 | package batchinsights 2 | 3 | import ( 4 | "fmt" 5 | "runtime" 6 | "time" 7 | 8 | "github.com/Azure/batch-insights/pkg/cpu" 9 | "github.com/Azure/batch-insights/pkg/disk" 10 | "github.com/Azure/batch-insights/pkg/utils" 11 | "github.com/dustin/go-humanize" 12 | "github.com/shirou/gopsutil/mem" 13 | "github.com/shirou/gopsutil/net" 14 | ) 15 | 16 | func getSamplingRate(rate time.Duration) time.Duration { 17 | if rate <= time.Duration(0) { 18 | return DefaultSamplingRate 19 | } 20 | return rate 21 | } 22 | 23 | // ListenForStats Start the sampling of node metrics 24 | func
ListenForStats(config Config) { 25 | var netIO = utils.IOAggregator{} 26 | 27 | var gpuStatsCollector = NewGPUStatsCollector() 28 | defer gpuStatsCollector.Shutdown() 29 | 30 | var appInsightsService = createAppInsightsService(config) 31 | 32 | for range time.Tick(getSamplingRate(config.SamplingRate)) { 33 | gpuStatsCollector.GetStats() 34 | 35 | var stats = NodeStats{} 36 | 37 | if !config.Disable.Memory { 38 | v, err := mem.VirtualMemory() 39 | if err == nil { 40 | stats.Memory = v 41 | } else { 42 | fmt.Println(err) 43 | } 44 | } 45 | if !config.Disable.CPU { 46 | cpus, err := cpu.PerCpuPercent() 47 | if err == nil { 48 | stats.CPUPercents = cpus 49 | } else { 50 | fmt.Println(err) 51 | } 52 | } 53 | if !config.Disable.DiskUsage { 54 | stats.DiskUsage = disk.GetDiskUsage() 55 | } 56 | if !config.Disable.DiskIO { 57 | stats.DiskIO = disk.DiskIO() 58 | } 59 | if !config.Disable.NetworkIO { 60 | stats.NetIO = getNetIO(&netIO) 61 | } 62 | if !config.Disable.GPU { 63 | stats.Gpus = gpuStatsCollector.GetStats() 64 | } 65 | 66 | processes, err := ListProcesses(config.Processes) 67 | if err == nil { 68 | stats.Processes = processes 69 | } else { 70 | fmt.Println(err) 71 | } 72 | 73 | if appInsightsService != nil { 74 | appInsightsService.UploadStats(stats) 75 | } else { 76 | printStats(stats) 77 | } 78 | } 79 | } 80 | 81 | func getNetIO(diskIO *utils.IOAggregator) *utils.IOStats { 82 | var counters, err = net.IOCounters(false) 83 | 84 | if err != nil { 85 | fmt.Println(err) 86 | } else if len(counters) >= 1 { 87 | var stats = diskIO.UpdateAggregates(counters[0].BytesRecv, counters[0].BytesSent) 88 | return &stats 89 | } 90 | return nil 91 | } 92 | 93 | // PrintSystemInfo print system info needed 94 | func PrintSystemInfo() { 95 | fmt.Printf("System information:\n") 96 | fmt.Printf(" OS: %s\n", runtime.GOOS) 97 | } 98 | 99 | func getConfiguration() { 100 | 101 | } 102 | 103 | func printStats(stats NodeStats) { 104 | fmt.Printf("========================= Stats 
=========================\n") 105 | fmt.Printf("Cpu percent: %f%%, %v cpu(s)\n", avg(stats.CPUPercents), len(stats.CPUPercents)) 106 | if stats.Memory != nil { // nil when memory sampling is disabled or the read failed 107 | fmt.Printf("Memory used: %s/%s\n", humanize.Bytes(stats.Memory.Used), humanize.Bytes(stats.Memory.Total)) 108 | } 109 | 110 | if len(stats.DiskUsage) > 0 { 111 | fmt.Printf("Disk usage:\n") 112 | for _, usage := range stats.DiskUsage { 113 | fmt.Printf(" - %s: %s/%s (%v%%)\n", usage.Path, humanize.Bytes(usage.Used), humanize.Bytes(usage.Total), usage.UsedPercent) 114 | } 115 | } 116 | 117 | if stats.DiskIO != nil { 118 | fmt.Printf("Disk IO: R:%sps, W:%sps\n", humanize.Bytes(stats.DiskIO.ReadBps), humanize.Bytes(stats.DiskIO.WriteBps)) 119 | } 120 | 121 | if stats.NetIO != nil { 122 | fmt.Printf("NET IO: R:%sps, S:%sps\n", humanize.Bytes(stats.NetIO.ReadBps), humanize.Bytes(stats.NetIO.WriteBps)) 123 | } 124 | 125 | if len(stats.Gpus) > 0 { 126 | fmt.Printf("GPU(s) usage:\n") 127 | for _, usage := range stats.Gpus { 128 | fmt.Printf(" - GPU: %f%%, Memory: %f%%\n", usage.GPU, usage.Memory) 129 | } 130 | } 131 | 132 | if len(stats.Processes) > 0 { 133 | fmt.Printf("Tracked processes:\n") 134 | for _, process := range stats.Processes { 135 | fmt.Printf(" - %s (%d), CPU: %f%%, Memory: %s\n", process.name, process.pid, process.cpu, humanize.Bytes(process.memory)) 136 | } 137 | } 138 | 139 | fmt.Println() 140 | fmt.Println() 141 | } 142 | 143 | // avg returns the arithmetic mean of array, or 0 when it is empty 144 | func avg(array []float64) float64 { 145 | if len(array) == 0 { 146 | return 0 147 | } 148 | var total float64 149 | for _, value := range array { 150 | total += value 151 | } 152 | return total / float64(len(array)) 153 | } 154 | 155 | func createAppInsightsService(config Config) *AppInsightsService { 156 | if config.InstrumentationKey != "" { 157 | service := NewAppInsightsService(config.InstrumentationKey, config.PoolID, config.NodeID, config.Aggregation) 158 | return &service 159 | } 160 | fmt.Println("APP_INSIGHTS_INSTRUMENTATION_KEY is not set; will not upload to Application Insights") 161 | return nil 162 | } 163 |
-------------------------------------------------------------------------------- /pkg/config.go: -------------------------------------------------------------------------------- 1 | package batchinsights 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | // DefaultAggregationTime default time range where metrics are preaggregated locally 12 | const DefaultAggregationTime = time.Duration(1) * time.Minute 13 | 14 | // DefaultSamplingRate default time between metrics sampling 15 | const DefaultSamplingRate = time.Duration(5) * time.Second 16 | 17 | // UserConfig config provided by the user either via command line, file or environment variable. 18 | type UserConfig struct { 19 | PoolID *string 20 | NodeID *string 21 | InstrumentationKey *string // Application insights instrumentation key 22 | Processes []string // List of process names to watch 23 | Aggregation *int // Local aggregation of data in minutes (default: 1) 24 | Disable []string // List of metrics to disable 25 | } 26 | 27 | // Print print the config to console; unset (nil) fields are skipped 28 | func (config UserConfig) Print() { 29 | fmt.Printf("User configuration:\n") 30 | if config.PoolID != nil { 31 | fmt.Printf(" Pool ID: %s\n", *config.PoolID) 32 | } 33 | if config.NodeID != nil { 34 | fmt.Printf(" Node ID: %s\n", *config.NodeID) 35 | } 36 | if config.InstrumentationKey != nil { 37 | fmt.Printf(" Instrumentation Key: %s\n", hideSecret(*config.InstrumentationKey)) 38 | } 39 | if config.Aggregation != nil { 40 | fmt.Printf(" Aggregation: %d\n", *config.Aggregation) 41 | } 42 | fmt.Printf(" Disable: %v\n", config.Disable) 43 | fmt.Printf(" Monitoring processes: %v\n", config.Processes) 44 | } 45 | 46 | // Merge with another config 47 | func (config UserConfig) Merge(other UserConfig) UserConfig { 48 | if other.PoolID != nil && *other.PoolID != "" { 49 | config.PoolID = other.PoolID 50 | } 51 | if other.NodeID != nil && *other.NodeID != "" { 52 | config.NodeID = other.NodeID 53 | } 54 | if other.InstrumentationKey != nil && *other.InstrumentationKey != "" { 55 | config.InstrumentationKey =
other.InstrumentationKey 50 | } 51 | if other.Aggregation != nil { 52 | config.Aggregation = other.Aggregation 53 | } 54 | if len(other.Processes) > 0 { 55 | config.Processes = other.Processes 56 | } 57 | if len(other.Disable) > 0 { 58 | config.Disable = other.Disable 59 | } 60 | return config 61 | } 62 | 63 | // DisableConfig config showing which feature are disabled 64 | type DisableConfig struct { 65 | DiskIO bool `json:"diskIO"` 66 | DiskUsage bool `json:"diskUsage"` 67 | NetworkIO bool `json:"networkIO"` 68 | GPU bool `json:"gpu"` 69 | CPU bool `json:"cpu"` 70 | Memory bool `json:"memory"` 71 | } 72 | 73 | func (d DisableConfig) String() string { 74 | s, _ := json.Marshal(d) 75 | return string(s) 76 | } 77 | 78 | // Config General config batch insights takes as input 79 | type Config struct { 80 | PoolID string 81 | NodeID string 82 | InstrumentationKey string 83 | Processes []string 84 | Aggregation time.Duration 85 | SamplingRate time.Duration 86 | Disable DisableConfig 87 | } 88 | 89 | // Print print the config to console 90 | func (config Config) Print() { 91 | fmt.Printf("BatchInsights configuration:\n") 92 | fmt.Printf(" Pool ID: %s\n", config.PoolID) 93 | fmt.Printf(" Node ID: %s\n", config.NodeID) 94 | fmt.Printf(" Instrumentation Key: %s\n", hideSecret(config.InstrumentationKey)) 95 | fmt.Printf(" Aggregation: %v\n", config.Aggregation) 96 | fmt.Printf(" Sampling rate: %v\n", config.SamplingRate) 97 | fmt.Printf(" Disable: %+v\n", config.Disable) 98 | fmt.Printf(" Monitoring processes: %v\n", config.Processes) 99 | } 100 | 101 | // ValidateAndBuildConfig Convert Batch insights user config into config taken by the library 102 | func ValidateAndBuildConfig(userConfig UserConfig) (Config, error) { 103 | aggregation := parseAggregation(userConfig.Aggregation) 104 | 105 | if userConfig.PoolID == nil { 106 | return Config{}, errors.New("Pool ID must be specified") 107 | } 108 | if userConfig.NodeID == nil { // NodeID is dereferenced below, so it must be checked here 109 | return Config{}, errors.New("Node ID must be
specified") 110 | } 111 | key := "" 112 | if userConfig.InstrumentationKey != nil { 113 | key = *userConfig.InstrumentationKey 114 | } 115 | return Config{ 116 | PoolID: *userConfig.PoolID, 117 | NodeID: *userConfig.NodeID, 118 | InstrumentationKey: key, 119 | Processes: userConfig.Processes, 120 | Aggregation: aggregation, 121 | Disable: parseDisableConfig(userConfig.Disable), 122 | SamplingRate: DefaultSamplingRate, 123 | }, nil 124 | } 125 | 126 | func parseAggregation(value *int) time.Duration { 127 | if value == nil { 128 | return DefaultAggregationTime 129 | } 130 | return time.Duration(*value) * time.Minute 131 | } 132 | 133 | func parseDisableConfig(values []string) DisableConfig { 134 | disableMap := make(map[string]bool) 135 | for _, key := range values { 136 | disableMap[strings.ToLower(key)] = true 137 | } 138 | return DisableConfig{ 139 | DiskIO: disableMap["diskio"], 140 | DiskUsage: disableMap["diskusage"], 141 | NetworkIO: disableMap["networkio"], 142 | GPU: disableMap["gpu"], 143 | CPU: disableMap["cpu"], 144 | Memory: disableMap["memory"], 145 | } 146 | } 147 | 148 | // Hide a secret 149 | func hideSecret(secret string) string { 150 | if secret == "" { 151 | return "-" 152 | } 153 | return "xxxxx" 154 | } 155 | -------------------------------------------------------------------------------- /pkg/config_test.go: -------------------------------------------------------------------------------- 1 | package batchinsights_test 2 | 3 | import ( 4 | "github.com/Azure/batch-insights/pkg" 5 | "github.com/stretchr/testify/assert" 6 | "testing" 7 | ) 8 | 9 | func TestBuildConfig(t *testing.T) { 10 | pool1 := "pool-1" 11 | node1 := "node-1" 12 | 13 | input := batchinsights.UserConfig{ 14 | PoolID: &pool1, 15 | NodeID: &node1, 16 | Processes: []string{"foo.exe", "bar"}, 17 | } 18 | result, err := batchinsights.ValidateAndBuildConfig(input) 19 | 20 | assert.Equal(t, nil, err) 21 | assert.Equal(t, "pool-1", result.PoolID) 22 | assert.Equal(t, "node-1", 
result.NodeID) 23 | assert.Equal(t, []string{"foo.exe", "bar"}, result.Processes) 24 | assert.Equal(t, false, result.Disable.DiskIO) 25 | assert.Equal(t, false, result.Disable.NetworkIO) 26 | assert.Equal(t, false, result.Disable.DiskUsage) 27 | assert.Equal(t, false, result.Disable.CPU) 28 | assert.Equal(t, false, result.Disable.Memory) 29 | assert.Equal(t, false, result.Disable.GPU) 30 | 31 | result, err = batchinsights.ValidateAndBuildConfig(batchinsights.UserConfig{ 32 | PoolID: &pool1, 33 | NodeID: &node1, 34 | Disable: []string{"diskIO", "cpu"}, 35 | }) 36 | 37 | assert.Equal(t, nil, err) 38 | assert.Equal(t, true, result.Disable.DiskIO) 39 | assert.Equal(t, false, result.Disable.NetworkIO) 40 | assert.Equal(t, false, result.Disable.DiskUsage) 41 | assert.Equal(t, true, result.Disable.CPU) 42 | assert.Equal(t, false, result.Disable.Memory) 43 | assert.Equal(t, false, result.Disable.GPU) 44 | } 45 | -------------------------------------------------------------------------------- /pkg/cpu/cpu.go: -------------------------------------------------------------------------------- 1 | package cpu 2 | -------------------------------------------------------------------------------- /pkg/cpu/cpu_linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package cpu 4 | 5 | import ( 6 | psutils_cpu "github.com/shirou/gopsutil/cpu" 7 | ) 8 | 9 | func PerCpuPercent() ([]float64, error) { 10 | return psutils_cpu.Percent(0, true) 11 | } 12 | -------------------------------------------------------------------------------- /pkg/cpu/cpu_windows.go: -------------------------------------------------------------------------------- 1 | // +build windows 2 | 3 | package cpu 4 | 5 | import ( 6 | "context" 7 | "github.com/Azure/batch-insights/pkg/wmi" 8 | ) 9 | 10 | type CPUStat struct { 11 | value uint64 12 | timestamp uint64 13 | } 14 | 15 | var lastCpus map[string]CPUStat = make(map[string]CPUStat) 16 | 17 | type 
win32_PerfRawData_Counters_ProcessorInformation struct { 18 | Name string 19 | PercentDPCTime uint64 20 | PercentIdleTime uint64 21 | PercentUserTime uint64 22 | PercentProcessorTime uint64 23 | PercentInterruptTime uint64 24 | PercentPriorityTime uint64 25 | PercentPrivilegedTime uint64 26 | TimeStamp_Sys100NS uint64 27 | InterruptsPerSec uint32 28 | ProcessorFrequency uint32 29 | DPCRate uint32 30 | } 31 | 32 | func PerCpuPercent() ([]float64, error) { 33 | return perCPUPercentWithContext(context.Background()) 34 | } 35 | 36 | func perCPUPercentWithContext(ctx context.Context) ([]float64, error) { 37 | var ret []float64 38 | stats, err := perfInfoWithContext(ctx) 39 | if err != nil { 40 | return nil, err 41 | } 42 | 43 | for _, v := range stats { 44 | last := lastCpus[v.Name] 45 | 46 | lastCpus[v.Name] = CPUStat{ 47 | value: v.PercentProcessorTime, 48 | timestamp: v.TimeStamp_Sys100NS, 49 | } 50 | 51 | if last.timestamp != 0 { 52 | value := (1 - (float64(v.PercentProcessorTime-last.value) / float64(v.TimeStamp_Sys100NS-last.timestamp))) * 100 53 | ret = append(ret, value) 54 | } 55 | } 56 | return ret, nil 57 | } 58 | 59 | // PerfInfo returns the performance counter's instance value for ProcessorInformation. 60 | // Name property is the key by which overall, per cpu and per core metric is known. 
61 | func perfInfoWithContext(ctx context.Context) ([]win32_PerfRawData_Counters_ProcessorInformation, error) { 62 | var ret []win32_PerfRawData_Counters_ProcessorInformation 63 | 64 | q := wmi.CreateQuery(&ret, "WHERE NOT Name LIKE '%_Total'") 65 | err := wmi.QueryWithContext(ctx, q, &ret) 66 | if err != nil { 67 | return []win32_PerfRawData_Counters_ProcessorInformation{}, err 68 | } 69 | 70 | return ret, err 71 | } 72 | -------------------------------------------------------------------------------- /pkg/disk/disk.go: -------------------------------------------------------------------------------- 1 | package disk 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "runtime" 7 | 8 | psutils_disk "github.com/shirou/gopsutil/disk" 9 | ) 10 | 11 | var IS_PLATFORM_WINDOWS = runtime.GOOS == "windows" 12 | 13 | func GetDiskUsage() []*psutils_disk.UsageStat { 14 | var disks = getDiskToWatch() 15 | var stats []*psutils_disk.UsageStat 16 | 17 | for _, diskPath := range disks { 18 | usage, err := psutils_disk.Usage(diskPath) 19 | if err == nil { 20 | stats = append(stats, usage) 21 | } else { 22 | fmt.Println(err) 23 | } 24 | } 25 | 26 | return stats 27 | } 28 | 29 | func getDiskToWatch() []string { 30 | if IS_PLATFORM_WINDOWS == true { 31 | return []string{"C:/", "D:/"} 32 | } else { 33 | var osDisk = "/" 34 | var userDisk = "/mnt/resources" 35 | var exists, _ = pathExists(userDisk) 36 | 37 | if !exists { 38 | userDisk = "/mnt" 39 | } 40 | return []string{osDisk, userDisk} 41 | } 42 | } 43 | 44 | func pathExists(path string) (bool, error) { 45 | _, err := os.Stat(path) 46 | if err == nil { 47 | return true, nil 48 | } 49 | if os.IsNotExist(err) { 50 | return false, nil 51 | } 52 | return true, err 53 | } 54 | -------------------------------------------------------------------------------- /pkg/disk/disk_linux.go: -------------------------------------------------------------------------------- 1 | // +build linux 2 | 3 | package disk 4 | 5 | import ( 6 | "fmt" 7 | 8 | 
"github.com/Azure/batch-insights/pkg/utils" 9 | psutils_disk "github.com/shirou/gopsutil/disk" 10 | ) 11 | 12 | var diskIO = utils.IOAggregator{} 13 | 14 | func DiskIO() *utils.IOStats { 15 | var counters, err = psutils_disk.IOCounters() 16 | 17 | if err != nil { 18 | fmt.Println("Error while retrieving Disk IO", err) 19 | return nil 20 | } 21 | var readBytes uint64 = 0 22 | var writeBytes uint64 = 0 23 | 24 | for _, v := range counters { 25 | readBytes += v.ReadBytes 26 | writeBytes += v.WriteBytes 27 | } 28 | var stats = diskIO.UpdateAggregates(readBytes, writeBytes) 29 | return &stats 30 | } 31 | -------------------------------------------------------------------------------- /pkg/disk/disk_windows.go: -------------------------------------------------------------------------------- 1 | // +build windows 2 | 3 | package disk 4 | 5 | import ( 6 | "context" 7 | "fmt" 8 | 9 | "github.com/Azure/batch-insights/pkg/utils" 10 | "github.com/Azure/batch-insights/pkg/wmi" 11 | ) 12 | 13 | type win32_PerfRawData_PerfDisk_PhysicalDisk struct { 14 | Name string 15 | AvgDiskBytesPerRead uint64 16 | AvgDiskBytesPerRead_Base uint64 17 | AvgDiskBytesPerWrite uint64 18 | AvgDiskBytesPerWrite_Base uint64 19 | AvgDiskReadQueueLength uint64 20 | AvgDiskWriteQueueLength uint64 21 | AvgDisksecPerRead uint64 22 | AvgDisksecPerWrite uint64 23 | } 24 | 25 | var diskIO = utils.IOAggregator{} 26 | 27 | func DiskIO() *utils.IOStats { 28 | return DiskIOWithContext(context.Background()) 29 | } 30 | 31 | func DiskIOWithContext(ctx context.Context, names ...string) *utils.IOStats { 32 | var ret []win32_PerfRawData_PerfDisk_PhysicalDisk 33 | 34 | q := wmi.CreateQuery(&ret, "WHERE NOT Name LIKE '%_Total'") 35 | err := wmi.QueryWithContext(ctx, q, &ret) 36 | if err != nil { 37 | fmt.Println("Error while retrieving DISK IO", err) 38 | return nil 39 | } 40 | 41 | var readBytes uint64 = 0 42 | var writeBytes uint64 = 0 43 | for _, v := range ret { 44 | readBytes += v.AvgDiskBytesPerRead 45 | 
writeBytes += v.AvgDiskBytesPerWrite 46 | } 47 | stats := diskIO.UpdateAggregates(readBytes, writeBytes) 48 | 49 | return &stats 50 | } 51 | -------------------------------------------------------------------------------- /pkg/gpu_stats_collector.go: -------------------------------------------------------------------------------- 1 | package batchinsights 2 | 3 | import ( 4 | "fmt" 5 | "github.com/Azure/batch-insights/nvml" 6 | ) 7 | 8 | // GPUStatsCollector collector that retrieve gpu usage from nvml 9 | type GPUStatsCollector struct { 10 | nvml nvml.NvmlClient 11 | deviceCount uint 12 | } 13 | 14 | // GPUUsage contains gpu stats 15 | type GPUUsage struct { 16 | GPU float64 17 | Memory float64 18 | } 19 | 20 | // NewGPUStatsCollector Create a new instance of the GPU stats collector 21 | func NewGPUStatsCollector() GPUStatsCollector { 22 | nvmlClient, err := nvml.New() 23 | 24 | if err != nil { 25 | fmt.Println("No GPU detected. Nvidia driver might be missing") 26 | } else { 27 | err = nvmlClient.Init() 28 | 29 | if err != nil { 30 | fmt.Println("No GPU detected. Nvidia driver might be missing. 
Error while initializing NVML", err) 31 | nvmlClient = nil 32 | } else { 33 | deviceCount, err := nvmlClient.GetDeviceCount() 34 | 35 | if err != nil { 36 | fmt.Println(err) 37 | } else { 38 | fmt.Printf("NVML is loaded found %d gpus\n", deviceCount) 39 | } 40 | 41 | return GPUStatsCollector{ 42 | nvml: nvmlClient, 43 | deviceCount: deviceCount, 44 | } 45 | } 46 | } 47 | return GPUStatsCollector{} 48 | } 49 | 50 | // GetStats Get GPU stats 51 | func (gpu GPUStatsCollector) GetStats() []GPUUsage { 52 | if gpu.nvml == nil { 53 | return nil 54 | } 55 | 56 | var uses []GPUUsage 57 | 58 | for i := uint(0); i < gpu.deviceCount; i++ { 59 | device, err := gpu.nvml.DeviceGetHandleByIndex(i) 60 | if err != nil { 61 | fmt.Println(err) 62 | continue 63 | } 64 | 65 | memory, err := gpu.nvml.DeviceGetMemoryInfo(device) 66 | 67 | if err != nil { 68 | fmt.Println(err) 69 | } 70 | 71 | use, err := gpu.nvml.DeviceGetUtilizationRates(device) 72 | 73 | if err != nil { 74 | fmt.Println(err) 75 | } 76 | 77 | usage := GPUUsage{ 78 | GPU: float64(use.GPU), 79 | Memory: float64(memory.Used) / float64(memory.Total) * 100, 80 | } 81 | uses = append(uses, usage) 82 | } 83 | return uses 84 | } 85 | 86 | // Shutdown Dispose of the Nvidia driver connection 87 | func (gpu GPUStatsCollector) Shutdown() { 88 | if gpu.nvml == nil { 89 | return 90 | } 91 | gpu.nvml.Shutdown() 92 | } 93 | -------------------------------------------------------------------------------- /pkg/node_stats.go: -------------------------------------------------------------------------------- 1 | package batchinsights 2 | 3 | import ( 4 | "github.com/shirou/gopsutil/disk" 5 | "github.com/shirou/gopsutil/mem" 6 | 7 | "github.com/Azure/batch-insights/pkg/utils" 8 | ) 9 | 10 | // ProcessPerfInfo Process specific information 11 | type ProcessPerfInfo struct { 12 | pid int32 13 | name string 14 | cpu float64 15 | memory uint64 16 | } 17 | 18 | // NodeStats Combined model for all metrics being collected at the given interal 19 | 
type NodeStats struct { 20 | Memory *mem.VirtualMemoryStat 21 | CPUPercents []float64 22 | DiskUsage []*disk.UsageStat 23 | DiskIO *utils.IOStats 24 | NetIO *utils.IOStats 25 | Gpus []GPUUsage 26 | Processes []*ProcessPerfInfo 27 | } 28 | -------------------------------------------------------------------------------- /pkg/processes.go: -------------------------------------------------------------------------------- 1 | package batchinsights 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/shirou/gopsutil/process" 7 | ) 8 | 9 | // love to program in go 10 | func containsCaseInsensitive(xs []string, str string) bool { 11 | 12 | for _, x := range xs { 13 | if strings.EqualFold(str, x) { 14 | return true 15 | } 16 | } 17 | 18 | return false 19 | 20 | } 21 | 22 | // ListProcesses Retrieve process cpu, memory, etc usage for the given list of process names 23 | func ListProcesses(processNames []string) ([]*ProcessPerfInfo, error) { 24 | pids, err := process.Pids() 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | ps := []*ProcessPerfInfo{} 30 | for _, pid := range pids { 31 | 32 | // if err != nil, process has probably disappeared, continue on 33 | if p, err := process.NewProcess(pid); err == nil { 34 | 35 | name, err := p.Name() 36 | if err != nil { 37 | // process might have disappeared 38 | continue 39 | } 40 | 41 | // check if we should include it 42 | if !containsCaseInsensitive(processNames, name) { 43 | continue 44 | } 45 | 46 | cpuPercent, err := p.CPUPercent() 47 | if err != nil { 48 | // process might have disappeared 49 | continue 50 | } 51 | 52 | memoryInfoStat, err := p.MemoryInfo() 53 | if err != nil { 54 | // process might have disappeared 55 | continue 56 | } 57 | 58 | ps = append(ps, &ProcessPerfInfo{ 59 | pid: pid, 60 | name: name, 61 | cpu: cpuPercent, 62 | memory: memoryInfoStat.VMS, 63 | }) 64 | } 65 | 66 | } 67 | 68 | return ps, err 69 | } 70 | -------------------------------------------------------------------------------- 
// IOAggregator converts successive readings of cumulative read/write byte
// counters into per-second rates. The zero value is ready to use.
type IOAggregator struct {
	lastTimestamp *time.Time // time of the previous update; nil before the first one
	lastRead      uint64     // read counter passed to the previous update
	lastWrite     uint64     // write counter passed to the previous update
}

// IOStats holds read and write rates in bytes per second.
type IOStats struct {
	ReadBps  uint64
	WriteBps uint64
}

// UpdateAggregates records the current cumulative counters and returns the
// rates since the previous call.
//
// The first call has nothing to compare against and returns zero rates. A
// rate is also reported as zero when its counter went backwards (for example
// after a counter reset) or when no time has elapsed, instead of letting the
// unsigned subtraction wrap around or dividing by zero.
func (aggregator *IOAggregator) UpdateAggregates(currentRead uint64, currentWrite uint64) IOStats {
	now := time.Now()
	var stats IOStats

	if aggregator.lastTimestamp != nil {
		elapsed := now.Sub(*aggregator.lastTimestamp).Seconds()
		stats.ReadBps = bytesPerSecond(aggregator.lastRead, currentRead, elapsed)
		stats.WriteBps = bytesPerSecond(aggregator.lastWrite, currentWrite, elapsed)
	}

	aggregator.lastTimestamp = &now
	aggregator.lastRead = currentRead
	aggregator.lastWrite = currentWrite

	return stats
}

// bytesPerSecond returns (current-previous)/elapsedSeconds, or 0 when the
// counter decreased or elapsedSeconds is not positive.
func bytesPerSecond(previous, current uint64, elapsedSeconds float64) uint64 {
	if elapsedSeconds <= 0 || current < previous {
		return 0
	}
	return uint64(float64(current-previous) / elapsedSeconds)
}
#!/bin/bash
# Downloads a batch-insights binary and starts it in the background.
#
# Environment:
#   BATCH_INSIGHTS_DOWNLOAD_URL  (required) URL the binary is downloaded from.
#   AZ_BATCH_INSIGHTS_ARGS       (optional) extra command-line arguments.
set -euo pipefail

# Fail early with a clear message instead of letting wget fail on an empty URL.
: "${BATCH_INSIGHTS_DOWNLOAD_URL:?BATCH_INSIGHTS_DOWNLOAD_URL must be set}"

wget -O ./batch-insights "$BATCH_INSIGHTS_DOWNLOAD_URL"
chmod +x ./batch-insights

# Word splitting is intentional: the variable carries several arguments.
# stderr is sent to the same log so error output is not lost.
# shellcheck disable=SC2086
./batch-insights ${AZ_BATCH_INSIGHTS_ARGS:-} > batch-insights.log 2>&1 &
Removing it and restarting it"; 14 | Stop-ScheduledTask -TaskName "batchappinsights"; 15 | Unregister-ScheduledTask -Confirm:$false -TaskName "batchappinsights"; 16 | } 17 | 18 | Write-Output "Starting App insights background process in $wd" 19 | 20 | # If using batch insights 1.x you need to have those arguments this way 21 | $legacyArgs = "$env:AZ_BATCH_POOL_ID $env:AZ_BATCH_NODE_ID $env:APP_INSIGHTS_INSTRUMENTATION_KEY"; 22 | 23 | $toolArgs = "$legacyArgs --poolID $env:AZ_BATCH_POOL_ID --nodeID $env:AZ_BATCH_NODE_ID --instKey $env:APP_INSIGHTS_INSTRUMENTATION_KEY $env:AZ_BATCH_INSIGHTS_ARGS" 24 | 25 | # TODO-TIM add toolsArgs 26 | $action = New-ScheduledTaskAction -WorkingDirectory $wd -Execute 'cmd.exe' -Argument "/c $exe $toolArgs > $wd\nodestats.log 2>&1" 27 | $principal = New-ScheduledTaskPrincipal -UserID 'NT AUTHORITY\SYSTEM' -LogonType ServiceAccount -RunLevel Highest ; 28 | $settings = New-ScheduledTaskSettingsSet -RestartCount 255 -RestartInterval ([timespan]::FromMinutes(1)) -ExecutionTimeLimit ([timespan]::FromDays(365)) 29 | Register-ScheduledTask -Action $action -Principal $principal -TaskName "batchappinsights" -Settings $settings -Force 30 | 31 | Start-ScheduledTask -TaskName "batchappinsights"; 32 | Get-ScheduledTask -TaskName "batchappinsights"; 33 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # Script to be used as one liner 2 | 3 | ## Major version compatibility 4 | Scripts in directories `1.x`, etc. contains version of the below script which are locked for a given version major version of batch insights. 5 | This means that you can reference those to make sure you don't get broken when a new major version comes in. 
6 | **It is recommended to do so.** 7 | 8 | ## Scripts 9 | 10 | ### For linux 11 | 12 | * `run-linux.sh`: Run a published version for linux 13 | * `dev.sh`: Will install go, git then build and run on the fly 14 | 15 | 16 | ### For windows 17 | * `run-windows.ps1`: Run a published version for windows 18 | * `dev-windows.ps1`: Will install go, git then build and run on the fly 19 | 20 | 21 | ## Development 22 | There are dev scripts that install go and the other dependencies needed to build and run this project on the fly. 23 | Set `BATCH_INSIGHTS_BRANCH` environment variable to the branch you are testing 24 | 25 | On linux 26 | ```bash 27 | /bin/bash -c 'wget -O - https://raw.githubusercontent.com/Azure/batch-insights/$BATCH_INSIGHTS_BRANCH/scripts/dev.sh | bash' 28 | ``` 29 | 30 | On windows 31 | 32 | ```powershell 33 | cmd /c @"%SystemRoot%\System32\WindowsPowerShell\v1.0\powershell.exe" -NoProfile -InputFormat None -ExecutionPolicy Bypass -Command "iex ((New-Object System.Net.WebClient).DownloadString('https://raw.githubusercontent.com/Azure/batch-insights/%BATCH_INSIGHTS_BRANCH%/scripts/dev-windows.ps1'))" 34 | ``` 35 | -------------------------------------------------------------------------------- /scripts/dev-windows.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "Stop" 2 | 3 | $wd = $env:AZ_BATCH_TASK_WORKING_DIR 4 | $branch = $env:BATCH_INSIGHTS_BRANCH 5 | 6 | Invoke-Expression ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')) 7 | choco install -y golang git 8 | choco install -y -f mingw 9 | choco install -y mingw 10 | $env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User") 11 | 12 | git clone https://github.com/Azure/batch-insights -b $branch 13 | 14 | Set-Location ./batch-insights 15 | 16 | go build -x -v 17 | 18 | $exe = "$wd/batch-insights/batch-insights.exe" 
#!/usr/bin/env bash
# Development bootstrap for Linux: installs build tools and Go, clones the
# requested branch of batch-insights, builds it and starts it in the
# background.
#
# Environment:
#   BATCH_INSIGHTS_BRANCH      (required) branch to clone and build.
#   AZ_BATCH_TASK_WORKING_DIR  (required) directory receiving batch-insights.log.
#   AZ_BATCH_INSIGHTS_ARGS     (optional) extra arguments for batch-insights.
#   GO_VERSION                 (optional) Go release to install, default 1.11.
set -euo pipefail

: "${BATCH_INSIGHTS_BRANCH:?BATCH_INSIGHTS_BRANCH must be set}"
: "${AZ_BATCH_TASK_WORKING_DIR:?AZ_BATCH_TASK_WORKING_DIR must be set}"

readonly branch=$BATCH_INSIGHTS_BRANCH
readonly go_archive="go${GO_VERSION:-1.11}.linux-amd64.tar.gz"

echo "Running Batch insights dev script for linux from branch $branch"

apt-get update
apt-get install -y git binutils bison build-essential

export GOROOT=/usr/local/go
# Remove any previous installation; -f makes this a no-op when it is absent.
rm -rf -- "$GOROOT"

# -O overwrites an archive left by an earlier run instead of saving "*.1".
wget -O "$go_archive" "https://dl.google.com/go/${go_archive}"
# The archive contains a top-level "go" directory, so this creates $GOROOT.
tar -C /usr/local -xf "$go_archive"
# Only add $GOPATH/bin when GOPATH is set; otherwise "/bin" would be prepended.
export PATH="${GOPATH:+$GOPATH/bin:}$GOROOT/bin:$PATH"

echo "GO version $(go version)"

# Drop a checkout left by a previous run so that git clone can succeed.
rm -rf -- batch-insights
git clone https://github.com/Azure/batch-insights -b "$branch"

cd batch-insights
go build

# Word splitting is intentional: the variable carries several arguments.
# shellcheck disable=SC2086
./batch-insights ${AZ_BATCH_INSIGHTS_ARGS:-} > "$AZ_BATCH_TASK_WORKING_DIR/batch-insights.log" 2>&1 &
#!/usr/bin/env bash
# Manual GPU test for Linux: installs the NVIDIA driver and CUDA from the
# ubuntu1804 CUDA repository, installs Go, then builds and runs batch-insights
# in the foreground.
#
# Environment:
#   BATCH_INSIGHTS_BRANCH  (optional) branch to build, default feature/go-gpu.
#   GO_VERSION             (optional) Go release to install, default 1.11.
set -euo pipefail

readonly cuda_repo="https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64"
readonly branch=${BATCH_INSIGHTS_BRANCH:-feature/go-gpu}
readonly go_archive="go${GO_VERSION:-1.11}.linux-amd64.tar.gz"

# apt remove nvidia-cuda-toolkit
# apt remove nvidia-*
apt-get update
# Fetch the repository signing key over HTTPS so it cannot be swapped in transit.
apt-key adv --fetch-keys "${cuda_repo}/7fa2af80.pub"
echo "deb ${cuda_repo} /" > /etc/apt/sources.list.d/cuda.list
apt-get update
apt-get install -y nvidia-driver-410 --no-install-recommends
apt-get install -y cuda-10-0 --no-install-recommends
apt-get install -y git binutils bison build-essential --no-install-recommends

export GOROOT=/usr/local/go
# Remove any previous installation; -f makes this a no-op when it is absent.
rm -rf -- "$GOROOT"

# -O overwrites an archive left by an earlier run instead of saving "*.1".
wget -O "$go_archive" "https://dl.google.com/go/${go_archive}"
# The archive contains a top-level "go" directory, so this creates $GOROOT.
tar -C /usr/local -xf "$go_archive"
# Only add $GOPATH/bin when GOPATH is set; otherwise "/bin" would be prepended.
export PATH="${GOPATH:+$GOPATH/bin:}$GOROOT/bin:$PATH"

echo "GO version $(go version)"

git clone https://github.com/Azure/batch-insights -b "$branch"

cd batch-insights
go build

./batch-insights
#!/bin/bash
# Downloads a published batch-insights binary and starts it in the background.
#
# Environment:
#   BATCH_INSIGHTS_DOWNLOAD_URL  (required) URL the binary is downloaded from.
#   AZ_BATCH_INSIGHTS_ARGS       (optional) extra command-line arguments.
set -euo pipefail

# Fail early with a clear message instead of letting wget fail on an empty URL.
: "${BATCH_INSIGHTS_DOWNLOAD_URL:?BATCH_INSIGHTS_DOWNLOAD_URL must be set}"

wget -O ./batch-insights "$BATCH_INSIGHTS_DOWNLOAD_URL"
chmod +x ./batch-insights

# Word splitting is intentional: the variable carries several arguments.
# stderr is sent to the same log so error output is not lost.
# shellcheck disable=SC2086
./batch-insights ${AZ_BATCH_INSIGHTS_ARGS:-} > batch-insights.log 2>&1 &
#!/usr/bin/env bash
# Installs the Python dependencies of nodestats.py, downloads the script from
# the master branch and starts it in the background, logging to
# batch-insights.log in the current directory.
set -euo pipefail

apt-get update
# NOTE(review): these package names presumably target Python 2; confirm they
# exist on the Ubuntu release in use.
apt-get -y install python-dev python-pip
pip install psutil python-dateutil applicationinsights==0.11.3

# -O overwrites a copy left by a previous run. Without it wget would save the
# new download as nodestats.py.1 and the stale nodestats.py would be executed.
wget --no-cache -O nodestats.py https://raw.githubusercontent.com/Azure/batch-insights/master/nodestats.py

python --version
python nodestats.py > batch-insights.log 2>&1 &
# Working directory used for the scheduled task created below.
$wd = $env:AZ_BATCH_TASK_WORKING_DIR

# Enable TLS 1.2 before any download, as scripts/run-windows.ps1 does, so the
# HTTPS requests below do not depend on the machine's default protocol list.
[Net.ServicePointManager]::SecurityProtocol = "tls12, tls11, tls"

# Install Chocolatey, use it to install Python, then reload PATH so the newly
# installed executables can be resolved in this session.
Invoke-Expression ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1'))
choco install -y python --version 3.6.3
$env:Path = [System.Environment]::GetEnvironmentVariable("Path","Machine") + ";" + [System.Environment]::GetEnvironmentVariable("Path","User")
Write-Host "Current path: $env:Path"

Write-Host "Python version:"
python --version
pip install psutil python-dateutil applicationinsights==0.11.3
Write-Host "Downloading nodestats.py"
Invoke-WebRequest https://raw.githubusercontent.com/Azure/batch-insights/master/nodestats.py -OutFile nodestats.py

# Delete if exists
$exists = Get-ScheduledTask | Where-Object {$_.TaskName -like "batchappinsights" };

if($exists)
{
    Write-Host "Scheduled task already exists. Removing it and restarting it";
    Stop-ScheduledTask -TaskName "batchappinsights";
    Unregister-ScheduledTask -Confirm:$false -TaskName "batchappinsights";
}

$pythonPath = get-command python | Select-OBject -ExpandProperty Definition
Write-Host "Resolved python path to $pythonPath"

# Register nodestats.py as a scheduled task running as SYSTEM, with stdout and
# stderr redirected to log files in the working directory, then start it.
Write-Host "Starting App insights background process in $wd"
$action = New-ScheduledTaskAction -WorkingDirectory $wd -Execute 'Powershell.exe' -Argument "Start-Process $pythonPath -ArgumentList ('.\nodestats.py','$env:AZ_BATCH_POOL_ID', '$env:AZ_BATCH_NODE_ID', '$env:APP_INSIGHTS_INSTRUMENTATION_KEY') -RedirectStandardOutput .\batch-insights.log -RedirectStandardError .\batch-insights.err.log -NoNewWindow"
$principal = New-ScheduledTaskPrincipal -UserID 'NT AUTHORITY\SYSTEM' -LogonType ServiceAccount -RunLevel Highest ;
Register-ScheduledTask -Action $action -Principal $principal -TaskName "batchappinsights" -Force ;
Start-ScheduledTask -TaskName "batchappinsights";
Get-ScheduledTask -TaskName "batchappinsights";
--------------------------------------------------------------------------------