├── images ├── graph1.png ├── graph2.png ├── rtm_api.png ├── dashboard1.png ├── dashboard2.png ├── datasource1.png ├── datasource2.png ├── rtm_grafana.png ├── rtm_manager1.png ├── rtm_manager2.png └── rtm_manager3.png ├── CHANGELOG.md ├── deb ├── input ├── after-install.sh └── scripts │ ├── nvidia_smi_stats │ ├── rtmHourly.pl │ ├── rtmRaidCheck.pl │ └── rtmHardware.pl ├── MAINTAINERS.md ├── AUTHORS.md ├── CONTRIBUTORS.md ├── LICENCE.md ├── rtm_grafana.md ├── Makefile ├── CONTRIBUTING.md └── README.md /images/graph1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/graph1.png -------------------------------------------------------------------------------- /images/graph2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/graph2.png -------------------------------------------------------------------------------- /images/rtm_api.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/rtm_api.png -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ovh-rtm-binaries: 1.0.12 4 | 5 | initial commit. 
6 | -------------------------------------------------------------------------------- /images/dashboard1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/dashboard1.png -------------------------------------------------------------------------------- /images/dashboard2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/dashboard2.png -------------------------------------------------------------------------------- /images/datasource1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/datasource1.png -------------------------------------------------------------------------------- /images/datasource2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/datasource2.png -------------------------------------------------------------------------------- /images/rtm_grafana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/rtm_grafana.png -------------------------------------------------------------------------------- /images/rtm_manager1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/rtm_manager1.png -------------------------------------------------------------------------------- /images/rtm_manager2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ovh/rtm/HEAD/images/rtm_manager2.png -------------------------------------------------------------------------------- /images/rtm_manager3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ovh/rtm/HEAD/images/rtm_manager3.png -------------------------------------------------------------------------------- /deb/input: -------------------------------------------------------------------------------- 1 | deb/scripts/rtmHardware.pl=/usr/bin/rtmHardware 2 | deb/scripts/rtmHourly.pl=/usr/bin/rtmHourly 3 | deb/scripts/rtmRaidCheck.pl=/usr/bin/rtmRaidCheck 4 | deb/scripts/nvidia_smi_stats=/usr/bin/nvidia_smi_stats 5 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | This is the official list of the project maintainers. 2 | This is mostly useful for contributors that want to push 3 | significant pull requests or for project management issues. 4 | 5 | 6 | Names should be added to this file like so: 7 | Individual's name 8 | Individual's name 9 | 10 | ## Please keep the list sorted. 11 | 12 | - Alexis AUTRET (alexis.Autret@ovhcloud.com) -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | This is the official list of OVH RTM authors for copyright purposes. 2 | This file is distinct from the CONTRIBUTORS files 3 | and it lists the copyright holders only. 4 | 5 | Names should be added to this file as one of 6 | Organization's name 7 | Individual's name 8 | Individual's name 9 | See CONTRIBUTORS for the meaning of multiple email addresses. 10 | 11 | ##Please keep the list sorted. 12 | 13 | OVH SAS 14 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | This is the official list of people who can contribute 2 | (and typically have contributed) code to the OVH RTM repository. 
3 | 4 | Names should be added to this file only after verifying that 5 | the individual or the individual's organization has agreed to 6 | the appropriate CONTRIBUTING.md file. 7 | 8 | Names should be added to this file like so: 9 | Individual's name 10 | Individual's name 11 | 12 | ## Please keep the list sorted. 13 | 14 | - Alexis AUTRET (alexis.autret@ovhcloud.com) 15 | - Raphael GLON (raphael.glon@ovhcloud.com) 16 | - Sylvain CHENOT (sylvain.chenot@ovhcloud.com) 17 | -------------------------------------------------------------------------------- /LICENCE.md: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2013-2020, OVH SAS 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /rtm_grafana.md: -------------------------------------------------------------------------------- 1 | # RTM on grafana 2 | 3 | on OVH grafana public instance using Insight metrics token with TSL 4 | 5 | More info on TSL: https://github.com/ovh/tsl 6 | 7 | ## Retrieve your metric insight token 8 | 9 | On OVH API, retrieve your insight token using this call: 10 | /me/insight 11 | 12 | More info on Insight : https://docs.ovh.com/gb/en/metrics/metrics-insight/ 13 | 14 | ## Use OVH grafana public instance 15 | 16 | Go to https://grafana.metrics.ovh.net/login and log in with your OVH account 17 | 18 | ## Configure a data source 19 | 20 | ![](images/datasource1.png) 21 | 22 | ### add a Warp10 datasource 23 | 24 | Select a warp10 type datasource. 25 | 26 | TSL is not yet integrated as a datasource, so we will use it through a Warp10 script.
27 | 28 | 29 | ![](images/datasource2.png) 30 | 31 | 32 | URL datasource is : https://warp10.insight.eu.metrics.ovh.net 33 | 34 | Copy your insight Token as constant value with name "rtoken" (and don't forget to click on "+" button to add it) 35 | 36 | ## Create your first graph 37 | 38 | ### Create your dashboard 39 | 40 | ![](images/dashboard1.png) 41 | 42 | -Create new Visualization 43 | 44 | select graph: 45 | 46 | ![](images/dashboard2.png) 47 | 48 | -Then edit the graph: 49 | 50 | tick all the options "Hide labelds, Hide attribute and Warpscript10 editor". 51 | 52 | ![](images/graph1.png) 53 | 54 | Select WarpScript editor and copy paste this TSL query: 55 | 56 | ``` 57 | $rtoken 58 | <' 59 | select("os.cpu").where("ip=YOUR SERVER IP").from(start,end) 60 | '> 61 | TSL 62 | ``` 63 | 64 | **metrics available are:** 65 | * os.cpu 66 | * os.swap 67 | * os.mem 68 | * os.net.bytes (both direction) 69 | * os.load1 70 | * os.load5 71 | * os.load15 72 | * os.load.processesup 73 | * os.load.processesactive 74 | 75 | for each disk: 76 | * os.disk.fs 77 | * os.disk.fs.used 78 | * os.disk.fs.total 79 | * os.disk.fs.inodes.used 80 | * os.disk.fs.inodes.total 81 | 82 | 83 | This is it , you can now graph your server metrics :D 84 | 85 | 86 | TODO: provide a complete template 87 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BUILD_DIR=./build 2 | 3 | .PHONY: build 4 | build: clean 5 | mkdir -p $(BUILD_DIR) 6 | 7 | .PHONY: deb 8 | deb: 9 | rm -f $(BUILD_DIR)/ovh-rtm-binaries*.deb 10 | mkdir -p $(BUILD_DIR) 11 | fpm -m ""\ 12 | --description "ovh real time monitoring. This package provide OVH Real Times Monitoring scripts." 
\ 13 | --url "https://docs.ovh.com/gb/en/dedicated/install-rtm/" \ 14 | --license "BSD-3-Clause" \ 15 | --version $(shell echo $$(git for-each-ref --sort=taggerdate --format '%(refname) %(taggerdate)' refs/tags | tail -n 1 | awk '{print $$1}' | sed 's/refs\/tags\///')-$$(lsb_release -cs)) \ 16 | -n ovh-rtm-binaries \ 17 | -d 'smartmontools' \ 18 | -d 'hddtemp' \ 19 | -d 'dmidecode' \ 20 | -d 'util-linux' \ 21 | -d 'sg3-utils' \ 22 | -d 'lsscsi' \ 23 | -d 'sysstat' \ 24 | -d 'lsb-release' \ 25 | -s dir -t deb \ 26 | --vendor "ovh" \ 27 | -a all \ 28 | --after-install deb/after-install.sh \ 29 | -p ./build \ 30 | --inputs deb/input \ 31 | --deb-no-default-config-files 32 | 33 | .PHONY: rpm 34 | rpm: 35 | rm -f $(BUILD_DIR)/ovh-rtm-binaries*.rpm 36 | mkdir -p $(BUILD_DIR) 37 | fpm -m ""\ 38 | --description "ovh real time monitoring. This package provide OVH Real Times Monitoring scripts." \ 39 | --url "https://docs.ovh.com/gb/en/dedicated/install-rtm/" \ 40 | --license "BSD-3-Clause" \ 41 | --version $(shell git for-each-ref --sort=taggerdate --format '%(refname) %(taggerdate)' refs/tags | tail -n 1 | awk '{print $$1}' | sed 's/refs\/tags\///') \ 42 | -n ovh-rtm-binaries \ 43 | -d 'smartmontools' \ 44 | -d 'dmidecode' \ 45 | -d 'util-linux' \ 46 | -d 'sg3_utils' \ 47 | -d 'lsscsi' \ 48 | -d 'sysstat' \ 49 | -d 'redhat-lsb'\ 50 | --vendor "ovh" \ 51 | --rpm-os "linux" \ 52 | -a all \ 53 | --after-install deb/after-install.sh \ 54 | -p ./build \ 55 | --inputs deb/input \ 56 | -s dir -t rpm 57 | 58 | .PHONY: clean 59 | clean: 60 | rm -rf build 61 | rm -f *.deb 62 | rm -f *.rpm 63 | -------------------------------------------------------------------------------- /deb/after-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # check if we have a nvidia card on this host 3 | if hash lshw 2>/dev/null; then 4 | echo "lshw found, checking if Nvidia card is present"; 5 | SEARCH='display[ ]*NVIDIA|vendor: NVIDIA 
Corporation' # should match: display NVIDIA Corporation 6 | TEST=`lshw -class display | grep -Ei "$SEARCH"` 7 | exit_status=$? 8 | if [ $exit_status -eq 0 ]; then 9 | echo "Nvidia graphics cards found, installing metrics probes" 10 | if [ ! -d "/opt/noderig/60" ]; then 11 | mkdir -p /opt/noderig/60/ 12 | fi 13 | if [ ! -f /opt/noderig/60/nvidia_smi_stats ]; then 14 | ln -s /usr/bin/nvidia_smi_stats /opt/noderig/60/nvidia_smi_stats 15 | fi 16 | if hash nvidia-smi 2>/dev/null; then 17 | echo "nvidia-smi driver found, install OK" 18 | else 19 | echo "nvidia-smi package not found, please install-it (and drivers) to get this probe working." 20 | fi 21 | else 22 | echo "no Nvidia graphics card found" 23 | # rm probe 24 | rm /usr/bin/nvidia_smi_stats 25 | fi 26 | elif hash lspci 2>/dev/null; then 27 | echo "lspci found, checking if Nvidia card is present" # lspci 28 | SEARCH='VGA compatible controller: NVIDIA|3D controller: NVIDIA' # should match: VGA compatible controller: NVIDIA 29 | TEST=`lspci | grep -Ei "$SEARCH"` 30 | exit_status=$? 31 | if [ $exit_status -eq 0 ]; then 32 | echo "Nvidia graphics cards found, installing metrics probes" 33 | if [ ! -d "/opt/noderig/60" ]; then 34 | mkdir -p /opt/noderig/60/ 35 | fi 36 | if [ ! -f /opt/noderig/60/nvidia_smi_stats ]; then 37 | ln -s /usr/bin/nvidia_smi_stats /opt/noderig/60/nvidia_smi_stats 38 | fi 39 | if hash nvidia-smi 2>/dev/null; then 40 | echo "nvidia-smi driver found, install OK" 41 | else 42 | echo "nvidia-smi package not found, please install-it (and drivers) to get this probe working." 
43 | fi 44 | else 45 | echo "no Nvidia graphics card found" 46 | rm /usr/bin/nvidia_smi_stats 47 | fi 48 | else 49 | echo "lshw or lspci packages not found, please install them if you need nvidia metrics probe" 50 | fi 51 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to OVH-RTM 2 | 3 | This project accepts contributions. In order to contribute, you should 4 | pay attention to a few things: 5 | 6 | 1. your code must follow the coding style rules 7 | 2. your code must be unit-tested 8 | 3. your code must be documented 9 | 4. your work must be signed (see below) 10 | 5. you may contribute through GitHub Pull Requests 11 | 12 | # Coding and documentation Style 13 | 14 | ##LANGUAGE_GUIDELINES## 15 | 16 | # Submitting Modifications 17 | 18 | The contributions should be submitted through Github Pull Requests 19 | and follow the DCO which is defined below. 20 | 21 | # Licensing for new files 22 | 23 | OVH-RTM is licensed under a 3-Clause BSD license. Anything 24 | contributed to OVH-RTM must be released under this license. 25 | 26 | When introducing a new file into the project, please make sure it has a 27 | copyright header making clear under which license it's being released. 28 | 29 | # Developer Certificate of Origin (DCO) 30 | 31 | To improve tracking of contributions to this project we will use a 32 | process modeled on the modified DCO 1.1 and use a "sign-off" procedure 33 | on patches that are being emailed around or contributed in any other 34 | way. 35 | 36 | The sign-off is a simple line at the end of the explanation for the 37 | patch, which certifies that you wrote it or otherwise have the right 38 | to pass it on as an open-source patch. 
The rules are pretty simple: 39 | if you can certify the below: 40 | 41 | By making a contribution to this project, I certify that: 42 | 43 | (a) The contribution was created in whole or in part by me and I have 44 | the right to submit it under the open source license indicated in 45 | the file; or 46 | 47 | (b) The contribution is based upon previous work that, to the best of 48 | my knowledge, is covered under an appropriate open source License 49 | and I have the right under that license to submit that work with 50 | modifications, whether created in whole or in part by me, under 51 | the same open source license (unless I am permitted to submit 52 | under a different license), as indicated in the file; or 53 | 54 | (c) The contribution was provided directly to me by some other person 55 | who certified (a), (b) or (c) and I have not modified it. 56 | 57 | (d) The contribution is made free of any other party's intellectual 58 | property claims or rights. 59 | 60 | (e) I understand and agree that this project and the contribution are 61 | public and that a record of the contribution (including all 62 | personal information I submit with it, including my sign-off) is 63 | maintained indefinitely and may be redistributed consistent with 64 | this project or the open source license(s) involved. 65 | 66 | 67 | then you just add a line saying 68 | 69 | Signed-off-by: Random J Developer 70 | 71 | using your real name (sorry, no pseudonyms or anonymous contributions.) 72 | -------------------------------------------------------------------------------- /deb/scripts/nvidia_smi_stats: -------------------------------------------------------------------------------- 1 | #! 
#! /usr/bin/perl
# nvidia_smi_stats - run nvidia-smi once and print, for every GPU it reports,
# one "rtm_gpu.<metric>{gpu_uuid=<uuid>} <epoch> <value>" line per metric on
# STDOUT.  Errors are reported on STDOUT as plain text; the script itself does
# not set a specific exit code.
$ENV{"LC_ALL"} = "POSIX";
use strict;
use warnings;
use IPC::Open3;

my $fnret = smi_stats();
if (!ok($fnret))
{
    print "Error with nvidia smi stats \n";
}

# Query nvidia-smi in CSV mode and print the metrics of each GPU.
#
# Returns { status => 100 } on success, or
#         { status => 500, msg => "nvidia-smi error: ..." } when the command
#         could not be run or failed.
sub smi_stats
{
    my $fnret = execute(
        "/usr/bin/nvidia-smi",
        "--query-gpu=gpu_name,gpu_uuid,index,memory.total,memory.used,utilization.memory,utilization.gpu,encoder.stats.sessionCount,temperature.gpu,power.draw",
        "--format=csv,noheader,nounits"
    );
    if ($fnret->{status} != 100)
    {
        print "$fnret->{msg} \n";
        return { status => 500, msg => "nvidia-smi error: $fnret->{msg}" };
    }

    # Metric suffixes for CSV columns 3..9, in the order requested above
    # (columns 0, 1 and 2 are the GPU name, UUID and index).
    my @defaulted_metrics = qw(
        memory_total memory_used utilization_memory utilization_gpu
        session_count temperature power_draw
    );

    foreach my $value (@{ $fnret->{value} })
    {
        next if $value !~ /\S/;    # nothing to parse on a blank line

        my @data     = split /, /, $value;
        my $gpu_name = defined $data[0] ? $data[0] : '';
        my $gpu_uuid = defined $data[1] ? $data[1] : '';
        my $index    = defined $data[2] ? $data[2] : '';

        my @metrics = (
            [ name  => "'$gpu_name'" ],
            [ index => $index ],
        );
        foreach my $column (0 .. $#defaulted_metrics)
        {
            # A missing or empty column is reported as 0, as before.
            push @metrics, [ $defaulted_metrics[$column], $data[ $column + 3 ] || 0 ];
        }

        # The format string is a constant: values coming from nvidia-smi are
        # passed as arguments so that a '%' in them is printed literally.
        my $now = time;
        foreach my $metric (@metrics)
        {
            printf "rtm_gpu.%s{gpu_uuid=%s} %d %s\n", $metric->[0], $gpu_uuid, $now, $metric->[1];
        }
    }
    return { status => 100 };
}

# Tell whether a result hash (as returned by execute/smi_stats) is a success.
#
# Returns 1 when $arg is a hash reference whose status is 100, 0 otherwise.
# As a side effect, the msg of a status-500 result is printed on STDOUT.
sub ok
{
    my $arg = shift;
    if (ref $arg eq 'HASH' and $arg->{status} eq 100)
    {
        return 1;
    }
    elsif (ref $arg eq 'HASH' and $arg->{status} eq 500 and defined($arg->{msg}))
    {
        print $arg->{msg};
    }
    return 0;
}

# Run an executable with arguments and capture its output (STDOUT and STDERR
# are read from the same handle).
#
# Parameters: $bin  - path of the executable (must exist and be executable)
#             @args - arguments passed to it
# Returns a hash reference:
#   { status => 100, value => \@lines }       on success
#   { status => 201, msg => ... }             when $bin is undefined
#   { status => 200, msg => ... }             when $bin is not executable
#   { status => 500, msg => ..., value => $raw_output }
#                                             when the child could not be
#                                             started, exited with a non-zero
#                                             code or was killed by a signal
sub execute
{
    my ($bin, @args) = @_;
    defined($bin) or return { status => 201, msg => 'No binary specified (execute)' };

    -x $bin or return { status => 200, msg => $bin." not exist? \n" };

    my ($in, $out);
    # open3() raises an exception instead of returning false when the child
    # cannot be started, so the failure has to be trapped here.
    my $pid = eval { IPC::Open3::open3($in, $out, $out, $bin, @args) };
    if (!$pid)
    {
        my $reason = $@ || $!;
        return { status => 500, msg => 'Failed to fork : '.$reason };
    }

    my $stdout = do { local $/; <$out> };
    $stdout = '' unless defined $stdout;    # the child printed nothing

    my $ret         = waitpid($pid, 0);
    my $wait_status = $?;
    my $status      = ($wait_status >> 8);

    close($in);
    close($out);
    my @stdout = split(/\n/, $stdout);
    if ($ret != $pid)
    {
        return { status => 500, msg => 'Invalid fork return (waitpid)', value => $stdout };
    }
    elsif ($status != 0)
    {
        return { status => 500, msg => 'Binary '.$bin.' exited on a non-zero status ('.$status.')', value => $stdout };
    }
    elsif ($wait_status != 0)
    {
        # Exit code 0 in the high byte but a non-zero wait status: the child
        # was terminated by a signal.
        return { status => 500, msg => 'Binary '.$bin.' was killed by signal '.($wait_status & 127), value => $stdout };
    }
    return { status => 100, value => \@stdout };
}

# Everything after __END__ is not part of this script: it is the text that
# follows the script on the same physical line of the listing, kept verbatim.
__END__
-------------------------------------------------------------------------------- /deb/scripts/rtmHourly.pl: -------------------------------------------------------------------------------- 1 | #!
#! /usr/bin/perl
# rtmHourly - collect host information (process counts, top memory consumers,
# listening TCP ports, uptime, hostname) into %server and print it on STDOUT as
# one {"metric":...,"timestamp":...,"value":...} JSON object per line.
$ENV{"LC_ALL"} = "POSIX";
use strict;
use utf8; # for \x{nnn} regex
use warnings;
use IPC::Open3;

# init server hash
my %server = ();

systemInfo();
hash_walk(\%server, [], \&print_keys_and_value);

# Run every probe and store its results in %server.  A failing probe only
# prints an error line; the remaining probes still run.
sub systemInfo
{
    $server{'rtm.info.rtm.version'} = "1.0.12";

    my $fnret = processes();
    if (ok($fnret))
    {
        $server{"os.load.processesactive"} = $fnret->{value}->{active};
        $server{"os.load.processesup"} = $fnret->{value}->{up};
    }
    else
    {
        print "Error with processes \n";
    }

    # These two probes write their values into %server themselves.
    ok(_getTopProcess())    or print "Error with getTopProcess \n";
    ok(_getPortsAndInfos()) or print "Error with getPortsAndInfos \n";

    $fnret = uptime();
    if (ok($fnret))
    {
        $server{"rtm.info.uptime"} = $fnret->{value};
    }
    else
    {
        print "Error with uptime \n";
    }

    # hostname
    $fnret = execute('hostname');
    if (ok($fnret) and defined($fnret->{value}[0]))
    {
        $server{"rtm.hostname"}=$fnret->{value}[0];
    }
    else
    {
        $server{"rtm.hostname"}="Unknow";
    }
    return;
}

# Count the processes on the host, leaving out those that belong to the
# sessions of noderig and beamium.
#
# Returns { status => 100, value => { up => <total>, active => <running> } }
# or { status => 500, msg => ... } when ps could not be run.
sub processes
{
    my $fnret = execute('/bin/ps --no-headers -C noderig,beamium -o sess | sort -n | uniq');
    if ($fnret->{status} != 100)
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "ps error: ".$fnret->{msg}." \n" };
    }

    # Session ids of the monitoring agents, indexed for constant-time lookup.
    my %is_rtm_sid;
    foreach my $line (@{ $fnret->{value} })
    {
        $is_rtm_sid{ $1 + 0 } = 1 if $line =~ /(\d+)/;
    }

    $fnret = execute('/bin/ps --no-headers -A -o sess,state,command');
    if ($fnret->{status} != 100)
    {
        print "ps error: ".$fnret->{msg}."\n";
        return { status => 500, msg => "ps error: ".$fnret->{msg}." \n" };
    }

    my $active = 0;
    my $total = 0;
    foreach my $line (@{ $fnret->{value} })
    {
        next if $line !~ /(\d+)\s+(\S+)/;
        my ($sid, $state) = ($1, $2);
        next if $is_rtm_sid{ $sid + 0 };
        ++$total;
        ++$active if $state =~ /^R/;
    }
    return {status=>100, value => {up => $total, active=>$active}};
}

# Store the name and virtual size of the biggest processes in %server, under
# rtm.info.mem.top_mem_<rank>_name / _size for ranks 1 to 5.  Ranks without a
# process keep the value "Unknown".
#
# Returns { status => 100 } or { status => 500, msg => ... }.
sub _getTopProcess
{
    my $maxTop = 5;
    my $fnret = execute('/bin/ps -A -o vsz,cmd --sort=-vsz --no-headers | head -n 7 | grep -vE "[0123456789]+[ ]/usr/bin/(noderig|beamium)"');
    if ($fnret->{status} != 100)
    {
        print "ps error: ".$fnret->{msg}." \n";
        return { status => 500, msg => "ps error: ".$fnret->{msg}."\n" };
    }

    foreach my $rank (1 .. $maxTop)
    {
        $server{"rtm.info.mem.top_mem_".$rank."_name"} = "Unknown";
        $server{"rtm.info.mem.top_mem_".$rank."_size"} = "Unknown";
    }

    my $rank = 0;
    foreach my $line (@{ $fnret->{value} })
    {
        next unless $line =~ m/\s*(\d+)\s+(.+)/;
        my ($size, $command) = ($1, $2);
        my ($name) = split ' ', $command;
        $rank++;
        $server{'rtm.info.mem.top_mem_'.$rank.'_size'} = $size;
        $server{'rtm.info.mem.top_mem_'.$rank.'_name'} = $name;
        # ps is asked for 7 lines so that 5 remain once the agents are
        # filtered out; never report more ranks than were initialised.
        last if $rank >= $maxTop;
    }
    return {status=>100};
}

# Store, for each listening IPv4 TCP socket reported by netstat (at most 50),
# the owning pid, process name, first word of the command line, executable
# path, uid and user name in %server under
# rtm.info.tcp.listen.ip-<ip>.port-<port>.<field>.
#
# A socket whose process cannot be inspected (no pid shown by netstat, or the
# process is gone) is reported on STDOUT and skipped; the others are still
# collected.
#
# Returns { status => 100 }, or { status => 500 ... } when netstat or
# /etc/passwd cannot be read.
sub _getPortsAndInfos
{
    my $maxListenPort = 50;
    my $fnret = execute('/bin/netstat -tlenp | grep LISTEN | grep -v \'tcp6\' | awk \'{print $4"|"$9}\'');
    if ($fnret->{status} != 100)
    {
        print $fnret->{msg}."\n";
        return { status => 500, msg => "netstat error: ".$fnret->{msg}."\n" };
    }
    my $netstatTable = $fnret->{value};

    my $fh;
    if (!open($fh, '<', '/etc/passwd'))
    {
        print "Could not open /etc/passwd";
        return {status=>500};
    }
    # uid => user name, from the "name:password:uid:..." lines.
    my %passwdHash;
    while (my $passwdLine = <$fh>)
    {
        next unless $passwdLine =~ /^([^:]+):[^:]*:(\d+):/;
        $passwdHash{$2} = $1;
    }
    close($fh);

    my $count = 0;
    foreach my $line (@{$netstatTable})
    {
        my ($socketInfo, $procInfo) = split(/\|/, $line);
        next unless defined $socketInfo and defined $procInfo;
        next unless $socketInfo =~ /^(.+):(\d+)$/;
        my ($ip, $port) = ($1, $2);
        $ip =~ s/\./-/g;
        $ip =~ s/[^0-9\-]//g;
        $ip = 0 if $ip eq "";

        my ($pid, $procName) = split(/\//, $procInfo);
        $pid = '' unless defined $pid;
        $procName = '' unless defined $procName;

        if ($pid !~ /^\d+$/ or !open($fh, '<', "/proc/$pid/cmdline"))
        {
            print "Could not open /proc/$pid/cmdline";
            next;
        }
        my $cmdline = <$fh>;
        close($fh);
        $cmdline = '' unless defined $cmdline;    # empty for kernel threads
        chomp($cmdline);
        $cmdline =~ s/\x{0}/ /g;
        ($cmdline) = split ' ', $cmdline;
        $cmdline = '' unless defined $cmdline;

        if (!open($fh, '<', "/proc/$pid/status"))
        {
            print "Could not open /proc/$pid/status";
            next;
        }
        my $uid;
        while (my $statusLine = <$fh>)
        {
            if ($statusLine =~ /Uid:\s(\d+)/)
            {
                $uid = $1;
                last;
            }
        }
        close($fh);
        next unless defined $uid;

        my $username = defined $passwdHash{$uid} ? $passwdHash{$uid} : '';
        my $exe = readlink("/proc/$pid/exe");
        $exe = '' unless defined $exe;    # link not readable

        my $prefix = 'rtm.info.tcp.listen.ip-' . $ip . '.port-' . $port;
        $server{$prefix . '.pid'} = $pid;
        $server{$prefix . '.procname'} = $procName;
        $server{$prefix . '.cmdline'} = $cmdline;
        $server{$prefix . '.exe'} = $exe;
        $server{$prefix . '.username'} = $username;
        $server{$prefix . '.uid'} = $uid;
        $count++;
        last if $count >= $maxListenPort;
    }
    return {status=>100};
}

# Read the integer part of the first field of /proc/uptime.
#
# Returns { status => 100, value => <integer> } or { status => 500, msg => ... }.
sub uptime
{
    my $fh;
    if (!open($fh, '<', "/proc/uptime"))
    {
        print "Cannot open /proc/uptime";
        return {status => 500, msg => "Cannot open /proc/uptime" };
    }
    my $uptime = <$fh>;
    close($fh);
    if (defined $uptime and $uptime =~ /^(\d+)/)
    {
        return {status=>100, value => $1};
    }
    print "Cannot parse /proc/uptime";
    return {status => 500, msg => "Cannot parse /proc/uptime" };
}

# Escape a string so that it can be placed between double quotes in JSON.
sub _json_escape
{
    my ($text) = @_;
    $text =~ s/(["\\])/\\$1/g;
    $text =~ s/([\x00-\x1f])/sprintf('\\u%04x', ord($1))/ge;
    return $text;
}

# hash_walk callback: print one metric as a JSON object on its own line.
#
# Parameters: $k        - last key of the path (unused, the path is in $key_list)
#             $v        - value; surrounding whitespace is removed
#             $key_list - array reference of the keys leading to the value,
#                         joined with "." to build the metric name
sub print_keys_and_value {
    my ($k, $v, $key_list) = @_;
    return unless @$key_list;
    $v = '' unless defined $v;
    $v =~ s/^\s+|\s+$//g;
    my $key = join(".", @$key_list);
    print "{\"metric\":\""._json_escape($key)."\",\"timestamp\":".time.",\"value\":\""._json_escape($v)."\"}\n";
    return;
}

# Walk a (possibly nested) hash in sorted key order and call
# $callback->($key, $value, \@path) for every non-hash value.
sub hash_walk {
    my ($hash, $key_list, $callback) = @_;
    foreach my $raw_key (sort keys %$hash)
    {
        my $value = $hash->{$raw_key};
        (my $key = $raw_key) =~ s/^\s+|\s+$//g;
        push @$key_list, $key;
        if (ref($value) eq 'HASH')
        {
            hash_walk($value, $key_list, $callback);
        }
        else
        {
            $callback->($key, $value, $key_list);
        }
        pop @$key_list;
    }
    return;
}

# Tell whether a result hash is a success.
#
# Returns 1 when $arg is a hash reference whose status is 100, 0 otherwise.
# As a side effect, the msg of a status-500 result is printed on STDOUT.
sub ok
{
    my $arg = shift;
    if (ref $arg eq 'HASH' and $arg->{status} eq 100)
    {
        return 1;
    }
    elsif (ref $arg eq 'HASH' and $arg->{status} eq 500 and defined($arg->{msg}))
    {
        print $arg->{msg};
    }
    return 0;
}

# Run a command and capture its output (STDOUT and STDERR are read from the
# same handle).
#
# Parameters: $bin  - executable, or a whole command line; callers in this
#                     file pass shell pipelines as a single string
#             @args - optional arguments
# Returns a hash reference:
#   { status => 100, value => \@lines }   on success
#   { status => 201, msg => ... }         when $bin is undefined
#   { status => 500, msg => ..., value => $raw_output }
#                                         when the command could not be
#                                         started, was killed by a signal, or
#                                         exited with a non-zero code
# A non-zero exit code is tolerated for commands starting with /bin/ps.
sub execute
{
    my ($bin, @args) = @_;
    defined($bin) or return { status => 201, msg => 'No binary specified (execute)' };

    my ($in, $out);
    # open3() raises an exception instead of returning false when the child
    # cannot be started, so the failure has to be trapped here.
    my $pid = eval { IPC::Open3::open3($in, $out, $out, $bin, @args) };
    if (!$pid)
    {
        my $reason = $@ || $!;
        return { status => 500, msg => 'Failed to fork : '.$reason };
    }

    my $stdout = do { local $/; <$out> };
    $stdout = '' unless defined $stdout;    # the command printed nothing

    my $ret         = waitpid($pid, 0);
    my $wait_status = $?;
    my $status      = ($wait_status >> 8);

    close($in);
    close($out);
    my @stdout = split(/\n/, $stdout);

    # $bin holds the full command line, so the ps exemption has to look at
    # its first word rather than compare the whole string.
    my $is_ps = ($bin =~ m{\A/bin/ps(?:\s|\z)}) ? 1 : 0;

    if ($ret != $pid)
    {
        return { status => 500, msg => 'Invalid fork return (waitpid)', value => $stdout };
    }
    elsif ($status != 0 and !$is_ps)
    {
        return { status => 500, msg => 'Binary '.$bin.' exited on a non-zero status ('.$status.')', value => $stdout };
    }
    elsif ($status == 0 and $wait_status != 0)
    {
        # Exit code 0 in the high byte but a non-zero wait status: the child
        # was terminated by a signal.
        return { status => 500, msg => 'Binary '.$bin.' was killed by signal '.($wait_status & 127), value => $stdout };
    }
    return { status => 100, value => \@stdout };
}

# Everything after __END__ is not part of this script: it is the text that
# follows the script on the same physical line of the listing, kept verbatim.
__END__
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OVH RTM 2 | (OVH Real Time Monitoring probes) 3 | 4 | This repository contain OVH RTM probes and packaging script. 5 | It depends on the default implementation of the **ovh-rtm-metrics-toolkit package** and the additional tools **noderig** and **beamium**. 6 | 7 | Real Time Monitoring is composed of 2 packages: 8 | 9 | - **ovh-metrics-toolkit**: configure [beamium](https://github.com/ovh/beamium) and [noderig](https://github.com/ovh/noderig) to push monitoring metrics and probes result's to OVH monitoring platform.
10 | 11 | - **ovh-rtm-binaries**: copies OVH monitoring probes into /usr/bin/rtm\* 12 | 13 | For details about Noderig see the [main repository](https://github.com/ovh/noderig) 14 | 15 | For details about Beamium see the [main repository](https://github.com/ovh/beamium) 16 | 17 | 18 | By installing the ovh-rtm-metrics-toolkit package you will have a metrics-based monitoring solution for your server. 19 | (only working for baremetal and public cloud). 20 | (public cloud users can only use [Insight](https://docs.ovh.com/fr/metrics/metrics-insight/) to retrieve their metrics) 21 | 22 | 1. Displayed on OVHCloud web manager: 23 | 24 | ![](images/rtm_manager1.png) 25 | 26 | 27 | ![](images/rtm_manager2.png) 28 | 29 | 30 | ![](images/rtm_manager3.png) 31 | 32 | 1. on API: 33 | 34 | ![](images/rtm_api.png) 35 | 36 | 1. But you can also create your own Grafana monitoring dashboard to display metrics values in real time: 37 | (recommended method) 38 | 39 | ![](images/rtm_grafana.png) 40 | 41 | How to proceed: [RTM on Grafana](rtm_grafana.md) 42 | 43 | # How to install OVH RTM packages: 44 | Please refer to OVH docs: 45 | https://docs.ovh.com/gb/en/dedicated/install-rtm/ 46 | 47 | # Releases: 48 | http://last.public.ovh.rtm.snap.mirrors.ovh.net/ 49 | 50 | # Status 51 | OVH datacenters are composed of many types of servers, each running different OSes with different components. 52 | This monitoring solution tries to be compatible with most of them and is thus still under development. 53 | 54 | Feel free to comment or contribute! 55 | 56 | 57 | 58 | # What is collected? 59 | 60 | ## RTM metrics 61 | 62 | RTM collects real time monitoring data (based on noderig default collectors) on CPU, LOAD, RAM, DISK, NET. 63 | 64 | ## RTM probes 65 | 66 | RTM probes are Perl scripts. Results are mainly available on the OVH API. 67 | Located in /usr/bin/rtm\*, they are launched at different intervals.
68 | The interval depends on which [noderig external collectors](https://github.com/ovh/noderig#collectors-1) folder they are linked in. 69 | You can see the links located in the noderig external collector folders: 70 | ``` 71 | /opt/noderig/ 72 | ├── 3600 73 | │ ├── rtmHourly -> /usr/bin/rtmHourly 74 | │ └── rtmRaidCheck -> /usr/bin/rtmRaidCheck 75 | └── 43200 76 | └── rtmHardware -> /usr/bin/rtmHardware 77 | ``` 78 | Those scripts expose monitoring results in Prometheus format (https://github.com/prometheus/docs/blob/master/content/docs/instrumenting/exposition_formats.md). 79 | 80 | ### rtmHourly 81 | 82 | This probe is executed every 3600s (1 hour). It collects information such as the uptime, load average, memory usage, current rtm version installed, the top processes, open ports and the number of ongoing processes. 83 | 84 | #### data example 85 | ``` 86 | {"metric":"rtm.info.rtm.version","timestamp":1582207693,"value":"1.0.11"} 87 | {"metric":"rtm.info.uptime","timestamp":1582207693,"value":"11400912"} 88 | {"metric":"rtm.hostname","timestamp":1582207693,"value":"ns10000"} 89 | 90 | {"metric":"os.load.processesactive","timestamp":1582207693,"value":"1"} 91 | {"metric":"os.load.processesup","timestamp":1582207693,"value":"650"} 92 | 93 | {"metric":"rtm.info.mem.top_mem_1_name","timestamp":1582207693,"value":"/usr/bin/syncthing"} 94 | {"metric":"rtm.info.mem.top_mem_1_size","timestamp":1582207693,"value":"4833764"} 95 | {"metric":"rtm.info.mem.top_mem_2_name","timestamp":1582207693,"value":"/usr/bin/smbd"} 96 | {"metric":"rtm.info.mem.top_mem_2_size","timestamp":1582207693,"value":"1871772"} 97 | {"metric":"rtm.info.mem.top_mem_3_name","timestamp":1582207693,"value":"/usr/bin/smbd"} 98 | {"metric":"rtm.info.mem.top_mem_3_size","timestamp":1582207693,"value":"1510164"} 99 | {"metric":"rtm.info.mem.top_mem_4_name","timestamp":1582207693,"value":"/usr/sbin/named"} 100 | {"metric":"rtm.info.mem.top_mem_4_size","timestamp":1582207693,"value":"1025712"} 101 |
{"metric":"rtm.info.mem.top_mem_5_name","timestamp":1582207693,"value":"/usr/sbin/rsyslogd"} 102 | {"metric":"rtm.info.mem.top_mem_5_size","timestamp":1582207693,"value":"361880"} 103 | 104 | {"metric":"rtm.info.tcp.listen.ip-0-0-0-0.port-79.uid","timestamp":1582207693,"value":"111"} 105 | {"metric":"rtm.info.tcp.listen.ip-0-0-0-0.port-79.pid","timestamp":1582207693,"value":"4851"} 106 | {"metric":"rtm.info.tcp.listen.ip-0-0-0-0.port-79.username","timestamp":1582207693,"value":"oco"} 107 | {"metric":"rtm.info.tcp.listen.ip-0-0-0-0.port-79.exe","timestamp":1582207693,"value":"/usr/bin/perl"} 108 | {"metric":"rtm.info.tcp.listen.ip-0-0-0-0.port-79.cmdline","timestamp":1582207693,"value":"perl"} 109 | {"metric":"rtm.info.tcp.listen.ip-0-0-0-0.port-79.procname","timestamp":1582207693,"value":"perl"} 110 | ``` 111 | 112 | ### rtmHardware 113 | 114 | This probe is executed each 43200s (12h). It collects information on the hardware such as the motherboard, PCI devices, disk health (S.M.A.R.T data), etc. 115 | Also collects some information on the software, such as the kernel and BIOS version. 
116 | 117 | ### data example 118 | ``` 119 | {"metric":"rtm.hw.mb.manufacture","timestamp":1582208062,"value":"Supermicro"} 120 | {"metric":"rtm.hw.mb.name","timestamp":1582208062,"value":"X10SRi-F"} 121 | {"metric":"rtm.hw.mb.serial","timestamp":1582208062,"value":"NM175S506822"} 122 | 123 | {"metric":"rtm.info.bios_date","timestamp":1582208062,"value":"12/17/2015"} 124 | {"metric":"rtm.info.bios_version","timestamp":1582208062,"value":"2.0"} 125 | {"metric":"rtm.info.bios_vendor","timestamp":1582208062,"value":"American Megatrends Inc."} 126 | 127 | {"metric":"rtm.info.release.os","timestamp":1582208062,"value":"Ubuntu 16.04 xenial"} 128 | {"metric":"rtm.info.kernel.version","timestamp":1582208062,"value":"#193-Ubuntu SMP Tue Sep 17 17:42:52 UTC 2019"} 129 | {"metric":"rtm.info.kernel.release","timestamp":1582208062,"value":"4.4.0-165-generic"} 130 | 131 | {"metric":"rtm.hw.cpu.name","timestamp":1582208062,"value":"Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz"} 132 | {"metric":"rtm.hw.cpu.number","timestamp":1582208062,"value":"12"} 133 | {"metric":"rtm.hw.cpu.cache","timestamp":1582208062,"value":"15360 KB"} 134 | {"metric":"rtm.hw.cpu.mhz","timestamp":1582208062,"value":"1212.750"} 135 | 136 | {"metric":"rtm.info.check.vm","timestamp":1582208062,"value":"False"} 137 | {"metric":"rtm.info.check.oops","timestamp":1582208062,"value":"False"} 138 | 139 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel0-Dimm1-DIMMA2","timestamp":1582208062,"value":"No Module Installed"} 140 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel1-Dimm1-DIMMB2","timestamp":1582208062,"value":"No Module Installed"} 141 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel2-Dimm0-DIMMC1","timestamp":1582208062,"value":"16384"} 142 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel1-Dimm0-DIMMB1","timestamp":1582208062,"value":"16384"} 143 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel2-Dimm1-DIMMC2","timestamp":1582208062,"value":"No Module Installed"} 144 | 
{"metric":"rtm.hw.mem.bank-P0-Node0-Channel3-Dimm1-DIMMD2","timestamp":1582208062,"value":"No Module Installed"} 145 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel3-Dimm0-DIMMD1","timestamp":1582208062,"value":"16384"} 146 | {"metric":"rtm.hw.mem.bank-P0-Node0-Channel0-Dimm0-DIMMA1","timestamp":1582208062,"value":"16384"} 147 | 148 | {"metric":"rtm.hw.lspci.pci.ff-15-2","timestamp":1582208062,"value":"8086:6fb6"} 149 | .... 150 | 151 | {"metric":"rtm.info.hdd.sda.capacity","timestamp":1582208062,"value":"4.00 TB"} 152 | {"metric":"rtm.info.hdd.sda.link_type","timestamp":1582208062,"value":"sata"} 153 | {"metric":"rtm.info.hdd.sda.firmware","timestamp":1582208062,"value":"A5GNT920"} 154 | {"metric":"rtm.info.hdd.sda.dmesg.io.errors","timestamp":1582208062,"value":"0"} 155 | {"metric":"rtm.info.hdd.sda.disk_type","timestamp":1582208062,"value":"hdd"} 156 | {"metric":"rtm.info.hdd.sda.iostat.busy","timestamp":1582208062,"value":"3.34"} 157 | {"metric":"rtm.info.hdd.sda.model","timestamp":1582208062,"value":"HGST HUS726040ALA610"} 158 | {"metric":"rtm.info.hdd.sda.serial","timestamp":1582208062,"value":"K3GDA42B"} 159 | {"metric":"rtm.info.hdd.sda.temperature","timestamp":1582208062,"value":"37"} 160 | 161 | {"metric":"rtm.info.hdd.sda.iostat.read.per.sec","timestamp":1582208062,"value":"5.11"} 162 | {"metric":"rtm.info.hdd.sda.iostat.writekb.per.sec","timestamp":1582208062,"value":"108.74"} 163 | {"metric":"rtm.info.hdd.sda.iostat.write.merged.per.sec","timestamp":1582208062,"value":"1.38"} 164 | {"metric":"rtm.info.hdd.sda.iostat.write.avg.wait","timestamp":1582208062,"value":"5.39"} 165 | {"metric":"rtm.info.hdd.sda.iostat.read.avg.wait","timestamp":1582208062,"value":"9.25"} 166 | {"metric":"rtm.info.hdd.sda.iostat.read.merged.per.sec","timestamp":1582208062,"value":"0.01"} 167 | {"metric":"rtm.info.hdd.sda.iostat.readkb.per.sec","timestamp":1582208062,"value":"616.81"} 168 | {"metric":"rtm.info.hdd.sda.iostat.write.per.sec","timestamp":1582208062,"value":"3.19"} 
169 | 170 | {"metric":"rtm.info.hdd.sda.smart.highest-temperature","timestamp":1582208062,"value":"47"} 171 | {"metric":"rtm.info.hdd.sda.smart.bytes-read","timestamp":1582208062,"value":"21596906627584"} 172 | {"metric":"rtm.info.hdd.sda.smart.udma-crc-error","timestamp":1582208062,"value":"0"} 173 | {"metric":"rtm.info.hdd.sda.smart.link-failures","timestamp":1582208062,"value":"0"} 174 | {"metric":"rtm.info.hdd.sda.smart.temperature","timestamp":1582208062,"value":"37"} 175 | {"metric":"rtm.info.hdd.sda.smart.offline-uncorrectable","timestamp":1582208062,"value":"0"} 176 | {"metric":"rtm.info.hdd.sda.smart.percentage-used","timestamp":1582208062,"value":"0"} 177 | {"metric":"rtm.info.hdd.sda.smart.realocated-event-count","timestamp":1582208062,"value":"0"} 178 | {"metric":"rtm.info.hdd.sda.smart.reallocated-sector-count","timestamp":1582208062,"value":"0"} 179 | {"metric":"rtm.info.hdd.sda.smart.reported-corrected","timestamp":1582208062,"value":"-1"} 180 | {"metric":"rtm.info.hdd.sda.smart.power-cycles","timestamp":1582208062,"value":"23"} 181 | {"metric":"rtm.info.hdd.sda.smart.power-on-hours","timestamp":1582208062,"value":"10521"} 182 | {"metric":"rtm.info.hdd.sda.smart.global-health","timestamp":1582208062,"value":"1"} 183 | {"metric":"rtm.info.hdd.sda.smart.logged-error-count","timestamp":1582208062,"value":"0"} 184 | {"metric":"rtm.info.hdd.sda.smart.current-pending-sector","timestamp":1582208062,"value":"0"} 185 | {"metric":"rtm.info.hdd.sda.smart.reported-uncorrect","timestamp":1582208062,"value":"0"} 186 | {"metric":"rtm.info.hdd.sda.smart.lowest-temperature","timestamp":1582208062,"value":"19"} 187 | {"metric":"rtm.info.hdd.sda.smart.bytes-written","timestamp":1582208062,"value":"10071769343488"} 188 | {"metric":"rtm.info.hdd.sda.smart.time","timestamp":1582208062,"value":"0"} 189 | {"metric":"rtm.info.hdd.sda.smart.command-timeout","timestamp":1582208062,"value":"-1"} 190 | ``` 191 | 192 | ### rtmRaidCheck 193 | 194 | This probe is executed each 
3600s (1 hour). It collects information on RAID health (if available). 195 | ### data example 196 | ``` 197 | {"metric":"rtm.hw.scsiraid.unit.md3.vol0.capacity","timestamp":1582208405,"value":"24.4 GB"} 198 | {"metric":"rtm.hw.scsiraid.unit.md3.vol0.phys","timestamp":1582208405,"value":"3"} 199 | {"metric":"rtm.hw.scsiraid.unit.md3.vol0.type","timestamp":1582208405,"value":"raid1"} 200 | {"metric":"rtm.hw.scsiraid.unit.md3.vol0.status","timestamp":1582208405,"value":"active"} 201 | {"metric":"rtm.hw.scsiraid.unit.md3.vol0.flags","timestamp":1582208405,"value":"clean"} 202 | 203 | {"metric":"rtm.hw.scsiraid.port.md3.vol0.sda3.capacity","timestamp":1582208405,"value":"24.4 GB"} 204 | {"metric":"rtm.hw.scsiraid.port.md3.vol0.sda3.status","timestamp":1582208405,"value":"active"} 205 | {"metric":"rtm.hw.scsiraid.port.md3.vol0.sda3.flags","timestamp":1582208405,"value":"sync"} 206 | {"metric":"rtm.hw.scsiraid.port.md3.vol0.sdb3.capacity","timestamp":1582208405,"value":"24.4 GB"} 207 | {"metric":"rtm.hw.scsiraid.port.md3.vol0.sdb3.status","timestamp":1582208405,"value":"active"} 208 | {"metric":"rtm.hw.scsiraid.port.md3.vol0.sdb3.flags","timestamp":1582208405,"value":"sync"} 209 | ``` 210 | 211 | 212 | ### nvidia_smi_stats 213 | 214 | Only available with NVIDIA cards. 215 | Collects information and metrics on NVIDIA hardware. 216 | 217 | \*requires the nvidia-smi driver and application to be installed 218 | 219 | # Contributing 220 | Instructions on how to contribute to OVH RTM are available on the Contributing page. 221 | -------------------------------------------------------------------------------- /deb/scripts/rtmRaidCheck.pl: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/perl 2 | $ENV{"LC_ALL"} = "POSIX"; 3 | use strict; 4 | use warnings; 5 | use IPC::Open3; 6 | use IO::Select; 7 | 8 | my ($MDADM, $MPTSTATUS, $LSIUTIL, $LSPCI); 9 | 10 | rtmRaidCheck(); 11 | 12 | sub rtmRaidCheck 13 | { 14 | #init 15 | my $fnret = execute('which mdadm 2>/dev/null'); 16 | if ( $fnret->{status} != 100 ) 17 | { 18 | print "mdadm not found \n"; 19 | } 20 | elsif (defined($fnret->{value}[0])) 21 | { 22 | $MDADM = $fnret->{value}[0]; 23 | } 24 | $fnret = execute('which mpt-status 2>/dev/null'); 25 | if ( $fnret->{status} != 100 ) 26 | { 27 | print "mpt-status not found \n"; 28 | } 29 | elsif (defined($fnret->{value}[0])) 30 | { 31 | $MPTSTATUS = $fnret->{value}[0]; 32 | } 33 | $fnret = execute('which lsiutil 2>/dev/null'); 34 | if ( $fnret->{status} != 100 ) 35 | { 36 | print "lsiutil not found \n"; 37 | } 38 | elsif (defined($fnret->{value}[0])) 39 | { 40 | $LSIUTIL = $fnret->{value}[0]; 41 | } 42 | $fnret = execute('which lspci 2>/dev/null'); 43 | if ( $fnret->{status} != 100 ) 44 | { 45 | print "lspci not found \n"; 46 | } 47 | elsif (defined($fnret->{value}[0])) 48 | { 49 | $LSPCI = $fnret->{value}[0]; 50 | } 51 | if ($LSPCI) 52 | { 53 | $fnret = execute("$LSPCI -d 1000:"); 54 | if ( $fnret->{status} != 100 ) 55 | { 56 | print ("error executing: $LSPCI -d 1000: \n"); 57 | } 58 | elsif (defined($fnret->{value}[0])) 59 | { 60 | if ($MPTSTATUS) 61 | { 62 | my $SCSI_ID=""; 63 | $fnret = execute("$MPTSTATUS -p 2>/dev/null | grep 'Found SCSI' | cut -f1 -d, | cut -f2 -d="); 64 | if ( $fnret->{status} != 100 ) 65 | { 66 | print "SCSI not found \n"; 67 | } 68 | elsif(defined($fnret->{value}[0])) 69 | { 70 | $SCSI_ID=$fnret->{value}[0]; 71 | } 72 | if ($SCSI_ID eq "") 73 | { 74 | $fnret = execute("cat /proc/scsi/scsi 2>/dev/null | grep Host | tail -n 1 | cut -d ' ' -f6"); 75 | if ( $fnret->{status} != 100 ) 76 | { 77 | undef $SCSI_ID; 78 | } 79 | elsif(defined($fnret->{value}[0])) 80 | { 81 | $SCSI_ID=$fnret->{value}[0]; 82 | } 83 | } 84 | chomp 
$SCSI_ID; 85 | if ($SCSI_ID ne "") 86 | { 87 | $MPTSTATUS = "$MPTSTATUS -i $SCSI_ID"; 88 | $fnret = execute("$MPTSTATUS -i $SCSI_ID"); 89 | if ( $fnret->{status} != 100 ) 90 | { 91 | print "SCSI not found \n"; 92 | undef $MPTSTATUS; 93 | } 94 | elsif (defined($fnret->{value}[0])) 95 | { 96 | $MPTSTATUS = $fnret->{value}[0]; 97 | } 98 | } 99 | } 100 | } 101 | else 102 | { 103 | # nothing returned, 104 | print "Nothing returned with lspci -d 1000: \n"; 105 | } 106 | } 107 | my ($line, @mptInfo, @twCliInfo, $controler); 108 | 109 | #SOFT RAID 110 | my $mdstat; 111 | if ( defined($MDADM) and $MDADM ne "" && -e "/proc/mdstat" && `cat /proc/mdstat | grep md` ne "") 112 | { 113 | $fnret = execute('cat /proc/mdstat | grep md'); 114 | if ( $fnret->{status} != 100 ) 115 | { 116 | print "no md device found in /proc/mdstat \n"; 117 | } 118 | else 119 | { 120 | open(FILE, "/proc/mdstat"); 121 | my $matrix; 122 | foreach $line () 123 | { 124 | if ( $line =~ /(md\d+)\s+:\s+([^\s]+)\s+([^\s]+)/ ) 125 | { 126 | $matrix = $1; 127 | $mdstat->{$matrix}{status} = $2; 128 | $mdstat->{$matrix}{type} = $3; 129 | } 130 | if ( $line =~ /\s+(\d+)/ ) 131 | { 132 | $mdstat->{$matrix}{capacity} = $1; 133 | } 134 | } 135 | close(FILE); 136 | foreach $matrix (keys %{$mdstat}) 137 | { 138 | open(IN, "$MDADM -D /dev/$matrix |"); 139 | foreach $line () 140 | { 141 | if ( $line =~ /\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\w+)\s+(\w+)\s+\/dev\/(\w+)/ ) 142 | { 143 | $mdstat->{$matrix}{device}{$1}{state} = $2; 144 | $mdstat->{$matrix}{device}{$1}{flags} = $3; 145 | $mdstat->{$matrix}{device}{$1}{drive} = $4; 146 | } 147 | if ( $line =~ /^\s+State\s+:\s+([^\s]+)/ ) 148 | { 149 | $mdstat->{$matrix}{state} = $1; 150 | } 151 | } 152 | close(IN); 153 | 154 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$matrix.".vol0.capacity\",\"timestamp\":".time.",\"value\":\"".sprintf("%.1f", $mdstat->{$matrix}{capacity}/1024/1024)." 
GB\"}\n"; 155 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$matrix.".vol0.phys\",\"timestamp\":".time.",\"value\":\"".(keys %{$mdstat})."\"}\n"; 156 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$matrix.".vol0.type\",\"timestamp\":".time.",\"value\":\"".$mdstat->{$matrix}{type}."\"}\n"; 157 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$matrix.".vol0.status\",\"timestamp\":".time.",\"value\":\"".$mdstat->{$matrix}{status}."\"}\n"; 158 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$matrix.".vol0.flags\",\"timestamp\":".time.",\"value\":\"".$mdstat->{$matrix}{state}."\"}\n"; 159 | 160 | open(FILE, "/proc/partitions"); 161 | my @file = ; 162 | close(FILE); 163 | foreach my $device (keys %{$mdstat->{$matrix}{device}}) 164 | { 165 | foreach $line (@file) 166 | { 167 | if ( $line =~ /\s+\d+\s+\d+\s+(\d+)\s+$mdstat->{$matrix}{device}{$device}{drive}/ ) 168 | { 169 | $mdstat->{$matrix}{device}{$device}{capacity} = $1; 170 | } 171 | } 172 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$matrix.".vol0.".$mdstat->{$matrix}{device}{$device}{drive}.".capacity\",\"timestamp\":".time.",\"value\":\"".sprintf("%.1f", $mdstat->{$matrix}{device}{$device}{capacity}/1024/1024)." 
GB\"}\n"; 173 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$matrix.".vol0.".$mdstat->{$matrix}{device}{$device}{drive}.".status\",\"timestamp\":".time.",\"value\":\"".$mdstat->{$matrix}{device}{$device}{state}."\"}\n"; 174 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$matrix.".vol0.".$mdstat->{$matrix}{device}{$device}{drive}.".flags\",\"timestamp\":".time.",\"value\":\"".$mdstat->{$matrix}{device}{$device}{flags}."\"}\n"; 175 | } 176 | } 177 | } 178 | } 179 | 180 | #SCSI-RAID 181 | if ($MPTSTATUS and not $LSIUTIL) 182 | { 183 | my %mptStat; 184 | @mptInfo = `$MPTSTATUS 2>/dev/null`; 185 | $fnret = execute("$MPTSTATUS 2>/dev/null"); 186 | if ($fnret->{status} != '100') 187 | { 188 | print "no MPTSTATUS \n"; 189 | } 190 | else 191 | { 192 | @mptInfo=@{$fnret->{value}}; 193 | foreach $line (@mptInfo) 194 | { 195 | if ( $line =~ m/(^[^\s]+)\s+([^\s]+)\s+(\d+)\s+type\s+([^,]+),\s+(\d+)\s+phy,\s+(\d+)\s+GB,\s+flags\s+([^,]+),\s+state\s+(.+)/) 196 | { 197 | $mptStat{cntrl} = $1; 198 | $mptStat{vol} = "$2$3"; 199 | $mptStat{cap} = $6; 200 | $mptStat{phys} = $5; 201 | $mptStat{type} = $4; 202 | $mptStat{flags} = $7; 203 | $mptStat{status}= $8; 204 | $mptStat{vol} =~ s/\_/-/g; 205 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".capacity\",\"timestamp\":".time.",\"value\":\"".$mptStat{cap}." 
GB\"}\n"; 206 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".phys\",\"timestamp\":".time.",\"value\":\"".$mptStat{phys}."\"}\n"; 207 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".type\",\"timestamp\":".time.",\"value\":\"".$mptStat{type}."\"}\n"; 208 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".status\",\"timestamp\":".time.",\"value\":\"".$mptStat{status}."\"}\n"; 209 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".flags\",\"timestamp\":".time.",\"value\":\"".$mptStat{flags}."\"}\n"; 210 | next; 211 | } 212 | elsif ( $line =~ m/(^[^\s]+)\s+([^\s]+)\s+(\d+)\s+type\s+([^,]+),\s+(\d+)\s+phy,\s+(\d+)\s+GB,\s+state\s+(.+),\s+flags\s+([^,]+)\n/) 213 | { 214 | $mptStat{cntrl} = $1; 215 | $mptStat{vol} = "$2$3"; 216 | $mptStat{cap} = $6; 217 | $mptStat{phys} = $5; 218 | $mptStat{type} = $4; 219 | $mptStat{status}= $7; 220 | $mptStat{flags} = $8; 221 | $mptStat{vol} =~ s/\_/-/g; 222 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".capacity\",\"timestamp\":".time.",\"value\":\"".$mptStat{cap}." 
GB\"}\n"; 223 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".phys\",\"timestamp\":".time.",\"value\":\"".$mptStat{phys}."\"}\n"; 224 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".type\",\"timestamp\":".time.",\"value\":\"".$mptStat{type}."\"}\n"; 225 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".status\",\"timestamp\":".time.",\"value\":\"".$mptStat{status}."\"}\n"; 226 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$mptStat{cntrl}.".".$mptStat{vol}.".flags\",\"timestamp\":".time.",\"value\":\"".$mptStat{flags}."\"}\n"; 227 | next; 228 | } 229 | if ( $line =~ m/(^[^\s]+)\s+([^\s]+)\s+(\d+)\s+scsi_id\s+\d+\s+([^\s]+)\s+([^\s]+)[^,]+,\s+(\d+)[^,]+,\s+state\s+(.+), flags\s+(.+)/ ) 230 | { 231 | next if $6 == 0; # ignore fake raid entries 232 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".capacity\",\"timestamp\":".time.",\"value\":\"".$6." GB\"}\n"; 233 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".model\",\"timestamp\":".time.",\"value\":\"".$4." ".$5."\"}\n"; 234 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".status\",\"timestamp\":".time.",\"value\":\"".$7."\"}\n"; 235 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".flags\",\"timestamp\":".time.",\"value\":\"".$8."\"}\n"; 236 | } 237 | elsif ( $line =~ m/(^[^\s]+)\s+([^\s]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)[^,]+,\s+(\d+)[^,]+,\s+state\s+(.+)/ ) 238 | { 239 | next if $6 == 0; 240 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".capacity\",\"timestamp\":".time.",\"value\":\"".$6." GB\"}\n"; 241 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".model\",\"timestamp\":".time.",\"value\":\"".$4." 
".$5."\"}\n"; 242 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".status\",\"timestamp\":".time.",\"value\":\"".$7."\"}\n"; 243 | } 244 | elsif ( $line =~ m/(^[^\s]+)\s+([^\s]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)[^,]+,\s+(\d+)[^,]+,\s+state\s+(.+), flags\s+(.+)/ ) 245 | { 246 | next if $6 == 0; 247 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".capacity\",\"timestamp\":".time.",\"value\":\"".$6." GB\"}\n"; 248 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".model\",\"timestamp\":".time.",\"value\":\"".$4." ".$5."\"}\n"; 249 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".status\",\"timestamp\":".time.",\"value\":\"".$7."\"}\n"; 250 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".flags\",\"timestamp\":".time.",\"value\":\"".$8."\"}\n"; 251 | } 252 | elsif ( $line =~ m/(^[^\s]+)\s+([^\s]+)\s+(\d+)\s+([^\s]+)\s+([^\s]+)[^,]+,\s+(\d+)[^,]+,\s+flags\s+([^,]+),\s+state\s+(.+)/ ) 253 | { 254 | next if $6 == 0; 255 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".capacity\",\"timestamp\":".time.",\"value\":\"".$6." GB\"}\n"; 256 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".model\",\"timestamp\":".time.",\"value\":\"".$4." 
".$5."\"}\n"; 257 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".status\",\"timestamp\":".time.",\"value\":\"".$8."\"}\n"; 258 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$1.".".$mptStat{vol}.".".$2.$3.".flags\",\"timestamp\":".time.",\"value\":\"".$7."\"}\n"; 259 | } 260 | } 261 | } 262 | } 263 | 264 | # LSI: 265 | if($LSIUTIL) 266 | { 267 | my %data; 268 | my @devs; 269 | if (-e '/proc/mpt/summary') 270 | { 271 | $fnret = execute('cat /proc/mpt/summary'); 272 | if($fnret->{status} != '100') 273 | { 274 | print"can't cat /proc/mpt/summary \n"; 275 | } 276 | else 277 | { 278 | chomp(@devs=@{$fnret->{value}}); 279 | } 280 | # foreach ioc device: 281 | foreach (@devs) 282 | { 283 | m/^(.*?):.*Ports=(\d+),/; 284 | my ($unit, $ports) = ($1, $2); 285 | 286 | # foreach ports: 287 | for (my $port=1; $port <= $ports; $port++) 288 | { 289 | chomp(my @diskDetails = `$LSIUTIL -p$port -a 21,2,0,0,0`); 290 | chomp(my @LSIRES = `$LSIUTIL -p$port -a 21,1,0,0,0`); 291 | 292 | # each line: 293 | my ($vol, $bus, $target, $type); 294 | $vol = -1; 295 | foreach my $line (@LSIRES) 296 | { 297 | #Volume 0 is Bus 0 Target 2, Type IM (Integrated Mirroring) 298 | if($line =~ /^Volume (\d+) is Bus (\d+) Target (\d+), Type (\w+) /) 299 | { 300 | ($vol, $bus, $target, $type) = ($1, $2, $3, $4); 301 | $data{$unit}{$port}{$vol}{$bus}{$target}{type} = $type; 302 | # Warning: $target is a scsi id (vol_id$target) in RTM 303 | } 304 | # skip all till Volume.. 
line 305 | next if $vol == -1; 306 | 307 | # Volume State: optimal, enabled 308 | if($line =~ /Volume State: (.+?), (.*)$/) 309 | { 310 | $data{$unit}{$port}{$vol}{$bus}{$target}{status} = uc $1; 311 | $data{$unit}{$port}{$vol}{$bus}{$target}{flags} = uc $2; 312 | if($2 =~ /resync in progress/i) 313 | { 314 | chomp(my @lsitmp = `$LSIUTIL -p$port -a 21,3,0,0,0`); 315 | my $checkNextLine = 0; 316 | foreach(@lsitmp) 317 | { 318 | # Resync Progress: total blocks 4394526720, blocks remaining 3298477568, 75% 319 | if($checkNextLine and /^\s*Resync Progress:.*?,\s*(\d+)%\s*/) 320 | { 321 | $data{$unit}{$port}{$vol}{$bus}{$target}{syncprogress} = $1; 322 | } 323 | next unless /Volume $vol State:/i; 324 | $checkNextLine = 1; 325 | } 326 | } 327 | } 328 | 329 | #Volume Size 417708 MB, Stripe Size 64 KB, 6 Members 330 | if($line =~ /Volume Size (\d+ \w+), Stripe Size (\d+ \w+), (\d+) Members/) 331 | { 332 | $data{$unit}{$port}{$vol}{$bus}{$target}{capacity} = $1; 333 | $data{$unit}{$port}{$vol}{$bus}{$target}{stripe} = $2; # NEW 334 | $data{$unit}{$port}{$vol}{$bus}{$target}{phys} = $3; 335 | } 336 | elsif($line =~ /Volume Size (\d+ \w+), (\d+) Members/) 337 | { 338 | #Volume Size 417708 MB, 2 Members 339 | $data{$unit}{$port}{$vol}{$bus}{$target}{capacity} = $1; 340 | $data{$unit}{$port}{$vol}{$bus}{$target}{phys} = $2; 341 | } 342 | 343 | if($line =~ /is PhysDisk (\d+)/) 344 | { 345 | my %disk; 346 | $disk{nr} = $1; 347 | # now we know which disk is here, so find it: 348 | my $stop = 0; 349 | foreach(@diskDetails) 350 | { 351 | $stop == 1 and next 352 | if(/PhysDisk $disk{nr} is Bus/); 353 | #PhysDisk 0 is Bus 0 Target 3 354 | next unless $stop; 355 | 356 | #PhysDisk State: online 357 | if(/PhysDisk State: (.*)/) 358 | { 359 | $disk{status} = uc $1; 360 | $disk{status} =~ s/^\s+|\s+$//g; 361 | } 362 | 363 | #PhysDisk Size 238475 MB, Inquiry Data: ATA ST3250410AS A 364 | if(/PhysDisk Size (\d+ \w+), Inquiry Data:\s+(.*)/) 365 | { 366 | $disk{capacity} = $1; 367 | 
$disk{model} = $2; 368 | $disk{model} =~ s/\s+/ /g; 369 | $disk{model} =~ s/^\s+|\s+$//g; 370 | $disk{model} =~ s/(\w+ \w+) \w+/$1/g; # delete rev, for backward compatibility 371 | } 372 | 373 | # fix for 2 SSD IM sizes 374 | if( $data{$unit}{$port}{$vol}{$bus}{$target}{type} eq 'IM' 375 | and $data{$unit}{$port}{$vol}{$bus}{$target}{phys} == 2 376 | and $disk{capacity} > $data{$unit}{$port}{$vol}{$bus}{$target}{capacity} * 1.01 ) 377 | { 378 | my $old_model = $disk{model}; 379 | my $d = scan4LsiDisks($port); 380 | my $new_model = $d->{$disk{nr}}{model}; 381 | $new_model =~ s/\s+/ /g; 382 | $new_model =~ s/^\s+|\s+$//g; 383 | $new_model = $d->{$disk{nr}}{vendor} . ' ' . $new_model; 384 | if($old_model ne $new_model) 385 | { 386 | $disk{model} = $new_model; 387 | $disk{capacity} = $data{$unit}{$port}{$vol}{$bus}{$target}{capacity}; # ugly 388 | } 389 | } 390 | 391 | } 392 | push @{$data{$unit}{$port}{$vol}{$bus}{$target}{disks}}, \%disk; 393 | } 394 | } 395 | } 396 | } 397 | #@{$data{$unit}{$port}{$vol}{$bus}{$target}{disks}} 398 | foreach my $unit (keys %data) 399 | { 400 | foreach my $port (keys %{$data{$unit}}) 401 | { 402 | foreach my $vol (keys %{$data{$unit}{$port}}) 403 | { 404 | foreach my $bus (keys %{$data{$unit}{$port}{$vol}}) 405 | { 406 | foreach my $target (keys %{$data{$unit}{$port}{$vol}{$bus}}) 407 | { 408 | foreach my $key (keys %{$data{$unit}{$port}{$vol}{$bus}{$target}}) 409 | { 410 | if($key eq 'capacity') 411 | { 412 | # print "hHW_SCSIRAID_UNIT_$unit\_vol-id$vol\_$key|".changeSizeUnit($data{$unit}{$port}{$vol}{$bus}{$target}{$key})."\n"; 413 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$unit.".vol-id".$vol.".".$key."\",\"timestamp\":".time.",\"value\":\"".changeSizeUnit($data{$unit}{$port}{$vol}{$bus}{$target}{$key})."\"}\n"; 414 | } 415 | elsif($key eq 'disks') 416 | { 417 | foreach my $d (@{$data{$unit}{$port}{$vol}{$bus}{$target}{$key}}) 418 | { 419 | next unless $d->{status}; 420 | print 
"{\"metric\":\"rtm.hw.scsiraid.port.".$unit.".vol-id".$vol.".phy".$d->{nr}.".model\",\"timestamp\":".time.",\"value\":\"".$d->{model}."\"}\n"; 421 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$unit.".vol-id".$vol.".phy".$d->{nr}.".capacity\",\"timestamp\":".time.",\"value\":\"".$d->{capacity}."\"}\n"; 422 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$unit.".vol-id".$vol.".phy".$d->{nr}.".status\",\"timestamp\":".time.",\"value\":\"".$d->{status}."\"}\n"; 423 | # TODO: no idea from where get the disk flags 424 | # print "hHW_SCSIRAID_PORT_$unit\_vol-id$vol\_phy".$d->{nr}."\_flags|".(($d->{flags})?$d->{flags}:"NONE")."\n"; 425 | print "{\"metric\":\"rtm.hw.scsiraid.port.".$unit.".vol-id".$vol.".phy".$d->{nr}.".flags\",\"timestamp\":".time.",\"value\":\"".(($d->{flags})?$d->{flags}:"NONE")."\"}\n"; 426 | } 427 | } 428 | else 429 | { 430 | print "{\"metric\":\"rtm.hw.scsiraid.unit.".$unit.".vol-id".$vol.".".$key."\",\"timestamp\":".time.",\"value\":\"".$data{$unit}{$port}{$vol}{$bus}{$target}{$key}."\"}\n"; 431 | } 432 | } 433 | } 434 | } 435 | } 436 | } 437 | } 438 | } 439 | } 440 | 441 | my @log_files = ('/var/log/dmesg', '/var/log/boot.msg'); 442 | foreach my $log_file (@log_files) 443 | { 444 | if (-e $log_file) 445 | { 446 | if (open(my $fh, '<', $log_file)) 447 | { 448 | while (my $log = <$fh> ) 449 | { 450 | #3Ware 451 | if (( $log =~ m/3w-xxxx: scsi/) || ( $log =~ m/scsi. 
: Found a 3ware/)) 452 | { 453 | my (%units, @controlers); 454 | my $TWCLI = `which tw_cli 2>/dev/null`; 455 | chomp($TWCLI); 456 | if ($TWCLI ne "") 457 | { 458 | @twCliInfo = `$TWCLI info`; 459 | foreach $line (@twCliInfo) 460 | { 461 | if ($line =~ m/Controller (\d+):/ || $line =~ /^c(\d+).*$/) 462 | { 463 | push @controlers, $1; 464 | } 465 | } 466 | foreach $controler (@controlers) 467 | { 468 | @twCliInfo = `$TWCLI info c$controler`; 469 | foreach $line (@twCliInfo) 470 | { 471 | if ( $line =~ m/Unit\s(\d):\s+(RAID\s+\d+|[^\s]+)\s([^\s]+)\s([^\s]+)[^:]+:\s(.+)/) 472 | { 473 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".capacity\",\"timestamp\":".time.",\"value\":\"".$3." ".$4."\"}\n"; 474 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".type\",\"timestamp\":".time.",\"value\":\"".$2."\"}\n"; 475 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".status\",\"timestamp\":".time.",\"value\":\"".$5."\"}\n"; 476 | } 477 | if ( $line =~ m/Port\s(\d+):\s([^\s]+)\s([^\s]+)\s([^\s]+)\s([^\s]+)\s([^\s]+)[^:]+:\s([^\(]+)\(unit\s(\d+)/) 478 | { 479 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$8.".phy".$1."capacity\",\"timestamp\":".time.",\"value\":\"".$5." ".$6."\"}\n"; 480 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$8.".phy".$1."model\",\"timestamp\":".time.",\"value\":\"".$2." ".$3."\"}\n"; 481 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$8.".phy".$1."status\",\"timestamp\":".time.",\"value\":\"".$7."\"}\n"; 482 | 483 | if (! exists $units{$controler}{$8}) 484 | { 485 | $units{$controler}{$8} = 0; 486 | } 487 | $units{$controler}{$8} = $units{$controler}{$8} + 1; 488 | } 489 | if ( $line =~ /^u(\d+)\s+(RAID\-\d+)\s+(\S+)\s+\S+\s+\S+\s+(\S+)\s.*/ ) 490 | { 491 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".capacity\",\"timestamp\":".time.",\"value\":\"".$4." 
GB\"}\n"; 492 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".type\",\"timestamp\":".time.",\"value\":\"".$2."\"}\n"; 493 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".status\",\"timestamp\":".time.",\"value\":\"".$3."\"}\n"; 494 | } 495 | if ( $line =~ /^p(\d+)\s+(\S+)\s+(\S+)\s+(\S+\s\S+)\s+(\d+)\s+(\S+)\s*$/ ) 496 | { 497 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$3.".phy".$1."capacity\",\"timestamp\":".time.",\"value\":\"".$4."\"}\n"; 498 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$3.".phy".$1."model\",\"timestamp\":".time.",\"value\":\"".$6."\"}\n"; 499 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$3.".phy".$1."status\",\"timestamp\":".time.",\"value\":\"".$2."\"}\n"; 500 | if (! exists $units{$controler}{$3}) {$units{$controler}{$3} = 0;} 501 | $units{$controler}{$3} = $units{$controler}{$3} + 1; 502 | } 503 | } 504 | foreach (keys %{$units{$controler}}) 505 | { 506 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".".$_.".phys\",\"timestamp\":".time.",\"value\":\"".$units{$controler}{$_}."\"}\n"; 507 | } 508 | } 509 | } 510 | } 511 | 512 | #3Ware-9xxx 513 | if ( $log =~ m/3w-9xxx: scsi.: Found/) 514 | { 515 | if (open my $FP, "tw_cli info |") 516 | { 517 | my (%units, @controlers); 518 | while (my $line = <$FP>) 519 | { 520 | if ($line =~ m/^c(\d+)\s+/) 521 | { 522 | push @controlers, $1; 523 | } 524 | } 525 | close $FP; 526 | foreach $controler (@controlers) 527 | { 528 | open my $FP, "tw_cli info c$controler |" or next; 529 | while (my $line = <$FP>) 530 | { 531 | if ( $line =~ m/^u(\d)\s+([A-Z0-9\-]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+/ ) 532 | { 533 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".capacity\",\"timestamp\":".time.",\"value\":\"".$6."\"}\n"; 534 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".type\",\"timestamp\":".time.",\"value\":\"".$2."\"}\n"; 535 | print 
"{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$1.".status\",\"timestamp\":".time.",\"value\":\"".$3."\"}\n"; 536 | } 537 | if ( $line =~ m/^p(\d)\s+([^\s]+)\s+u([^\s]+)\s+([^\s]+)\s+([^\s]+)\s+([^\s]+)/) 538 | { 539 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$3.".phys".$1.".capacity\",\"timestamp\":".time.",\"value\":\"".$4." ".$5."\"}\n"; 540 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$3.".phys".$1.".status\",\"timestamp\":".time.",\"value\":\"".$2."\"}\n"; 541 | push @{$units{$3}}, $1 if ($2 ne "NOT-PRESENT"); 542 | } 543 | } 544 | foreach my $unit (keys %units) 545 | { 546 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$unit.".phys\",\"timestamp\":".time.",\"value\":\"".(scalar @{$units{$unit}})."\"}\n"; 547 | } 548 | close $FP; 549 | } 550 | } 551 | } 552 | 553 | #Mylex 554 | if ( $log =~ m/Mylex AcceleRAID 160 PCI RAID Controller/) 555 | { 556 | my( @dirContents, $dirContent, @info, $line, $unit, $i, $sectorSize, $count); 557 | if ( ! -e "/proc/rd") 558 | { 559 | exit; 560 | } 561 | $count = 0; 562 | opendir(DIR,"/proc/rd"); 563 | @dirContents=readdir(DIR); 564 | closedir(DIR); 565 | 566 | $unit = 0; 567 | foreach $dirContent (@dirContents) 568 | { 569 | if (( $dirContent =~ m/\./ ) || (! -d "/proc/rd/".$controler )) 570 | { 571 | next; 572 | } 573 | $controler = $dirContent; 574 | $controler =~ s/c//g; 575 | open(FILE, "/proc/rd/c$controler/current_status") or exit; 576 | @info = ; 577 | close(FILE); 578 | 579 | for ($i=-1; $i<=scalar @info; $i++) 580 | { 581 | $line = $info[$i]; 582 | chomp($line); 583 | if ( $line =~ m/\/dev\/rd\/c(\d+)d(\d+):\s+([^,]+),\s+([^,]+),\s+(\d+)/ ) 584 | { 585 | my $capacity = $5; 586 | my $type = $3; 587 | my $status = $4; 588 | 589 | $capacity = $capacity * 512 / 1024 / 1024 / 1024; 590 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$unit.".capacity\",\"timestamp\":".time.",\"value\":\"".sprintf("%.2f",$capacity)." 
GB\"}\n"; 591 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$unit.".type\",\"timestamp\":".time.",\"value\":\"".$type."\"}\n"; 592 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$unit.".status\",\"timestamp\":".time.",\"value\":\"".$status."\"}\n"; 593 | } 594 | if ( $line =~ m/\s+(\d+):(\d+)\s+Vendor:\s+([^\s]+)\s+Model:\s+([^\s]+)/ ) 595 | { 596 | my $unit = $1; 597 | my $phys = $2; 598 | my $vendor = $3; 599 | my $model = $4; 600 | next if $model eq 'AcceleRAID'; # it's the controller, not disk 601 | $count++; 602 | $line = $info[$i+3]; 603 | $line =~ /Disk Status:\s+([^,]+),\s+(\d+)\sblocks/; 604 | 605 | my $status = $1; 606 | my $capacity = $2 * 512 / 1024 / 1024 / 1024; 607 | 608 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$unit.".phys".$phys.".capacity\",\"timestamp\":".time.",\"value\":\"".sprintf("%.2f",$capacity)." GB.\"}\n" if ($status ne "0"); 609 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$unit.".phys".$phys.".status\",\"timestamp\":".time.",\"value\":\"".$status."\"}\n" if ($status ne "0"); 610 | print "{\"metric\":\"rtm.hw.scsiraid.port.c".$controler.".u".$unit.".phys".$phys.".model\",\"timestamp\":".time.",\"value\":\"".$model."\"}\n"; 611 | } 612 | } 613 | print "{\"metric\":\"rtm.hw.scsiraid.unit.c".$controler.".u".$unit.".phys\",\"timestamp\":".time.",\"value\":\"".$count."\"}\n"; 614 | } 615 | } 616 | } 617 | close($fh); 618 | } 619 | } 620 | } 621 | } 622 | 623 | # sub to normalize units 624 | sub changeSizeUnit { 625 | my $str = shift || return; 626 | 627 | $str =~ /^(\d+) (\w+)$/ 628 | and $1 > 1024 629 | and uc $2 eq 'KB' 630 | and return int($1/1024)." MB"; 631 | 632 | $str =~ /^(\d+) (\w+)$/ 633 | and $1 > 1024 634 | and uc $2 eq 'MB' 635 | and return int($1/1024)." 
# Rescan the physical disks behind an LSI controller port. This is sometimes
# needed on LSI controllers (mostly SAS + SSD configurations).
#
# Argument: the port number passed to `$LSIUTIL -p`.
# Returns:  a hash reference keyed by PhysDisk number; every value is a hash
#           reference with the keys vendor, model, rev and phy. An empty hash
#           reference is returned when the port is missing or false.
sub scan4LsiDisks {
  my $port = shift;
  return {} unless $port;

  my %disk_for;

  # Lines of interest look like:
  #  0   1  PhysDisk 1     ATA      ST3750528AS      CC44  1221000001000000     1
  #  0   3  PhysDisk 2     ATA      INTEL SSDSA2M080 02HD  1221000003000000     3
  foreach my $row (`$LSIUTIL -p$port -a8,0`)
  {
    next unless $row =~ /PhysDisk\s+(\d+)\s+(\w+)\s+(\w+(?:\s\w+)?)\s+([\da-zA-Z]+)\s+([\dABCDEF]+)\s+(\d+)\s+$/;
    # The fifth capture is matched but deliberately not stored.
    $disk_for{$1} = { vendor => $2, model => $3, rev => $4, phy => $6 };
  }

  return \%disk_for;
}


# Tell whether a result hash produced by the helpers of this file is a success.
#
# Returns 1 when the argument is a hash reference whose status is 100.
# Returns 0 otherwise; when the status is 500 and a msg is present, the
# message is printed before returning.
sub ok
{
    my $result = shift;
    return 0 unless ref($result) eq 'HASH';
    return 1 if $result->{status} eq 100;
    print $result->{msg} if $result->{status} eq 500 and defined($result->{msg});
    return 0;
}

# Run an external command and collect its output (stdout and stderr merged).
#
# Arguments: the binary followed by its arguments, handed to IPC::Open3.
# Returns a hash reference:
#   status 100 - success, value is an array reference of output lines
#   status 201 - no binary given
#   status 500 - the command could not be started, was killed by a signal,
#                could not be reaped, or exited non-zero (a non-zero exit of
#                '/bin/ps' is tolerated); msg describes the failure and, once
#                the command ran, value holds the raw output string
sub execute
{
    my ($bin, @args) = @_;
    defined($bin) or return { status => 201, msg => 'No binary specified (execute)' };

    my ($in, $out);
    my $parent_pid = $$;

    # open3 croaks instead of returning false, so trap the exception to honour
    # the "status 500" contract instead of killing the whole script.
    my $pid = eval { IPC::Open3::open3($in, $out, $out, $bin, @args) };
    if ($$ != $parent_pid)
    {
        # Old IPC::Open3 versions croak inside the forked child when exec
        # fails; never let that child continue running the parent's code.
        require POSIX;
        POSIX::_exit(255);
    }
    $pid or return { status => 500, msg => 'Failed to fork : '.($@ || $!) };

    # Slurp the whole output in one read.
    local $/;

    my $stdout = <$out>;
    defined($stdout) or $stdout = '';
    my $ret = waitpid($pid, 0);
    my $wait_status = $?;

    close($in);
    close($out);

    my $status = $wait_status >> 8;     # exit code of the child
    my $signal = $wait_status & 127;    # signal that killed the child, if any
    my @stdout = split(/\n/, $stdout);

    if ($ret != $pid)
    {
        return { status => 500, msg => 'Invalid fork return (waitpid)', value => $stdout };
    }
    elsif ($signal)
    {
        # A killed child has an exit code of 0 and would otherwise look fine.
        return { status => 500, msg => 'Binary '.$bin.' was killed by signal '.$signal, value => $stdout };
    }
    elsif ($status != 0 and $bin ne '/bin/ps')
    {
        return { status => 500, msg => 'Binary '.$bin.' exited on a non-zero status ('.$status.')', value => $stdout };
    }

    return { status => 100, value => \@stdout };
}
# CPU info
#
# Parse a cpuinfo-style file and store what it finds in the file-level
# %server hash:
#   rtm.hw.cpu.name   - text following "model name :" (last entry seen)
#   rtm.hw.cpu.number - number of "model name" entries (reset to 0 first)
#   rtm.hw.cpu.mhz    - text following "cpu MHz :"
#   rtm.hw.cpu.cache  - text following "cache size :"
#
# Argument: optional path of the file to read; defaults to /proc/cpuinfo.
# Returns { status => 100 } on success, or { status => 500, msg => ... } when
# the file cannot be opened.
sub CPUInfo
{
    my ($path) = @_;
    $path = '/proc/cpuinfo' unless defined $path;

    $server{'rtm.hw.cpu.number'} = 0;

    my $fh;
    if (!open($fh, '<', $path))
    {
        print "Cannot open $path";
        return { status => 500, msg => "Cannot open $path" };
    }

    # A lexical line variable keeps the caller's $_ untouched.
    while (my $line = <$fh>)
    {
        chomp($line);
        if ($line =~ /^model name\s+:\s(.*)/)
        {
            $server{'rtm.hw.cpu.name'} = $1;
            $server{'rtm.hw.cpu.number'} += 1;
        }
        elsif ($line =~ /^cpu MHz/)
        {
            $line =~ s/cpu MHz\s+:\s*//g;
            $server{'rtm.hw.cpu.mhz'} = $line;
        }
        elsif ($line =~ /^cache size/)
        {
            $line =~ s/cache size\s+:\s*//g;
            $server{'rtm.hw.cpu.cache'} = $line;
        }
    }
    close($fh);
    return { status => 100 };
}

# Record the kernel release (uname -r) and kernel version (uname -v) in
# %server. Returns { status => 100 } on success and { status => 500, msg }
# as soon as one of the two commands fails or prints nothing.
sub kernel
{
    # kernel release
    my $release = execute('uname -r');
    if ( $release->{status} != 100 or !defined($release->{value}[0]) )
    {
        print $release->{msg}." \n";
        return { status => 500, msg => "uname error: ".$release->{msg} };
    }
    $server{'rtm.info.kernel.release'} = $release->{value}[0];

    # kernel version
    my $version = execute('uname -v');
    if ( $version->{status} != 100 or !defined($version->{value}[0]) )
    {
        print $version->{msg}."\n";
        return { status => 500, msg => "uname error: ".$version->{msg} };
    }
    $server{'rtm.info.kernel.version'} = $version->{value}[0];

    return { status => 100 };
}

# Fill $server{'rtm.info.release.os'}.
#
# The "Distributor ID", "Release" and "Codename" lines of `lsb_release -a`
# are joined with spaces. When lsb_release fails, the first line of
# /etc/redhat-release is used instead. Returns { status => 100 } on success
# and { status => 500, msg } when neither source is available.
sub os
{
    my $fnret = execute("lsb_release","-a");
    if ( $fnret->{status} == 100 )
    {
        foreach my $line (@{$fnret->{value}})
        {
            if ($line =~ /^Distributor ID:\s+(.*)/i)
            {
                $server{'rtm.info.release.os'} = $1;
            }
            if ($line =~ /^Release:\s+(.*)/i)
            {
                $server{'rtm.info.release.os'} = $server{'rtm.info.release.os'}." ".$1;
            }
            if ($line =~ /^Codename:\s+(.*)/i)
            {
                $server{'rtm.info.release.os'} = $server{'rtm.info.release.os'}." ".$1;
            }
        }
        return { status => 100 };
    }

    print "Error ".$fnret->{msg}." \n";

    # maybe red hat:
    my $fh;
    if (!open($fh, '<', "/etc/redhat-release"))
    {
        print "Cannot open /etc/redhat-release";
        return { status => 500, msg => "Cannot open /etc/redhat-release" };
    }

    # yes!
    my $os_release = <$fh>;
    close($fh);
    if (defined $os_release)
    {
        # An empty file keeps the previous value instead of storing undef.
        chomp($os_release);
        $server{'rtm.info.release.os'} = $os_release;
    }
    return { status => 100 };
}

# motherboard
#
# Parse the output of `dmidecode` and store BIOS, base board and memory bank
# information in %server. Fields are looked up at fixed line offsets below
# each section header; a field that is not found at its expected offset
# leaves the current %server value untouched.
# Returns { status => 100 } on success, { status => 500, msg } when dmidecode
# fails.
sub motherboard
{
    my $fnret = execute('dmidecode');
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "dmidecode error: ".$fnret->{msg}};
    }

    my @dmi = @{$fnret->{value}};

    # Return the first capture of $pattern applied to line $index, or undef
    # when that line does not exist or does not match. Using the list-context
    # match result avoids reading a stale $1 after a failed match.
    my $field = sub {
        my ($index, $pattern) = @_;
        my $found;
        ($found) = $dmi[$index] =~ $pattern if defined $dmi[$index];
        return $found;
    };

    # [ line offset below the header, pattern, %server key ]
    my @bios_layout = (
        [ 1, qr/Vendor:\s+(.*)/,       'rtm.info.bios_vendor'  ],
        [ 2, qr/Version:\s+(.*)/,      'rtm.info.bios_version' ],
        [ 3, qr/Release Date:\s+(.*)/, 'rtm.info.bios_date'    ],
    );
    my @board_layout = (
        [ 1, qr/Manufacturer:\s+(.*)/,  'rtm.hw.mb.manufacture' ],
        [ 2, qr/Product Name:\s+(.*)/,  'rtm.hw.mb.name'        ],
        [ 4, qr/Serial Number:\s+(.*)/, 'rtm.hw.mb.serial'      ],
    );

    for my $i (0 .. $#dmi)
    {
        my $header = $dmi[$i];

        # Bios
        if ($header =~ /^\s*BIOS Information/i)
        {
            foreach my $entry (@bios_layout)
            {
                my ($offset, $pattern, $key) = @{$entry};
                my $found = $field->($i + $offset, $pattern);
                $server{$key} = $found if defined $found;
            }
        }

        # motherboard
        if ($header =~ /^\s*Base Board Information/i)
        {
            foreach my $entry (@board_layout)
            {
                my ($offset, $pattern, $key) = @{$entry};
                my $found = $field->($i + $offset, $pattern);
                $server{$key} = $found if defined $found;
            }
        }

        # memory
        if ($header =~ /^\s*Memory Device/i)
        {
            my $bank = $field->($i + 9, qr/Bank Locator:\s+(.*)/);
            next if !$bank;
            $bank =~ s/\s//g;
            $bank =~ s/[\s\.\/\\_]/-/g;

            my $locator = $field->($i + 8, qr/Locator:\s+(.*)/);
            next if !$locator;
            $locator =~ s/\s//g;
            $locator =~ s![\s./\\_#]!-!g;

            my $size = $field->($i + 5, qr/Size:\s+(.*)/);
            next if !$size;
            $size =~ s/\s*MB\s*//g;
            chomp($size);

            if ($bank . $locator ne "")
            {
                $server{'rtm.hw.mem.bank-'.$bank . '-' . $locator} = $size;
            }
        }
    }
    return { status => 100 };
}
\n"; 313 | next; 314 | } 315 | else 316 | { 317 | my $after = time(); 318 | my $smartTime = $after - $before; 319 | $server{'rtm.info.hdd'}->{$disk}->{'smart'}->{'time'} = $smartTime; 320 | $server{'rtm.info.hdd'}->{$disk}->{link_type} = 'sata'; 321 | my $filename = "/sys/class/block/$disk/queue/rotational"; 322 | my $fh; 323 | my $diskType = 'hdd'; 324 | if( -e $filename and open($fh, '<', $filename) ) 325 | { 326 | my $rotational = <$fh>; 327 | chomp($rotational); 328 | close($fh); 329 | if( "$rotational" eq "0" ) 330 | { 331 | $diskType = "ssd"; 332 | } 333 | } 334 | 335 | if( $disk =~ /nvme/ ) 336 | { 337 | $server{'rtm.info.hdd'}->{$disk}->{link_type} = 'pcie'; 338 | $diskType = 'nvme'; 339 | } 340 | 341 | $server{'rtm.info.hdd'}->{$disk}->{disk_type} = $diskType; 342 | my $smartctl = $fnret->{value}; 343 | my $smart_other_error = 0; 344 | foreach my $line (@{$smartctl}) 345 | { 346 | if ( $line =~ /^Transport\s*protocol\s*:\s+SAS/i ) 347 | { 348 | $server{'rtm.info.hdd'}->{$disk}->{link_type} = 'sas'; 349 | next; 350 | } 351 | if ($line =~ /^(?:Product|Device Model|Model Number):\s+(.*)$/i or $line =~ /Device:\s+([^\s].+)Version/i ) 352 | { 353 | $server{'rtm.info.hdd'}{$disk}{'model'}=$1; 354 | next; 355 | } 356 | if ($line =~ /^Serial Number:.(.*)$/i) 357 | { 358 | $server{'rtm.info.hdd'}{$disk}{'serial'}=$1; 359 | next; 360 | } 361 | if ($line =~ /.*Capacity:\s+.*\[(.*)\]/i) 362 | { 363 | $server{'rtm.info.hdd'}{$disk}{'capacity'}=$1; 364 | next; 365 | } 366 | if ($line =~ /^Firmware Version:.(.*)$/i) 367 | { 368 | $server{'rtm.info.hdd'}{$disk}{'firmware'}=$1; 369 | next; 370 | } 371 | if ($line =~ /^\s+5 Reallocated_Sector_Ct.*\s+(\d+)$/i) 372 | { 373 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'reallocated-sector-count'}=$1; 374 | next; 375 | } 376 | if ($line =~ /^187 Reported_Uncorrect.*\s+(\d+)$/i) 377 | { 378 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'reported-uncorrect'}=$1; 379 | next; 380 | } 381 | if ($line =~ /^196 
Reallocated_Event_Count.*\s+(\d+)$/i) 382 | { 383 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'realocated-event-count'}=$1; 384 | next; 385 | } 386 | if ($line =~ /^197 Current_Pending_Sector.*\s+(\d+)$/i) 387 | { 388 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'current-pending-sector'}=$1; 389 | next; 390 | } 391 | if ($line =~ /^198 Offline_Uncorrectable.*\s+(\d+)$/i) 392 | { 393 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'offline-uncorrectable'}=$1; 394 | next; 395 | } 396 | if ($line =~ /^199 UDMA_CRC_Error_Count.*\s+(\d+)$/i) 397 | { 398 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'udma-crc-error'}=$1; 399 | next; 400 | } 401 | if ($line =~ /^200 Multi_Zone_Error_Rate.*\s+(\d+)$/i) 402 | { 403 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'multizone-error-rate'}=$1; 404 | next; 405 | } 406 | if ($line =~ /^209 Offline_Seek_Performa?nce.*\s+(\d+)$/) 407 | { 408 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'offline-seek-performance'}=$1; 409 | next; 410 | } 411 | if ($line =~ /^\s+9 Power_On_Hours.*\s+(\d+)$/) 412 | { 413 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'power-on-hours'}=$1; 414 | next; 415 | } 416 | if ($line =~ /Error \d+ (occurred )?at /) 417 | { 418 | if ($line =~ /^read:.+(\d+)$/) 419 | { 420 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'uncorrected-read-errors'}=$1; 421 | next; 422 | } 423 | if ($line =~ /^write:.+(\d+)$/) 424 | { 425 | $server{'rtm.info.hdd'}{$disk}{'smart'}{'uncorrected-write-errors'}=$1; 426 | next; 427 | } 428 | } 429 | if ($line =~ /^temperature\s+:\s+([0-9]+)/i) 430 | { 431 | $server{'rtm.info.hdd'}{$disk}{'temperature'}=$1; 432 | next; 433 | } 434 | } 435 | 436 | if ($diskType ne 'nvme') 437 | { 438 | my $fnret= execute("hddtemp $diskSmart 2>/dev/null"); 439 | if ( $fnret->{status} != 100 ) 440 | { 441 | print $fnret->{msg}." 
\n"; 442 | next; 443 | } 444 | elsif (defined $fnret->{value}[0]) 445 | { 446 | my $hddtemp=$fnret->{value}[0]; 447 | if ($hddtemp =~ m/.*:.*:\s(\d+)/) 448 | { 449 | $server{'rtm.info.hdd'}{$disk}{'temperature'}=$1; 450 | } 451 | } 452 | } 453 | 454 | # New way to gather stats 455 | my $linkType = $server{'rtm.info.hdd'}->{$disk}->{link_type}; 456 | my $realDisk = $disk; 457 | if( $disk !~ /^\/dev\// ) 458 | { 459 | $realDisk = "/dev/$disk"; 460 | } 461 | my $fnret = gatherStats( smartDisk => $realDisk, sgPaths => \%globalSgPaths, linkType => $linkType ); 462 | if(ok($fnret) and $fnret->{value}) 463 | { 464 | my $smartUpdate = $fnret->{value}; 465 | my %smartInfo = defined $server{'rtm.info.hdd'}->{$disk}->{smart} ? %{$server{'rtm.info.hdd'}->{$disk}->{smart}} : (); 466 | @smartInfo{keys %{$smartUpdate}} = values %{$smartUpdate}; 467 | $server{'rtm.info.hdd'}->{$disk}->{smart} = \%smartInfo; 468 | } 469 | 470 | # Get related dmesg errors 471 | $fnret = countDmesgErrors( 472 | diskName => $disk, 473 | lines => \@dmesg_lines, 474 | ); 475 | if( ok($fnret) ) 476 | { 477 | $server{'rtm.info.hdd'}->{$disk}->{'dmesg.io.errors'} = $fnret->{value}; 478 | } 479 | $fnret = iostatCounters( 480 | diskName => $disk, 481 | ); 482 | if( ok($fnret) ) 483 | { 484 | defined $fnret->{value}->{'r_await'} and $server{'rtm.info.hdd'}->{$disk}->{'iostat.read.avg.wait'} = $fnret->{value}->{'r_await'}; 485 | defined $fnret->{value}->{'w_await'} and $server{'rtm.info.hdd'}->{$disk}->{'iostat.write.avg.wait'} = $fnret->{value}->{'w_await'}; 486 | defined $fnret->{value}->{'rrqm/s'} and $server{'rtm.info.hdd'}->{$disk}->{'iostat.read.merged.per.sec'} = $fnret->{value}->{'rrqm/s'}; 487 | defined $fnret->{value}->{'wrqm/s'} and $server{'rtm.info.hdd'}->{$disk}->{'iostat.write.merged.per.sec'} = $fnret->{value}->{'wrqm/s'}; 488 | defined $fnret->{value}->{'r/s'} and $server{'rtm.info.hdd'}->{$disk}->{'iostat.read.per.sec'} = $fnret->{value}->{'r/s'}; 489 | defined $fnret->{value}->{'w/s'} and 
#lspci
#
# Run `lspci -n` and store, for every device line, the "vendor:device" pair
# in %server under 'rtm.hw.lspci.pci.<slot>', where ':', '.' and '_' of the
# slot are replaced by '-'.
# Returns { status => 100 } on success, { status => 500, msg } when lspci fails.
sub lspci
{
    my $fnret = execute("lspci -n 2>/dev/null");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "lspci error: ".$fnret->{msg}};
    }

    my %ids_of;
    foreach my $line (@{$fnret->{value}})
    {
        # First form has a trailing "(rev ..)" part, second form has none.
        if ($line =~ /^(\S+).+:\s+(.+:.+)\s+\(/i or $line =~ /^(\S+).+:\s+(.+:.+$)/i)
        {
            $ids_of{$1} = $2;
        }
    }

    foreach my $slot (keys %ids_of)
    {
        (my $suffix = $slot) =~ s/\:|\.|\_/-/g;
        $server{'rtm.hw.lspci.pci.'.$suffix} = $ids_of{$slot};
    }
    return { status => 100 };
}

# Ask `blockdev --getss` for the sector size of a block device.
#
# Named argument: disk - device path.
# Returns:
#   { status => 100, value => <digits> } on success
#   { status => 201, msg } when no disk is given
#   { status => 500, msg } when blockdev fails, prints nothing, or prints
#                          something that is not a plain integer
sub getSectorSize {
    my %params = @_;
    my $disk = $params{disk};
    defined($disk) && $disk ne ''
        or return { status => 201, msg => 'Missing argument' };

    my $fnret = execute("blockdev --getss $disk");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "Error: unable to get sector size for device $disk"};
    }

    my $sectorSize = $fnret->{value}[0];
    if (!defined($sectorSize))
    {
        return { status => 500, msg => "Error: no sector size reported for device $disk" };
    }

    # The value is used in arithmetic by callers, so the whole line has to be
    # a number; a line that merely contains a digit is not good enough.
    if ( $sectorSize =~ /^\s*(\d+)\s*$/ )
    {
        return { status => 100, value => $1 };
    }
    return { status => 500, msg => "Error: unexpected format for sectorSize; $sectorSize" };
}

# Run `smartctl -H` on a device and report its overall health.
#
# Named argument: smartDisk - device path handed to smartctl.
# Returns { status => 500, msg } when any of the low three bits of smartctl's
# exit status is set (presumably the bits signalling a failed invocation -
# see smartctl(8), EXIT STATUS). Otherwise returns status 100 with
# value->{status} set to 'success' when a "Health Status: OK" or
# "test result: PASSED" line is present, and to 'failed' when it is not.
sub getSmartOverallHealthStatus
{
    my %params = @_;
    my $smartDisk = $params{smartDisk};

    my @report = `smartctl -H $smartDisk 2>/dev/null`;
    my $exit_bits = ($? >> 8) & 7;
    if ( $exit_bits != 0 )
    {
        return { status => 500, msg => "Error: unable to get overall health for device $smartDisk" };
    }

    my $verdict = 'failed';
    foreach my $line (@report)
    {
        if ( $line =~ /SMART\s+Health\s+Status:\s+OK|SMART\s+overall-health\s+self-assessment\s+test\s+result:\s+PASSED/ )
        {
            $verdict = 'success';
            last;
        }
    }
    return { status => 100, value => { status => $verdict } };
}
# Collect the SMART statistics of one disk by delegating to the collector
# that fits its link type.
#
# Named arguments:
#   smartDisk - device to query (required)
#   linkType  - 'sas' selects getSmartStatsSAS, 'pcie' selects
#               getSmartStatsNvme, anything else selects getSmartStatsATA
#   sgPaths   - hash reference mapping a device to a hash holding its
#               sgDrive entry; only consulted for 'sas'
#
# Returns the collector's result, { status => 201, msg } when smartDisk is
# missing, or { status => 500, msg } when a SAS disk has no sgDrive entry.
sub gatherStats
{
    my %args     = @_;
    my $link     = $args{linkType} || '';
    # Work on a copy so that the lookup below cannot add keys to the caller's hash.
    my %sg_paths = %{ $args{sgPaths} || {} };
    my $device   = $args{smartDisk};

    return { status => 201, msg => 'Missing argument' } unless $device;

    if ( $link eq 'pcie' )
    {
        return scalar getSmartStatsNvme( smartDisk => $device );
    }
    if ( $link ne 'sas' )
    {
        return scalar getSmartStatsATA(smartDisk => $device);
    }

    # SAS disks are queried through their sg device.
    my $sg_disk = $sg_paths{$device}->{sgDrive};
    return { status => 500, msg => "Unable to get sg path for $device" } unless $sg_disk;

    return scalar getSmartStatsSAS( sgDisk => $sg_disk, smartDisk => $device );
}
$offlineUncorrectable = -1; 663 | my $temperature = -1; 664 | my $highestTemperature = -1; 665 | my $lowestTemperature = -1; 666 | my $pendingSectors = -1; 667 | 668 | ## 669 | ## Gather bytesWritten information 670 | ## 671 | 672 | # Expressed in logical sectors : more precise, use it when possible 673 | if ( my ($gplPage) = grep { $_->{page} eq '0x01' and $_->{offset} eq '0x018' } @{$smartStats->{statistics}} ) 674 | { 675 | $gplPage->{value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA write counter' }; 676 | $bytesWritten = $gplPage->{value}*$sectorSize; 677 | } 678 | 679 | # For Samsung SSD, expressed in LBA 680 | if ( my ($attr) = grep { $_->{name} eq 'Total_LBAs_Written' } @{$smartStats->{attributes}} ) 681 | { 682 | $attr->{raw_value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA write counter' }; 683 | $attr->{raw_value} *= $sectorSize; 684 | $attr->{raw_value} >= ($bytesWritten||0) and $bytesWritten = $attr->{raw_value}; 685 | } 686 | 687 | # 32MB blocks, less precise but better than nothing 688 | # Seems to be expressed in MB and not in MiB as stated, or maybe a firmware bug on some models ? 
689 | if ( my ($attr) = grep { $_->{name} eq 'Host_Writes_32MiB' } @{$smartStats->{attributes}} ) 690 | { 691 | $attr->{raw_value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA write counter' }; 692 | $attr->{raw_value} *= 32*(2**20); 693 | $attr->{raw_value} >= ($bytesWritten||0) and $bytesWritten = $attr->{raw_value}; 694 | } 695 | 696 | ## 697 | ## Gather BytesRead information 698 | ## 699 | # Expressed in logical sectors : more precise, use it when possible 700 | if ( my ($gplPage) = grep { $_->{page} eq '0x01' and $_->{offset} eq '0x028' and $_->{value} =~ /^\d+\z/ } @{$smartStats->{statistics}} ) 701 | { 702 | $bytesRead = $gplPage->{value}*$sectorSize; 703 | } 704 | elsif ( my ($attr) = grep { ( ( $_->{id} eq 242 ) or ( $_->{name} eq 'Total_LBAs_Read' )) and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 705 | { 706 | $attr->{raw_value} *= $sectorSize; 707 | $attr->{raw_value} >= ($bytesRead||0) and $bytesRead = $attr->{raw_value}; 708 | } 709 | # 32MB blocks, less precise but better than nothing 710 | # Seems to be expressed in MB and not in MiB as stated, or maybe a firmware bug on some models ? 
711 | elsif ( my ($smartAttr) = grep { $_->{name} eq 'Host_Reads_32MiB' and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 712 | { 713 | $smartAttr->{raw_value} *= 32*(2**20); 714 | $smartAttr->{raw_value} >= ($bytesRead||0) and $bytesRead = $smartAttr->{raw_value}; 715 | } 716 | 717 | ## 718 | ## Gather percentageUsed information 719 | ## 720 | 721 | # From 0 to 255 (Yup, a percentage from 0 to 255, no problem) 722 | # Note that some SSD have a MWI reported as less than 100 in attribute pages, while statistics page return 0 723 | if ( my ($gplPage) = grep { $_->{page} eq '0x07' and $_->{offset} eq '0x008' } @{$smartStats->{statistics}} ) 724 | { 725 | $gplPage->{value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA MWI counter' }; 726 | $percentageUsed = $gplPage->{value}; 727 | } 728 | 729 | # From 0 to 100. Raw value has no meaning AFAIK 730 | if ( my ($attr) = grep { $_->{name} eq 'Media_Wearout_Indicator' } @{$smartStats->{attributes}} ) 731 | { 732 | $attr->{value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA MWI counter' }; 733 | $attr->{value} = 100-$attr->{value}; 734 | $attr->{value} >= ($percentageUsed||0) and $percentageUsed = $attr->{value}; 735 | } 736 | 737 | # For Samsung SSD, rated from 0 to 100. For other brands, may not mean the same thing 738 | # Raw value is Program/Erase cycles. 
Disk is considered "used" when TLC > 1000 or MLC > 3000 739 | if ( my ($attr) = grep { $_->{name} eq 'Wear_Leveling_Count' } @{$smartStats->{attributes}} ) 740 | { 741 | $attr->{value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA MWI counter' }; 742 | $attr->{value} = 100-$attr->{value}; 743 | $attr->{value} >= ($percentageUsed||0) and $percentageUsed = $attr->{value}; 744 | } 745 | 746 | ## 747 | ## Gather powerOnHours information 748 | ## 749 | 750 | # For ATA devices, should be nearly always known 751 | if ( my ($attr) = grep { $_->{name} eq 'Power_On_Hours' } @{$smartStats->{attributes}} ) 752 | { 753 | $attr->{raw_value} =~ /^\d+\z/ or return { status => 500, msg => 'Unconsistent ATA POH counter' }; 754 | $powerOnHours = $attr->{raw_value}; 755 | } 756 | 757 | ## 758 | ## Gather powerCycles information 759 | ## 760 | # For ATA devices, should be nearly always known 761 | if ( my ($attr) = grep { ( ($_->{id} eq 12) or ($_->{name} eq 'Power_Cycle_Count') ) and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 762 | { 763 | $powerCycles = $attr->{raw_value}; 764 | } 765 | 766 | ## 767 | ## Gather eccCorrectedErrs information 768 | ## 769 | 770 | if ( my ($attr) = grep { $_->{id} eq 195 } @{$smartStats->{attributes}} ) 771 | { 772 | if( $attr->{raw_value} =~ /^\d+\z/ ) 773 | { 774 | $eccCorrectedErrs = $attr->{raw_value}; 775 | } 776 | else 777 | { 778 | $eccCorrectedErrs = -1; 779 | } 780 | } 781 | 782 | ## 783 | ## Gather eccUncorrectedErrs information (187) 784 | ## 785 | 786 | # Prefer the statistics section when available 787 | if ( my ($uncorrectedEccPage) = grep { $_->{page} eq '0x04' and $_->{offset} eq '0x008' } @{$smartStats->{statistics}} ) 788 | { 789 | if( $uncorrectedEccPage->{value} !~ /^\d+\z/ ) 790 | { 791 | $eccUncorrectedErrs = -1; 792 | } 793 | else 794 | { 795 | $eccUncorrectedErrs = $uncorrectedEccPage->{value}; 796 | } 797 | } 798 | elsif ( my ($attr) = grep { $_->{id} eq 187 } @{$smartStats->{attributes}} ) 799 
| { 800 | if( $attr->{raw_value} =~ /^\d+\z/ ) 801 | { 802 | $eccUncorrectedErrs = $attr->{raw_value}; 803 | } 804 | else 805 | { 806 | $eccUncorrectedErrs = -1; 807 | } 808 | } 809 | 810 | # 811 | # Reallocated sectors (5) 812 | # 813 | if ( my ($reallocSectorPage) = grep { $_->{page} eq '0x03' and $_->{offset} eq '0x020' } @{$smartStats->{statistics}} ) 814 | { 815 | if( $reallocSectorPage->{value} =~ /^\d+\z/ ) 816 | { 817 | $reallocSectors = $reallocSectorPage->{value}; 818 | } 819 | else 820 | { 821 | $reallocSectors = -1; 822 | } 823 | } 824 | elsif ( my ($attr) = grep { 825 | (($_->{id} eq 5) or $_->{name} =~ /^(Reallocate_NAND_Blk_Cnt|Reallocated_Sector_Ct|Total_Bad_Block_Count)$/) 826 | and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 827 | { 828 | $reallocSectors = $attr->{raw_value}; 829 | } 830 | 831 | # 832 | # Current_Pending_Sector_Count (197) 833 | # 834 | 835 | if ( my ($attr) = grep { $_->{id} eq 197 } @{$smartStats->{attributes}} ) 836 | { 837 | if( $attr->{raw_value} =~ /^\d+\z/ ) 838 | { 839 | $pendingSectors = $attr->{raw_value}; 840 | } 841 | else 842 | { 843 | $pendingSectors = -1; 844 | } 845 | } 846 | 847 | # 848 | # Offline_Uncorrectable (198) 849 | # 850 | 851 | if ( my ($attr) = grep { $_->{id} eq 198 and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 852 | { 853 | $offlineUncorrectable = $attr->{raw_value}; 854 | } 855 | 856 | # 857 | # Command_Timeout (188) 858 | # 859 | if ( my ($attr) = grep { $_->{id} eq 188 and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 860 | { 861 | $commandTimeout = $attr->{raw_value}; 862 | } 863 | 864 | # 865 | # Temperature (194) 866 | # 867 | if( my ($tempStat) = grep { $_->{page} eq '0x05' and $_->{offset} eq '0x008' and $_->{value} =~ /^\d+\z/ } @{$smartStats->{statistics}} ) 868 | { 869 | $temperature = $tempStat->{value}; 870 | } 871 | elsif ( my ($attr) = grep { $_->{id} eq 194 and $_->{raw_value} =~ /^\d+\z/ } @{$smartStats->{attributes}} ) 872 | { 873 | 
$temperature = $attr->{raw_value}; 874 | } 875 | 876 | if( my ($tempStat) = grep { $_->{page} eq '0x05' and $_->{offset} eq '0x020' and $_->{value} =~ /^\d+\z/ } @{$smartStats->{statistics}}) 877 | { 878 | $highestTemperature = $tempStat->{value}; 879 | } 880 | 881 | if( my ($tempStat) = grep { $_->{page} eq '0x05' and $_->{offset} eq '0x028' and $_->{value} =~ /^\d+\z/ } @{$smartStats->{statistics}}) 882 | { 883 | $lowestTemperature = $tempStat->{value}; 884 | } 885 | 886 | $fnret = getSataPhyErrorCounters(smartDisk => $smartDisk); 887 | if( ok($fnret) ) 888 | { 889 | ## 890 | ## Gather failures information 891 | ## 892 | ## SATA Phy Event Counters (GP Log 0x11) 893 | ## ID Size Value Description 894 | ## 0x000b 4 0 CRC errors within host-to-device FIS 895 | if ( my ($attr) = grep { $_->{id} eq '0x000b' and $_->{value} =~ /^\d+\z/ } @{$fnret->{value}}) 896 | { 897 | $linkFailures = $attr->{value}; 898 | } 899 | } 900 | 901 | $fnret = getSmartCommonInfo(smartDisk => $smartDisk); 902 | my %commonInfo = (); 903 | if( ok($fnret) ) 904 | { 905 | %commonInfo = %{$fnret->{value}} 906 | } 907 | 908 | return { 909 | status => 100, 910 | value => { 911 | "bytes-written" => $bytesWritten, 912 | "bytes-read" => $bytesRead, 913 | "percentage-used" => $percentageUsed || 0, 914 | "power-on-hours" => $powerOnHours, 915 | "power-cycles" => $powerCycles, 916 | "reported-corrected" => $eccCorrectedErrs, 917 | "reported-uncorrect" => $eccUncorrectedErrs, 918 | "reallocated-sector-count" => $reallocSectors, 919 | "current-pending-sector" => $pendingSectors, 920 | "offline-uncorrectable" => $offlineUncorrectable, 921 | "command-timeout" => $commandTimeout, 922 | "link-failures" => $linkFailures, 923 | "temperature" => $temperature, 924 | "highest-temperature" => $highestTemperature, 925 | "lowest-temperature" => $lowestTemperature, 926 | #"logged-error-count" => $loggedErrorCount, # in commonInfo 927 | #"global-health" => $health, # in commonInfo 928 | #rawReport => $rawReport, # in 
commonInfo 929 | %commonInfo 930 | }, 931 | }; 932 | }

# Run "smartctl -l devstat -A" against a disk and parse the two tables found
# in its report: the vendor SMART attributes and the device statistics.
#
# Named parameters:
#   smartDisk - device path given to smartctl. A path containing
#               dev/nvme<X>n<Y> is rewritten to /dev/nvme<X>.
#
# Returns a hashref:
#   { status => 100, value => { attributes => [ {...} ], statistics => [ {...} ] } }
#       attributes entries: id, name, flag, value, worst, thresh, type,
#                           updated, when, raw_value
#       statistics entries: page, offset, size, value, normalized, desc
#                           (plus monitored_condition_met and supports_dsn
#                           when the three-letter flag column is present)
#   { status => 201, msg => ... } when the low three bits of the exit status
#                                 of the command are not all zero
#   { status => 500, msg => ... } when a line of the report is not recognised
sub getSmartStatsAndAttributes
{
    my %params = @_;
    my $smartDisk = $params{smartDisk};

    if ($smartDisk =~ /dev\/nvme(\d+)n(\d+)/)
    {
        $smartDisk = "/dev/nvme".$1;
    }
    my $cmd = "timeout 15 smartctl -l devstat -A ".$smartDisk." 2>/dev/null";
    my @smartctl = `$cmd`;
    my $status = $? >> 8;
    # Only the three low bits of the exit status are treated as a failure
    my $smart_filtered_status = $status & 7;
    if( $smart_filtered_status != 0 )
    {
        return { status => 201, msg => "Unable to gather smart stats correctly. status: $smart_filtered_status" };
    }

    my %result = (attributes => [], statistics => []);
    # Parser state: which section / table the current line belongs to
    my %in = ();

    foreach my $line ( @smartctl )
    {
        $line =~ s/\s+$//g;
        $line eq '' and next;

        # Everything before the data section marker is ignored
        if ( !$in{smart} and $line eq '=== START OF READ SMART DATA SECTION ===' )
        {
            $in{smart} = 1;
            next;
        }
        $in{smart} or next;

        # Vendor Specific SMART Attributes with Thresholds:
        # ID# ATTRIBUTE_NAME FLAG VALUE WORST THRESH TYPE UPDATED WHEN_FAILED RAW_VALUE
        if ( $line eq 'Vendor Specific SMART Attributes with Thresholds:' or $line =~ /^ID#\sATTRIBUTE_NAME/ )
        {
            $in{statistics} = 0;
            $in{attributes} = 1;
        }
        # Device Statistics (GP Log 0x04)
        # Page Offset Size Value Description
        elsif ( $line eq 'Device Statistics (GP Log 0x04)' or $line =~ /^Page\s+Offset\sSize/ )
        {
            $in{attributes} = 0;
            $in{statistics} = 1;
        }
        elsif (
            $in{attributes} and
            $line =~ /^\s*
                (\d+)\s
                (\S+)\s+
                (0x[0-9a-f]{4})\s+
                (\d{3})\s+
                (\d{3})\s+
                (\d{3}|-{3})\s+
                (\S+)\s+
                (\S+)\s+
                (-|FAILING_NOW|In_the_past)\s+
                (.+)
            $/x
        )
        {
            push(@{$result{attributes}}, {
                id        => $1,
                name      => $2,
                flag      => $3,
                value     => $4,
                worst     => $5,
                thresh    => $6,
                type      => $7,
                updated   => $8,
                when      => $9,
                raw_value => $10,
            });
        }
        elsif ( $in{statistics} and $line =~ /^\s*(\d+|0x[\da-f]{2})\s+={5}\s{2}=\s+=\s{2}==(?:=\s{2}==)?\s(.+)\s==$/ )
        {
            # Page title row inside the statistics table, nothing to store
        }
        elsif ( $in{statistics} and $line =~ /^\s*(\d+|0x[\da-f]{2})\s+(0x[0-9a-f]{3})\s+(\d+)\s+(-?\d+|-)\s*([CDN-]{3}|~|)\s+(.+)$/ )
        {
            # Copy the captures first: any later successful match resets $1..$6
            my ($page, $offset, $size, $value, $flags, $desc) = ($1, $2, $3, $4, $5, $6);

            # The page column is either a plain number or already "0xNN".
            # Always store it as "0xNN" so lookups such as page eq '0x05' work.
            if ( $page !~ /^0x/ )
            {
                $page = sprintf('0x%02x', $page);
            }

            if ( length($flags) <= 1 )
            {
                # Smartctl 6.4: flag column is empty or a single '~'
                push(@{$result{statistics}}, {
                    page       => $page,
                    offset     => $offset,
                    size       => $size,
                    value      => $value,
                    normalized => ($flags eq '~') ? 1 : 0,
                    desc       => $desc,
                });
            }
            else
            {
                # Three-letter flag column, one position per flag
                my @flagChars = split('', $flags);

                push(@{$result{statistics}}, {
                    page                    => $page,
                    offset                  => $offset,
                    size                    => $size,
                    value                   => $value,
                    monitored_condition_met => ($flagChars[0] ne '-') ? 1 : 0,
                    supports_dsn            => ($flagChars[1] ne '-') ? 1 : 0,
                    normalized              => ($flagChars[2] ne '-') ? 1 : 0,
                    desc                    => $desc,
                });
            }
        }
        # SMART Attributes Data Structure revision number: 1
        elsif ( $line =~ /SMART Attributes Data Structure revision number: \d+$/ )
        {
            # Don't care for now
        }
        elsif ( $line eq 'Device Statistics (GP/SMART Log 0x04) not supported' )
        {
            # Sad, but ok
        }
        # |_ ~ normalized value
        # |||_ C monitored condition met
        elsif ( $line =~ /^\s+\|+_+\s[CDN~]\s[a-zA-Z\s]+$/ )
        {
            # Device statistics footer (optional)
        }
        else
        {
            return { status => 500, msg => 'Unhandled line in smartctl return' };
        }
    }
    return { status => 100, value => \%result };
}

# Run "smartctl -l error,256" (falling back to "smartctl -l error" when the
# first form fails) and summarise the error log of the disk.
#
# Named parameters:
#   smartDisk - device path given to smartctl
#
# Returns a hashref:
#   { status => 100, value => { logged_error_count, disk_type [, logged_errors] },
#     details => <joined smartctl output> }
#   { status => 500, msg => ... } when both commands fail or an NVMe error
#                                 row does not have nine columns
#   { status => 200, msg => ... } when the report matches no known layout
sub getSmartLoggedError
{
    my %params = @_;
    my $smartDisk = $params{smartDisk};

    my @smartLines = ();
    my $smart_status = 0;

    # Try the extended form first, then the plain one
    foreach my $logSpec ('error,256', 'error')
    {
        my $cmd = "timeout 15 smartctl -l $logSpec $smartDisk 2>/dev/null";
        @smartLines = `$cmd`;
        my $last_status = $? >> 8;
        $smart_status = $last_status & 7;
        $smart_status == 0 and last;
    }
    if( $smart_status != 0 )
    {
        return { status => 500, msg => 'Unable to get smartctl logged errors' };
    }
    my $smartReport = join( "\n", @smartLines );

    my %details = ();
    if ($smartReport =~ /^ATA Error Count:\s*(\d+)/m)
    {
        # SATA with ata error
        $details{logged_error_count} = $1;
        $details{disk_type} = 'ata';
    }
    elsif ($smartReport =~ /^No Errors Logged/m)
    {
        # SATA/NVME without logged error
        $details{logged_error_count} = 0;
        $details{disk_type} = ($smartReport =~ /\(NVMe Log/) ? 'nvme' : 'ata';
    }
    elsif ($smartReport =~ /^Non-medium\s+error\s+count\s*:\s*(\d+)/m)
    {
        # SAS (and probably SCSI)
        $details{logged_error_count} = $1;
        $details{disk_type} = 'sas';
    }
    elsif ($smartReport =~ /\(NVMe Log/)
    {
        # "No Errors Logged" flag is not present, error have been logged
        my ($filtered) = $smartReport =~ /Num\s+ErrCount\s+SQId\s+CmdId\s+Status\s+PELoc\s+LBA\s+NSID\s+VS\n(.+)$/s;

        # ... (17 entries not shown)
        if (defined($filtered) and ($filtered =~ /^(.+)\n\.{3} \(\d+ entries not shown\)(?:\r?\n)*$/s))
        {
            $filtered = $1;
        }
        assert(defined($filtered) and ($filtered ne ''));

        $details{logged_error_count} = 0;
        $details{disk_type} = 'nvme';
        $details{logged_errors} = [];

        foreach my $line (split(/[\n\r]+/, $filtered))
        {
            $line =~ s/^\s+//;
            $line =~ s/\s+$//;

            # 0 1 0 0x0000 0x4212 0x028 0 255 -
            my @elems = split(/\s+/, $line);
            @elems == 9 or return {status => 500, msg => 'Failed to parse "smartctl -l" return'};

            push(@{$details{logged_errors}}, {
                id        => $elems[0],
                err_count => $elems[1],
                sq_id     => $elems[2],
                cmd_id    => $elems[3],
                status    => $elems[4],
                pe_loc    => $elems[5],
                lba       => $elems[6],
                nsid      => $elems[7],
                vs        => $elems[8],
            });
            $details{logged_error_count} += 1;
        }
        assert($details{logged_error_count} > 0);
    }
    else
    {
        return { status => 200, msg => 'Unhandled smartct -l error return' };
    }
    return { status => 100, value => \%details, details => $smartReport };
}

# Run "smartctl -l sataphy" and return the SATA Phy event counters.
#
# Named parameters:
#   smartDisk - device path given to smartctl. A path containing
#               dev/nvme<X>n<Y> is rewritten to /dev/nvme<X>.
#
# Returns a hashref:
#   { status => 100, value => [ { id, size, value, desc }, ... ] }
#   { status => 500, msg => ... } when the command fails or a line is not
#                                 recognised
sub getSataPhyErrorCounters
{
    my %params = @_;
    my $smartDisk = $params{smartDisk};

    if ($smartDisk =~ /dev\/nvme(\d+)n(\d+)/)
    {
        $smartDisk = "/dev/nvme".$1;
    }
    my $cmd = "timeout 15 smartctl -l sataphy $smartDisk 2>/dev/null";
    my @smartLines = `$cmd`;
    my $last_status = $? >> 8;
    my $smart_status = $last_status & 7;
    if( $smart_status != 0 )
    {
        return { status => 500, msg => 'Unable to get smart phy error counters' };
    }
    my $smartReport = join( "\n", @smartLines );

    my @counters = ();
    foreach my $line (split(/[\n\r]+/, $smartReport))
    {
        $line =~ s/\s+$//;
        $line eq '' and next;

        # Header lines are only accepted before the first counter row
        if (($line =~ /^(smartctl|Copyright|SATA Phy|ID\s+Size)/) and !@counters)
        {
            # Header
        }
        elsif ($line =~ /^(0x[0-9a-f]{4})\s+(\d+)\s+(\d+)\s+(.+)$/)
        {
            push(@counters, {
                id    => $1,
                size  => $2,
                value => $3,
                desc  => $4,
            });
        }
        else
        {
            return { status => 500, msg => 'Unhandled line in smartctl return' };
        }
    }
    return { status => 100, value => \@counters };
}

# Run "lsscsi -tg" and map every SAS disk that has an sd device to its
# SAS address and sg device.
#
# Returns a hashref:
#   { status => 100, value => { '/dev/sdX' => { sasAddress, sdDrive, sgDrive } } }
#   { status => 500, msg => ... } when lsscsi fails (the message is also printed)
sub getSgPaths
{
    my $fnret = execute("lsscsi -tg 2>/dev/null");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "Unable to gather sg paths ".$fnret->{msg}};
    }

    my %drives;
    foreach my $line ( @{$fnret->{value}} )
    {
        if ( $line =~ /
            disk\s+
            sas:0x([0-9a-f]+)\s+
            (\/dev\/sd[a-z]+|-)\s+
            (\/dev\/sg\d+|)
        /x)
        {
            # Disks without an sd device are listed with '-', skip them
            ( $2 eq '-' ) and next;
            $drives{$2} = {
                sasAddress => $1,
                sdDrive    => $2,
                sgDrive    => $3,
            };
        }
    }
    return { status => 100, value => \%drives };
}

# ##################################
# Sg logs subs

# Run "sg_logs -x" on a device and list the log pages it reports.
#
# Named parameters:
#   devPath - device path given to sg_logs
#
# Returns a hashref:
#   { status => 100, value => [ { code => '0xNN', desc => ... }, ... ] }
#   { status => 500, msg => ... } when sg_logs fails (the message is also
#                                 printed) or a line is not recognised
sub getSupportedLogPages
{
    my %params = @_;
    my $devPath = $params{devPath};

    my $fnret = execute("sg_logs -x $devPath 2>/dev/null");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "Unable to get sg logs pages ".$fnret->{msg}};
    }

    my @lines = @{$fnret->{value}};
    my @pages = ();
    foreach my $i ( 0 .. $#lines )
    {
        my $line = $lines[$i];
        $line =~ s/^\s+$//;
        # Supported log pages [0x0]:
        if ( $i == 0 )
        {
            # Page name
        }
        elsif ( $line eq '' )
        {
            # Blank line, nothing to parse
        }
        # 0x00 Supported log pages
        # 0x0d Temperature
        elsif ( $line =~ /^\s{4}(0x[\da-f]{2})\s+(.+)$/ )
        {
            push(@pages, {code => $1, desc => $2});
        }
        else
        {
            return { status => 500, msg => 'Unhandled sg_logs return' };
        }
    }
    return { status => 100, value => \@pages };
}

# Parse the lines printed by "sg_logs -x --page <page>" into a hash.
#
# Named parameters:
#   lines       - arrayref of output lines
#   page        - page code, only used in the duplicated-header message
#   stopOnValue - optional regex; parsing stops at the first line matching it
#
# Top-level "name = value" / "name: value" lines become plain entries, a
# "name:" line opens a category whose indented entries are stored in a
# nested hash.
#
# Returns a hashref:
#   { status => 100, value => \%details }
#   { status => 200, msg => ... } when a second page header is met
#   { status => 500, msg => ... } when a line is not recognised or no page
#                                 header was seen at all
sub _parseSgLogPage
{
    my %params = @_;
    my @lines = @{ $params{lines} || [] };
    my $page = $params{page};
    my $stopOnValue = $params{stopOnValue};

    my $category = '';
    my %details = ();
    my $headerSeen = 0;

    foreach my $line ( @lines )
    {
        $line =~ s/^\s+$//;
        $line eq '' and next;

        # Read error counter page [0x3]
        if ( substr($line, 0, 1) ne ' ' and $line =~ /\[0x[0-9a-f]{1,2}\]$/ )
        {
            # Header
            $headerSeen and return { status => 200, msg => 'Can not parse specified log page ('.$page.')' };
            $headerSeen++;
        }
        elsif ( defined($stopOnValue) and ($line =~ $stopOnValue) )
        {
            # Stop on value reached, stop here
            last;
        }
        # Total times correction algorithm processed = 1418
        # Percentage used endurance indicator: 2%
        elsif ( $line =~ /^\s{2}([^\s][^=:]+)(?:\s=|:)\s(.+)$/ )
        {
            $details{$1} = $2;
        }
        # Status parameters:
        elsif ( $line =~ /^\s{2}([^\s][^=:]+):$/ )
        {
            $category = $1;
            $details{$category} ||= {};
        }
        # Accumulated power on minutes: 939513 [h:m 15658:33]
        elsif ( $category and $line =~ /^\s{4}([^\s][^=:]+)(?:\s=|:)\s(.+)$/ )
        {
            $details{$category}{$1} = $2;
        }
        else
        {
            return { status => 500, msg => 'Unhandled sg_logs return' };
        }
    }

    # Sanity check
    if ( !$headerSeen )
    {
        return { status => 500, msg => 'sg_logs return may have not been properly handled' };
    }

    return { status => 100, value => \%details };
}

# Fetch one log page with "sg_logs -x --page <page>" and parse it.
#
# Named parameters:
#   devPath     - device path given to sg_logs
#   page        - page code given to sg_logs
#   stopOnValue - optional regex; parsing stops at the first matching line
#
# Returns the result of the page parser, or { status => 500, msg => ... }
# when sg_logs fails (the message is also printed).
sub getGenericLogPage
{
    my %params = @_;
    my $devPath = $params{devPath};
    my $page = $params{page};
    my $stopOnValue = $params{stopOnValue};

    my $fnret = execute("sg_logs -x --page $page $devPath");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "Unable to get sg logs requested page ".$fnret->{msg}};
    }

    return _parseSgLogPage(
        lines       => $fnret->{value},
        page        => $page,
        stopOnValue => $stopOnValue,
    );
}

# Fetch log page 0x15 with sg_logs, parse it up to the first
# "Medium scan parameter" row and derive the power on hours from the
# "Accumulated power on minutes" status parameter.
#
# Named parameters:
#   devPath - device path given to sg_logs
#
# Returns the parsed page; when the status parameter is present,
# value->{'Status parameters'}{'Accumulated power on hours'} holds the hour
# part of its "[h:m H:M]" suffix (undef when the suffix does not match).
sub getBackgroundScanResultsLogPage
{
    my %params = @_;
    my $devPath = $params{devPath};

    my $fnret = execute("sg_logs -x --page 0x15 $devPath 2>/dev/null");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "Unable to get background scan page: ".$fnret->{msg}."\n"};
    }

    my $parsed = _parseSgLogPage(
        lines       => $fnret->{value},
        page        => '0x15',
        # Medium scan parameter # 1 [0x1]
        # Start of scan results, not handled for now
        stopOnValue => qr/^\s{2}Medium scan parameter #\s*(\d+)\s*\[0x[0-9a-f]+\]$/,
    );
    $parsed->{status} == 100 or return $parsed;

    my $details = $parsed->{value};
    my $statusParams = $details->{'Status parameters'};

    # "Status parameters" is only usable when it was parsed as a category
    my $pohLine = (ref($statusParams) eq 'HASH')
        ? $statusParams->{'Accumulated power on minutes'}
        : undef;
    if ( $pohLine )
    {
        # 939513 [h:m 15658:33]
        my ($poh) = $pohLine =~ /^\d+\s\[h:m\s+(\d+):(\d+)\]$/;
        $statusParams->{'Accumulated power on hours'} = $poh;
    }
    return { status => 100, value => $details };
}

# Read the last 15000 lines of "dmesg -T" and keep the disk error lines.
#
# Side effect: sets $server{'rtm.info.check.vm'} / $server{'rtm.info.check.oops'}
# to "True" when an "allocation failed" / "Oops" line is met (%server is
# declared outside this part of the file).
#
# Returns a hashref:
#   { status => 100, value => [ matching lines ] }
#   { status => 500, msg => ... } when the command fails (message also printed)
sub getDmesg
{
    my $fnret = execute('/bin/dmesg -T | tail -n 15000');
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "dmesg error: ".$fnret->{msg}};
    }

    my $dmesg = $fnret->{value};
    # 2 checks
    # check for allocation failed or kernel oops
    my @filtered = ();
    foreach my $line (@{$dmesg})
    {
        chomp $line;
        if ( $line =~ /(I\/O|critical medium) error/
            or $line =~ /Buffer I\/O error on device/
            or $line =~ /Unhandled (error|sense) code/ )
        {
            push @filtered, $line;
        }
        if ($line =~ /allocation failed/i)
        {
            $server{'rtm.info.check.vm'}="True";
        }
        if ($line =~ /Oops/i)
        {
            $server{'rtm.info.check.oops'}="True";
        }
    }
    return { status => 100, value => \@filtered };
}

# Count the lines that report an error for one given disk.
#
# Named parameters:
#   diskName - disk name as it appears in the kernel messages
#   lines    - arrayref of kernel message lines
#
# Returns { status => 100, value => <number of matching lines> }.
sub countDmesgErrors
{
    my %params = @_;
    my $diskName = $params{diskName};
    my @lines = @{ $params{lines} || [] };
    my $counter = 0;

    foreach my $line (@lines)
    {
        # \Q..\E: the disk name is matched literally, not as a pattern
        if ( $line =~ /(I\/O|critical medium) error, dev \Q$diskName\E, sector/
            or $line =~ /Buffer I\/O error on device \Q$diskName\E,/
            or $line =~ /\[\Q$diskName\E\]\s+Unhandled (error|sense) code/ )
        {
            $counter++;
        }
    }
    return { status => 100, value => $counter };
}

# Run "iostat -d -x" on one disk and return its counters keyed by the
# column labels of the report.
#
# Named parameters:
#   diskName - disk name, without the /dev/ prefix
#
# Returns a hashref:
#   { status => 100, value => { <label> => <value>, ... } }
#   { status => 500, msg => ... } when diskName is missing, iostat fails,
#                                 the label or counter line is not found,
#                                 or both lines do not have the same number
#                                 of columns
sub iostatCounters
{
    my %params = @_;
    my $diskName = $params{diskName} || return { status => 500, msg => "Missing diskName" };
    my $devPath = "/dev/".$diskName;

    my $fnret = execute("/usr/bin/iostat -d -x $devPath");
    if ( $fnret->{status} != 100 )
    {
        print $fnret->{msg}." \n";
        return { status => 500, msg => "iostat error: ".$fnret->{msg}};
    }

    my $counterLabelsLine = undef;
    my $countersLine = undef;

    foreach my $line (@{$fnret->{value}})
    {
        chomp $line;
        if( $line =~ /^\s*device(?:\s*:|\s)(.*)$/i )
        {
            $counterLabelsLine = $1;
            $counterLabelsLine =~ s/^\s+//;
            $counterLabelsLine =~ s/\s+$//;
        }
        # \Q..\E: the disk name is matched literally, not as a pattern
        elsif( $line =~ /^\s*\Q$diskName\E\s(.*)$/ )
        {
            $countersLine = $1;
            $countersLine =~ s/^\s+//;
            $countersLine =~ s/\s+$//;
        }
    }

    if( !defined($counterLabelsLine) or !defined($countersLine) )
    {
        return { status => 500, msg => 'Unable to parse iostat' };
    }

    my @fields = split /\s+/, $counterLabelsLine;
    my @values = split /\s+/, $countersLine;

    if( scalar(@fields) != scalar(@values) )
    {
        return { status => 500, msg => 'Unexpected iostat parsing: '.scalar(@fields).' != '.scalar(@values) };
    }

    # Both lists have the same length here: pair each label with its value
    my $counters = {};
    @{$counters}{@fields} = @values;

    return { status => 100, value => $counters };
}

# Gather, from the sg_logs pages of a SAS disk, the same set of counters as
# the one collected through smartctl for SATA disks.
#
# Named parameters:
#   sgDisk    - sg device path given to sg_logs
#   smartDisk - device path given to getSmartCommonInfo
#
# Returns a hashref:
#   { status => 100, value => { "bytes-written" => ..., ... } } merged with
#       the values of getSmartCommonInfo when that call succeeds; counters
#       that could not be read keep their initial value (-1 or undef)
#   { status => 201, msg => ... } when an argument is missing
#   the failing result of getSupportedLogPages or of a page 0x02 / 0x11 /
#       0x15 read
#   { status => 500, msg => ... } when the write, MWI or POH counter is not
#       made of digits only
sub getSmartStatsSAS
{
    my %params = @_;
    my $device = $params{sgDisk} || return { status => 201, msg => 'Missing argument' };
    my $smartDisk = $params{smartDisk} || return { status => 201, msg => 'Missing argument' };

    my $fnret = getSupportedLogPages(devPath => $device);
    ok($fnret) or return $fnret;

    my %supported = map { $_->{code} => 1 } @{$fnret->{value}};

    # Attempt to gather the same subset of information as via smart for sata drives
    my $bytesWritten = undef;
    my $bytesRead = -1;
    my $percentageUsed = undef;
    my $powerOnHours = undef;
    my $linkFailures = -1;
    my $powerCycles = -1;
    my $eccCorrectedErrs = -1;
    my $eccUncorrectedErrs = -1;
    my $reallocSectors = -1;
    my $commandTimeout = -1;
    my $offlineUncorrectable = -1;
    my $temperature = -1;
    my $highestTemperature = -1;
    my $lowestTemperature = -1;
    my $pendingSectors = -1;

    # Write counter
    if ( $supported{'0x02'} )
    {
        my $page = getGenericLogPage(devPath => $device, page => '0x02');
        ok($page) or return $page;

        $bytesWritten = $page->{value}->{'Total bytes processed'};
        ( defined($bytesWritten) and $bytesWritten =~ /^\d+\z/ )
            or return { status => 500, msg => 'Unconsistent SCSI write counter' };

        my $eccErrsU = $page->{value}->{'Total uncorrected errors'};
        my $eccErrsC = $page->{value}->{'Total errors corrected'};

        if( defined($eccErrsU) and $eccErrsU =~ /^\d+\z/ )
        {
            $eccUncorrectedErrs = $eccErrsU;
        }

        if( defined($eccErrsC) and $eccErrsC =~ /^\d+\z/ )
        {
            $eccCorrectedErrs = $eccErrsC;
        }
    }

    # SSD specific page
    if ( $supported{'0x11'} )
    {
        # Note : STEC drives have additional log pages, but not interpreted by sg_logs as of version 1.24 20140523
        # We only care about MWI here, ignore them for now
        my $page = getGenericLogPage(
            devPath     => $device,
            page        => '0x11',
            stopOnValue => qr/^\s{2}Reserved\s\[parameter_code=0x[0-9a-f]{4}\]:$/,
        );
        ok($page) or return $page;

        $percentageUsed = $page->{value}->{'Percentage used endurance indicator'};
        defined($percentageUsed) and $percentageUsed =~ s/%$//;
        ( defined($percentageUsed) and $percentageUsed =~ /^\d+\z/ )
            or return { status => 500, msg => 'Unconsistent SCSI MWI counter' };
    }

    # Power On Hours, hidden in 'Background scan results' page
    if ( $supported{'0x15'} )
    {
        my $page = getBackgroundScanResultsLogPage(devPath => $device);
        ok($page) or return $page;

        my $statusParams = $page->{value}->{'Status parameters'};
        $powerOnHours = (ref($statusParams) eq 'HASH')
            ? $statusParams->{'Accumulated power on hours'}
            : undef;
        ( defined($powerOnHours) and $powerOnHours =~ /^\d+\z/ )
            or return { status => 500, msg => 'Unconsistent SCSI POH counter' };
    }

    # Read counter
    if ( $supported{'0x03'} )
    {
        my $page = getGenericLogPage(devPath => $device, page => '0x03');
        if ( ok($page) )
        {
            my $processed = $page->{value}->{'Total bytes processed'};
            if( defined($processed) and $processed =~ /^\d+\z/ )
            {
                $bytesRead = $processed;
            }

            my $eccErrsU = $page->{value}->{'Total uncorrected errors'};
            my $eccErrsC = $page->{value}->{'Total errors corrected'};

            # Read errors are added to the write errors when both are known
            if( defined($eccErrsU) and $eccErrsU =~ /^\d+\z/ )
            {
                if( $eccUncorrectedErrs == -1 )
                {
                    $eccUncorrectedErrs = $eccErrsU;
                }
                else
                {
                    $eccUncorrectedErrs += $eccErrsU;
                }
            }

            if( defined($eccErrsC) and $eccErrsC =~ /^\d+\z/ )
            {
                if( $eccCorrectedErrs == -1 )
                {
                    $eccCorrectedErrs = $eccErrsC;
                }
                else
                {
                    $eccCorrectedErrs += $eccErrsC;
                }
            }
        }
    }

    # Power cycle count
    if ( $supported{'0x0e'} )
    {
        my $page = getGenericLogPage(devPath => $device, page => '0x0e');
        if ( ok($page) )
        {
            $powerCycles = $page->{value}->{'Accumulated start-stop cycles'};
            if( !defined($powerCycles) or $powerCycles !~ /^\d+\z/ )
            {
                $powerCycles = -1;
            }
        }
    }

    # Link failure errors
    if ( $supported{'0x06'} )
    {
        my $page = getGenericLogPage(devPath => $device, page => '0x06');
        if ( ok($page) )
        {
            $linkFailures = $page->{value}->{'Non-medium error count'};
            if( !defined($linkFailures) or $linkFailures !~ /^\d+\z/ )
            {
                $linkFailures = -1;
            }
        }
    }

    # Temperature
    if ( $supported{'0x0d'} )
    {
        my $page = getGenericLogPage(devPath => $device, page => '0x0d');
        if ( ok($page) )
        {
            $temperature = $page->{value}->{'Current temperature'};
            if( defined($temperature) and $temperature =~ /^(\d+)\s*C/ )
            {
                $temperature = $1;
            }
            else
            {
                $temperature = -1;
            }
        }
    }

    my %commonInfo = ();
    $fnret = getSmartCommonInfo(smartDisk => $smartDisk);
    if( ok($fnret) )
    {
        %commonInfo = %{$fnret->{value}};
    }

    return {
        status => 100,
        value => {
            "bytes-written"            => $bytesWritten,
            "bytes-read"               => $bytesRead,
            "percentage-used"          => $percentageUsed || 0,
            "power-on-hours"           => $powerOnHours,
            "power-cycles"             => $powerCycles,
            "reported-corrected"       => $eccCorrectedErrs,
            "reported-uncorrect"       => $eccUncorrectedErrs,
            "reallocated-sector-count" => $reallocSectors,
            "current-pending-sector"   => $pendingSectors,
            "offline-uncorrectable"    => $offlineUncorrectable,
            "command-timeout"          => $commandTimeout,
            "link-failures"            => $linkFailures,
            "temperature"              => $temperature,
            "highest-temperature"      => $highestTemperature,
            "lowest-temperature"       => $lowestTemperature,
            %commonInfo
        },
    };
}

# Run "smartctl -A" on an NVMe disk and return the "name: value" lines of
# its SMART data section as a hash.
#
# Named parameters:
#   smartDisk - disk name or path; anything containing nvme<X>n<Y> is
#               rewritten to /dev/nvme<X>
#
# Returns a hashref:
#   { status => 100, value => { <name> => <raw value string>, ... } }
#   { status => 201, msg => ... } when smartDisk is missing
#   { status => 500, msg => ... } when the command fails, the data section
#                                 marker is never met, or a line inside the
#                                 section is not recognised
sub getNvmeSmartStatistics
{
    my %params = @_;
    my $smartDisk = $params{smartDisk} || return { status => 201, msg => "Missing smartDisk param" };
    if ($smartDisk =~ /nvme(\d+)n(\d+)/)
    {
        $smartDisk = "/dev/nvme".$1;
    }
    my $cmd = "timeout 15 smartctl -A $smartDisk 2>/dev/null";
    my @smartLines = `$cmd`;
    my $last_status = $? >> 8;
    my $smart_status = $last_status & 7;
    if( $smart_status != 0 )
    {
        return { status => 500, msg => 'Unable to get smartctl info for nvme disk '.$smartDisk };
    }

    my %result = ();
    # Becomes true once the data section marker has been met
    my $in = 0;

    foreach my $line (@smartLines)
    {
        $line =~ s/\s+$//g;
        $line =~ s/^\s+//g;
        $line eq '' and next;

        if ($line eq '=== START OF SMART DATA SECTION ===')
        {
            $in++;
        }
        # SMART/Health Information (NVMe Log 0x02, NSID 0xffffffff)
        # SMART/Health Information (NVMe Log 0x02)
        elsif ($line =~ /^SMART\/Health Information \(NVMe Log 0x02(?:, NSID (0x[a-f0-9]+))?\)$/)
        {
            # Not used
        }
        # Temperature: 45 Celsius
        # Power On Hours: 7,262
        elsif ($in and $line =~ /^([^:]+):\s+([^\s].*)$/)
        {
            $result{$1} = $2;
        }
        elsif (!$in)
        {
            # Header
        }
        else
        {
            Logger::debug($line);
            return { status => 500, msg => 'Unhandled line in smartctl return' };
        }
    }

    $in or return {
        status => 500,
        msg    => 'Failed to parse smartctl return',
    };

    return { status => 100, value => \%result };
}

# Turn a raw NVMe counter such as "7,262" into a plain integer string.
# Returns -1 when the value is undefined or, once the thousands separators
# are removed, is not made of digits only.
sub _nvmeCounter
{
    my ($raw) = @_;
    defined($raw) or return -1;
    $raw =~ s/,//g;
    return ($raw =~ /^\d+$/) ? $raw : -1;
}

# Build, for an NVMe disk, the same set of counters as the one returned for
# SATA and SAS disks, plus two NVMe specific entries.
#
# Named parameters:
#   smartDisk - disk name or path given to getNvmeSmartStatistics and
#               getSmartCommonInfo
#
# Returns a hashref:
#   { status => 100, value => { "bytes-written" => ..., ...,
#                               "critical-warning" => ..., "unsafe-shutdowns" => ... } }
#       merged with the values of getSmartCommonInfo when that call succeeds;
#       counters that could not be read keep their initial value (-1 or undef)
#   { status => 201, msg => ... } when smartDisk is missing
#   the failing result of getNvmeSmartStatistics
#   { status => 500, msg => ... } when the write, MWI or POH counter is not
#       made of digits only
sub getSmartStatsNvme {
    my %params = @_;
    my $smartDisk = $params{smartDisk} || return { status => 201, msg => 'Missing argument' };

    my $bytesWritten = undef;
    my $bytesRead = -1;
    my $percentageUsed = undef;
    my $powerOnHours = undef;
    my $linkFailures = -1;
    my $powerCycles = -1;
    my $eccCorrectedErrs = -1;
    my $eccUncorrectedErrs = -1;
    my $reallocSectors = -1;
    my $commandTimeout = -1;
    my $offlineUncorrectable = -1;
    my $temperature = -1;
    my $highestTemperature = -1;
    my $lowestTemperature = -1;
    my $pendingSectors = -1;
    my $unsafeShutdowns = -1;
    my $criticalStatus = -1;

    my $fnret = getNvmeSmartStatistics(smartDisk => $smartDisk);
    ok($fnret) or return $fnret;

    my $smartStats = $fnret->{value};

    if ( defined($smartStats->{'Data Units Written'}) )
    {
        $bytesWritten = $smartStats->{'Data Units Written'};

        # 27,745,697 [14.2 TB]
        # last part is optional when drive is brand new
        $bytesWritten =~ s/\s+\[[^\]]+\]$//;
        $bytesWritten =~ s/,//g;
        $bytesWritten =~ /^\d+\z/ or return {status => 500, msg => 'Unconsistent NVME write counter'};
        $bytesWritten *= (512*1000);
    }

    ## Not mandatory value, so if no value found, leave it at its default
    if (defined($smartStats->{'Data Units Read'}))
    {
        $bytesRead = $smartStats->{'Data Units Read'};

        # 27,745,697 [14.2 TB]
        # last part is optional when drive is brand new
        $bytesRead =~ s/\s+\[[^\]]+\]$//;
        $bytesRead =~ s/,//g;
        if( $bytesRead !~ /^\d+\z/ )
        {
            $bytesRead = -1;
        }
        else
        {
            # According to smartctl source, always 512k and see here too:
            # https://www.seagate.com/www-content/product-content/ssd-fam/nvme-ssd/_shared/docs/100765362c.pdf
            # note, 1000, not 2**10
            $bytesRead *= (512*1000);
        }
    }

    if (defined($smartStats->{'Percentage Used'}))
    {
        $percentageUsed = $smartStats->{'Percentage Used'};
        $percentageUsed =~ s/%$//;
        $percentageUsed =~ /^\d+$/ or return {status => 500, msg => 'Unconsistent NVME MWI counter'};
    }

    if (defined($smartStats->{'Power On Hours'}))
    {
        $powerOnHours = $smartStats->{'Power On Hours'};
        $powerOnHours =~ s/,//g;
        $powerOnHours =~ /^\d+$/ or return {status => 500, msg => 'Unconsistent NVME POH counter'};
    }

    # Optional counters: -1 when absent or not a plain number
    $powerCycles        = _nvmeCounter($smartStats->{'Power Cycles'});
    $eccUncorrectedErrs = _nvmeCounter($smartStats->{'Media and Data Integrity Errors'});
    $unsafeShutdowns    = _nvmeCounter($smartStats->{'Unsafe Shutdowns'});

    if ( defined($smartStats->{'Critical Warning'}))
    {
        $criticalStatus = hex $smartStats->{'Critical Warning'};
    }

    if ( defined($smartStats->{'Temperature'}))
    {
        $temperature = $smartStats->{'Temperature'};
        $temperature =~ s/,//g;
        if( $temperature =~ /(\d+)\s*C/ )
        {
            $temperature = $1;
        }
        else
        {
            $temperature = -1;
        }
    }

    my %commonInfo = ();
    $fnret = getSmartCommonInfo(smartDisk => $smartDisk);
    if( ok($fnret) )
    {
        %commonInfo = %{$fnret->{value}};
    }

    return {
        status => 100,
        value => {
            "bytes-written"            => $bytesWritten,
            "bytes-read"               => $bytesRead,
            "percentage-used"          => $percentageUsed || 0,
            "power-on-hours"           => $powerOnHours,
            "power-cycles"             => $powerCycles,
            "reported-corrected"       => $eccCorrectedErrs,
            "reported-uncorrect"       => $eccUncorrectedErrs,
            "reallocated-sector-count" => $reallocSectors,
            "current-pending-sector"   => $pendingSectors,
            "offline-uncorrectable"    => $offlineUncorrectable,
            "command-timeout"          => $commandTimeout,
            "link-failures"            => $linkFailures,
            "temperature"              => $temperature,
            "highest-temperature"      => $highestTemperature,
            "lowest-temperature"       => $lowestTemperature,
            #"logged-error-count" => $loggedErrorCount,
            #"global-health" => $health,
            #rawReport => $rawReport,
            %commonInfo,
            # specific to nvme
            "critical-warning"         => $criticalStatus,
            "unsafe-shutdowns"         => $unsafeShutdowns
        },
    };
}

# Tell whether a result hash denotes success.
#
# Returns 1 when the argument is a hashref whose status is 100, 0 otherwise.
# Side effect: the msg of a status 500 result is printed.
sub ok
{
    my $arg = shift;
    ( ref $arg eq 'HASH' and defined($arg->{status}) ) or return 0;

    if ( $arg->{status} eq 100 )
    {
        return 1;
    }
    if ( $arg->{status} eq 500 and defined($arg->{msg}) )
    {
        print $arg->{msg};
    }
    return 0;
}

# Run a command through IPC::Open3 and collect its output; the standard
# error of the command is read from the same handle as its standard output.
#
# Parameters: the command, optionally followed by its arguments.
#
# Returns a hashref:
#   { status => 100, value => [ output lines, without newline ] }
#   { status => 201, msg => ... } when no command is given
#   { status => 500, msg => ... [, value => <raw output>] } when the command
#       cannot be started, waitpid does not return its pid, or it exits on
#       a non-zero status
sub execute
{
    my ($bin, @args) = @_;
    defined($bin) or return { status => 201, msg => 'No binary specified (execute)' };

    my ($in, $out, $pid);
    eval { $pid = IPC::Open3::open3($in, $out, $out, $bin, @args) };
    warn $@ if $@;

    $pid or return { status => 500, msg => 'Failed to fork : '.$! };

    # Slurp mode: read the whole output in one go
    local $/;

    my $stdout = <$out>;
    # Nothing was printed at all: avoid splitting an undefined value
    defined($stdout) or $stdout = '';
    my $ret = waitpid($pid, 0);
    my $status = ($? >> 8);

    close($in);
    close($out);

    if ($ret != $pid)
    {
        return { status => 500, msg => 'Invalid fork return (waitpid)', value => $stdout };
    }
    if ($status != 0)
    {
        return { status => 500, msg => "Binary ".$bin." exited on a non-zero status (".$status.")\n", value => $stdout };
    }

    my @stdout = split(/\n/, $stdout);
    return { status => 100, value => \@stdout };
}

# Walk a nested hash depth first and call $callback on every value that is
# not itself a hashref.
#
# Parameters:
#   $hash     - hashref to walk
#   $key_list - arrayref used as the stack of keys leading to the current
#               value (keys are stripped of leading/trailing whitespace)
#   $callback - called as $callback->($key, $value, $key_list)
sub hash_walk {
    my ($hash, $key_list, $callback) = @_;
    while (my ($key, $value) = each (%$hash))
    {
        $key =~ s/^\s+|\s+$//g;
        push @$key_list, $key;
        if (ref($value) eq 'HASH')
        {
            hash_walk($value, $key_list, $callback);
        }
        else
        {
            $callback->($key, $value, $key_list);
        }
        pop @$key_list;
    }
}

# Escape a string so it can be placed between double quotes in JSON:
# backslash, double quote, newline, carriage return and tab are escaped.
sub _jsonEscape
{
    my ($text) = @_;
    $text =~ s/([\\"])/\\$1/g;
    $text =~ s/\n/\\n/g;
    $text =~ s/\r/\\r/g;
    $text =~ s/\t/\\t/g;
    return $text;
}

# hash_walk callback: print one value as a JSON line
#   {"metric":"<dotted key path>","timestamp":<time>,"value":"<value>"}
#
# Parameters:
#   $k        - key of the value (unused, the path comes from $key_list)
#   $v        - value; nothing is printed when it is undefined
#   $key_list - arrayref of the keys leading to the value; nothing is
#               printed when it is empty
sub print_keys_and_value {
    my ($k, $v, $key_list) = @_;
    defined($v) or return;

    $v =~ s/^\s+|\s+$//g;

    # Join the path with dots. As long as the accumulated prefix is false
    # (undef, '' or '0') the next component replaces it instead of being
    # appended to it.
    my $key;
    foreach my $part (@$key_list)
    {
        $key = $key ? $key.".".$part : $part;
    }
    defined($key) or return;

    $key = _jsonEscape($key);
    $v   = _jsonEscape($v);
    print "{\"metric\":\"$key\",\"timestamp\":".time.",\"value\":\"".$v."\"}\n";
    return;
}

--------------------------------------------------------------------------------