├── README.md ├── hwcheck ├── hwcheck.cron ├── hwcheck.spec └── hwinfo /README.md: -------------------------------------------------------------------------------- 1 | # 硬件状态监控插件脚本 2 | 3 | 使用dell的srvadmin工具等组件实现硬件监控,需要安装falcon-agent 4 | 5 | 仅支持dell物理机,可以监控的指标有: 6 | 7 | cpu 内存 阵列卡 物理磁盘 虚拟磁盘 阵列卡电池 BIOS 主板电池 风扇 电压 主板温度 cpu温度 8 | 9 | # 如何安装 10 | 11 | 1. 配置dell官方repo,安装srvadmin等依赖包 12 | 13 | ``` 14 | #参考: http://linux.dell.com/repo/hardware/latest/ 15 | wget -q -O - http://linux.dell.com/repo/hardware/latest/bootstrap.cgi | bash 16 | 17 | yum install srvadmin-omacore srvadmin-omcommon srvadmin-storage-cli smbios-utils-bin lm_sensors dmidecode cronie 18 | # 启动srvadmin服务 19 | /opt/dell/srvadmin/sbin/srvadmin-services.sh enable 20 | /opt/dell/srvadmin/sbin/srvadmin-services.sh restart 21 | # 配置lm-sensors 22 | echo yes | /usr/sbin/sensors-detect 23 | ``` 24 | 25 | ## 你也可以打包rpm来简化部署 26 | 27 | ``` 28 | git clone https://github.com/51web/hwcheck hwcheck-0.2 29 | tar czf hwcheck-0.2.tar.gz hwcheck-0.2 30 | rpmbuild -tb hwcheck-0.2.tar.gz 31 | ``` 32 | 33 | 34 | # 如何使用 35 | 36 | ## 参数说明 37 | 38 | 直接执行hwcheck不带参数默认会打印出详细的监控数据 39 | 40 | ``` 41 | hwcheck -d # 打印metrics信息,即是push到falcon-agent的数据 42 | -p # push数据到falcon-agent 43 | -s # 设置push数据中的STEP数值,表示监控频率,默认值是600秒 44 | -m # 指定单个metric 45 | ``` 46 | 47 | ## 配置crontab 48 | 49 | 配置cron来定期检测,如: 50 | 51 | ``` 52 | cat /etc/cron.d/hwcheck 53 | PATH=/sbin:/bin:/usr/sbin:/usr/bin:/opt/dell/srvadmin/sbin:/opt/dell/srvadmin/bin 54 | SHELL=/bin/bash 55 | 56 | 18 * * * * root /usr/bin/hwcheck -s 3600 -p >/dev/null 2>&1 & 57 | ``` 58 | 59 | 表示每个小时执行一次检测,相应的STEP值被设置为3600 60 | 61 | 62 | ## falcon-portal中配置报警策略 63 | 64 | hwcheck push到falcon-agent的metric均以 hw 打头,如hw.cpu_temp,除温度是实际的数值外, 65 | 66 | 其他metric的value中 0表示故障,1表示警告,2表示OK,例如在portal中配置如下策略: 67 | 68 | | metric/tags/note | condition | max | P | 69 | ------------------------------ | --------- | ----- | --- | 70 | | hw.bios [BIOS中C1E/Cstate未禁用] | all(#2)<2 | 1 | 4 | 71 | | hw.board_temp [主板温度过高] | all(#3)>=35 | 1 | 4 | 72 | | hw.cmos_bat [主板电池有问题] | all(#3)<2 |1 | 4 | 73 | | hw.cpu [CPU可能故障] | all(#2)==1 | 1 | 4 | 74 | | hw.cpu [严重: CPU严重故障] | all(#2)==0 | 2 | 0 | 75 | | hw.fan [风扇出现故障] | all(#3)<2 | 1 | 4 | 76 | | hw.memory [内存可能故障] | all(#1)==1 | 1 | 4 | 77 | | hw.memory [严重: 内存严重故障] | all(#1)==0 | 2 | 0 | 78 | | hw.pdisk [严重: 物理盘严重故障] | all(#1)==0 | 2 | 0 | 79 | | hw.raidcard [阵列卡出现警告] | all(#2)==1 | 1 | 4 | 80 | | hw.raidcard [严重: 阵列卡严重故障] | all(#1)==0 | 2 | 0 | 81 | | hw.raidcard_bat [阵列卡电池出现警告] | all(#2)==1 | 1 | 4 | 82 | | hw.raidcard_bat [严重: 阵列卡电池严重故障] | all(#2)==0 | 2 | 0 | 83 | | hw.vdisk [磁盘阵列出现警告] | all(#2)==1 | 1 | 4 | 84 | | hw.vdisk [严重: 磁盘阵列严重故障] | all(#2)==0 | 2 | 0 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /hwcheck: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """A wrapper script with srvadmin and other tools for hardware monitor. 3 | 4 | Supported metrics: 5 | cpu memory raidcard pdisk vdisk raidcard_bat 6 | bios cmos_bat fan power board_temp cpu_temp 7 | 8 | """ 9 | 10 | import subprocess 11 | import json 12 | import time 13 | import socket 14 | import urllib2 15 | from optparse import OptionParser 16 | 17 | host = socket.gethostname() 18 | messages = [] 19 | verbs = [] 20 | 21 | def addverb(metric, model, index, status, info): 22 | m = {} 23 | m['metric'] = metric 24 | m['model'] = model 25 | m['index'] = index 26 | m['status'] = status 27 | m['info'] = info 28 | verbs.append(m) 29 | 30 | def addmsg(metric, value): 31 | m = {} 32 | m['metric'] = 'hw.%s' % metric 33 | m['endpoint'] = host 34 | m['tags'] = '' 35 | m['value'] = value 36 | m['timestamp'] = int(time.time()) 37 | m['step'] = int(step) 38 | m['counterType'] = 'GAUGE' 39 | messages.append(m) 40 | 41 | def map_value(state): 42 | statemap = {0:['crit', 'critical'], 43 | 1:['warn', 'warning', 'non-critical'], 44 | 2:['ok', 'ready'] 45 | } 46 | for i in statemap: 47 | if state.lower() in statemap[i]: 48 | return i 49 | 50 | def execute(cmd): 51 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) 52 | return p.communicate() 53 | 54 | 55 | # cpu 56 | def check_cpu(): 57 | cmd = 'omreport chassis processors -fmt ssv' 58 | stdout, stderr = execute(cmd) 59 | cpus = [cpu for cpu in stdout.splitlines() if 'CPU' in cpu] 60 | value = 2 61 | for line in cpus: 62 | i = line.split(';') 63 | Index = i[0].strip().lower() 64 | Status = i[1].strip().lower() 65 | Connector_Name = i[2].strip().lower() 66 | Processor_Brand = i[3].strip().lower() 67 | Processor_Version = i[4].strip().lower() 68 | Current_Speed = i[5].strip().lower() 69 | State = i[6].strip().lower() 70 | Core_Count = i[7].strip().lower() 71 | try: 72 | model = Processor_Brand.split()[3] 73 | except: 74 | model = Processor_Brand.split()[0] 75 | v = map_value(Status) 76 | if v < value: 77 | value = v 78 | addverb('cpu', model, Connector_Name, Status, State) 79 | 80 | addmsg('cpu', value) 81 | 82 | 83 | # memory 84 | def check_memory(): 85 | cmd = 'omreport chassis memory -fmt ssv' 86 | stdout, stderr = execute(cmd) 87 | mems = [mem for mem in stdout.splitlines() if 'DIMM' in mem] 88 | value = 2 89 | for line in mems: 90 | i = line.split(';') 91 | # TODO make sure index here is uniq 92 | Index = i[0].strip() 93 | Status = i[1].strip().lower() 94 | Connector_Name = i[2].strip().lower() 95 | Type = i[3].strip().lower() 96 | Size = i[4].strip() 97 | if Status == 'unknown': 98 | continue 99 | index = Connector_Name.lstrip('dimm_') 100 | v = map_value(Status) 101 | if v < value: 102 | value = v 103 | addverb('memory', Type, Connector_Name, Status, Size) 104 | 105 | addmsg('memory', value) 106 | 107 | 108 | # disk raidcard 109 | def check_raidcard(): 110 | cmd = 'omreport storage controller -fmt ssv' 111 | stdout, stderr = execute(cmd) 112 | ctrlers = [c for c in stdout.splitlines() if 'Applicable' in c] 113 | ids = [] 114 | value = 2 115 | if not ctrlers: 116 | return 117 | for line in ctrlers: 118 | i = line.split(';') 119 | ID = i[0].strip() 120 | Status = i[1].strip() 121 | Name = i[2].strip() 122 | Slot_ID = i[3].strip() 123 | State = i[4].strip() 124 | Firmware_Version = i[5].strip() 125 | Latest_Available_Firmware_Version = i[6].strip() 126 | Driver_Version = i[7].strip() 127 | Minimum_Required_Driver_Version = i[8].strip() 128 | Storport_Driver_Version = i[9].strip() 129 | Minimum_Required_Storport_Driver_Version = i[10].strip() 130 | Number_of_Connectors = i[11].strip() 131 | Rebuild_Rate = i[12].strip() 132 | BGI_Rate = i[13].strip() 133 | Check_Consistency_Rate = i[14].strip() 134 | Reconstruct_Rate = i[15].strip() 135 | Alarm_State = i[16].strip() 136 | Cluster_Mode = i[17].strip() 137 | SCSI_Initiator_ID = i[18].strip() 138 | Cache_Memory_Size = i[19].strip() 139 | Patrol_Read_Mode = i[20].strip() 140 | Patrol_Read_State = i[21].strip() 141 | Patrol_Read_Rate = i[22].strip() 142 | Patrol_Read_Iterations = i[23].strip() 143 | Abort_Check_Consistency_on_Error = i[24].strip() 144 | Allow_Revertible_Hot_Spare_and_Replace_Member = i[25].strip() 145 | Load_Balance = i[26].strip() 146 | Auto_Replace_Member_on_Predictive_Failure = i[27].strip() 147 | Redundant_Path_view = i[28].strip() 148 | CacheCade_Capable = i[29].strip() 149 | Persistent_Hot_Spare = i[30].strip() 150 | Encryption_Capable = i[31].strip() 151 | Encryption_Key_Present = i[32].strip() 152 | Encryption_Mode = i[33].strip() 153 | Preserved_Cache = i[34].strip() 154 | if len(i) == 36: 155 | T10_Protection_Information_Capable = i[35].strip() 156 | elif len(i) == 40: 157 | Spin_Down_Unconfigured_Drives = i[35].strip() 158 | Spin_Down_Hot_Spares = i[36].strip() 159 | Spin_Down_Configured_Drives = i[37].strip() 160 | Automatic_Disk_Power_Saving_Idle_C = i[38].strip() 161 | T10_Protection_Information_Capable = i[39].strip() 162 | 163 | v = map_value(Status) 164 | if v < value: 165 | value = v 166 | ids.append(ID) 167 | addverb('raidcard', Name, ID, Status, State) 168 | 169 | addmsg('raidcard', value) 170 | return(ids) 171 | 172 | 173 | # pdisk 174 | def check_pdisk(ctrlers=[0]): 175 | if not ctrlers: 176 | return 177 | value = 2 178 | for cid in ctrlers: 179 | cmd = 'omreport storage pdisk controller=%s -fmt ssv' % cid 180 | stdout, stderr = execute(cmd) 181 | pdisks = [p for p in stdout.splitlines() if 'bytes' in p] 182 | for line in pdisks: 183 | i = line.split(';') 184 | ID = i[0].strip() 185 | Status = i[1].strip() 186 | Name = i[2].strip() 187 | State = i[3].strip() 188 | Power_Status = i[4].strip() 189 | Bus_Protocol = i[5].strip() 190 | Media = i[6].strip() 191 | Part_of_Cache_Pool = i[7].strip() 192 | Remaining_Rated_Write_Endurance = i[8].strip() 193 | Failure_Predicted = i[9].strip() 194 | Revision = i[10].strip() 195 | Driver_Version = i[11].strip() 196 | Model_Number = i[12].strip() 197 | T10_PI_Capable = i[13].strip() 198 | Certified = i[14].strip() 199 | Encryption_Capable = i[15].strip() 200 | Encrypted = i[16].strip() 201 | Progress = i[17].strip() 202 | Mirror_Set_ID = i[18].strip() 203 | Capacity = i[19].strip() 204 | Used_RAID_Disk_Space = i[20].strip() 205 | Available_RAID_Disk_Space = i[21].strip() 206 | Hot_Spare = i[22].strip() 207 | Vendor_ID = i[23].strip() 208 | Product_ID = i[24].strip() 209 | Serial_No = i[25].strip() 210 | Part_Number = i[26].strip() 211 | Negotiated_Speed = i[27].strip() 212 | Capable_Speed = i[28].strip() 213 | PCIe_Maximum_Link_Width = i[29].strip() 214 | PCIe_Negotiated_Link_Width = i[30].strip() 215 | Sector_Size = i[31].strip() 216 | if len(i) > 33: 217 | Device_Write_Cache = i[32].strip() 218 | Manufacture_Day = i[33].strip() 219 | Manufacture_Week = i[34].strip() 220 | Manufacture_Year = i[35].strip() 221 | SAS_Address = i[36].strip() 222 | info = {} 223 | info = {'Bus_Protocol': Bus_Protocol, 'Media': Media, 224 | 'Capacity': Capacity, 'State': State, 225 | 'Vendor_ID': Vendor_ID, 226 | 'Serial_No': Serial_No} 227 | if Progress != 'Not Applicable': 228 | info['Progress'] = Progress 229 | 230 | v = map_value(Status) 231 | if v < value: 232 | value = v 233 | addverb('pdisk', Product_ID, ID, Status, info) 234 | 235 | addmsg('pdisk', value) 236 | 237 | 238 | # vdisk 239 | def check_vdisk(ctrlers=[0]): 240 | if not ctrlers: 241 | return 242 | value = 2 243 | for cid in ctrlers: 244 | cmd = 'omreport storage vdisk controller=%s -fmt ssv' % cid 245 | stdout, stderr = execute(cmd) 246 | vdisks = [v for v in stdout.splitlines() if 'bytes' in v] 247 | for line in vdisks: 248 | i = line.split(';') 249 | ID = i[0].strip() 250 | Status = i[1].strip() 251 | Name = i[2].strip() 252 | State = i[3].strip() 253 | Hot_Spare_Policy_violated = i[4].strip() 254 | if len(i) == 19: 255 | Virtual_Disk_Bad_Blocks = i[5].strip() 256 | Encrypted = i[6].strip() 257 | Layout = i[7].strip() 258 | Size = i[8].strip() 259 | T10_Protection_Information_Status = i[9].strip() 260 | Associated_Fluid_Cache_State = i[10].strip() 261 | Device_Name = i[11].strip() 262 | Bus_Protocol = i[12].strip() 263 | Media = i[13].strip() 264 | Read_Policy = i[14].strip() 265 | Write_Policy = i[15].strip() 266 | Cache_Policy = i[16].strip() 267 | Stripe_Element_Size = i[17].strip() 268 | Disk_Cache_Policy = i[18].strip() 269 | elif len(i) == 18: 270 | Encrypted = i[5].strip() 271 | Layout = i[6].strip() 272 | Size = i[7].strip() 273 | T10_Protection_Information_Status = i[8].strip() 274 | Associated_Fluid_Cache_State = i[9].strip() 275 | Device_Name = i[10].strip() 276 | Bus_Protocol = i[11].strip() 277 | Media = i[12].strip() 278 | Read_Policy = i[13].strip() 279 | Write_Policy = i[14].strip() 280 | Cache_Policy = i[15].strip() 281 | Stripe_Element_Size = i[16].strip() 282 | Disk_Cache_Policy = i[17].strip() 283 | elif len(i) == 16: 284 | Encrypted = i[5].strip() 285 | Layout = i[6].strip() 286 | Size = i[7].strip() 287 | Device_Name = i[8].strip() 288 | Bus_Protocol = i[9].strip() 289 | Media = i[10].strip() 290 | Read_Policy = i[11].strip() 291 | Write_Policy = i[12].strip() 292 | Cache_Policy = i[13].strip() 293 | Stripe_Element_Size = i[14].strip() 294 | Disk_Cache_Policy = i[15].strip() 295 | info = {} 296 | info = {'Bus_Protocol': Bus_Protocol, 'Media': Media, 297 | 'Device_Name': Device_Name, 'Size': Size, 'State': State} 298 | if len(i) == 19: 299 | info['Virtual_Disk_Bad_Blocks'] = Virtual_Disk_Bad_Blocks 300 | 301 | v = map_value(Status) 302 | if v < value: 303 | value = v 304 | addverb('vdisk', Layout, ID, Status, info) 305 | 306 | addmsg('vdisk', value) 307 | 308 | # raidcard battery 309 | def check_raidcard_bat(): 310 | cmd = 'omreport storage battery -fmt ssv' 311 | stdout, stderr = execute(cmd) 312 | batteries = [bat for bat in stdout.splitlines() if 'Battery' in bat] 313 | if not batteries: 314 | return 315 | value = 2 316 | for line in batteries: 317 | i = line.split(';') 318 | ID = i[0].strip() 319 | Status = i[1].strip() 320 | Name = i[2].strip() 321 | State = i[3].strip() 322 | Recharge_Count = i[4].strip() 323 | Max_Recharge_Count = i[5].strip() 324 | Learn_State = i[6].strip() 325 | Next_Learn_Time = i[7].strip() 326 | Maximum_Learn_Delay = i[8].strip() 327 | try: 328 | Learn_Mode = i[9].strip() 329 | except: 330 | Learn_Mode = False 331 | 332 | v = map_value(Status) 333 | if v < value: 334 | value = v 335 | addverb('raidcard_bat', Name, ID, Status, Learn_State) 336 | 337 | addmsg('raidcard_bat', value) 338 | 339 | # bios 340 | def check_bios(): 341 | cmd = 'omreport chassis biossetup -fmt ssv' 342 | stdout, stderr = execute(cmd) 343 | bsets = [b for b in stdout.splitlines() if 'C State' in b or 'C1-E' in b or 344 | 'C1E' in b] 345 | if not bsets: 346 | return 347 | value = 2 348 | for line in bsets: 349 | i = line.split(';') 350 | ATTRIBUTE = i[0].strip().lower() 351 | if 'c state' in ATTRIBUTE: 352 | index = 'cstate' 353 | else: 354 | index = 'c1e' 355 | VALUE = i[1].strip() 356 | if VALUE == 'Enabled': 357 | Status = 'warn' 358 | elif VALUE == 'Disabled': 359 | Status = 'ok' 360 | else: 361 | continue 362 | v = map_value(Status) 363 | if v < value: 364 | value = v 365 | addverb('bios', "bios_setting", ATTRIBUTE, Status, VALUE) 366 | 367 | addmsg('bios', value) 368 | 369 | 370 | # cmos battery 371 | def check_cmos_bat(): 372 | cmd = 'omreport chassis batteries -fmt ssv' 373 | stdout, stderr = execute(cmd) 374 | bats = [battery for battery in stdout.splitlines() if 'CMOS' in battery] 375 | if not bats: 376 | return 377 | value = 2 378 | for line in bats: 379 | i = line.split(';') 380 | Index = i[0].strip() 381 | Status = i[1].strip() 382 | Probe_Name = i[2].strip() 383 | Reading = i[3].strip() 384 | 385 | v = map_value(Status) 386 | if v < value: 387 | value = v 388 | addverb('cmos_bat', Probe_Name, Index, Status, Reading) 389 | 390 | addmsg('cmos_bat', value) 391 | 392 | 393 | # fan 394 | def check_fan(): 395 | cmd = 'omreport chassis fans -fmt ssv' 396 | stdout, stderr = execute(cmd) 397 | fans = [fan for fan in stdout.splitlines() if 'RPM' in fan] 398 | if not fans: 399 | return 400 | value = 2 401 | for line in fans: 402 | i = line.split(';') 403 | Index = i[0].strip() 404 | Status = i[1].strip() 405 | Probe_Name = i[2].strip() 406 | Reading = i[3].strip() 407 | Minimum_Warning_Threshold = i[4].strip() 408 | Maximum_Warning_Threshold = i[5].strip() 409 | Minimum_Failure_Threshold = i[6].strip() 410 | Maximum_Failure_Threshold = i[7].strip() 411 | v = map_value(Status) 412 | if v < value: 413 | value = v 414 | addverb('fan', Probe_Name, Index, Status, Reading) 415 | 416 | addmsg('fan', value) 417 | 418 | 419 | # power 420 | def check_power(): 421 | cmd = 'omreport chassis pwrmonitoring -fmt ssv' 422 | stdout, stderr = execute(cmd) 423 | powers = [pwr for pwr in stdout.splitlines() if 'System Board' in pwr] 424 | if not powers: 425 | return 426 | value = 2 427 | for line in powers: 428 | i = line.split(';') 429 | Index = i[0].strip() 430 | Status = i[1].strip() 431 | Probe_Name = i[2].strip() 432 | Reading = i[3].strip() 433 | Warning_Threshold = i[4].strip() 434 | Failure_Threshold = i[5].strip() 435 | v = map_value(Status) 436 | w = Reading.split()[0] 437 | if w > value: 438 | value = w 439 | addverb('power', Probe_Name, Index, Status, Reading) 440 | 441 | addmsg('power', value) 442 | 443 | 444 | # board temp 445 | def check_board_temp(): 446 | cmd = 'omreport chassis temps -fmt ssv' 447 | stdout, stderr = execute(cmd) 448 | temp = [t for t in stdout.splitlines() if 'Board' in t] 449 | if not temp: 450 | return 451 | value = 2 452 | for line in temp: 453 | i = line.split(';') 454 | Index = i[0].strip() 455 | Status = i[1].strip() 456 | Probe_Name = i[2].strip() 457 | Reading = i[3].strip().split()[0] 458 | Minimum_Warning_Threshold = i[4].strip() 459 | Maximum_Warning_Threshold = i[5].strip() 460 | Minimum_Failure_Threshold = i[6].strip() 461 | Maximum_Failure_Threshold = i[7].strip() 462 | v = float(Reading) 463 | if v > value: 464 | value = v 465 | addverb('board_temp', Probe_Name, Index, Status, Reading) 466 | 467 | addmsg('board_temp', value) 468 | 469 | 470 | # cpu temp 471 | def check_cpu_temp(): 472 | cmd = 'sensors' 473 | stdout, stderr = execute(cmd) 474 | lines = stdout.splitlines() 475 | temps = [] 476 | id = False 477 | temp = {} 478 | for line in lines: 479 | if line.startswith('coretemp'): 480 | if line != id: 481 | id = line 482 | temp = {} 483 | value = 0 484 | temp['id'] = id 485 | elif line.startswith('Core'): 486 | lastcore = True 487 | key = line.split(':')[0] 488 | vv = line.split(':')[1].split()[0] 489 | v = vv.split('\xc2\xb0C')[0].split('+')[1] 490 | if float(v) > value: 491 | value = float(v) 492 | temp['core'] = key 493 | temp['reading'] = value 494 | elif line == '' and lastcore: 495 | if len(temp) != 0: 496 | temps.append(temp) 497 | else: 498 | lastcore = False 499 | 500 | value = 2 501 | for temp in temps: 502 | Index = '%d' % temps.index(temp) 503 | Probe_Name = temp['id'] 504 | Reading = temp['reading'] 505 | Maximum_Warning_Threshold = 80 506 | Maximum_Failure_Threshold = 90 507 | if Reading >= Maximum_Failure_Threshold: 508 | Status = 'crit' 509 | elif Reading >= Maximum_Warning_Threshold: 510 | Status = 'warn' 511 | else: 512 | Status = 'ok' 513 | if Reading > value: 514 | value = Reading 515 | addverb('cpu_temp', Probe_Name, Index, Status, Reading) 516 | 517 | addmsg('cpu_temp', value) 518 | 519 | 520 | def check(target=False): 521 | if not target: 522 | check_cpu() 523 | check_memory() 524 | ctrlers = check_raidcard() 525 | check_pdisk(ctrlers=ctrlers) 526 | check_vdisk(ctrlers=ctrlers) 527 | check_raidcard_bat() 528 | check_cmos_bat() 529 | check_bios() 530 | check_fan() 531 | check_power() 532 | check_board_temp() 533 | check_cpu_temp() 534 | elif target == 'cpu': 535 | check_cpu() 536 | elif target == 'memory': 537 | check_memory() 538 | elif target == 'raidcard': 539 | check_raidcard() 540 | elif target == 'pdisk': 541 | c = check_raidcard() 542 | check_pdisk(c) 543 | elif target == 'vdisk': 544 | c = check_raidcard() 545 | check_vdisk(c) 546 | elif target == 'raidcard_bat': 547 | check_raidcard_bat() 548 | elif target == 'cmos_bat': 549 | check_cmos_bat() 550 | elif target == 'bios': 551 | check_bios() 552 | elif target == 'fan': 553 | check_fan() 554 | elif target == 'power': 555 | check_power() 556 | elif target == 'board_temp': 557 | check_board_temp() 558 | elif target == 'cpu_temp': 559 | check_cpu_temp() 560 | 561 | return messages 562 | 563 | def push(message): 564 | try: 565 | urllib2.urlopen( 566 | url = 'http://127.0.0.1:1988/v1/push', 567 | data = json.dumps(message) 568 | ) 569 | except: 570 | pass 571 | 572 | metrics = ['cpu', 'memory', 'raidcard', 'pdisk', 'vdisk', 'raidcard_bat', 573 | 'bios', 'cmos_bat', 'fan', 'power', 'board_temp', 'cpu_temp'] 574 | parser = OptionParser() 575 | parser.add_option("-p", "--push", action="store_true", dest="push", help="push result to agent") 576 | parser.add_option("-d", "--debug", action="store_true", dest="debug", help="output debug info") 577 | parser.add_option("-m", "--metric", action="store", dest="metric", help="check special metric") 578 | parser.add_option("-s", "--step", action="store", dest="step", help="check special metric") 579 | (options, args) = parser.parse_args() 580 | if not options.step: 581 | step = 600 582 | else: 583 | step = int(options.step) 584 | metric=None 585 | if options.metric: 586 | metric = options.metric 587 | if metric not in metrics: 588 | print __doc__ 589 | parser.print_help() 590 | exit(1) 591 | messages = check(target=metric) 592 | if options.push: 593 | push(messages) 594 | else: 595 | if options.debug: 596 | print json.dumps(messages, indent=2) 597 | else: 598 | print json.dumps(verbs, indent=2) 599 | -------------------------------------------------------------------------------- /hwcheck.cron: -------------------------------------------------------------------------------- 1 | PATH=/sbin:/bin:/usr/sbin:/usr/bin:/opt/dell/srvadmin/sbin:/opt/dell/srvadmin/bin 2 | SHELL=/bin/bash 3 | 4 | 18 * * * * root /usr/bin/hwcheck -s 3600 -p >/dev/null 2>&1 & 5 | -------------------------------------------------------------------------------- /hwcheck.spec: -------------------------------------------------------------------------------- 1 | Name: hwcheck 2 | Version: 0.2 3 | Release: 2%{?dist} 4 | Summary: Scirpts for hardware info check and monitor 5 | 6 | Group: Applications/System 7 | License: Apache 8 | URL: http://git.51web.net/os/hwcheck 9 | Source0: hwcheck-%{version}.tar.gz 10 | BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) 11 | 12 | BuildRequires: python 13 | Requires: srvadmin-omacore, srvadmin-omcommon, srvadmin-storage-cli, smbios-utils-bin, lm_sensors, dmidecode, cronie 14 | 15 | %description 16 | Provides scripts for server hardware infomation gathering and monitoring,just 17 | support DELL server now. 18 | 19 | 20 | %prep 21 | %setup -q 22 | 23 | 24 | %build 25 | rm -f *.swp 26 | rm -f *.pyc 27 | 28 | 29 | %install 30 | rm -rf %{buildroot} 31 | install -d %{buildroot}%{_bindir} 32 | install -d %{buildroot}%{_sysconfdir}/cron.d 33 | install -pm 755 hwcheck %{buildroot}%{_bindir}/hwcheck 34 | install -pm 755 hwinfo %{buildroot}%{_bindir}/hwinfo 35 | install -pm 644 hwcheck.cron %{buildroot}%{_sysconfdir}/cron.d/hwcheck 36 | 37 | 38 | %post 39 | if /usr/sbin/dmidecode | grep -iq 'kvm'; then 40 | /opt/dell/srvadmin/sbin/srvadmin-services.sh disable >/dev/null 41 | else 42 | /opt/dell/srvadmin/sbin/srvadmin-services.sh enable >/dev/null 43 | /opt/dell/srvadmin/sbin/srvadmin-services.sh restart >/dev/null 44 | echo yes|/usr/sbin/sensors-detect >/dev/null 45 | fi 46 | 47 | 48 | %clean 49 | rm -rf %{buildroot} 50 | 51 | 52 | %files 53 | %defattr(-,root,root,-) 54 | %{_bindir}/hwcheck 55 | %{_bindir}/hwinfo 56 | %{_sysconfdir}/cron.d/hwcheck 57 | 58 | %changelog 59 | * Sat Jul 18 2015 Gaoyongwei - 0.2-2 60 | - rebuilt 61 | 62 | * Fri Jul 17 2015 Gaoyongwei - 0.2-1 63 | - Update to version 0.2,support open-falcon agent 64 | 65 | * Fri Jun 19 2015 Gaoyongwei - 0.1-1 66 | - First release 67 | -------------------------------------------------------------------------------- /hwinfo: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | # 3 | set -e 4 | pname=$(dmidecode --string system-product-name|awk '{print $2}') 5 | if [[ $pname =~ R7[0-9a-z]{1,} ]]; then 6 | cap="2U" 7 | else 8 | cap="1U" 9 | fi 10 | cpu_model=$( dmidecode -t 4 | awk '/Version:/ {print $5}'|tail -1) 11 | cpu_core=$(dmidecode -t 4 | awk '/Core Count:/ {print $3}'|tail -1) 12 | cpu_thread=$(dmidecode -t 4 | awk '/Thread Count:/ {print $3}'|tail -1) 13 | cpu_count=$(dmidecode -t 4 | grep -c 'Processor Information') 14 | 15 | mem_vendor=$(dmidecode -t memory|grep 'Part Number'|grep -v 'ModulePartNumber'|sort|uniq|awk -F':' '/: [a-zA-Z0-9]/ {gsub(/[[:blank:]]*/,"",$2);print $2}'|sed 'N;s/\n/,/') 16 | mem_size=$(dmidecode -t 17 | grep Size: | grep -v 'No Module Installed'|tail -1|awk '{print $2/1024}')G 17 | mem_count=$(dmidecode -t 17 | grep 'Size:' | grep -vc 'Size: No Module Installed') 18 | 19 | 20 | get_disk_name() { 21 | local disk_mp="$1" 22 | local disk_name 23 | disk_name=$(df -P | grep "$disk_mp"|awk '{print $1}') 24 | if [[ $disk_name =~ /dev/mapper/* ]]; then 25 | vgname=$(echo $disk_name|awk -F'/' '{print $4}'|awk -F'-' '{print $1}') 26 | disk_name=$(echo $(pvs|awk "/$vgname/ {print \$1}")|sed 's/[0-9]//') 27 | else 28 | disk_name=$(echo $disk_name|sed 's/[0-9]//') 29 | fi 30 | echo $disk_name 31 | } 32 | 33 | get_disk_info() { 34 | local disk_name="$1" 35 | local vgname controllers c disk_raid disk_c disk_vdisk disk_type pdisk_size pdisk_num 36 | controllers=$(omreport storage controller -fmt ssv|grep '^[0-9]'|awk -F';' '{print $1}') 37 | for c in $controllers; do 38 | disk_raid=$(omreport storage vdisk controller=$c -fmt ssv|grep "$disk_name"|awk -F';' '{print $7}') 39 | if [[ -n $disk_raid ]]; then 40 | disk_c=$c 41 | disk_vdisk=$(omreport storage vdisk controller=$disk_c -fmt ssv|grep "$disk_name"|awk -F';' '{print $1}') 42 | break 43 | fi 44 | done 45 | 46 | disk_basename=$(basename $disk_name) 47 | if [[ -n $disk_raid ]]; then 48 | disk_type=$(omreport storage vdisk controller=$disk_c vdisk=$disk_vdisk -fmt ssv|grep "$disk_name"|awk -F';' '{print $12}') 49 | pdisk_size=$(omreport storage pdisk controller=$disk_c vdisk=$disk_vdisk -fmt ssv|grep '[0-9]:[0-9]'|egrep -o "[0-9.,]{1,} GB"|head -1|sed 's/,//'|awk -F'.' '{print $1}') 50 | pdisk_num=$(omreport storage pdisk controller=$disk_c vdisk=$disk_vdisk -fmt ssv|grep -c '[0-9]:[0-9]') 51 | else 52 | disk_raid="noraid" 53 | disk_id=$(ls -l /dev/disk/by-path/|grep -w "../../$disk_basename"|awk '{print $9}'|awk -F'0x' '{print $2}'|awk -F'-' '{print $1}') 54 | disk_type=$(omreport storage pdisk controller=0 -fmt ssv|grep -i $disk_id|awk -F';' '{print $6}') 55 | pdisk_size=$(omreport storage pdisk controller=0 -fmt ssv|grep -i $disk_id|egrep -o "[0-9.,]{1,} GB"|head -1|sed 's/,//'|awk -F'.' '{print $1}') 56 | pdisk_num=1 57 | fi 58 | if [[ $pdisk_size -ge 2700 ]]; then 59 | pdisk_size=3000G 60 | elif [[ $pdisk_size -ge 1800 ]]; then 61 | pdisk_size=2000G 62 | elif [[ $pdisk_size -ge 900 ]]; then 63 | pdisk_size=1000G 64 | elif [[ $pdisk_size -ge 550 ]]; then 65 | pdisk_size=600G 66 | elif [[ $pdisk_size -ge 270 ]]; then 67 | pdisk_size=300G 68 | else 69 | pdisk_size=${pdisk_size}G 70 | fi 71 | echo "$disk_type;$pdisk_size;$pdisk_num;$disk_raid" 72 | } 73 | 74 | sys_disk_name=$(get_disk_name "/$") 75 | sys_disk_info=$(get_disk_info "$sys_disk_name") 76 | sys_disk_type=$(echo $sys_disk_info|awk -F';' '{print $1}') 77 | sys_disk_size=$(echo $sys_disk_info|awk -F';' '{print $2}') 78 | sys_disk_num=$(echo $sys_disk_info|awk -F';' '{print $3}') 79 | sys_disk_raid=$(echo $sys_disk_info|awk -F';' '{print $4}') 80 | 81 | data_disk_name=$(get_disk_name "/chost") 82 | if [[ $data_disk_name = $sys_disk_name ]]; then 83 | data_disk_info=$sys_disk_info 84 | else 85 | data_disk_info=$(get_disk_info "$data_disk_name") 86 | fi 87 | data_disk_type=$(echo $data_disk_info|awk -F';' '{print $1}') 88 | data_disk_size=$(echo $data_disk_info|awk -F';' '{print $2}') 89 | data_disk_num=$(echo $data_disk_info|awk -F';' '{print $3}') 90 | data_disk_raid=$(echo $data_disk_info|awk -F';' '{print $4}') 91 | 92 | back_disk_name=$(get_disk_name "/backup") 93 | if [[ $back_disk_name = $sys_disk_name ]]; then 94 | back_disk_info=$sys_disk_info 95 | elif [[ $back_disk_name = $data_disk_name ]]; then 96 | back_disk_info=$data_disk_info 97 | else 98 | back_disk_info=$(get_disk_info "$back_disk_name") 99 | fi 100 | back_disk_type=$(echo $back_disk_info|awk -F';' '{print $1}') 101 | back_disk_size=$(echo $back_disk_info|awk -F';' '{print $2}') 102 | back_disk_num=$(echo $back_disk_info|awk -F';' '{print $3}') 103 | back_disk_raid=$(echo $back_disk_info|awk -F';' '{print $4}') 104 | 105 | os_ver=$(awk '{print $1$3}' /etc/redhat-release) 106 | s_num=$(dmidecode --string chassis-serial-number) 107 | 108 | echo "{" 109 | echo \"机器型号\":\"$pname\", 110 | echo \"机器高度\":\"$cap\", 111 | echo \"CPU信息\":\{\"CPU型号\":\"$cpu_model\",\"核数\":$cpu_core,\"线程\":$cpu_thread,\"CPU个数\":$cpu_count\}, 112 | echo \"内存信息\":\{\"内存型号\":\"$mem_vendor\",\"每条大小\":\"$mem_size\",\"条数\":$mem_count\}, 113 | echo \"系统盘信息\":\{\"磁盘类别\":\"$sys_disk_type\",\"磁盘大小\":\"$sys_disk_size\",\"磁盘个数\":$sys_disk_num,\"阵列类型\":\"$sys_disk_raid\"}, 114 | echo \"运行盘信息\":\{\"磁盘类别\":\"$data_disk_type\",\"磁盘大小\":\"$data_disk_size\",\"磁盘个数\":$data_disk_num,\"阵列类型\":\"$data_disk_raid\"}, 115 | echo \"备份盘信息\":\{\"磁盘类别\":\"$back_disk_type\",\"磁盘大小\":\"$back_disk_size\",\"磁盘个数\":$back_disk_num,\"阵列类型\":\"$back_disk_raid\"}, 116 | if [[ -n $1 ]] && [[ $1 = '-p' ]]; then 117 | echo \"购买时间\":\"2013-09-11\", 118 | echo \"过保时间\":\"2015-09-11\", 119 | echo \"机柜号\":\"?\", 120 | echo \"机柜内位置编号\":\"?\", 121 | fi 122 | echo \"操作系统版本\":\"$os_ver\", 123 | echo \"快速服务代码\":\"$s_num\", 124 | echo \"其他信息\":\"null\" 125 | 126 | echo "}" 127 | --------------------------------------------------------------------------------