├── .gitignore ├── LICENSE ├── README.md ├── alert.go ├── alert_message.go ├── check_functions.go ├── config.go ├── config_alert.go ├── config_host.go ├── config_probe.go ├── current_fails.go ├── deploy ├── ssh-agent-nosee.sh ├── supervisord │ └── nosee.conf └── systemd │ └── nosee.service ├── doc └── images │ ├── howto.txt │ ├── img_base.png │ ├── img_base.txt │ ├── img_general.png │ ├── img_general.txt │ ├── img_illu.jpeg │ └── nosee-influxdb-grafana.png ├── etc ├── alerts.d │ ├── example.txt │ ├── mail_general.toml │ └── nosee-console.toml ├── hosts.d │ ├── example.txt │ └── test.toml ├── nosee.toml ├── probes.d │ ├── apache_modstatus.toml │ ├── backup_daily.toml │ ├── backup_week.toml │ ├── cert_example.toml │ ├── cpu_lms_temp.toml │ ├── cpu_temp.toml │ ├── curl_expect_example.toml │ ├── df.toml │ ├── example.txt │ ├── ifband.toml │ ├── load.toml │ ├── mdstat.toml │ ├── mem.toml │ ├── ping.toml │ ├── port_80.toml │ └── systemd_httpd.toml └── scripts │ ├── alerts │ ├── nosee-console.sh │ └── test.sh │ ├── heartbeats │ └── nosee-console.sh │ ├── loggers │ └── influxdb.sh │ └── probes │ ├── apache_modstatus.sh │ ├── backup.sh │ ├── cert_check.sh │ ├── cpu_lms_temp.sh │ ├── cpu_temp.sh │ ├── curl.sh │ ├── curl_expect.sh │ ├── df.sh │ ├── ifband.sh │ ├── load.sh │ ├── load_win.sh │ ├── mdstat.sh │ ├── mem.sh │ ├── ping.sh │ ├── port.sh │ └── systemctl_status.sh ├── go.mod ├── go.sum ├── heartbeat.go ├── host.go ├── log.go ├── loggers.go ├── main.go ├── pid.go ├── probe.go ├── run.go ├── run_alerts.go ├── run_streams.go ├── ssh.go ├── task.go ├── task_result.go └── tools.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | 
_cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 
33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. 
To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 
102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 
133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. 
You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <http://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <http://www.gnu.org/philosophy/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Nosee 2 | A nosey, agentless, easy monitoring tool over SSH. 3 | 4 | **Warning: Heavy WIP!** 5 | 6 | What is it? 7 | ----------- 8 | 9 | It's an answer for when you find the usual monitoring systems too heavy and complex. 10 | 11 | Nosee uses the SSH protocol to execute scripts on monitored systems, checking 12 | for whatever you want. The result is evaluated and Nosee will ring an alert 13 | of your choice if anything is wrong. 14 | 15 | In short: SSH, no agent, simple configuration, usual scripting.
16 | 17 | ![Nosee basic schema](https://raw.github.com/Xfennec/nosee/master/doc/images/img_base.png) 18 | 19 | Currently, Nosee requires bash on monitored hosts. It was successfully 20 | tested with Linux (of course), but also with Cygwin sshd on Windows hosts. 21 | 22 | The Nosee daemon itself can run on virtually any Go-supported platform. 23 | 24 | Show me! 25 | -------- 26 | 27 | Here is an alert triggered by a "port connection testing" probe. This alert 28 | is then configured to be sent using `mail` and an HTTP request to Pushover 29 | for realtime mobile device notification. 30 | 31 | ![Nosee mobile and mail failure notifications](https://raw.github.com/Xfennec/nosee/master/doc/images/img_illu.jpeg) 32 | 33 | You can also have a look at the [Nosee-console](https://github.com/Xfennec/nosee-console) 34 | project; it provides a cool Web monitoring interface. 35 | 36 | How do you build it? 37 | -------------------- 38 | 39 | If you have Go installed: 40 | 41 | go get github.com/Xfennec/nosee 42 | 43 | You will then be able to launch the binary located in your Go "bin" directory. 44 | (since Go 1.8, `~/go/bin` if you haven't defined any `$GOPATH`) 45 | 46 | The project is still too young to provide binaries. Later. (and `go get` is so powerful…) 47 | 48 | As a reminder, you can use the `-u` flag to update the project and its dependencies if 49 | you don't want to use `git` for that. 50 | 51 | go get -u github.com/Xfennec/nosee 52 | 53 | How do you use it? 54 | ------------------ 55 | 56 | You may have a look at the "template" configuration directory 57 | provided in `$GOPATH/src/github.com/Xfennec/nosee/etc` as a more complete 58 | example or as a base for the following tutorial.
(edit `hosts.d/test.toml` 59 | for connection settings and `alerts.d/mail_general.toml` for email address, 60 | at least) 61 | 62 | Here's a general figure of how Nosee works: 63 | 64 | ![Nosee general configuration structure](https://raw.github.com/Xfennec/nosee/master/doc/images/img_general.png) 65 | 66 | ### Small tutorial 67 | 68 | Configuration is mainly done through simple text files using 69 | the [TOML](https://github.com/toml-lang/toml) syntax. 70 | 71 | **Let's monitor the CPU temperature of one of our Web servers.** 72 | 73 | ### Step1. Create a *Host* (SSH connection) 74 | 75 | Create a file in the `hosts.d` directory. (ex: `hosts.d/web_myapp.toml`). 76 | 77 | ```toml 78 | name = "MyApp Webserver" 79 | classes = ["linux", "web", "myapp"] 80 | 81 | [network] 82 | host = "192.168.0.100" 83 | port = 22 84 | 85 | [auth] 86 | user = "test5" 87 | password = "test5" 88 | ``` 89 | 90 | The `classes` parameter is completely free: you may choose anything that 91 | fits your infrastructure. It will determine what checks will be done on 92 | this host (see below). 93 | 94 | Authentication by password is extremely bad, of course, as is writing down 95 | a password in a configuration file. Nosee supports other (preferred) options 96 | such as passphrases and ssh-agent. 97 | 98 | ### Step2. Create a *Probe* 99 | 100 | Create a file in the `probes.d` directory. (ex: `probes.d/cpu_temp.toml`). 101 | 102 | ```toml 103 | name = "CPU temperature" 104 | targets = ["linux"] 105 | 106 | script = "cpu_temp.sh" 107 | 108 | delay = "1m" 109 | 110 | # Checks 111 | 112 | [[check]] 113 | desc = "critical CPU temperature" 114 | if = "TEMP > 85" 115 | classes = ["critical"] 116 | ``` 117 | 118 | The `targets` parameter will match the `classes` of our host. Targets can 119 | be more precise with things like `linux & web`. (both `linux` and `web` classes 120 | must exist in host) 121 | 122 | The `delay` states that this probe must be run every minute. This is 123 | the lowest delay available.
124 | 125 | Then we have a check. You can have multiple checks in a probe. This check 126 | will look at the `TEMP` value returned by the `cpu_temp.sh` 127 | script (see below) and evaluate the `if` expression. You can have a look 128 | at [govaluate](https://github.com/Knetic/govaluate) for details about 129 | expression's syntax. 130 | 131 | If this expression becomes true, the probe will ring a `critical` alert. Here 132 | again, you are free to use any class of your choice to create your own 133 | error typology. (ex: `["warning", "hardware_guys"]` to ring a specific group 134 | of users in charge of critical failures of the hardware) 135 | 136 | ### Step3. Create a *script* (or use a provided one) 137 | 138 | Scripts are hosted in the `scripts/probes/` directory. 139 | 140 | ```bash 141 | #!/bin/bash 142 | 143 | val=$(cat /sys/class/thermal/thermal_zone0/temp) 144 | temp=$(awk "BEGIN {print $val/1000}") 145 | echo "TEMP:" $temp 146 | ``` 147 | 148 | This script will run on monitored hosts (so… stay light). Here, we read 149 | the first thermal zone and divide it by 1000 to get Celsius value. 150 | 151 | Scripts must print `KEY: val` lines to feed checks, as seen above. That's it. 152 | 153 | ### Step4. Create an *Alert* 154 | 155 | Create a file in the `alerts.d` directory. (ex: `alerts.d/mail_julien.toml`). 156 | 157 | ```toml 158 | name = "Mail Julien" 159 | 160 | targets = ["julien", "warning", "critical", "general"] 161 | 162 | command = "mail" 163 | 164 | arguments = [ 165 | "-s", 166 | "Nosee: $SUBJECT", 167 | "julien@domain.tld" 168 | ] 169 | ``` 170 | 171 | This simple alert will use the usual `mail` command when an alert matches 172 | one (or more) of the given targets. It works exactly the same as classes/targets 173 | for Hosts/Probes to let you create your own vocabulary. 
174 | (ex: `"web & production & critical"` is a valid target) 175 | 176 | As you may have seen, some variables are available for arguments, like 177 | the `$SUBJECT` of the alert message. 178 | 179 | There's a special class `general` for very important general messages. At 180 | least one alert must listen permanently at this class. 181 | 182 | ### Step5. Run Nosee! 183 | 184 | cd $GOPATH/bin 185 | ./nosee -l info -c ../src/github.com/Xfennec/nosee/etc/ 186 | 187 | You are now ready to burn your Web server CPU to get your alert mail. The `-c` 188 | parameter gives the configuration path, and the `-l` will make Nosee way 189 | more verbose. 190 | 191 | ./nosee help 192 | 193 | … will tell you more about command line arguments. 194 | 195 | Anything else? (WIP) 196 | -------------------- 197 | 198 | Oh yes. I want to explain: 199 | 200 | - "threaded" (Goroutines) 201 | - global `nosee.toml` configuration 202 | - SSH runs (group of probes) 203 | - `*` targets 204 | - needed_failures / needed_successes 205 | - defaults 206 | - host overriding of probe's defaults 207 | - use of defaults for probe script arguments 208 | - probe `run_if` condition 209 | - alert scripts 210 | - alert limits 211 | - alert env and stdin 212 | - timeouts 213 | - rescheduling 214 | - GOOD and BAD alerts 215 | - UniqueID for alerts 216 | - configuration "recap/summary" command 217 | - extensive configuration validation (and connection tests) 218 | - alert examples (pushover, SMS, …) 219 | - probe examples! 220 | - check "If" functions (date) 221 | - nosee-alerts.json current alerts 222 | - heartbeat scripts 223 | - systemd / supervisord sample files (see deploy/ directory) 224 | - test subcommand 225 | - loggers / InfluxDB 226 | 227 | ![Nosee + InfluxDB + Grafana](https://raw.github.com/Xfennec/nosee/master/doc/images/nosee-influxdb-grafana.png) 228 | 229 | (example: Nosee → InfluxDB → Grafana) 230 | 231 | What is the future of Nosee? 
(WIP) 232 | ---------------------------- 233 | 234 | - remote Nosee interconnections 235 | -------------------------------------------------------------------------------- /alert.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "os/exec" 7 | "strings" 8 | "time" 9 | ) 10 | 11 | // HourRange hold a Start and an End in the form of int arrays ([0] = hours, [1] = minutes) 12 | type HourRange struct { 13 | Start [2]int 14 | End [2]int 15 | } 16 | 17 | // Alert is the final form of alerts.d files 18 | type Alert struct { 19 | Name string 20 | Disabled bool 21 | Targets []string 22 | Command string 23 | Arguments []string 24 | Hours []HourRange 25 | Days []int 26 | } 27 | 28 | // Ring will send an AlertMessage using this Alert, executing the 29 | // configured command 30 | func (alert *Alert) Ring(msg *AlertMessage) { 31 | Info.Println("ring: " + alert.Name + ", " + alert.Command /* + " " + strings.Join(alert.Arguments, " ") */) 32 | 33 | varMap := make(map[string]interface{}) 34 | varMap["SUBJECT"] = msg.Subject 35 | varMap["TYPE"] = msg.Type.String() 36 | varMap["UNIQUEID"] = msg.UniqueID 37 | varMap["HOST_NAME"] = msg.Hostname 38 | varMap["CLASSES"] = strings.Join(msg.Classes, ",") 39 | varMap["NOSEE_SRV"] = GlobalConfig.Name 40 | varMap["DATETIME"] = msg.DateTime.Format(time.RFC3339) 41 | // "Level" ? (Run, Task, Checks) 42 | // Probe Name, Check Name, Alert Name ? 43 | 44 | var args []string 45 | for _, arg := range alert.Arguments { 46 | expArg := StringExpandVariables(arg, varMap) 47 | args = append(args, expArg) 48 | } 49 | 50 | go func() { 51 | cmd := exec.Command(alert.Command, args...) 
52 | 53 | env := os.Environ() 54 | for key, val := range varMap { 55 | env = append(env, fmt.Sprintf("%s=%s", key, InterfaceValueToString(val))) 56 | } 57 | cmd.Env = env 58 | 59 | // we also inject Details thru stdin: 60 | cmd.Stdin = strings.NewReader(msg.Details) 61 | 62 | if cmdOut, err := cmd.CombinedOutput(); err != nil { 63 | if len(msg.Classes) == 1 && msg.Classes[0] == GeneralClass { 64 | Error.Printf("unable to ring an alert to general class! error: %s (%s)\n", err, alert.Command) 65 | return 66 | } 67 | 68 | Warning.Printf("error running alert '%s': %s", alert.Command, err) 69 | 70 | msg.Subject = msg.Subject + " (Fwd)" 71 | prepend := fmt.Sprintf("WARNING: This alert is re-routed to the 'general' class, because\noriginal alert failed with the following error: %s (%s)\nOutput: %s\n\n", err.Error(), alert.Command, string(cmdOut)) 72 | msg.Details = prepend + msg.Details 73 | msg.Classes = []string{GeneralClass} 74 | msg.RingAlerts() 75 | } 76 | }() 77 | } 78 | 79 | // Ringable will return true if this Alert is currently able to ring 80 | // (no matching day or hour limit) 81 | func (alert *Alert) Ringable() bool { 82 | now := time.Now() 83 | nowMins := now.Hour()*60 + now.Minute() 84 | nowDay := int(now.Weekday()) 85 | hourOk := len(alert.Hours) == 0 86 | for _, hourRange := range alert.Hours { 87 | start := hourRange.Start[0]*60 + hourRange.Start[1] 88 | end := hourRange.End[0]*60 + hourRange.End[1] 89 | if nowMins >= start && nowMins <= end { 90 | hourOk = true 91 | break 92 | } 93 | } 94 | dayOk := len(alert.Days) == 0 95 | for _, day := range alert.Days { 96 | if nowDay == day { 97 | dayOk = true 98 | } 99 | } 100 | return hourOk && dayOk 101 | } 102 | -------------------------------------------------------------------------------- /alert_message.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "strings" 7 | "time" 8 | ) 9 | 10 | // AlertMessageType 
// AlertMessageType tells whether an AlertMessage reports a failure (AlertBad)
// or a return to normal (AlertGood).
type AlertMessageType uint8

// AlertMessageType numeric values. They start at 1, so the zero value of
// AlertMessageType is not a valid type.
const (
	AlertGood AlertMessageType = iota + 1
	AlertBad
)

// AlertMessageTypeStr stores matching strings, indexed by type value minus one.
var AlertMessageTypeStr = [...]string{
	"GOOD",
	"BAD",
}

// AlertMessage will store the text of the error
type AlertMessage struct {
	Type     AlertMessageType
	Subject  string
	Details  string
	Classes  []string
	UniqueID string
	Hostname string
	DateTime time.Time
}

// GeneralClass is a "general" class for very important general messages
const GeneralClass = "general"

// String returns the textual form of the type ("GOOD" or "BAD").
// Any value without a matching entry in AlertMessageTypeStr, including the
// zero value, yields "INVALID_TYPE" instead of indexing out of range.
func (amt AlertMessageType) String() string {
	if amt == 0 || int(amt) > len(AlertMessageTypeStr) {
		return "INVALID_TYPE"
	}
	return AlertMessageTypeStr[amt-1]
}
(" + run.StartTime.Format("2006-01-02 15:04:05") + ")\n") 69 | } 70 | 71 | details.WriteString("\n") 72 | details.WriteString("Unique failure ID: " + message.UniqueID + "\n") 73 | message.Details = details.String() 74 | 75 | message.Classes = []string{GeneralClass} 76 | 77 | return &message 78 | } 79 | 80 | // AlertMessageCreateForTaskResult creates an AlertGood or AlertBad message for a TaskResult 81 | func AlertMessageCreateForTaskResult(aType AlertMessageType, run *Run, taskResult *TaskResult, currentFail *CurrentFail) *AlertMessage { 82 | var message AlertMessage 83 | 84 | message.Subject = fmt.Sprintf("[%s] %s: %s: task error(s)", aType, run.Host.Name, taskResult.Task.Probe.Name) 85 | message.Type = aType 86 | message.UniqueID = currentFail.UniqueID 87 | message.Hostname = run.Host.Name 88 | message.DateTime = taskResult.StartTime 89 | 90 | var details bytes.Buffer 91 | 92 | switch aType { 93 | case AlertBad: 94 | details.WriteString("A least one error occured during a task for this host. (" + taskResult.StartTime.Format("2006-01-02 15:04:05") + ")\n") 95 | details.WriteString("\n") 96 | details.WriteString("Error(s):\n") 97 | for _, err := range taskResult.Errors { 98 | details.WriteString(err.Error() + "\n") 99 | } 100 | if len(taskResult.Logs) > 0 { 101 | details.WriteString("\n") 102 | details.WriteString("Logs(s):\n") 103 | for _, log := range taskResult.Logs { 104 | details.WriteString(log + "\n") 105 | } 106 | } 107 | case AlertGood: 108 | details.WriteString("No more errors for this task on this host. 
(" + taskResult.StartTime.Format("2006-01-02 15:04:05") + ")\n") 109 | } 110 | 111 | details.WriteString("\n") 112 | details.WriteString("Unique failure ID: " + message.UniqueID + "\n") 113 | message.Details = details.String() 114 | 115 | message.Classes = []string{GeneralClass} 116 | 117 | return &message 118 | } 119 | 120 | // AlertMessageCreateForCheck creates a AlertGood or AlertBad message for a Check 121 | func AlertMessageCreateForCheck(aType AlertMessageType, run *Run, taskRes *TaskResult, check *Check, currentFail *CurrentFail) *AlertMessage { 122 | var message AlertMessage 123 | 124 | // Host: Check (Task) 125 | message.Subject = fmt.Sprintf("[%s] %s: %s (%s)", aType, run.Host.Name, check.Desc, taskRes.Task.Probe.Name) 126 | message.Type = aType 127 | message.UniqueID = currentFail.UniqueID 128 | message.Hostname = run.Host.Name 129 | 130 | var details bytes.Buffer 131 | 132 | switch aType { 133 | case AlertBad: 134 | details.WriteString("An alert **is** ringing.\n\n") 135 | message.DateTime = currentFail.FailStart 136 | case AlertGood: 137 | details.WriteString("This alert is **no more** ringing.\n\n") 138 | message.DateTime = taskRes.StartTime 139 | } 140 | 141 | details.WriteString("Failure time: " + currentFail.FailStart.Format("2006-01-02 15:04:05") + "\n") 142 | details.WriteString("Last task time: " + taskRes.StartTime.Format("2006-01-02 15:04:05") + "\n") 143 | details.WriteString("Class(es): " + strings.Join(check.Classes, ", ") + "\n") 144 | details.WriteString("Failed condition was: " + check.If.String() + "\n") 145 | details.WriteString("\n") 146 | details.WriteString("Values:\n") 147 | for _, token := range check.If.Vars() { 148 | if IsAllUpper(token) { 149 | details.WriteString("- " + token + ": " + taskRes.Values[token] + "\n") 150 | } else { 151 | val := InterfaceValueToString(taskRes.Task.Probe.Defaults[token]) 152 | if _, exists := taskRes.Host.Defaults[token]; exists == true { 153 | val = 
InterfaceValueToString(taskRes.Host.Defaults[token]) 154 | } 155 | details.WriteString("- " + token + ": " + val + "\n") 156 | } 157 | } 158 | details.WriteString("\n") 159 | details.WriteString(fmt.Sprintf("All values for this run (%s):\n", run.Duration)) 160 | for _, tr := range run.TaskResults { 161 | details.WriteString(fmt.Sprintf("- %s (%s):\n", tr.Task.Probe.Name, tr.Duration)) 162 | for key, val := range tr.Values { 163 | details.WriteString("--- " + key + ": " + val + "\n") 164 | } 165 | } 166 | details.WriteString("\n") 167 | details.WriteString("Unique failure ID: " + message.UniqueID + "\n") 168 | message.Details = details.String() 169 | 170 | message.Classes = check.Classes 171 | 172 | return &message 173 | } 174 | 175 | // Dump prints AlertMessage informations on the screen for debugging purposes 176 | func (msg *AlertMessage) Dump() { 177 | fmt.Printf("---\n") 178 | fmt.Printf("Subject: %s\n", msg.Subject) 179 | fmt.Printf("%s\n---\n", msg.Details) 180 | } 181 | 182 | // RingAlerts will search and ring all alerts for this AlertMessage 183 | func (msg *AlertMessage) RingAlerts() { 184 | ringCount := 0 185 | for _, alert := range globalAlerts { 186 | if msg.MatchAlertTargets(alert) { 187 | if alert.Ringable() { 188 | alert.Ring(msg) 189 | ringCount++ 190 | } 191 | } 192 | } 193 | 194 | if ringCount == 0 { 195 | // if class is already "general", we're f*cked :( 196 | if len(msg.Classes) == 1 && msg.Classes[0] == GeneralClass { 197 | Error.Printf("unable to ring an alert : can't match the 'general' class!\n") 198 | return 199 | } 200 | 201 | Warning.Printf("no matching alert for this failure: '%s' with class(es): %s\n", msg.Subject, strings.Join(msg.Classes, ", ")) 202 | 203 | // forward the alert to 'general' class: 204 | msg.Subject = msg.Subject + " (Fwd)" 205 | prepend := "WARNING: This alert is re-routed to the 'general' class, because no alert matches its orginial classes (" + strings.Join(msg.Classes, ", ") + ")\n\n" 206 | msg.Details = prepend + 
msg.Details 207 | msg.Classes = []string{GeneralClass} 208 | msg.RingAlerts() 209 | } 210 | } 211 | 212 | // HasClass returns true if this AlertMessage has this class 213 | func (msg *AlertMessage) HasClass(class string) bool { 214 | if class == "*" { 215 | return true 216 | } 217 | 218 | for _, hClass := range msg.Classes { 219 | if hClass == class { 220 | return true 221 | } 222 | } 223 | return false 224 | } 225 | 226 | // MatchAlertTargets returns true if this AlertMessage matches alert's classes 227 | func (msg *AlertMessage) MatchAlertTargets(alert *Alert) bool { 228 | for _, pTargets := range alert.Targets { 229 | tokens := strings.Split(pTargets, "&") 230 | matched := 0 231 | mustMatch := len(tokens) 232 | for _, token := range tokens { 233 | ttoken := strings.TrimSpace(token) 234 | if msg.HasClass(ttoken) { 235 | matched++ 236 | } 237 | } 238 | if matched == mustMatch { 239 | return true 240 | } 241 | } 242 | return false 243 | } 244 | -------------------------------------------------------------------------------- /check_functions.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "time" 7 | 8 | "github.com/Knetic/govaluate" 9 | ) 10 | 11 | // CheckFunctions will hold all custom govaluate functions for Check 'If' 12 | // expressions 13 | var CheckFunctions map[string]govaluate.ExpressionFunction 14 | 15 | // CheckFunctionsInit will initialize CheckFunctions global variable 16 | func CheckFunctionsInit() { 17 | CheckFunctions = map[string]govaluate.ExpressionFunction{ 18 | 19 | "strlen": func(args ...interface{}) (interface{}, error) { 20 | length := len(args[0].(string)) 21 | return (float64)(length), nil 22 | }, 23 | 24 | "ping": func(args ...interface{}) (interface{}, error) { 25 | if len(args) > 0 { 26 | return nil, fmt.Errorf("ping function: too much arguments") 27 | } 28 | return (string)("pong"), nil 29 | }, 30 | 31 | "date": func(args ...interface{}) 
(interface{}, error) { 32 | if len(args) != 1 { 33 | return nil, fmt.Errorf("date function: wrong argument count (1 required)") 34 | } 35 | format := args[0].(string) 36 | now := time.Now() 37 | switch format { 38 | case "hour": 39 | return (float64)(now.Hour()), nil 40 | case "minute": 41 | return (float64)(now.Minute()), nil 42 | case "time": 43 | return (float64)((float64)(now.Hour()) + (float64)(now.Minute())/60.0), nil 44 | case "dow", "day-of-week": 45 | // Sunday = 0 46 | return (float64)(now.Weekday()), nil 47 | case "dom", "day-of-month": 48 | return (float64)(now.Day()), nil 49 | case "now": 50 | return (float64)(now.Unix()), nil 51 | } 52 | 53 | if match, _ := regexp.MatchString("^[0-9]{1,2}:[0-9]{2}$", format); match == true { 54 | t, err := alertCheckHour(format) 55 | if err != nil { 56 | return nil, fmt.Errorf("date function: invalid hour '%s': %s", format, err) 57 | } 58 | return (float64)((float64)(t[0]) + (float64)(t[1])/60.0), nil 59 | } 60 | 61 | return nil, fmt.Errorf("date function: invalid format '%s'", format) 62 | }, 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "path" 8 | "time" 9 | 10 | "github.com/BurntSushi/toml" 11 | ) 12 | 13 | type tomlConfig struct { 14 | Name string 15 | StartTimeSpread Duration `toml:"start_time_spread"` 16 | SSHConnTimeWarn Duration `toml:"ssh_connection_time_warn"` 17 | SSHBlindTrust bool `toml:"ssh_blindtrust_fingerprints"` 18 | SavePath string `toml:"save_path"` 19 | HeartbeatDelay Duration `toml:"heartbeat_delay"` 20 | } 21 | 22 | // Config is the final form of the nosee.toml config file 23 | type Config struct { 24 | configPath string 25 | loadDisabled bool 26 | doConnTest bool 27 | 28 | Name string 29 | StartTimeSpreadSeconds int 30 | SSHConnTimeWarn time.Duration 31 | SSHBlindTrust bool 32 | 
SavePath string 33 | HeartbeatDelay time.Duration 34 | } 35 | 36 | // GlobalConfig exports the Nosee server configuration 37 | var GlobalConfig *Config 38 | 39 | // GlobalConfigRead reads given file and returns a Config 40 | func GlobalConfigRead(dir, file string) (*Config, error) { 41 | var config Config 42 | var tConfig tomlConfig 43 | 44 | // defaults: 45 | // config.xxx -> default if config file not exists 46 | // tConfig.xxx -> default if parameter's not provided in config file 47 | config.Name = "" 48 | tConfig.Name = "" 49 | 50 | config.StartTimeSpreadSeconds = 15 51 | tConfig.StartTimeSpread.Duration = 15 * time.Second 52 | 53 | config.SSHConnTimeWarn = 10 * time.Second 54 | tConfig.SSHConnTimeWarn.Duration = config.SSHConnTimeWarn 55 | 56 | config.SSHBlindTrust = false 57 | tConfig.SSHBlindTrust = false 58 | 59 | config.SavePath = "./" 60 | tConfig.SavePath = config.SavePath 61 | 62 | config.HeartbeatDelay = 30 * time.Second 63 | tConfig.HeartbeatDelay.Duration = config.HeartbeatDelay 64 | 65 | config.configPath = dir 66 | config.loadDisabled = false 67 | config.doConnTest = true 68 | 69 | if stat, err := os.Stat(config.configPath); err != nil || !stat.Mode().IsDir() { 70 | return nil, fmt.Errorf("configuration directory not found: %s (%s)", err, config.configPath) 71 | } 72 | 73 | configPath := path.Clean(dir + "/" + file) 74 | 75 | if stat, err := os.Stat(configPath); err != nil || !stat.Mode().IsRegular() { 76 | Warning.Printf("no %s file, using defaults\n", configPath) 77 | return &config, nil 78 | } 79 | 80 | if _, err := toml.DecodeFile(configPath, &tConfig); err != nil { 81 | return nil, fmt.Errorf("decoding %s: %s", file, err) 82 | } 83 | 84 | if tConfig.Name != "" { 85 | config.Name = tConfig.Name 86 | } 87 | 88 | if tConfig.StartTimeSpread.Duration > (1 * time.Minute) { 89 | return nil, errors.New("'start_time_spread' can't be more than a minute") 90 | } 91 | config.StartTimeSpreadSeconds = int(tConfig.StartTimeSpread.Duration.Seconds()) 92 | 93 
| if tConfig.SSHConnTimeWarn.Duration < (1 * time.Second) { 94 | return nil, errors.New("'ssh_connection_time_warn' can't be less than a second") 95 | } 96 | config.SSHConnTimeWarn = tConfig.SSHConnTimeWarn.Duration 97 | 98 | config.SSHBlindTrust = tConfig.SSHBlindTrust 99 | 100 | // should check if writable 101 | config.SavePath = tConfig.SavePath 102 | 103 | if tConfig.HeartbeatDelay.Duration < (5 * time.Second) { 104 | return nil, errors.New("'heartbeat_delay' can't be less than 5 seconds") 105 | } 106 | config.HeartbeatDelay = tConfig.HeartbeatDelay.Duration 107 | 108 | return &config, nil 109 | } 110 | -------------------------------------------------------------------------------- /config_alert.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "os/exec" 9 | "path" 10 | "strconv" 11 | "strings" 12 | ) 13 | 14 | type tomlAlert struct { 15 | Name string 16 | Disabled bool 17 | Targets []string 18 | Command string 19 | Arguments []string 20 | Hours []string 21 | Days []int 22 | } 23 | 24 | func alertCheckHour(hour string) ([2]int, error) { 25 | var err error 26 | var res [2]int 27 | 28 | parts := strings.Split(hour, ":") 29 | if len(parts) != 2 { 30 | return res, fmt.Errorf("invalid format '%s' (ex: '19:30')", hour) 31 | } 32 | res[0], err = strconv.Atoi(parts[0]) 33 | if err != nil { 34 | return res, fmt.Errorf("can't convert '%s' hour to integer: %s", hour, err) 35 | } 36 | res[1], err = strconv.Atoi(parts[1]) 37 | if err != nil { 38 | return res, fmt.Errorf("can't convert '%s' minute to integer: %s", hour, err) 39 | } 40 | 41 | if res[0] < 0 { 42 | return res, fmt.Errorf("hour can't be less than 0: %s", hour) 43 | } 44 | if res[1] < 0 { 45 | return res, fmt.Errorf("minute can't be less than 0: %s", hour) 46 | } 47 | if res[0] > 23 { 48 | return res, fmt.Errorf("hour can't more than 23: %s", hour) 49 | } 50 | if res[1] > 59 { 51 | return res, 
fmt.Errorf("minute can't more than 59: %s", hour) 52 | } 53 | 54 | return res, nil 55 | } 56 | 57 | func alertCheckHours(hours []string) ([]HourRange, error) { 58 | var hourRanges []HourRange 59 | 60 | for _, hour := range hours { 61 | var ( 62 | hourRange HourRange 63 | err error 64 | ) 65 | 66 | rng := strings.Split(hour, "-") 67 | if len(rng) != 2 { 68 | return nil, fmt.Errorf("invalid format '%s' (ex: '8:90 - 19:00')", hour) 69 | } 70 | rng[0] = strings.TrimSpace(rng[0]) 71 | rng[1] = strings.TrimSpace(rng[1]) 72 | 73 | if hourRange.Start, err = alertCheckHour(rng[0]); err != nil { 74 | return nil, fmt.Errorf("invalid start hour: %s", err) 75 | } 76 | if hourRange.End, err = alertCheckHour(rng[1]); err != nil { 77 | return nil, fmt.Errorf("invalid end hour: %s", err) 78 | } 79 | 80 | start := hourRange.Start[0]*60 + hourRange.Start[1] 81 | end := hourRange.End[0]*60 + hourRange.End[1] 82 | if start >= end { 83 | return nil, fmt.Errorf("end of the hour range (%s) is before its start", hour) 84 | } 85 | 86 | hourRanges = append(hourRanges, hourRange) 87 | } 88 | return hourRanges, nil 89 | } 90 | 91 | func alertCheckAndCleanDays(days []int) error { 92 | for key, day := range days { 93 | if day < 0 { 94 | return fmt.Errorf("day can't be less than 0: %d", day) 95 | } 96 | if day > 7 { 97 | return fmt.Errorf("day can't be more than 7: %d", day) 98 | } 99 | 100 | if day == 7 { 101 | days[key] = 0 102 | } 103 | } 104 | return nil 105 | } 106 | 107 | func tomlAlertToAlert(tAlert *tomlAlert, config *Config) (*Alert, error) { 108 | var alert Alert 109 | 110 | if tAlert.Disabled == true && config.loadDisabled == false { 111 | return nil, nil 112 | } 113 | 114 | if tAlert.Name == "" { 115 | return nil, errors.New("invalid or missing 'name'") 116 | } 117 | alert.Name = tAlert.Name 118 | 119 | if tAlert.Command == "" { 120 | return nil, errors.New("invalid or missing 'command'") 121 | } 122 | 123 | scriptPath := path.Clean(config.configPath + "/scripts/alerts/" + 
tAlert.Command) 124 | stat, err := os.Stat(scriptPath) 125 | 126 | if err == nil { 127 | if !stat.Mode().IsRegular() { 128 | return nil, fmt.Errorf("is not a regular 'script' file '%s'", scriptPath) 129 | } 130 | tAlert.Command = scriptPath 131 | } else { 132 | path, errp := exec.LookPath(tAlert.Command) 133 | if errp != nil { 134 | return nil, fmt.Errorf("'%s' command not found in PATH: %s", tAlert.Command, errp) 135 | } 136 | tAlert.Command = path 137 | } 138 | 139 | alert.Command = tAlert.Command 140 | 141 | _, err = ioutil.ReadFile(alert.Command) 142 | if err != nil { 143 | return nil, fmt.Errorf("error reading script file '%s': %s", alert.Command, err) 144 | } 145 | 146 | if tAlert.Targets == nil { 147 | return nil, errors.New("no valid 'targets' parameter found") 148 | } 149 | 150 | if len(tAlert.Targets) == 0 { 151 | return nil, errors.New("empty 'targets'") 152 | } 153 | // explode targets on & and check IsValidTokenName 154 | hasGeneralClass := false 155 | for _, targets := range tAlert.Targets { 156 | if targets == "*" || targets == GeneralClass { 157 | hasGeneralClass = true 158 | continue 159 | } 160 | tokens := strings.Split(targets, "&") 161 | for _, token := range tokens { 162 | ttoken := strings.TrimSpace(token) 163 | if !IsValidTokenName(ttoken) { 164 | return nil, fmt.Errorf("invalid 'target' class name '%s'", ttoken) 165 | } 166 | } 167 | } 168 | alert.Targets = tAlert.Targets 169 | 170 | alert.Arguments = tAlert.Arguments 171 | 172 | hours, err := alertCheckHours(tAlert.Hours) 173 | if err != nil { 174 | return nil, fmt.Errorf("'hours' parameter: %s", err) 175 | } 176 | alert.Hours = hours 177 | 178 | if err := alertCheckAndCleanDays(tAlert.Days); err != nil { 179 | return nil, fmt.Errorf("'days' parameter: %s", err) 180 | } 181 | alert.Days = tAlert.Days 182 | 183 | if hasGeneralClass == true && len(alert.Hours) > 0 && len(alert.Days) > 0 { 184 | return nil, fmt.Errorf("a 'general' (or '*') alert can't have hours/days restrictions, since you 
may miss alerts") 185 | } 186 | 187 | return &alert, nil 188 | } 189 | -------------------------------------------------------------------------------- /config_host.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "os" 7 | "time" 8 | 9 | "golang.org/x/crypto/ssh" 10 | ) 11 | 12 | type tomlNetwork struct { 13 | Host string 14 | Port int 15 | Ciphers []string 16 | SSHConnTimeWarn Duration `toml:"ssh_connection_time_warn"` 17 | } 18 | 19 | type tomlAuth struct { 20 | User string 21 | Password string 22 | Key string 23 | KeyPassphrase string `toml:"key_passphrase"` 24 | SSHAgent bool `toml:"ssh_agent"` 25 | Pubkey string 26 | } 27 | 28 | type tomlHost struct { 29 | Disabled bool 30 | Name string 31 | Network tomlNetwork 32 | Auth tomlAuth 33 | Classes []string 34 | Default []tomlDefault 35 | } 36 | 37 | func tomlHostToHost(tHost *tomlHost, config *Config, filename string) (*Host, error) { 38 | var ( 39 | connection Connection 40 | host Host 41 | ) 42 | 43 | host.Connection = &connection 44 | host.Filename = filename 45 | 46 | if tHost.Disabled == true && config.loadDisabled == false { 47 | return nil, nil 48 | } 49 | host.Disabled = (tHost.Disabled == true) 50 | 51 | if tHost.Name == "" { 52 | return nil, errors.New("invalid or missing 'name'") 53 | } 54 | host.Name = tHost.Name 55 | 56 | if tHost.Classes == nil { 57 | return nil, errors.New("no valid 'classes' parameter found") 58 | } 59 | 60 | if len(tHost.Classes) == 0 { 61 | return nil, errors.New("empty classes") 62 | } 63 | for _, class := range tHost.Classes { 64 | if !IsValidTokenName(class) { 65 | return nil, fmt.Errorf("invalid class name '%s'", class) 66 | } 67 | } 68 | host.Classes = tHost.Classes 69 | 70 | host.Defaults = make(map[string]interface{}) 71 | if err := checkTomlDefault(host.Defaults, tHost.Default); err != nil { 72 | return nil, err 73 | } 74 | 75 | if tHost.Network.Host == "" { 76 | return nil, 
errors.New("[network] section, invalid or missing 'host'") 77 | } 78 | connection.Host = tHost.Network.Host 79 | 80 | if tHost.Network.Port == 0 { 81 | return nil, errors.New("[network] section, invalid or missing 'port'") 82 | } 83 | connection.Port = tHost.Network.Port 84 | 85 | if tHost.Network.SSHConnTimeWarn.Duration < (1 * time.Second) { 86 | return nil, errors.New("'ssh_connection_time_warn' can't be less than a second") 87 | } 88 | connection.SSHConnTimeWarn = tHost.Network.SSHConnTimeWarn.Duration 89 | 90 | if tHost.Auth.User == "" { 91 | return nil, errors.New("[auth] section, invalid or missing 'user'") 92 | } 93 | connection.User = tHost.Auth.User 94 | connection.Ciphers = tHost.Network.Ciphers 95 | 96 | if tHost.Auth.Key != "" && tHost.Auth.Password != "" { 97 | return nil, errors.New("[auth] section, can't use key and password at the same time (see key_passphrase parameter, perhaps?)") 98 | } 99 | if tHost.Auth.KeyPassphrase != "" && tHost.Auth.Password != "" { 100 | return nil, errors.New("[auth] section, can't use key_passphrase and password at the same time") 101 | } 102 | if tHost.Auth.SSHAgent == true && tHost.Auth.Password != "" { 103 | return nil, errors.New("[auth] section, can't use SSH agent and password at the same time") 104 | } 105 | if tHost.Auth.SSHAgent == true && tHost.Auth.KeyPassphrase != "" { 106 | return nil, errors.New("[auth] section, can't use SSH agent and key_passphrase at the same time") 107 | } 108 | if tHost.Auth.SSHAgent == true && tHost.Auth.Key != "" { 109 | return nil, errors.New("[auth] section, can't use SSH agent and key at the same time (see pubkey parameter, perhaps?)") 110 | } 111 | 112 | if tHost.Auth.Key != "" { 113 | fd, err := os.Open(tHost.Auth.Key) 114 | if err != nil { 115 | return nil, fmt.Errorf("can't access to key '%s': %s", tHost.Auth.Key, err) 116 | } 117 | fd.Close() 118 | } 119 | 120 | // !!! 
// Duration wraps time.Duration so that TOML values such as "30s" or "5m"
// can be decoded directly into a duration (see UnmarshalText).
type Duration struct {
	time.Duration
}

// UnmarshalText implements encoding.TextUnmarshaler: text is parsed with
// time.ParseDuration and the result is stored in d. On a parse error the
// error is returned and d is set to 0.
func (d *Duration) UnmarshalText(text []byte) error {
	parsed, err := time.ParseDuration(string(text))
	d.Duration = parsed
	return err
}
int `toml:"needed_successes"` 39 | } 40 | 41 | type tomlProbe struct { 42 | Name string 43 | Disabled bool 44 | Script string 45 | Targets []string 46 | Delay Duration 47 | Timeout Duration 48 | Arguments string 49 | Default []tomlDefault 50 | Check []tomlCheck 51 | RunIf string `toml:"run_if"` 52 | } 53 | 54 | func checkTomlDefault(pDefaults map[string]interface{}, tDefaults []tomlDefault) error { 55 | for _, tDefault := range tDefaults { 56 | 57 | if tDefault.Name == "" { 58 | return errors.New("[[default]] with invalid or missing 'name'") 59 | } 60 | 61 | if IsAllUpper(tDefault.Name) { 62 | return fmt.Errorf("[[default]] name is invalid (all uppercase): %s", tDefault.Name) 63 | } 64 | 65 | valid := false 66 | switch tDefault.Value.(type) { 67 | case string: 68 | valid = true 69 | case int32: 70 | valid = true 71 | case int64: 72 | valid = true 73 | case float32: 74 | valid = true 75 | case float64: 76 | valid = true 77 | } 78 | 79 | if valid == false { 80 | return fmt.Errorf("[[default]] invalid value type '%s' for '%s'", reflect.TypeOf(tDefault.Value), tDefault.Name) 81 | } 82 | 83 | if _, exists := pDefaults[tDefault.Name]; exists == true { 84 | return fmt.Errorf("Config error: duplicate default name '%s'", tDefault.Name) 85 | } 86 | 87 | pDefaults[tDefault.Name] = tDefault.Value 88 | } 89 | return nil 90 | } 91 | 92 | func tomlProbeToProbe(tProbe *tomlProbe, config *Config, filename string) (*Probe, error) { 93 | var probe Probe 94 | 95 | if tProbe.Disabled == true && config.loadDisabled == false { 96 | return nil, nil 97 | } 98 | probe.Disabled = (tProbe.Disabled == true) 99 | 100 | probe.Filename = filename 101 | 102 | if tProbe.Name == "" { 103 | return nil, errors.New("invalid or missing 'name'") 104 | } 105 | probe.Name = tProbe.Name 106 | 107 | if tProbe.Script == "" { 108 | return nil, errors.New("invalid or missing 'script'") 109 | } 110 | 111 | scriptPath := path.Clean(config.configPath + "/scripts/probes/" + tProbe.Script) 112 | stat, err := 
os.Stat(scriptPath) 113 | 114 | if err != nil { 115 | return nil, fmt.Errorf("invalid 'script' file '%s': %s", scriptPath, err) 116 | } 117 | 118 | if !stat.Mode().IsRegular() { 119 | return nil, fmt.Errorf("is not a regular 'script' file '%s'", scriptPath) 120 | } 121 | probe.Script = scriptPath 122 | 123 | _, err = ioutil.ReadFile(scriptPath) 124 | if err != nil { 125 | return nil, fmt.Errorf("error reading script file '%s': %s", scriptPath, err) 126 | } 127 | 128 | if tProbe.Targets == nil { 129 | return nil, errors.New("no valid 'targets' parameter found") 130 | } 131 | 132 | if len(tProbe.Targets) == 0 { 133 | return nil, errors.New("empty 'targets'") 134 | } 135 | // explode targets on & and check IsValidTokenName 136 | for _, targets := range tProbe.Targets { 137 | if targets == "*" { 138 | continue 139 | } 140 | tokens := strings.Split(targets, "&") 141 | for _, token := range tokens { 142 | ttoken := strings.TrimSpace(token) 143 | if !IsValidTokenName(ttoken) { 144 | return nil, fmt.Errorf("invalid 'target' class name '%s'", ttoken) 145 | } 146 | } 147 | } 148 | probe.Targets = tProbe.Targets 149 | 150 | if tProbe.Delay.Duration == 0 { 151 | return nil, errors.New("invalid or missing 'delay'") 152 | } 153 | 154 | if tProbe.Delay.Duration < (1 * time.Minute) { 155 | return nil, errors.New("'delay' can't be less than a minute") 156 | } 157 | 158 | minutes := float64(tProbe.Delay.Duration) / float64(time.Minute) 159 | if float64(int(minutes)) != minutes { 160 | return nil, errors.New("'delay' granularity is in minutes (ex: 5m)") 161 | } 162 | probe.Delay = tProbe.Delay.Duration 163 | 164 | if tProbe.Timeout.Duration == 0 { 165 | //~ return nil, errors.New("invalid or missing 'timeout'") 166 | tProbe.Timeout.Duration = 20 * time.Second 167 | } 168 | 169 | if tProbe.Timeout.Duration < (1 * time.Second) { 170 | return nil, errors.New("'timeout' can't be less than 1 second") 171 | } 172 | probe.Timeout = tProbe.Timeout.Duration 173 | 174 | // should warn about 
dangerous characters? (;& …) 175 | probe.Arguments = tProbe.Arguments 176 | 177 | if tProbe.RunIf != "" { 178 | expr, err := govaluate.NewEvaluableExpressionWithFunctions(tProbe.RunIf, CheckFunctions) 179 | if err != nil { 180 | return nil, fmt.Errorf("invalid 'run_if' expression: %s (\"%s\")", err, tProbe.RunIf) 181 | } 182 | if vars := expr.Vars(); len(vars) > 0 { 183 | return nil, fmt.Errorf("undefined variable(s) in 'run_if' expression: %s", strings.Join(vars, ", ")) 184 | } 185 | probe.RunIf = expr 186 | } 187 | 188 | probe.Defaults = make(map[string]interface{}) 189 | if err := checkTomlDefault(probe.Defaults, tProbe.Default); err != nil { 190 | return nil, err 191 | } 192 | 193 | for index, tCheck := range tProbe.Check { 194 | var check Check 195 | 196 | check.Index = index 197 | 198 | if tCheck.Desc == "" { 199 | return nil, errors.New("[[check]] with invalid or missing 'desc'") 200 | } 201 | check.Desc = tCheck.Desc 202 | 203 | if tCheck.If == "" { 204 | return nil, errors.New("[[check]] with invalid or missing 'if'") 205 | } 206 | expr, err := govaluate.NewEvaluableExpressionWithFunctions(tCheck.If, CheckFunctions) 207 | if err != nil { 208 | return nil, fmt.Errorf("[[check]] invalid 'if' expression: %s (\"%s\")", err, tCheck.If) 209 | } 210 | check.If = expr 211 | 212 | if tCheck.Classes == nil { 213 | return nil, errors.New("no valid 'classes' parameter found") 214 | } 215 | 216 | if len(tCheck.Classes) == 0 { 217 | return nil, errors.New("empty classes") 218 | } 219 | for _, class := range tCheck.Classes { 220 | if !IsValidTokenName(class) { 221 | return nil, fmt.Errorf("invalid class name '%s'", class) 222 | } 223 | } 224 | check.Classes = tCheck.Classes 225 | 226 | if tCheck.NeededFailures == 0 { 227 | tCheck.NeededFailures = 1 228 | } 229 | check.NeededFailures = tCheck.NeededFailures 230 | 231 | if tCheck.NeededSuccesses == 0 { 232 | tCheck.NeededSuccesses = check.NeededFailures 233 | } 234 | check.NeededSuccesses = tCheck.NeededSuccesses 235 | 236 
| probe.Checks = append(probe.Checks, &check) 237 | } 238 | 239 | if miss := probe.MissingDefaults(); len(miss) > 0 { 240 | return nil, fmt.Errorf("missing defaults (used in 'if' expressions or 'arguments' parameter): %s", strings.Join(miss, ", ")) 241 | } 242 | 243 | return &probe, nil 244 | } 245 | -------------------------------------------------------------------------------- /current_fails.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "os" 6 | "path" 7 | "sync" 8 | "time" 9 | 10 | uuid "github.com/satori/go.uuid" 11 | ) 12 | 13 | // CurrentFail type hold informations about a failure currently detected 14 | // and not resolved yet 15 | type CurrentFail struct { 16 | FailStart time.Time 17 | FailCount int 18 | OkCount int 19 | UniqueID string 20 | 21 | // optional "payload" 22 | RelatedTask *Task // for Checks (!!) 23 | RelatedHost *Host // for Runs 24 | RelatedTTask *Task // for Tasks 25 | } 26 | 27 | var ( 28 | currentFails map[string]*CurrentFail 29 | currentFailsMutex sync.Mutex 30 | ) 31 | 32 | const statusFile string = "nosee-fails.json" 33 | 34 | // CurrentFailsCreate initialize the global currentFails variable 35 | func CurrentFailsCreate() { 36 | currentFails = make(map[string]*CurrentFail) 37 | } 38 | 39 | // CurrentFailsSave dumps current alerts to disk 40 | func CurrentFailsSave() { 41 | // doing this in a go routine allows this function to be called 42 | // by functions that are already locking the mutex 43 | go func() { 44 | currentFailsMutex.Lock() 45 | defer currentFailsMutex.Unlock() 46 | 47 | path := path.Clean(GlobalConfig.SavePath + "/" + statusFile) 48 | f, err := os.Create(path) 49 | if err != nil { 50 | Error.Printf("can't save fails in '%s': %s (see save_path param?)", path, err) 51 | return 52 | } 53 | defer f.Close() 54 | 55 | enc := json.NewEncoder(f) 56 | err = enc.Encode(¤tFails) 57 | if err != nil { 58 | Error.Printf("fails json encode: 
%s", err) 59 | return 60 | } 61 | Info.Printf("current fails successfully saved to '%s'", path) 62 | }() 63 | } 64 | 65 | // CurrentFailsLoad will load from disk previous "fails" 66 | func CurrentFailsLoad() { 67 | currentFailsMutex.Lock() 68 | defer currentFailsMutex.Unlock() 69 | 70 | path := path.Clean(GlobalConfig.SavePath + "/" + statusFile) 71 | f, err := os.Open(path) 72 | if err != nil { 73 | Warning.Printf("can't read previous status: %s, no fails loaded", err) 74 | return 75 | } 76 | defer f.Close() 77 | 78 | dec := json.NewDecoder(f) 79 | err = dec.Decode(¤tFails) 80 | if err != nil { 81 | Error.Printf("'%s' json decode: %s", path, err) 82 | } 83 | Info.Printf("'%s' loaded: %d fail(s)", path, len(currentFails)) 84 | } 85 | 86 | // CurrentFailDelete deleted the CurrentFail with the given hash of the global currentFails 87 | func CurrentFailDelete(hash string) { 88 | currentFailsMutex.Lock() 89 | defer currentFailsMutex.Unlock() 90 | delete(currentFails, hash) 91 | CurrentFailsSave() 92 | } 93 | 94 | // CurrentFailAdd adds a CurrentFail to the global currentFails using given hash 95 | func CurrentFailAdd(hash string, failedCheck *CurrentFail) { 96 | currentFailsMutex.Lock() 97 | defer currentFailsMutex.Unlock() 98 | currentFails[hash] = failedCheck 99 | CurrentFailsSave() 100 | } 101 | 102 | // CurrentFailInc increments FailCount of the CurrentFail with the given hash 103 | func CurrentFailInc(hash string) { 104 | currentFailsMutex.Lock() 105 | defer currentFailsMutex.Unlock() 106 | currentFails[hash].FailCount++ 107 | currentFails[hash].OkCount = 0 108 | CurrentFailsSave() 109 | } 110 | 111 | // CurrentFailDec increments OkCount of the CurrentFail with the given hash 112 | func CurrentFailDec(hash string) { 113 | currentFailsMutex.Lock() 114 | defer currentFailsMutex.Unlock() 115 | currentFails[hash].OkCount++ 116 | CurrentFailsSave() 117 | } 118 | 119 | // CurrentFailGetAndInc returns the CurrentFail with the given hash and 120 | // increments its 
FailCount. The CurrentFail is created if it does not 121 | // already exists. 122 | func CurrentFailGetAndInc(hash string) *CurrentFail { 123 | cf, ok := currentFails[hash] 124 | if !ok { 125 | var cf CurrentFail 126 | uuid := uuid.NewV4() 127 | cf.FailCount = 1 128 | cf.OkCount = 0 129 | cf.FailStart = time.Now() 130 | cf.UniqueID = uuid.String() 131 | CurrentFailAdd(hash, &cf) 132 | return &cf 133 | } 134 | 135 | CurrentFailInc(hash) 136 | return cf 137 | } 138 | 139 | // CurrentFailGetAndDec returns the CurrentFail with the given hash and 140 | // increments its OkCount 141 | func CurrentFailGetAndDec(hash string) *CurrentFail { 142 | cf, ok := currentFails[hash] 143 | if !ok { 144 | return nil 145 | } 146 | CurrentFailDec(hash) 147 | return cf 148 | } 149 | -------------------------------------------------------------------------------- /deploy/ssh-agent-nosee.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # If you are using SSH keys with private passphrase: 4 | # This sample script runs an agent for the current user, creating 5 | # a socket that the nosee service will use. 6 | 7 | agent_link="$HOME/.ssh-agent-sock" 8 | 9 | if [ -S "$agent_link" ]; then 10 | echo "Agent is already here." 11 | exit 0 12 | fi 13 | 14 | eval $(ssh-agent -a "$agent_link") 15 | ssh-add "$HOME/keys/id_rsa_xxx" 16 | ssh-add "$HOME/keys/id_rsa_yyy" 17 | # ... 
18 | -------------------------------------------------------------------------------- /deploy/supervisord/nosee.conf: -------------------------------------------------------------------------------- 1 | ; Sample supervisord configuration using SSH agent 2 | 3 | [program:nosee] 4 | command=/home/nosee_server/go/bin/nosee --log-level info --log-timestamp 5 | autostart=false 6 | autorestart=false 7 | user=nosee_server 8 | ; See ssh-agent-nosee.sh 9 | environment=SSH_AUTH_SOCK="/home/nosee_server/.ssh-agent-sock",HOME="/home/nosee_server" 10 | redirect_stderr=true 11 | stdout_logfile=/var/log/supervisor/nosee.log 12 | stdout_logfile_maxbytes=50MB 13 | -------------------------------------------------------------------------------- /deploy/systemd/nosee.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=A nosey, agentless, easy monitoring tool over SSH 3 | After=network-online.target 4 | 5 | [Service] 6 | User={USER} 7 | ExecStart=/home/{USER}/go/bin/nosee -c /home/{USER}/nosee/etc/ --log-level info --log-timestamp 8 | Type=simple 9 | Restart=on-failure 10 | Environment=SSH_AUTH_SOCK=/home/{USER}/.ssh-agent-sock 11 | 12 | [Install] 13 | WantedBy=multi-user.target 14 | -------------------------------------------------------------------------------- /doc/images/howto.txt: -------------------------------------------------------------------------------- 1 | - Using "DIagrams Through Ascii Art" (ditaa) syntax 2 | https://github.com/stathissideris/ditaa 3 | 4 | - Generated using PlantUML online demo server 5 | http://plantuml.com/ 6 | 7 | @startditaa 8 | ... 
9 | @endditaa 10 | -------------------------------------------------------------------------------- /doc/images/img_base.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xfennec/nosee/6dad5bbc946242dd56f53de3c26feb0bb88042e6/doc/images/img_base.png -------------------------------------------------------------------------------- /doc/images/img_base.txt: -------------------------------------------------------------------------------- 1 | +--------+ 2 | | Nosee | 3 | | Daemon | 4 | +--+-+-+-+ 5 | | | | 6 | +------------+ | +------------+ 7 | | | | 8 | v v v 9 | SSH SSH SSH 10 | +-----------+ +-----+-----+ +-----------+ 11 | | Monitored | | Monitored | | Monitored | 12 | | Host | | Host | | Host | 13 | +-----------+ +-----------+ +-----------+ 14 | 15 | (Only SSH server is needed on hosts) 16 | 17 | -------------------------------------------------------------------------------- /doc/images/img_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xfennec/nosee/6dad5bbc946242dd56f53de3c26feb0bb88042e6/doc/images/img_general.png -------------------------------------------------------------------------------- /doc/images/img_general.txt: -------------------------------------------------------------------------------- 1 | hosts.d/ probes.d/ alerts.d/ 2 | +------+ +----------------+ +-------+ 3 | | | | | | | 4 | | Host +-->+ Probe | +->| Alert | 5 | | | | | | | | 6 | +------+ +--------+-------+ | +-------+ 7 | | Script | Check +-+ 8 | +----+---+-------+ 9 | : ^ 10 | | | 11 | | SSH | 12 | +-------+ 13 | Remote machine 14 | (monitored) 15 | -------------------------------------------------------------------------------- /doc/images/img_illu.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Xfennec/nosee/6dad5bbc946242dd56f53de3c26feb0bb88042e6/doc/images/img_illu.jpeg -------------------------------------------------------------------------------- /doc/images/nosee-influxdb-grafana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xfennec/nosee/6dad5bbc946242dd56f53de3c26feb0bb88042e6/doc/images/nosee-influxdb-grafana.png -------------------------------------------------------------------------------- /etc/alerts.d/example.txt: -------------------------------------------------------------------------------- 1 | ## Rename this file with ".toml" extension 2 | 3 | name = "My alert" 4 | disabled = false 5 | 6 | targets = ["preprod", "linux & production"] 7 | # to capture all check failures: 8 | targets = ["*"] 9 | 10 | # command in the path or full path of a command 11 | # alert details are sent to stdin, as various env vars (see test.sh) 12 | command = "cmd" 13 | # any script in "scripts/alerts/" directory is available without any path: 14 | #command = "test.sh" 15 | 16 | arguments = [ 17 | "arg1", 18 | "arg2: $SUBJECT ($TYPE)", 19 | ] 20 | 21 | # Warning: this part may change. We should probably switch to a more 22 | # generic expression here, like probe's run_if condition 23 | # This alert is only available during... 
24 | hours = ["8:30 - 12:30", "14:00 - 18:00"] 25 | # sunday is 0 or 7 26 | days = [1, 2, 3, 4, 5] 27 | 28 | 29 | # Note: alerts listening for special class "general" can't have 30 | # such hour/day limitations 31 | -------------------------------------------------------------------------------- /etc/alerts.d/mail_general.toml: -------------------------------------------------------------------------------- 1 | name = "Mail general" 2 | disabled = false 3 | 4 | targets = ["general"] 5 | 6 | command = "mail" 7 | 8 | arguments = [ 9 | "-s", 10 | "Nosee $NOSEE_SRV GENERAL: $SUBJECT", 11 | "user@domain.tld" 12 | ] 13 | -------------------------------------------------------------------------------- /etc/alerts.d/nosee-console.toml: -------------------------------------------------------------------------------- 1 | name = "Nosee console" 2 | #disabled = true 3 | 4 | targets = ["*"] 5 | 6 | command = "nosee-console.sh" 7 | 8 | arguments = ["http://localhost:8080/alerts"] 9 | -------------------------------------------------------------------------------- /etc/hosts.d/example.txt: -------------------------------------------------------------------------------- 1 | ## Rename this file with ".toml" extension 2 | 3 | # "name" is a key for the database. Change it and it'll become another host! 
4 | name = "My Host" 5 | classes = ["linux", "http", "testing"] 6 | disabled = false 7 | 8 | [network] 9 | host = "192.168.0.1" 10 | port = 22 11 | # Nosee defaults to sensible ciphers, but you may want to specify older 12 | # ciphers (at your own risk) for compatibility: 13 | #ciphers = ["arcfouraa", "aes128-cbc"] 14 | 15 | [auth] 16 | user = "user" 17 | 18 | # (password) OR (key) OR (key + passphrase) OR (ssh_agent) OR (ssh_agent + key) 19 | 20 | password = "mypassword" 21 | 22 | key = "/home/xxx/.ssh/id_rsa_sample" 23 | key_passphrase = "mypassphrase" 24 | 25 | ssh_agent = true 26 | # If you don't want to test every single key in the agent, give the 27 | # corresponding public key: 28 | #pubkey = "/home/xxx/.ssh/id_rsa_sample.pub" 29 | 30 | # you can override probe defaults for a specific host: 31 | [[default]] 32 | name = "warn_ping_latency" 33 | value = 10 34 | 35 | # or defaults for a probe: 36 | [[default]] 37 | name = "ifband_interface" 38 | value = "enp1s0f0" 39 | -------------------------------------------------------------------------------- /etc/hosts.d/test.toml: -------------------------------------------------------------------------------- 1 | name = "Development server" 2 | classes = ["linux", "development"] 3 | 4 | [network] 5 | host = "192.168.0.41" 6 | port = 22 7 | 8 | [auth] 9 | user = "deploy" 10 | key = "/home/user/.ssh/id_rsa_devsrv" 11 | -------------------------------------------------------------------------------- /etc/nosee.toml: -------------------------------------------------------------------------------- 1 | # global configuration for Nosee 2 | 3 | # Nosee server name (useful if you have multiple Nosee servers) 4 | # default: "" 5 | #name="Test" 6 | 7 | # This option helps to ease the global load induced by all SSH connections. 8 | # default: 15s 9 | #start_time_spread = "15s" 10 | 11 | # Maximum connection time for a SSH connection. 
(will trigger a "general" class alert) 12 | # default: 10s 13 | #ssh_connection_time_warn = "6s" 14 | 15 | # Currently, nosee will look at $HOME/.ssh/known_hosts for host fingerprints, 16 | # unless you set this to true, accepting blindly any fingerprint. 17 | # This is a potential security issue. (MitM attack) 18 | #ssh_blindtrust_fingerprints = false 19 | 20 | # Path to save current fails so Nosee can be restarted without losing status 21 | # (see nosee-fails.json file) 22 | # default: "./" 23 | #save_path = "/home/user/.nosee/" 24 | 25 | # Nosee will regularly execute all "scripts/heartbeats" as a keepalive 26 | # default: 30s 27 | #heartbeat_delay = "5s" 28 | -------------------------------------------------------------------------------- /etc/probes.d/apache_modstatus.toml: -------------------------------------------------------------------------------- 1 | name = "Apache mod_status" 2 | targets = ["linux & mod_status"] 3 | 4 | script = "apache_modstatus.sh" 5 | 6 | delay = "1m" 7 | timeout = "5s" 8 | -------------------------------------------------------------------------------- /etc/probes.d/backup_daily.toml: -------------------------------------------------------------------------------- 1 | name = "daily backup" 2 | targets = ["linux & backupd"] 3 | #disabled = true 4 | 5 | script = "backup.sh" 6 | arguments = "$start_file $ok_file" 7 | 8 | delay = "30m" 9 | timeout = "8s" 10 | 11 | ### Default values 12 | 13 | [[default]] 14 | name = "start_file" 15 | value = "/tmp/backup.start" 16 | 17 | [[default]] 18 | name = "ok_file" 19 | value = "/tmp/backup.ok" 20 | 21 | [[default]] 22 | name = "backup_margin_hours" 23 | value = 3 24 | 25 | [[default]] 26 | name = "backup_duration_warn" 27 | value = 5 28 | 29 | ### Checks 30 | 31 | [[check]] 32 | desc = "backup too old" 33 | if = "LAST_OK_HOURS > (24+backup_margin_hours)" 34 | classes = ["critical"] 35 | 36 | [[check]] 37 | desc = "backup too long" 38 | if = "LAST_DURATION_HOURS > backup_duration_warn" 39 | 
classes = ["warning"] 40 | -------------------------------------------------------------------------------- /etc/probes.d/backup_week.toml: -------------------------------------------------------------------------------- 1 | name = "backup check" 2 | targets = ["linux & backupw"] 3 | #disabled = true 4 | 5 | script = "backup.sh" 6 | arguments = "$start_file $ok_file" 7 | 8 | # +------+ +----- 9 | # exp: 27h | "" | 27h 27h | 10 | # +------+ +------+ +------+ 11 | # | || || | | || 12 | # +#.--#+#.--#+#.---+-.---+----#+#---#+#.--#+ 13 | # enab: ****************-----------------********** 14 | # Thu Fri Sat Sun Mon Tue Wed 15 | # dow: 4 5 6 0 1 2 3 16 | 17 | run_if = """ 18 | (date('dow') == 3 || date('dow') == 4 || date('dow') == 5) || 19 | (date('dow') == 6 && date('time') <= 8) || 20 | (date('dow') == 2 && date('time') >= 8) 21 | """ 22 | 23 | delay = "30m" 24 | timeout = "8s" 25 | 26 | ### Default values 27 | 28 | [[default]] 29 | name = "start_file" 30 | value = "/tmp/backup.start" 31 | 32 | [[default]] 33 | name = "ok_file" 34 | value = "/tmp/backup.ok" 35 | 36 | [[default]] 37 | name = "backup_margin_hours" 38 | value = 3 39 | 40 | [[default]] 41 | name = "backup_duration_warn" 42 | value = 5 43 | 44 | ### Checks 45 | 46 | [[check]] 47 | desc = "backup too old" 48 | if = "LAST_OK_HOURS > (24+backup_margin_hours)" 49 | classes = ["critical"] 50 | 51 | [[check]] 52 | desc = "backup too long" 53 | if = "LAST_DURATION_HOURS > backup_duration_warn" 54 | classes = ["warning"] 55 | -------------------------------------------------------------------------------- /etc/probes.d/cert_example.toml: -------------------------------------------------------------------------------- 1 | name = "example.com certificate validity" 2 | targets = ["example_com"] 3 | #disabled = true 4 | 5 | script = "cert_check.sh" 6 | arguments = "/etc/pki/tls/certs/example.com.crt 15" 7 | 8 | delay = "60m" 9 | timeout = "5s" 10 | 11 | ### Checks 12 | 13 | [[check]] 14 | desc = "certificate will
expire soon" 15 | if = "WILL_EXPIRE != 0" 16 | classes = ["warning"] 17 | -------------------------------------------------------------------------------- /etc/probes.d/cpu_lms_temp.toml: -------------------------------------------------------------------------------- 1 | name = "CPU lm_sensors temperature" 2 | targets = ["linux & lm_sensors"] 3 | 4 | script = "cpu_lms_temp.sh" 5 | 6 | delay = "1m" 7 | timeout = "5s" 8 | 9 | ### Checks 10 | 11 | [[check]] 12 | desc = "high CPU temperature" 13 | if = "TEMP > HIGH" 14 | classes = ["warning"] 15 | needed_failures = 2 16 | 17 | [[check]] 18 | desc = "critical CPU temperature" 19 | if = "TEMP > CRIT" 20 | classes = ["critical"] 21 | -------------------------------------------------------------------------------- /etc/probes.d/cpu_temp.toml: -------------------------------------------------------------------------------- 1 | ## Sample probe 2 | 3 | name = "CPU temperature" 4 | targets = ["linux"] 5 | disabled = false 6 | 7 | script = "cpu_temp.sh" 8 | arguments = "0" 9 | 10 | delay = "2m" 11 | timeout = "5s" 12 | 13 | ### Default values 14 | # types: int, float, string 15 | 16 | [[default]] 17 | name = "warn_cpu_temp" 18 | value = 75 19 | 20 | [[default]] 21 | name = "error_cpu_temp" 22 | value = 85 23 | 24 | ### Checks 25 | 26 | [[check]] 27 | desc = "high CPU0 temperature" 28 | if = "TEMP > warn_cpu_temp" 29 | classes = ["warning"] 30 | needed_failures = 2 31 | 32 | [[check]] 33 | desc = "critical CPU0 temperature" 34 | if = "TEMP > error_cpu_temp" 35 | classes = ["critical"] 36 | -------------------------------------------------------------------------------- /etc/probes.d/curl_expect_example.toml: -------------------------------------------------------------------------------- 1 | name = "example.com Website" 2 | targets = ["linux & example"] 3 | 4 | script = "curl_expect.sh" 5 | arguments = "http://example.com/ 'used for illustrative examples'" 6 | 7 | delay = "5m" 8 | timeout = "20s" 9 | 10 | ### Checks 11 | 12 | 
[[check]] 13 | desc = "can't find expected content" 14 | if = "FOUND_EXPECTED != 1" 15 | classes = ["critical"] 16 | -------------------------------------------------------------------------------- /etc/probes.d/df.toml: -------------------------------------------------------------------------------- 1 | name = "disk free" 2 | targets = ["linux"] 3 | #disabled = true 4 | 5 | script = "df.sh" 6 | 7 | delay = "30m" 8 | timeout = "8s" 9 | 10 | ### Default values 11 | 12 | [[default]] 13 | name = "df_warn_perc" 14 | value = 95 15 | 16 | ### Checks 17 | 18 | [[check]] 19 | desc = "disk almost full" 20 | if = "FULLEST_PERC > df_warn_perc" 21 | classes = ["warning"] 22 | -------------------------------------------------------------------------------- /etc/probes.d/example.txt: -------------------------------------------------------------------------------- 1 | ## Rename this file with a ".toml" extension 2 | 3 | name="My Probe" 4 | 5 | script = "script.sh" 6 | disabled = false 7 | 8 | targets = ["linux & test", "windows & test"] 9 | # If you want to match all hosts (all classes): 10 | # targets = ["*"] 11 | 12 | # probe repetition delay (must be a whole number of minutes [not 2m30, for instance]) 13 | # minimum value: 1m 14 | delay = "5m" 15 | 16 | # if the probe takes more than this time, it will trigger an error 17 | # default: 20s 18 | timeout = "30s" 19 | 20 | # check only between 8:00 and 18:00 21 | run_if = "date('time') >= 8 && date('time') <= 18" 22 | 23 | ### Default values (used by checks) 24 | # types: int, float, string 25 | # not "all uppercase" (reserved for probe values) 26 | 27 | [[default]] 28 | name = "value_foo" 29 | value = 0.90 30 | 31 | [[default]] 32 | name = "value_bar" 33 | value = "200 OK" 34 | 35 | ### Checks 36 | 37 | [[check]] 38 | desc = "check description" 39 | if = "VALUE1_FROM_SCRIPT > value_foo" 40 | classes = ["critical"] 41 | # will trigger the alert if the check fails two times (default: 1) 42 | needed_failures = 2 43 | # will delete the "suspicion" if check
is OK three times (default: needed_failures) 44 | needed_successes = 3 45 | 46 | [[check]] 47 | desc = "check description" 48 | if = "VALUE1_FROM_SCRIPT+VALUE2_FROM_SCRIPT < value_foo" 49 | classes = ["warning"] 50 | -------------------------------------------------------------------------------- /etc/probes.d/ifband.toml: -------------------------------------------------------------------------------- 1 | name = "bandwidth" 2 | targets = ["linux & ifband"] 3 | #disabled = true 4 | 5 | script = "ifband.sh" 6 | arguments = "$ifband_interface" 7 | 8 | delay = "1m" 9 | timeout = "5s" 10 | 11 | ### Default values 12 | 13 | [[default]] 14 | name = "ifband_interface" 15 | value = "eth0" 16 | -------------------------------------------------------------------------------- /etc/probes.d/load.toml: -------------------------------------------------------------------------------- 1 | name = "system load" 2 | targets = ["linux"] 3 | #disabled = true 4 | 5 | script = "load.sh" 6 | arguments = "$load_normal_cmd" 7 | 8 | delay = "1m" 9 | timeout = "8s" 10 | 11 | ### Default values 12 | 13 | [[default]] 14 | name = "load_normal_cmd" 15 | value = "/root/backup.sh" 16 | 17 | [[default]] 18 | name = "load_margin" 19 | value = 0 20 | 21 | ### Checks 22 | 23 | [[check]] 24 | desc = "heavy system load" 25 | if = "LOAD > (CPU_COUNT+load_margin) && PROG_DETECTED == 0" 26 | classes = ["warning"] 27 | needed_failures = 2 28 | -------------------------------------------------------------------------------- /etc/probes.d/mdstat.toml: -------------------------------------------------------------------------------- 1 | name = "Linux md-raid states" 2 | targets = ["linux"] 3 | #disabled = true 4 | 5 | script = "mdstat.sh" 6 | 7 | delay = "5m" 8 | timeout = "15s" 9 | 10 | ### Checks 11 | 12 | [[check]] 13 | desc = "md-raid failure" 14 | if = "ERR_ARRAYS > 0" 15 | classes = ["critical"] 16 | -------------------------------------------------------------------------------- /etc/probes.d/mem.toml: 
-------------------------------------------------------------------------------- 1 | name = "memory (RAM and swap)" 2 | 3 | script = "mem.sh" 4 | disabled = false 5 | 6 | targets = ["linux", "windows"] 7 | 8 | delay = "5m" 9 | # WMI can be veeeery slow :( 10 | timeout = "30s" 11 | 12 | ### Default values 13 | # types: int, float, string 14 | # not "all uppercase" (reserved for probe values) 15 | [[default]] 16 | name = "min_available_ratio" 17 | value = 0.20 18 | 19 | [[default]] 20 | name = "warn_swap_ratio" 21 | value = 0.30 22 | 23 | ### Checks 24 | 25 | [[check]] 26 | desc = "critical available memory ratio" 27 | if = "MEM_AVAILABLE_RATIO < min_available_ratio" 28 | classes = ["critical"] 29 | 30 | [[check]] 31 | desc = "high swap usage ratio" 32 | if = "SWAP_USED_RATIO > warn_swap_ratio" 33 | classes = ["warning"] 34 | -------------------------------------------------------------------------------- /etc/probes.d/ping.toml: -------------------------------------------------------------------------------- 1 | name = "ping to router" 2 | targets = ["linux"] 3 | #disabled = true 4 | 5 | script = "ping.sh" 6 | arguments = "192.168.0.250" 7 | 8 | delay = "1m" 9 | timeout = "8s" 10 | 11 | ### Default values 12 | 13 | [[default]] 14 | name = "err_ping_loss" 15 | value = 1 16 | 17 | [[default]] 18 | name = "warn_ping_latency" 19 | value = 1 20 | 21 | ### Checks 22 | 23 | [[check]] 24 | desc = "critical ping loss" 25 | if = "LOSS_PERC >= err_ping_loss" 26 | classes = ["critical"] 27 | needed_failures = 2 28 | 29 | [[check]] 30 | desc = "ping latency" 31 | if = "AVG_MS > warn_ping_latency" 32 | classes = ["warning"] 33 | needed_failures = 2 34 | -------------------------------------------------------------------------------- /etc/probes.d/port_80.toml: -------------------------------------------------------------------------------- 1 | name = "HTTP port" 2 | targets = ["linux & http"] 3 | #disabled = true 4 | 5 | script = "port.sh" 6 | arguments = "80" 7 | 8 | delay = 
"1m" 9 | 10 | ### Checks 11 | 12 | [[check]] 13 | desc = "port 80 is not open" 14 | if = "OPEN != 1" 15 | classes = ["critical"] 16 | -------------------------------------------------------------------------------- /etc/probes.d/systemd_httpd.toml: -------------------------------------------------------------------------------- 1 | name = "Apache status (systemd)" 2 | targets = ["linux & systemd & apache"] 3 | #disabled = true 4 | 5 | script = "systemctl_status.sh" 6 | arguments = "httpd.service" 7 | 8 | delay = "1m" 9 | timeout = "5s" 10 | 11 | ### Checks 12 | 13 | [[check]] 14 | desc = "Apache status" 15 | if = "STATUS != 'active'" 16 | classes = ["critical"] 17 | -------------------------------------------------------------------------------- /etc/scripts/alerts/nosee-console.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | (>&2 echo "ERROR: give nosee console URL (ex: http://localhost:8080/alerts)") 5 | exit 1 6 | fi 7 | 8 | DETAILS=$(cat) 9 | 10 | curl -s -f -w "HTTP Code %{http_code}\n" \ 11 | --form-string "type=$TYPE" \ 12 | --form-string "subject=$SUBJECT" \ 13 | --form-string "details=$DETAILS" \ 14 | --form-string "classes=$CLASSES" \ 15 | --form-string "hostname=$HOST_NAME" \ 16 | --form-string "nosee_srv=$NOSEE_SRV" \ 17 | --form-string "uniqueid=$UNIQUEID" \ 18 | --form-string "datetime=$DATETIME" \ 19 | "$1" 20 | 21 | if [ $? 
-ne 0 ]; then 22 | exit 1 23 | fi 24 | -------------------------------------------------------------------------------- /etc/scripts/alerts/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Test script to show all input channels 4 | 5 | file="/tmp/remove_me" 6 | 7 | echo "stdout test" 8 | (>&2 echo "stderr test") 9 | 10 | date > $file 11 | echo "$0" >> $file 12 | echo "$1" >> $file 13 | echo "$2" >> $file 14 | echo "$3" >> $file 15 | echo "$4" >> $file 16 | 17 | echo "$SUBJECT" >> $file 18 | echo $USER >> $file 19 | echo $TYPE >> $file 20 | echo $NOSEE_SRV >> $file 21 | 22 | # stdin is $DETAILS 23 | cat >> $file 24 | echo $HOME >> $file 25 | -------------------------------------------------------------------------------- /etc/scripts/heartbeats/nosee-console.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # nosee console heartbeat URL 4 | url="http://localhost:8080/heartbeat" 5 | 6 | # NOSEE_SRV, VERSION, DATETIME, STARTTIME, UPTIME 7 | 8 | curl -s -f -w "HTTP Code %{http_code}\n" \ 9 | --form-string "uptime=$UPTIME" \ 10 | --form-string "server=$NOSEE_SRV" \ 11 | --form-string "version=$VERSION" \ 12 | "$url" 13 | 14 | if [ $? 
-ne 0 ]; then 15 | exit 1 16 | fi 17 | -------------------------------------------------------------------------------- /etc/scripts/loggers/influxdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | host=${HOST_FILE%.toml} 4 | 5 | # input lines looks like: 6 | # df.toml;DISK_FULLEST_PERC;27 7 | res=$(cat | awk -v host=$host -F\; '{ 8 | probe=$1 9 | key=$2 10 | val=$3 11 | sub(/\.toml$/, "", probe) 12 | measurement=sprintf("%s_%s", probe, key) 13 | if (val ~ /[0-9.]/) 14 | printf("%s,host=%s value=%s\n", measurement,host,val) 15 | else 16 | printf("%s,host=%s value=\"%s\"\n", measurement,host,val) 17 | }') 18 | 19 | curl -i -XPOST 'http://localhost:8086/write?db=nosee' --data-binary "$res" 20 | -------------------------------------------------------------------------------- /etc/scripts/probes/apache_modstatus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Server must have mod_status loaded and configured with something like: 4 | # 5 | # SetHandler server-status 6 | # Order deny,allow 7 | # Deny from all 8 | # Allow from 127.0.0.1 ::1 9 | # 10 | 11 | # ExtendedStatus must be set to On (default since Apache 2.3.6) 12 | 13 | stat_file="$HOME/.apache-modstatus" 14 | NOW=$(date +%s) 15 | 16 | page=$(curl --silent -f "http://localhost/server-status?auto") 17 | if [ $? -ne 0 ]; then 18 | (>&2 echo "ERROR: unable to get status (mod_status OK on localhost?)") 19 | exit 1 20 | fi 21 | 22 | requests=$(echo "$page" | grep '^Total Accesses' | awk -F ': ' '{print $2}') 23 | kbytes=$(echo "$page" | grep '^Total kBytes' | awk -F ': ' '{print $2}') 24 | 25 | LAST_CALL=$NOW 26 | LAST_REQUESTS=$requests 27 | LAST_KBYTES=$kbytes 28 | if [ -f $stat_file ]; then 29 | . 
$stat_file 30 | fi 31 | 32 | REQUESTS=$requests 33 | KBYTES=$kbytes 34 | 35 | time_diff=$(echo $LAST_CALL $NOW | awk '{print ($2 - $1)}') 36 | requests_diff=$(echo $LAST_REQUESTS $REQUESTS | awk '{print ($2 - $1)}') 37 | kbytes_diff=$(echo $LAST_KBYTES $KBYTES | awk '{print ($2 - $1)}') 38 | 39 | if [ $time_diff -eq 0 ]; then 40 | RPS=0 41 | KBPS=0 42 | else 43 | RPS=$(echo $requests_diff $time_diff | awk '{t=$1/$2; printf ("%f", (t>0?t:0))}') 44 | KBPS=$(echo $kbytes_diff $time_diff | awk '{t=$1/$2; printf ("%f", (t>0?t:0))}') 45 | fi 46 | 47 | 48 | echo > $stat_file 49 | echo "LAST_CALL=$NOW" >> $stat_file 50 | echo "LAST_REQUESTS=$REQUESTS" >> $stat_file 51 | echo "LAST_KBYTES=$KBYTES" >> $stat_file 52 | 53 | echo RPS: $RPS 54 | echo KBPS: $KBPS 55 | -------------------------------------------------------------------------------- /etc/scripts/probes/backup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # ex: backup.sh /tmp/backup.start /tmp/backup.ok 4 | 5 | if [ -z "$2" ]; then 6 | (>&2 echo "ERROR: give 'start' flag file and 'ok' flag file") 7 | exit 1 8 | fi 9 | 10 | start_file="$1" 11 | ok_file="$2" 12 | 13 | if [ ! -f "$start_file" ]; then 14 | (>&2 echo "ERROR: can't read start file '$start_file'") 15 | exit 1 16 | fi 17 | if [ ! 
-f "$ok_file" ]; then 18 | (>&2 echo "ERROR: can't read ok file '$ok_file'") 19 | exit 1 20 | fi 21 | 22 | ok_tmsp=$(date +%s -r "$ok_file") 23 | start_tmsp=$(date +%s -r "$start_file") 24 | now=$(date +%s) 25 | 26 | last_ok_hours=$(echo $ok_tmsp $now | awk '{ diff=$2-$1; print diff/60/60 }') 27 | last_duration=$(echo $start_tmsp $ok_tmsp | awk '{ 28 | diff=$2-$1; 29 | if (diff > 0) 30 | print diff/60/60 31 | else 32 | print 0 33 | }') 34 | 35 | echo "LAST_OK_HOURS:" $last_ok_hours 36 | echo "LAST_DURATION_HOURS:" $last_duration 37 | -------------------------------------------------------------------------------- /etc/scripts/probes/cert_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$2" ]; then 4 | (>&2 echo "ERROR: give certificate path and 'days to expire'") 5 | (>&2 echo "ERROR: Usage: $0 /etc/pki/tls/certs/myweb.crt 15") 6 | exit 1 7 | fi 8 | 9 | cert_path=$1 10 | days_to_expire=$2 11 | 12 | timestamp=$(echo $(($days_to_expire*24*60*60))) 13 | 14 | openssl x509 -checkend $timestamp -noout -in "$1" 15 | res=$? 16 | 17 | echo "WILL_EXPIRE:" $res 18 | -------------------------------------------------------------------------------- /etc/scripts/probes/cpu_lms_temp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # this script use lm_sensors to average temperature of all CPU cores 4 | # Required sensors output format: 5 | # ... 6 | # Core 0: +33.0°C (high = +82.0°C, crit = +102.0°C) 7 | # Core 1: +32.0°C (high = +82.0°C, crit = +102.0°C) 8 | 9 | sensors | awk ' 10 | BEGIN { 11 | total = 0 12 | cores = 0 13 | high = 999 14 | crit = 999 15 | } 16 | /^Core/ { 17 | if (match($0, /\+([0-9.]+)°C.*\+([0-9.]+)°C,.*\+([0-9.]+)°C/, g) > 0) { 18 | total += g[1] 19 | high = (g[2] < high ? g[2] : high) 20 | crit = (g[3] < crit ? 
g[3] : crit) 21 | cores++ 22 | } else if (match($0, /\+([0-9.]+)°C/, g) > 0) { 23 | total += g[1] 24 | cores++ 25 | } 26 | } 27 | END { 28 | printf("TEMP: %f\n", total / cores) 29 | printf("HIGH: %f\n", high) 30 | printf("CRIT: %f\n", crit) 31 | } 32 | ' 33 | -------------------------------------------------------------------------------- /etc/scripts/probes/cpu_temp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | (>&2 echo "ERROR: give thermal zone number (ex: 0)") 5 | exit 1 6 | fi 7 | 8 | file="/sys/class/thermal/thermal_zone$1/temp" 9 | 10 | if [ ! -f "$file" ]; then 11 | (>&2 echo "ERROR: invalid path: $file") 12 | exit 2 13 | fi 14 | 15 | val=$(cat "$file") 16 | temp=$(awk "BEGIN {print $val/1000}") 17 | echo "TEMP:" $temp 18 | -------------------------------------------------------------------------------- /etc/scripts/probes/curl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # the URL must display usual "KEY: val\nKEY2: val2" format 4 | 5 | if [ -z "$1" ]; then 6 | (>&2 echo "ERROR: give URL") 7 | exit 1 8 | fi 9 | 10 | curl --max-time 15 --silent -f "$1" 11 | -------------------------------------------------------------------------------- /etc/scripts/probes/curl_expect.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$2" ]; then 4 | (>&2 echo "ERROR: give URL and an expected string") 5 | (>&2 echo "Usage example: $0 'http://www.perdu.com/' 'Pas de panique'") 6 | exit 1 7 | fi 8 | 9 | url=$1 10 | expected=$2 11 | 12 | status=0 13 | 14 | page=$(curl --max-time 15 --silent -f "$url") 15 | if [ $? 
-eq 0 ]; then 16 | n=$(echo "$page" | grep "$expected" | wc -l) 17 | if [ $n -gt 0 ]; then 18 | status=1 19 | fi 20 | fi 21 | 22 | echo "FOUND_EXPECTED:" $status 23 | -------------------------------------------------------------------------------- /etc/scripts/probes/df.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | lines=$(df -kP | grep '^/dev/' | grep -v '[[:space:]]/mnt/' | grep -v '[[:space:]]/snap/') 4 | fullest=$(echo "$lines" | awk '{print $5}' | cut -d% -f1 | sort -n | tail -n1) 5 | 6 | echo "FULLEST_PERC:" $fullest 7 | 8 | all=$(echo "$lines" | awk '{print $5,$6}') 9 | while read -r line; do 10 | dfree=$(echo "$line" | awk '{print $1}' | cut -d% -f1) 11 | name=$(echo "$line" | awk '{print $2}') 12 | name=$(echo "$name" | sed 's#/#_#g' |sed 's/-/_/' | sed 's/^_//') 13 | if [ -z "$name" ]; then 14 | name="ROOT" 15 | fi 16 | echo "DF_${name^^}_PERC:" $dfree 17 | done <<< "$all" 18 | -------------------------------------------------------------------------------- /etc/scripts/probes/ifband.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | interface=$1 4 | if_dir="/sys/class/net/$interface/statistics" 5 | stat_file="$HOME/.ifband-$interface" 6 | NOW=$(date +%s) 7 | 8 | if [ -z "$1" ]; then 9 | (>&2 echo "USAGE: $0 interface-name") 10 | exit 1 11 | fi 12 | 13 | if [ ! -d $if_dir ]; then 14 | (>&2 echo "ERROR: unable to find $interface stats") 15 | exit 1 16 | fi 17 | 18 | LAST_CALL=$NOW 19 | LAST_RX=$(cat $if_dir/rx_bytes) 20 | LAST_TX=$(cat $if_dir/tx_bytes) 21 | 22 | if [ -f $stat_file ]; then 23 | . 
$stat_file 24 | fi 25 | 26 | RX=$(cat $if_dir/rx_bytes) 27 | TX=$(cat $if_dir/tx_bytes) 28 | 29 | time_diff=$(echo $LAST_CALL $NOW | awk '{print ($2 - $1)}') 30 | rx_diff=$(echo $LAST_RX $RX | awk '{print ($2 - $1)}') 31 | tx_diff=$(echo $LAST_TX $TX | awk '{print ($2 - $1)}') 32 | 33 | #echo $time_diff $rx_diff $tx_diff 34 | if [ $time_diff -eq 0 ]; then 35 | RX_KBPS=0 36 | TX_KBPS=0 37 | else 38 | RX_KBPS=$(echo $rx_diff $time_diff | awk '{printf ("%i", $1 / $2 / 1024)}') 39 | TX_KBPS=$(echo $tx_diff $time_diff | awk '{printf ("%i", $1 / $2 / 1024)}') 40 | fi 41 | 42 | if [ $RX_KBPS -le 0 ]; then 43 | RX_KBPS=0 44 | fi 45 | if [ $TX_KBPS -le 0 ]; then 46 | TX_KBPS=0 47 | fi 48 | 49 | echo > $stat_file 50 | echo "LAST_CALL=$NOW" >> $stat_file 51 | echo "LAST_RX=$RX" >> $stat_file 52 | echo "LAST_TX=$TX" >> $stat_file 53 | 54 | echo RX_KBPS: $RX_KBPS 55 | echo TX_KBPS: $TX_KBPS 56 | 57 | -------------------------------------------------------------------------------- /etc/scripts/probes/load.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # load.sh [prog1] [prog2] [script3] [...] 4 | # will return "PROG_DETECTED: 1" if any 5 | # of the prog/script is found ("my load is high 6 | # but my backup is running, so it's ok") 7 | 8 | # CentOS 6/7 have a minimalist PATH on non-login SSH connections 9 | # and 'pidof' is often hosted in /sbin 10 | PATH=$PATH:/sbin 11 | 12 | if [ -f /proc/loadavg ]; then 13 | load=$(awk '{print $1}' /proc/loadavg) 14 | else 15 | load_field=$(LANG=C uptime | awk -F, '{print $(NF-2)}') 16 | load=$(echo "$load_field" | awk -F: '{print $2}') 17 | fi 18 | 19 | detected=0 20 | if [ $# -gt 0 ]; then 21 | while [ ${#} -gt 0 ]; do 22 | pidof -x "$1" > /dev/null 23 | if [ $?
-eq 0 ]; then 24 | detected=1 25 | fi 26 | shift 27 | done 28 | fi 29 | 30 | echo "LOAD:" $load 31 | echo "CPU_COUNT:" $(nproc) 32 | echo "PROG_DETECTED:" $detected 33 | -------------------------------------------------------------------------------- /etc/scripts/probes/load_win.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $(uname -o) != "Cygwin" ]; then 4 | (>&2 echo "Cygwin needed") 5 | exit 1 6 | fi 7 | 8 | pql=$(wmic path Win32_PerfFormattedData_PerfOS_System get ProcessorQueueLength | awk 'NR==2') 9 | echo "CPU_QUEUE_LEN:" $pql 10 | 11 | # select PercentProcessorTime from Win32_PerfFormattedData_PerfOS_Processor where Name = '_Total' 12 | 13 | #p=$(wmic path Win32_PerfFormattedData_PerfOS_System get PercentProcessorQueueLength | awk 'NR==2') 14 | #echo "CPU_QUEUE_LEN:" $pql 15 | 16 | ppt=$(wmic path Win32_PerfFormattedData_PerfOS_Processor where "Name = '_Total'" get PercentProcessorTime | awk 'NR==2') 17 | echo CPU_PERCENT: $ppt 18 | 19 | lp=$(wmic cpu get loadpercentage | awk 'NR==2') 20 | echo CPU_LOAD_PERCENT: $lp 21 | 22 | pdt=$(wmic path Win32_PerfFormattedData_PerfDisk_PhysicalDisk where "Name='_Total'" get PercentDiskTime | awk 'NR==2') 23 | echo DISK_PERCENT: $pdt 24 | -------------------------------------------------------------------------------- /etc/scripts/probes/mdstat.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mdstat="/proc/mdstat" 4 | 5 | if [ ! -f "$mdstat" ]; then 6 | (>&2 echo "ERROR: cant find md RAID support ($mdstat)") 7 | exit 1 8 | fi 9 | 10 | fcount=$(grep -c "\[.*_.*\]" $mdstat) 11 | 12 | echo "ERR_ARRAYS:" $fcount 13 | -------------------------------------------------------------------------------- /etc/scripts/probes/mem.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # is MemAvailable supported? 
4 | ma_supported=$(grep "MemAvailable:" /proc/meminfo) 5 | 6 | function meminfo_fmt() { 7 | val=$(grep "^$1:" /proc/meminfo) 8 | val=$(echo "$val" | awk '{printf("%i\n", $2/1024)}') 9 | echo $val 10 | } 11 | 12 | if [ -z "$ma_supported" ]; then 13 | mem=$(free -m | grep '^Mem') 14 | swap=$(free -m | grep '^Swap') 15 | 16 | mem_total_mb=$(echo $mem | cut -d\ -f2) 17 | mem_free_mb=$(echo $mem | cut -d\ -f4) 18 | 19 | mem_cached_mb=$(echo $mem | cut -d\ -f7) 20 | mem_buffers_mb=$(echo $mem | cut -d\ -f6) 21 | mem_buffcache_mb=$(($mem_cached_mb + $mem_buffers_mb)) 22 | 23 | mem_hardused_mb=$(echo "$mem" | awk '{printf("%.2f\n", $3-$5-$6-$7);}') 24 | mem_hardused_ratio=$(echo $mem_hardused_mb $mem_total_mb | awk '{printf("%.2f", $1/$2);}') 25 | 26 | mem_available_mb=$(($mem_free_mb + $mem_buffcache_mb)) 27 | 28 | swap_total_mb=$(echo $swap | cut -d\ -f2) 29 | swap_free_mb=$(echo $swap | cut -d\ -f4) 30 | swap_used_mb=$(echo $swap | cut -d\ -f3) 31 | if [ $swap_total_mb -eq 0 ]; then 32 | swap_used_ratio=0 33 | else 34 | swap_used_ratio=$(echo "$swap" | awk '{printf("%.2f\n", $3/$2);}') 35 | fi 36 | else 37 | mem_total_mb=$(meminfo_fmt MemTotal) 38 | mem_available_mb=$(meminfo_fmt MemAvailable) 39 | mem_hardused_mb=$(( $mem_total_mb - $mem_available_mb )) 40 | mem_hardused_ratio=$(echo $mem_hardused_mb $mem_total_mb | awk '{printf("%.2f", $1/$2);}') 41 | mem_buffers_mb=$(meminfo_fmt Buffers) 42 | mem_cached_mb=$(meminfo_fmt Cached) 43 | 44 | swap_total_mb=$(meminfo_fmt SwapTotal) 45 | swap_free_mb=$(meminfo_fmt SwapFree) 46 | swap_used_mb=$(( $swap_total_mb - $swap_free_mb )) 47 | if [ $swap_total_mb -eq 0 ]; then 48 | swap_used_ratio=0 49 | else 50 | swap_used_ratio=$(echo "$swap_used_mb" "$swap_total_mb" | awk '{printf("%.2f\n", $1/$2);}') 51 | fi 52 | fi 53 | 54 | mem_buffcache_mb=$(($mem_cached_mb + $mem_buffers_mb)) 55 | mem_buffcache_ratio=$(echo $mem_total_mb $mem_buffcache_mb\ 56 | | awk '{printf("%.2f\n", $2/$1);}') 57 | mem_available_ratio=$(echo $mem_total_mb 
$mem_available_mb\ 58 | | awk '{printf("%.2f\n", $2/$1);}') 59 | 60 | echo "MEM_TOTAL_MB:" $mem_total_mb 61 | echo "MEM_AVAILABLE_MB:" $mem_available_mb 62 | echo "MEM_AVAILABLE_RATIO:" $mem_available_ratio 63 | echo "MEM_HARDUSED_MB:" $mem_hardused_mb 64 | echo "MEM_HARDUSED_RATIO:" $mem_hardused_ratio 65 | echo "MEM_BUFFCACHE_MB:" $mem_buffcache_mb 66 | echo "MEM_BUFFCACHE_RATIO:" $mem_buffcache_ratio 67 | echo "SWAP_TOTAL_MB:" $swap_total_mb 68 | echo "SWAP_FREE_MB:" $swap_free_mb 69 | echo "SWAP_USED_MB:" $swap_used_mb 70 | echo "SWAP_USED_RATIO:" $swap_used_ratio 71 | -------------------------------------------------------------------------------- /etc/scripts/probes/ping.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | (>&2 echo "ERROR: give IP to test") 5 | exit 1 6 | fi 7 | dest=$1 8 | 9 | res=$(ping -qAc5 "$dest") 10 | 11 | loss=$(echo "$res" | grep "packets transmitted" | sed -r 's/.* ([0-9]+)%.*/\1/g') 12 | avg=$(echo "$res" | grep "^rtt" | awk -F/ '{print $5}') 13 | 14 | echo LOSS_PERC: $loss 15 | echo AVG_MS: $avg 16 | -------------------------------------------------------------------------------- /etc/scripts/probes/port.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | (>&2 echo "ERROR: give port number (ex: 443)") 5 | exit 1 6 | fi 7 | 8 | nc -z localhost $1 > /dev/null 2>&1 9 | res=$? 
10 | 11 | open=0 12 | if [ $res -eq 0 ]; then 13 | open=1 14 | fi 15 | 16 | echo "OPEN:" $open 17 | -------------------------------------------------------------------------------- /etc/scripts/probes/systemctl_status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$1" ]; then 4 | (>&2 echo "ERROR: give unit name (ex: httpd.service)") 5 | exit 1 6 | fi 7 | 8 | 9 | status=$(systemctl is-active "$1") 10 | echo "STATUS:" $status 11 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/Xfennec/nosee 2 | 3 | go 1.18 4 | 5 | require ( 6 | github.com/BurntSushi/toml v1.2.0 7 | github.com/Knetic/govaluate v3.0.0+incompatible 8 | github.com/fatih/color v1.13.0 9 | github.com/satori/go.uuid v1.2.0 10 | github.com/urfave/cli v1.22.9 11 | golang.org/x/crypto v0.0.0-20220817201139-bc19a97f63c8 12 | ) 13 | 14 | require ( 15 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d // indirect 16 | github.com/mattn/go-colorable v0.1.9 // indirect 17 | github.com/mattn/go-isatty v0.0.14 // indirect 18 | github.com/russross/blackfriday/v2 v2.0.1 // indirect 19 | github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect 20 | golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c // indirect 21 | ) 22 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 2 | github.com/BurntSushi/toml v1.2.0 h1:Rt8g24XnyGTyglgET/PRUNlrUeu9F5L+7FilkXfZgs0= 3 | github.com/BurntSushi/toml v1.2.0/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= 4 | github.com/Knetic/govaluate v3.0.0+incompatible h1:7o6+MAPhYTCF0+fdvoz1xDedhRb4f6s9Tn1Tt7/WTEg= 5 | 
github.com/Knetic/govaluate v3.0.0+incompatible/go.mod h1:r7JcOSlj0wfOMncg0iLm8Leh48TZaKVeNIfJntJ2wa0= 6 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= 7 | github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= 8 | github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w= 9 | github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= 10 | github.com/mattn/go-colorable v0.1.9 h1:sqDoxXbdeALODt0DAeJCVp38ps9ZogZEAXjus69YV3U= 11 | github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= 12 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 13 | github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= 14 | github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= 15 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 16 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 17 | github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= 18 | github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= 19 | github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= 20 | github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= 21 | github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= 22 | github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= 23 | github.com/urfave/cli v1.22.9 h1:cv3/KhXGBGjEXLC4bH0sLuJ9BewaAbpk5oyMOveu4pw= 24 | github.com/urfave/cli v1.22.9/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= 25 | golang.org/x/crypto v0.0.0-20220817201139-bc19a97f63c8 
h1:GIAS/yBem/gq2MUqgNIzUHW7cJMmx3TGZOrnyYaNQ6c= 26 | golang.org/x/crypto v0.0.0-20220817201139-bc19a97f63c8/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= 27 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 28 | golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 29 | golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c h1:F1jZWGFhYfh0Ci55sIpILtKKK8p3i2/krTr0H1rg74I= 30 | golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 31 | golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1 h1:v+OssWQX+hTHEmOBgwxdZxK4zHq3yOs8F9J7mk0PY8E= 32 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= 33 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 34 | gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 35 | -------------------------------------------------------------------------------- /heartbeat.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "os/exec" 9 | "path" 10 | "path/filepath" 11 | "time" 12 | ) 13 | 14 | func heartbeatsList(config *Config) ([]string, error) { 15 | hbDirPath := path.Clean(config.configPath + "/scripts/heartbeats/") 16 | stat, err := os.Stat(hbDirPath) 17 | 18 | if err != nil { 19 | return nil, fmt.Errorf("invalid 'heartbeats' directory '%s': %s", hbDirPath, err) 20 | } 21 | 22 | if !stat.Mode().IsDir() { 23 | return nil, fmt.Errorf("is not a directory '%s'", hbDirPath) 24 | } 25 | 26 | scripts, err := filepath.Glob(hbDirPath + "/*") 27 | if err != nil { 28 | return nil, fmt.Errorf("error listing '%s' directory: %s", hbDirPath, err) 29 | } 30 | 31 | for _, scriptPath := range scripts { 32 | stat, err := os.Stat(scriptPath) 33 
| 34 | if err != nil { 35 | return nil, fmt.Errorf("invalid 'script' file '%s': %s", scriptPath, err) 36 | } 37 | 38 | if !stat.Mode().IsRegular() { 39 | return nil, fmt.Errorf("is not a regular 'script' file '%s'", scriptPath) 40 | } 41 | 42 | _, err = ioutil.ReadFile(scriptPath) 43 | if err != nil { 44 | return nil, fmt.Errorf("error reading script file '%s': %s", scriptPath, err) 45 | } 46 | } 47 | 48 | return scripts, nil 49 | } 50 | 51 | func heartbeatExecute(script string) { 52 | varMap := make(map[string]interface{}) 53 | varMap["NOSEE_SRV"] = GlobalConfig.Name 54 | varMap["VERSION"] = NoseeVersion 55 | varMap["DATETIME"] = time.Now().Format(time.RFC3339) 56 | varMap["STARTTIME"] = appStartTime.Format(time.RFC3339) 57 | varMap["UPTIME"] = (int)(time.Since(appStartTime).Seconds()) 58 | 59 | cmd := exec.Command(script) 60 | 61 | env := os.Environ() 62 | for key, val := range varMap { 63 | env = append(env, fmt.Sprintf("%s=%s", key, InterfaceValueToString(val))) 64 | } 65 | cmd.Env = env 66 | 67 | if cmdOut, err := cmd.CombinedOutput(); err != nil { 68 | Warning.Printf("error running heartbeat '%s': %s: %s", script, err, bytes.TrimSpace(cmdOut)) 69 | } else { 70 | Trace.Printf("heartbeat '%s' OK: %s", script, bytes.TrimSpace(cmdOut)) 71 | } 72 | } 73 | 74 | func heartbeatsExecute(scripts []string) { 75 | for _, script := range scripts { 76 | heartbeatExecute(script) 77 | } 78 | } 79 | 80 | func heartbeatsSchedule(scripts []string, delay time.Duration) { 81 | go func() { 82 | for { 83 | heartbeatsExecute(scripts) 84 | Info.Printf("heartbeat, %d scripts", len(scripts)) 85 | // should check total exec duration and compare to delay, here! 
86 | time.Sleep(delay) 87 | } 88 | }() 89 | } 90 | -------------------------------------------------------------------------------- /host.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "strings" 6 | "time" 7 | ) 8 | 9 | // Host is the final form of hosts.d files 10 | type Host struct { 11 | Name string 12 | Filename string 13 | Disabled bool 14 | Classes []string 15 | Connection *Connection 16 | Defaults map[string]interface{} 17 | Tasks []*Task 18 | } 19 | 20 | // HasClass returns true if this Host has this class 21 | func (host *Host) HasClass(class string) bool { 22 | if class == "*" { 23 | return true 24 | } 25 | 26 | for _, hClass := range host.Classes { 27 | if hClass == class { 28 | return true 29 | } 30 | } 31 | return false 32 | } 33 | 34 | // MatchProbeTargets returns true if this Host matches probe's classes 35 | func (host *Host) MatchProbeTargets(probe *Probe) bool { 36 | for _, pTargets := range probe.Targets { 37 | tokens := strings.Split(pTargets, "&") 38 | matched := 0 39 | mustMatch := len(tokens) 40 | for _, token := range tokens { 41 | ttoken := strings.TrimSpace(token) 42 | if host.HasClass(ttoken) { 43 | matched++ 44 | } 45 | } 46 | if matched == mustMatch { 47 | return true 48 | } 49 | } 50 | return false 51 | } 52 | 53 | // Schedule will loop forever, creating and executing runs for this host 54 | func (host *Host) Schedule() { 55 | for { 56 | start := time.Now() 57 | 58 | var run Run 59 | run.Host = host 60 | run.StartTime = start 61 | 62 | for _, task := range host.Tasks { 63 | if start.After(task.NextRun) || start.Equal(task.NextRun) { 64 | taskable, err := task.Taskable() 65 | if err != nil { 66 | Trace.Printf("Taskable() failed: %s", err) 67 | run.addError(err) 68 | continue 69 | } 70 | if taskable == false { 71 | Info.Printf("host '%s', paused task '%s'\n", host.Name, task.Probe.Name) 72 | continue 73 | } 74 | 75 | 
task.ReSchedule(start.Add(task.Probe.Delay)) 76 | Info.Printf("host '%s', running task '%s'\n", host.Name, task.Probe.Name) 77 | run.Tasks = append(run.Tasks, task) 78 | } 79 | } 80 | 81 | if len(run.Tasks) > 0 { 82 | run.Go() 83 | run.Alerts() 84 | Trace.Printf("currentFails count = %d\n", len(currentFails)) 85 | loggersExec(&run) 86 | } 87 | Info.Printf("host '%s', run ended", host.Name) 88 | 89 | end := time.Now() 90 | dur := end.Sub(start) 91 | 92 | if dur < time.Minute { 93 | remains := time.Minute - dur 94 | time.Sleep(remains) 95 | } else { 96 | run.addError(fmt.Errorf("run duration was too long (%s)", run.Duration)) 97 | } 98 | Trace.Printf("(loop %s)\n", host.Name) 99 | } 100 | } 101 | 102 | // TestConnection will return nil if connection to the host was successful 103 | func (host *Host) TestConnection() error { 104 | 105 | //const bootstrap = "bash -s --" 106 | 107 | startTime := time.Now() 108 | 109 | channel := make(chan error, 1) 110 | go func() { 111 | if err := host.Connection.Connect(); err != nil { 112 | channel <- err 113 | } 114 | defer host.Connection.Close() 115 | channel <- nil 116 | }() 117 | 118 | connTimeout := host.Connection.SSHConnTimeWarn * 2 119 | 120 | select { 121 | case err := <-channel: 122 | if err != nil { 123 | return err 124 | } 125 | case <-time.After(connTimeout): 126 | return fmt.Errorf("SSH connection timeout (after %s)", connTimeout) 127 | } 128 | 129 | dialDuration := time.Now().Sub(startTime) 130 | 131 | if dialDuration > host.Connection.SSHConnTimeWarn { 132 | return fmt.Errorf("SSH connection time was too long: %s (ssh_connection_time_warn = %s)", dialDuration, host.Connection.SSHConnTimeWarn) 133 | } 134 | 135 | /*if err := run.prepareTestPipes(); err != nil { 136 | return err 137 | }*/ 138 | 139 | /*if err := host.TestRun(bootstrap); err != nil { 140 | return err 141 | }*/ 142 | Info.Printf("Connection to '%s' OK (%s)", host.Name, dialDuration) 143 | 144 | return nil 145 | } 146 | 
// Loggers for trace, info, warning and error severity
var (
	Trace   *log.Logger
	Info    *log.Logger
	Warning *log.Logger
	Error   *log.Logger
)

// writerCreate picks the destination of one log stream.
//
// std is the console writer for the stream (ioutil.Discard when the stream
// is disabled by the log level), fd is the optional log file and quiet
// suppresses console output:
//   - a disabled stream always goes to ioutil.Discard;
//   - in quiet mode the stream goes to the file only, or nowhere without one;
//   - otherwise it goes to the console, and to the file too when there is one.
func writerCreate(std io.Writer, fd *os.File, quiet bool) io.Writer {
	hasFile := fd != nil

	switch {
	case std == ioutil.Discard:
		// no log at all for this stream (no std, no file)
		return ioutil.Discard
	case quiet && hasFile:
		return fd
	case quiet:
		return ioutil.Discard
	case hasFile:
		// both
		return io.MultiWriter(fd, std)
	default:
		return std
	}
}
writerCreate(os.Stdout, fd, quiet) 80 | errorHandle = writerCreate(os.Stderr, fd, quiet) 81 | case "warning": 82 | traceHandle = writerCreate(ioutil.Discard, fd, quiet) 83 | infoHandle = writerCreate(ioutil.Discard, fd, quiet) 84 | warningHandle = writerCreate(os.Stdout, fd, quiet) 85 | errorHandle = writerCreate(os.Stderr, fd, quiet) 86 | default: 87 | fmt.Fprintf(os.Stderr, "ERROR: invalid log level '%s'\n", level) 88 | os.Exit(1) 89 | } 90 | 91 | var flags = 0 92 | if timestamp { 93 | flags = log.Ldate | log.Ltime 94 | } 95 | 96 | Trace = log.New(traceHandle, 97 | "TRACE: ", 98 | flags|log.Lshortfile) 99 | 100 | Info = log.New(infoHandle, 101 | "INFO: ", 102 | flags) 103 | 104 | Warning = log.New(warningHandle, 105 | "WARNING: ", 106 | flags) 107 | 108 | Error = log.New(errorHandle, 109 | "ERROR: ", 110 | flags) 111 | 112 | Trace.Println("Log init") 113 | } 114 | -------------------------------------------------------------------------------- /loggers.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io/ioutil" 7 | "os" 8 | "os/exec" 9 | "path" 10 | "path/filepath" 11 | "strings" 12 | ) 13 | 14 | func loggersList(config *Config) ([]string, error) { 15 | lgDirPath := path.Clean(config.configPath + "/scripts/loggers/") 16 | stat, err := os.Stat(lgDirPath) 17 | 18 | if err != nil { 19 | return nil, fmt.Errorf("invalid 'loggers' directory '%s': %s", lgDirPath, err) 20 | } 21 | 22 | if !stat.Mode().IsDir() { 23 | return nil, fmt.Errorf("is not a directory '%s'", lgDirPath) 24 | } 25 | 26 | scripts, err := filepath.Glob(lgDirPath + "/*") 27 | if err != nil { 28 | return nil, fmt.Errorf("error listing '%s' directory: %s", lgDirPath, err) 29 | } 30 | 31 | for _, scriptPath := range scripts { 32 | stat, err := os.Stat(scriptPath) 33 | 34 | if err != nil { 35 | return nil, fmt.Errorf("invalid 'script' file '%s': %s", scriptPath, err) 36 | } 37 | 38 | if !stat.Mode().IsRegular() 
{ 39 | return nil, fmt.Errorf("is not a regular 'script' file '%s'", scriptPath) 40 | } 41 | 42 | _, err = ioutil.ReadFile(scriptPath) 43 | if err != nil { 44 | return nil, fmt.Errorf("error reading script file '%s': %s", scriptPath, err) 45 | } 46 | } 47 | 48 | return scripts, nil 49 | } 50 | 51 | func loggersExec(run *Run) { 52 | varMap := make(map[string]interface{}) 53 | varMap["NOSEE_SRV"] = GlobalConfig.Name 54 | varMap["VERSION"] = NoseeVersion 55 | varMap["HOST_NAME"] = run.Host.Name 56 | varMap["HOST_FILE"] = run.Host.Filename 57 | varMap["CLASSES"] = strings.Join(run.Host.Classes, ",") 58 | 59 | var valuesBuff bytes.Buffer 60 | for _, result := range run.TaskResults { 61 | for key, val := range result.Values { 62 | // df.toml;DISK_FULLEST_PERC;27 63 | str := fmt.Sprintf("%s;%s;%s\n", result.Task.Probe.Filename, key, val) 64 | valuesBuff.WriteString(str) 65 | } 66 | } 67 | 68 | go func() { 69 | for _, script := range globalLogers { 70 | cmd := exec.Command(script) 71 | 72 | // we inject Values thru stdin: 73 | cmd.Stdin = strings.NewReader(valuesBuff.String()) 74 | 75 | env := os.Environ() 76 | for key, val := range varMap { 77 | env = append(env, fmt.Sprintf("%s=%s", key, InterfaceValueToString(val))) 78 | } 79 | cmd.Env = env 80 | 81 | if cmdOut, err := cmd.CombinedOutput(); err != nil { 82 | Warning.Printf("error running logger '%s': %s: %s", script, err, bytes.TrimSpace(cmdOut)) 83 | } else { 84 | Trace.Printf("logger '%s' OK: %s", script, bytes.TrimSpace(cmdOut)) 85 | } 86 | } 87 | }() 88 | } 89 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "math/rand" 7 | "os" 8 | "path" 9 | "path/filepath" 10 | "strings" 11 | "sync" 12 | "time" 13 | 14 | "github.com/BurntSushi/toml" 15 | "github.com/Knetic/govaluate" 16 | "github.com/fatih/color" 17 | "github.com/urfave/cli" 18 | ) 
19 | 20 | // NoseeVersion in X.Y string format 21 | const NoseeVersion = "0.1" 22 | 23 | var myRand *rand.Rand 24 | var globalAlerts []*Alert 25 | var globalLogers []string 26 | var appStartTime time.Time 27 | 28 | func configurationDirList(inpath string, dirPath string) ([]string, error) { 29 | configPath := path.Clean(dirPath + "/" + inpath) 30 | 31 | stat, err := os.Stat(configPath) 32 | 33 | if err != nil { 34 | return nil, fmt.Errorf("invalid directory '%s': %s", configPath, err) 35 | } 36 | 37 | if !stat.Mode().IsDir() { 38 | return nil, fmt.Errorf("is not a directory '%s'", configPath) 39 | } 40 | 41 | list, err := filepath.Glob(configPath + "/*.toml") 42 | if err != nil { 43 | return nil, fmt.Errorf("error listing '%s' directory: %s", configPath, err) 44 | } 45 | 46 | return list, nil 47 | } 48 | 49 | func createProbes(ctx *cli.Context, config *Config) ([]*Probe, error) { 50 | probesdFiles, errd := configurationDirList("probes.d", config.configPath) 51 | if errd != nil { 52 | return nil, fmt.Errorf("Error: %s", errd) 53 | } 54 | 55 | var probes []*Probe 56 | pNames := make(map[string]string) 57 | 58 | for _, file := range probesdFiles { 59 | var tProbe tomlProbe 60 | 61 | if _, err := toml.DecodeFile(file, &tProbe); err != nil { 62 | return nil, fmt.Errorf("Error decoding %s: %s", file, err) 63 | } 64 | 65 | _, filename := path.Split(file) 66 | probe, err := tomlProbeToProbe(&tProbe, config, filename) 67 | if err != nil { 68 | return nil, fmt.Errorf("Error using %s: %s", file, err) 69 | } 70 | 71 | if probe != nil { 72 | if f, exists := pNames[probe.Name]; exists == true { 73 | return nil, fmt.Errorf("Config error: duplicate name '%s' (%s, %s)", probe.Name, f, file) 74 | } 75 | 76 | probes = append(probes, probe) 77 | pNames[probe.Name] = file 78 | } 79 | } 80 | Info.Printf("probe count = %d\n", len(probes)) 81 | return probes, nil 82 | } 83 | 84 | func createAlerts(ctx *cli.Context, config *Config) ([]*Alert, error) { 85 | alertdFiles, err := 
configurationDirList("alerts.d", config.configPath) 86 | if err != nil { 87 | return nil, fmt.Errorf("Error: %s", err) 88 | } 89 | 90 | var alerts []*Alert 91 | aNames := make(map[string]string) 92 | for _, file := range alertdFiles { 93 | var tAlert tomlAlert 94 | 95 | if _, err := toml.DecodeFile(file, &tAlert); err != nil { 96 | return nil, fmt.Errorf("Error decoding %s: %s", file, err) 97 | } 98 | 99 | alert, err := tomlAlertToAlert(&tAlert, config) 100 | if err != nil { 101 | return nil, fmt.Errorf("Error using %s: %s", file, err) 102 | } 103 | 104 | if alert != nil { 105 | if f, exists := aNames[alert.Name]; exists == true { 106 | return nil, fmt.Errorf("Config error: duplicate name '%s' (%s, %s)", alert.Name, f, file) 107 | } 108 | 109 | alerts = append(alerts, alert) 110 | aNames[alert.Name] = file 111 | } 112 | } 113 | // = alerts 114 | Info.Printf("alert count = %d\n", len(alerts)) 115 | 116 | // check if we have at least one "general" alert receiver 117 | generalReceivers := 0 118 | for _, alert := range alerts { 119 | for _, target := range alert.Targets { 120 | if target == GeneralClass || target == "*" { 121 | generalReceivers++ 122 | } 123 | } 124 | } 125 | if generalReceivers == 0 { 126 | return nil, fmt.Errorf("Config error: at least one alert must match the 'general' class") 127 | } 128 | return alerts, nil 129 | } 130 | 131 | func createHosts(ctx *cli.Context, config *Config) ([]*Host, error) { 132 | hostsdFiles, errc := configurationDirList("hosts.d", config.configPath) 133 | if errc != nil { 134 | return nil, fmt.Errorf("Error: %s", errc) 135 | } 136 | 137 | var hosts []*Host 138 | hNames := make(map[string]string) 139 | 140 | for _, file := range hostsdFiles { 141 | var tHost tomlHost 142 | 143 | // defaults 144 | tHost.Network.SSHConnTimeWarn.Duration = config.SSHConnTimeWarn 145 | 146 | if _, err := toml.DecodeFile(file, &tHost); err != nil { 147 | return nil, fmt.Errorf("Error decoding %s: %s", file, err) 148 | } 149 | 150 | _, filename := 
path.Split(file) 151 | host, err := tomlHostToHost(&tHost, config, filename) 152 | if err != nil { 153 | return nil, fmt.Errorf("Error using %s: %s", file, err) 154 | } 155 | 156 | if host != nil { 157 | if f, exists := hNames[host.Name]; exists == true { 158 | return nil, fmt.Errorf("Config error: duplicate name '%s' (%s, %s)", host.Name, f, file) 159 | } 160 | 161 | hosts = append(hosts, host) 162 | hNames[host.Name] = file 163 | } 164 | } 165 | Info.Printf("host count = %d\n", len(hosts)) 166 | 167 | if config.doConnTest == true { 168 | Info.Print("Testing connections…") 169 | errors := make(chan error, len(hosts)) 170 | for _, host := range hosts { 171 | go func(host *Host) { 172 | if err := host.TestConnection(); err != nil { 173 | errors <- fmt.Errorf("Error connecting %s: %s", host.Name, err) 174 | } else { 175 | errors <- nil 176 | } 177 | }(host) 178 | } 179 | for i := 0; i < len(hosts); i++ { 180 | select { 181 | case err := <-errors: 182 | if err != nil { 183 | return nil, err 184 | } 185 | } 186 | } 187 | } 188 | 189 | probes, err := createProbes(ctx, config) 190 | if err != nil { 191 | return nil, err 192 | } 193 | 194 | globalAlerts, err = createAlerts(ctx, config) 195 | if err != nil { 196 | return nil, err 197 | } 198 | 199 | // update hosts with tasks 200 | var taskCount int 201 | for _, host := range hosts { 202 | for _, probe := range probes { 203 | if host.MatchProbeTargets(probe) { 204 | var task Task 205 | task.Probe = probe 206 | task.PrevRun = time.Now() 207 | task.NextRun = time.Now() 208 | host.Tasks = append(host.Tasks, &task) 209 | taskCount++ 210 | } 211 | } 212 | } 213 | Info.Printf("task count = %d\n", taskCount) 214 | 215 | return hosts, nil 216 | } 217 | 218 | func scheduleHosts(hosts []*Host, config *Config) error { 219 | var hostGroup sync.WaitGroup 220 | for i, host := range hosts { 221 | hostGroup.Add(1) 222 | go func(i int, host *Host) { 223 | defer hostGroup.Done() 224 | if config.StartTimeSpreadSeconds > 0 { 225 | // Sleep 
here, to ease global load 226 | fact := float32(i) / float32(len(hosts)) * 1000 * float32(config.StartTimeSpreadSeconds) 227 | wait := time.Duration(fact) * time.Millisecond 228 | time.Sleep(wait) 229 | } 230 | host.Schedule() 231 | }(i, host) 232 | } 233 | 234 | hostGroup.Wait() 235 | return fmt.Errorf("QUIT: empty wait group, everyone died :(") 236 | } 237 | 238 | func mainDefault(ctx *cli.Context) error { 239 | LogInit(ctx) 240 | 241 | config, err := GlobalConfigRead(ctx.String("config-path"), "nosee.toml") 242 | if err != nil { 243 | Error.Printf("Config (nosee.toml): %s", err) 244 | return cli.NewExitError("", 1) 245 | } 246 | GlobalConfig = config 247 | 248 | heartbeats, err := heartbeatsList(config) 249 | if err != nil { 250 | Error.Println(err) 251 | return cli.NewExitError("", 2) 252 | } 253 | 254 | globalLogers, err = loggersList(config) 255 | if err != nil { 256 | Error.Println(err) 257 | return cli.NewExitError("", 2) 258 | } 259 | 260 | hosts, err := createHosts(ctx, config) 261 | if err != nil { 262 | Error.Println(err) 263 | return cli.NewExitError("", 10) 264 | } 265 | 266 | CurrentFailsCreate() 267 | CurrentFailsLoad() 268 | 269 | if pidPath := ctx.String("pid-file"); pidPath != "" { 270 | pid, err := NewPIDFile(pidPath) 271 | if err != nil { 272 | return cli.NewExitError(fmt.Errorf("Error with pid file: %s", err), 100) 273 | } 274 | defer pid.Remove() 275 | } 276 | 277 | heartbeatsSchedule(heartbeats, config.HeartbeatDelay) 278 | 279 | if err := scheduleHosts(hosts, config); err != nil { 280 | return cli.NewExitError(err, 1) 281 | } 282 | 283 | return nil 284 | } 285 | 286 | func mainCheck(ctx *cli.Context) error { 287 | LogInit(ctx.Parent()) 288 | 289 | fmt.Printf("Checking configuration and connections…\n") 290 | 291 | config, err := GlobalConfigRead(ctx.Parent().String("config-path"), "nosee.toml") 292 | if err != nil { 293 | Error.Printf("Config (nosee.toml): %s", err) 294 | return cli.NewExitError("", 1) 295 | } 296 | GlobalConfig = config 
297 | 298 | _, err = heartbeatsList(config) 299 | if err != nil { 300 | Error.Println(err) 301 | return cli.NewExitError("", 2) 302 | } 303 | 304 | _, err = loggersList(config) 305 | if err != nil { 306 | Error.Println(err) 307 | return cli.NewExitError("", 2) 308 | } 309 | 310 | _, err = createHosts(ctx, config) 311 | if err != nil { 312 | Error.Println(err) 313 | return cli.NewExitError("", 10) 314 | } 315 | fmt.Println("OK") 316 | return nil 317 | } 318 | 319 | func mainRecap(ctx *cli.Context) error { 320 | LogInit(ctx.Parent()) 321 | 322 | config, err := GlobalConfigRead(ctx.Parent().String("config-path"), "nosee.toml") 323 | if err != nil { 324 | Error.Printf("Config (nosee.toml): %s", err) 325 | return cli.NewExitError("", 1) 326 | } 327 | GlobalConfig = config 328 | 329 | // TODO: should probably display heartbeats/loggers in the recap, then? 330 | _, err = heartbeatsList(config) 331 | if err != nil { 332 | Error.Println(err) 333 | return cli.NewExitError("", 2) 334 | } 335 | 336 | hosts, err := createHosts(ctx, config) 337 | if err != nil { 338 | Error.Println(err) 339 | return cli.NewExitError("", 10) 340 | } 341 | 342 | if ctx.Bool("no-color") == true { 343 | color.NoColor = true 344 | } 345 | 346 | red := color.New(color.FgRed).SprintFunc() 347 | yellow := color.New(color.FgYellow).SprintFunc() 348 | green := color.New(color.FgGreen).SprintFunc() 349 | cyan := color.New(color.FgCyan).SprintFunc() 350 | 351 | for _, host := range hosts { 352 | fmt.Printf("%s: %s\n", cyan("Host"), host.Name) 353 | for _, task := range host.Tasks { 354 | fmt.Printf(" %s: %s (%dm)\n", green("Probe"), task.Probe.Name, int(task.Probe.Delay.Minutes())) 355 | for _, check := range task.Probe.Checks { 356 | fmt.Printf(" %s: %s (%s)\n", yellow("Check"), check.Desc, strings.Join(check.Classes, ", ")) 357 | var msg AlertMessage 358 | msg.Classes = check.Classes 359 | alertCount := 0 360 | for _, alert := range globalAlerts { 361 | if msg.MatchAlertTargets(alert) { 362 | alertCount++ 
363 | fmt.Printf(" %s: %s\n", red("Alert"), alert.Name) 364 | } 365 | } 366 | if alertCount == 0 { 367 | fmt.Println(red(" No valid alert for this check!")) 368 | } 369 | } 370 | } 371 | } 372 | 373 | return nil 374 | } 375 | 376 | func mainExpr(ctx *cli.Context) error { 377 | LogInit(ctx.Parent()) 378 | if ctx.NArg() == 0 { 379 | err := fmt.Errorf("Error, you must provide a govaluate expression parameter, see https://github.com/Knetic/govaluate for syntax and features") 380 | return cli.NewExitError(err, 1) 381 | } 382 | exprString := ctx.Args().Get(0) 383 | 384 | expr, err := govaluate.NewEvaluableExpressionWithFunctions(exprString, CheckFunctions) 385 | if err != nil { 386 | return cli.NewExitError(err, 2) 387 | } 388 | 389 | if vars := expr.Vars(); len(vars) > 0 { 390 | errv := fmt.Errorf("Undefined variables: %s", strings.Join(vars, ", ")) 391 | return cli.NewExitError(errv, 11) 392 | } 393 | 394 | result, err := expr.Evaluate(nil) 395 | if err != nil { 396 | return cli.NewExitError(err, 3) 397 | } 398 | 399 | fmt.Println(InterfaceValueToString(result)) 400 | return nil 401 | } 402 | 403 | func mainTest(ctx *cli.Context) error { 404 | LogInit(ctx.Parent()) 405 | 406 | config, err := GlobalConfigRead(ctx.Parent().String("config-path"), "nosee.toml") 407 | if err != nil { 408 | Error.Printf("Config (nosee.toml): %s", err) 409 | return cli.NewExitError("", 1) 410 | } 411 | config.loadDisabled = true // WARNING! 412 | config.doConnTest = false // WARNING! 
413 | GlobalConfig = config 414 | 415 | hosts, err := createHosts(ctx, config) 416 | if err != nil { 417 | Error.Println(err) 418 | return cli.NewExitError("", 10) 419 | } 420 | 421 | // createHosts already load probes, but we need the full list 422 | // and not only probes targeting our host 423 | probes, err := createProbes(ctx, config) 424 | if err != nil { 425 | Error.Println(err) 426 | return cli.NewExitError("", 10) 427 | } 428 | 429 | requestedHost := ctx.Args().Get(0) 430 | requestedProbe := ctx.Args().Get(1) 431 | 432 | if requestedHost == "" { 433 | var list bytes.Buffer 434 | for _, host := range hosts { 435 | list.WriteString(fmt.Sprintf("- %s (%s)\n", host.Filename, host.Name)) 436 | } 437 | Error.Printf("you must give a host Name or hosts.d/ filename:\n%s", list.String()) 438 | return cli.NewExitError("", 1) 439 | } 440 | 441 | if requestedProbe == "" { 442 | var list bytes.Buffer 443 | for _, probe := range probes { 444 | list.WriteString(fmt.Sprintf("- %s (%s)\n", probe.Filename, probe.Name)) 445 | } 446 | Error.Printf("you must give a probe Name or probes.d/ filename:\n%s", list.String()) 447 | return cli.NewExitError("", 1) 448 | } 449 | 450 | // Locate requested host and probe… 451 | var foundHost *Host 452 | for _, host := range hosts { 453 | if host.Name == requestedHost || host.Filename == requestedHost { 454 | foundHost = host 455 | break 456 | } 457 | } 458 | if foundHost == nil { 459 | Error.Printf("can't find '%s' host", requestedHost) 460 | return cli.NewExitError("", 1) 461 | } 462 | 463 | var foundProbe *Probe 464 | for _, probe := range probes { 465 | if probe.Name == requestedProbe || probe.Filename == requestedProbe { 466 | foundProbe = probe 467 | break 468 | } 469 | } 470 | if foundProbe == nil { 471 | Error.Printf("can't find '%s' probe", requestedProbe) 472 | return cli.NewExitError("", 1) 473 | } 474 | 475 | if ctx.Bool("no-color") == true { 476 | color.NoColor = true 477 | } 478 | 479 | red := 
color.New(color.FgRed).SprintFunc() 480 | yellow := color.New(color.FgYellow).SprintFunc() 481 | green := color.New(color.FgGreen).SprintFunc() 482 | cyan := color.New(color.FgCyan).SprintFunc() 483 | magenta := color.New(color.FgMagenta).SprintFunc() 484 | magentaS := color.New(color.FgMagenta).Add(color.CrossedOut).SprintFunc() 485 | 486 | _, scriptName := path.Split(foundProbe.Script) 487 | fmt.Printf("Testing: host '%s' with probe '%s' (%s, %s) using script '%s'\n", cyan(foundHost.Name), green(foundProbe.Name), foundHost.Filename, foundProbe.Filename, magenta(scriptName)) 488 | if foundHost.Disabled == true { 489 | fmt.Printf("Note: the host '%s' is currently %s\n", red(foundHost.Name), red("disabled")) 490 | } 491 | if foundProbe.Disabled == true { 492 | fmt.Printf("Note: the probe '%s' is currently %s\n", red(foundProbe.Name), red("disabled")) 493 | } 494 | if foundHost.MatchProbeTargets(foundProbe) == false { 495 | fmt.Printf("Note: the probe '%s' does %s match host '%s' (see classes and targets)\n", red(foundProbe.Name), red("not"), red(foundHost.Name)) 496 | } 497 | 498 | // print defaults 499 | for key, val := range foundProbe.Defaults { 500 | if _, ok := foundHost.Defaults[key]; ok == true { 501 | fmt.Printf("default: %s = %s -> %s (host override)\n", 502 | magenta(key), 503 | magentaS(InterfaceValueToString(val)), 504 | magenta(foundHost.Defaults[key])) 505 | } else { 506 | fmt.Printf("default: %s = %s\n", magenta(key), magenta(InterfaceValueToString(val))) 507 | } 508 | } 509 | 510 | var run Run 511 | run.StartTime = time.Now() 512 | run.Host = foundHost 513 | 514 | var task Task 515 | task.Probe = foundProbe 516 | task.PrevRun = time.Now() 517 | task.NextRun = time.Now() 518 | 519 | run.Tasks = append(run.Tasks, &task) 520 | run.Go() 521 | 522 | if len(run.Errors) > 0 { 523 | for _, err := range run.Errors { 524 | fmt.Printf("run error: %s\n", red(err)) 525 | } 526 | return nil 527 | } 528 | 529 | result := run.TaskResults[0] 530 | 531 | for key, val 
:= range result.Values { 532 | fmt.Printf("value: %s = %s\n", yellow(key), yellow(val)) 533 | } 534 | 535 | for _, err := range result.Logs { 536 | fmt.Printf("log: %s\n", cyan(err)) 537 | } 538 | 539 | if result.ExitStatus == 0 { 540 | fmt.Printf("script exit status: %s (success)\n", green(result.ExitStatus)) 541 | } else { 542 | fmt.Printf("script exit status: %s (error)\n", red(result.ExitStatus)) 543 | } 544 | fmt.Printf("script duration: %s (+ ssh dial duration: %s)\n", result.Duration, run.DialDuration) 545 | 546 | if run.totalErrorCount() > 0 { 547 | for _, err := range result.Errors { 548 | fmt.Printf("error: %s\n", red(err)) 549 | } 550 | return nil 551 | } 552 | 553 | result.DoChecks() 554 | 555 | // DoChecks may add its own errors 556 | for _, err := range result.Errors { 557 | fmt.Printf("error: %s\n", red(err)) 558 | } 559 | 560 | for _, check := range result.SuccessfulChecks { 561 | fmt.Printf("check %s: %s: false (no alert)\n", green("GOOD"), green(check.Desc)) 562 | } 563 | for _, check := range result.FailedChecks { 564 | fmt.Printf("check %s: %s: true (alert)\n", red("BAD"), red(check.Desc)) 565 | } 566 | 567 | return nil 568 | } 569 | 570 | func main() { 571 | // generic (aka "not cli command specific") inits 572 | source := rand.NewSource(time.Now().UnixNano()) 573 | myRand = rand.New(source) 574 | CheckFunctionsInit() 575 | appStartTime = time.Now() 576 | 577 | app := cli.NewApp() 578 | app.Usage = "Nosee: a nosey, agentless, easy monitoring tool over SSH" 579 | app.Version = NoseeVersion 580 | 581 | app.Flags = []cli.Flag{ 582 | cli.StringFlag{ 583 | Name: "config-path, c", 584 | Value: "/etc/nosee/", 585 | Usage: "configuration directory `PATH`", 586 | EnvVar: "NOSEE_CONFIG", 587 | }, 588 | cli.StringFlag{ 589 | Name: "log-level, l", 590 | Value: "warning", 591 | Usage: "log `level` verbosity (trace, info, warning)", 592 | }, 593 | cli.StringFlag{ 594 | Name: "log-file, f", 595 | Usage: "log file to `FILE` (append)", 596 | }, 597 | 
cli.BoolFlag{ 598 | Name: "log-timestamp, t", 599 | Usage: "add timestamp to log output", 600 | }, 601 | cli.BoolFlag{ 602 | Name: "quiet, q", 603 | Usage: "no stdout/err output (except launch errors)", 604 | }, 605 | cli.StringFlag{ 606 | Name: "pid-file, p", 607 | Usage: "create pid `FILE`", 608 | }, 609 | } 610 | 611 | app.Action = mainDefault 612 | 613 | app.Commands = []cli.Command{ 614 | { 615 | Name: "check", 616 | Aliases: []string{"c"}, 617 | Usage: "Check configuration files and connections", 618 | ArgsUsage: " ", 619 | Action: mainCheck, 620 | }, 621 | { 622 | Name: "recap", 623 | Aliases: []string{"r"}, 624 | Usage: "Recap configuration", 625 | ArgsUsage: " ", 626 | Action: mainRecap, 627 | Flags: []cli.Flag{ 628 | cli.BoolFlag{ 629 | Name: "no-color", 630 | Usage: "disable color output ", 631 | }, 632 | }, 633 | }, 634 | { 635 | Name: "expr", 636 | Aliases: []string{"e"}, 637 | Usage: "Test 'govaluate' expression (See Checks 'If')", 638 | ArgsUsage: "expression", 639 | Action: mainExpr, 640 | }, 641 | { 642 | Name: "test", 643 | Aliases: []string{"t"}, 644 | Usage: "Test any Probe on a any Host", 645 | ArgsUsage: "host probe", 646 | Description: "use Name or filename.toml (without path) for host and probe (disabled or not, targeted or not)", 647 | Action: mainTest, 648 | Flags: []cli.Flag{ 649 | cli.BoolFlag{ 650 | Name: "no-color", 651 | Usage: "disable color output ", 652 | }, 653 | }, 654 | }, 655 | } 656 | 657 | app.Run(os.Args) 658 | } 659 | -------------------------------------------------------------------------------- /pid.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | "strconv" 9 | "strings" 10 | "syscall" 11 | ) 12 | 13 | // PIDFile stores (few) informations about a PID file 14 | type PIDFile struct { 15 | Path string 16 | } 17 | 18 | func checkPIDFileExists(path string) error { 19 | if pidByte, err := 
// pidIsRunning reports whether a process with the given pid seems to exist.
//
// The process is probed by sending it signal 0. It is considered gone when
// the process cannot be found, or when the probe fails with one of the two
// "process does not exist anymore" messages. Any other outcome (including
// any other error) is reported as "running".
func pidIsRunning(pid int) bool {
	proc, findErr := os.FindProcess(pid)
	if findErr != nil {
		return false
	}

	probeErr := proc.Signal(syscall.Signal(0))
	if probeErr == nil {
		return true
	}

	switch probeErr.Error() {
	case "no such process", "os: process already finished":
		return false
	}
	return true
}
| Delay time.Duration 27 | Timeout time.Duration 28 | Arguments string 29 | Defaults map[string]interface{} 30 | Checks []*Check 31 | RunIf *govaluate.EvaluableExpression 32 | } 33 | 34 | // MissingDefaults return a slice with names of defaults used in Check 'If' 35 | // expressions and Probe script arguments. The slice length is 0 if no 36 | // missing default were found. 37 | func (probe *Probe) MissingDefaults() []string { 38 | missing := make(map[string]bool) 39 | 40 | for _, check := range probe.Checks { 41 | for _, name := range check.If.Vars() { 42 | if IsAllUpper(name) { 43 | continue 44 | } 45 | if _, ok := probe.Defaults[name]; ok != true { 46 | missing[name] = true 47 | } 48 | } 49 | } 50 | 51 | vars := StringFindVariables(probe.Arguments) 52 | for _, name := range vars { 53 | if _, ok := probe.Defaults[name]; ok != true { 54 | missing[name] = true 55 | } 56 | } 57 | 58 | // map to slice: 59 | var missSlice []string 60 | for key := range missing { 61 | missSlice = append(missSlice, key) 62 | } 63 | 64 | return missSlice 65 | } 66 | -------------------------------------------------------------------------------- /run.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | // Run is a list of Tasks on Host, including task results 9 | type Run struct { 10 | Host *Host 11 | Tasks []*Task 12 | StartTime time.Time 13 | Duration time.Duration 14 | DialDuration time.Duration 15 | TaskResults []*TaskResult 16 | Errors []error 17 | } 18 | 19 | // Dump prints Run informations on the screen for debugging purposes 20 | func (run *Run) Dump() { 21 | fmt.Printf("-\n") 22 | fmt.Printf("- host: %s\n", run.Host.Name) 23 | fmt.Printf("- %d task(s)\n", len(run.Tasks)) 24 | fmt.Printf("- start: %s\n", run.StartTime) 25 | fmt.Printf("- duration: %s\n", run.Duration) 26 | fmt.Printf("- ssh dial duration: %s\n", run.DialDuration) 27 | for _, err := range run.Errors { 28 | 
fmt.Printf("-e %s\n", err) 29 | } 30 | for _, res := range run.TaskResults { 31 | fmt.Printf("-- task probe: %s\n", res.Task.Probe.Name) 32 | fmt.Printf("-- start time: %s\n", res.StartTime) 33 | fmt.Printf("-- duration: %s\n", res.Duration) 34 | fmt.Printf("-- exit status: %d\n", res.ExitStatus) 35 | fmt.Printf("-- next task run: %s\n", res.Task.NextRun) 36 | for key, val := range res.Values { 37 | fmt.Printf("-v- '%s' = '%s'\n", key, val) 38 | } 39 | for _, err := range res.Errors { 40 | fmt.Printf("-e- %s\n", err) 41 | } 42 | for _, check := range res.FailedChecks { 43 | fmt.Printf("-F- %s\n", check.Desc) 44 | } 45 | for _, log := range res.Logs { 46 | fmt.Printf("-l- %s\n", log) 47 | } 48 | } 49 | } 50 | 51 | func (run *Run) addError(err error) { 52 | Info.Printf("Run error: %s (host '%s')", err, run.Host.Name) 53 | run.Errors = append(run.Errors, err) 54 | } 55 | 56 | func (run *Run) currentTaskResult() *TaskResult { 57 | if len(run.TaskResults) == 0 { 58 | return nil 59 | } 60 | return run.TaskResults[len(run.TaskResults)-1] 61 | } 62 | 63 | func (run *Run) totalErrorCount() int { 64 | total := len(run.Errors) 65 | for _, taskResult := range run.TaskResults { 66 | total += len(taskResult.Errors) 67 | total += len(taskResult.FailedChecks) 68 | } 69 | return total 70 | } 71 | 72 | func (run *Run) totalTaskResultErrorCount() int { 73 | total := 0 74 | for _, taskResult := range run.TaskResults { 75 | total += len(taskResult.Errors) 76 | } 77 | return total 78 | } 79 | 80 | // ReSchedule will force all Run tasks to run on next time step 81 | func (run *Run) ReSchedule() { 82 | for _, task := range run.Tasks { 83 | task.NextRun = task.PrevRun 84 | } 85 | Info.Printf("re-scheduling all tasks for '%s'\n", run.Host.Name) 86 | } 87 | 88 | // ReScheduleFailedTasks will force all Run failed tasks to run on next time step 89 | func (run *Run) ReScheduleFailedTasks() { 90 | for _, task := range run.Tasks { 91 | for _, cf := range currentFails { 92 | if cf.RelatedTask == 
task || cf.RelatedTTask == task { 93 | task.ReSchedule(time.Now()) 94 | Info.Printf("re-scheduling task '%s'\n", task.Probe.Name) 95 | } 96 | } 97 | } 98 | } 99 | 100 | // DoChecks will evaluate checks on every TaskResult of the Run 101 | func (run *Run) DoChecks() { 102 | for _, taskResult := range run.TaskResults { 103 | taskResult.DoChecks() 104 | } 105 | } 106 | 107 | // Go will execute the Run 108 | func (run *Run) Go() { 109 | const bootstrap = "bash -s --" 110 | 111 | timeout := time.Second * 59 112 | timeoutChan := time.After(timeout) 113 | 114 | run.StartTime = time.Now() 115 | defer func() { 116 | run.Duration = time.Now().Sub(run.StartTime) 117 | }() 118 | 119 | if err := run.Host.Connection.Connect(); err != nil { 120 | run.addError(err) 121 | return 122 | } 123 | defer run.Host.Connection.Close() 124 | 125 | run.DialDuration = time.Now().Sub(run.StartTime) 126 | if run.DialDuration > run.Host.Connection.SSHConnTimeWarn { 127 | run.addError(fmt.Errorf("SSH connection time was too long: %s (ssh_connection_time_warn = %s)", run.DialDuration, run.Host.Connection.SSHConnTimeWarn)) 128 | return 129 | } 130 | 131 | if err := run.preparePipes(); err != nil { 132 | run.addError(err) 133 | return 134 | } 135 | 136 | ended := make(chan int, 1) 137 | 138 | go func() { 139 | if err := run.Host.Connection.Session.Run(bootstrap); err != nil { 140 | run.addError(err) 141 | } 142 | ended <- 1 143 | }() 144 | 145 | select { 146 | case <-ended: 147 | // nice 148 | case <-timeoutChan: 149 | run.addError(fmt.Errorf("timeout for this run, after %s", timeout)) 150 | Trace.Println("run timeout") 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /run_alerts.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "strconv" 6 | ) 7 | 8 | // AlertsForRun creates a currentFail entry for this Run (if not already done) 9 | // and rings corresponding 
alerts 10 | func (run *Run) AlertsForRun() { 11 | var bbuf bytes.Buffer 12 | bbuf.WriteString(run.Host.Name) 13 | // We now limit to one Fail per host, otherwise we may flood 14 | // the user with Errors (ex: "alert, ssh connection 11s", then the same 15 | // with 11.5s, etc). If there's an issue with a host, you have to fix it 16 | // to get the others (if any left), it makes sense. 17 | /*for _, err := range run.Errors { 18 | bbuf.WriteString(err.Error()) 19 | }*/ 20 | hash := MD5Hash(bbuf.String()) 21 | 22 | currentFail := CurrentFailGetAndInc(hash) 23 | currentFail.RelatedHost = run.Host 24 | 25 | if currentFail.FailCount > 1 { 26 | return 27 | } 28 | 29 | message := AlertMessageCreateForRun(AlertBad, run, currentFail) 30 | message.RingAlerts() 31 | } 32 | 33 | // AlertsForTasks creates currentFail entries for each failed TaskResults 34 | // (if not already done) and rings corresponding alerts 35 | func (run *Run) AlertsForTasks() { 36 | for _, taskRes := range run.TaskResults { 37 | if len(taskRes.Errors) > 0 { 38 | var bbuf bytes.Buffer 39 | bbuf.WriteString(run.Host.Name + taskRes.Task.Probe.Name) 40 | for _, err := range taskRes.Errors { 41 | bbuf.WriteString(err.Error()) 42 | } 43 | hash := MD5Hash(bbuf.String()) 44 | 45 | currentFail := CurrentFailGetAndInc(hash) 46 | currentFail.RelatedTTask = taskRes.Task 47 | if currentFail.FailCount > 1 { 48 | return 49 | } 50 | 51 | message := AlertMessageCreateForTaskResult(AlertBad, run, taskRes, currentFail) 52 | message.RingAlerts() 53 | } 54 | } 55 | } 56 | 57 | // AlertsForChecks creates currentFail entries for every FailedChecks of 58 | // every TaskResults (if not already done) and rings corresponding alerts 59 | func (run *Run) AlertsForChecks() { 60 | // Failures 61 | for _, taskRes := range run.TaskResults { 62 | for _, check := range taskRes.FailedChecks { 63 | Info.Printf("task '%s', check '%s' failed (%s)\n", taskRes.Task.Probe.Name, check.Desc, run.Host.Name) 64 | 65 | hash := MD5Hash(run.Host.Name + 
taskRes.Task.Probe.Name + strconv.Itoa(check.Index)) 66 | currentFail := CurrentFailGetAndInc(hash) 67 | currentFail.RelatedTask = taskRes.Task 68 | if currentFail.FailCount != check.NeededFailures { 69 | continue // not yet / already done 70 | } 71 | 72 | message := AlertMessageCreateForCheck(AlertBad, run, taskRes, check, currentFail) 73 | message.RingAlerts() 74 | } 75 | } 76 | 77 | // Successes 78 | for _, taskRes := range run.TaskResults { 79 | for _, check := range taskRes.SuccessfulChecks { 80 | hash := MD5Hash(run.Host.Name + taskRes.Task.Probe.Name + strconv.Itoa(check.Index)) 81 | // we had a failure for that? 82 | if currentFail := CurrentFailGetAndDec(hash); currentFail != nil { 83 | if currentFail.OkCount == check.NeededSuccesses { 84 | Info.Printf("task '%s', check '%s' is now OK (%s)\n", taskRes.Task.Probe.Name, check.Desc, run.Host.Name) 85 | // send the good news (if the bad one was sent) and delete this currentFail 86 | if currentFail.FailCount >= check.NeededFailures { 87 | message := AlertMessageCreateForCheck(AlertGood, run, taskRes, check, currentFail) 88 | message.RingAlerts() 89 | } 90 | CurrentFailDelete(hash) 91 | } 92 | } 93 | } 94 | } 95 | } 96 | 97 | // Alerts checks for Run failures, Task failures and Check 98 | // failures and call corresponding AlertsFor*() functions 99 | func (run *Run) Alerts() { 100 | run.ClearAnyCurrentTasksFails() 101 | 102 | if run.totalErrorCount() == 0 { 103 | run.ClearAnyCurrentRunFails() 104 | run.DoChecks() 105 | if run.totalTaskResultErrorCount() > 0 { 106 | Info.Printf("found some 'tasks' error(s) (post-checks)\n") 107 | run.AlertsForTasks() 108 | } else { 109 | // ideal path, let's see if there's any check errors ? 
110 | run.AlertsForChecks() 111 | } 112 | } else { // run & tasks errors 113 | if len(run.Errors) > 0 { 114 | Info.Printf("found some 'run' error(s)\n") 115 | run.AlertsForRun() 116 | run.ReSchedule() 117 | } else { 118 | Info.Printf("found some 'tasks' error(s)\n") 119 | run.AlertsForTasks() 120 | } 121 | } 122 | 123 | run.ReScheduleFailedTasks() 124 | } 125 | 126 | // ClearAnyCurrentRunFails deletes any currentFail for the Run (same Host) 127 | // and then rings GOOD alerts 128 | func (run *Run) ClearAnyCurrentRunFails() { 129 | for hash, cf := range currentFails { 130 | if cf.RelatedHost == run.Host { 131 | // there was a time when we were only ringing one message 132 | // for the whole host, but it's compliant with UniqueID idea 133 | message := AlertMessageCreateForRun(AlertGood, run, cf) 134 | message.RingAlerts() 135 | CurrentFailDelete(hash) 136 | } 137 | } 138 | } 139 | 140 | // ClearAnyCurrentTasksFails deletes any currentFail for Run Tasks 141 | // and then rings GOOD alerts 142 | func (run *Run) ClearAnyCurrentTasksFails() { 143 | for _, taskRes := range run.TaskResults { 144 | if len(taskRes.Errors) == 0 { 145 | for hash, cf := range currentFails { 146 | if taskRes.Task == cf.RelatedTTask { 147 | message := AlertMessageCreateForTaskResult(AlertGood, run, taskRes, cf) 148 | message.RingAlerts() 149 | CurrentFailDelete(hash) 150 | } 151 | } 152 | } 153 | } 154 | } 155 | -------------------------------------------------------------------------------- /run_streams.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "io" 7 | "os" 8 | "path/filepath" 9 | "strconv" 10 | "strings" 11 | "time" 12 | ) 13 | 14 | func (run *Run) readStdout(std io.Reader, exitStatus chan int) { 15 | scanner := bufio.NewScanner(std) 16 | 17 | for scanner.Scan() { 18 | text := scanner.Text() 19 | result := run.currentTaskResult() 20 | 21 | Trace.Printf("stdout=%s (%s)\n", text, run.Host.Name) 
22 | 23 | if len(text) > 2 && text[0:2] == "__" { 24 | parts := strings.Split(text, "=") 25 | switch parts[0] { 26 | case "__EXIT": 27 | if len(parts) != 2 { 28 | run.addError(fmt.Errorf("Invalid __EXIT: %s", text)) 29 | continue 30 | } 31 | status, err := strconv.Atoi(parts[1]) 32 | if err != nil { 33 | run.addError(fmt.Errorf("Invalid __EXIT value: %s", text)) 34 | continue 35 | } 36 | Trace.Printf("EXIT detected: %s (status %d, %s)\n", text, status, run.Host.Name) 37 | exitStatus <- status 38 | default: 39 | run.addError(fmt.Errorf("Unknown keyword: %s", text)) 40 | } 41 | continue 42 | } 43 | 44 | if len(text) > 1 && text[0:1] == "#" { 45 | result.addLog(text) 46 | continue 47 | } 48 | 49 | sep := strings.Index(text, ":") 50 | 51 | if sep == -1 || sep == 0 { 52 | result.addError(fmt.Errorf("invalid script output: '%s'", text)) 53 | continue 54 | } 55 | 56 | paramName := strings.TrimSpace(text[0:sep]) 57 | if !IsValidTokenName(paramName) { 58 | result.addError(fmt.Errorf("invalid parameter name: '%s' (not a valid token name): '%s'", paramName, text)) 59 | continue 60 | } 61 | if !IsAllUpper(paramName) { 62 | result.addError(fmt.Errorf("invalid parameter name: '%s' (upper case needed): '%s'", paramName, text)) 63 | continue 64 | } 65 | 66 | if _, exists := result.Values[paramName]; exists == true { 67 | result.addError(fmt.Errorf("parameter '%s' defined multiple times", paramName)) 68 | continue 69 | } 70 | 71 | value := strings.TrimSpace(text[sep+1:]) 72 | if len(value) == 0 { 73 | result.addError(fmt.Errorf("empty value for parameter '%s'", paramName)) 74 | continue 75 | } 76 | 77 | result.Values[paramName] = value 78 | } 79 | 80 | if err := scanner.Err(); err != nil { 81 | run.addError(fmt.Errorf("Error reading stdout: %s", err)) 82 | } 83 | } 84 | 85 | func (run *Run) readStderr(std io.Reader) { 86 | scanner := bufio.NewScanner(std) 87 | 88 | for scanner.Scan() { 89 | text := scanner.Text() 90 | file := filepath.Base(run.currentTaskResult().Task.Probe.Script) 
91 | Trace.Printf("stderr=%s\n", text) 92 | run.currentTaskResult().addError(fmt.Errorf("%s, stderr: %s", file, text)) 93 | } 94 | 95 | if err := scanner.Err(); err != nil { 96 | run.addError(fmt.Errorf("Error reading stderr: %s", err)) 97 | return // !!! 98 | } 99 | } 100 | 101 | // scripts -> ssh 102 | func (run *Run) stdinInject(out io.WriteCloser, exitStatus chan int) { 103 | 104 | defer out.Close() 105 | 106 | // "pkill" dependency or Linux "ps"? (ie: not Cygwin) 107 | _, err := out.Write([]byte("export __MAIN_PID=$$\nfunction __kill_subshells() { pkill -TERM -P $__MAIN_PID cat; }\nexport -f __kill_subshells\n")) 108 | if err != nil { 109 | run.addError(fmt.Errorf("Error writing (setup parent bash): %s", err)) 110 | return 111 | } 112 | 113 | for num, task := range run.Tasks { 114 | 115 | var result TaskResult 116 | run.TaskResults = append(run.TaskResults, &result) 117 | result.StartTime = time.Now() 118 | result.Task = task 119 | result.Host = run.Host 120 | result.ExitStatus = -1 121 | result.Values = make(map[string]string) 122 | 123 | var scanner *bufio.Scanner 124 | 125 | file, erro := os.Open(task.Probe.Script) 126 | if erro != nil { 127 | result.addError(fmt.Errorf("Failed to open script: %s", erro)) 128 | continue 129 | } 130 | defer file.Close() 131 | 132 | scanner = bufio.NewScanner(file) 133 | 134 | args := task.Probe.Arguments 135 | params := make(map[string]interface{}) 136 | for key, val := range task.Probe.Defaults { 137 | params[key] = val 138 | } 139 | // … and let's override defaults with host's ones 140 | for key, val := range run.Host.Defaults { 141 | params[key] = val 142 | } 143 | args = StringExpandVariables(args, params) 144 | 145 | // cat is needed to "focus" stdin only on the child bash 146 | str := fmt.Sprintf("cat | __SCRIPT_ID=%d bash -s -- %s ; echo __EXIT=$?\n", num, args) 147 | Trace.Printf("child(%s)=%s", run.Host.Name, str) 148 | 149 | _, err = out.Write([]byte(str)) 150 | if err != nil { 151 | run.addError(fmt.Errorf("Error 
writing (starting child bash): %s", err)) 152 | return 153 | } 154 | 155 | // no newline so we dont change line numbers 156 | _, err = out.Write([]byte("trap __kill_subshells EXIT ; ")) 157 | if err != nil { 158 | run.addError(fmt.Errorf("Error writing (init child bash): %s", err)) 159 | return 160 | } 161 | 162 | for scanner.Scan() { 163 | text := scanner.Text() 164 | Trace.Printf("stdin=%s (%s)\n", text, run.Host.Name) 165 | _, errw := out.Write([]byte(text + "\n")) 166 | if errw != nil { 167 | run.addError(fmt.Errorf("Error writing: %s", errw)) 168 | return 169 | } 170 | } 171 | 172 | Trace.Printf("killing subshell (%s)\n", run.Host.Name) 173 | _, err = out.Write([]byte("__kill_subshells\n")) 174 | if err != nil { 175 | run.addError(fmt.Errorf("Error writing (while killing subshell): %s", err)) 176 | return 177 | } 178 | 179 | if err := scanner.Err(); err != nil { 180 | run.addError(fmt.Errorf("Error scanner: %s", err)) 181 | return 182 | } 183 | 184 | status := <-exitStatus 185 | result.ExitStatus = status 186 | if status != 0 { 187 | result.addError(fmt.Errorf("detected non-zero exit status: %d", status)) 188 | } 189 | 190 | result.Duration = time.Now().Sub(result.StartTime) 191 | if result.Duration > result.Task.Probe.Timeout { 192 | result.addError(fmt.Errorf("task duration was too long (timeout is %s)", result.Task.Probe.Timeout)) 193 | } 194 | } 195 | } 196 | 197 | func (run *Run) preparePipes() error { 198 | exitStatus := make(chan int) 199 | session := run.Host.Connection.Session 200 | 201 | stdin, err := session.StdinPipe() 202 | if err != nil { 203 | return fmt.Errorf("Unable to setup stdin for session: %v", err) 204 | } 205 | go run.stdinInject(stdin, exitStatus) 206 | 207 | stdout, err := session.StdoutPipe() 208 | if err != nil { 209 | return fmt.Errorf("Unable to setup stdout for session: %v", err) 210 | } 211 | go run.readStdout(stdout, exitStatus) 212 | 213 | stderr, err := session.StderrPipe() 214 | if err != nil { 215 | return 
// knownHostHash computes the HMAC-SHA1 of hostname, keyed with the base64
// encoded salt64, and returns it base64 encoded (this is the way ssh hashes
// hostnames in "hashed" .ssh/known_hosts files). An empty string is returned
// when salt64 is not valid base64.
func knownHostHash(hostname string, salt64 string) string {
	salt, err := base64.StdEncoding.DecodeString(salt64)
	if err != nil {
		return ""
	}

	mac := hmac.New(sha1.New, salt)
	mac.Write([]byte(hostname))

	return base64.StdEncoding.EncodeToString(mac.Sum(nil))
}
Implements ssh.HostKeyCallback which is now required due to CVE-2017-3204 76 | // see https://github.com/golang/go/issues/29286 for the ecdsa-sha2-nistp256 part 77 | // ("If ClientConfig.HostKeyAlgorithms is not set, a reasonable default is set for acceptable host key type") 78 | func hostKeyChecker(hostname string, remote net.Addr, key ssh.PublicKey) error { 79 | path := filepath.Join(os.Getenv("HOME"), ".ssh", "known_hosts") 80 | hostKeyCallback, err := knownhosts.New(path) 81 | if err != nil { 82 | return err 83 | } 84 | 85 | err = hostKeyCallback(hostname, remote, key) 86 | if err != nil { 87 | return fmt.Errorf("%s, use ssh client to manually connect to %s (you may have to specify algo: ssh -o HostKeyAlgorithms=ecdsa-sha2-nistp256 …)", err, hostname) 88 | } 89 | return nil 90 | } 91 | 92 | // Old ssh.HostKeyCallback implementation 93 | // We parse $HOME/.ssh/known_hosts and check for a matching key + hostname 94 | // Supported : Hashed hostnames, revoked keys (or any other marker), non-standard ports 95 | // Unsupported yet: patterns (*? wildcards) 96 | // This code is temporary, x/crypto/ssh will probably provide something similar. One day. 
97 | func _hostKeyChecker(hostname string, remote net.Addr, key ssh.PublicKey) error { 98 | path := filepath.Join(os.Getenv("HOME"), ".ssh", "known_hosts") 99 | file, err := os.Open(path) 100 | if err != nil { 101 | return fmt.Errorf("opening '%s': %s", path, err) 102 | } 103 | defer file.Close() 104 | 105 | // remove standard port if given, add square brackets for non-standard ones 106 | hp := strings.Split(hostname, ":") 107 | if len(hp) == 2 { 108 | if hp[1] == "22" { 109 | hostname = hp[0] 110 | } else { 111 | hostname = "[" + hp[0] + "]:" + hp[1] 112 | } 113 | } 114 | 115 | scanner := bufio.NewScanner(file) 116 | for scanner.Scan() { 117 | marker, hosts, hostKey, _, _, err := ssh.ParseKnownHosts(scanner.Bytes()) 118 | if err == io.EOF { 119 | continue 120 | } 121 | if err != nil { 122 | return fmt.Errorf("parsing '%s': %s", path, err) 123 | } 124 | if marker != "" { 125 | continue // @cert-authority or @revoked 126 | } 127 | fmt.Printf("%s VS %s", key.Marshal(), hostKey.Marshal()) 128 | if bytes.Equal(key.Marshal(), hostKey.Marshal()) { 129 | for _, host := range hosts { 130 | if len(host) > 1 && host[0:1] == "|" { 131 | parts := strings.Split(host, "|") 132 | if parts[1] != "1" { 133 | Trace.Printf("'%s': only type 1 is supported for hashed hosts", path) 134 | continue 135 | } 136 | if knownHostHash(hostname, parts[2]) == parts[3] { 137 | Trace.Printf("successfully found a matching key in '%s' for (hashed) '%s'", path, hostname) 138 | return nil 139 | } 140 | } else { 141 | if host == hostname { 142 | Trace.Printf("successfully found a matching key in '%s' for '%s'", path, hostname) 143 | return nil 144 | } 145 | } 146 | } 147 | Info.Printf("searching '%s' in '%s': found a matching key, but not with exact hostname(s): %s (patterns are not supported yet)", hostname, path, strings.Join(hosts, ", ")) 148 | } 149 | } 150 | 151 | return fmt.Errorf("can't find matching key in '%s' for '%s' (try 'ssh %s' to add it?)", path, hostname, hostname) 152 | } 153 | 154 | 
func hostKeyBilndTrustChecker(hostname string, remote net.Addr, key ssh.PublicKey) error { 155 | return nil 156 | } 157 | 158 | // Connect will dial SSH server and open a session 159 | func (connection *Connection) Connect() error { 160 | sshConfig := &ssh.ClientConfig{ 161 | User: connection.User, 162 | Auth: connection.Auths, 163 | } 164 | 165 | if GlobalConfig.SSHBlindTrust == true { 166 | sshConfig.HostKeyCallback = hostKeyBilndTrustChecker 167 | } else { 168 | sshConfig.HostKeyCallback = hostKeyChecker 169 | } 170 | 171 | if len(connection.Ciphers) > 0 { 172 | sshConfig.Config = ssh.Config{ 173 | Ciphers: connection.Ciphers, 174 | } 175 | } 176 | 177 | dial, err := ssh.Dial("tcp", fmt.Sprintf("%s:%d", connection.Host, connection.Port), sshConfig) 178 | Trace.Printf("SSH connection to %s@%s:%d\n", connection.User, connection.Host, connection.Port) 179 | if err != nil { 180 | return fmt.Errorf("Failed to dial: %s", err) 181 | } 182 | connection.Client = dial 183 | 184 | session, err := dial.NewSession() 185 | if err != nil { 186 | return fmt.Errorf("Failed to create session: %s", err) 187 | } 188 | connection.Session = session 189 | 190 | return nil 191 | } 192 | 193 | // PublicKeyFile returns an AuthMethod using a private key file 194 | func PublicKeyFile(file string) ssh.AuthMethod { 195 | buffer, err := ioutil.ReadFile(file) 196 | if err != nil { 197 | return nil 198 | } 199 | 200 | key, err := ssh.ParsePrivateKey(buffer) 201 | if err != nil { 202 | return nil 203 | } 204 | return ssh.PublicKeys(key) 205 | } 206 | 207 | // PublicKeyFilePassPhrase returns an AuthMethod using a private key file 208 | // and a passphrase 209 | func PublicKeyFilePassPhrase(file, passphrase string) ssh.AuthMethod { 210 | buffer, err := ioutil.ReadFile(file) 211 | if err != nil { 212 | return nil 213 | } 214 | 215 | block, _ := pem.Decode(buffer) 216 | private, err := x509.DecryptPEMBlock(block, []byte(passphrase)) 217 | if err != nil { 218 | return nil 219 | } 220 | block.Headers 
= nil 221 | block.Bytes = private 222 | key, err := ssh.ParsePrivateKey(pem.EncodeToMemory(block)) 223 | if err != nil { 224 | return nil 225 | } 226 | return ssh.PublicKeys(key) 227 | } 228 | 229 | // SSHAgent returns an AuthMethod using SSH agent connection. The pubkeyFile 230 | // params restricts the AuthMethod to only one key, so it wont spam the 231 | // SSH server if the agent holds multiple keys. 232 | func SSHAgent(pubkeyFile string) (ssh.AuthMethod, error) { 233 | sshAgent, errd := net.Dial("unix", os.Getenv("SSH_AUTH_SOCK")) 234 | if errd == nil { 235 | agent := agent.NewClient(sshAgent) 236 | 237 | // we'll try every key, then 238 | if pubkeyFile == "" { 239 | return ssh.PublicKeysCallback(agent.Signers), nil 240 | } 241 | 242 | agentSigners, err := agent.Signers() 243 | if err != nil { 244 | return nil, fmt.Errorf("requesting SSH agent key/signer list: %s", err) 245 | } 246 | 247 | buffer, err := ioutil.ReadFile(pubkeyFile) 248 | if err != nil { 249 | return nil, fmt.Errorf("reading public key '%s': %s", pubkeyFile, err) 250 | } 251 | 252 | fields := strings.Fields(string(buffer)) 253 | 254 | if len(fields) < 3 { 255 | return nil, fmt.Errorf("invalid field count for public key '%s'", pubkeyFile) 256 | } 257 | 258 | buffer2, err := base64.StdEncoding.DecodeString(fields[1]) 259 | if err != nil { 260 | return nil, fmt.Errorf("decoding public key '%s': %s", pubkeyFile, err) 261 | } 262 | 263 | key, err := ssh.ParsePublicKey(buffer2) 264 | if err != nil { 265 | return nil, fmt.Errorf("parsing public key '%s': %s", pubkeyFile, err) 266 | } 267 | 268 | for _, potentialSigner := range agentSigners { 269 | if bytes.Compare(key.Marshal(), potentialSigner.PublicKey().Marshal()) == 0 { 270 | Trace.Printf("successfully found %s key in the SSH agent (%s)", pubkeyFile, fields[2]) 271 | cb := func() ([]ssh.Signer, error) { 272 | signers := []ssh.Signer{potentialSigner} 273 | return signers, nil 274 | } 275 | return ssh.PublicKeysCallback(cb), nil 276 | } 277 | } 278 
| return nil, fmt.Errorf("can't find '%s' key in the SSH agent", pubkeyFile) 279 | } 280 | return nil, fmt.Errorf("SSH agent: %v (check SSH_AUTH_SOCK?)", errd) 281 | } 282 | -------------------------------------------------------------------------------- /task.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "time" 6 | ) 7 | 8 | // Task structure holds (mainly timing) informations about a Task 9 | // next and previous execution 10 | type Task struct { 11 | Probe *Probe 12 | //~ LastRun time.Time 13 | //~ RunCount int 14 | //~ RemainingTicks int 15 | NextRun time.Time 16 | PrevRun time.Time 17 | } 18 | 19 | // ReSchedule is used to schedule another run for this 20 | // task in the future 21 | func (task *Task) ReSchedule(val time.Time) { 22 | task.PrevRun = task.NextRun 23 | task.NextRun = val 24 | } 25 | 26 | // Taskable returns true if the task is currently available (see RunIf expression) 27 | func (task *Task) Taskable() (bool, error) { 28 | // no RunIf condition? 
taskable, then 29 | if task.Probe.RunIf == nil { 30 | return true, nil 31 | } 32 | res, err := task.Probe.RunIf.Evaluate(nil) 33 | if err != nil { 34 | return false, fmt.Errorf("%s (run_if expression '%s' probe)", err, task.Probe.Name) 35 | } 36 | if _, ok := res.(bool); ok == false { 37 | return false, fmt.Errorf("'run_if' must return a boolean value (probe '%s')", task.Probe.Name) 38 | } 39 | return res.(bool), nil 40 | } 41 | -------------------------------------------------------------------------------- /task_result.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "regexp" 6 | "strconv" 7 | "time" 8 | ) 9 | 10 | // TaskResult holds informations about Task execution and Check results 11 | type TaskResult struct { 12 | Task *Task 13 | Host *Host 14 | Values map[string]string 15 | ExitStatus int 16 | StartTime time.Time 17 | Duration time.Duration 18 | Logs []string // currently, only output # lines 19 | Errors []error 20 | FailedChecks []*Check 21 | SuccessfulChecks []*Check 22 | } 23 | 24 | func (result *TaskResult) addError(err error) { 25 | Info.Printf("TaskResult error: %s (host '%s')", err, result.Host.Name) 26 | result.Errors = append(result.Errors, err) 27 | } 28 | 29 | func (result *TaskResult) addLog(line string) { 30 | Trace.Printf("TaskResult log: %s (host '%s')", line, result.Host.Name) 31 | result.Logs = append(result.Logs, line) 32 | } 33 | 34 | // DoChecks evaluates every Check in the TaskResult and fills 35 | // FailedChecks and SuccessfulChecks arrays 36 | func (result *TaskResult) DoChecks() { 37 | // build parameter map (with values and defaults) 38 | params := make(map[string]interface{}) 39 | 40 | for key, val := range result.Values { 41 | var err error 42 | if match, _ := regexp.MatchString("^[0-9]+$", val); match == true { 43 | params[key], err = strconv.Atoi(val) 44 | if err != nil { 45 | result.addError(fmt.Errorf("can't convert '%s' to an int (%s)", val, 
err)) 46 | } 47 | continue 48 | } 49 | if match, _ := regexp.MatchString("^[0-9]+\\.[0-9]+$", val); match == true { 50 | params[key], err = strconv.ParseFloat(val, 64) 51 | if err != nil { 52 | result.addError(fmt.Errorf("can't convert '%s' to a float64 (%s)", val, err)) 53 | } 54 | continue 55 | } 56 | // string 57 | params[key] = val 58 | } 59 | 60 | for key, val := range result.Task.Probe.Defaults { 61 | params[key] = val 62 | } 63 | 64 | // … and let's override defaults with host's ones 65 | for key, val := range result.Host.Defaults { 66 | params[key] = val 67 | } 68 | 69 | for _, check := range result.Task.Probe.Checks { 70 | res, err := check.If.Evaluate(params) 71 | Trace.Printf("%s: %t (err: %s)\n", check.Desc, res, err) 72 | if err != nil { 73 | result.addError(fmt.Errorf("%s (expression '%s' in '%s' check)", err, check.If, check.Desc)) 74 | continue 75 | } 76 | if _, ok := res.(bool); ok == false { 77 | result.addError(fmt.Errorf("[[check]] 'if' must return a boolean value (expression '%s' in '%s' check)", check.If, check.Desc)) 78 | continue 79 | } 80 | 81 | if res == true { 82 | result.FailedChecks = append(result.FailedChecks, check) 83 | } else { 84 | result.SuccessfulChecks = append(result.SuccessfulChecks, check) 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /tools.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "crypto/md5" 5 | "encoding/hex" 6 | "fmt" 7 | "regexp" 8 | "strconv" 9 | "strings" 10 | ) 11 | 12 | const stringWordSeparators = "[ \t\n,.;:\\(\\)\\[\\]{}'\"/\\\\!\\?<>@#|*+-=]" 13 | 14 | // IsValidTokenName returns true is argument use only allowed chars for a token 15 | func IsValidTokenName(token string) bool { 16 | match, _ := regexp.MatchString("^[A-Za-z0-9_]+$", token) 17 | return match 18 | } 19 | 20 | // IsAllUpper returns true if string is all uppercase 21 | func IsAllUpper(str string) bool { 22 | 
return str == strings.ToUpper(str) 23 | } 24 | 25 | // MD5Hash will hash input text and return MD5 sum 26 | func MD5Hash(text string) string { 27 | hasher := md5.New() 28 | hasher.Write([]byte(text)) 29 | return hex.EncodeToString(hasher.Sum(nil)) 30 | } 31 | 32 | // InterfaceValueToString converts most interface types to string 33 | func InterfaceValueToString(iv interface{}) string { 34 | switch iv.(type) { 35 | case int: 36 | return fmt.Sprintf("%d", iv.(int)) 37 | case int32: 38 | return fmt.Sprintf("%d", iv.(int32)) 39 | case int64: 40 | return strconv.FormatInt(iv.(int64), 10) 41 | case float32: 42 | return fmt.Sprintf("%f", iv.(float32)) 43 | case float64: 44 | return strconv.FormatFloat(iv.(float64), 'f', -1, 64) 45 | case string: 46 | return iv.(string) 47 | case bool: 48 | return strconv.FormatBool(iv.(bool)) 49 | } 50 | return "INVALID_TYPE" 51 | } 52 | 53 | // StringFindVariables returns a deduplicated slice of all "variables" ($test) 54 | // in the string 55 | func StringFindVariables(str string) []string { 56 | re := regexp.MustCompile("\\$([a-zA-Z0-9_]+)(" + stringWordSeparators + "|$)") 57 | all := re.FindAllStringSubmatch(str, -1) 58 | 59 | // deduplicate using a map 60 | varMap := make(map[string]bool) 61 | for _, v := range all { 62 | varMap[v[1]] = true 63 | } 64 | 65 | // map to slice 66 | res := []string{} 67 | for name := range varMap { 68 | res = append(res, name) 69 | } 70 | return res 71 | } 72 | 73 | // StringExpandVariables expands "variables" ($test, for instance) in str 74 | // and returns a new string 75 | func StringExpandVariables(str string, variables map[string]interface{}) string { 76 | vars := StringFindVariables(str) 77 | for _, v := range vars { 78 | if val, exists := variables[v]; exists == true { 79 | re := regexp.MustCompile("\\$" + v + "(" + stringWordSeparators + "|$)") 80 | str = re.ReplaceAllString(str, InterfaceValueToString(val)+"${1}") 81 | } 82 | } 83 | return str 84 | } 85 | 
--------------------------------------------------------------------------------