├── COPYING ├── COPYING.LESSER ├── README ├── collectors ├── 0 │ ├── dfstat.py │ ├── elasticsearch.py │ ├── hadoop_datanode_jmx.py │ ├── hbase_regionserver_jmx.py │ ├── ifstat.py │ ├── iostat.py │ ├── mysql.py │ ├── netstat.py │ ├── procnettcp.py │ ├── procstats.py │ ├── redis-stats.py │ ├── riak.py │ ├── zfsiostats.py │ └── zfskernstats.py ├── etc │ ├── config.py │ └── mysqlconf.py └── lib │ └── jmx-1.0.jar ├── startstop ├── stumbleupon ├── monitoring │ ├── .gitignore │ ├── Makefile │ └── jmx.java └── tcollector.pp └── tcollector.py /COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 675 | -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 
14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | tcollector is a framework to collect data points and store them in OpenTSDB. 2 | It allows you to write simple collectors that it'll run and monitor. It also 3 | handles the communication with the TSDs. 4 | 5 | For more info, see 6 | 7 | http://www.opentsdb.net/tcollector.html 8 | -------------------------------------------------------------------------------- /collectors/0/dfstat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2010 StumbleUpon, Inc. 4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 
14 | """df disk space and inode counts for TSDB """ 15 | # 16 | # dfstat.py 17 | # 18 | # df.1kblocks.total total size of fs 19 | # df.1kblocks.used blocks used 20 | # df.1kblocks.available blocks available 21 | # df.inodes.total number of inodes 22 | # df.inodes.used number of inodes 23 | # df.inodes.free number of inodes 24 | 25 | # All metrics are tagged with mount= and fstype= 26 | # This makes it easier to exclude stuff like 27 | # tmpfs mounts from disk usage reports. 28 | 29 | # Because tsdb does not like slashes in tags, slashes will 30 | # be replaced by underscores in the mount= tag. In theory 31 | # this could cause problems if you have a mountpoint of 32 | # "/foo/bar/" and "/foo_bar/". 33 | 34 | 35 | import os 36 | import socket 37 | import subprocess 38 | import sys 39 | import time 40 | 41 | 42 | COLLECTION_INTERVAL = 60 # seconds 43 | 44 | def main(): 45 | """dfstats main loop""" 46 | 47 | while True: 48 | ts = int(time.time()) 49 | # 1kblocks 50 | df_proc = subprocess.Popen(["df", "-PlTk"], stdout=subprocess.PIPE) 51 | stdout, _ = df_proc.communicate() 52 | if df_proc.returncode == 0: 53 | for line in stdout.split("\n"): # pylint: disable=E1103 54 | fields = line.split() 55 | # skip header/blank lines 56 | if not line or not fields[2].isdigit(): 57 | continue 58 | # Skip mounts/types we don't care about. 59 | # Most of this stuff is of type tmpfs, but we don't 60 | # want to blacklist all tmpfs since sometimes it's 61 | # used for active filesystems (/var/run, /tmp) 62 | # that we do want to track. 
63 | if fields[1] in ("debugfs", "devtmpfs"): 64 | continue 65 | if fields[6] == "/dev": 66 | continue 67 | # /dev/shm, /lib/init_rw, /lib/modules, etc 68 | #if fields[6].startswith(("/lib/", "/dev/")): # python2.5+ 69 | if fields[6].startswith("/lib/"): 70 | continue 71 | if fields[6].startswith("/dev/"): 72 | continue 73 | 74 | mount = fields[6] 75 | print ("df.1kblocks.total %d %s mount=%s fstype=%s" 76 | % (ts, fields[2], mount, fields[1])) 77 | print ("df.1kblocks.used %d %s mount=%s fstype=%s" 78 | % (ts, fields[3], mount, fields[1])) 79 | print ("df.1kblocks.free %d %s mount=%s fstype=%s" 80 | % (ts, fields[4], mount, fields[1])) 81 | else: 82 | print >> sys.stderr, "df -Pltk returned %r" % df_proc.returncode 83 | 84 | ts = int(time.time()) 85 | # inodes 86 | df_proc = subprocess.Popen(["df", "-PlTi"], stdout=subprocess.PIPE) 87 | stdout, _ = df_proc.communicate() 88 | if df_proc.returncode == 0: 89 | for line in stdout.split("\n"): # pylint: disable=E1103 90 | fields = line.split() 91 | if not line or not fields[2].isdigit(): 92 | continue 93 | 94 | mount = fields[6] 95 | print ("df.inodes.total %d %s mount=%s fstype=%s" 96 | % (ts, fields[2], mount, fields[1])) 97 | print ("df.inodes.used %d %s mount=%s fstype=%s" 98 | % (ts, fields[3], mount, fields[1])) 99 | print ("df.inodes.free %d %s mount=%s fstype=%s" 100 | % (ts, fields[4], mount, fields[1])) 101 | else: 102 | print >> sys.stderr, "df -Plti returned %r" % df_proc.returncode 103 | 104 | sys.stdout.flush() 105 | time.sleep(COLLECTION_INTERVAL) 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /collectors/0/elasticsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2011 StumbleUpon, Inc. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | """ElasticSearch collector""" # Because ES is cool, bonsai cool. 15 | # Tested with ES 0.16.5 and 0.17.x 16 | 17 | import errno 18 | import httplib 19 | try: 20 | import json 21 | except ImportError: 22 | json = None # Handled gracefully in main. Not available by default in <2.6 23 | import socket 24 | import sys 25 | import time 26 | 27 | 28 | COLLECTION_INTERVAL = 15 # seconds 29 | DEFAULT_TIMEOUT = 10.0 # seconds 30 | ES_HOST = "localhost" 31 | ES_PORT = 9200 # TCP port on which ES listens. 
32 | 33 | STATUS_MAP = { 34 | "green": 0, 35 | "yellow": 1, 36 | "red": 2, 37 | } 38 | 39 | 40 | def is_numeric(value): 41 | return isinstance(value, (int, long, float)) 42 | 43 | 44 | def err(msg): 45 | print >>sys.stderr, msg 46 | 47 | 48 | class ESError(RuntimeError): 49 | """Exception raised if we don't get a 200 OK from ElasticSearch.""" 50 | 51 | def __init__(self, resp): 52 | RuntimeError.__init__(self, str(resp)) 53 | self.resp = resp 54 | 55 | 56 | def request(server, uri): 57 | """Does a GET request of the given uri on the given HTTPConnection.""" 58 | server.request("GET", uri) 59 | resp = server.getresponse() 60 | if resp.status != httplib.OK: 61 | raise ESError(resp) 62 | return json.loads(resp.read()) 63 | 64 | 65 | def cluster_health(server): 66 | return request(server, "/_cluster/health") 67 | 68 | 69 | def cluster_state(server): 70 | return request(server, "/_cluster/state" 71 | + "?filter_routing_table=true&filter_metadata=true&filter_blocks=true") 72 | 73 | 74 | def node_stats(server): 75 | return request(server, "/_cluster/nodes/_local/stats") 76 | 77 | 78 | def main(argv): 79 | socket.setdefaulttimeout(DEFAULT_TIMEOUT) 80 | server = httplib.HTTPConnection(ES_HOST, ES_PORT) 81 | try: 82 | server.connect() 83 | except socket.error, (erno, e): 84 | if erno == errno.ECONNREFUSED: 85 | return 13 # No ES running, ask tcollector to not respawn us. 
86 | raise 87 | if json is None: 88 | err("This collector requires the `json' Python module.") 89 | return 1 90 | 91 | nstats = node_stats(server) 92 | cluster_name = nstats["cluster_name"] 93 | nodeid, nstats = nstats["nodes"].popitem() 94 | 95 | ts = None 96 | def printmetric(metric, value, **tags): 97 | if tags: 98 | tags = " " + " ".join("%s=%s" % (name, value) 99 | for name, value in tags.iteritems()) 100 | else: 101 | tags = "" 102 | print ("elasticsearch.%s %d %s cluster=%s%s" 103 | % (metric, ts, value, cluster_name, tags)) 104 | 105 | while True: 106 | ts = int(time.time()) 107 | nstats = node_stats(server) 108 | # Check that the node's identity hasn't changed in the mean time. 109 | if nstats["cluster_name"] != cluster_name: 110 | err("cluster_name changed from %r to %r" 111 | % (cluster_name, nstats["cluster_name"])) 112 | return 1 113 | this_nodeid, nstats = nstats["nodes"].popitem() 114 | if this_nodeid != nodeid: 115 | err("node ID changed from %r to %r" % (nodeid, this_nodeid)) 116 | return 1 117 | 118 | is_master = nodeid == cluster_state(server)["master_node"] 119 | printmetric("is_master", int(is_master)) 120 | if is_master: 121 | ts = int(time.time()) # In case last call took a while. 122 | cstats = cluster_health(server) 123 | for stat, value in cstats.iteritems(): 124 | if stat == "status": 125 | value = STATUS_MAP.get(value, -1) 126 | elif not is_numeric(value): 127 | continue 128 | printmetric("cluster." 
+ stat, value) 129 | 130 | ts = nstats["os"]["timestamp"] / 1000 # ms -> s 131 | indices = nstats["indices"] 132 | printmetric("indices.size", indices["size_in_bytes"]) 133 | printmetric("num_docs", indices["docs"]["num_docs"]) 134 | d = indices["cache"] 135 | printmetric("cache.field.evictions", d["field_evictions"]) 136 | printmetric("cache.field.size", d["field_size_in_bytes"]) 137 | printmetric("cache.filter.count", d["filter_count"]) 138 | printmetric("cache.filter.evictions", d["filter_evictions"]) 139 | printmetric("cache.filter.size", d["filter_size_in_bytes"]) 140 | d = indices["merges"] 141 | printmetric("merges.current", d["current"]) 142 | printmetric("merges.total", d["total"]) 143 | printmetric("merges.total_time", d["total_time_in_millis"] / 1000.) 144 | del indices 145 | process = nstats["process"] 146 | ts = process["timestamp"] / 1000 # ms -> s 147 | open_fds = process.get("open_file_descriptors") # ES 0.17 148 | if open_fds is None: 149 | open_fds = process.get("fd") # ES 0.16 150 | if open_fds is not None: 151 | open_fds = open_fds["total"] 152 | if open_fds is not None: 153 | printmetric("process.open_file_descriptors", open_fds) 154 | d = process["cpu"] 155 | printmetric("process.cpu.percent", d["percent"]) 156 | printmetric("process.cpu.sys", d["sys_in_millis"] / 1000.) 157 | printmetric("process.cpu.user", d["user_in_millis"] / 1000.) 
158 | d = process["mem"] 159 | printmetric("process.mem.resident", d["resident_in_bytes"]) 160 | printmetric("process.mem.shared", d["share_in_bytes"]) 161 | printmetric("process.mem.total_virtual", d["total_virtual_in_bytes"]) 162 | del process 163 | jvm = nstats["jvm"] 164 | ts = jvm["timestamp"] / 1000 # ms -> s 165 | d = jvm["mem"] 166 | printmetric("jvm.mem.heap_used", d["heap_used_in_bytes"]) 167 | printmetric("jvm.mem.heap_committed", d["heap_committed_in_bytes"]) 168 | printmetric("jvm.mem.non_heap_used", d["non_heap_used_in_bytes"]) 169 | printmetric("jvm.mem.non_heap_committed", d["non_heap_committed_in_bytes"]) 170 | d = jvm["threads"] 171 | printmetric("jvm.threads.count", d["count"]) 172 | printmetric("jvm.threads.peak_count", d["peak_count"]) 173 | for gc, d in jvm["gc"]["collectors"].iteritems(): 174 | printmetric("jvm.gc.collection_count", d["collection_count"], gc=gc) 175 | printmetric("jvm.gc.collection_time", 176 | d["collection_time_in_millis"] / 1000., gc=gc) 177 | del jvm 178 | del d 179 | for stat, value in nstats["network"]["tcp"].iteritems(): 180 | if is_numeric(value): 181 | printmetric("network.tcp." + stat, value) 182 | for stat, value in nstats["transport"].iteritems(): 183 | if is_numeric(value): 184 | printmetric("transport." + stat, value) 185 | # New in ES 0.17: 186 | for stat, value in nstats.get("http", {}).iteritems(): 187 | if is_numeric(value): 188 | printmetric("http." + stat, value) 189 | del nstats 190 | time.sleep(COLLECTION_INTERVAL) 191 | 192 | 193 | if __name__ == "__main__": 194 | sys.exit(main(sys.argv)) 195 | -------------------------------------------------------------------------------- /collectors/0/hadoop_datanode_jmx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2012 StumbleUpon, Inc. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | 15 | import os 16 | import pwd 17 | import re 18 | import signal 19 | import subprocess 20 | import sys 21 | import time 22 | 23 | # If this user doesn't exist, we'll exit immediately. 24 | # If we're running as root, we'll drop privileges using this user. 25 | USER = "hadoop" 26 | 27 | # We add those files to the classpath if they exist. 28 | CLASSPATH = [ 29 | "/usr/lib/jvm/java-6-sun/lib/tools.jar", 30 | ] 31 | 32 | # Map certain JVM stats so they are unique and shorter 33 | JMX_SERVICE_RENAMING = { 34 | "GarbageCollector": "datanode.gc", 35 | "OperatingSystem": "datanode.os", 36 | "Threading": "datanode.threads", 37 | } 38 | 39 | IGNORED_METRICS = set(["revision", "hdfsUser", "hdfsDate", "hdfsUrl", "date", 40 | "hdfsRevision", "user", "hdfsVersion", "url", "version", 41 | "NamenodeAddress", "Version", "RpcPort", "HttpPort", 42 | # These are useless as-is because they represent the 43 | # thread that's dedicated to serving JMX RPCs. 44 | "CurrentThreadCpuTime", "CurrentThreadUserTime", 45 | # List of directories used by the DataNode. 46 | "StorageInfo", 47 | "VolumeInfo", 48 | ]) 49 | 50 | # How many times, maximum, will we attempt to restart the JMX collector. 51 | # If we reach this limit, we'll exit with an error. 52 | MAX_RESTARTS = 10 53 | 54 | TOP = False # Set to True when we want to terminate. 
55 | RETVAL = 0 # Return value set by signal handler. 56 | 57 | 58 | def drop_privileges(): 59 | try: 60 | ent = pwd.getpwnam(USER) 61 | except KeyError: 62 | print >>sys.stderr, "Not running, user '%s' doesn't exist" % USER 63 | sys.exit(13) 64 | 65 | if os.getuid() != 0: 66 | return 67 | 68 | os.setgid(ent.pw_gid) 69 | os.setuid(ent.pw_uid) 70 | 71 | 72 | def kill(proc): 73 | """Kills the subprocess given in argument.""" 74 | # Clean up after ourselves. 75 | proc.stdout.close() 76 | rv = proc.poll() 77 | if rv is None: 78 | os.kill(proc.pid, 15) 79 | rv = proc.poll() 80 | if rv is None: 81 | os.kill(proc.pid, 9) # Bang bang! 82 | rv = proc.wait() # This shouldn't block too long. 83 | print >>sys.stderr, "warning: proc exited %d" % rv 84 | return rv 85 | 86 | 87 | def do_on_signal(signum, func, *args, **kwargs): 88 | """Calls func(*args, **kwargs) before exiting when receiving signum.""" 89 | def signal_shutdown(signum, frame): 90 | print >>sys.stderr, "got signal %d, exiting" % signum 91 | func(*args, **kwargs) 92 | sys.exit(128 + signum) 93 | signal.signal(signum, signal_shutdown) 94 | 95 | 96 | def main(argv): 97 | drop_privileges() 98 | # Build the classpath. 99 | dir = os.path.dirname(sys.argv[0]) 100 | jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar") 101 | if not os.path.exists(jar): 102 | print >>sys.stderr, "WTF?! Can't run, %s doesn't exist" % jar 103 | return 13 104 | classpath = [jar] 105 | for jar in CLASSPATH: 106 | if os.path.exists(jar): 107 | classpath.append(jar) 108 | classpath = ":".join(classpath) 109 | 110 | jmx = subprocess.Popen( 111 | ["java", "-enableassertions", "-enablesystemassertions", # safe++ 112 | "-Xmx64m", # Low RAM limit, to avoid stealing too much from prod. 113 | "-cp", classpath, "com.stumbleupon.monitoring.jmx", 114 | "--watch", "10", "--long", "--timestamp", 115 | "DataNode", # Name of the process. 116 | # The remaining arguments are pairs (mbean_regexp, attr_regexp). 
117 | # The first regexp is used to match one or more MBeans, the 2nd 118 | # to match one or more attributes of the MBeans matched. 119 | "hadoop", "", # All HBase / hadoop metrics. 120 | "Threading", "Count|Time$", # Number of threads and CPU time. 121 | "OperatingSystem", "OpenFile", # Number of open files. 122 | "GarbageCollector", "Collection", # GC runs and time spent GCing. 123 | ], stdout=subprocess.PIPE, bufsize=1) 124 | do_on_signal(signal.SIGINT, kill, jmx) 125 | do_on_signal(signal.SIGPIPE, kill, jmx) 126 | do_on_signal(signal.SIGTERM, kill, jmx) 127 | try: 128 | prev_timestamp = 0 129 | while True: 130 | line = jmx.stdout.readline() 131 | 132 | if not line and jmx.poll() is not None: 133 | break # Nothing more to read and process exited. 134 | elif len(line) < 4: 135 | print >>sys.stderr, "invalid line (too short): %r" % line 136 | continue 137 | 138 | timestamp, metric, value, mbean = line.split("\t", 3) 139 | # Sanitize the timestamp. 140 | try: 141 | timestamp = int(timestamp) 142 | if timestamp < time.time() - 600: 143 | raise ValueError("timestamp too old: %d" % timestamp) 144 | if timestamp < prev_timestamp: 145 | raise ValueError("timestamp out of order: prev=%d, new=%d" 146 | % (prev_timestamp, timestamp)) 147 | except ValueError, e: 148 | print >>sys.stderr, ("Invalid timestamp on line: %r -- %s" 149 | % (line, e)) 150 | continue 151 | prev_timestamp = timestamp 152 | 153 | if metric in IGNORED_METRICS: 154 | continue 155 | 156 | tags = "" 157 | # The JMX metrics have per-request-type metrics like so: 158 | # metricNameNumOps 159 | # metricNameMinTime 160 | # metricNameMaxTime 161 | # metricNameAvgTime 162 | # Group related metrics together in the same metric name, use tags 163 | # to separate the different request types, so we end up with: 164 | # numOps op=metricName 165 | # avgTime op=metricName 166 | # etc, which makes it easier to graph things with the TSD. 
167 | if metric.endswith("MinTime"): # We don't care about the minimum 168 | continue # time taken by operations. 169 | elif metric.endswith("NumOps"): 170 | tags = " op=" + metric[:-6] 171 | metric = "numOps" 172 | elif metric.endswith("AvgTime"): 173 | tags = " op=" + metric[:-7] 174 | metric = "avgTime" 175 | elif metric.endswith("MaxTime"): 176 | tags = " op=" + metric[:-7] 177 | metric = "maxTime" 178 | 179 | # mbean is of the form "domain:key=value,...,foo=bar" 180 | # some tags can have spaces, so we need to fix that. 181 | mbean_domain, mbean_properties = mbean.rstrip().replace(" ", "_").split(":", 1) 182 | if mbean_domain not in ("hadoop", "java.lang"): 183 | print >>sys.stderr, ("Unexpected mbean domain = %r on line %r" 184 | % (mbean_domain, line)) 185 | continue 186 | mbean_properties = dict(prop.split("=", 1) 187 | for prop in mbean_properties.split(",")) 188 | if mbean_domain == "hadoop": 189 | # jmx_service is HBase by default, but we can also have 190 | # RegionServer or Replication and such. 191 | jmx_service = mbean_properties.get("service", "HBase") 192 | if jmx_service == "HBase": 193 | jmx_service = "regionserver" 194 | elif mbean_domain == "java.lang": 195 | jmx_service = mbean_properties.pop("type", "jvm") 196 | if mbean_properties: 197 | tags += " " + " ".join(k + "=" + v for k, v in 198 | mbean_properties.iteritems()) 199 | else: 200 | assert 0, "Should never be here" 201 | 202 | jmx_service = JMX_SERVICE_RENAMING.get(jmx_service, jmx_service) 203 | metric = jmx_service.lower() + "." + metric 204 | 205 | sys.stdout.write("hadoop.%s %d %s%s\n" 206 | % (metric, timestamp, value, tags)) 207 | sys.stdout.flush() 208 | finally: 209 | kill(jmx) 210 | time.sleep(300) 211 | return 0 # Ask the tcollector to re-spawn us. 
212 | 213 | 214 | if __name__ == "__main__": 215 | sys.exit(main(sys.argv)) 216 | -------------------------------------------------------------------------------- /collectors/0/hbase_regionserver_jmx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2010 StumbleUpon, Inc. 4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | 15 | import os 16 | import pwd 17 | import re 18 | import signal 19 | import subprocess 20 | import sys 21 | import time 22 | import traceback 23 | 24 | # If this user doesn't exist, we'll exit immediately. 25 | # If we're running as root, we'll drop privileges using this user. 26 | USER = "hadoop" 27 | 28 | # We add those files to the classpath if they exist. 29 | CLASSPATH = [ 30 | "/usr/lib/jvm/java-6-sun/lib/tools.jar", 31 | ] 32 | 33 | # We shorten certain strings to avoid excessively long metric names. 
# Maps JMX service names to the short names we use in metric names,
# to keep metric names from getting excessively long.
JMX_SERVICE_RENAMING = {
    "GarbageCollector": "gc",
    "OperatingSystem": "os",
    "Threading": "threads",
    # New in 0.92.1, from HBASE-5325:
    "org.apache.hbase": "hbase",
}


def drop_privileges():
    """Exits if USER doesn't exist, drops privileges to USER if we're root."""
    try:
        ent = pwd.getpwnam(USER)
    except KeyError:
        sys.stderr.write("Not running, user '%s' doesn't exist\n" % USER)
        sys.exit(13)  # Ask tcollector not to respawn us.

    if os.getuid() != 0:
        return  # Not root: nothing to drop.

    os.setgid(ent.pw_gid)
    os.setuid(ent.pw_uid)


def kill(proc):
    """Kills the subprocess given in argument, escalating SIGTERM to SIGKILL.

    Returns the process's exit status.
    """
    # Clean up after ourselves.
    proc.stdout.close()
    rv = proc.poll()
    if rv is None:
        os.kill(proc.pid, 15)
        rv = proc.poll()
        if rv is None:
            os.kill(proc.pid, 9)  # Bang bang!
            rv = proc.wait()  # This shouldn't block too long.
    sys.stderr.write("warning: proc exited %d\n" % rv)
    return rv


def do_on_signal(signum, func, *args, **kwargs):
    """Calls func(*args, **kwargs) before exiting when receiving signum."""
    def signal_shutdown(signum, frame):
        sys.stderr.write("got signal %d, exiting\n" % signum)
        func(*args, **kwargs)
        sys.exit(128 + signum)
    signal.signal(signum, signal_shutdown)


def main(argv):
    """Forks the JMX helper and turns its output into TSD metric lines."""
    drop_privileges()
    # Build the classpath.
    dirname = os.path.dirname(sys.argv[0])
    jar = os.path.normpath(dirname + "/../lib/jmx-1.0.jar")
    if not os.path.exists(jar):
        sys.stderr.write("WTF?! Can't run, %s doesn't exist\n" % jar)
        return 13
    classpath = [jar]
    for jar in CLASSPATH:
        if os.path.exists(jar):
            classpath.append(jar)
    classpath = ":".join(classpath)

    jmx = subprocess.Popen(
        ["java", "-enableassertions", "-enablesystemassertions",  # safe++
         "-Xmx64m",  # Low RAM limit, to avoid stealing too much from prod.
         "-cp", classpath, "com.stumbleupon.monitoring.jmx",
         "--watch", "10", "--long", "--timestamp",
         "HRegionServer",  # Name of the process.
         # The remaining arguments are pairs (mbean_regexp, attr_regexp).
         # The first regexp is used to match one or more MBeans, the 2nd
         # to match one or more attributes of the MBeans matched.
         "hadoop", "",  # All HBase / hadoop metrics.
         "Threading", "Count|Time$",  # Number of threads and CPU time.
         "OperatingSystem", "OpenFile",  # Number of open files.
         "GarbageCollector", "Collection",  # GC runs and time spent GCing.
        ], stdout=subprocess.PIPE, bufsize=1)
    do_on_signal(signal.SIGINT, kill, jmx)
    do_on_signal(signal.SIGPIPE, kill, jmx)
    do_on_signal(signal.SIGTERM, kill, jmx)
    try:
        prev_timestamp = 0
        while True:
            line = jmx.stdout.readline()

            if not line and jmx.poll() is not None:
                break  # Nothing more to read and process exited.
            elif len(line) < 4:
                sys.stderr.write("invalid line (too short): %r\n" % line)
                continue

            try:
                timestamp, metric, value, mbean = line.split("\t", 3)
            except ValueError:
                # Temporary workaround for jmx.jar not printing these lines we
                # don't care about anyway properly.
                if "java.lang.String" not in line:
                    sys.stderr.write("Can't split line: %r\n" % line)
                continue

            # Sanitize the timestamp.
            try:
                timestamp = int(timestamp)
                if timestamp < time.time() - 600:
                    raise ValueError("timestamp too old: %d" % timestamp)
                if timestamp < prev_timestamp:
                    raise ValueError("timestamp out of order: prev=%d, new=%d"
                                     % (prev_timestamp, timestamp))
            except ValueError as e:
                sys.stderr.write("Invalid timestamp on line: %r -- %s\n"
                                 % (line, e))
                continue
            prev_timestamp = timestamp

            tags = ""
            # The JMX metrics have per-request-type metrics like so:
            #   metricNameNumOps
            #   metricNameMinTime
            #   metricNameMaxTime
            #   metricNameAvgTime
            # Group related metrics together in the same metric name, use tags
            # to separate the different request types, so we end up with:
            #   numOps op=metricName
            #   avgTime op=metricName
            # etc, which makes it easier to graph things with the TSD.
            if metric.endswith("MinTime"):  # We don't care about the minimum
                continue                    # time taken by operations.
            elif metric.endswith("NumOps"):
                tags = " op=" + metric[:-6]
                metric = "numOps"
            elif metric.endswith("AvgTime"):
                tags = " op=" + metric[:-7]
                metric = "avgTime"
            elif metric.endswith("MaxTime"):
                tags = " op=" + metric[:-7]
                metric = "maxTime"

            # mbean is of the form "domain:key=value,...,foo=bar"
            mbean_domain, mbean_properties = mbean.rstrip().split(":", 1)
            if mbean_domain not in ("hadoop", "java.lang"):
                sys.stderr.write("Unexpected mbean domain = %r on line %r\n"
                                 % (mbean_domain, line))
                continue
            mbean_properties = dict(prop.split("=", 1)
                                    for prop in mbean_properties.split(","))
            if mbean_domain == "hadoop":
                # jmx_service is HBase by default, but we can also have
                # RegionServer or Replication and such.
                jmx_service = mbean_properties.get("service", "HBase")
                if jmx_service == "HBase":
                    jmx_service = "regionserver"
            elif mbean_domain == "java.lang":
                jmx_service = mbean_properties.pop("type", "jvm")
                if mbean_properties:
                    tags += " " + " ".join(k + "=" + v for k, v in
                                           mbean_properties.items())
            else:
                assert 0, "Should never be here"

            # Hack.  Right now, the RegionServer is printing stats for its own
            # replication queue, but when another RegionServer dies, this one
            # may take over the replication queue of the dead one.  When this
            # happens, we'd get the same metrics multiple times, because
            # internally the RegionServer has multiple queues (although only
            # one is actively used, the other ones get flushed and discarded).
            # The following `if' discards stats for "recovered" replication
            # queues, because there is no sensible tag we could use in TSDB to
            # tell the queues apart.  A normal queue's name looks like
            #   "ReplicationSource for <N>"
            # whereas a recovered queue of a dead RegionServer looks like
            #   "ReplicationSource for <N>-<hostname>%2C<port>%2C<timestamp>"
            # so we discriminate those entries by looking for a dash.
            if jmx_service == "Replication":
                attr_name = mbean_properties.get("name", "")
                if "ReplicationSource" in attr_name and "-" in attr_name:
                    continue

            raw_service = jmx_service  # Pre-rename value, for diagnostics.
            jmx_service = JMX_SERVICE_RENAMING.get(jmx_service, jmx_service)
            jmx_service, repl_count = re.subn("[^a-zA-Z0-9]+", ".",
                                              jmx_service)
            if repl_count:
                # Bug fix: this used to print mbean_properties["service"],
                # which raises KeyError for java.lang beans (no "service" key
                # -- their "type" key was pop()'ed above).  Report the
                # pre-rename service name instead.
                sys.stderr.write("Warning: found malformed"
                                 " jmx_service=%r on line=%r\n"
                                 % (raw_service, line))
            metric = jmx_service.lower() + "." + metric

            sys.stdout.write("hbase.%s %d %s%s\n"
                             % (metric, timestamp, value, tags))
            sys.stdout.flush()
    finally:
        kill(jmx)
        time.sleep(300)
    return 0  # Ask the tcollector to re-spawn us.


if __name__ == "__main__":
    sys.exit(main(sys.argv))
def main():
    """ifstat main loop: prints /proc/net/dev counters for ethN every 15s."""
    interval = 15

    # We just care about ethN interfaces.  We specifically want to avoid
    # bond interfaces, because interface stats are still kept on the child
    # interfaces when you bond.  By skipping bond we avoid double counting.
    # Compiled once here instead of re-matching the pattern string on
    # every line of every iteration.
    ethn_re = re.compile(r"\s+(eth\d+):(.*)")

    f_netdev = open("/proc/net/dev", "r")

    while True:
        f_netdev.seek(0)  # Rewind the proc file instead of re-opening it.
        ts = int(time.time())
        for line in f_netdev:
            m = ethn_re.match(line)
            if not m:
                continue
            stats = m.group(2).split(None)
            # Fields 0-7 are the receive side, 8-15 the transmit side.
            for i in range(8):
                if FIELDS[i]:
                    print ("proc.net.%s %d %s iface=%s direction=in"
                           % (FIELDS[i], ts, stats[i], m.group(1)))
                    print ("proc.net.%s %d %s iface=%s direction=out"
                           % (FIELDS[i], ts, stats[i+8], m.group(1)))

        sys.stdout.flush()
        time.sleep(interval)

if __name__ == "__main__":
    main()
22 | # The fields (from iostats.txt) are mainly rate counters 23 | # (either number of operations or number of milliseconds doing a 24 | # particular operation), so let's just let TSD do the rate 25 | # calculation for us. 26 | # 27 | # /proc/diskstats has 11 stats for a given device 28 | # these are all rate counters except ios_in_progress 29 | # .read_requests Number of reads completed 30 | # .read_merged Number of reads merged 31 | # .read_sectors Number of sectors read 32 | # .msec_read Time in msec spent reading 33 | # .write_requests Number of writes completed 34 | # .write_merged Number of writes merged 35 | # .write_sectors Number of sectors written 36 | # .msec_write Time in msec spent writing 37 | # .ios_in_progress Number of I/O operations in progress 38 | # .msec_total Time in msec doing I/O 39 | # .msec_weighted_total Weighted time doing I/O (multiplied by ios_in_progress) 40 | 41 | # in 2.6.25 and later, by-partition stats are reported same as disks 42 | # in 2.6 before 2.6.25, partitions have 4 stats per partition 43 | # .read_issued 44 | # .read_sectors 45 | # .write_issued 46 | # .write_sectors 47 | # For partitions, these *_issued are counters collected before 48 | # requests are merged, so aren't the same as *_requests (which is 49 | # post-merge, which more closely represents the actual 50 | # number of disk transactions). 51 | 52 | # Given that diskstats provides both per-disk and per-partition data, 53 | # for TSDB purposes we want to put them under different metrics (versus 54 | # the same metric and different tags). Otherwise, if you look at a 55 | # given metric, the data for a given box will be double-counted, since 56 | # a given operation will increment both the disk series and the 57 | # partition series. To fix this, we output by-disk data to iostat.disk.* 58 | # and by-partition data to iostat.part.*.

# TODO: Add additional tags to map partitions/disks back to mount
# points/swap so you can (for example) plot just swap partition
# activity or /var/lib/mysql partition activity no matter which
# disk/partition this happens to be.  This is nontrivial, especially
# when you have to handle mapping of /dev/mapper to dm-N, pulling out
# swap partitions from /proc/swaps, etc.

# TODO: add some generated stats from iostat -x like svctm, await,
# %util.  These need to pull in cpu idle counters from /proc.


import os
import socket
import sys
import time

COLLECTION_INTERVAL = 60  # seconds

# Docs come from the Linux kernel's Documentation/iostats.txt
FIELDS_DISK = (
    "read_requests",        # Total number of reads completed successfully.
    "read_merged",          # Adjacent read requests merged in a single req.
    "read_sectors",         # Total number of sectors read successfully.
    "msec_read",            # Total number of ms spent by all reads.
    "write_requests",       # total number of writes completed successfully.
    "write_merged",         # Adjacent write requests merged in a single req.
    "write_sectors",        # total number of sectors written successfully.
    "msec_write",           # Total number of ms spent by all writes.
    "ios_in_progress",      # Number of actual I/O requests currently in flight.
    "msec_total",           # Amount of time during which ios_in_progress >= 1.
    "msec_weighted_total",  # Measure of recent I/O completion time and backlog.
)

FIELDS_PART = ("read_issued",
               "read_sectors",
               "write_issued",
               "write_sectors",
              )


def main():
    """iostats main loop: prints per-disk / per-partition counters forever."""
    f_diskstats = open("/proc/diskstats", "r")

    while True:
        f_diskstats.seek(0)  # Rewind the proc file instead of re-opening it.
        ts = int(time.time())
        for line in f_diskstats:
            # maj, min, devicename, [list of stats, see above]
            values = line.split(None)
            # shortcut the deduper and just skip disks that
            # haven't done a single read.  This eliminates a bunch
            # of loopback, ramdisk, and cdrom devices but still
            # lets us report on the rare case that we actually use
            # a ramdisk.
            if values[3] == "0":
                continue

            # Heuristic: a minor number that is a multiple of 16 (with a
            # major > 1) is reported as a whole disk, everything else as
            # a partition.
            if int(values[1]) % 16 == 0 and int(values[0]) > 1:
                metric = "iostat.disk."
            else:
                metric = "iostat.part."

            # Sometimes there can be a slash in the device name, see bug #8.
            # TODO(tsuna): Remove the substitution once TSD allows `/' in tags.
            device = values[2].replace("/", "_")
            if len(values) == 14:
                # full stats line
                for i in range(11):
                    print ("%s%s %d %s dev=%s"
                           % (metric, FIELDS_DISK[i], ts, values[i+3],
                              device))
            elif len(values) == 7:
                # partial stats line (pre-2.6.25 partitions)
                for i in range(4):
                    print ("%s%s %d %s dev=%s"
                           % (metric, FIELDS_PART[i], ts, values[i+3],
                              device))
            else:
                # write() instead of `print >>': works on Python 2 and 3,
                # and avoids the stray separator space before `line'.
                sys.stderr.write("Cannot parse /proc/diskstats line: %s"
                                 % line)
                continue

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)



if __name__ == "__main__":
    main()
COLLECTION_INTERVAL = 15  # seconds
CONNECT_TIMEOUT = 2  # seconds
# How frequently we try to find new databases.
DB_REFRESH_INTERVAL = 60  # seconds
# Usual locations where to find the default socket file.
DEFAULT_SOCKFILES = set([
    "/tmp/mysql.sock",              # MySQL's own default.
    "/var/lib/mysql/mysql.sock",    # RH-type / RPM systems.
    "/var/run/mysqld/mysqld.sock",  # Debian-type systems.
])
# Directories under which to search additional socket files.
SEARCH_DIRS = [
    "/var/lib/mysql",
]

def err(msg):
    """Prints the given message on stderr."""
    sys.stderr.write(msg + "\n")

class DB(object):
    """Represents a MySQL server (as we can monitor more than 1 MySQL)."""

    def __init__(self, sockfile, dbname, db, cursor, version):
        """Constructor.

        Args:
            sockfile: Path to the socket file.
            dbname: Name of the database for that socket file.
            db: A MySQLdb connection opened to that socket file.
            cursor: A cursor acquired from that connection.
            version: What version is this MySQL running (from `SELECT VERSION()').
        """
        self.sockfile = sockfile
        self.dbname = dbname
        self.db = db
        self.cursor = cursor
        self.version = version
        self.master = None
        self.slave_bytes_executed = None
        self.relay_bytes_relayed = None

        # Parse out the major/medium version numbers; leave them at 0 when
        # the version string is something we can't make sense of.
        version = version.split(".")
        try:
            self.major = int(version[0])
            self.medium = int(version[1])
        except (ValueError, IndexError):
            self.major = self.medium = 0

    def __str__(self):
        return "DB(%r, %r, version=%r)" % (self.sockfile, self.dbname,
                                           self.version)

    def __repr__(self):
        return self.__str__()

    def isShowGlobalStatusSafe(self):
        """Returns whether or not SHOW GLOBAL STATUS is safe to run."""
        # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it
        # locks the entire database for too long and severely impacts traffic.
        return self.major > 5 or (self.major == 5 and self.medium >= 1)

    def query(self, sql):
        """Executes the given SQL statement and returns a sequence of rows."""
        assert self.cursor, "%s already closed?" % (self,)
        try:
            self.cursor.execute(sql)
        except MySQLdb.OperationalError as e:
            if e.args[0] != 2006:  # "MySQL server has gone away"
                raise
            self._reconnect()
            # Bug fix: re-issue the query on the fresh cursor.  Previously
            # we fell through straight to fetchall() on a cursor that had
            # never executed anything.
            self.cursor.execute(sql)
        return self.cursor.fetchall()

    def close(self):
        """Closes the connection to this MySQL server."""
        if self.cursor:
            self.cursor.close()
            self.cursor = None
        if self.db:
            self.db.close()
            self.db = None

    def _reconnect(self):
        """Reconnects to this MySQL server."""
        self.close()
        self.db = mysql_connect(self.sockfile)
        self.cursor = self.db.cursor()


def mysql_connect(sockfile):
    """Connects to the MySQL server using the specified socket file."""
    user, passwd = mysqlconf.get_user_password(sockfile)
    return MySQLdb.connect(unix_socket=sockfile,
                           connect_timeout=CONNECT_TIMEOUT,
                           user=user, passwd=passwd)


def todict(db, row):
    """Transforms a row (returned by DB.query) into a dict keyed by column names.

    Args:
        db: The DB instance from which this row was obtained.
        row: A row as returned by DB.query
    """
    d = {}
    for i, field in enumerate(db.cursor.description):
        column = field[0].lower()  # Lower-case to normalize field names.
        d[column] = row[i]
    return d

def get_dbname(sockfile):
    """Returns the name of the DB based on the path to the socket file."""
    if sockfile in DEFAULT_SOCKFILES:
        return "default"
    m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile)
    if not m:
        err("error: couldn't guess the name of the DB for " + sockfile)
        return None
    return m.group(1)


def is_sockfile(path):
    """Returns whether or not the given path is a socket file."""
    try:
        s = os.stat(path)
    except OSError as e:
        if e.errno == errno.ENOENT:
            return False
        err("warning: couldn't stat(%r): %s" % (path, e.strerror))
        return None
    return s.st_mode & stat.S_IFSOCK == stat.S_IFSOCK


def find_sockfiles():
    """Returns a list of paths to socket files to monitor."""
    paths = []
    # Look for socket files.
    for dir in SEARCH_DIRS:
        if not os.path.isdir(dir):
            continue
        for name in os.listdir(dir):
            subdir = os.path.join(dir, name)
            if not os.path.isdir(subdir):
                continue
            for subname in os.listdir(subdir):
                path = os.path.join(subdir, subname)
                if is_sockfile(path):
                    paths.append(path)
                    break  # We only expect 1 socket file per DB, so get out.
    # Try the default locations.
    for sockfile in DEFAULT_SOCKFILES:
        if not is_sockfile(sockfile):
            continue
        paths.append(sockfile)
    return paths
def find_databases(dbs=None):
    """Returns a map of dbname (string) to DB instances to monitor.

    Args:
        dbs: A map of dbname (string) to DB instances already monitored.
            This map will be modified in place if it's not None.
    """
    sockfiles = find_sockfiles()
    if dbs is None:
        dbs = {}
    for sockfile in sockfiles:
        dbname = get_dbname(sockfile)
        if dbname in dbs:
            continue
        if not dbname:
            continue
        try:
            db = mysql_connect(sockfile)
            cursor = db.cursor()
            cursor.execute("SELECT VERSION()")
        except (EnvironmentError, EOFError, RuntimeError, socket.error,
                MySQLdb.MySQLError) as e:
            err("Couldn't connect to %s: %s" % (sockfile, e))
            continue
        version = cursor.fetchone()[0]
        dbs[dbname] = DB(sockfile, dbname, db, cursor, version)
    return dbs


def now():
    """Returns the current time as an integer Unix timestamp."""
    return int(time.time())


def isyes(s):
    """Returns 1 if the string is "yes" (case-insensitive), 0 otherwise."""
    if s.lower() == "yes":
        return 1
    return 0


def collectInnodbStatus(db):
    """Collects and prints InnoDB stats about the given DB instance."""
    ts = now()
    def printmetric(metric, value, tags=""):
        # print() as a function call works on Python 2 and 3 alike.
        print("mysql.%s %d %s schema=%s%s"
              % (metric, ts, value, db.dbname, tags))

    innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2]
    m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$",
                  innodb_status, re.M)
    if m:  # If we have it, try to use InnoDB's own timestamp.
        ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S")))

    line = None
    def match(regexp):
        # NOTE: closes over `line', which is re-bound by the loop below.
        return re.match(regexp, line)

    for line in innodb_status.split("\n"):
        # SEMAPHORES
        m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)")
        if m:
            printmetric("innodb.oswait_array.reservation_count", m.group(1))
            printmetric("innodb.oswait_array.signal_count", m.group(2))
            continue
        m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)")
        if m:
            printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex")
            printmetric("innodb.locks.rounds", m.group(2), " type=mutex")
            printmetric("innodb.locks.os_waits", m.group(3), " type=mutex")
            continue
        m = match("RW-shared spins (\d+), OS waits (\d+);"
                  " RW-excl spins (\d+), OS waits (\d+)")
        if m:
            printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared")
            printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared")
            printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive")
            printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive")
            continue
        # INSERT BUFFER AND ADAPTIVE HASH INDEX
        # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and
        # the following one can appear multiple times.  I've never seen this.
        # If that happens, we need to aggregate the values here instead of
        # printing them directly.
        m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),")
        if m:
            printmetric("innodb.ibuf.size", m.group(1))
            printmetric("innodb.ibuf.free_list_len", m.group(2))
            printmetric("innodb.ibuf.seg_size", m.group(3))
            continue
        m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges")
        if m:
            printmetric("innodb.ibuf.inserts", m.group(1))
            printmetric("innodb.ibuf.merged_recs", m.group(2))
            printmetric("innodb.ibuf.merges", m.group(3))
            continue
        # ROW OPERATIONS
        m = match("\d+ queries inside InnoDB, (\d+) queries in queue")
        if m:
            printmetric("innodb.queries_queued", m.group(1))
            continue
        m = match("(\d+) read views open inside InnoDB")
        if m:
            printmetric("innodb.opened_read_views", m.group(1))
            continue
        # TRANSACTION
        m = match("History list length (\d+)")
        if m:
            printmetric("innodb.history_list_length", m.group(1))
            continue


def collect(db):
    """Collects and prints stats about the given DB instance."""

    ts = now()
    def printmetric(metric, value, tags=""):
        print("mysql.%s %d %s schema=%s%s"
              % (metric, ts, value, db.dbname, tags))

    has_innodb = False
    if db.isShowGlobalStatusSafe():
        for metric, value in db.query("SHOW GLOBAL STATUS"):
            try:
                if "." in value:
                    value = float(value)
                else:
                    value = int(value)
            except ValueError:
                continue
            metric = metric.lower()
            has_innodb = has_innodb or metric.startswith("innodb")
            printmetric(metric, value)

    if has_innodb:
        collectInnodbStatus(db)

    if has_innodb and False:  # Disabled because it's too expensive for InnoDB.
        waits = {}  # maps a mutex name to the number of waits
        ts = now()
        for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"):
            if not status.startswith("os_waits"):
                continue
            m = re.search("&(\w+)(?:->(\w+))?$", mutex)
            if not m:
                continue
            mutex, kind = m.groups()
            if kind:
                mutex += "." + kind
            wait_count = int(status.split("=", 1)[1])
            waits[mutex] = waits.get(mutex, 0) + wait_count
        for mutex, wait_count in waits.items():
            printmetric("innodb.locks", wait_count, " mutex=" + mutex)

    ts = now()

    mysql_slave_status = db.query("SHOW SLAVE STATUS")
    if mysql_slave_status:
        slave_status = todict(db, mysql_slave_status[0])
        master_host = slave_status["master_host"]
    else:
        master_host = None

    if master_host and master_host != "None":
        sbm = slave_status.get("seconds_behind_master")
        if isinstance(sbm, (int, long)):  # `long' is fine: this file is py2.
            printmetric("slave.seconds_behind_master", sbm)
        printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"])
        printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"])
        printmetric("slave.thread_io_running",
                    isyes(slave_status["slave_io_running"]))
        printmetric("slave.thread_sql_running",
                    isyes(slave_status["slave_sql_running"]))

    states = {}  # maps a command type to number of connections running it
    for row in db.query("SHOW PROCESSLIST"):
        # Renamed locals so we no longer shadow the `time' module and the
        # `id' builtin.  NOTE: we deliberately tally the Command column
        # (`cmd'), not the State column, even though the metric is called
        # "connection_states" -- same behavior as before.
        id_, user, host, db_, cmd, time_, state = row[:7]
        states[cmd] = states.get(cmd, 0) + 1
    for state, count in states.items():
        state = state.lower().replace(" ", "_")
        printmetric("connection_states", count, " state=%s" % state)
def main(args):
    """Collects and dumps stats from a MySQL server."""
    if not find_sockfiles():  # Nothing to monitor.
        return 13              # Ask tcollector to not respawn us.
    if MySQLdb is None:
        err("error: Python module `MySQLdb' is missing")
        return 1

    last_db_refresh = now()
    dbs = find_databases()
    while True:
        ts = now()
        # Periodically re-scan for databases that appeared since startup.
        if ts - last_db_refresh >= DB_REFRESH_INTERVAL:
            find_databases(dbs)
            last_db_refresh = ts

        errs = []
        for dbname, db in dbs.items():
            try:
                collect(db)
            except (EnvironmentError, EOFError, RuntimeError, socket.error,
                    MySQLdb.MySQLError) as e:
                if isinstance(e, IOError) and e.errno == errno.EPIPE:
                    # Exit on a broken pipe.  There's no point in continuing
                    # because no one will read our stdout anyway.
                    return 2
                err("error: failed to collect data from %s: %s" % (db, e))
                errs.append(dbname)

        for dbname in errs:
            # Bug fix: close the connection before dropping the DB object,
            # otherwise the MySQL connection was simply leaked.
            try:
                dbs[dbname].close()
            except Exception:
                pass  # Best effort: the connection is already broken.
            del dbs[dbname]

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)


if __name__ == "__main__":
    sys.stdin.close()
    sys.exit(main(sys.argv))
# Note: I spent many hours reading the Linux kernel's source code to infer the
# exact meaning of some of the obscure but useful metrics it exposes.  The
# description of the metrics are correct to the best of my knowledge, but it's
# not always easy to make sense of the Linux kernel's code.  Please report any
# inaccuracy you find.  -- tsuna.
"""Socket allocation and network statistics for TSDB.

Metrics from /proc/net/sockstat:
  - net.sockstat.num_sockets: Number of sockets allocated (only TCP).
  - net.sockstat.num_timewait: Number of TCP sockets currently in
    TIME_WAIT state.
  - net.sockstat.sockets_inuse: Number of sockets in use (TCP/UDP/raw).
  - net.sockstat.num_orphans: Number of orphan TCP sockets (not attached
    to any file descriptor).
  - net.sockstat.memory: Memory allocated for this socket type (in bytes).
  - net.sockstat.ipfragqueues: Number of IP flows for which there are
    currently fragments queued for reassembly.

Metrics from /proc/net/netstat (`netstat -s' command):
  - net.stat.tcp.abort: Number of connections that the kernel had to abort.
    type=memory is especially bad, the kernel had to drop a connection due to
    having too many orphaned sockets.  Other types are normal (e.g. timeout).
  - net.stat.tcp.abort.failed: Number of times the kernel failed to abort a
    connection because it didn't even have enough memory to reset it (bad).
  - net.stat.tcp.congestion.recovery: Number of times the kernel detected
    spurious retransmits and was able to recover part or all of the CWND.
  - net.stat.tcp.delayedack: Number of delayed ACKs sent of different types.
  - net.stat.tcp.failed_accept: Number of times a connection had to be dropped
    after the 3WHS.  reason=full_acceptq indicates that the application isn't
    accepting connections fast enough.  You should see SYN cookies too.
  - net.stat.tcp.invalid_sack: Number of invalid SACKs we saw of diff types.
    (requires Linux v2.6.24-rc1 or newer)
  - net.stat.tcp.memory.pressure: Number of times a socket entered the
    "memory pressure" mode (not great).
  - net.stat.tcp.memory.prune: Number of times a socket had to discard
    received data due to low memory conditions (bad).
  - net.stat.tcp.packetloss.recovery: Number of times we recovered from packet
    loss by type of recovery (e.g. fast retransmit vs SACK).
  - net.stat.tcp.receive.queue.full: Number of times a received packet had to
    be dropped because the socket's receive queue was full.
    (requires Linux v2.6.34-rc2 or newer)
  - net.stat.tcp.reording (sic): Number of times we detected re-ordering
    and how.  ("reording" is kept as-is for metric-name compatibility.)
  - net.stat.tcp.syncookies: SYN cookies (both sent & received).
"""

import os
import pwd
import re
import resource
import sys
import time

# If we're running as root and this user exists, we'll drop privileges.
USER = "nobody"


def drop_privileges():
    """Drops privileges if running as root."""
    try:
        ent = pwd.getpwnam(USER)
    except KeyError:
        return  # Target user doesn't exist: nothing we can drop to.

    if os.getuid() != 0:
        return  # Not root: nothing to drop.

    os.setgid(ent.pw_gid)
    os.setuid(ent.pw_uid)


def main():
    """Main loop: parse /proc/net/{sockstat,netstat} and print TSD metrics.

    Returns 13 (tcollector's "don't respawn me" code) if the /proc files
    can't be opened or parsed; otherwise loops forever.
    """
    drop_privileges()
    sys.stdin.close()

    interval = 15
    # sockstat reports TCP/UDP memory in pages; convert to bytes below.
    page_size = resource.getpagesize()

    try:
        sockstat = open("/proc/net/sockstat")
        netstat = open("/proc/net/netstat")
    except IOError as e:
        # BUG FIX: the old message always blamed sockstat even when netstat
        # was the file that failed to open; `e' carries the actual filename.
        sys.stderr.write("Failed to open input file: %s\n" % (e,))
        return 13  # Ask tcollector to not re-start us.

    # Note: up until v2.6.37-rc2 most of the values were 32 bits.
    # The first value is pretty useless since it accounts for some
    # socket types but not others.  So we don't report it because it's
    # more confusing than anything else and it's not well documented
    # what type of sockets are or aren't included in this count.
    # BUG FIX: the named groups had been mangled down to `(?P\d+)' (invalid
    # regex); the names are reconstructed from the m.group(...) calls below.
    regexp = re.compile(r"sockets: used \d+\n"
                        r"TCP: inuse (?P<tcp_inuse>\d+) orphan (?P<orphans>\d+)"
                        r" tw (?P<tw_count>\d+) alloc (?P<tcp_sockets>\d+)"
                        r" mem (?P<tcp_pages>\d+)\n"
                        r"UDP: inuse (?P<udp_inuse>\d+)"
                        # UDP memory accounting was added in v2.6.25-rc1
                        r"(?: mem (?P<udp_pages>\d+))?\n"
                        # UDP-Lite (RFC 3828) was added in v2.6.20-rc2
                        r"(?:UDPLITE: inuse (?P<udplite_inuse>\d+)\n)?"
                        r"RAW: inuse (?P<raw_inuse>\d+)\n"
                        r"FRAG: inuse (?P<ip_frag_nqueues>\d+)"
                        r" memory (?P<ip_frag_mem>\d+)\n")

    def print_sockstat(metric, value, tags=""):  # Note: tags must start with ' '
        if value is not None:
            print("net.sockstat.%s %d %s%s" % (metric, ts, value, tags))

    # If a line in /proc/net/netstat doesn't start with a word in that dict,
    # we'll ignore it.  We use the value to build the metric name.
    known_netstatstypes = {
        "TcpExt:": "tcp",
        "IpExt:": "ip",  # We don't collect anything from here for now.
    }

    # Any stat in /proc/net/netstat that doesn't appear in this dict will be
    # ignored.  If we find a match, we'll use the (metricname, tags).
    known_netstats = {
        # An application wasn't able to accept a connection fast enough, so
        # the kernel couldn't store an entry in the queue for this connection.
        # Instead of dropping it, it sent a cookie to the client.
        "SyncookiesSent": ("syncookies", "type=sent"),
        # After sending a cookie, it came back to us and passed the check.
        "SyncookiesRecv": ("syncookies", "type=received"),
        # After sending a cookie, it came back to us but looked invalid.
        "SyncookiesFailed": ("syncookies", "type=failed"),
        # When a socket is using too much memory (rmem), the kernel will first
        # discard any out-of-order packet that has been queued (with SACK).
        "OfoPruned": ("memory.prune", "type=drop_ofo_queue"),
        # If the kernel is really really desperate and cannot give more memory
        # to this socket even after dropping the ofo queue, it will simply
        # discard the packet it received.  This is Really Bad.
        "RcvPruned": ("memory.prune", "type=drop_received"),
        # We waited for another packet to send an ACK, but didn't see any, so
        # a timer ended up sending a delayed ACK.
        "DelayedACKs": ("delayedack", "type=sent"),
        # We wanted to send a delayed ACK but failed because the socket was
        # locked.  So the timer was reset.
        "DelayedACKLocked": ("delayedack", "type=locked"),
        # We sent a delayed and duplicated ACK because the remote peer
        # retransmitted a packet, thinking that it didn't get to us.
        "DelayedACKLost": ("delayedack", "type=lost"),
        # We completed a 3WHS but couldn't put the socket on the accept queue,
        # so we had to discard the connection.
        "ListenOverflows": ("failed_accept", "reason=full_acceptq"),
        # We couldn't accept a connection because one of: we had no route to
        # the destination, we failed to allocate a socket, we failed to
        # allocate a new local port bind bucket.  Note: this counter
        # also include all the increments made to ListenOverflows...
        "ListenDrops": ("failed_accept", "reason=other"),
        # A packet was lost and we recovered after a fast retransmit.
        "TCPRenoRecovery": ("packetloss.recovery", "type=fast_retransmit"),
        # A packet was lost and we recovered by using selective
        # acknowledgements.
        "TCPSackRecovery": ("packetloss.recovery", "type=sack"),
        # We detected re-ordering using FACK (Forward ACK -- the highest
        # sequence number known to have been received by the peer when using
        # SACK -- FACK is used during congestion control).
        "TCPFACKReorder": ("reording", "detectedby=fack"),
        # We detected re-ordering using SACK.
        "TCPSACKReorder": ("reording", "detectedby=sack"),
        # We detected re-ordering using fast retransmit.
        "TCPRenoReorder": ("reording", "detectedby=fast_retransmit"),
        # We detected re-ordering using the timestamp option.
        "TCPTSReorder": ("reording", "detectedby=timestamp"),
        # We detected some erroneous retransmits and undid our CWND reduction.
        "TCPFullUndo": ("congestion.recovery", "type=full_undo"),
        # We detected some erroneous retransmits, a partial ACK arrived while
        # we were fast retransmitting, so we were able to partially undo some
        # of our CWND reduction.
        "TCPPartialUndo": ("congestion.recovery", "type=hoe_heuristic"),
        # We detected some erroneous retransmits, a D-SACK arrived and ACK'ed
        # all the retransmitted data, so we undid our CWND reduction.
        "TCPDSACKUndo": ("congestion.recovery", "type=sack"),
        # We detected some erroneous retransmits, a partial ACK arrived, so we
        # undid our CWND reduction.
        "TCPLossUndo": ("congestion.recovery", "type=ack"),
        # We received an unexpected SYN so we sent a RST to the peer.
        "TCPAbortOnSyn": ("abort", "type=unexpected_syn"),
        # We were in FIN_WAIT1 yet we received a data packet with a sequence
        # number that's beyond the last one for this connection, so we RST'ed.
        "TCPAbortOnData": ("abort", "type=data_after_fin_wait1"),
        # We received data but the user has closed the socket, so we have no
        # way of handing it to them, so we RST'ed.
        "TCPAbortOnClose": ("abort", "type=data_after_close"),
        # This is Really Bad.  It happens when there are too many orphaned
        # sockets (not attached to a FD) and the kernel has to drop a
        # connection.  Sometimes it will send a reset to the peer, sometimes
        # it won't.
        "TCPAbortOnMemory": ("abort", "type=out_of_memory"),
        # The connection timed out really hard.
        "TCPAbortOnTimeout": ("abort", "type=timeout"),
        # We killed a socket that was closed by the application and lingered
        # around for long enough.
        "TCPAbortOnLinger": ("abort", "type=linger"),
        # We tried to send a reset, probably during one of the TCPAbort*
        # situations above, but we failed e.g. because we couldn't allocate
        # enough memory (very bad).
        "TCPAbortFailed": ("abort.failed", None),
        # Number of times a socket was put in "memory pressure" due to a non
        # fatal memory allocation failure (reduces the send buffer size etc).
        "TCPMemoryPressures": ("memory.pressure", None),
        # We got a completely invalid SACK block and discarded it.
        "TCPSACKDiscard": ("invalid_sack", "type=invalid"),
        # We got a duplicate SACK while retransmitting so we discarded it.
        "TCPDSACKIgnoredOld": ("invalid_sack", "type=retransmit"),
        # We got a duplicate SACK and discarded it.
        "TCPDSACKIgnoredNoUndo": ("invalid_sack", "type=olddup"),
        # We received something but had to drop it because the socket's
        # receive queue was full.
        "TCPBacklogDrop": ("receive.queue.full", None),
    }

    def print_netstat(statstype, metric, value, tags=""):
        if tags:
            space = " "
        else:
            tags = space = ""
        print("net.stat.%s.%s %d %s%s%s" % (statstype, metric, ts, value,
                                            space, tags))

    statsdikt = {}
    while True:
        ts = int(time.time())
        sockstat.seek(0)
        netstat.seek(0)
        data = sockstat.read()
        stats = netstat.read()
        m = re.match(regexp, data)
        if not m:
            sys.stderr.write("Cannot parse sockstat: %r\n" % data)
            return 13

        # The difference between the first two values is the number of
        # sockets allocated vs the number of sockets actually in use.
        print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp")
        print_sockstat("num_timewait", m.group("tw_count"))
        print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp")
        print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp")
        print_sockstat("sockets_inuse", m.group("udplite_inuse"),
                       " type=udplite")
        print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw")

        print_sockstat("num_orphans", m.group("orphans"))
        print_sockstat("memory", int(m.group("tcp_pages")) * page_size,
                       " type=tcp")
        if m.group("udp_pages") is not None:
            print_sockstat("memory", int(m.group("udp_pages")) * page_size,
                           " type=udp")
        print_sockstat("memory", m.group("ip_frag_mem"), " type=ipfrag")
        print_sockstat("ipfragqueues", m.group("ip_frag_nqueues"))

        # /proc/net/netstat has a column-oriented format.  It looks
        # like this:
        #   Header: SomeMetric OtherMetric
        #   Header: 1 2
        #   OtherHeader: ThirdMetric FooBar
        #   OtherHeader: 42 51
        # We first group all the lines for each header together:
        #   {"Header:": [["SomeMetric", "OtherHeader"], ["1", "2"]],
        #    "OtherHeader:": [["ThirdMetric", "FooBar"], ["42", "51"]]}
        # Then we'll create a dict for each type:
        #   {"SomeMetric": "1", "OtherHeader": "2"}
        for line in stats.splitlines():
            line = line.split()
            if line[0] not in known_netstatstypes:
                sys.stderr.write("Unrecognized line in /proc/net/netstat:"
                                 " %r (file=%r)\n" % (line, stats))
                continue
            statstype = line.pop(0)
            statsdikt.setdefault(known_netstatstypes[statstype],
                                 []).append(line)
        for statstype, stats in statsdikt.items():
            # stats is now:
            # [["SyncookiesSent", "SyncookiesRecv", ...], ["1", "2", ....]]
            assert len(stats) == 2, repr(statsdikt)
            stats = dict(zip(*stats))
            value = stats.get("ListenDrops")
            if value is not None:  # Undo the kernel's double counting
                stats["ListenDrops"] = (int(value)
                                        - int(stats.get("ListenOverflows", 0)))
            for stat, (metric, tags) in known_netstats.items():
                value = stats.get(stat)
                if value is not None:
                    print_netstat(statstype, metric, value, tags)
            stats.clear()
        statsdikt.clear()

        sys.stdout.flush()
        time.sleep(interval)


if __name__ == "__main__":
    sys.exit(main())

# ---------------------------------------------------------------------------
# /collectors/0/procnettcp.py:
#!/usr/bin/python
# This file is part of tcollector.
# Copyright (C) 2011 StumbleUpon, Inc.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.  This program is distributed in the hope that it
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
# General Public License for more details.  You should have received a copy
# of the GNU Lesser General Public License along with this program.  If not,
# see <http://www.gnu.org/licenses/>.
26 | 27 | # Metric: proc.net.tcp 28 | 29 | # For each run, we classify each connection and generate subtotals. 30 | # TSD will automatically total these up when displaying 31 | # the graph, but you can drill down for each possible total or a 32 | # particular one. This does generate a large amount of datapoints, 33 | # as the number of points is (S*(U+1)*V) (currently ~400), where 34 | # S=number of TCP states, U=Number of users to track, and 35 | # V=number of services (collections of ports) 36 | # The deduper does dedup this down very well, as only 3 of the 10 37 | # TCP states are generally ever seen, and most servers only run one 38 | # service under one user. On a typical server this dedups down to 39 | # under 10 values per interval. 40 | 41 | # Each connection is broken down with a tag for user=username (see 42 | # "users" list below) or put under "other" if not in the list. 43 | # Expand this for any users you care about. 44 | # It is also broken down for each state (state=). 45 | # It is also broken down into services (collections of ports) 46 | 47 | # Note that once a connection is closed, Linux seems to forget who 48 | # opened/handled the connection. For connections in time_wait, for 49 | # example, they will always show user=root. 

import os
import sys
import time
import socket
import pwd


USERS = ("root", "www-data", "mysql")

# Note if a service runs on multiple ports and you
# want to collectively map them up to a single service,
# just give them the same name below

PORTS = {
    80: "http",
    443: "https",
    3306: "mysql",
    3564: "mysql",
    9000: "namenode",
    9090: "thriftserver",
    50020: "datanode",
    60020: "hregionserver",
    }
# Contiguous port ranges are filled in programmatically instead of one
# hand-written entry per port (same resulting dict as before).
PORTS.update((port, "http-varnish") for port in range(3001, 3015))
PORTS.update((port, "memcache") for port in range(11211, 11227))

# .values() (not the Python-2-only .itervalues()) yields the same set and
# keeps this module importable under both Python 2 and 3.
SERVICES = tuple(set(PORTS.values()))

TCPSTATES = {
    "01": "established",
    "02": "syn_sent",
    "03": "syn_recv",
    "04": "fin_wait1",
    "05": "fin_wait2",
    "06": "time_wait",
    "07": "close",
    "08": "close_wait",
    "09": "last_ack",
    "0A": "listen",
    "0B": "closing",
    }

# If we're running as root and this user exists, we'll drop privileges.
USER = "nobody"


def drop_privileges():
    """Drops privileges if running as root."""
    try:
        ent = pwd.getpwnam(USER)
    except KeyError:
        return  # Target user doesn't exist.

    if os.getuid() != 0:
        return  # Not root: nothing to drop.

    os.setgid(ent.pw_gid)
    os.setuid(ent.pw_uid)


def is_public_ip(ipstr):
    """
    Take a /proc/net/tcp encoded src or dest string
    Return True if it is coming from public IP space
    (i.e. is not RFC1918, loopback, or broadcast).
    This string is the hex ip:port of the connection.
    (ip is reversed)
    """
    addr = ipstr.split(":")[0]
    addr = int(addr, 16)
    byte1 = addr & 0xFF          # first octet (address is stored reversed)
    byte2 = (addr >> 8) & 0xFF   # second octet
    if byte1 in (10, 0, 127):
        return False
    # BUG FIX: RFC 1918 defines 172.16.0.0/12, i.e. second octet 16..31
    # inclusive.  The old test (`byte2 > 16') wrongly treated 172.16.x.x as
    # public and 172.32.x.x-172.255.x.x as private.
    if byte1 == 172 and 16 <= byte2 <= 31:
        return False
    if byte1 == 192 and byte2 == 168:
        return False
    return True


def main(unused_args):
    """procnettcp main loop: classify every TCP socket and emit subtotals."""
    drop_privileges()
    try:  # On some Linux kernel versions, with lots of connections
        os.nice(19)  # this collector can be very CPU intensive.  So be nicer.
    except OSError as e:
        sys.stderr.write("warning: failed to self-renice: %s\n" % e)

    interval = 60

    # resolve the list of users to match on into UIDs
    uids = {}
    for user in USERS:
        try:
            uids[str(pwd.getpwnam(user)[2])] = user
        except KeyError:
            continue

    try:
        tcp = open("/proc/net/tcp")
        # if IPv6 is enabled, even IPv4 connections will also
        # appear in tcp6.  It has the same format, apart from the
        # address size
        try:
            tcp6 = open("/proc/net/tcp6")
        except IOError as e:
            if e.errno == 2:  # No such file => IPv6 is disabled.
                tcp6 = None
            else:
                raise
    except IOError as e:
        sys.stderr.write("Failed to open input file: %s\n" % (e,))
        return 13  # Ask tcollector to not re-start us immediately.

    while True:
        counter = {}

        for procfile in (tcp, tcp6):
            if procfile is None:
                continue
            procfile.seek(0)
            ts = int(time.time())
            for line in procfile:
                try:
                    # pylint: disable=W0612
                    (num, src, dst, state, queue, when, retrans,
                     uid, timeout, inode) = line.split(None, 9)
                except ValueError:  # Malformed line
                    continue

                if num == "sl":  # header
                    continue

                srcport = int(src.split(":")[1], 16)
                dstport = int(dst.split(":")[1], 16)
                service = PORTS.get(srcport, "other")
                service = PORTS.get(dstport, service)

                if is_public_ip(dst) or is_public_ip(src):
                    endpoint = "external"
                else:
                    endpoint = "internal"

                user = uids.get(uid, "other")

                key = ("state=" + TCPSTATES[state] + " endpoint=" + endpoint
                       + " service=" + service + " user=" + user)
                counter[key] = counter.get(key, 0) + 1

        # output the counters, including zeroes, so that counters that had
        # data once always decay back to zero (see module comments).
        for state in TCPSTATES:
            for service in SERVICES + ("other",):
                for user in USERS + ("other",):
                    for endpoint in ("internal", "external"):
                        key = ("state=%s endpoint=%s service=%s user=%s"
                               % (TCPSTATES[state], endpoint, service, user))
                        sys.stdout.write("proc.net.tcp %s %s %s\n"
                                         % (ts, counter.get(key, "0"), key))

        sys.stdout.flush()
        time.sleep(interval)


if __name__ == "__main__":
    sys.exit(main(sys.argv))

# ---------------------------------------------------------------------------
# /collectors/0/procstats.py:
#!/usr/bin/python
# This file is part of tcollector.
# Copyright (C) 2010 StumbleUpon, Inc.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.  This program is distributed in the hope that it
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
# General Public License for more details.  You should have received a copy
# of the GNU Lesser General Public License along with this program.  If not,
# see <http://www.gnu.org/licenses/>.
#
"""import various /proc stats from /proc into TSDB"""

import os
import sys
import time
import socket
import re

COLLECTION_INTERVAL = 15  # seconds
NUMADIR = "/sys/devices/system/node"


def open_sysfs_numa_stats():
    """Returns a possibly empty list of opened per-NUMA-node numastat files."""
    try:
        nodes = os.listdir(NUMADIR)
    except OSError as e:
        if e.errno == 2:  # No such file or directory
            return []     # We don't have NUMA stats.
        raise

    nodes = [node for node in nodes if node.startswith("node")]
    numastats = []
    for node in nodes:
        try:
            numastats.append(open(os.path.join(NUMADIR, node, "numastat")))
        # BUG FIX: open() raises IOError (not OSError) on Python 2, so the
        # original `except OSError' never caught a missing numastat file.
        except IOError as e:
            if e.errno == 2:  # No such file or directory
                continue
            raise
    return numastats


def print_numa_stats(numafiles):
    """From a list of opened files, extracts and prints NUMA stats."""
    for numafile in numafiles:
        numafile.seek(0)
        # File path looks like .../node/node<N>/numastat; extract <N>.
        node_id = int(numafile.name[numafile.name.find("/node/node") + 10:-9])
        ts = int(time.time())
        stats = dict(line.split() for line in numafile.read().splitlines())
        for stat, tag in (# hit: process wanted memory from this node and got it
                          ("numa_hit", "hit"),
                          # miss: process wanted another node and got it from
                          # this one instead.
                          ("numa_miss", "miss")):
            print("sys.numa.zoneallocs %d %s node=%d type=%s"
                  % (ts, stats[stat], node_id, tag))
        # Count this one as a separate metric because we can't sum up hit +
        # miss + foreign, this would result in double-counting of all misses.
        # See `zone_statistics' in the code of the kernel.
        # foreign: process wanted memory from this node but got it from
        # another node.  So maybe this node is out of free pages.
        print("sys.numa.foreign_allocs %d %s node=%d"
              % (ts, stats["numa_foreign"], node_id))
        # When is memory allocated to a node that's local or remote to where
        # the process is running.
        for stat, tag in (("local_node", "local"),
                          ("other_node", "remote")):
            print("sys.numa.allocation %d %s node=%d type=%s"
                  % (ts, stats[stat], node_id, tag))
        # Pages successfully allocated with the interleave policy.
        print("sys.numa.interleave %d %s node=%d type=hit"
              % (ts, stats["interleave_hit"], node_id))


def main():
    """procstats main loop: dump selected /proc stats every 15 seconds."""

    f_uptime = open("/proc/uptime", "r")
    f_meminfo = open("/proc/meminfo", "r")
    f_vmstat = open("/proc/vmstat", "r")
    f_stat = open("/proc/stat", "r")
    f_loadavg = open("/proc/loadavg", "r")
    f_entropy_avail = open("/proc/sys/kernel/random/entropy_avail", "r")
    numastats = open_sysfs_numa_stats()

    while True:
        # proc.uptime
        f_uptime.seek(0)
        ts = int(time.time())
        for line in f_uptime:
            m = re.match(r"(\S+)\s+(\S+)", line)
            if m:
                print("proc.uptime.total %d %s" % (ts, m.group(1)))
                print("proc.uptime.now %d %s" % (ts, m.group(2)))

        # proc.meminfo
        f_meminfo.seek(0)
        ts = int(time.time())
        for line in f_meminfo:
            m = re.match(r"(\w+):\s+(\d+)", line)
            if m:
                print("proc.meminfo.%s %d %s"
                      % (m.group(1).lower(), ts, m.group(2)))

        # proc.vmstat
        f_vmstat.seek(0)
        ts = int(time.time())
        for line in f_vmstat:
            m = re.match(r"(\w+)\s+(\d+)", line)
            if not m:
                continue
            if m.group(1) in ("pgpgin", "pgpgout", "pswpin",
                              "pswpout", "pgfault", "pgmajfault"):
                print("proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2)))

        # proc.stat
        f_stat.seek(0)
        ts = int(time.time())
        for line in f_stat:
            m = re.match(r"(\w+)\s+(.*)", line)
            if not m:
                continue
            if m.group(1) == "cpu":
                fields = m.group(2).split()
                print("proc.stat.cpu %d %s type=user" % (ts, fields[0]))
                print("proc.stat.cpu %d %s type=nice" % (ts, fields[1]))
                print("proc.stat.cpu %d %s type=system" % (ts, fields[2]))
                print("proc.stat.cpu %d %s type=idle" % (ts, fields[3]))
                print("proc.stat.cpu %d %s type=iowait" % (ts, fields[4]))
                print("proc.stat.cpu %d %s type=irq" % (ts, fields[5]))
                print("proc.stat.cpu %d %s type=softirq" % (ts, fields[6]))
                # really old kernels don't have this field
                if len(fields) > 7:
                    print("proc.stat.cpu %d %s type=guest"
                          % (ts, fields[7]))
                    # old kernels don't have this field
                    if len(fields) > 8:
                        print("proc.stat.cpu %d %s type=guest_nice"
                              % (ts, fields[8]))
            elif m.group(1) == "intr":
                print("proc.stat.intr %d %s"
                      % (ts, m.group(2).split()[0]))
            elif m.group(1) == "ctxt":
                print("proc.stat.ctxt %d %s" % (ts, m.group(2)))
            elif m.group(1) == "processes":
                print("proc.stat.processes %d %s" % (ts, m.group(2)))
            elif m.group(1) == "procs_blocked":
                print("proc.stat.procs_blocked %d %s" % (ts, m.group(2)))

        f_loadavg.seek(0)
        ts = int(time.time())
        for line in f_loadavg:
            m = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\d+)/(\d+)\s+", line)
            if not m:
                continue
            print("proc.loadavg.1min %d %s" % (ts, m.group(1)))
            print("proc.loadavg.5min %d %s" % (ts, m.group(2)))
            print("proc.loadavg.15min %d %s" % (ts, m.group(3)))
            print("proc.loadavg.runnable %d %s" % (ts, m.group(4)))
            print("proc.loadavg.total_threads %d %s" % (ts, m.group(5)))

        f_entropy_avail.seek(0)
        ts = int(time.time())
        for line in f_entropy_avail:
            print("proc.kernel.entropy_avail %d %s" % (ts, line.strip()))

        print_numa_stats(numastats)

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# /collectors/0/redis-stats.py:
#!/usr/bin/python
#
# Copyright 2011 by Bump Technologies, Inc.
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | # 15 | # Written by Mark Smith . 16 | # 17 | 18 | """Statistics from a Redis instance. 19 | 20 | Note: this collector parses your Redis configuration files to determine what cluster 21 | this instance is part of. If you want the cluster tag to be accurate, please edit 22 | your Redis configuration file and add a comment like this somewhere in the file: 23 | 24 | # tcollector.cluster = main 25 | 26 | You can name the cluster anything that matches the regex [a-z0-9-_]+. 
27 | 28 | This collector outputs the following metrics: 29 | 30 | - redis.bgrewriteaof_in_progress 31 | - redis.bgsave_in_progress 32 | - redis.blocked_clients 33 | - redis.changes_since_last_save 34 | - redis.client_biggest_input_buf 35 | - redis.client_longest_output_list 36 | - redis.connected_clients 37 | - redis.connected_slaves 38 | - redis.expired_keys 39 | - redis.evicted_keys 40 | - redis.hash_max_zipmap_entries 41 | - redis.hash_max_zipmap_value 42 | - redis.keyspace_hits 43 | - redis.keyspace_misses 44 | - redis.mem_fragmentation_ratio 45 | - redis.pubsub_channels 46 | - redis.pubsub_patterns 47 | - redis.total_commands_processed 48 | - redis.total_connections_received 49 | - redis.uptime_in_seconds 50 | - redis.used_cpu_sys 51 | - redis.used_cpu_user 52 | - redis.used_memory 53 | - redis.used_memory_rss 54 | 55 | For more information on these values, see this (not very useful) documentation: 56 | 57 | http://redis.io/commands/info 58 | """ 59 | 60 | import os 61 | import pwd 62 | import re 63 | import subprocess 64 | import sys 65 | import time 66 | 67 | try: 68 | import redis 69 | has_redis = True 70 | except ImportError: 71 | has_redis = False 72 | 73 | # If we are root, drop privileges to this user, if necessary. NOTE: if this is 74 | # not root, this MUST be the user that you run redis-server under. If not, we 75 | # will not be able to find your Redis instances. 76 | USER = "root" 77 | 78 | # Every SCAN_INTERVAL seconds, we look for new redis instances. Prevents the 79 | # situation where you put up a new instance and we never notice. 
80 | SCAN_INTERVAL = 300 81 | 82 | # these are the things in the info struct that we care about 83 | KEYS = [ 84 | 'pubsub_channels', 'bgrewriteaof_in_progress', 'connected_slaves', 'connected_clients', 'keyspace_misses', 85 | 'used_memory', 'total_commands_processed', 'used_memory_rss', 'total_connections_received', 'pubsub_patterns', 86 | 'used_cpu_sys', 'blocked_clients', 'used_cpu_user', 'expired_keys', 'bgsave_in_progress', 'hash_max_zipmap_entries', 87 | 'hash_max_zipmap_value', 'client_longest_output_list', 'client_biggest_input_buf', 'uptime_in_seconds', 88 | 'changes_since_last_save', 'mem_fragmentation_ratio', 'keyspace_hits', 'evicted_keys' 89 | ]; 90 | 91 | def drop_privileges(): 92 | """Drops privileges if running as root.""" 93 | 94 | if USER == 'root': 95 | return 96 | 97 | try: 98 | ent = pwd.getpwnam(USER) 99 | except KeyError: 100 | return 101 | 102 | if os.getuid() != 0: 103 | return 104 | os.setgid(ent.pw_gid) 105 | os.setuid(ent.pw_uid) 106 | 107 | 108 | def main(): 109 | """Main loop""" 110 | 111 | drop_privileges() 112 | sys.stdin.close() 113 | 114 | interval = 15 115 | 116 | # we scan for instances here to see if there are any redis servers 117 | # running on this machine... 
def main():
    """Main loop: discover local Redis instances and emit their stats forever."""
    drop_privileges()
    sys.stdin.close()

    interval = 15

    # Initial scan; bail out early when there's nothing to monitor.
    last_scan = time.time()
    instances = scan_for_instances()  # maps port -> cluster name
    if not len(instances):
        return 13  # tells tcollector not to restart us
    if not has_redis:
        sys.stderr.write("Found %d instance(s) to monitor, but the Python"
                         " Redis module isn't installed.\n" % len(instances))
        return 1

    def print_stat(metric, value, tags=""):
        # Skip metrics the server didn't report.
        if value is not None:
            print("redis.%s %d %s %s" % (metric, ts, value, tags))

    while True:
        ts = int(time.time())

        # If we haven't looked for redis instances recently, do so now, so
        # a newly started instance eventually gets picked up.
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_for_instances()
            last_scan = ts

        # Now iterate over every instance and gather statistics.
        for port in instances:
            tags = "cluster=%s port=%d" % (instances[port], port)

            # Connect to the instance and attempt to gather info.
            r = redis.Redis(host="127.0.0.1", port=port)
            info = r.info()
            for key in KEYS:
                if key in info:
                    print_stat(key, info[key], tags)

            # Get some instant latency information via PING.
            # TODO: might be nice to get 95th, 99th, etc here?
            start_time = time.time()
            r.ping()
            print_stat("latency", time.time() - start_time, tags)

        sys.stdout.flush()
        time.sleep(interval)


def scan_for_instances():
    """Use netstat to find local redis-server listeners.

    Returns a dict mapping listening port to cluster name.  The name comes
    from a "# tcollector.cluster = NAME" comment in the instance's config
    file when present, else defaults to "port-PORT".
    """
    out = {}
    tcre = re.compile(r"^\s*#\s*tcollector.(\w+)\s*=\s*(.+)$")

    ns_proc = subprocess.Popen(["netstat", "-tnlp"],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _ = ns_proc.communicate()
    if ns_proc.returncode != 0:
        sys.stderr.write("failed to find instances %r\n" % ns_proc.returncode)
        return {}

    for line in stdout.split("\n"):
        if not (line and 'redis-server' in line):
            continue
        pid = int(line.split()[6].split("/")[0])
        port = int(line.split()[3].split(":")[1])

        # Now we have to get the command line.  We look in the redis config
        # file for a special comment that tells us what cluster this is.
        # Else we default to using the port number, which should work.
        cluster = "port-%d" % port
        try:
            f = open("/proc/%d/cmdline" % pid)
            try:
                cfg = f.readline().split("\0")[-2]
            finally:
                f.close()  # BUGFIX: don't leak the fd when parsing fails

            f = open(cfg)
            try:
                for cfgline in f:
                    result = tcre.match(cfgline)
                    if result and result.group(1).lower() == "cluster":
                        cluster = result.group(2).lower()
            finally:
                f.close()  # BUGFIX: the config file was never closed
        except EnvironmentError:
            # Use the default cluster name if anything above failed.
            pass

        out[port] = cluster
    return out


if __name__ == "__main__":
    sys.exit(main())
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | # 15 | # Written by Mark Smith . 16 | # 17 | 18 | """A collector to gather statistics from a Riak node. 19 | 20 | The following all have tags of 'type' which can be 'get' or 'put'. Latency 21 | is measured in fractional seconds. All latency values are calculated over the 22 | last 60 seconds and are moving values. 23 | 24 | - riak.vnode.requests 25 | - riak.node.requests 26 | - riak.node.latency.mean 27 | - riak.node.latency.median 28 | - riak.node.latency.95th 29 | - riak.node.latency.99th 30 | - riak.node.latency.100th 31 | 32 | These metrics have no tags and are global: 33 | 34 | - riak.memory.total 35 | - riak.memory.allocated 36 | - riak.executing_mappers 37 | - riak.sys_process_count 38 | - riak.read_repairs 39 | - riak.connections 40 | - riak.connected_nodes 41 | """ 42 | 43 | import json 44 | import os 45 | import pwd 46 | import sys 47 | import time 48 | import urllib2 49 | 50 | # If we're running as root and this user exists, we'll drop privileges. Set this 51 | # to 'root' if you don't want to drop privileges. 
# If we're running as root and this user exists, we'll drop privileges.
# Set this to 'root' if you don't want to drop privileges.
USER = "nobody"

# Maps a Riak stat name to (metric suffix, tag string).  connected_nodes is
# intentionally absent: it is a list and gets counted separately in main().
MAP = {
    'vnode_gets_total': ('vnode.requests', 'type=get'),
    'vnode_puts_total': ('vnode.requests', 'type=put'),
    'node_gets_total': ('node.requests', 'type=get'),
    'node_puts_total': ('node.requests', 'type=put'),
    'node_get_fsm_time_mean': ('node.latency.mean', 'type=get'),
    'node_get_fsm_time_median': ('node.latency.median', 'type=get'),
    'node_get_fsm_time_95': ('node.latency.95th', 'type=get'),
    'node_get_fsm_time_99': ('node.latency.99th', 'type=get'),
    'node_get_fsm_time_100': ('node.latency.100th', 'type=get'),
    'node_put_fsm_time_mean': ('node.latency.mean', 'type=put'),
    'node_put_fsm_time_median': ('node.latency.median', 'type=put'),
    'node_put_fsm_time_95': ('node.latency.95th', 'type=put'),
    'node_put_fsm_time_99': ('node.latency.99th', 'type=put'),
    'node_put_fsm_time_100': ('node.latency.100th', 'type=put'),
    'pbc_connects_total': ('connections', ''),
    'read_repairs_total': ('read_repairs', ''),
    'sys_process_count': ('sys_process_count', ''),
    'executing_mappers': ('executing_mappers', ''),
    'mem_allocated': ('memory.allocated', ''),
    'mem_total': ('memory.total', ''),
}

def drop_privileges():
    """Switch to the unprivileged USER account when running as root."""
    if USER == 'root':
        # Configured to keep running as root.
        return

    try:
        entry = pwd.getpwnam(USER)
    except KeyError:
        # No such account on this box; carry on unchanged.
        return

    if os.getuid() != 0:
        # We aren't root, so there is nothing to drop.
        return
    # Group first; after setuid() we'd no longer be allowed to change it.
    os.setgid(entry.pw_gid)
    os.setuid(entry.pw_uid)
def main():
    """Main loop: poll the local Riak HTTP stats endpoint forever."""
    # Don't run at all on machines that aren't Riak nodes.
    if not os.path.exists("/usr/lib/riak"):
        sys.exit(13)  # tells tcollector not to restart us

    drop_privileges()
    sys.stdin.close()

    interval = 15

    def print_stat(metric, value, tags=""):
        # Skip stats the server didn't report.
        if value is not None:
            print("riak.%s %d %s %s" % (metric, ts, value, tags))

    while True:
        ts = int(time.time())

        # urlopen() raises on failure rather than returning None, so the
        # old "if req is not None" guard was dead code; instead make sure
        # the response is always closed, even if json parsing blows up.
        req = urllib2.urlopen("http://localhost:8098/stats")
        try:
            obj = json.loads(req.read())
        finally:
            req.close()  # BUGFIX: connection leaked when parsing failed

        for key in obj:
            if key not in MAP:
                continue
            # This is a hack, but Riak reports latencies in microseconds.
            # They're fairly useless to our human operators, so we're going
            # to convert them to seconds.
            if 'latency' in MAP[key][0]:
                obj[key] = obj[key] / 1000000.0
            print_stat(MAP[key][0], obj[key], MAP[key][1])
        # connected_nodes is a list; we report its length.
        if 'connected_nodes' in obj:
            print_stat('connected_nodes', len(obj['connected_nodes']), '')

        sys.stdout.flush()
        time.sleep(interval)


if __name__ == "__main__":
    sys.exit(main())
import errno
import sys
import time
import subprocess
import re
import signal
import os


'''
ZFS I/O and disk space statistics for TSDB

This plugin tracks, for all pools:

- I/O
    zfs.io.pool.{read_issued, write_issued}
    zfs.io.pool.{read_sectors, write_sectors}
    zfs.io.device.{read_issued, write_issued}
    zfs.io.device.{read_sectors, write_sectors}
- disk space
    zfs.df.pool.1kblocks.{total, used, available}
    zfs.df.device.1kblocks.{total, used, available}

Sectors are always 512 bytes.  Disk space usage is given in 1K blocks.
Values delivered to standard output are already normalized to be per second.
'''

def convert_to_bytes(string):
    """Take a string in the form 1234K and convert it to bytes.

    zpool prints "-" when it has no datum; that converts to 0.
    """
    factors = {
        "K": 1024,
        "M": 1024 ** 2,
        "G": 1024 ** 3,
        "T": 1024 ** 4,
        "P": 1024 ** 5,
    }
    if string == "-":
        return 0
    for suffix, factor in factors.items():
        if string.endswith(suffix):
            # int() rather than long(): Python 2 auto-promotes ints to
            # longs anyway, and long() doesn't exist on Python 3.
            return int(float(string[:-1]) * factor)
    return int(string)

def extract_info(line):
    """Parse one data row of `zpool iostat -v` output.

    Returns (name, s_df, s_io) where s_df holds disk-space stats in 1K
    blocks and s_io holds operation counts plus 512-byte sector counts.
    """
    (poolname,
     alloc, free,
     read_issued, write_issued,
     read_sectors, write_sectors) = line.split()

    s_df = {}
    # 1k blocks
    s_df["used"] = convert_to_bytes(alloc) / 1024
    s_df["available"] = convert_to_bytes(free) / 1024
    s_df["total"] = s_df["used"] + s_df["available"]

    s_io = {}
    # magnitudeless variable (already a per-interval rate from zpool)
    s_io["read_issued"] = read_issued
    s_io["write_issued"] = write_issued
    # 512 byte sectors
    s_io["read_sectors"] = convert_to_bytes(read_sectors) / 512
    s_io["write_sectors"] = convert_to_bytes(write_sectors) / 512

    return poolname, s_df, s_io

# States of the `zpool iostat -v` output parser.
T_START = 1
T_HEADERS = 2
T_SEPARATOR = 3
T_POOL = 4
T_DEVICE = 5
T_EMPTY = 6
T_LEG = 7

# Set asynchronously by handlesignal() when we're asked to terminate.
signal_received = None
def handlesignal(signum, stack):
    """Signal handler: remember which signal arrived so main() can exit."""
    global signal_received
    signal_received = signum

def main():
    """zfsiostats main loop"""
    global signal_received
    interval = 15
    # shouldn't the interval be determined by the daemon itself, and commu-
    # nicated to the collector somehow (signals seem like a reasonable protocol
    # whereas command-line parameters also sound reasonable)?

    signal.signal(signal.SIGTERM, handlesignal)
    signal.signal(signal.SIGINT, handlesignal)

    try:
        p_zpool = subprocess.Popen(
            ["zpool", "iostat", "-v", str(interval)],
            stdout=subprocess.PIPE,
        )
    except OSError as e:
        if e.errno == errno.ENOENT:
            # it makes no sense to run this collector here
            sys.exit(13)  # we signal tcollector to not run us
        raise

    firstloop = True
    last_leg = 0  # BUGFIX: was misspelled "lastleg", leaving a dead variable
    ltype = None
    timestamp = int(time.time())
    capacity_stats_pool = {}
    capacity_stats_device = {}
    io_stats_pool = {}
    io_stats_device = {}
    start_re = re.compile(".*capacity.*operations.*bandwidth")
    headers_re = re.compile(".*pool.*alloc.*free.*read.*write.*read.*write")
    separator_re = re.compile(".*-----.*-----.*-----")
    while signal_received is None:
        try:
            line = p_zpool.stdout.readline()
        except (IOError, OSError) as e:
            if e.errno in (errno.EINTR, errno.EAGAIN):
                break
            raise

        if not line:
            # end of the program, die
            break

        # Classify the line, asserting that the state transition is legal
        # for the `zpool iostat -v` output format.
        # NOTE(review): the exact indentation prefixes for mirror legs and
        # devices below may have been mangled upstream -- confirm against
        # real `zpool iostat -v` output.
        if start_re.match(line):
            assert ltype in (None, T_EMPTY), \
                "expecting last state T_EMPTY or None, now got %s" % ltype
            ltype = T_START
        elif headers_re.match(line):
            assert ltype == T_START, \
                "expecting last state T_START, now got %s" % ltype
            ltype = T_HEADERS
        elif separator_re.match(line):
            assert ltype in (T_DEVICE, T_HEADERS), \
                "expecting last state T_DEVICE or T_HEADERS, now got %s" % ltype
            ltype = T_SEPARATOR
        elif len(line) < 2:
            assert ltype == T_SEPARATOR, \
                "expecting last state T_SEPARATOR, now got %s" % ltype
            ltype = T_EMPTY
        elif line.startswith(" mirror"):
            assert ltype in (T_POOL, T_DEVICE), \
                "expecting last state T_POOL or T_DEVICE, now got %s" % ltype
            ltype = T_LEG
        elif line.startswith(" "):
            assert ltype in (T_POOL, T_DEVICE, T_LEG), \
                "expecting last state T_POOL or T_DEVICE or T_LEG, now got %s" % ltype
            ltype = T_DEVICE
        else:
            # must be a pool name
            assert ltype == T_SEPARATOR, \
                "expecting last state T_SEPARATOR, now got %s" % ltype
            ltype = T_POOL

        if ltype == T_START:
            # New sample: reset all accumulated stats.
            for x in (
                capacity_stats_pool, capacity_stats_device,
                io_stats_pool, io_stats_device,
            ):
                x.clear()
            timestamp = int(time.time())

        elif ltype == T_POOL:
            line = line.strip()
            poolname, s_df, s_io = extract_info(line)
            capacity_stats_pool[poolname] = s_df
            io_stats_pool[poolname] = s_io
            # marker for leg
            last_leg = 0

        elif ltype == T_LEG:
            last_leg = last_leg + 1
            line = line.strip()
            devicename, s_df, s_io = extract_info(line)
            capacity_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_df
            io_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_io

        elif ltype == T_DEVICE:
            line = line.strip()
            devicename, s_df, s_io = extract_info(line)
            capacity_stats_device["%s %s" % (poolname, devicename)] = s_df
            io_stats_device["%s %s" % (poolname, devicename)] = s_io

        elif ltype == T_EMPTY:
            if firstloop:
                # The first sample is a since-boot summary similar to
                # iostat's, and is useless to us, so don't print it.
                # If this was the first loop, we're onto the second loop
                # now, so we turn the flag off.
                firstloop = False
            else:
                for poolname, stats in capacity_stats_pool.items():
                    fm = "zfs.df.pool.1kblocks.%s %d %s poolname=%s"
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber, poolname))
                for poolname, stats in io_stats_pool.items():
                    fm = "zfs.io.pool.%s %d %s poolname=%s"
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber, poolname))
                for devicename, stats in capacity_stats_device.items():
                    fm = "zfs.df.device.1kblocks.%s %d %s devicename=%s poolname=%s"
                    poolname, devicename = devicename.split(" ", 1)
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber,
                                    devicename, poolname))
                for devicename, stats in io_stats_device.items():
                    fm = "zfs.io.device.%s %d %s devicename=%s poolname=%s"
                    poolname, devicename = devicename.split(" ", 1)
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber,
                                    devicename, poolname))
                sys.stdout.flush()

    # Tear down the zpool child: forward the signal we got (or SIGTERM on
    # EOF) and wait for it so we don't leave a zombie behind.
    if signal_received is None:
        signal_received = signal.SIGTERM
    try:
        os.kill(p_zpool.pid, signal_received)
    except Exception:
        pass
    p_zpool.wait()

if __name__ == "__main__":
    main()
import errno
import re
import sys
import time

'''
ZFS kernel memory statistics for TSDB

This plugin tracks kernel memory for both:

- the SPL and its allocated slabs backing ZFS memory
    zfs.mem.slab
- the ARC and its various values
    zfs.mem.arc
'''

# /proc/spl/kmem/slab has several fields; we only care about the sizes
# and the allocation sizes for the slabs.
# /proc/spl/kstat/zfs/arcstats is a table; we only care about the data column.

def main():
    """zfskernstats main loop: emit SPL slab and ARC stats every 15s."""
    interval = 15
    # Slab names look like "name_512"; strip the trailing size so the
    # "type" tag groups all sizes of the same cache together.
    typere = re.compile("(^.*)_[0-9]+$")

    try:
        f_slab = open("/proc/spl/kmem/slab", "r")
        f_arcstats = open("/proc/spl/kstat/zfs/arcstats", "r")
    except IOError as e:
        if e.errno == errno.ENOENT:
            # No SPL on this box: it makes no sense to run this collector.
            sys.exit(13)  # we signal tcollector to not run us
        raise

    while True:
        # Re-read the same /proc files each iteration by seeking back.
        f_slab.seek(0)
        f_arcstats.seek(0)
        ts = int(time.time())

        for n, line in enumerate(f_slab):
            if n < 2:
                continue  # skip the two header lines
            line = line.split()
            name, _, size, alloc, _, objsize = line[0:6]
            size, alloc, objsize = int(size), int(alloc), int(objsize)
            typ = typere.match(name)
            if typ:
                typ = typ.group(1)
            else:
                typ = name
            print("zfs.mem.slab.size %d %d type=%s objsize=%d" %
                  (ts, size, typ, objsize))
            print("zfs.mem.slab.alloc %d %d type=%s objsize=%d" %
                  (ts, alloc, typ, objsize))

        for n, line in enumerate(f_arcstats):
            if n < 2:
                continue  # skip the two header lines
            line = line.split()
            name, _, data = line
            data = int(data)
            print("zfs.mem.arc.%s %d %d" % (name, ts, data))

        sys.stdout.flush()
        time.sleep(interval)

if __name__ == "__main__":
    main()
# Python files in this directory that don't have an "onload" function
# will be imported by tcollector too, but no function will be called.
# When this file executes, you can assume that its directory is in
# sys.path, so you can import other Python modules from this directory
# or its subdirectories.

def onload(options, tags):
    """Hook invoked by tcollector when it starts up.

    Args:
        options: The options as returned by the OptionParser.
        tags: A dictionary that maps tag names to tag values.
    """
    # Default hook: no custom startup behavior.
    return None


def get_user_password(sockfile):
    """Given the path of a socket file, returns a tuple (user, password)."""
    user, password = "root", ""
    return (user, password)
# Sanity checks.
test -d "$TCOLLECTOR_PATH" || {
    echo >&2 "No such directory: $TCOLLECTOR_PATH"
    echo >&2 "You might need to set the TCOLLECTOR_PATH variable in $0"
    exit 2
}

test -f "$PROG" || {
    echo >&2 "No such file: $PROG"
    echo >&2 "You might need to set the TCOLLECTOR_PATH variable in $0"
    exit 3
}

for i in "$PIDFILE" "$LOG"; do
    # If the file doesn't exist, check that we have write access to its parent
    # directory to be able to create it.
    test -e "$i" || i=`dirname "$i"`
    test -w "$i" || {
        echo >&2 "$0: error: Cannot write to $i"
        exit 4
    }
done

# Find the first usable Python interpreter.
which_python () {
    for python in /usr/bin/python2.6 /usr/bin/python2.5 /usr/bin/python; do
        test -x "$python" && echo "$python" && return
    done
    echo >&2 'Could not find a Python interpreter'
    exit 1
}

PYTHON=$(which_python)

start () {
    echo "Starting $PROG"
    # BUGFIX: quote "$LOG" so a log path with spaces doesn't break the
    # redirect.  $ARGS is deliberately unquoted: it must word-split into
    # separate command-line options.
    $PYTHON $PROG $ARGS >> "$LOG" 2>&1 &
}

# stop [signum]
stop () {
    echo "Stopping $PROG"
    pkill $1 -f "/usr/bin/python.* $PROG -c"
}

status () {
    if pgrep -f "/usr/bin/python.* $PROG -c" >/dev/null; then
        echo "$PROG" running
        return 0
    fi
    return 1
}

forcerestart () {
    stop
    try=1
    sleep 1
    while status; do
        try=$((try + 1))
        if [[ $try -gt 3 ]]; then
            stop -9    # escalate to SIGKILL after a few polite attempts
        else
            stop
        fi
        echo "Waiting for $PROG to die.."
        sleep 5
    done
    start
}

case $COMMAND in
    start) status || start
        ;;
    force-restart)
        forcerestart
        ;;
    restart)
        # tcollector already respawns collectors if they have changed
        # on-disk, and kills old ones/starts new ones.  The only thing
        # tcollector doesn't do is restart itself if itself has changed.
        # For a more graceful restart, just make sure we're running and
        # restart only if tcollector is newer on disk than since it
        # started.  This doesn't check for dependencies like asyncproc.py,
        # but that's ok.
        if status; then
            # BUGFIX: quote "$PROG"/"$PIDFILE" so paths with spaces work.
            newer=$(find "$PROG" -newer "$PIDFILE" | wc -l)
            if [[ $newer -gt 0 ]]; then
                forcerestart
            fi
        else
            start
        fi
        ;;
    stop) stop
        ;;
    status) status
        exit $?
        ;;
    # BUGFIX: the command list was lost from the usage string (angle
    # brackets stripped by an earlier conversion); restored below.
    *) echo >&2 "usage: $0 <start|stop|restart|force-restart|status>"
        exit 1
        ;;
esac
spec_version = 1.0
jmx_JAVA = \
	jmx.java \

jmx_LIBADD = \
	/usr/lib/jvm/java-6-sun/lib/tools.jar \

AM_JAVACFLAGS = -Xlint -source 6
JVM_ARGS =
package_dir = $(subst .,/,$(package))
jmx_classes = $(jmx_JAVA:%.java=$(top_builddir)/$(package_dir)/%.class)
jmx_jar = $(top_builddir)/jmx-$(spec_version).jar

jmx: $(jmx_jar)

jmx_get_dep_classpath = `echo $(jmx_LIBADD) | tr ' ' ':'`
$(top_builddir)/.javac-stamp: $(jmx_JAVA)
	@mkdir -p $(top_builddir)
	javac $(AM_JAVACFLAGS) -cp $(jmx_get_dep_classpath) \
	 -d $(top_builddir) $(jmx_JAVA)
	@touch "$@"

classes_with_nested_classes = $(jmx_classes:$(top_builddir)/%.class=%*.class)

pkg_version = \
  `git rev-list --pretty=format:%h HEAD --max-count=1 | sed 1d || echo unknown`
$(top_builddir)/manifest: $(top_builddir)/.javac-stamp ../../.git/HEAD
	{ echo "Specification-Title: $(spec_title)"; \
	  echo "Specification-Version: $(spec_version)"; \
	  echo "Specification-Vendor: $(spec_vendor)"; \
	  echo "Implementation-Title: $(package)"; \
	  echo "Implementation-Version: $(pkg_version)"; \
	  echo "Implementation-Vendor: $(spec_vendor)"; } >"$@"

# I've seen cases where `jar' exits with an error but leaves a partially
# built .jar file!  BUGFIX: the cleanup used $(jar), an undefined variable,
# so the partial jar was never actually removed; use $(jmx_jar).
$(jmx_jar): $(top_builddir)/manifest $(top_builddir)/.javac-stamp $(jmx_classes)
	cd $(top_builddir) && jar cfm `basename $(jmx_jar)` manifest $(classes_with_nested_classes) \
	 || { rv=$$? && rm -f `basename $(jmx_jar)` && exit $$rv; }

doc: $(top_builddir)/api/index.html

JDK_JAVADOC = http://download.oracle.com/javase/6/docs/api
$(top_builddir)/api/index.html: $(jmx_JAVA) $(BUILT_SOURCES)
	javadoc -d $(top_builddir)/api -classpath $(get_dep_classpath) \
	 -link $(JDK_JAVADOC) -link $(jmx_JAVA) $(BUILT_SOURCES)

clean:
	@rm -f $(top_builddir)/.javac-stamp
	rm -f $(top_builddir)/manifest $(BUILT_SOURCES)
	cd $(top_builddir) || exit 0 && rm -f $(classes_with_nested_classes)
	cd $(top_builddir) || exit 0 \
	 && test -d $(package_dir) || exit 0 \
	 && find $(package_dir) -type d -depth -exec rmdir {} ';' \
	 && dir=$(package_dir) && dir=$${dir%/*} \
	 && while test x"$$dir" != x"$${dir%/*}"; do \
	      rmdir "$$dir" && dir=$${dir%/*} || break; \
	    done \
	 && rmdir "$$dir"

distclean: clean
	rm -f $(jmx_jar)
	rm -rf $(top_builddir)/api
	test ! -d $(top_builddir) || rmdir $(top_builddir)

.PHONY: all jmx clean distclean doc check
/** Quick CLI tool to get JMX MBean attributes. */
package com.stumbleupon.monitoring;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.management.MBeanAttributeInfo;
import javax.management.MBeanInfo;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.openmbean.TabularData;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

// Sun specific
import com.sun.tools.attach.AgentInitializationException;
import com.sun.tools.attach.AgentLoadException;
import com.sun.tools.attach.AttachNotSupportedException;
import com.sun.tools.attach.VirtualMachine;
import com.sun.tools.attach.VirtualMachineDescriptor;

// Sun private
import sun.management.ConnectorAddressLink;
import sun.jvmstat.monitor.HostIdentifier;
import sun.jvmstat.monitor.MonitoredHost;
import sun.jvmstat.monitor.MonitoredVm;
import sun.jvmstat.monitor.MonitoredVmUtil;
import sun.jvmstat.monitor.VmIdentifier;

final class jmx {

  /** Agent property under which a JVM publishes its local JMX connector URL. */
  private static final String LOCAL_CONNECTOR_ADDRESS =
    "com.sun.management.jmxremote.localConnectorAddress";

  /** Prints the command-line help on stdout. */
  private static void usage() {
    System.out.println("Usage:\n"
      + "  jmx -l                    Lists all reachable VMs.\n"
      + "  jmx <JVM>                 Lists all MBeans for this JVM (PID or regexp).\n"
      + "  jmx <JVM> <MBean>         Prints all the attributes of this MBean.\n"
      + "  jmx <JVM> <MBean> <attr>  Prints the matching attributes of this MBean.\n"
      + "\n"
      + "You can pass multiple <MBean> <attr> pairs to match multiple different\n"
      + "attributes for different MBeans.  For example:\n"
      + "  jmx --long JConsole Class Count Thread Total Garbage Collection\n"
      + "  LoadedClassCount	2808	java.lang:type=ClassLoading\n"
      + "  UnloadedClassCount	0	java.lang:type=ClassLoading\n"
      + "  TotalLoadedClassCount	2808	java.lang:type=ClassLoading\n"
      + "  CollectionCount	0	java.lang:type=GarbageCollector,name=ConcurrentMarkSweep\n"
      + "  CollectionTime	0	java.lang:type=GarbageCollector,name=ConcurrentMarkSweep\n"
      + "  CollectionCount	1	java.lang:type=GarbageCollector,name=ParNew\n"
      + "  CollectionTime	19	java.lang:type=GarbageCollector,name=ParNew\n"
      + "  TotalStartedThreadCount	43	java.lang:type=Threading\n"
      + "The command above searched for a JVM with `JConsole' in its name, and then searched\n"
      + "for MBeans with `Class' in the name and `Count' in the attribute (first 3 matches\n"
      + "in this output), MBeans with `Thread' in the name and `Total' in the attribute (last\n"
      + "line in the output) and MBeans matching `Garbage' with a `Collection' attribute.\n"
      + "\n"
      + "Other flags you can pass:\n"
      + "  --long                    Print a longer but more explicit output for each value.\n"
      + "  --timestamp               Print a timestamp at the beginning of each line.\n"
      + "  --watch N                 Reprint the output every N seconds.\n"
      + "\n"
      + "Return value:\n"
      + "  0: Everything OK.\n"
      + "  1: Invalid usage or unexpected error.\n"
      + "  2: No JVM matched.\n"
      + "  3: No MBean matched.\n"
      + "  4: No attribute matched for the MBean(s) selected.");
  }

  /**
   * Prints {@code errmsg} on stderr and terminates the JVM.
   * @param rv The process exit status (see {@link #usage} for the meaning
   * of each value).
   * @param errmsg The error message to print.
   */
  private static void fatal(final int rv, final String errmsg) {
    System.err.println(errmsg);
    System.exit(rv);
    // System.exit never returns; this keeps the compiler happy about
    // code paths that "fall through" a call to fatal().
    throw new AssertionError("You should never see this, really.");
  }

  public static void main(final String[] args) throws Exception {
    if (args.length == 0
        || "-h".equals(args[0]) || "--help".equals(args[0])) {
      usage();
      System.exit(args.length == 0 ? 1 : 0);
      return;
    }

    // Parse the leading option flags.  Anything unrecognized ends the loop
    // and is treated as the JVM selector.
    int current_arg = 0;
    int watch = 0;                       // Seconds between reprints, 0 = once.
    boolean long_output = false;
    boolean print_timestamps = false;
    while (current_arg < args.length) {
      if ("--watch".equals(args[current_arg])) {
        current_arg++;
        if (current_arg == args.length) {  // Was: AIOOBE when --watch is last.
          fatal(1, "Missing value for --watch");
          return;
        }
        try {
          watch = Integer.parseInt(args[current_arg]);
        } catch (NumberFormatException e) {
          fatal(1, "Invalid value for --watch: " + e.getMessage());
          return;
        }
        if (watch < 1) {
          fatal(1, "Invalid value for --watch: " + watch);
        }
        current_arg++;
      } else if ("--long".equals(args[current_arg])) {
        long_output = true;
        current_arg++;
      } else if ("--timestamp".equals(args[current_arg])) {
        print_timestamps = true;
        current_arg++;
      } else {
        break;
      }
    }

    if (current_arg == args.length) {
      usage();
      fatal(1, "error: Missing argument (-l or JVM specification).");
      return;
    }

    HashMap<Integer, JVM> vms = getJVMs();
    if ("-l".equals(args[current_arg])) {
      printVmList(vms.values());
      return;
    }

    final JVM jvm = selectJVM(args[current_arg++], vms);
    vms = null;  // Let the map be GC'ed, we only need the selected JVM.
    final JMXConnector connection = JMXConnectorFactory.connect(jvm.jmxUrl());
    try {
      final MBeanServerConnection mbsc = connection.getMBeanServerConnection();
      if (args.length == current_arg) {  // No MBean selector: list them all.
        for (final ObjectName mbean : listMBeans(mbsc)) {
          System.out.println(mbean);
        }
        return;
      }

      final TreeMap<ObjectName, Pattern> objects =
        selectMBeans(args, current_arg, mbsc);
      if (objects.isEmpty()) {
        fatal(3, "No MBean matched your query in " + jvm.name());
        return;
      }
      do {
        boolean found = false;
        for (final Map.Entry<ObjectName, Pattern> entry : objects.entrySet()) {
          final ObjectName object = entry.getKey();
          final MBeanInfo mbean = mbsc.getMBeanInfo(object);
          final Pattern wanted = entry.getValue();
          for (final MBeanAttributeInfo attr : mbean.getAttributes()) {
            // A null pattern means "all attributes of this MBean".
            if (wanted == null || wanted.matcher(attr.getName()).find()) {
              dumpMBean(long_output, print_timestamps, mbsc, object, attr);
              found = true;
            }
          }
        }
        if (!found) {
          fatal(4, "No attribute of " + objects.keySet()
                + " matched your query in " + jvm.name());
          return;
        }
        System.out.flush();
        Thread.sleep(watch * 1000);
      } while (watch > 0);
    } finally {
      connection.close();
    }
  }

  /**
   * Matches the remaining command-line arguments against the MBeans of a JVM.
   * Arguments are consumed as (MBean regexp, attribute regexp) pairs; a
   * trailing MBean regexp with no attribute regexp maps to {@code null},
   * meaning "every attribute".
   * @return A map from matched MBean name to the attribute pattern wanted
   * for it (possibly {@code null}).
   */
  private static TreeMap<ObjectName, Pattern> selectMBeans(final String[] args,
                                                           final int current_arg,
                                                           final MBeanServerConnection mbsc)
    throws IOException {
    final TreeMap<ObjectName, Pattern> mbeans = new TreeMap<ObjectName, Pattern>();
    for (int i = current_arg; i < args.length; i += 2) {
      final Pattern object_re = compile_re(args[i]);
      final Pattern attr_re = i + 1 < args.length ? compile_re(args[i + 1]) : null;
      for (final ObjectName o : listMBeans(mbsc)) {
        if (object_re.matcher(o.toString()).find()) {
          mbeans.put(o, attr_re);
        }
      }
    }
    return mbeans;
  }

  /**
   * Prints the value(s) of one attribute of one MBean.
   * TabularData attributes are expanded into one line per row, with a
   * {@code .N} index suffix appended to the attribute name.
   */
  private static void dumpMBean(final boolean long_output,
                                final boolean print_timestamps,
                                final MBeanServerConnection mbsc,
                                final ObjectName object,
                                final MBeanAttributeInfo attr) throws Exception {
    final String name = attr.getName();
    Object value = mbsc.getAttribute(object, name);
    if (value instanceof TabularData) {
      final TabularData tab = (TabularData) value;
      int i = 0;
      for (final Object o : tab.keySet()) {
        dumpMBeanValue(long_output, print_timestamps, object, name + "." + i, o);
        i++;
      }
    } else {
      dumpMBeanValue(long_output, print_timestamps, object, name, value);
    }
  }

  /** Formats and prints a single (attribute, value) line on stdout. */
  private static void dumpMBeanValue(final boolean long_output,
                                     final boolean print_timestamps,
                                     final ObjectName object,
                                     final String name,
                                     final Object value) {
    final StringBuilder buf = new StringBuilder();
    final long timestamp = System.currentTimeMillis() / 1000;  // Seconds.
    if (print_timestamps) {
      buf.append(timestamp).append('\t');
    }
    if (value instanceof Object[]) {
      for (final Object o : (Object[]) value) {
        buf.append(o).append('\t');
      }
      buf.setLength(buf.length() - 1);  // Drop the trailing tab.
    } else {
      buf.append(name).append('\t').append(value);
    }
    if (long_output) {
      buf.append('\t').append(object);
    }
    buf.append('\n');
    System.out.print(buf);
  }

  /** Returns all the MBeans of the given connection, sorted by name. */
  private static ArrayList<ObjectName> listMBeans(final MBeanServerConnection mbsc)
    throws IOException {
    ArrayList<ObjectName> mbeans = new ArrayList<ObjectName>(mbsc.queryNames(null, null));
    Collections.sort(mbeans, new Comparator<ObjectName>() {
      public int compare(final ObjectName a, final ObjectName b) {
        return a.toString().compareTo(b.toString());
      }
    });
    return mbeans;
  }

  /** Compiles the given regexp or exits with status 1 if it's invalid. */
  private static Pattern compile_re(final String re) {
    try {
      return Pattern.compile(re);
    } catch (PatternSyntaxException e) {
      fatal(1, "Invalid regexp: " + re + ", " + e.getMessage());
      throw new AssertionError("Should never be here");
    }
  }

  /** Marker property we set on ourselves so we can recognize our own JVM. */
  private static final String MAGIC_STRING = "this.is.jmx.magic";

  /**
   * Resolves a JVM selector (numeric PID or name regexp) to a single JVM.
   * Our own JVM and other long-running {@code jmx --watch} clients are
   * excluded from regexp matches.  Exits with status 2 when zero or more
   * than one JVM matches.
   */
  private static JVM selectJVM(final String selector,
                               final HashMap<Integer, JVM> vms) {
    String error = null;
    try {
      final int pid = Integer.parseInt(selector);
      if (pid < 2) {
        throw new IllegalArgumentException("Invalid PID: " + pid);
      }
      final JVM jvm = vms.get(pid);
      if (jvm != null) {
        return jvm;
      }
      error = "Couldn't find a JVM with PID " + pid;
    } catch (NumberFormatException e) {
      /* Ignore: the selector isn't a PID, fall through to regexp matching. */
    }
    if (error == null) {
      try {
        final Pattern p = compile_re(selector);
        final ArrayList<JVM> matches = new ArrayList<JVM>(2);
        for (final JVM jvm : vms.values()) {
          if (p.matcher(jvm.name()).find()) {
            matches.add(jvm);
          }
        }
        // Exclude ourselves from the matches: tag our JVM with a magic
        // system property, then attach to each candidate and drop any JVM
        // that carries the tag.
        System.setProperty(MAGIC_STRING,
                           "LOL Java processes can't get their own PID");
        final String me = jmx.class.getName();
        final Iterator<JVM> it = matches.iterator();
        while (it.hasNext()) {
          final JVM jvm = it.next();
          final String name = jvm.name();
          // Ignore other long running jmx clients too.
          if (name.contains("--watch") && name.contains(me)) {
            it.remove();
            continue;
          }
          final VirtualMachine vm = VirtualMachine.attach(String.valueOf(jvm.pid()));
          try {
            if (vm.getSystemProperties().containsKey(MAGIC_STRING)) {
              it.remove();
              continue;
            }
          } finally {
            vm.detach();
          }
        }
        System.clearProperty(MAGIC_STRING);
        if (matches.size() == 0) {
          error = "No JVM matched your regexp " + selector;
        } else if (matches.size() > 1) {
          printVmList(matches);
          error = matches.size() + " JVMs matched your regexp " + selector
            + ", it's too ambiguous, please refine it.";
        } else {
          return matches.get(0);
        }
      } catch (PatternSyntaxException e) {
        error = "Invalid pattern: " + selector + ", " + e.getMessage();
      } catch (Exception e) {
        e.printStackTrace();
        error = "Unexpected Exception: " + e.getMessage();
      }
    }
    fatal(2, error);
    return null;
  }

  /** Prints the given JVMs on stdout, one per line, sorted by PID. */
  private static void printVmList(final Collection<JVM> vms) {
    final ArrayList<JVM> sorted_vms = new ArrayList<JVM>(vms);
    Collections.sort(sorted_vms, new Comparator<JVM>() {
      public int compare(final JVM a, final JVM b) {
        return a.pid() - b.pid();
      }
    });
    for (final JVM jvm : sorted_vms) {
      System.out.println(jvm.pid() + "\t" + jvm.name());
    }
  }

  /** A locally-running JVM we may want to connect to over JMX. */
  private static final class JVM {
    final int pid;          // Process ID of the JVM.
    final String name;      // Command line (or PID as a string) of the JVM.
    String address;         // JMX connector address, null until the agent runs.

    public JVM(final int pid, final String name, final String address) {
      if (name.isEmpty()) {
        throw new IllegalArgumentException("empty name");
      }
      this.pid = pid;
      this.name = name;
      this.address = address;
    }

    public int pid() {
      return pid;
    }

    public String name() {
      return name;
    }

    /**
     * Returns the JMX service URL of this JVM, starting its management
     * agent first if necessary.
     */
    public JMXServiceURL jmxUrl() {
      if (address == null) {
        ensureManagementAgentStarted();
      }
      try {
        return new JMXServiceURL(address);
      } catch (Exception e) {
        throw new RuntimeException("Error", e);
      }
    }

    /**
     * Loads the JMX management agent into this JVM if it's not running yet,
     * and records the local connector address it publishes.
     */
    public void ensureManagementAgentStarted() {
      if (address != null) {  // already started
        return;
      }
      VirtualMachine vm;
      try {
        vm = VirtualMachine.attach(String.valueOf(pid));
      } catch (AttachNotSupportedException e) {
        throw new RuntimeException("Failed to attach to " + this, e);
      } catch (IOException e) {
        throw new RuntimeException("Failed to attach to " + this, e);
      }
      try {
        // java.sun.com/javase/6/docs/technotes/guides/management/agent.html#gdhkz
        // + code mostly stolen from JConsole's code.
        final String home = vm.getSystemProperties().getProperty("java.home");

        // Normally in ${java.home}/jre/lib/management-agent.jar but might
        // be in ${java.home}/lib in build environments.
        String agent = home + File.separator + "jre" + File.separator
          + "lib" + File.separator + "management-agent.jar";
        File f = new File(agent);
        if (!f.exists()) {
          agent = home + File.separator + "lib" + File.separator
            + "management-agent.jar";
          f = new File(agent);
          if (!f.exists()) {
            throw new RuntimeException("Management agent not found");
          }
        }

        agent = f.getCanonicalPath();
        try {
          vm.loadAgent(agent, "com.sun.management.jmxremote");
        } catch (AgentLoadException e) {
          throw new RuntimeException("Failed to load the agent into " + this, e);
        } catch (AgentInitializationException e) {
          throw new RuntimeException("Failed to initialize the agent into " + this, e);
        }
        address = (String) vm.getAgentProperties().get(LOCAL_CONNECTOR_ADDRESS);
      } catch (IOException e) {
        throw new RuntimeException("Error while loading agent into " + this, e);
      } finally {
        try {
          vm.detach();
        } catch (IOException e) {
          throw new RuntimeException("Failed to detach from " + vm + " = " + this, e);
        }
      }
      if (address == null) {
        throw new RuntimeException("Couldn't start the management agent.");
      }
    }

    public String toString() {
      return "JVM(" + pid + ", \"" + name + "\", "
        + (address == null ? null : '"' + address + '"') + ')';
    }
  }

  /**
   * Returns a map from PID to JVM.
   */
  private static HashMap<Integer, JVM> getJVMs() throws Exception {
    final HashMap<Integer, JVM> vms = new HashMap<Integer, JVM>();
    getMonitoredVMs(vms);
    getAttachableVMs(vms);
    return vms;
  }

  /** Adds the JVMs visible through the jvmstat monitoring API to {@code out}. */
  private static void getMonitoredVMs(final HashMap<Integer, JVM> out) throws Exception {
    final MonitoredHost host =
      MonitoredHost.getMonitoredHost(new HostIdentifier((String) null));
    @SuppressWarnings("unchecked")
    final Set<Integer> vms = host.activeVms();
    for (final Integer pid : vms) {
      try {
        final VmIdentifier vmid = new VmIdentifier(pid.toString());
        final MonitoredVm vm = host.getMonitoredVm(vmid);
        out.put(pid, new JVM(pid, MonitoredVmUtil.commandLine(vm),
                             ConnectorAddressLink.importFrom(pid)));
        vm.detach();
      } catch (Exception x) {
        System.err.println("Ignoring exception:");
        x.printStackTrace();
      }
    }
  }

  /**
   * Adds the JVMs visible through the attach API to {@code out}, skipping
   * PIDs already present (jvmstat gives a better name for those).
   */
  private static void getAttachableVMs(final HashMap<Integer, JVM> out) {
    for (final VirtualMachineDescriptor vmd : VirtualMachine.list()) {
      int pid;
      try {
        pid = Integer.parseInt(vmd.id());
      } catch (NumberFormatException e) {
        System.err.println("Ignoring invalid vmd.id(): " + vmd.id()
                           + ' ' + e.getMessage());
        continue;
      }
      if (out.containsKey(pid)) {
        continue;
      }
      try {
        final VirtualMachine vm = VirtualMachine.attach(vmd);
        out.put(pid, new JVM(pid, String.valueOf(pid),
                             (String) vm.getAgentProperties().get(LOCAL_CONNECTOR_ADDRESS)));
        vm.detach();
      } catch (AttachNotSupportedException e) {
        System.err.println("VM not attachable: " + vmd.id()
                           + ' ' + e.getMessage());
      } catch (IOException e) {
        System.err.println("Could not attach: " + vmd.id()
                           + ' ' + e.getMessage());
      }
    }
  }

}
--------------------------------------------------------------------------------
/stumbleupon/tcollector.pp:
-------------------------------------------------------------------------------- 1 | # Example Puppet manifest for updating/starting tcollector 2 | # under puppet 3 | 4 | class tcollector { 5 | package { python: 6 | ensure => installed, 7 | } 8 | 9 | service { tcollector: 10 | ensure => running, 11 | require => [Package["python"], File["/usr/local/tcollector"]], 12 | start => "/usr/local/tcollector/startstop start", 13 | stop => "/usr/local/tcollector/startstop stop", 14 | restart => "/usr/local/tcollector/startstop restart", 15 | status => "/usr/local/tcollector/startstop status", 16 | subscribe => File["/usr/local/tcollector"], 17 | } 18 | 19 | file { ["/usr/local"]: 20 | owner => root, group => root, mode => 755, 21 | ensure => directory, 22 | } 23 | 24 | file { "/usr/local/tcollector": 25 | source => "puppet:///files/tcollector", 26 | owner => root, group => root, 27 | ensure => directory, 28 | recurse => true, 29 | ignore => '*.pyc', 30 | purge => true, 31 | force => true, 32 | require => File["/usr/local"], 33 | } 34 | } 35 | --------------------------------------------------------------------------------