├── COPYING ├── COPYING.LESSER ├── README ├── collectors ├── 0 │ ├── dfstat.py │ ├── elasticsearch.py │ ├── hadoop_datanode_jmx.py │ ├── hbase_regionserver_jmx.py │ ├── ifstat.py │ ├── iostat.py │ ├── mysql.py │ ├── netstat.py │ ├── procnettcp.py │ ├── procstats.py │ ├── redis-stats.py │ ├── riak.py │ ├── zfsiostats.py │ └── zfskernstats.py ├── etc │ ├── config.py │ └── mysqlconf.py └── lib │ └── jmx-1.0.jar ├── startstop ├── stumbleupon ├── monitoring │ ├── .gitignore │ ├── Makefile │ └── jmx.java └── tcollector.pp └── tcollector.py /COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 675 | -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 
14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | tcollector is a framework to collect data points and store them in OpenTSDB. 2 | It allows you to write simple collectors that it'll run and monitor. It also 3 | handles the communication with the TSDs. 4 | 5 | For more info, see 6 | 7 | http://www.opentsdb.net/tcollector.html 8 | -------------------------------------------------------------------------------- /collectors/0/dfstat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2010 StumbleUpon, Inc. 4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 
14 | """df disk space and inode counts for TSDB """ 15 | # 16 | # dfstat.py 17 | # 18 | # df.1kblocks.total total size of fs 19 | # df.1kblocks.used blocks used 20 | # df.1kblocks.available blocks available 21 | # df.inodes.total number of inodes 22 | # df.inodes.used number of inodes 23 | # df.inodes.free number of inodes 24 | 25 | # All metrics are tagged with mount= and fstype= 26 | # This makes it easier to exclude stuff like 27 | # tmpfs mounts from disk usage reports. 28 | 29 | # Because tsdb does not like slashes in tags, slashes will 30 | # be replaced by underscores in the mount= tag. In theory 31 | # this could cause problems if you have a mountpoint of 32 | # "/foo/bar/" and "/foo_bar/". 33 | 34 | 35 | import os 36 | import socket 37 | import subprocess 38 | import sys 39 | import time 40 | 41 | 42 | COLLECTION_INTERVAL = 60 # seconds 43 | 44 | def main(): 45 | """dfstats main loop""" 46 | 47 | while True: 48 | ts = int(time.time()) 49 | # 1kblocks 50 | df_proc = subprocess.Popen(["df", "-PlTk"], stdout=subprocess.PIPE) 51 | stdout, _ = df_proc.communicate() 52 | if df_proc.returncode == 0: 53 | for line in stdout.split("\n"): # pylint: disable=E1103 54 | fields = line.split() 55 | # skip header/blank lines 56 | if not line or not fields[2].isdigit(): 57 | continue 58 | # Skip mounts/types we don't care about. 59 | # Most of this stuff is of type tmpfs, but we don't 60 | # want to blacklist all tmpfs since sometimes it's 61 | # used for active filesystems (/var/run, /tmp) 62 | # that we do want to track. 
63 | if fields[1] in ("debugfs", "devtmpfs"): 64 | continue 65 | if fields[6] == "/dev": 66 | continue 67 | # /dev/shm, /lib/init_rw, /lib/modules, etc 68 | #if fields[6].startswith(("/lib/", "/dev/")): # python2.5+ 69 | if fields[6].startswith("/lib/"): 70 | continue 71 | if fields[6].startswith("/dev/"): 72 | continue 73 | 74 | mount = fields[6] 75 | print ("df.1kblocks.total %d %s mount=%s fstype=%s" 76 | % (ts, fields[2], mount, fields[1])) 77 | print ("df.1kblocks.used %d %s mount=%s fstype=%s" 78 | % (ts, fields[3], mount, fields[1])) 79 | print ("df.1kblocks.free %d %s mount=%s fstype=%s" 80 | % (ts, fields[4], mount, fields[1])) 81 | else: 82 | print >> sys.stderr, "df -Pltk returned %r" % df_proc.returncode 83 | 84 | ts = int(time.time()) 85 | # inodes 86 | df_proc = subprocess.Popen(["df", "-PlTi"], stdout=subprocess.PIPE) 87 | stdout, _ = df_proc.communicate() 88 | if df_proc.returncode == 0: 89 | for line in stdout.split("\n"): # pylint: disable=E1103 90 | fields = line.split() 91 | if not line or not fields[2].isdigit(): 92 | continue 93 | 94 | mount = fields[6] 95 | print ("df.inodes.total %d %s mount=%s fstype=%s" 96 | % (ts, fields[2], mount, fields[1])) 97 | print ("df.inodes.used %d %s mount=%s fstype=%s" 98 | % (ts, fields[3], mount, fields[1])) 99 | print ("df.inodes.free %d %s mount=%s fstype=%s" 100 | % (ts, fields[4], mount, fields[1])) 101 | else: 102 | print >> sys.stderr, "df -Plti returned %r" % df_proc.returncode 103 | 104 | sys.stdout.flush() 105 | time.sleep(COLLECTION_INTERVAL) 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /collectors/0/elasticsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2011 StumbleUpon, Inc. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | """ElasticSearch collector""" # Because ES is cool, bonsai cool. 15 | # Tested with ES 0.16.5 and 0.17.x 16 | 17 | import errno 18 | import httplib 19 | try: 20 | import json 21 | except ImportError: 22 | json = None # Handled gracefully in main. Not available by default in <2.6 23 | import socket 24 | import sys 25 | import time 26 | 27 | 28 | COLLECTION_INTERVAL = 15 # seconds 29 | DEFAULT_TIMEOUT = 10.0 # seconds 30 | ES_HOST = "localhost" 31 | ES_PORT = 9200 # TCP port on which ES listens. 
32 | 33 | STATUS_MAP = { 34 | "green": 0, 35 | "yellow": 1, 36 | "red": 2, 37 | } 38 | 39 | 40 | def is_numeric(value): 41 | return isinstance(value, (int, long, float)) 42 | 43 | 44 | def err(msg): 45 | print >>sys.stderr, msg 46 | 47 | 48 | class ESError(RuntimeError): 49 | """Exception raised if we don't get a 200 OK from ElasticSearch.""" 50 | 51 | def __init__(self, resp): 52 | RuntimeError.__init__(self, str(resp)) 53 | self.resp = resp 54 | 55 | 56 | def request(server, uri): 57 | """Does a GET request of the given uri on the given HTTPConnection.""" 58 | server.request("GET", uri) 59 | resp = server.getresponse() 60 | if resp.status != httplib.OK: 61 | raise ESError(resp) 62 | return json.loads(resp.read()) 63 | 64 | 65 | def cluster_health(server): 66 | return request(server, "/_cluster/health") 67 | 68 | 69 | def cluster_state(server): 70 | return request(server, "/_cluster/state" 71 | + "?filter_routing_table=true&filter_metadata=true&filter_blocks=true") 72 | 73 | 74 | def node_stats(server): 75 | return request(server, "/_cluster/nodes/_local/stats") 76 | 77 | 78 | def main(argv): 79 | socket.setdefaulttimeout(DEFAULT_TIMEOUT) 80 | server = httplib.HTTPConnection(ES_HOST, ES_PORT) 81 | try: 82 | server.connect() 83 | except socket.error, (erno, e): 84 | if erno == errno.ECONNREFUSED: 85 | return 13 # No ES running, ask tcollector to not respawn us. 
86 | raise 87 | if json is None: 88 | err("This collector requires the `json' Python module.") 89 | return 1 90 | 91 | nstats = node_stats(server) 92 | cluster_name = nstats["cluster_name"] 93 | nodeid, nstats = nstats["nodes"].popitem() 94 | 95 | ts = None 96 | def printmetric(metric, value, **tags): 97 | if tags: 98 | tags = " " + " ".join("%s=%s" % (name, value) 99 | for name, value in tags.iteritems()) 100 | else: 101 | tags = "" 102 | print ("elasticsearch.%s %d %s cluster=%s%s" 103 | % (metric, ts, value, cluster_name, tags)) 104 | 105 | while True: 106 | ts = int(time.time()) 107 | nstats = node_stats(server) 108 | # Check that the node's identity hasn't changed in the mean time. 109 | if nstats["cluster_name"] != cluster_name: 110 | err("cluster_name changed from %r to %r" 111 | % (cluster_name, nstats["cluster_name"])) 112 | return 1 113 | this_nodeid, nstats = nstats["nodes"].popitem() 114 | if this_nodeid != nodeid: 115 | err("node ID changed from %r to %r" % (nodeid, this_nodeid)) 116 | return 1 117 | 118 | is_master = nodeid == cluster_state(server)["master_node"] 119 | printmetric("is_master", int(is_master)) 120 | if is_master: 121 | ts = int(time.time()) # In case last call took a while. 122 | cstats = cluster_health(server) 123 | for stat, value in cstats.iteritems(): 124 | if stat == "status": 125 | value = STATUS_MAP.get(value, -1) 126 | elif not is_numeric(value): 127 | continue 128 | printmetric("cluster." 
+ stat, value) 129 | 130 | ts = nstats["os"]["timestamp"] / 1000 # ms -> s 131 | indices = nstats["indices"] 132 | printmetric("indices.size", indices["size_in_bytes"]) 133 | printmetric("num_docs", indices["docs"]["num_docs"]) 134 | d = indices["cache"] 135 | printmetric("cache.field.evictions", d["field_evictions"]) 136 | printmetric("cache.field.size", d["field_size_in_bytes"]) 137 | printmetric("cache.filter.count", d["filter_count"]) 138 | printmetric("cache.filter.evictions", d["filter_evictions"]) 139 | printmetric("cache.filter.size", d["filter_size_in_bytes"]) 140 | d = indices["merges"] 141 | printmetric("merges.current", d["current"]) 142 | printmetric("merges.total", d["total"]) 143 | printmetric("merges.total_time", d["total_time_in_millis"] / 1000.) 144 | del indices 145 | process = nstats["process"] 146 | ts = process["timestamp"] / 1000 # ms -> s 147 | open_fds = process.get("open_file_descriptors") # ES 0.17 148 | if open_fds is None: 149 | open_fds = process.get("fd") # ES 0.16 150 | if open_fds is not None: 151 | open_fds = open_fds["total"] 152 | if open_fds is not None: 153 | printmetric("process.open_file_descriptors", open_fds) 154 | d = process["cpu"] 155 | printmetric("process.cpu.percent", d["percent"]) 156 | printmetric("process.cpu.sys", d["sys_in_millis"] / 1000.) 157 | printmetric("process.cpu.user", d["user_in_millis"] / 1000.) 
158 | d = process["mem"] 159 | printmetric("process.mem.resident", d["resident_in_bytes"]) 160 | printmetric("process.mem.shared", d["share_in_bytes"]) 161 | printmetric("process.mem.total_virtual", d["total_virtual_in_bytes"]) 162 | del process 163 | jvm = nstats["jvm"] 164 | ts = jvm["timestamp"] / 1000 # ms -> s 165 | d = jvm["mem"] 166 | printmetric("jvm.mem.heap_used", d["heap_used_in_bytes"]) 167 | printmetric("jvm.mem.heap_committed", d["heap_committed_in_bytes"]) 168 | printmetric("jvm.mem.non_heap_used", d["non_heap_used_in_bytes"]) 169 | printmetric("jvm.mem.non_heap_committed", d["non_heap_committed_in_bytes"]) 170 | d = jvm["threads"] 171 | printmetric("jvm.threads.count", d["count"]) 172 | printmetric("jvm.threads.peak_count", d["peak_count"]) 173 | for gc, d in jvm["gc"]["collectors"].iteritems(): 174 | printmetric("jvm.gc.collection_count", d["collection_count"], gc=gc) 175 | printmetric("jvm.gc.collection_time", 176 | d["collection_time_in_millis"] / 1000., gc=gc) 177 | del jvm 178 | del d 179 | for stat, value in nstats["network"]["tcp"].iteritems(): 180 | if is_numeric(value): 181 | printmetric("network.tcp." + stat, value) 182 | for stat, value in nstats["transport"].iteritems(): 183 | if is_numeric(value): 184 | printmetric("transport." + stat, value) 185 | # New in ES 0.17: 186 | for stat, value in nstats.get("http", {}).iteritems(): 187 | if is_numeric(value): 188 | printmetric("http." + stat, value) 189 | del nstats 190 | time.sleep(COLLECTION_INTERVAL) 191 | 192 | 193 | if __name__ == "__main__": 194 | sys.exit(main(sys.argv)) 195 | -------------------------------------------------------------------------------- /collectors/0/hadoop_datanode_jmx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2012 StumbleUpon, Inc. 
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | 15 | import os 16 | import pwd 17 | import re 18 | import signal 19 | import subprocess 20 | import sys 21 | import time 22 | 23 | # If this user doesn't exist, we'll exit immediately. 24 | # If we're running as root, we'll drop privileges using this user. 25 | USER = "hadoop" 26 | 27 | # We add those files to the classpath if they exist. 28 | CLASSPATH = [ 29 | "/usr/lib/jvm/java-6-sun/lib/tools.jar", 30 | ] 31 | 32 | # Map certain JVM stats so they are unique and shorter 33 | JMX_SERVICE_RENAMING = { 34 | "GarbageCollector": "datanode.gc", 35 | "OperatingSystem": "datanode.os", 36 | "Threading": "datanode.threads", 37 | } 38 | 39 | IGNORED_METRICS = set(["revision", "hdfsUser", "hdfsDate", "hdfsUrl", "date", 40 | "hdfsRevision", "user", "hdfsVersion", "url", "version", 41 | "NamenodeAddress", "Version", "RpcPort", "HttpPort", 42 | # These are useless as-is because they represent the 43 | # thread that's dedicated to serving JMX RPCs. 44 | "CurrentThreadCpuTime", "CurrentThreadUserTime", 45 | # List of directories used by the DataNode. 46 | "StorageInfo", 47 | "VolumeInfo", 48 | ]) 49 | 50 | # How many times, maximum, will we attempt to restart the JMX collector. 51 | # If we reach this limit, we'll exit with an error. 52 | MAX_RESTARTS = 10 53 | 54 | TOP = False # Set to True when we want to terminate. 
55 | RETVAL = 0 # Return value set by signal handler. 56 | 57 | 58 | def drop_privileges(): 59 | try: 60 | ent = pwd.getpwnam(USER) 61 | except KeyError: 62 | print >>sys.stderr, "Not running, user '%s' doesn't exist" % USER 63 | sys.exit(13) 64 | 65 | if os.getuid() != 0: 66 | return 67 | 68 | os.setgid(ent.pw_gid) 69 | os.setuid(ent.pw_uid) 70 | 71 | 72 | def kill(proc): 73 | """Kills the subprocess given in argument.""" 74 | # Clean up after ourselves. 75 | proc.stdout.close() 76 | rv = proc.poll() 77 | if rv is None: 78 | os.kill(proc.pid, 15) 79 | rv = proc.poll() 80 | if rv is None: 81 | os.kill(proc.pid, 9) # Bang bang! 82 | rv = proc.wait() # This shouldn't block too long. 83 | print >>sys.stderr, "warning: proc exited %d" % rv 84 | return rv 85 | 86 | 87 | def do_on_signal(signum, func, *args, **kwargs): 88 | """Calls func(*args, **kwargs) before exiting when receiving signum.""" 89 | def signal_shutdown(signum, frame): 90 | print >>sys.stderr, "got signal %d, exiting" % signum 91 | func(*args, **kwargs) 92 | sys.exit(128 + signum) 93 | signal.signal(signum, signal_shutdown) 94 | 95 | 96 | def main(argv): 97 | drop_privileges() 98 | # Build the classpath. 99 | dir = os.path.dirname(sys.argv[0]) 100 | jar = os.path.normpath(dir + "/../lib/jmx-1.0.jar") 101 | if not os.path.exists(jar): 102 | print >>sys.stderr, "WTF?! Can't run, %s doesn't exist" % jar 103 | return 13 104 | classpath = [jar] 105 | for jar in CLASSPATH: 106 | if os.path.exists(jar): 107 | classpath.append(jar) 108 | classpath = ":".join(classpath) 109 | 110 | jmx = subprocess.Popen( 111 | ["java", "-enableassertions", "-enablesystemassertions", # safe++ 112 | "-Xmx64m", # Low RAM limit, to avoid stealing too much from prod. 113 | "-cp", classpath, "com.stumbleupon.monitoring.jmx", 114 | "--watch", "10", "--long", "--timestamp", 115 | "DataNode", # Name of the process. 116 | # The remaining arguments are pairs (mbean_regexp, attr_regexp). 
117 | # The first regexp is used to match one or more MBeans, the 2nd 118 | # to match one or more attributes of the MBeans matched. 119 | "hadoop", "", # All HBase / hadoop metrics. 120 | "Threading", "Count|Time$", # Number of threads and CPU time. 121 | "OperatingSystem", "OpenFile", # Number of open files. 122 | "GarbageCollector", "Collection", # GC runs and time spent GCing. 123 | ], stdout=subprocess.PIPE, bufsize=1) 124 | do_on_signal(signal.SIGINT, kill, jmx) 125 | do_on_signal(signal.SIGPIPE, kill, jmx) 126 | do_on_signal(signal.SIGTERM, kill, jmx) 127 | try: 128 | prev_timestamp = 0 129 | while True: 130 | line = jmx.stdout.readline() 131 | 132 | if not line and jmx.poll() is not None: 133 | break # Nothing more to read and process exited. 134 | elif len(line) < 4: 135 | print >>sys.stderr, "invalid line (too short): %r" % line 136 | continue 137 | 138 | timestamp, metric, value, mbean = line.split("\t", 3) 139 | # Sanitize the timestamp. 140 | try: 141 | timestamp = int(timestamp) 142 | if timestamp < time.time() - 600: 143 | raise ValueError("timestamp too old: %d" % timestamp) 144 | if timestamp < prev_timestamp: 145 | raise ValueError("timestamp out of order: prev=%d, new=%d" 146 | % (prev_timestamp, timestamp)) 147 | except ValueError, e: 148 | print >>sys.stderr, ("Invalid timestamp on line: %r -- %s" 149 | % (line, e)) 150 | continue 151 | prev_timestamp = timestamp 152 | 153 | if metric in IGNORED_METRICS: 154 | continue 155 | 156 | tags = "" 157 | # The JMX metrics have per-request-type metrics like so: 158 | # metricNameNumOps 159 | # metricNameMinTime 160 | # metricNameMaxTime 161 | # metricNameAvgTime 162 | # Group related metrics together in the same metric name, use tags 163 | # to separate the different request types, so we end up with: 164 | # numOps op=metricName 165 | # avgTime op=metricName 166 | # etc, which makes it easier to graph things with the TSD. 
167 | if metric.endswith("MinTime"): # We don't care about the minimum 168 | continue # time taken by operations. 169 | elif metric.endswith("NumOps"): 170 | tags = " op=" + metric[:-6] 171 | metric = "numOps" 172 | elif metric.endswith("AvgTime"): 173 | tags = " op=" + metric[:-7] 174 | metric = "avgTime" 175 | elif metric.endswith("MaxTime"): 176 | tags = " op=" + metric[:-7] 177 | metric = "maxTime" 178 | 179 | # mbean is of the form "domain:key=value,...,foo=bar" 180 | # some tags can have spaces, so we need to fix that. 181 | mbean_domain, mbean_properties = mbean.rstrip().replace(" ", "_").split(":", 1) 182 | if mbean_domain not in ("hadoop", "java.lang"): 183 | print >>sys.stderr, ("Unexpected mbean domain = %r on line %r" 184 | % (mbean_domain, line)) 185 | continue 186 | mbean_properties = dict(prop.split("=", 1) 187 | for prop in mbean_properties.split(",")) 188 | if mbean_domain == "hadoop": 189 | # jmx_service is HBase by default, but we can also have 190 | # RegionServer or Replication and such. 191 | jmx_service = mbean_properties.get("service", "HBase") 192 | if jmx_service == "HBase": 193 | jmx_service = "regionserver" 194 | elif mbean_domain == "java.lang": 195 | jmx_service = mbean_properties.pop("type", "jvm") 196 | if mbean_properties: 197 | tags += " " + " ".join(k + "=" + v for k, v in 198 | mbean_properties.iteritems()) 199 | else: 200 | assert 0, "Should never be here" 201 | 202 | jmx_service = JMX_SERVICE_RENAMING.get(jmx_service, jmx_service) 203 | metric = jmx_service.lower() + "." + metric 204 | 205 | sys.stdout.write("hadoop.%s %d %s%s\n" 206 | % (metric, timestamp, value, tags)) 207 | sys.stdout.flush() 208 | finally: 209 | kill(jmx) 210 | time.sleep(300) 211 | return 0 # Ask the tcollector to re-spawn us. 
212 | 213 | 214 | if __name__ == "__main__": 215 | sys.exit(main(sys.argv)) 216 | -------------------------------------------------------------------------------- /collectors/0/hbase_regionserver_jmx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This file is part of tcollector. 3 | # Copyright (C) 2010 StumbleUpon, Inc. 4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | 15 | import os 16 | import pwd 17 | import re 18 | import signal 19 | import subprocess 20 | import sys 21 | import time 22 | import traceback 23 | 24 | # If this user doesn't exist, we'll exit immediately. 25 | # If we're running as root, we'll drop privileges using this user. 26 | USER = "hadoop" 27 | 28 | # We add those files to the classpath if they exist. 29 | CLASSPATH = [ 30 | "/usr/lib/jvm/java-6-sun/lib/tools.jar", 31 | ] 32 | 33 | # We shorten certain strings to avoid excessively long metric names. 
# Maps JMX service names to the short names we use in metric names,
# to keep metric names from getting excessively long.
JMX_SERVICE_RENAMING = {
    "GarbageCollector": "gc",
    "OperatingSystem": "os",
    "Threading": "threads",
    # New in 0.92.1, from HBASE-5325:
    "org.apache.hbase": "hbase",
}


def drop_privileges():
    """Exits if USER doesn't exist, drops privileges to USER if we're root."""
    try:
        ent = pwd.getpwnam(USER)
    except KeyError:
        sys.stderr.write("Not running, user '%s' doesn't exist\n" % USER)
        sys.exit(13)  # Ask tcollector not to respawn us.

    if os.getuid() != 0:
        return  # Not root: nothing to drop.

    os.setgid(ent.pw_gid)
    os.setuid(ent.pw_uid)


def kill(proc):
    """Kills the subprocess given in argument, escalating SIGTERM to SIGKILL.

    Returns the process's exit status.
    """
    # Clean up after ourselves.
    proc.stdout.close()
    rv = proc.poll()
    if rv is None:
        os.kill(proc.pid, 15)
        rv = proc.poll()
        if rv is None:
            os.kill(proc.pid, 9)  # Bang bang!
            rv = proc.wait()  # This shouldn't block too long.
    sys.stderr.write("warning: proc exited %d\n" % rv)
    return rv


def do_on_signal(signum, func, *args, **kwargs):
    """Calls func(*args, **kwargs) before exiting when receiving signum."""
    def signal_shutdown(signum, frame):
        sys.stderr.write("got signal %d, exiting\n" % signum)
        func(*args, **kwargs)
        sys.exit(128 + signum)
    signal.signal(signum, signal_shutdown)


def main(argv):
    """Forks the JMX helper and turns its output into TSD metric lines."""
    drop_privileges()
    # Build the classpath.
    dirname = os.path.dirname(sys.argv[0])
    jar = os.path.normpath(dirname + "/../lib/jmx-1.0.jar")
    if not os.path.exists(jar):
        sys.stderr.write("WTF?! Can't run, %s doesn't exist\n" % jar)
        return 13
    classpath = [jar]
    for jar in CLASSPATH:
        if os.path.exists(jar):
            classpath.append(jar)
    classpath = ":".join(classpath)

    jmx = subprocess.Popen(
        ["java", "-enableassertions", "-enablesystemassertions",  # safe++
         "-Xmx64m",  # Low RAM limit, to avoid stealing too much from prod.
         "-cp", classpath, "com.stumbleupon.monitoring.jmx",
         "--watch", "10", "--long", "--timestamp",
         "HRegionServer",  # Name of the process.
         # The remaining arguments are pairs (mbean_regexp, attr_regexp).
         # The first regexp is used to match one or more MBeans, the 2nd
         # to match one or more attributes of the MBeans matched.
         "hadoop", "",  # All HBase / hadoop metrics.
         "Threading", "Count|Time$",  # Number of threads and CPU time.
         "OperatingSystem", "OpenFile",  # Number of open files.
         "GarbageCollector", "Collection",  # GC runs and time spent GCing.
        ], stdout=subprocess.PIPE, bufsize=1)
    do_on_signal(signal.SIGINT, kill, jmx)
    do_on_signal(signal.SIGPIPE, kill, jmx)
    do_on_signal(signal.SIGTERM, kill, jmx)
    try:
        prev_timestamp = 0
        while True:
            line = jmx.stdout.readline()

            if not line and jmx.poll() is not None:
                break  # Nothing more to read and process exited.
            elif len(line) < 4:
                sys.stderr.write("invalid line (too short): %r\n" % line)
                continue

            try:
                timestamp, metric, value, mbean = line.split("\t", 3)
            except ValueError:
                # Temporary workaround for jmx.jar not printing these lines we
                # don't care about anyway properly.
                if "java.lang.String" not in line:
                    sys.stderr.write("Can't split line: %r\n" % line)
                continue

            # Sanitize the timestamp.
            try:
                timestamp = int(timestamp)
                if timestamp < time.time() - 600:
                    raise ValueError("timestamp too old: %d" % timestamp)
                if timestamp < prev_timestamp:
                    raise ValueError("timestamp out of order: prev=%d, new=%d"
                                     % (prev_timestamp, timestamp))
            except ValueError as e:
                sys.stderr.write("Invalid timestamp on line: %r -- %s\n"
                                 % (line, e))
                continue
            prev_timestamp = timestamp

            tags = ""
            # The JMX metrics have per-request-type metrics like so:
            #   metricNameNumOps
            #   metricNameMinTime
            #   metricNameMaxTime
            #   metricNameAvgTime
            # Group related metrics together in the same metric name, use tags
            # to separate the different request types, so we end up with:
            #   numOps op=metricName
            #   avgTime op=metricName
            # etc, which makes it easier to graph things with the TSD.
            if metric.endswith("MinTime"):  # We don't care about the minimum
                continue                    # time taken by operations.
            elif metric.endswith("NumOps"):
                tags = " op=" + metric[:-6]
                metric = "numOps"
            elif metric.endswith("AvgTime"):
                tags = " op=" + metric[:-7]
                metric = "avgTime"
            elif metric.endswith("MaxTime"):
                tags = " op=" + metric[:-7]
                metric = "maxTime"

            # mbean is of the form "domain:key=value,...,foo=bar"
            mbean_domain, mbean_properties = mbean.rstrip().split(":", 1)
            if mbean_domain not in ("hadoop", "java.lang"):
                sys.stderr.write("Unexpected mbean domain = %r on line %r\n"
                                 % (mbean_domain, line))
                continue
            mbean_properties = dict(prop.split("=", 1)
                                    for prop in mbean_properties.split(","))
            if mbean_domain == "hadoop":
                # jmx_service is HBase by default, but we can also have
                # RegionServer or Replication and such.
                jmx_service = mbean_properties.get("service", "HBase")
                if jmx_service == "HBase":
                    jmx_service = "regionserver"
            elif mbean_domain == "java.lang":
                jmx_service = mbean_properties.pop("type", "jvm")
                if mbean_properties:
                    tags += " " + " ".join(k + "=" + v for k, v in
                                           mbean_properties.items())
            else:
                assert 0, "Should never be here"

            # Hack.  Right now, the RegionServer is printing stats for its own
            # replication queue, but when another RegionServer dies, this one
            # may take over the replication queue of the dead one.  When this
            # happens, we'd get the same metrics multiple times, because
            # internally the RegionServer has multiple queues (although only
            # one is actively used, the other ones get flushed and discarded).
            # The following `if' discards stats for "recovered" replication
            # queues, because there is no sensible tag we could use in TSDB to
            # tell the queues apart.  A normal queue's name looks like
            #   "ReplicationSource for <N>"
            # whereas a recovered queue of a dead RegionServer looks like
            #   "ReplicationSource for <N>-<hostname>%2C<port>%2C<timestamp>"
            # so we discriminate those entries by looking for a dash.
            if jmx_service == "Replication":
                attr_name = mbean_properties.get("name", "")
                if "ReplicationSource" in attr_name and "-" in attr_name:
                    continue

            raw_service = jmx_service  # Pre-rename value, for diagnostics.
            jmx_service = JMX_SERVICE_RENAMING.get(jmx_service, jmx_service)
            jmx_service, repl_count = re.subn("[^a-zA-Z0-9]+", ".",
                                              jmx_service)
            if repl_count:
                # Bug fix: this used to print mbean_properties["service"],
                # which raises KeyError for java.lang beans (no "service" key
                # -- their "type" key was pop()'ed above).  Report the
                # pre-rename service name instead.
                sys.stderr.write("Warning: found malformed"
                                 " jmx_service=%r on line=%r\n"
                                 % (raw_service, line))
            metric = jmx_service.lower() + "." + metric

            sys.stdout.write("hbase.%s %d %s%s\n"
                             % (metric, timestamp, value, tags))
            sys.stdout.flush()
    finally:
        kill(jmx)
        time.sleep(300)
    return 0  # Ask the tcollector to re-spawn us.


if __name__ == "__main__":
    sys.exit(main(sys.argv))
def main():
    """ifstat main loop: prints /proc/net/dev counters for ethN every 15s."""
    interval = 15

    # We just care about ethN interfaces.  We specifically want to avoid
    # bond interfaces, because interface stats are still kept on the child
    # interfaces when you bond.  By skipping bond we avoid double counting.
    # Compiled once here instead of re-matching the pattern string on
    # every line of every iteration.
    ethn_re = re.compile(r"\s+(eth\d+):(.*)")

    f_netdev = open("/proc/net/dev", "r")

    while True:
        f_netdev.seek(0)  # Rewind the proc file instead of re-opening it.
        ts = int(time.time())
        for line in f_netdev:
            m = ethn_re.match(line)
            if not m:
                continue
            stats = m.group(2).split(None)
            # Fields 0-7 are the receive side, 8-15 the transmit side.
            for i in range(8):
                if FIELDS[i]:
                    print ("proc.net.%s %d %s iface=%s direction=in"
                           % (FIELDS[i], ts, stats[i], m.group(1)))
                    print ("proc.net.%s %d %s iface=%s direction=out"
                           % (FIELDS[i], ts, stats[i+8], m.group(1)))

        sys.stdout.flush()
        time.sleep(interval)

if __name__ == "__main__":
    main()
22 | # The fields (from iostats.txt) are mainly rate counters 23 | # (either number of operations or number of milliseconds doing a 24 | # particular operation), so let's just let TSD do the rate 25 | # calculation for us. 26 | # 27 | # /proc/diskstats has 11 stats for a given device 28 | # these are all rate counters except ios_in_progress 29 | # .read_requests Number of reads completed 30 | # .read_merged Number of reads merged 31 | # .read_sectors Number of sectors read 32 | # .msec_read Time in msec spent reading 33 | # .write_requests Number of writes completed 34 | # .write_merged Number of writes merged 35 | # .write_sectors Number of sectors written 36 | # .msec_write Time in msec spent writing 37 | # .ios_in_progress Number of I/O operations in progress 38 | # .msec_total Time in msec doing I/O 39 | # .msec_weighted_total Weighted time doing I/O (multiplied by ios_in_progress) 40 | 41 | # in 2.6.25 and later, by-partition stats are reported same as disks 42 | # in 2.6 before 2.6.25, partitions have 4 stats per partition 43 | # .read_issued 44 | # .read_sectors 45 | # .write_issued 46 | # .write_sectors 47 | # For partitions, these *_issued are counters collected before 48 | # requests are merged, so aren't the same as *_requests (which is 49 | # post-merge, which more closely represents the actual 50 | # number of disk transactions). 51 | 52 | # Given that diskstats provides both per-disk and per-partition data, 53 | # for TSDB purposes we want to put them under different metrics (versus 54 | # the same metric and different tags). Otherwise, if you look at a 55 | # given metric, the data for a given box will be double-counted, since 56 | # a given operation will increment both the disk series and the 57 | # partition series. To fix this, we output by-disk data to iostat.disk.* 58 | # and by-partition data to iostat.part.*.

# TODO: Add additional tags to map partitions/disks back to mount
# points/swap so you can (for example) plot just swap partition
# activity or /var/lib/mysql partition activity no matter which
# disk/partition this happens to be.  This is nontrivial, especially
# when you have to handle mapping of /dev/mapper to dm-N, pulling out
# swap partitions from /proc/swaps, etc.

# TODO: add some generated stats from iostat -x like svctm, await,
# %util.  These need to pull in cpu idle counters from /proc.


import os
import socket
import sys
import time

COLLECTION_INTERVAL = 60  # seconds

# Docs come from the Linux kernel's Documentation/iostats.txt
FIELDS_DISK = (
    "read_requests",        # Total number of reads completed successfully.
    "read_merged",          # Adjacent read requests merged in a single req.
    "read_sectors",         # Total number of sectors read successfully.
    "msec_read",            # Total number of ms spent by all reads.
    "write_requests",       # total number of writes completed successfully.
    "write_merged",         # Adjacent write requests merged in a single req.
    "write_sectors",        # total number of sectors written successfully.
    "msec_write",           # Total number of ms spent by all writes.
    "ios_in_progress",      # Number of actual I/O requests currently in flight.
    "msec_total",           # Amount of time during which ios_in_progress >= 1.
    "msec_weighted_total",  # Measure of recent I/O completion time and backlog.
)

FIELDS_PART = ("read_issued",
               "read_sectors",
               "write_issued",
               "write_sectors",
              )


def main():
    """iostats main loop: prints per-disk / per-partition counters forever."""
    f_diskstats = open("/proc/diskstats", "r")

    while True:
        f_diskstats.seek(0)  # Rewind the proc file instead of re-opening it.
        ts = int(time.time())
        for line in f_diskstats:
            # maj, min, devicename, [list of stats, see above]
            values = line.split(None)
            # shortcut the deduper and just skip disks that
            # haven't done a single read.  This eliminates a bunch
            # of loopback, ramdisk, and cdrom devices but still
            # lets us report on the rare case that we actually use
            # a ramdisk.
            if values[3] == "0":
                continue

            # Heuristic: a minor number that is a multiple of 16 (with a
            # major > 1) is reported as a whole disk, everything else as
            # a partition.
            if int(values[1]) % 16 == 0 and int(values[0]) > 1:
                metric = "iostat.disk."
            else:
                metric = "iostat.part."

            # Sometimes there can be a slash in the device name, see bug #8.
            # TODO(tsuna): Remove the substitution once TSD allows `/' in tags.
            device = values[2].replace("/", "_")
            if len(values) == 14:
                # full stats line
                for i in range(11):
                    print ("%s%s %d %s dev=%s"
                           % (metric, FIELDS_DISK[i], ts, values[i+3],
                              device))
            elif len(values) == 7:
                # partial stats line (pre-2.6.25 partitions)
                for i in range(4):
                    print ("%s%s %d %s dev=%s"
                           % (metric, FIELDS_PART[i], ts, values[i+3],
                              device))
            else:
                # write() instead of `print >>': works on Python 2 and 3,
                # and avoids the stray separator space before `line'.
                sys.stderr.write("Cannot parse /proc/diskstats line: %s"
                                 % line)
                continue

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)



if __name__ == "__main__":
    main()
COLLECTION_INTERVAL = 15  # seconds
CONNECT_TIMEOUT = 2  # seconds
# How frequently we try to find new databases.
DB_REFRESH_INTERVAL = 60  # seconds
# Usual locations where to find the default socket file.
DEFAULT_SOCKFILES = set([
    "/tmp/mysql.sock",              # MySQL's own default.
    "/var/lib/mysql/mysql.sock",    # RH-type / RPM systems.
    "/var/run/mysqld/mysqld.sock",  # Debian-type systems.
])
# Directories under which to search additional socket files.
SEARCH_DIRS = [
    "/var/lib/mysql",
]

def err(msg):
    """Prints the given message on stderr."""
    sys.stderr.write(msg + "\n")

class DB(object):
    """Represents a MySQL server (as we can monitor more than 1 MySQL)."""

    def __init__(self, sockfile, dbname, db, cursor, version):
        """Constructor.

        Args:
            sockfile: Path to the socket file.
            dbname: Name of the database for that socket file.
            db: A MySQLdb connection opened to that socket file.
            cursor: A cursor acquired from that connection.
            version: What version is this MySQL running (from `SELECT VERSION()').
        """
        self.sockfile = sockfile
        self.dbname = dbname
        self.db = db
        self.cursor = cursor
        self.version = version
        self.master = None
        self.slave_bytes_executed = None
        self.relay_bytes_relayed = None

        # Parse out the major/medium version numbers; leave them at 0 when
        # the version string is something we can't make sense of.
        version = version.split(".")
        try:
            self.major = int(version[0])
            self.medium = int(version[1])
        except (ValueError, IndexError):
            self.major = self.medium = 0

    def __str__(self):
        return "DB(%r, %r, version=%r)" % (self.sockfile, self.dbname,
                                           self.version)

    def __repr__(self):
        return self.__str__()

    def isShowGlobalStatusSafe(self):
        """Returns whether or not SHOW GLOBAL STATUS is safe to run."""
        # We can't run SHOW GLOBAL STATUS on versions prior to 5.1 because it
        # locks the entire database for too long and severely impacts traffic.
        return self.major > 5 or (self.major == 5 and self.medium >= 1)

    def query(self, sql):
        """Executes the given SQL statement and returns a sequence of rows."""
        assert self.cursor, "%s already closed?" % (self,)
        try:
            self.cursor.execute(sql)
        except MySQLdb.OperationalError as e:
            if e.args[0] != 2006:  # "MySQL server has gone away"
                raise
            self._reconnect()
            # Bug fix: re-issue the query on the fresh cursor.  Previously
            # we fell through straight to fetchall() on a cursor that had
            # never executed anything.
            self.cursor.execute(sql)
        return self.cursor.fetchall()

    def close(self):
        """Closes the connection to this MySQL server."""
        if self.cursor:
            self.cursor.close()
            self.cursor = None
        if self.db:
            self.db.close()
            self.db = None

    def _reconnect(self):
        """Reconnects to this MySQL server."""
        self.close()
        self.db = mysql_connect(self.sockfile)
        self.cursor = self.db.cursor()


def mysql_connect(sockfile):
    """Connects to the MySQL server using the specified socket file."""
    user, passwd = mysqlconf.get_user_password(sockfile)
    return MySQLdb.connect(unix_socket=sockfile,
                           connect_timeout=CONNECT_TIMEOUT,
                           user=user, passwd=passwd)


def todict(db, row):
    """Transforms a row (returned by DB.query) into a dict keyed by column names.

    Args:
        db: The DB instance from which this row was obtained.
        row: A row as returned by DB.query
    """
    d = {}
    for i, field in enumerate(db.cursor.description):
        column = field[0].lower()  # Lower-case to normalize field names.
        d[column] = row[i]
    return d

def get_dbname(sockfile):
    """Returns the name of the DB based on the path to the socket file."""
    if sockfile in DEFAULT_SOCKFILES:
        return "default"
    m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile)
    if not m:
        err("error: couldn't guess the name of the DB for " + sockfile)
        return None
    return m.group(1)


def is_sockfile(path):
    """Returns whether or not the given path is a socket file."""
    try:
        s = os.stat(path)
    except OSError as e:
        if e.errno == errno.ENOENT:
            return False
        err("warning: couldn't stat(%r): %s" % (path, e.strerror))
        return None
    return s.st_mode & stat.S_IFSOCK == stat.S_IFSOCK


def find_sockfiles():
    """Returns a list of paths to socket files to monitor."""
    paths = []
    # Look for socket files.
    for dir in SEARCH_DIRS:
        if not os.path.isdir(dir):
            continue
        for name in os.listdir(dir):
            subdir = os.path.join(dir, name)
            if not os.path.isdir(subdir):
                continue
            for subname in os.listdir(subdir):
                path = os.path.join(subdir, subname)
                if is_sockfile(path):
                    paths.append(path)
                    break  # We only expect 1 socket file per DB, so get out.
    # Try the default locations.
    for sockfile in DEFAULT_SOCKFILES:
        if not is_sockfile(sockfile):
            continue
        paths.append(sockfile)
    return paths
def find_databases(dbs=None):
    """Returns a map of dbname (string) to DB instances to monitor.

    Args:
        dbs: A map of dbname (string) to DB instances already monitored.
            This map will be modified in place if it's not None.
    """
    sockfiles = find_sockfiles()
    if dbs is None:
        dbs = {}
    for sockfile in sockfiles:
        dbname = get_dbname(sockfile)
        if dbname in dbs:
            continue
        if not dbname:
            continue
        try:
            db = mysql_connect(sockfile)
            cursor = db.cursor()
            cursor.execute("SELECT VERSION()")
        except (EnvironmentError, EOFError, RuntimeError, socket.error,
                MySQLdb.MySQLError) as e:
            err("Couldn't connect to %s: %s" % (sockfile, e))
            continue
        version = cursor.fetchone()[0]
        dbs[dbname] = DB(sockfile, dbname, db, cursor, version)
    return dbs


def now():
    """Returns the current time as an integer Unix timestamp."""
    return int(time.time())


def isyes(s):
    """Returns 1 if the string is "yes" (case-insensitive), 0 otherwise."""
    if s.lower() == "yes":
        return 1
    return 0


def collectInnodbStatus(db):
    """Collects and prints InnoDB stats about the given DB instance."""
    ts = now()
    def printmetric(metric, value, tags=""):
        # print() as a function call works on Python 2 and 3 alike.
        print("mysql.%s %d %s schema=%s%s"
              % (metric, ts, value, db.dbname, tags))

    innodb_status = db.query("SHOW ENGINE INNODB STATUS")[0][2]
    m = re.search("^(\d{6}\s+\d{1,2}:\d\d:\d\d) INNODB MONITOR OUTPUT$",
                  innodb_status, re.M)
    if m:  # If we have it, try to use InnoDB's own timestamp.
        ts = int(time.mktime(time.strptime(m.group(1), "%y%m%d %H:%M:%S")))

    line = None
    def match(regexp):
        # NOTE: closes over `line', which is re-bound by the loop below.
        return re.match(regexp, line)

    for line in innodb_status.split("\n"):
        # SEMAPHORES
        m = match("OS WAIT ARRAY INFO: reservation count (\d+), signal count (\d+)")
        if m:
            printmetric("innodb.oswait_array.reservation_count", m.group(1))
            printmetric("innodb.oswait_array.signal_count", m.group(2))
            continue
        m = match("Mutex spin waits (\d+), rounds (\d+), OS waits (\d+)")
        if m:
            printmetric("innodb.locks.spin_waits", m.group(1), " type=mutex")
            printmetric("innodb.locks.rounds", m.group(2), " type=mutex")
            printmetric("innodb.locks.os_waits", m.group(3), " type=mutex")
            continue
        m = match("RW-shared spins (\d+), OS waits (\d+);"
                  " RW-excl spins (\d+), OS waits (\d+)")
        if m:
            printmetric("innodb.locks.spin_waits", m.group(1), " type=rw-shared")
            printmetric("innodb.locks.os_waits", m.group(2), " type=rw-shared")
            printmetric("innodb.locks.spin_waits", m.group(3), " type=rw-exclusive")
            printmetric("innodb.locks.os_waits", m.group(4), " type=rw-exclusive")
            continue
        # INSERT BUFFER AND ADAPTIVE HASH INDEX
        # TODO(tsuna): According to the code in ibuf0ibuf.c, this line and
        # the following one can appear multiple times.  I've never seen this.
        # If that happens, we need to aggregate the values here instead of
        # printing them directly.
        m = match("Ibuf: size (\d+), free list len (\d+), seg size (\d+),")
        if m:
            printmetric("innodb.ibuf.size", m.group(1))
            printmetric("innodb.ibuf.free_list_len", m.group(2))
            printmetric("innodb.ibuf.seg_size", m.group(3))
            continue
        m = match("(\d+) inserts, (\d+) merged recs, (\d+) merges")
        if m:
            printmetric("innodb.ibuf.inserts", m.group(1))
            printmetric("innodb.ibuf.merged_recs", m.group(2))
            printmetric("innodb.ibuf.merges", m.group(3))
            continue
        # ROW OPERATIONS
        m = match("\d+ queries inside InnoDB, (\d+) queries in queue")
        if m:
            printmetric("innodb.queries_queued", m.group(1))
            continue
        m = match("(\d+) read views open inside InnoDB")
        if m:
            printmetric("innodb.opened_read_views", m.group(1))
            continue
        # TRANSACTION
        m = match("History list length (\d+)")
        if m:
            printmetric("innodb.history_list_length", m.group(1))
            continue


def collect(db):
    """Collects and prints stats about the given DB instance."""

    ts = now()
    def printmetric(metric, value, tags=""):
        print("mysql.%s %d %s schema=%s%s"
              % (metric, ts, value, db.dbname, tags))

    has_innodb = False
    if db.isShowGlobalStatusSafe():
        for metric, value in db.query("SHOW GLOBAL STATUS"):
            try:
                if "." in value:
                    value = float(value)
                else:
                    value = int(value)
            except ValueError:
                continue
            metric = metric.lower()
            has_innodb = has_innodb or metric.startswith("innodb")
            printmetric(metric, value)

    if has_innodb:
        collectInnodbStatus(db)

    if has_innodb and False:  # Disabled because it's too expensive for InnoDB.
        waits = {}  # maps a mutex name to the number of waits
        ts = now()
        for engine, mutex, status in db.query("SHOW ENGINE INNODB MUTEX"):
            if not status.startswith("os_waits"):
                continue
            m = re.search("&(\w+)(?:->(\w+))?$", mutex)
            if not m:
                continue
            mutex, kind = m.groups()
            if kind:
                mutex += "." + kind
            wait_count = int(status.split("=", 1)[1])
            waits[mutex] = waits.get(mutex, 0) + wait_count
        for mutex, wait_count in waits.items():
            printmetric("innodb.locks", wait_count, " mutex=" + mutex)

    ts = now()

    mysql_slave_status = db.query("SHOW SLAVE STATUS")
    if mysql_slave_status:
        slave_status = todict(db, mysql_slave_status[0])
        master_host = slave_status["master_host"]
    else:
        master_host = None

    if master_host and master_host != "None":
        sbm = slave_status.get("seconds_behind_master")
        if isinstance(sbm, (int, long)):  # `long' is fine: this file is py2.
            printmetric("slave.seconds_behind_master", sbm)
        printmetric("slave.bytes_executed", slave_status["exec_master_log_pos"])
        printmetric("slave.bytes_relayed", slave_status["read_master_log_pos"])
        printmetric("slave.thread_io_running",
                    isyes(slave_status["slave_io_running"]))
        printmetric("slave.thread_sql_running",
                    isyes(slave_status["slave_sql_running"]))

    states = {}  # maps a command type to number of connections running it
    for row in db.query("SHOW PROCESSLIST"):
        # Renamed locals so we no longer shadow the `time' module and the
        # `id' builtin.  NOTE: we deliberately tally the Command column
        # (`cmd'), not the State column, even though the metric is called
        # "connection_states" -- same behavior as before.
        id_, user, host, db_, cmd, time_, state = row[:7]
        states[cmd] = states.get(cmd, 0) + 1
    for state, count in states.items():
        state = state.lower().replace(" ", "_")
        printmetric("connection_states", count, " state=%s" % state)
def main(args):
    """Collects and dumps stats from a MySQL server."""
    if not find_sockfiles():  # Nothing to monitor.
        return 13              # Ask tcollector to not respawn us.
    if MySQLdb is None:
        err("error: Python module `MySQLdb' is missing")
        return 1

    last_db_refresh = now()
    dbs = find_databases()
    while True:
        ts = now()
        # Periodically re-scan for databases that appeared since startup.
        if ts - last_db_refresh >= DB_REFRESH_INTERVAL:
            find_databases(dbs)
            last_db_refresh = ts

        errs = []
        for dbname, db in dbs.items():
            try:
                collect(db)
            except (EnvironmentError, EOFError, RuntimeError, socket.error,
                    MySQLdb.MySQLError) as e:
                if isinstance(e, IOError) and e.errno == errno.EPIPE:
                    # Exit on a broken pipe.  There's no point in continuing
                    # because no one will read our stdout anyway.
                    return 2
                err("error: failed to collect data from %s: %s" % (db, e))
                errs.append(dbname)

        for dbname in errs:
            # Bug fix: close the connection before dropping the DB object,
            # otherwise the MySQL connection was simply leaked.
            try:
                dbs[dbname].close()
            except Exception:
                pass  # Best effort: the connection is already broken.
            del dbs[dbname]

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)


if __name__ == "__main__":
    sys.stdin.close()
    sys.exit(main(sys.argv))
# Note: I spent many hours reading the Linux kernel's source code to infer the
# exact meaning of some of the obscure but useful metrics it exposes.  The
# description of the metrics are correct to the best of my knowledge, but it's
# not always easy to make sense of the Linux kernel's code.  Please report any
# inaccuracy you find.  -- tsuna.
"""Socket allocation and network statistics for TSDB.

Metrics from /proc/net/sockstat:
  - net.sockstat.num_sockets: Number of sockets allocated (only TCP).
  - net.sockstat.num_timewait: Number of TCP sockets currently in
    TIME_WAIT state.
  - net.sockstat.sockets_inuse: Number of sockets in use (TCP/UDP/raw).
  - net.sockstat.num_orphans: Number of orphan TCP sockets (not attached
    to any file descriptor).
  - net.sockstat.memory: Memory allocated for this socket type (in bytes).
  - net.sockstat.ipfragqueues: Number of IP flows for which there are
    currently fragments queued for reassembly.

Metrics from /proc/net/netstat (`netstat -s' command):
  - net.stat.tcp.abort: Number of connections that the kernel had to abort.
    type=memory is especially bad, the kernel had to drop a connection due to
    having too many orphaned sockets.  Other types are normal (e.g. timeout).
  - net.stat.tcp.abort.failed: Number of times the kernel failed to abort a
    connection because it didn't even have enough memory to reset it (bad).
  - net.stat.tcp.congestion.recovery: Number of times the kernel detected
    spurious retransmits and was able to recover part or all of the CWND.
  - net.stat.tcp.delayedack: Number of delayed ACKs sent of different types.
  - net.stat.tcp.failed_accept: Number of times a connection had to be dropped
    after the 3WHS.  reason=full_acceptq indicates that the application isn't
    accepting connections fast enough.  You should see SYN cookies too.
  - net.stat.tcp.invalid_sack: Number of invalid SACKs we saw of diff types.
    (requires Linux v2.6.24-rc1 or newer)
  - net.stat.tcp.memory.pressure: Number of times a socket entered the
    "memory pressure" mode (not great).
  - net.stat.tcp.memory.prune: Number of times a socket had to discard
    received data due to low memory conditions (bad).
  - net.stat.tcp.packetloss.recovery: Number of times we recovered from packet
    loss by type of recovery (e.g. fast retransmit vs SACK).
  - net.stat.tcp.receive.queue.full: Number of times a received packet had to
    be dropped because the socket's receive queue was full.
    (requires Linux v2.6.34-rc2 or newer)
  - net.stat.tcp.reording (sic): Number of times we detected re-ordering
    and how.  ("reording" is kept as-is for metric-name compatibility.)
  - net.stat.tcp.syncookies: SYN cookies (both sent & received).
"""

import os
import pwd
import re
import resource
import sys
import time

# If we're running as root and this user exists, we'll drop privileges.
USER = "nobody"


def drop_privileges():
    """Drops privileges if running as root."""
    try:
        ent = pwd.getpwnam(USER)
    except KeyError:
        return  # Target user doesn't exist: nothing we can drop to.

    if os.getuid() != 0:
        return  # Not root: nothing to drop.

    os.setgid(ent.pw_gid)
    os.setuid(ent.pw_uid)


def main():
    """Main loop: parse /proc/net/{sockstat,netstat} and print TSD metrics.

    Returns 13 (tcollector's "don't respawn me" code) if the /proc files
    can't be opened or parsed; otherwise loops forever.
    """
    drop_privileges()
    sys.stdin.close()

    interval = 15
    # sockstat reports TCP/UDP memory in pages; convert to bytes below.
    page_size = resource.getpagesize()

    try:
        sockstat = open("/proc/net/sockstat")
        netstat = open("/proc/net/netstat")
    except IOError as e:
        # BUG FIX: the old message always blamed sockstat even when netstat
        # was the file that failed to open; `e' carries the actual filename.
        sys.stderr.write("Failed to open input file: %s\n" % (e,))
        return 13  # Ask tcollector to not re-start us.

    # Note: up until v2.6.37-rc2 most of the values were 32 bits.
    # The first value is pretty useless since it accounts for some
    # socket types but not others.  So we don't report it because it's
    # more confusing than anything else and it's not well documented
    # what type of sockets are or aren't included in this count.
    # BUG FIX: the named groups had been mangled down to `(?P\d+)' (invalid
    # regex); the names are reconstructed from the m.group(...) calls below.
    regexp = re.compile(r"sockets: used \d+\n"
                        r"TCP: inuse (?P<tcp_inuse>\d+) orphan (?P<orphans>\d+)"
                        r" tw (?P<tw_count>\d+) alloc (?P<tcp_sockets>\d+)"
                        r" mem (?P<tcp_pages>\d+)\n"
                        r"UDP: inuse (?P<udp_inuse>\d+)"
                        # UDP memory accounting was added in v2.6.25-rc1
                        r"(?: mem (?P<udp_pages>\d+))?\n"
                        # UDP-Lite (RFC 3828) was added in v2.6.20-rc2
                        r"(?:UDPLITE: inuse (?P<udplite_inuse>\d+)\n)?"
                        r"RAW: inuse (?P<raw_inuse>\d+)\n"
                        r"FRAG: inuse (?P<ip_frag_nqueues>\d+)"
                        r" memory (?P<ip_frag_mem>\d+)\n")

    def print_sockstat(metric, value, tags=""):  # Note: tags must start with ' '
        if value is not None:
            print("net.sockstat.%s %d %s%s" % (metric, ts, value, tags))

    # If a line in /proc/net/netstat doesn't start with a word in that dict,
    # we'll ignore it.  We use the value to build the metric name.
    known_netstatstypes = {
        "TcpExt:": "tcp",
        "IpExt:": "ip",  # We don't collect anything from here for now.
    }

    # Any stat in /proc/net/netstat that doesn't appear in this dict will be
    # ignored.  If we find a match, we'll use the (metricname, tags).
    known_netstats = {
        # An application wasn't able to accept a connection fast enough, so
        # the kernel couldn't store an entry in the queue for this connection.
        # Instead of dropping it, it sent a cookie to the client.
        "SyncookiesSent": ("syncookies", "type=sent"),
        # After sending a cookie, it came back to us and passed the check.
        "SyncookiesRecv": ("syncookies", "type=received"),
        # After sending a cookie, it came back to us but looked invalid.
        "SyncookiesFailed": ("syncookies", "type=failed"),
        # When a socket is using too much memory (rmem), the kernel will first
        # discard any out-of-order packet that has been queued (with SACK).
        "OfoPruned": ("memory.prune", "type=drop_ofo_queue"),
        # If the kernel is really really desperate and cannot give more memory
        # to this socket even after dropping the ofo queue, it will simply
        # discard the packet it received.  This is Really Bad.
        "RcvPruned": ("memory.prune", "type=drop_received"),
        # We waited for another packet to send an ACK, but didn't see any, so
        # a timer ended up sending a delayed ACK.
        "DelayedACKs": ("delayedack", "type=sent"),
        # We wanted to send a delayed ACK but failed because the socket was
        # locked.  So the timer was reset.
        "DelayedACKLocked": ("delayedack", "type=locked"),
        # We sent a delayed and duplicated ACK because the remote peer
        # retransmitted a packet, thinking that it didn't get to us.
        "DelayedACKLost": ("delayedack", "type=lost"),
        # We completed a 3WHS but couldn't put the socket on the accept queue,
        # so we had to discard the connection.
        "ListenOverflows": ("failed_accept", "reason=full_acceptq"),
        # We couldn't accept a connection because one of: we had no route to
        # the destination, we failed to allocate a socket, we failed to
        # allocate a new local port bind bucket.  Note: this counter
        # also include all the increments made to ListenOverflows...
        "ListenDrops": ("failed_accept", "reason=other"),
        # A packet was lost and we recovered after a fast retransmit.
        "TCPRenoRecovery": ("packetloss.recovery", "type=fast_retransmit"),
        # A packet was lost and we recovered by using selective
        # acknowledgements.
        "TCPSackRecovery": ("packetloss.recovery", "type=sack"),
        # We detected re-ordering using FACK (Forward ACK -- the highest
        # sequence number known to have been received by the peer when using
        # SACK -- FACK is used during congestion control).
        "TCPFACKReorder": ("reording", "detectedby=fack"),
        # We detected re-ordering using SACK.
        "TCPSACKReorder": ("reording", "detectedby=sack"),
        # We detected re-ordering using fast retransmit.
        "TCPRenoReorder": ("reording", "detectedby=fast_retransmit"),
        # We detected re-ordering using the timestamp option.
        "TCPTSReorder": ("reording", "detectedby=timestamp"),
        # We detected some erroneous retransmits and undid our CWND reduction.
        "TCPFullUndo": ("congestion.recovery", "type=full_undo"),
        # We detected some erroneous retransmits, a partial ACK arrived while
        # we were fast retransmitting, so we were able to partially undo some
        # of our CWND reduction.
        "TCPPartialUndo": ("congestion.recovery", "type=hoe_heuristic"),
        # We detected some erroneous retransmits, a D-SACK arrived and ACK'ed
        # all the retransmitted data, so we undid our CWND reduction.
        "TCPDSACKUndo": ("congestion.recovery", "type=sack"),
        # We detected some erroneous retransmits, a partial ACK arrived, so we
        # undid our CWND reduction.
        "TCPLossUndo": ("congestion.recovery", "type=ack"),
        # We received an unexpected SYN so we sent a RST to the peer.
        "TCPAbortOnSyn": ("abort", "type=unexpected_syn"),
        # We were in FIN_WAIT1 yet we received a data packet with a sequence
        # number that's beyond the last one for this connection, so we RST'ed.
        "TCPAbortOnData": ("abort", "type=data_after_fin_wait1"),
        # We received data but the user has closed the socket, so we have no
        # way of handing it to them, so we RST'ed.
        "TCPAbortOnClose": ("abort", "type=data_after_close"),
        # This is Really Bad.  It happens when there are too many orphaned
        # sockets (not attached to a FD) and the kernel has to drop a
        # connection.  Sometimes it will send a reset to the peer, sometimes
        # it won't.
        "TCPAbortOnMemory": ("abort", "type=out_of_memory"),
        # The connection timed out really hard.
        "TCPAbortOnTimeout": ("abort", "type=timeout"),
        # We killed a socket that was closed by the application and lingered
        # around for long enough.
        "TCPAbortOnLinger": ("abort", "type=linger"),
        # We tried to send a reset, probably during one of the TCPAbort*
        # situations above, but we failed e.g. because we couldn't allocate
        # enough memory (very bad).
        "TCPAbortFailed": ("abort.failed", None),
        # Number of times a socket was put in "memory pressure" due to a non
        # fatal memory allocation failure (reduces the send buffer size etc).
        "TCPMemoryPressures": ("memory.pressure", None),
        # We got a completely invalid SACK block and discarded it.
        "TCPSACKDiscard": ("invalid_sack", "type=invalid"),
        # We got a duplicate SACK while retransmitting so we discarded it.
        "TCPDSACKIgnoredOld": ("invalid_sack", "type=retransmit"),
        # We got a duplicate SACK and discarded it.
        "TCPDSACKIgnoredNoUndo": ("invalid_sack", "type=olddup"),
        # We received something but had to drop it because the socket's
        # receive queue was full.
        "TCPBacklogDrop": ("receive.queue.full", None),
    }

    def print_netstat(statstype, metric, value, tags=""):
        if tags:
            space = " "
        else:
            tags = space = ""
        print("net.stat.%s.%s %d %s%s%s" % (statstype, metric, ts, value,
                                            space, tags))

    statsdikt = {}
    while True:
        ts = int(time.time())
        sockstat.seek(0)
        netstat.seek(0)
        data = sockstat.read()
        stats = netstat.read()
        m = re.match(regexp, data)
        if not m:
            sys.stderr.write("Cannot parse sockstat: %r\n" % data)
            return 13

        # The difference between the first two values is the number of
        # sockets allocated vs the number of sockets actually in use.
        print_sockstat("num_sockets", m.group("tcp_sockets"), " type=tcp")
        print_sockstat("num_timewait", m.group("tw_count"))
        print_sockstat("sockets_inuse", m.group("tcp_inuse"), " type=tcp")
        print_sockstat("sockets_inuse", m.group("udp_inuse"), " type=udp")
        print_sockstat("sockets_inuse", m.group("udplite_inuse"),
                       " type=udplite")
        print_sockstat("sockets_inuse", m.group("raw_inuse"), " type=raw")

        print_sockstat("num_orphans", m.group("orphans"))
        print_sockstat("memory", int(m.group("tcp_pages")) * page_size,
                       " type=tcp")
        if m.group("udp_pages") is not None:
            print_sockstat("memory", int(m.group("udp_pages")) * page_size,
                           " type=udp")
        print_sockstat("memory", m.group("ip_frag_mem"), " type=ipfrag")
        print_sockstat("ipfragqueues", m.group("ip_frag_nqueues"))

        # /proc/net/netstat has a column-oriented format.  It looks
        # like this:
        #   Header: SomeMetric OtherMetric
        #   Header: 1 2
        #   OtherHeader: ThirdMetric FooBar
        #   OtherHeader: 42 51
        # We first group all the lines for each header together:
        #   {"Header:": [["SomeMetric", "OtherHeader"], ["1", "2"]],
        #    "OtherHeader:": [["ThirdMetric", "FooBar"], ["42", "51"]]}
        # Then we'll create a dict for each type:
        #   {"SomeMetric": "1", "OtherHeader": "2"}
        for line in stats.splitlines():
            line = line.split()
            if line[0] not in known_netstatstypes:
                sys.stderr.write("Unrecognized line in /proc/net/netstat:"
                                 " %r (file=%r)\n" % (line, stats))
                continue
            statstype = line.pop(0)
            statsdikt.setdefault(known_netstatstypes[statstype],
                                 []).append(line)
        for statstype, stats in statsdikt.items():
            # stats is now:
            # [["SyncookiesSent", "SyncookiesRecv", ...], ["1", "2", ....]]
            assert len(stats) == 2, repr(statsdikt)
            stats = dict(zip(*stats))
            value = stats.get("ListenDrops")
            if value is not None:  # Undo the kernel's double counting
                stats["ListenDrops"] = (int(value)
                                        - int(stats.get("ListenOverflows", 0)))
            for stat, (metric, tags) in known_netstats.items():
                value = stats.get(stat)
                if value is not None:
                    print_netstat(statstype, metric, value, tags)
            stats.clear()
        statsdikt.clear()

        sys.stdout.flush()
        time.sleep(interval)


if __name__ == "__main__":
    sys.exit(main())

# ---------------------------------------------------------------------------
# /collectors/0/procnettcp.py:
#!/usr/bin/python
# This file is part of tcollector.
# Copyright (C) 2011 StumbleUpon, Inc.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.  This program is distributed in the hope that it
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
# General Public License for more details.  You should have received a copy
# of the GNU Lesser General Public License along with this program.  If not,
# see <http://www.gnu.org/licenses/>.
26 | 27 | # Metric: proc.net.tcp 28 | 29 | # For each run, we classify each connection and generate subtotals. 30 | # TSD will automatically total these up when displaying 31 | # the graph, but you can drill down for each possible total or a 32 | # particular one. This does generate a large amount of datapoints, 33 | # as the number of points is (S*(U+1)*V) (currently ~400), where 34 | # S=number of TCP states, U=Number of users to track, and 35 | # V=number of services (collections of ports) 36 | # The deduper does dedup this down very well, as only 3 of the 10 37 | # TCP states are generally ever seen, and most servers only run one 38 | # service under one user. On a typical server this dedups down to 39 | # under 10 values per interval. 40 | 41 | # Each connection is broken down with a tag for user=username (see 42 | # "users" list below) or put under "other" if not in the list. 43 | # Expand this for any users you care about. 44 | # It is also broken down for each state (state=). 45 | # It is also broken down into services (collections of ports) 46 | 47 | # Note that once a connection is closed, Linux seems to forget who 48 | # opened/handled the connection. For connections in time_wait, for 49 | # example, they will always show user=root. 

import os
import sys
import time
import socket
import pwd


USERS = ("root", "www-data", "mysql")

# Note if a service runs on multiple ports and you
# want to collectively map them up to a single service,
# just give them the same name below

PORTS = {
    80: "http",
    443: "https",
    3306: "mysql",
    3564: "mysql",
    9000: "namenode",
    9090: "thriftserver",
    50020: "datanode",
    60020: "hregionserver",
    }
# Contiguous port ranges are filled in programmatically instead of one
# hand-written entry per port (same resulting dict as before).
PORTS.update((port, "http-varnish") for port in range(3001, 3015))
PORTS.update((port, "memcache") for port in range(11211, 11227))

# .values() (not the Python-2-only .itervalues()) yields the same set and
# keeps this module importable under both Python 2 and 3.
SERVICES = tuple(set(PORTS.values()))

TCPSTATES = {
    "01": "established",
    "02": "syn_sent",
    "03": "syn_recv",
    "04": "fin_wait1",
    "05": "fin_wait2",
    "06": "time_wait",
    "07": "close",
    "08": "close_wait",
    "09": "last_ack",
    "0A": "listen",
    "0B": "closing",
    }

# If we're running as root and this user exists, we'll drop privileges.
USER = "nobody"


def drop_privileges():
    """Drops privileges if running as root."""
    try:
        ent = pwd.getpwnam(USER)
    except KeyError:
        return  # Target user doesn't exist.

    if os.getuid() != 0:
        return  # Not root: nothing to drop.

    os.setgid(ent.pw_gid)
    os.setuid(ent.pw_uid)


def is_public_ip(ipstr):
    """
    Take a /proc/net/tcp encoded src or dest string
    Return True if it is coming from public IP space
    (i.e. is not RFC1918, loopback, or broadcast).
    This string is the hex ip:port of the connection.
    (ip is reversed)
    """
    addr = ipstr.split(":")[0]
    addr = int(addr, 16)
    byte1 = addr & 0xFF          # first octet (address is stored reversed)
    byte2 = (addr >> 8) & 0xFF   # second octet
    if byte1 in (10, 0, 127):
        return False
    # BUG FIX: RFC 1918 defines 172.16.0.0/12, i.e. second octet 16..31
    # inclusive.  The old test (`byte2 > 16') wrongly treated 172.16.x.x as
    # public and 172.32.x.x-172.255.x.x as private.
    if byte1 == 172 and 16 <= byte2 <= 31:
        return False
    if byte1 == 192 and byte2 == 168:
        return False
    return True


def main(unused_args):
    """procnettcp main loop: classify every TCP socket and emit subtotals."""
    drop_privileges()
    try:  # On some Linux kernel versions, with lots of connections
        os.nice(19)  # this collector can be very CPU intensive.  So be nicer.
    except OSError as e:
        sys.stderr.write("warning: failed to self-renice: %s\n" % e)

    interval = 60

    # resolve the list of users to match on into UIDs
    uids = {}
    for user in USERS:
        try:
            uids[str(pwd.getpwnam(user)[2])] = user
        except KeyError:
            continue

    try:
        tcp = open("/proc/net/tcp")
        # if IPv6 is enabled, even IPv4 connections will also
        # appear in tcp6.  It has the same format, apart from the
        # address size
        try:
            tcp6 = open("/proc/net/tcp6")
        except IOError as e:
            if e.errno == 2:  # No such file => IPv6 is disabled.
                tcp6 = None
            else:
                raise
    except IOError as e:
        sys.stderr.write("Failed to open input file: %s\n" % (e,))
        return 13  # Ask tcollector to not re-start us immediately.

    while True:
        counter = {}

        for procfile in (tcp, tcp6):
            if procfile is None:
                continue
            procfile.seek(0)
            ts = int(time.time())
            for line in procfile:
                try:
                    # pylint: disable=W0612
                    (num, src, dst, state, queue, when, retrans,
                     uid, timeout, inode) = line.split(None, 9)
                except ValueError:  # Malformed line
                    continue

                if num == "sl":  # header
                    continue

                srcport = int(src.split(":")[1], 16)
                dstport = int(dst.split(":")[1], 16)
                service = PORTS.get(srcport, "other")
                service = PORTS.get(dstport, service)

                if is_public_ip(dst) or is_public_ip(src):
                    endpoint = "external"
                else:
                    endpoint = "internal"

                user = uids.get(uid, "other")

                key = ("state=" + TCPSTATES[state] + " endpoint=" + endpoint
                       + " service=" + service + " user=" + user)
                counter[key] = counter.get(key, 0) + 1

        # output the counters, including zeroes, so that counters that had
        # data once always decay back to zero (see module comments).
        for state in TCPSTATES:
            for service in SERVICES + ("other",):
                for user in USERS + ("other",):
                    for endpoint in ("internal", "external"):
                        key = ("state=%s endpoint=%s service=%s user=%s"
                               % (TCPSTATES[state], endpoint, service, user))
                        sys.stdout.write("proc.net.tcp %s %s %s\n"
                                         % (ts, counter.get(key, "0"), key))

        sys.stdout.flush()
        time.sleep(interval)


if __name__ == "__main__":
    sys.exit(main(sys.argv))

# ---------------------------------------------------------------------------
# /collectors/0/procstats.py:
#!/usr/bin/python
# This file is part of tcollector.
# Copyright (C) 2010 StumbleUpon, Inc.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at your
# option) any later version.  This program is distributed in the hope that it
# will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
# of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
# General Public License for more details.  You should have received a copy
# of the GNU Lesser General Public License along with this program.  If not,
# see <http://www.gnu.org/licenses/>.
#
"""import various /proc stats from /proc into TSDB"""

import os
import sys
import time
import socket
import re

COLLECTION_INTERVAL = 15  # seconds
NUMADIR = "/sys/devices/system/node"


def open_sysfs_numa_stats():
    """Returns a possibly empty list of opened per-NUMA-node numastat files."""
    try:
        nodes = os.listdir(NUMADIR)
    except OSError as e:
        if e.errno == 2:  # No such file or directory
            return []     # We don't have NUMA stats.
        raise

    nodes = [node for node in nodes if node.startswith("node")]
    numastats = []
    for node in nodes:
        try:
            numastats.append(open(os.path.join(NUMADIR, node, "numastat")))
        # BUG FIX: open() raises IOError (not OSError) on Python 2, so the
        # original `except OSError' never caught a missing numastat file.
        except IOError as e:
            if e.errno == 2:  # No such file or directory
                continue
            raise
    return numastats


def print_numa_stats(numafiles):
    """From a list of opened files, extracts and prints NUMA stats."""
    for numafile in numafiles:
        numafile.seek(0)
        # File path looks like .../node/node<N>/numastat; extract <N>.
        node_id = int(numafile.name[numafile.name.find("/node/node") + 10:-9])
        ts = int(time.time())
        stats = dict(line.split() for line in numafile.read().splitlines())
        for stat, tag in (# hit: process wanted memory from this node and got it
                          ("numa_hit", "hit"),
                          # miss: process wanted another node and got it from
                          # this one instead.
                          ("numa_miss", "miss")):
            print("sys.numa.zoneallocs %d %s node=%d type=%s"
                  % (ts, stats[stat], node_id, tag))
        # Count this one as a separate metric because we can't sum up hit +
        # miss + foreign, this would result in double-counting of all misses.
        # See `zone_statistics' in the code of the kernel.
        # foreign: process wanted memory from this node but got it from
        # another node.  So maybe this node is out of free pages.
        print("sys.numa.foreign_allocs %d %s node=%d"
              % (ts, stats["numa_foreign"], node_id))
        # When is memory allocated to a node that's local or remote to where
        # the process is running.
        for stat, tag in (("local_node", "local"),
                          ("other_node", "remote")):
            print("sys.numa.allocation %d %s node=%d type=%s"
                  % (ts, stats[stat], node_id, tag))
        # Pages successfully allocated with the interleave policy.
        print("sys.numa.interleave %d %s node=%d type=hit"
              % (ts, stats["interleave_hit"], node_id))


def main():
    """procstats main loop: dump selected /proc stats every 15 seconds."""

    f_uptime = open("/proc/uptime", "r")
    f_meminfo = open("/proc/meminfo", "r")
    f_vmstat = open("/proc/vmstat", "r")
    f_stat = open("/proc/stat", "r")
    f_loadavg = open("/proc/loadavg", "r")
    f_entropy_avail = open("/proc/sys/kernel/random/entropy_avail", "r")
    numastats = open_sysfs_numa_stats()

    while True:
        # proc.uptime
        f_uptime.seek(0)
        ts = int(time.time())
        for line in f_uptime:
            m = re.match(r"(\S+)\s+(\S+)", line)
            if m:
                print("proc.uptime.total %d %s" % (ts, m.group(1)))
                print("proc.uptime.now %d %s" % (ts, m.group(2)))

        # proc.meminfo
        f_meminfo.seek(0)
        ts = int(time.time())
        for line in f_meminfo:
            m = re.match(r"(\w+):\s+(\d+)", line)
            if m:
                print("proc.meminfo.%s %d %s"
                      % (m.group(1).lower(), ts, m.group(2)))

        # proc.vmstat
        f_vmstat.seek(0)
        ts = int(time.time())
        for line in f_vmstat:
            m = re.match(r"(\w+)\s+(\d+)", line)
            if not m:
                continue
            if m.group(1) in ("pgpgin", "pgpgout", "pswpin",
                              "pswpout", "pgfault", "pgmajfault"):
                print("proc.vmstat.%s %d %s" % (m.group(1), ts, m.group(2)))

        # proc.stat
        f_stat.seek(0)
        ts = int(time.time())
        for line in f_stat:
            m = re.match(r"(\w+)\s+(.*)", line)
            if not m:
                continue
            if m.group(1) == "cpu":
                fields = m.group(2).split()
                print("proc.stat.cpu %d %s type=user" % (ts, fields[0]))
                print("proc.stat.cpu %d %s type=nice" % (ts, fields[1]))
                print("proc.stat.cpu %d %s type=system" % (ts, fields[2]))
                print("proc.stat.cpu %d %s type=idle" % (ts, fields[3]))
                print("proc.stat.cpu %d %s type=iowait" % (ts, fields[4]))
                print("proc.stat.cpu %d %s type=irq" % (ts, fields[5]))
                print("proc.stat.cpu %d %s type=softirq" % (ts, fields[6]))
                # really old kernels don't have this field
                if len(fields) > 7:
                    print("proc.stat.cpu %d %s type=guest"
                          % (ts, fields[7]))
                    # old kernels don't have this field
                    if len(fields) > 8:
                        print("proc.stat.cpu %d %s type=guest_nice"
                              % (ts, fields[8]))
            elif m.group(1) == "intr":
                print("proc.stat.intr %d %s"
                      % (ts, m.group(2).split()[0]))
            elif m.group(1) == "ctxt":
                print("proc.stat.ctxt %d %s" % (ts, m.group(2)))
            elif m.group(1) == "processes":
                print("proc.stat.processes %d %s" % (ts, m.group(2)))
            elif m.group(1) == "procs_blocked":
                print("proc.stat.procs_blocked %d %s" % (ts, m.group(2)))

        f_loadavg.seek(0)
        ts = int(time.time())
        for line in f_loadavg:
            m = re.match(r"(\S+)\s+(\S+)\s+(\S+)\s+(\d+)/(\d+)\s+", line)
            if not m:
                continue
            print("proc.loadavg.1min %d %s" % (ts, m.group(1)))
            print("proc.loadavg.5min %d %s" % (ts, m.group(2)))
            print("proc.loadavg.15min %d %s" % (ts, m.group(3)))
            print("proc.loadavg.runnable %d %s" % (ts, m.group(4)))
            print("proc.loadavg.total_threads %d %s" % (ts, m.group(5)))

        f_entropy_avail.seek(0)
        ts = int(time.time())
        for line in f_entropy_avail:
            print("proc.kernel.entropy_avail %d %s" % (ts, line.strip()))

        print_numa_stats(numastats)

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# /collectors/0/redis-stats.py:
#!/usr/bin/python
#
# Copyright 2011 by Bump Technologies, Inc.
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | # 15 | # Written by Mark Smith . 16 | # 17 | 18 | """Statistics from a Redis instance. 19 | 20 | Note: this collector parses your Redis configuration files to determine what cluster 21 | this instance is part of. If you want the cluster tag to be accurate, please edit 22 | your Redis configuration file and add a comment like this somewhere in the file: 23 | 24 | # tcollector.cluster = main 25 | 26 | You can name the cluster anything that matches the regex [a-z0-9-_]+. 
27 | 28 | This collector outputs the following metrics: 29 | 30 | - redis.bgrewriteaof_in_progress 31 | - redis.bgsave_in_progress 32 | - redis.blocked_clients 33 | - redis.changes_since_last_save 34 | - redis.client_biggest_input_buf 35 | - redis.client_longest_output_list 36 | - redis.connected_clients 37 | - redis.connected_slaves 38 | - redis.expired_keys 39 | - redis.evicted_keys 40 | - redis.hash_max_zipmap_entries 41 | - redis.hash_max_zipmap_value 42 | - redis.keyspace_hits 43 | - redis.keyspace_misses 44 | - redis.mem_fragmentation_ratio 45 | - redis.pubsub_channels 46 | - redis.pubsub_patterns 47 | - redis.total_commands_processed 48 | - redis.total_connections_received 49 | - redis.uptime_in_seconds 50 | - redis.used_cpu_sys 51 | - redis.used_cpu_user 52 | - redis.used_memory 53 | - redis.used_memory_rss 54 | 55 | For more information on these values, see this (not very useful) documentation: 56 | 57 | http://redis.io/commands/info 58 | """ 59 | 60 | import os 61 | import pwd 62 | import re 63 | import subprocess 64 | import sys 65 | import time 66 | 67 | try: 68 | import redis 69 | has_redis = True 70 | except ImportError: 71 | has_redis = False 72 | 73 | # If we are root, drop privileges to this user, if necessary. NOTE: if this is 74 | # not root, this MUST be the user that you run redis-server under. If not, we 75 | # will not be able to find your Redis instances. 76 | USER = "root" 77 | 78 | # Every SCAN_INTERVAL seconds, we look for new redis instances. Prevents the 79 | # situation where you put up a new instance and we never notice. 
80 | SCAN_INTERVAL = 300 81 | 82 | # these are the things in the info struct that we care about 83 | KEYS = [ 84 | 'pubsub_channels', 'bgrewriteaof_in_progress', 'connected_slaves', 'connected_clients', 'keyspace_misses', 85 | 'used_memory', 'total_commands_processed', 'used_memory_rss', 'total_connections_received', 'pubsub_patterns', 86 | 'used_cpu_sys', 'blocked_clients', 'used_cpu_user', 'expired_keys', 'bgsave_in_progress', 'hash_max_zipmap_entries', 87 | 'hash_max_zipmap_value', 'client_longest_output_list', 'client_biggest_input_buf', 'uptime_in_seconds', 88 | 'changes_since_last_save', 'mem_fragmentation_ratio', 'keyspace_hits', 'evicted_keys' 89 | ]; 90 | 91 | def drop_privileges(): 92 | """Drops privileges if running as root.""" 93 | 94 | if USER == 'root': 95 | return 96 | 97 | try: 98 | ent = pwd.getpwnam(USER) 99 | except KeyError: 100 | return 101 | 102 | if os.getuid() != 0: 103 | return 104 | os.setgid(ent.pw_gid) 105 | os.setuid(ent.pw_uid) 106 | 107 | 108 | def main(): 109 | """Main loop""" 110 | 111 | drop_privileges() 112 | sys.stdin.close() 113 | 114 | interval = 15 115 | 116 | # we scan for instances here to see if there are any redis servers 117 | # running on this machine... 
def main():
    """Main loop: discover local Redis instances and emit their stats forever."""
    drop_privileges()
    sys.stdin.close()

    interval = 15

    # Initial scan; bail out early when there's nothing to monitor.
    last_scan = time.time()
    instances = scan_for_instances()  # maps port -> cluster name
    if not len(instances):
        return 13  # tells tcollector not to restart us
    if not has_redis:
        sys.stderr.write("Found %d instance(s) to monitor, but the Python"
                         " Redis module isn't installed.\n" % len(instances))
        return 1

    def print_stat(metric, value, tags=""):
        # Skip metrics the server didn't report.
        if value is not None:
            print("redis.%s %d %s %s" % (metric, ts, value, tags))

    while True:
        ts = int(time.time())

        # If we haven't looked for redis instances recently, do so now, so
        # a newly started instance eventually gets picked up.
        if ts - last_scan > SCAN_INTERVAL:
            instances = scan_for_instances()
            last_scan = ts

        # Now iterate over every instance and gather statistics.
        for port in instances:
            tags = "cluster=%s port=%d" % (instances[port], port)

            # Connect to the instance and attempt to gather info.
            r = redis.Redis(host="127.0.0.1", port=port)
            info = r.info()
            for key in KEYS:
                if key in info:
                    print_stat(key, info[key], tags)

            # Get some instant latency information via PING.
            # TODO: might be nice to get 95th, 99th, etc here?
            start_time = time.time()
            r.ping()
            print_stat("latency", time.time() - start_time, tags)

        sys.stdout.flush()
        time.sleep(interval)


def scan_for_instances():
    """Use netstat to find local redis-server listeners.

    Returns a dict mapping listening port to cluster name.  The name comes
    from a "# tcollector.cluster = NAME" comment in the instance's config
    file when present, else defaults to "port-PORT".
    """
    out = {}
    tcre = re.compile(r"^\s*#\s*tcollector.(\w+)\s*=\s*(.+)$")

    ns_proc = subprocess.Popen(["netstat", "-tnlp"],
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, _ = ns_proc.communicate()
    if ns_proc.returncode != 0:
        sys.stderr.write("failed to find instances %r\n" % ns_proc.returncode)
        return {}

    for line in stdout.split("\n"):
        if not (line and 'redis-server' in line):
            continue
        pid = int(line.split()[6].split("/")[0])
        port = int(line.split()[3].split(":")[1])

        # Now we have to get the command line.  We look in the redis config
        # file for a special comment that tells us what cluster this is.
        # Else we default to using the port number, which should work.
        cluster = "port-%d" % port
        try:
            f = open("/proc/%d/cmdline" % pid)
            try:
                cfg = f.readline().split("\0")[-2]
            finally:
                f.close()  # BUGFIX: don't leak the fd when parsing fails

            f = open(cfg)
            try:
                for cfgline in f:
                    result = tcre.match(cfgline)
                    if result and result.group(1).lower() == "cluster":
                        cluster = result.group(2).lower()
            finally:
                f.close()  # BUGFIX: the config file was never closed
        except EnvironmentError:
            # Use the default cluster name if anything above failed.
            pass

        out[port] = cluster
    return out


if __name__ == "__main__":
    sys.exit(main())
4 | # 5 | # This program is free software: you can redistribute it and/or modify it 6 | # under the terms of the GNU Lesser General Public License as published by 7 | # the Free Software Foundation, either version 3 of the License, or (at your 8 | # option) any later version. This program is distributed in the hope that it 9 | # will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty 10 | # of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 11 | # General Public License for more details. You should have received a copy 12 | # of the GNU Lesser General Public License along with this program. If not, 13 | # see . 14 | # 15 | # Written by Mark Smith . 16 | # 17 | 18 | """A collector to gather statistics from a Riak node. 19 | 20 | The following all have tags of 'type' which can be 'get' or 'put'. Latency 21 | is measured in fractional seconds. All latency values are calculated over the 22 | last 60 seconds and are moving values. 23 | 24 | - riak.vnode.requests 25 | - riak.node.requests 26 | - riak.node.latency.mean 27 | - riak.node.latency.median 28 | - riak.node.latency.95th 29 | - riak.node.latency.99th 30 | - riak.node.latency.100th 31 | 32 | These metrics have no tags and are global: 33 | 34 | - riak.memory.total 35 | - riak.memory.allocated 36 | - riak.executing_mappers 37 | - riak.sys_process_count 38 | - riak.read_repairs 39 | - riak.connections 40 | - riak.connected_nodes 41 | """ 42 | 43 | import json 44 | import os 45 | import pwd 46 | import sys 47 | import time 48 | import urllib2 49 | 50 | # If we're running as root and this user exists, we'll drop privileges. Set this 51 | # to 'root' if you don't want to drop privileges. 
# If we're running as root and this user exists, we'll drop privileges.
# Set this to 'root' if you don't want to drop privileges.
USER = "nobody"

# Maps a Riak stat name to (metric suffix, tag string).  connected_nodes is
# intentionally absent: it is a list and gets counted separately in main().
MAP = {
    'vnode_gets_total': ('vnode.requests', 'type=get'),
    'vnode_puts_total': ('vnode.requests', 'type=put'),
    'node_gets_total': ('node.requests', 'type=get'),
    'node_puts_total': ('node.requests', 'type=put'),
    'node_get_fsm_time_mean': ('node.latency.mean', 'type=get'),
    'node_get_fsm_time_median': ('node.latency.median', 'type=get'),
    'node_get_fsm_time_95': ('node.latency.95th', 'type=get'),
    'node_get_fsm_time_99': ('node.latency.99th', 'type=get'),
    'node_get_fsm_time_100': ('node.latency.100th', 'type=get'),
    'node_put_fsm_time_mean': ('node.latency.mean', 'type=put'),
    'node_put_fsm_time_median': ('node.latency.median', 'type=put'),
    'node_put_fsm_time_95': ('node.latency.95th', 'type=put'),
    'node_put_fsm_time_99': ('node.latency.99th', 'type=put'),
    'node_put_fsm_time_100': ('node.latency.100th', 'type=put'),
    'pbc_connects_total': ('connections', ''),
    'read_repairs_total': ('read_repairs', ''),
    'sys_process_count': ('sys_process_count', ''),
    'executing_mappers': ('executing_mappers', ''),
    'mem_allocated': ('memory.allocated', ''),
    'mem_total': ('memory.total', ''),
}

def drop_privileges():
    """Switch to the unprivileged USER account when running as root."""
    if USER == 'root':
        # Configured to keep running as root.
        return

    try:
        entry = pwd.getpwnam(USER)
    except KeyError:
        # No such account on this box; carry on unchanged.
        return

    if os.getuid() != 0:
        # We aren't root, so there is nothing to drop.
        return
    # Group first; after setuid() we'd no longer be allowed to change it.
    os.setgid(entry.pw_gid)
    os.setuid(entry.pw_uid)
def main():
    """Main loop: poll the local Riak HTTP stats endpoint forever."""
    # Don't run at all on machines that aren't Riak nodes.
    if not os.path.exists("/usr/lib/riak"):
        sys.exit(13)  # tells tcollector not to restart us

    drop_privileges()
    sys.stdin.close()

    interval = 15

    def print_stat(metric, value, tags=""):
        # Skip stats the server didn't report.
        if value is not None:
            print("riak.%s %d %s %s" % (metric, ts, value, tags))

    while True:
        ts = int(time.time())

        # urlopen() raises on failure rather than returning None, so the
        # old "if req is not None" guard was dead code; instead make sure
        # the response is always closed, even if json parsing blows up.
        req = urllib2.urlopen("http://localhost:8098/stats")
        try:
            obj = json.loads(req.read())
        finally:
            req.close()  # BUGFIX: connection leaked when parsing failed

        for key in obj:
            if key not in MAP:
                continue
            # This is a hack, but Riak reports latencies in microseconds.
            # They're fairly useless to our human operators, so we're going
            # to convert them to seconds.
            if 'latency' in MAP[key][0]:
                obj[key] = obj[key] / 1000000.0
            print_stat(MAP[key][0], obj[key], MAP[key][1])
        # connected_nodes is a list; we report its length.
        if 'connected_nodes' in obj:
            print_stat('connected_nodes', len(obj['connected_nodes']), '')

        sys.stdout.flush()
        time.sleep(interval)


if __name__ == "__main__":
    sys.exit(main())
import errno
import sys
import time
import subprocess
import re
import signal
import os


'''
ZFS I/O and disk space statistics for TSDB

This plugin tracks, for all pools:

- I/O
    zfs.io.pool.{read_issued, write_issued}
    zfs.io.pool.{read_sectors, write_sectors}
    zfs.io.device.{read_issued, write_issued}
    zfs.io.device.{read_sectors, write_sectors}
- disk space
    zfs.df.pool.1kblocks.{total, used, available}
    zfs.df.device.1kblocks.{total, used, available}

Sectors are always 512 bytes.  Disk space usage is given in 1K blocks.
Values delivered to standard output are already normalized to be per second.
'''

def convert_to_bytes(string):
    """Take a string in the form 1234K and convert it to bytes.

    zpool prints "-" when it has no datum; that converts to 0.
    """
    factors = {
        "K": 1024,
        "M": 1024 ** 2,
        "G": 1024 ** 3,
        "T": 1024 ** 4,
        "P": 1024 ** 5,
    }
    if string == "-":
        return 0
    for suffix, factor in factors.items():
        if string.endswith(suffix):
            # int() rather than long(): Python 2 auto-promotes ints to
            # longs anyway, and long() doesn't exist on Python 3.
            return int(float(string[:-1]) * factor)
    return int(string)

def extract_info(line):
    """Parse one data row of `zpool iostat -v` output.

    Returns (name, s_df, s_io) where s_df holds disk-space stats in 1K
    blocks and s_io holds operation counts plus 512-byte sector counts.
    """
    (poolname,
     alloc, free,
     read_issued, write_issued,
     read_sectors, write_sectors) = line.split()

    s_df = {}
    # 1k blocks
    s_df["used"] = convert_to_bytes(alloc) / 1024
    s_df["available"] = convert_to_bytes(free) / 1024
    s_df["total"] = s_df["used"] + s_df["available"]

    s_io = {}
    # magnitudeless variable (already a per-interval rate from zpool)
    s_io["read_issued"] = read_issued
    s_io["write_issued"] = write_issued
    # 512 byte sectors
    s_io["read_sectors"] = convert_to_bytes(read_sectors) / 512
    s_io["write_sectors"] = convert_to_bytes(write_sectors) / 512

    return poolname, s_df, s_io

# States of the `zpool iostat -v` output parser.
T_START = 1
T_HEADERS = 2
T_SEPARATOR = 3
T_POOL = 4
T_DEVICE = 5
T_EMPTY = 6
T_LEG = 7

# Set asynchronously by handlesignal() when we're asked to terminate.
signal_received = None
def handlesignal(signum, stack):
    """Signal handler: remember which signal arrived so main() can exit."""
    global signal_received
    signal_received = signum

def main():
    """zfsiostats main loop"""
    global signal_received
    interval = 15
    # shouldn't the interval be determined by the daemon itself, and commu-
    # nicated to the collector somehow (signals seem like a reasonable protocol
    # whereas command-line parameters also sound reasonable)?

    signal.signal(signal.SIGTERM, handlesignal)
    signal.signal(signal.SIGINT, handlesignal)

    try:
        p_zpool = subprocess.Popen(
            ["zpool", "iostat", "-v", str(interval)],
            stdout=subprocess.PIPE,
        )
    except OSError as e:
        if e.errno == errno.ENOENT:
            # it makes no sense to run this collector here
            sys.exit(13)  # we signal tcollector to not run us
        raise

    firstloop = True
    last_leg = 0  # BUGFIX: was misspelled "lastleg", leaving a dead variable
    ltype = None
    timestamp = int(time.time())
    capacity_stats_pool = {}
    capacity_stats_device = {}
    io_stats_pool = {}
    io_stats_device = {}
    start_re = re.compile(".*capacity.*operations.*bandwidth")
    headers_re = re.compile(".*pool.*alloc.*free.*read.*write.*read.*write")
    separator_re = re.compile(".*-----.*-----.*-----")
    while signal_received is None:
        try:
            line = p_zpool.stdout.readline()
        except (IOError, OSError) as e:
            if e.errno in (errno.EINTR, errno.EAGAIN):
                break
            raise

        if not line:
            # end of the program, die
            break

        # Classify the line, asserting that the state transition is legal
        # for the `zpool iostat -v` output format.
        # NOTE(review): the exact indentation prefixes for mirror legs and
        # devices below may have been mangled upstream -- confirm against
        # real `zpool iostat -v` output.
        if start_re.match(line):
            assert ltype in (None, T_EMPTY), \
                "expecting last state T_EMPTY or None, now got %s" % ltype
            ltype = T_START
        elif headers_re.match(line):
            assert ltype == T_START, \
                "expecting last state T_START, now got %s" % ltype
            ltype = T_HEADERS
        elif separator_re.match(line):
            assert ltype in (T_DEVICE, T_HEADERS), \
                "expecting last state T_DEVICE or T_HEADERS, now got %s" % ltype
            ltype = T_SEPARATOR
        elif len(line) < 2:
            assert ltype == T_SEPARATOR, \
                "expecting last state T_SEPARATOR, now got %s" % ltype
            ltype = T_EMPTY
        elif line.startswith(" mirror"):
            assert ltype in (T_POOL, T_DEVICE), \
                "expecting last state T_POOL or T_DEVICE, now got %s" % ltype
            ltype = T_LEG
        elif line.startswith(" "):
            assert ltype in (T_POOL, T_DEVICE, T_LEG), \
                "expecting last state T_POOL or T_DEVICE or T_LEG, now got %s" % ltype
            ltype = T_DEVICE
        else:
            # must be a pool name
            assert ltype == T_SEPARATOR, \
                "expecting last state T_SEPARATOR, now got %s" % ltype
            ltype = T_POOL

        if ltype == T_START:
            # New sample: reset all accumulated stats.
            for x in (
                capacity_stats_pool, capacity_stats_device,
                io_stats_pool, io_stats_device,
            ):
                x.clear()
            timestamp = int(time.time())

        elif ltype == T_POOL:
            line = line.strip()
            poolname, s_df, s_io = extract_info(line)
            capacity_stats_pool[poolname] = s_df
            io_stats_pool[poolname] = s_io
            # marker for leg
            last_leg = 0

        elif ltype == T_LEG:
            last_leg = last_leg + 1
            line = line.strip()
            devicename, s_df, s_io = extract_info(line)
            capacity_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_df
            io_stats_device["%s %s%s" % (poolname, devicename, last_leg)] = s_io

        elif ltype == T_DEVICE:
            line = line.strip()
            devicename, s_df, s_io = extract_info(line)
            capacity_stats_device["%s %s" % (poolname, devicename)] = s_df
            io_stats_device["%s %s" % (poolname, devicename)] = s_io

        elif ltype == T_EMPTY:
            if firstloop:
                # The first sample is a since-boot summary similar to
                # iostat's, and is useless to us, so don't print it.
                # If this was the first loop, we're onto the second loop
                # now, so we turn the flag off.
                firstloop = False
            else:
                for poolname, stats in capacity_stats_pool.items():
                    fm = "zfs.df.pool.1kblocks.%s %d %s poolname=%s"
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber, poolname))
                for poolname, stats in io_stats_pool.items():
                    fm = "zfs.io.pool.%s %d %s poolname=%s"
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber, poolname))
                for devicename, stats in capacity_stats_device.items():
                    fm = "zfs.df.device.1kblocks.%s %d %s devicename=%s poolname=%s"
                    poolname, devicename = devicename.split(" ", 1)
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber,
                                    devicename, poolname))
                for devicename, stats in io_stats_device.items():
                    fm = "zfs.io.device.%s %d %s devicename=%s poolname=%s"
                    poolname, devicename = devicename.split(" ", 1)
                    for statname, statnumber in stats.items():
                        print(fm % (statname, timestamp, statnumber,
                                    devicename, poolname))
                sys.stdout.flush()

    # Tear down the zpool child: forward the signal we got (or SIGTERM on
    # EOF) and wait for it so we don't leave a zombie behind.
    if signal_received is None:
        signal_received = signal.SIGTERM
    try:
        os.kill(p_zpool.pid, signal_received)
    except Exception:
        pass
    p_zpool.wait()

if __name__ == "__main__":
    main()
import errno
import re
import sys
import time

'''
ZFS kernel memory statistics for TSDB

This plugin tracks kernel memory for both:

- the SPL and its allocated slabs backing ZFS memory
    zfs.mem.slab
- the ARC and its various values
    zfs.mem.arc
'''

# /proc/spl/kmem/slab has several fields; we only care about the sizes
# and the allocation sizes for the slabs.
# /proc/spl/kstat/zfs/arcstats is a table; we only care about the data column.

def main():
    """zfskernstats main loop: emit SPL slab and ARC stats every 15s."""
    interval = 15
    # Slab names look like "name_512"; strip the trailing size so the
    # "type" tag groups all sizes of the same cache together.
    typere = re.compile("(^.*)_[0-9]+$")

    try:
        f_slab = open("/proc/spl/kmem/slab", "r")
        f_arcstats = open("/proc/spl/kstat/zfs/arcstats", "r")
    except IOError as e:
        if e.errno == errno.ENOENT:
            # No SPL on this box: it makes no sense to run this collector.
            sys.exit(13)  # we signal tcollector to not run us
        raise

    while True:
        # Re-read the same /proc files each iteration by seeking back.
        f_slab.seek(0)
        f_arcstats.seek(0)
        ts = int(time.time())

        for n, line in enumerate(f_slab):
            if n < 2:
                continue  # skip the two header lines
            line = line.split()
            name, _, size, alloc, _, objsize = line[0:6]
            size, alloc, objsize = int(size), int(alloc), int(objsize)
            typ = typere.match(name)
            if typ:
                typ = typ.group(1)
            else:
                typ = name
            print("zfs.mem.slab.size %d %d type=%s objsize=%d" %
                  (ts, size, typ, objsize))
            print("zfs.mem.slab.alloc %d %d type=%s objsize=%d" %
                  (ts, alloc, typ, objsize))

        for n, line in enumerate(f_arcstats):
            if n < 2:
                continue  # skip the two header lines
            line = line.split()
            name, _, data = line
            data = int(data)
            print("zfs.mem.arc.%s %d %d" % (name, ts, data))

        sys.stdout.flush()
        time.sleep(interval)

if __name__ == "__main__":
    main()
# Python files in this directory that don't have an "onload" function
# will be imported by tcollector too, but no function will be called.
# When this file executes, you can assume that its directory is in
# sys.path, so you can import other Python modules from this directory
# or its subdirectories.

def onload(options, tags):
    """Hook invoked by tcollector when it starts up.

    Args:
        options: The options as returned by the OptionParser.
        tags: A dictionary that maps tag names to tag values.
    """
    # Default hook: no custom startup behavior.
    return None


def get_user_password(sockfile):
    """Given the path of a socket file, returns a tuple (user, password)."""
    user, password = "root", ""
    return (user, password)
# Sanity checks.
test -d "$TCOLLECTOR_PATH" || {
    echo >&2 "No such directory: $TCOLLECTOR_PATH"
    echo >&2 "You might need to set the TCOLLECTOR_PATH variable in $0"
    exit 2
}

test -f "$PROG" || {
    echo >&2 "No such file: $PROG"
    echo >&2 "You might need to set the TCOLLECTOR_PATH variable in $0"
    exit 3
}

for i in "$PIDFILE" "$LOG"; do
    # If the file doesn't exist, check that we have write access to its parent
    # directory to be able to create it.
    test -e "$i" || i=`dirname "$i"`
    test -w "$i" || {
        echo >&2 "$0: error: Cannot write to $i"
        exit 4
    }
done

# Find the first usable Python interpreter.
which_python () {
    for python in /usr/bin/python2.6 /usr/bin/python2.5 /usr/bin/python; do
        test -x "$python" && echo "$python" && return
    done
    echo >&2 'Could not find a Python interpreter'
    exit 1
}

PYTHON=$(which_python)

start () {
    echo "Starting $PROG"
    # BUGFIX: quote "$LOG" so a log path with spaces doesn't break the
    # redirect.  $ARGS is deliberately unquoted: it must word-split into
    # separate command-line options.
    $PYTHON $PROG $ARGS >> "$LOG" 2>&1 &
}

# stop [signum]
stop () {
    echo "Stopping $PROG"
    pkill $1 -f "/usr/bin/python.* $PROG -c"
}

status () {
    if pgrep -f "/usr/bin/python.* $PROG -c" >/dev/null; then
        echo "$PROG" running
        return 0
    fi
    return 1
}

forcerestart () {
    stop
    try=1
    sleep 1
    while status; do
        try=$((try + 1))
        if [[ $try -gt 3 ]]; then
            stop -9    # escalate to SIGKILL after a few polite attempts
        else
            stop
        fi
        echo "Waiting for $PROG to die.."
        sleep 5
    done
    start
}

case $COMMAND in
    start) status || start
        ;;
    force-restart)
        forcerestart
        ;;
    restart)
        # tcollector already respawns collectors if they have changed
        # on-disk, and kills old ones/starts new ones.  The only thing
        # tcollector doesn't do is restart itself if itself has changed.
        # For a more graceful restart, just make sure we're running and
        # restart only if tcollector is newer on disk than since it
        # started.  This doesn't check for dependencies like asyncproc.py,
        # but that's ok.
        if status; then
            # BUGFIX: quote "$PROG"/"$PIDFILE" so paths with spaces work.
            newer=$(find "$PROG" -newer "$PIDFILE" | wc -l)
            if [[ $newer -gt 0 ]]; then
                forcerestart
            fi
        else
            start
        fi
        ;;
    stop) stop
        ;;
    status) status
        exit $?
        ;;
    # BUGFIX: the command list was lost from the usage string (angle
    # brackets stripped by an earlier conversion); restored below.
    *) echo >&2 "usage: $0 <start|stop|restart|force-restart|status>"
        exit 1
        ;;
esac
spec_version = 1.0
jmx_JAVA = \
	jmx.java \

jmx_LIBADD = \
	/usr/lib/jvm/java-6-sun/lib/tools.jar \

AM_JAVACFLAGS = -Xlint -source 6
JVM_ARGS =
package_dir = $(subst .,/,$(package))
jmx_classes = $(jmx_JAVA:%.java=$(top_builddir)/$(package_dir)/%.class)
jmx_jar = $(top_builddir)/jmx-$(spec_version).jar

jmx: $(jmx_jar)

jmx_get_dep_classpath = `echo $(jmx_LIBADD) | tr ' ' ':'`
$(top_builddir)/.javac-stamp: $(jmx_JAVA)
	@mkdir -p $(top_builddir)
	javac $(AM_JAVACFLAGS) -cp $(jmx_get_dep_classpath) \
	 -d $(top_builddir) $(jmx_JAVA)
	@touch "$@"

classes_with_nested_classes = $(jmx_classes:$(top_builddir)/%.class=%*.class)

pkg_version = \
  `git rev-list --pretty=format:%h HEAD --max-count=1 | sed 1d || echo unknown`
$(top_builddir)/manifest: $(top_builddir)/.javac-stamp ../../.git/HEAD
	{ echo "Specification-Title: $(spec_title)"; \
	  echo "Specification-Version: $(spec_version)"; \
	  echo "Specification-Vendor: $(spec_vendor)"; \
	  echo "Implementation-Title: $(package)"; \
	  echo "Implementation-Version: $(pkg_version)"; \
	  echo "Implementation-Vendor: $(spec_vendor)"; } >"$@"

# I've seen cases where `jar' exits with an error but leaves a partially
# built .jar file!  BUGFIX: the cleanup used $(jar), an undefined variable,
# so the partial jar was never actually removed; use $(jmx_jar).
$(jmx_jar): $(top_builddir)/manifest $(top_builddir)/.javac-stamp $(jmx_classes)
	cd $(top_builddir) && jar cfm `basename $(jmx_jar)` manifest $(classes_with_nested_classes) \
	 || { rv=$$? && rm -f `basename $(jmx_jar)` && exit $$rv; }

doc: $(top_builddir)/api/index.html

JDK_JAVADOC = http://download.oracle.com/javase/6/docs/api
$(top_builddir)/api/index.html: $(jmx_JAVA) $(BUILT_SOURCES)
	javadoc -d $(top_builddir)/api -classpath $(get_dep_classpath) \
	 -link $(JDK_JAVADOC) -link $(jmx_JAVA) $(BUILT_SOURCES)

clean:
	@rm -f $(top_builddir)/.javac-stamp
	rm -f $(top_builddir)/manifest $(BUILT_SOURCES)
	cd $(top_builddir) || exit 0 && rm -f $(classes_with_nested_classes)
	cd $(top_builddir) || exit 0 \
	 && test -d $(package_dir) || exit 0 \
	 && find $(package_dir) -type d -depth -exec rmdir {} ';' \
	 && dir=$(package_dir) && dir=$${dir%/*} \
	 && while test x"$$dir" != x"$${dir%/*}"; do \
	      rmdir "$$dir" && dir=$${dir%/*} || break; \
	    done \
	 && rmdir "$$dir"

distclean: clean
	rm -f $(jmx_jar)
	rm -rf $(top_builddir)/api
	test ! -d $(top_builddir) || rmdir $(top_builddir)

.PHONY: all jmx clean distclean doc check
/** Quick CLI tool to get JMX MBean attributes. */
package com.stumbleupon.monitoring;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import javax.management.MBeanAttributeInfo;
import javax.management.MBeanInfo;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.openmbean.TabularData;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

// Sun specific
import com.sun.tools.attach.AgentInitializationException;
import com.sun.tools.attach.AgentLoadException;
import com.sun.tools.attach.AttachNotSupportedException;
import com.sun.tools.attach.VirtualMachine;
import com.sun.tools.attach.VirtualMachineDescriptor;

// Sun private
import sun.management.ConnectorAddressLink;
import sun.jvmstat.monitor.HostIdentifier;
import sun.jvmstat.monitor.MonitoredHost;
import sun.jvmstat.monitor.MonitoredVm;
import sun.jvmstat.monitor.MonitoredVmUtil;
import sun.jvmstat.monitor.VmIdentifier;

final class jmx {

  /** Agent property under which a JVM publishes its local JMX connector URL. */
  private static final String LOCAL_CONNECTOR_ADDRESS =
    "com.sun.management.jmxremote.localConnectorAddress";

  /** Prints the command-line help on stdout. */
  private static void usage() {
    System.out.println("Usage:\n"
      + "  jmx -l                    Lists all reachable VMs.\n"
      + "  jmx <JVM>                 Lists all MBeans for this JVM (PID or regexp).\n"
      + "  jmx <JVM> <MBean>         Prints all the attributes of this MBean.\n"
      + "  jmx <JVM> <MBean> <attr>  Prints the matching attributes of this MBean.\n"
      + "\n"
      + "You can pass multiple <MBean> <attr> pairs to match multiple different\n"
      + "attributes for different MBeans.  For example:\n"
      + "  jmx --long JConsole Class Count Thread Total Garbage Collection\n"
      + "  LoadedClassCount	2808	java.lang:type=ClassLoading\n"
      + "  UnloadedClassCount	0	java.lang:type=ClassLoading\n"
      + "  TotalLoadedClassCount	2808	java.lang:type=ClassLoading\n"
      + "  CollectionCount	0	java.lang:type=GarbageCollector,name=ConcurrentMarkSweep\n"
      + "  CollectionTime	0	java.lang:type=GarbageCollector,name=ConcurrentMarkSweep\n"
      + "  CollectionCount	1	java.lang:type=GarbageCollector,name=ParNew\n"
      + "  CollectionTime	19	java.lang:type=GarbageCollector,name=ParNew\n"
      + "  TotalStartedThreadCount	43	java.lang:type=Threading\n"
      + "The command above searched for a JVM with `JConsole' in its name, and then searched\n"
      + "for MBeans with `Class' in the name and `Count' in the attribute (first 3 matches\n"
      + "in this output), MBeans with `Thread' in the name and `Total' in the attribute (last\n"
      + "line in the output) and MBeans matching `Garbage' with a `Collection' attribute.\n"
      + "\n"
      + "Other flags you can pass:\n"
      + "  --long                    Print a longer but more explicit output for each value.\n"
      + "  --timestamp               Print a timestamp at the beginning of each line.\n"
      + "  --watch N                 Reprint the output every N seconds.\n"
      + "\n"
      + "Return value:\n"
      + "  0: Everything OK.\n"
      + "  1: Invalid usage or unexpected error.\n"
      + "  2: No JVM matched.\n"
      + "  3: No MBean matched.\n"
      + "  4: No attribute matched for the MBean(s) selected.");
  }

  /**
   * Prints {@code errmsg} on stderr and terminates the JVM.
   * @param rv The process exit status (see {@link #usage} for the meaning
   * of each value).
   * @param errmsg The error message to print.
   */
  private static void fatal(final int rv, final String errmsg) {
    System.err.println(errmsg);
    System.exit(rv);
    // System.exit never returns; this keeps the compiler happy about
    // code paths that "fall through" a call to fatal().
    throw new AssertionError("You should never see this, really.");
  }

  public static void main(final String[] args) throws Exception {
    if (args.length == 0
        || "-h".equals(args[0]) || "--help".equals(args[0])) {
      usage();
      System.exit(args.length == 0 ? 1 : 0);
      return;
    }

    // Parse the leading option flags.  Anything unrecognized ends the loop
    // and is treated as the JVM selector.
    int current_arg = 0;
    int watch = 0;                       // Seconds between reprints, 0 = once.
    boolean long_output = false;
    boolean print_timestamps = false;
    while (current_arg < args.length) {
      if ("--watch".equals(args[current_arg])) {
        current_arg++;
        if (current_arg == args.length) {  // Was: AIOOBE when --watch is last.
          fatal(1, "Missing value for --watch");
          return;
        }
        try {
          watch = Integer.parseInt(args[current_arg]);
        } catch (NumberFormatException e) {
          fatal(1, "Invalid value for --watch: " + e.getMessage());
          return;
        }
        if (watch < 1) {
          fatal(1, "Invalid value for --watch: " + watch);
        }
        current_arg++;
      } else if ("--long".equals(args[current_arg])) {
        long_output = true;
        current_arg++;
      } else if ("--timestamp".equals(args[current_arg])) {
        print_timestamps = true;
        current_arg++;
      } else {
        break;
      }
    }

    if (current_arg == args.length) {
      usage();
      fatal(1, "error: Missing argument (-l or JVM specification).");
      return;
    }

    HashMap<Integer, JVM> vms = getJVMs();
    if ("-l".equals(args[current_arg])) {
      printVmList(vms.values());
      return;
    }

    final JVM jvm = selectJVM(args[current_arg++], vms);
    vms = null;  // Let the map be GC'ed, we only need the selected JVM.
    final JMXConnector connection = JMXConnectorFactory.connect(jvm.jmxUrl());
    try {
      final MBeanServerConnection mbsc = connection.getMBeanServerConnection();
      if (args.length == current_arg) {  // No MBean selector: list them all.
        for (final ObjectName mbean : listMBeans(mbsc)) {
          System.out.println(mbean);
        }
        return;
      }

      final TreeMap<ObjectName, Pattern> objects =
        selectMBeans(args, current_arg, mbsc);
      if (objects.isEmpty()) {
        fatal(3, "No MBean matched your query in " + jvm.name());
        return;
      }
      do {
        boolean found = false;
        for (final Map.Entry<ObjectName, Pattern> entry : objects.entrySet()) {
          final ObjectName object = entry.getKey();
          final MBeanInfo mbean = mbsc.getMBeanInfo(object);
          final Pattern wanted = entry.getValue();
          for (final MBeanAttributeInfo attr : mbean.getAttributes()) {
            // A null pattern means "all attributes of this MBean".
            if (wanted == null || wanted.matcher(attr.getName()).find()) {
              dumpMBean(long_output, print_timestamps, mbsc, object, attr);
              found = true;
            }
          }
        }
        if (!found) {
          fatal(4, "No attribute of " + objects.keySet()
                + " matched your query in " + jvm.name());
          return;
        }
        System.out.flush();
        Thread.sleep(watch * 1000);
      } while (watch > 0);
    } finally {
      connection.close();
    }
  }

  /**
   * Matches the remaining command-line arguments against the MBeans of a JVM.
   * Arguments are consumed as (MBean regexp, attribute regexp) pairs; a
   * trailing MBean regexp with no attribute regexp maps to {@code null},
   * meaning "every attribute".
   * @return A map from matched MBean name to the attribute pattern wanted
   * for it (possibly {@code null}).
   */
  private static TreeMap<ObjectName, Pattern> selectMBeans(final String[] args,
                                                           final int current_arg,
                                                           final MBeanServerConnection mbsc)
    throws IOException {
    final TreeMap<ObjectName, Pattern> mbeans = new TreeMap<ObjectName, Pattern>();
    for (int i = current_arg; i < args.length; i += 2) {
      final Pattern object_re = compile_re(args[i]);
      final Pattern attr_re = i + 1 < args.length ? compile_re(args[i + 1]) : null;
      for (final ObjectName o : listMBeans(mbsc)) {
        if (object_re.matcher(o.toString()).find()) {
          mbeans.put(o, attr_re);
        }
      }
    }
    return mbeans;
  }

  /**
   * Prints the value(s) of one attribute of one MBean.
   * TabularData attributes are expanded into one line per row, with a
   * {@code .N} index suffix appended to the attribute name.
   */
  private static void dumpMBean(final boolean long_output,
                                final boolean print_timestamps,
                                final MBeanServerConnection mbsc,
                                final ObjectName object,
                                final MBeanAttributeInfo attr) throws Exception {
    final String name = attr.getName();
    Object value = mbsc.getAttribute(object, name);
    if (value instanceof TabularData) {
      final TabularData tab = (TabularData) value;
      int i = 0;
      for (final Object o : tab.keySet()) {
        dumpMBeanValue(long_output, print_timestamps, object, name + "." + i, o);
        i++;
      }
    } else {
      dumpMBeanValue(long_output, print_timestamps, object, name, value);
    }
  }

  /** Formats and prints a single (attribute, value) line on stdout. */
  private static void dumpMBeanValue(final boolean long_output,
                                     final boolean print_timestamps,
                                     final ObjectName object,
                                     final String name,
                                     final Object value) {
    final StringBuilder buf = new StringBuilder();
    final long timestamp = System.currentTimeMillis() / 1000;  // Seconds.
    if (print_timestamps) {
      buf.append(timestamp).append('\t');
    }
    if (value instanceof Object[]) {
      for (final Object o : (Object[]) value) {
        buf.append(o).append('\t');
      }
      buf.setLength(buf.length() - 1);  // Drop the trailing tab.
    } else {
      buf.append(name).append('\t').append(value);
    }
    if (long_output) {
      buf.append('\t').append(object);
    }
    buf.append('\n');
    System.out.print(buf);
  }

  /** Returns all the MBeans of the given connection, sorted by name. */
  private static ArrayList<ObjectName> listMBeans(final MBeanServerConnection mbsc)
    throws IOException {
    ArrayList<ObjectName> mbeans = new ArrayList<ObjectName>(mbsc.queryNames(null, null));
    Collections.sort(mbeans, new Comparator<ObjectName>() {
      public int compare(final ObjectName a, final ObjectName b) {
        return a.toString().compareTo(b.toString());
      }
    });
    return mbeans;
  }

  /** Compiles the given regexp or exits with status 1 if it's invalid. */
  private static Pattern compile_re(final String re) {
    try {
      return Pattern.compile(re);
    } catch (PatternSyntaxException e) {
      fatal(1, "Invalid regexp: " + re + ", " + e.getMessage());
      throw new AssertionError("Should never be here");
    }
  }

  /** Marker property we set on ourselves so we can recognize our own JVM. */
  private static final String MAGIC_STRING = "this.is.jmx.magic";

  /**
   * Resolves a JVM selector (numeric PID or name regexp) to a single JVM.
   * Our own JVM and other long-running {@code jmx --watch} clients are
   * excluded from regexp matches.  Exits with status 2 when zero or more
   * than one JVM matches.
   */
  private static JVM selectJVM(final String selector,
                               final HashMap<Integer, JVM> vms) {
    String error = null;
    try {
      final int pid = Integer.parseInt(selector);
      if (pid < 2) {
        throw new IllegalArgumentException("Invalid PID: " + pid);
      }
      final JVM jvm = vms.get(pid);
      if (jvm != null) {
        return jvm;
      }
      error = "Couldn't find a JVM with PID " + pid;
    } catch (NumberFormatException e) {
      /* Ignore: the selector isn't a PID, fall through to regexp matching. */
    }
    if (error == null) {
      try {
        final Pattern p = compile_re(selector);
        final ArrayList<JVM> matches = new ArrayList<JVM>(2);
        for (final JVM jvm : vms.values()) {
          if (p.matcher(jvm.name()).find()) {
            matches.add(jvm);
          }
        }
        // Exclude ourselves from the matches: tag our JVM with a magic
        // system property, then attach to each candidate and drop any JVM
        // that carries the tag.
        System.setProperty(MAGIC_STRING,
                           "LOL Java processes can't get their own PID");
        final String me = jmx.class.getName();
        final Iterator<JVM> it = matches.iterator();
        while (it.hasNext()) {
          final JVM jvm = it.next();
          final String name = jvm.name();
          // Ignore other long running jmx clients too.
          if (name.contains("--watch") && name.contains(me)) {
            it.remove();
            continue;
          }
          final VirtualMachine vm = VirtualMachine.attach(String.valueOf(jvm.pid()));
          try {
            if (vm.getSystemProperties().containsKey(MAGIC_STRING)) {
              it.remove();
              continue;
            }
          } finally {
            vm.detach();
          }
        }
        System.clearProperty(MAGIC_STRING);
        if (matches.size() == 0) {
          error = "No JVM matched your regexp " + selector;
        } else if (matches.size() > 1) {
          printVmList(matches);
          error = matches.size() + " JVMs matched your regexp " + selector
            + ", it's too ambiguous, please refine it.";
        } else {
          return matches.get(0);
        }
      } catch (PatternSyntaxException e) {
        error = "Invalid pattern: " + selector + ", " + e.getMessage();
      } catch (Exception e) {
        e.printStackTrace();
        error = "Unexpected Exception: " + e.getMessage();
      }
    }
    fatal(2, error);
    return null;
  }

  /** Prints the given JVMs on stdout, one per line, sorted by PID. */
  private static void printVmList(final Collection<JVM> vms) {
    final ArrayList<JVM> sorted_vms = new ArrayList<JVM>(vms);
    Collections.sort(sorted_vms, new Comparator<JVM>() {
      public int compare(final JVM a, final JVM b) {
        return a.pid() - b.pid();
      }
    });
    for (final JVM jvm : sorted_vms) {
      System.out.println(jvm.pid() + "\t" + jvm.name());
    }
  }

  /** A locally-running JVM we may want to connect to over JMX. */
  private static final class JVM {
    final int pid;          // Process ID of the JVM.
    final String name;      // Command line (or PID as a string) of the JVM.
    String address;         // JMX connector address, null until the agent runs.

    public JVM(final int pid, final String name, final String address) {
      if (name.isEmpty()) {
        throw new IllegalArgumentException("empty name");
      }
      this.pid = pid;
      this.name = name;
      this.address = address;
    }

    public int pid() {
      return pid;
    }

    public String name() {
      return name;
    }

    /**
     * Returns the JMX service URL of this JVM, starting its management
     * agent first if necessary.
     */
    public JMXServiceURL jmxUrl() {
      if (address == null) {
        ensureManagementAgentStarted();
      }
      try {
        return new JMXServiceURL(address);
      } catch (Exception e) {
        throw new RuntimeException("Error", e);
      }
    }

    /**
     * Loads the JMX management agent into this JVM if it's not running yet,
     * and records the local connector address it publishes.
     */
    public void ensureManagementAgentStarted() {
      if (address != null) {  // already started
        return;
      }
      VirtualMachine vm;
      try {
        vm = VirtualMachine.attach(String.valueOf(pid));
      } catch (AttachNotSupportedException e) {
        throw new RuntimeException("Failed to attach to " + this, e);
      } catch (IOException e) {
        throw new RuntimeException("Failed to attach to " + this, e);
      }
      try {
        // java.sun.com/javase/6/docs/technotes/guides/management/agent.html#gdhkz
        // + code mostly stolen from JConsole's code.
        final String home = vm.getSystemProperties().getProperty("java.home");

        // Normally in ${java.home}/jre/lib/management-agent.jar but might
        // be in ${java.home}/lib in build environments.
        String agent = home + File.separator + "jre" + File.separator
          + "lib" + File.separator + "management-agent.jar";
        File f = new File(agent);
        if (!f.exists()) {
          agent = home + File.separator + "lib" + File.separator
            + "management-agent.jar";
          f = new File(agent);
          if (!f.exists()) {
            throw new RuntimeException("Management agent not found");
          }
        }

        agent = f.getCanonicalPath();
        try {
          vm.loadAgent(agent, "com.sun.management.jmxremote");
        } catch (AgentLoadException e) {
          throw new RuntimeException("Failed to load the agent into " + this, e);
        } catch (AgentInitializationException e) {
          throw new RuntimeException("Failed to initialize the agent into " + this, e);
        }
        address = (String) vm.getAgentProperties().get(LOCAL_CONNECTOR_ADDRESS);
      } catch (IOException e) {
        throw new RuntimeException("Error while loading agent into " + this, e);
      } finally {
        try {
          vm.detach();
        } catch (IOException e) {
          throw new RuntimeException("Failed to detach from " + vm + " = " + this, e);
        }
      }
      if (address == null) {
        throw new RuntimeException("Couldn't start the management agent.");
      }
    }

    public String toString() {
      return "JVM(" + pid + ", \"" + name + "\", "
        + (address == null ? null : '"' + address + '"') + ')';
    }
  }

  /**
   * Returns a map from PID to JVM.
   */
  private static HashMap<Integer, JVM> getJVMs() throws Exception {
    final HashMap<Integer, JVM> vms = new HashMap<Integer, JVM>();
    getMonitoredVMs(vms);
    getAttachableVMs(vms);
    return vms;
  }

  /** Adds the JVMs visible through the jvmstat monitoring API to {@code out}. */
  private static void getMonitoredVMs(final HashMap<Integer, JVM> out) throws Exception {
    final MonitoredHost host =
      MonitoredHost.getMonitoredHost(new HostIdentifier((String) null));
    @SuppressWarnings("unchecked")
    final Set<Integer> vms = host.activeVms();
    for (final Integer pid : vms) {
      try {
        final VmIdentifier vmid = new VmIdentifier(pid.toString());
        final MonitoredVm vm = host.getMonitoredVm(vmid);
        out.put(pid, new JVM(pid, MonitoredVmUtil.commandLine(vm),
                             ConnectorAddressLink.importFrom(pid)));
        vm.detach();
      } catch (Exception x) {
        System.err.println("Ignoring exception:");
        x.printStackTrace();
      }
    }
  }

  /**
   * Adds the JVMs visible through the attach API to {@code out}, skipping
   * PIDs already present (jvmstat gives a better name for those).
   */
  private static void getAttachableVMs(final HashMap<Integer, JVM> out) {
    for (final VirtualMachineDescriptor vmd : VirtualMachine.list()) {
      int pid;
      try {
        pid = Integer.parseInt(vmd.id());
      } catch (NumberFormatException e) {
        System.err.println("Ignoring invalid vmd.id(): " + vmd.id()
                           + ' ' + e.getMessage());
        continue;
      }
      if (out.containsKey(pid)) {
        continue;
      }
      try {
        final VirtualMachine vm = VirtualMachine.attach(vmd);
        out.put(pid, new JVM(pid, String.valueOf(pid),
                             (String) vm.getAgentProperties().get(LOCAL_CONNECTOR_ADDRESS)));
        vm.detach();
      } catch (AttachNotSupportedException e) {
        System.err.println("VM not attachable: " + vmd.id()
                           + ' ' + e.getMessage());
      } catch (IOException e) {
        System.err.println("Could not attach: " + vmd.id()
                           + ' ' + e.getMessage());
      }
    }
  }

}
--------------------------------------------------------------------------------
/stumbleupon/tcollector.pp:
-------------------------------------------------------------------------------- 1 | # Example Puppet manifest for updating/starting tcollector 2 | # under puppet 3 | 4 | class tcollector { 5 | package { python: 6 | ensure => installed, 7 | } 8 | 9 | service { tcollector: 10 | ensure => running, 11 | require => [Package["python"], File["/usr/local/tcollector"]], 12 | start => "/usr/local/tcollector/startstop start", 13 | stop => "/usr/local/tcollector/startstop stop", 14 | restart => "/usr/local/tcollector/startstop restart", 15 | status => "/usr/local/tcollector/startstop status", 16 | subscribe => File["/usr/local/tcollector"], 17 | } 18 | 19 | file { ["/usr/local"]: 20 | owner => root, group => root, mode => 755, 21 | ensure => directory, 22 | } 23 | 24 | file { "/usr/local/tcollector": 25 | source => "puppet:///files/tcollector", 26 | owner => root, group => root, 27 | ensure => directory, 28 | recurse => true, 29 | ignore => '*.pyc', 30 | purge => true, 31 | force => true, 32 | require => File["/usr/local"], 33 | } 34 | } 35 | --------------------------------------------------------------------------------